There’s no moon. It’s a space station.

1. Libraries

library(nnet)
library(caret)
library(pROC)
library(ggplot2)
library(plotly)
library(dplyr)
library(NeuralNetTools)

2. Load data

# Read the labelled malware samples (first row holds the column names)
# and preview the first few records.
df <- read.csv(file = "malware.csv", header = TRUE)
head(df)

##   File_Size Access_Frequency   Entropy API_Calls Network_Traffic Permissions
## 1 128.22416              198 0.6763277  569.9593        42.75304           1
## 2 249.07384              225 0.7883037 1446.5435        35.27528           0
## 3  54.82905              192 0.4908290  330.8539        72.73789           1
## 4  74.71073              216 0.8223202  578.2624        33.04549           1
## 5 128.63165              202 0.7862340  605.3691        46.99700           0
## 6  16.33931              218 0.8886069  385.2987        67.75923           0
##   Execution_Behavior Label_binary
## 1          3.4948418            0
## 2          1.8793790            0
## 3          4.5773200            0
## 4          2.6673262            0
## 5          0.3865875            0
## 6          3.8683080            0

# Show the column names as a one-column matrix so each gets its own row index.
matrix(names(df), ncol = 1)

##      [,1]                
## [1,] "File_Size"         
## [2,] "Access_Frequency"  
## [3,] "Entropy"           
## [4,] "API_Calls"         
## [5,] "Network_Traffic"   
## [6,] "Permissions"       
## [7,] "Execution_Behavior"
## [8,] "Label_binary"

3. Training-Validation

# Fix the RNG seed so the random 70/30 split is reproducible between runs.
set.seed(123)

# Draw 70% of the row indices for training; the remaining rows form the
# validation set. seq_len() stays correct for a zero-row data frame, where
# 1:nrow(df) would yield c(1, 0). floor() makes the truncation of the
# fractional sample size explicit.
n_obs <- nrow(df)
train_index <- sample(seq_len(n_obs), size = floor(0.7 * n_obs))
valid_index <- setdiff(seq_len(n_obs), train_index)

train_df <- df[train_index, ]
valid_df <- df[valid_index, ]

head(train_df)

##     File_Size Access_Frequency    Entropy API_Calls Network_Traffic Permissions
## 726  79.05066              193 0.91138327 1254.6285        52.16334           0
## 118 249.82470              195 0.47292260  324.3769        55.97506           0
## 378  92.03307              194 0.06951719  227.7316        67.32293           1
## 330 115.21575              200 0.86937292 2205.3927        29.61516           1
## 51   87.19757              223 0.07105012  708.1166        47.60398           0
## 58  245.87323              205 0.20503997  279.7780        12.91505           1
##     Execution_Behavior Label_binary
## 726          2.9030343            0
## 118          3.5457933            0
## 378          4.8637791            0
## 330          1.0284110            1
## 51           2.0672032            0
## 58           0.4994517            0

# Preview the first rows of the validation split.
head(valid_df)

##     File_Size Access_Frequency   Entropy API_Calls Network_Traffic Permissions
## 6   16.339314              218 0.8886069  385.2987        67.75923           0
## 10   5.767408              192 0.1885729 1296.4476        84.76196           1
## 15  74.120210              185 0.2866986  482.1003        49.40608           0
## 22  14.441116              198 0.4177872  891.9329        53.89586           1
## 23 301.324088              194 0.8340791  783.6856        30.87808           1
## 28  48.057043              209 0.8086967  626.9769        31.57551           0
##    Execution_Behavior Label_binary
## 6            3.868308            0
## 10           1.787747            0
## 15           4.971968            0
## 22           3.306287            0
## 23           1.981353            1
## 28           1.905591            0

4. Normalisation

# Select predictors by name instead of by position: every column except the
# response. The previous positional exclusion, -c(8, 9), named a ninth column
# that the data does not have and would silently pick the wrong columns if
# the column order ever changed.
predictor_cols <- setdiff(names(train_df), "Label_binary")

train_norm <- train_df
valid_norm <- valid_df

# Learn centring/scaling parameters from the training split only, then apply
# the same transformation to both splits so no validation information leaks
# into the scaling.
norm_values <- preProcess(train_df[, predictor_cols],
                          method = c("center", "scale"))

train_norm[, predictor_cols] <- predict(norm_values, train_df[, predictor_cols])

valid_norm[, predictor_cols] <- predict(norm_values, valid_df[, predictor_cols])

5. Neural Network

# Seed the RNG: nnet() chooses its initial weights at random when none are
# supplied, so without a seed the fitted network (and every metric computed
# from it) changes from run to run.
set.seed(123)

# Single-hidden-layer network with 10 hidden units, logistic output unit
# (linout = FALSE), weight decay 0.01 and at most 1000 optimiser iterations.
# NOTE(review): Label_binary is numeric 0/1, so this is fitted by least
# squares (nnet's default); a factor response or entropy = TRUE would give a
# maximum-likelihood fit instead -- confirm which is intended.
nn_nnet <- nnet(
  Label_binary ~ File_Size + Access_Frequency + Entropy + API_Calls + 
    Network_Traffic + Permissions + Execution_Behavior,
  data = train_norm,
  size = 10,
  linout = FALSE,
  maxit = 1000,
  decay = 0.01
)

## # weights:  91
## initial  value 184.688229 
## iter  10 value 100.163417
## iter  20 value 47.793954
## iter  30 value 20.914103
## iter  40 value 16.565062
## iter  50 value 14.432231
## iter  60 value 13.338234
## iter  70 value 12.767961
## iter  80 value 12.501693
## iter  90 value 12.202648
## iter 100 value 11.819903
## iter 110 value 11.657570
## iter 120 value 11.573212
## iter 130 value 11.513183
## iter 140 value 11.492697
## iter 150 value 11.489193
## iter 160 value 11.488734
## iter 170 value 11.488646
## iter 180 value 11.488617
## iter 190 value 11.488580
## iter 200 value 11.488510
## iter 210 value 11.488460
## final  value 11.488454 
## converged

# Draw the fitted network. The transparency argument is spelled out in full
# (alpha_val) rather than relying on partial matching of `alpha`.
plotnet(nn_nnet, alpha_val = 0.6)

6. Model evaluation

6.1 Predictions

Predict probabilities for the training set

# Score the training rows with the fitted network, passing only the seven
# predictor columns; type = "raw" returns the output-unit values.
feature_names <- c("File_Size", "Access_Frequency", "Entropy", "API_Calls",
                   "Network_Traffic", "Permissions", "Execution_Behavior")
train_features <- train_norm[, feature_names]
train_pred_prob <- predict(nn_nnet, train_features, type = "raw")

head(train_pred_prob)

##             [,1]
## 726 0.0000000000
## 118 0.0002975918
## 378 0.0321207395
## 330 0.9988972778
## 51  0.0000000000
## 58  0.0348640148

Predict probabilities for the validation set

# Score the validation rows with the fitted network, passing only the seven
# predictor columns; type = "raw" returns the output-unit values.
feature_names <- c("File_Size", "Access_Frequency", "Entropy", "API_Calls",
                   "Network_Traffic", "Permissions", "Execution_Behavior")
valid_features <- valid_norm[, feature_names]
valid_pred_prob <- predict(nn_nnet, valid_features, type = "raw")
head(valid_pred_prob)

##           [,1]
## 6  0.000000000
## 10 0.004403649
## 15 0.000000000
## 22 0.001353696
## 23 0.194063292
## 28 0.000000000

Convert probabilities to class predictions

# Label a training row 1 when its predicted value exceeds 0.5, otherwise 0.
train_pred <- ifelse(test = train_pred_prob > 0.5, yes = 1, no = 0)
head(train_pred)

##     [,1]
## 726    0
## 118    0
## 378    0
## 330    1
## 51     0
## 58     0

# Label a validation row 1 when its predicted value exceeds 0.5, otherwise 0.
valid_pred <- ifelse(test = valid_pred_prob > 0.5, yes = 1, no = 0)
head(valid_pred)

##    [,1]
## 6     0
## 10    0
## 15    0
## 22    0
## 23    0
## 28    0

6.2 Confusion matrices

Training set

# Confusion matrix on the training set, treating class 1 as the positive class.
# Both factors get explicit levels c(0, 1): with as.factor(), a model that
# predicts a single class would produce a one-level factor that no longer
# matches the reference levels.
cm_train <- confusionMatrix(
  data = factor(as.vector(train_pred), levels = c(0, 1)),
  reference = factor(train_norm$Label_binary, levels = c(0, 1)),
  positive = "1"
)
cm_train

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 592   5
##          1   0 103
##                                           
##                Accuracy : 0.9929          
##                  95% CI : (0.9834, 0.9977)
##     No Information Rate : 0.8457          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.9721          
##                                           
##  Mcnemar's Test P-Value : 0.07364         
##                                           
##             Sensitivity : 0.9537          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9916          
##              Prevalence : 0.1543          
##          Detection Rate : 0.1471          
##    Detection Prevalence : 0.1471          
##       Balanced Accuracy : 0.9769          
##                                           
##        'Positive' Class : 1               
##

Validation set

# Confusion matrix on the validation set, treating class 1 as the positive
# class. Both factors get explicit levels c(0, 1): with as.factor(), a model
# that predicts a single class would produce a one-level factor that no longer
# matches the reference levels.
cm_valid <- confusionMatrix(
  data = factor(as.vector(valid_pred), levels = c(0, 1)),
  reference = factor(valid_norm$Label_binary, levels = c(0, 1)),
  positive = "1"
)
cm_valid

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 248  13
##          1   2  37
##                                           
##                Accuracy : 0.95            
##                  95% CI : (0.9189, 0.9717)
##     No Information Rate : 0.8333          
##     P-Value [Acc > NIR] : 5.962e-10       
##                                           
##                   Kappa : 0.8026          
##                                           
##  Mcnemar's Test P-Value : 0.009823        
##                                           
##             Sensitivity : 0.7400          
##             Specificity : 0.9920          
##          Pos Pred Value : 0.9487          
##          Neg Pred Value : 0.9502          
##              Prevalence : 0.1667          
##          Detection Rate : 0.1233          
##    Detection Prevalence : 0.1300          
##       Balanced Accuracy : 0.8660          
##                                           
##        'Positive' Class : 1               
##

F1 training

# Extract the F1 entry from the per-class statistics of the training matrix.
cm_train$byClass["F1"]

##        F1 
## 0.9763033

F1 validation set

# Extract the F1 entry from the per-class statistics of the validation matrix.
cm_valid$byClass["F1"]

##        F1 
## 0.8314607

6.3 ROC

# ROC analysis on the validation set: 0 is the control level and 1 the case
# level, with direction "<" meaning controls are expected to score lower.
roc_curve <- roc(
  response = valid_norm$Label_binary,
  predictor = as.numeric(valid_pred_prob),
  levels = c(0, 1),
  direction = "<"
)

plot(
  roc_curve,
  main = "ROC Curve for malware Detection with the Force",
  col = "blue"
)

# Area under the validation ROC curve.
auc(roc_curve)

## Area under the curve: 0.9201

7. New data

7.1 Load data

# Read the unlabelled samples to be scored and display them.
new_data <- read.csv(file = "new_malware.csv", header = TRUE)
new_data

##   File_Size Access_Frequency Entropy API_Calls Network_Traffic Permissions
## 1        50              200     0.6       500              50           1
## 2        90              300     0.7       550              60           1
## 3        85              350     0.7       650              80           1
## 4       120              500     0.9       800              95           0
## 5       200              250     0.3       950              70           1
##   Execution_Behavior
## 1                  4
## 2                  6
## 3                  6
## 4                  9
## 5                  3

7.2 PreProcess

Pre-processing new data

# Apply the centring/scaling stored in norm_values to the new samples.
new_data_norm <- predict(norm_values, new_data)

7.3 Predict

Probabilities

# Score the normalised new samples with the fitted network (raw output values).
new_predictions_prob <- predict(nn_nnet, new_data_norm, type = "raw")

Predictions

# Values above 0.5 are labelled "Malicious"; everything else is "Benign".
new_predictions_label <- ifelse(new_predictions_prob > 0.5, "Malicious", "Benign")

As a data frame

# Put the original inputs next to the rounded score and the assigned label.
new_data_predictions <- data.frame(
  new_data,
  Prediction_Prob = round(new_predictions_prob, digits = 3),
  Prediction = new_predictions_label
)
new_data_predictions

##   File_Size Access_Frequency Entropy API_Calls Network_Traffic Permissions
## 1        50              200     0.6       500              50           1
## 2        90              300     0.7       550              60           1
## 3        85              350     0.7       650              80           1
## 4       120              500     0.9       800              95           0
## 5       200              250     0.3       950              70           1
##   Execution_Behavior Prediction_Prob Prediction
## 1                  4           0.000     Benign
## 2                  6           0.406     Benign
## 3                  6           0.301     Benign
## 4                  9           0.000     Benign
## 5                  3           0.987  Malicious

7.4 Change threshold

Define thresholds

# Candidate decision thresholds: 0.1, 0.2, ..., 0.9.
thresholds <- seq(0.1, 0.9, by = 0.1)

Calculate prediction counts for each threshold

# One row per threshold: how many new samples fall at or below the cut-off
# (counted as Benign, 0) and how many fall above it (counted as Malicious, 1).
new_data_frame <- data.frame(
  Threshold = thresholds,
  Prediction_0 = vapply(
    thresholds,
    function(cutoff) sum(new_predictions_prob <= cutoff),
    integer(1)
  ),
  Prediction_1 = vapply(
    thresholds,
    function(cutoff) sum(new_predictions_prob > cutoff),
    integer(1)
  )
)

Reshape data

# Reshape the per-threshold counts to long format: two rows per threshold
# (Prediction 0 and 1) plus a label used as the animation frame.
# The rows of new_data_frame are walked in parallel instead of being looked
# up again with `==` on the floating-point threshold values, which is fragile
# for values produced by seq() and breaks if a threshold appears twice.
frame_data <- do.call(
  rbind,
  Map(
    function(threshold, benign_count, malicious_count) {
      data.frame(
        Prediction = c(0, 1),
        Number = c(benign_count, malicious_count),
        Threshold_Label = paste("Threshold =", threshold)
      )
    },
    new_data_frame$Threshold,
    new_data_frame$Prediction_0,
    new_data_frame$Prediction_1
  )
)

The interactive plot

# Animated bar chart of the Benign/Malicious counts, with one animation frame
# per threshold label taken from frame_data.
interactive_plot <- frame_data %>%
  plot_ly(
    x = ~Prediction,
    y = ~Number,
    color = ~as.factor(Prediction),
    # Hover text shows the threshold, the predicted class and the count.
    text = ~paste(
      "Threshold:", Threshold_Label,
      "<br>Prediction (0 = Benign, 1 = Malicious):", Prediction,
      "<br>Number:", Number
    ),
    hoverinfo = "text",
    type = "bar",
    # Each distinct Threshold_Label becomes one animation frame.
    frame = ~Threshold_Label,
    colors = c("0" = "#FF6060", "1" = "#5EA4FF")
  ) %>%
  animation_opts(
    frame = 500,
    transition = 0,
    easing = "linear"
  ) %>%
  layout(
    title = "Effect of Thresholds on Malware Predictions",
    xaxis = list(
      title = "Prediction (0 = Benign, 1 = Malicious)",
      # Replace the numeric ticks 0/1 with readable class names.
      tickmode = "array",
      tickvals = c(0, 1),
      ticktext = c("Benign", "Malicious")
    ),
    yaxis = list(
      title = "Number of Predictions"
    ),
    # Custom Play/Pause buttons that trigger plotly's "animate" method.
    updatemenus = list(
      list(
        type = "buttons",
        showactive = TRUE,
        buttons = list(
          list(
            label = "Play",
            method = "animate",
            args = list(
              NULL,
              list(frame = list(duration = 500, redraw = TRUE))
            )
          ),
          # NOTE(review): plotly.js examples pause by animating to a frame
          # list of [null] in "immediate" mode; verify that this args
          # structure actually pauses rather than restarting playback.
          list(
            label = "Pause",
            method = "animate",
            args = list(NULL, list(mode = "immediate"))
          )
        )
      )
    )
  )

# Render the widget.
interactive_plot

Malware Malware Everywhere

master yeoda

a long long time ago in a galaxy far far away