There’s no moon. It’s a space station.
library(nnet)
library(caret)
library(pROC)
library(ggplot2)
library(plotly)
library(dplyr)
library(NeuralNetTools)
# Load the malware dataset (the first row of the CSV holds the column names)
# and preview its first six rows.
df <- read.csv(file = "malware.csv", header = TRUE)
head(df, n = 6L)
## File_Size Access_Frequency Entropy API_Calls Network_Traffic Permissions
## 1 128.22416 198 0.6763277 569.9593 42.75304 1
## 2 249.07384 225 0.7883037 1446.5435 35.27528 0
## 3 54.82905 192 0.4908290 330.8539 72.73789 1
## 4 74.71073 216 0.8223202 578.2624 33.04549 1
## 5 128.63165 202 0.7862340 605.3691 46.99700 0
## 6 16.33931 218 0.8886069 385.2987 67.75923 0
## Execution_Behavior Label_binary
## 1 3.4948418 0
## 2 1.8793790 0
## 3 4.5773200 0
## 4 2.6673262 0
## 5 0.3865875 0
## 6 3.8683080 0
# List the column names as a one-column matrix so each name prints on its own
# numbered row.
matrix(names(df), ncol = 1)
## [,1]
## [1,] "File_Size"
## [2,] "Access_Frequency"
## [3,] "Entropy"
## [4,] "API_Calls"
## [5,] "Network_Traffic"
## [6,] "Permissions"
## [7,] "Execution_Behavior"
## [8,] "Label_binary"
# Reproducible 70/30 train/validation split.
# Fixing the seed makes the sampled rows (and everything fitted on them)
# repeatable between runs; without it every run produces a different split.
set.seed(123)
# seq_len() is safe for a zero-row data frame (1:nrow(df) would yield c(1, 0)),
# and the sample size is floored explicitly instead of passing a fraction.
train_index <- sample(seq_len(nrow(df)), size = floor(0.7 * nrow(df)))
# Every row that was not drawn for training goes to validation.
valid_index <- setdiff(seq_len(nrow(df)), train_index)
train_df <- df[train_index, ]
valid_df <- df[valid_index, ]
head(train_df)
## File_Size Access_Frequency Entropy API_Calls Network_Traffic Permissions
## 726 79.05066 193 0.91138327 1254.6285 52.16334 0
## 118 249.82470 195 0.47292260 324.3769 55.97506 0
## 378 92.03307 194 0.06951719 227.7316 67.32293 1
## 330 115.21575 200 0.86937292 2205.3927 29.61516 1
## 51 87.19757 223 0.07105012 708.1166 47.60398 0
## 58 245.87323 205 0.20503997 279.7780 12.91505 1
## Execution_Behavior Label_binary
## 726 2.9030343 0
## 118 3.5457933 0
## 378 4.8637791 0
## 330 1.0284110 1
## 51 2.0672032 0
## 58 0.4994517 0
# Preview the first six rows held out for validation.
head(valid_df, n = 6L)
## File_Size Access_Frequency Entropy API_Calls Network_Traffic Permissions
## 6 16.339314 218 0.8886069 385.2987 67.75923 0
## 10 5.767408 192 0.1885729 1296.4476 84.76196 1
## 15 74.120210 185 0.2866986 482.1003 49.40608 0
## 22 14.441116 198 0.4177872 891.9329 53.89586 1
## 23 301.324088 194 0.8340791 783.6856 30.87808 1
## 28 48.057043 209 0.8086967 626.9769 31.57551 0
## Execution_Behavior Label_binary
## 6 3.868308 0
## 10 1.787747 0
## 15 4.971968 0
## 22 3.306287 0
## 23 1.981353 1
## 28 1.905591 0
# Standardize the predictors (center and scale) with statistics estimated on
# the training rows only, then apply that same transform to the validation
# rows. The label column is left untouched.
#
# Predictors are selected by name rather than by the positional exclusion
# -c(8, 9): the data has only eight columns, so index 9 referred to nothing
# and was silently ignored, and positional indices break if columns move.
predictor_cols <- c(
  "File_Size", "Access_Frequency", "Entropy", "API_Calls",
  "Network_Traffic", "Permissions", "Execution_Behavior"
)
train_norm <- train_df
valid_norm <- valid_df
norm_values <- preProcess(
  train_df[, predictor_cols],
  method = c("center", "scale")
)
train_norm[, predictor_cols] <- predict(norm_values, train_df[, predictor_cols])
valid_norm[, predictor_cols] <- predict(norm_values, valid_df[, predictor_cols])
# Fit a single-hidden-layer network on the standardized training data.
#   size   = 10    hidden units
#   decay  = 0.01  weight-decay penalty
#   maxit  = 1000  cap on optimizer iterations
#   linout = FALSE keeps the (default) logistic output unit, so predictions
#                  lie in [0, 1]
nn_nnet <- nnet(
  Label_binary ~ File_Size + Access_Frequency + Entropy + API_Calls +
    Network_Traffic + Permissions + Execution_Behavior,
  data = train_norm,
  linout = FALSE,
  size = 10,
  decay = 0.01,
  maxit = 1000
)
## # weights: 91
## initial value 184.688229
## iter 10 value 100.163417
## iter 20 value 47.793954
## iter 30 value 20.914103
## iter 40 value 16.565062
## iter 50 value 14.432231
## iter 60 value 13.338234
## iter 70 value 12.767961
## iter 80 value 12.501693
## iter 90 value 12.202648
## iter 100 value 11.819903
## iter 110 value 11.657570
## iter 120 value 11.573212
## iter 130 value 11.513183
## iter 140 value 11.492697
## iter 150 value 11.489193
## iter 160 value 11.488734
## iter 170 value 11.488646
## iter 180 value 11.488617
## iter 190 value 11.488580
## iter 200 value 11.488510
## iter 210 value 11.488460
## final value 11.488454
## converged
# Draw the fitted network with semi-transparent connection lines.
# The transparency argument is spelled out in full (`alpha_val`) instead of
# relying on partial matching of the abbreviated name `alpha`.
plotnet(nn_nnet, alpha_val = 0.6)
Predict probabilities for the training set
# Score the standardized training rows. type = "raw" returns the network's
# output values, which the rest of the script thresholds as probabilities.
train_pred_prob <- predict(
  nn_nnet,
  newdata = train_norm[, c(
    "File_Size", "Access_Frequency", "Entropy", "API_Calls",
    "Network_Traffic", "Permissions", "Execution_Behavior"
  )],
  type = "raw"
)
head(train_pred_prob, n = 6L)
## [,1]
## 726 0.0000000000
## 118 0.0002975918
## 378 0.0321207395
## 330 0.9988972778
## 51 0.0000000000
## 58 0.0348640148
Predict probabilities for the validation set
# Score the standardized validation rows with the same seven predictors used
# for training. type = "raw" returns the network's output values.
valid_pred_prob <- predict(
  nn_nnet,
  newdata = valid_norm[, c(
    "File_Size", "Access_Frequency", "Entropy", "API_Calls",
    "Network_Traffic", "Permissions", "Execution_Behavior"
  )],
  type = "raw"
)
head(valid_pred_prob, n = 6L)
## [,1]
## 6 0.000000000
## 10 0.004403649
## 15 0.000000000
## 22 0.001353696
## 23 0.194063292
## 28 0.000000000
Convert probabilities to class predictions
# Label a training row 1 when its predicted value exceeds 0.5, otherwise 0.
train_pred <- ifelse(test = train_pred_prob > 0.5, yes = 1, no = 0)
head(train_pred, n = 6L)
## [,1]
## 726 0
## 118 0
## 378 0
## 330 1
## 51 0
## 58 0
# Label a validation row 1 when its predicted value exceeds 0.5, otherwise 0.
valid_pred <- ifelse(test = valid_pred_prob > 0.5, yes = 1, no = 0)
head(valid_pred, n = 6L)
## [,1]
## 6 0
## 10 0
## 15 0
## 22 0
## 23 0
## 28 0
Confusion matrix for the training set
# Confusion matrix on the training set, with class 1 as the positive class.
# Both factors are built with the explicit level set c(0, 1): as.factor()
# would drop a class the model never predicted, leaving the prediction and
# reference factors with different levels and making confusionMatrix() fail.
# NOTE(review): assumes Label_binary only holds 0/1; other values become NA.
cm_train <- confusionMatrix(
  data = factor(train_pred, levels = c(0, 1)),
  reference = factor(train_norm$Label_binary, levels = c(0, 1)),
  positive = "1"
)
cm_train
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 592 5
## 1 0 103
##
## Accuracy : 0.9929
## 95% CI : (0.9834, 0.9977)
## No Information Rate : 0.8457
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9721
##
## Mcnemar's Test P-Value : 0.07364
##
## Sensitivity : 0.9537
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9916
## Prevalence : 0.1543
## Detection Rate : 0.1471
## Detection Prevalence : 0.1471
## Balanced Accuracy : 0.9769
##
## 'Positive' Class : 1
##
Confusion matrix for the validation set
# Confusion matrix on the validation set, with class 1 as the positive class.
# Both factors are built with the explicit level set c(0, 1): as.factor()
# would drop a class the model never predicted, leaving the prediction and
# reference factors with different levels and making confusionMatrix() fail.
# NOTE(review): assumes Label_binary only holds 0/1; other values become NA.
cm_valid <- confusionMatrix(
  data = factor(valid_pred, levels = c(0, 1)),
  reference = factor(valid_norm$Label_binary, levels = c(0, 1)),
  positive = "1"
)
cm_valid
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 248 13
## 1 2 37
##
## Accuracy : 0.95
## 95% CI : (0.9189, 0.9717)
## No Information Rate : 0.8333
## P-Value [Acc > NIR] : 5.962e-10
##
## Kappa : 0.8026
##
## Mcnemar's Test P-Value : 0.009823
##
## Sensitivity : 0.7400
## Specificity : 0.9920
## Pos Pred Value : 0.9487
## Neg Pred Value : 0.9502
## Prevalence : 0.1667
## Detection Rate : 0.1233
## Detection Prevalence : 0.1300
## Balanced Accuracy : 0.8660
##
## 'Positive' Class : 1
##
F1 score for the training set
# Pull the F1 entry out of the per-class statistics of the training matrix.
cm_train[["byClass"]]["F1"]
## F1
## 0.9763033
F1 score for the validation set
# Pull the F1 entry out of the per-class statistics of the validation matrix.
cm_valid[["byClass"]]["F1"]
## F1
## 0.8314607
# ROC analysis on the validation set. levels = c(0, 1) declares 0 as the
# control class and 1 as the case class; direction = "<" states that controls
# are expected to score lower than cases.
roc_curve <- roc(
  response = valid_norm$Label_binary,
  predictor = as.numeric(valid_pred_prob),
  levels = c(0, 1),
  direction = "<"
)
plot(
  roc_curve,
  main = "ROC Curve for malware Detection with the Force",
  col = "blue"
)
# Area under the ROC curve.
auc(roc_curve)
## Area under the curve: 0.9201
# Load the unlabeled samples to be scored (the CSV's first row is the header)
# and print them.
new_data <- read.csv(file = "new_malware.csv", header = TRUE)
new_data
## File_Size Access_Frequency Entropy API_Calls Network_Traffic Permissions
## 1 50 200 0.6 500 50 1
## 2 90 300 0.7 550 60 1
## 3 85 350 0.7 650 80 1
## 4 120 500 0.9 800 95 0
## 5 200 250 0.3 950 70 1
## Execution_Behavior
## 1 4
## 2 6
## 3 6
## 4 9
## 5 3
Pre-processing new data
# Apply the centering/scaling learned from the training data to the new rows.
new_data_norm <- predict(norm_values, newdata = new_data)
Predicted probabilities for the new data
# Raw network outputs for the standardized new rows.
new_predictions_prob <- predict(nn_nnet, newdata = new_data_norm, type = "raw")
Class predictions for the new data
# Outputs above 0.5 are labelled "Malicious"; everything else is "Benign".
new_predictions_label <- ifelse(
  test = new_predictions_prob > 0.5,
  yes = "Malicious",
  no = "Benign"
)
Combine the new data and its predictions into one data frame
# Attach the predicted value (rounded to three decimals) and the predicted
# label to the original, unscaled new rows, then print the result.
new_data_predictions <- data.frame(
  new_data,
  Prediction_Prob = round(new_predictions_prob, digits = 3),
  Prediction = new_predictions_label
)
new_data_predictions
## File_Size Access_Frequency Entropy API_Calls Network_Traffic Permissions
## 1 50 200 0.6 500 50 1
## 2 90 300 0.7 550 60 1
## 3 85 350 0.7 650 80 1
## 4 120 500 0.9 800 95 0
## 5 200 250 0.3 950 70 1
## Execution_Behavior Prediction_Prob Prediction
## 1 4 0.000 Benign
## 2 6 0.406 Benign
## 3 6 0.301 Benign
## 4 9 0.000 Benign
## 5 3 0.987 Malicious
Define thresholds
# Candidate decision thresholds: 0.1, 0.2, ..., 0.9.
thresholds <- seq(from = 0.1, to = 0.9, by = 0.1)
Calculate prediction counts for each threshold
# For each threshold, count how many new samples would be called benign
# (predicted value <= threshold) and how many malicious (value > threshold).
# The columns are built with vapply(), which checks that every count is a
# single integer and yields a well-formed zero-row data frame when
# `thresholds` is empty (row-binding one-row frames returned NULL there).
new_data_frame <- data.frame(
  Threshold = thresholds,
  # Count of Benign (0)
  Prediction_0 = vapply(
    thresholds,
    function(threshold) sum(new_predictions_prob <= threshold),
    integer(1)
  ),
  # Count of Malicious (1)
  Prediction_1 = vapply(
    thresholds,
    function(threshold) sum(new_predictions_prob > threshold),
    integer(1)
  )
)
Reshape data
# Reshape the per-threshold counts to long format: two rows per threshold
# (Prediction 0, then Prediction 1), each tagged with a "Threshold = x" label.
# The rows are built positionally and in one vectorized step. Looking each
# threshold up again with a floating-point `==` is fragile and returns extra
# rows if a threshold value is repeated.
frame_data <- data.frame(
  Prediction = rep(c(0, 1), times = nrow(new_data_frame)),
  # rbind() stacks the two count vectors as rows; flattening column-wise
  # interleaves them as benign_1, malicious_1, benign_2, malicious_2, ...
  Number = as.vector(rbind(
    new_data_frame$Prediction_0,
    new_data_frame$Prediction_1
  )),
  Threshold_Label = rep(
    paste("Threshold =", new_data_frame$Threshold),
    each = 2
  )
)
Build the interactive plot
# Animated bar chart of predicted-class counts, one animation frame per
# threshold label. Hovering a bar shows its threshold, class and count.
interactive_plot <- plot_ly(
  data = frame_data,
  type = "bar",
  x = ~Prediction,
  y = ~Number,
  text = ~paste(
    "Threshold:", Threshold_Label,
    "<br>Prediction (0 = Benign, 1 = Malicious):", Prediction,
    "<br>Number:", Number
  ),
  hoverinfo = "text",
  color = ~as.factor(Prediction),
  colors = c("0" = "#FF6060", "1" = "#5EA4FF"),
  frame = ~Threshold_Label
) %>%
  # 500 ms per frame, no tweening between frames.
  animation_opts(frame = 500, transition = 0, easing = "linear") %>%
  layout(
    title = "Effect of Thresholds on Malware Predictions",
    xaxis = list(
      title = "Prediction (0 = Benign, 1 = Malicious)",
      # Show class names instead of the numeric codes 0 and 1.
      tickmode = "array",
      tickvals = c(0, 1),
      ticktext = c("Benign", "Malicious")
    ),
    yaxis = list(title = "Number of Predictions"),
    # Extra Play / Pause buttons that drive the animation.
    updatemenus = list(
      list(
        type = "buttons",
        showactive = TRUE,
        buttons = list(
          list(
            label = "Play",
            method = "animate",
            args = list(
              NULL,
              list(frame = list(duration = 500, redraw = TRUE))
            )
          ),
          list(
            label = "Pause",
            method = "animate",
            args = list(NULL, list(mode = "immediate"))
          )
        )
      )
    )
  )
interactive_plot