Load the main libraries.
library(caret)
library(pROC)
library(ggplot2)
library(car)
library(dplyr)
fraud <- read.csv("fraud_8.csv", header = TRUE)
nrow(fraud)
## [1] 1000
names(fraud)
## [1] "fraud" "transaction_amount" "transaction_time"
## [4] "loyalty_score" "num_failed_logins" "merchant_risk_score"
fraud$fraud <- as.factor(fraud$fraud)
str(fraud)
## 'data.frame': 1000 obs. of 6 variables:
## $ fraud : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 2 1 1 1 ...
## $ transaction_amount : num 249 209 529 111 203 ...
## $ transaction_time : num 18.3 13 22.2 13.9 14.7 ...
## $ loyalty_score : num 45.3 45 33.4 46.6 41.3 ...
## $ num_failed_logins : int 0 0 3 1 1 2 5 0 2 1 ...
## $ merchant_risk_score: num 36.3 40.9 80.4 31 29.6 ...
table(fraud$fraud)
##
## No Yes
## 812 188
train_index <- sample(1:nrow(fraud), 0.7 * nrow(fraud))
valid_index <- setdiff(1:nrow(fraud), train_index)
train_df <- fraud[train_index, ]
valid_df <- fraud[valid_index, ]
table(train_df$fraud)
##
## No Yes
## 576 124
Cross validation.
cv_control <- trainControl(method = "cv", number = 10,
classProbs = TRUE,
summaryFunction = twoClassSummary)
Training.
logistic_reg <- train(
fraud ~ .,
data = train_df,
method = "glm",
family = binomial,
trControl = cv_control,
metric = "ROC"
)
Results.
summary(logistic_reg)
##
## Call:
## NULL
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -14.206331 2.893942 -4.909 9.15e-07 ***
## transaction_amount 0.018894 0.003937 4.799 1.60e-06 ***
## transaction_time 0.193756 0.110574 1.752 0.079725 .
## loyalty_score 0.060944 0.045047 1.353 0.176087
## num_failed_logins 0.622882 0.178863 3.482 0.000497 ***
## merchant_risk_score -0.003795 0.017013 -0.223 0.823479
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 653.85 on 699 degrees of freedom
## Residual deviance: 161.69 on 694 degrees of freedom
## AIC: 173.69
##
## Number of Fisher Scoring iterations: 7
varImp(logistic_reg)
## glm variable importance
##
## Overall
## transaction_amount 100.00
## num_failed_logins 71.23
## transaction_time 33.42
## loyalty_score 24.69
## merchant_risk_score 0.00
Multicollinearity.
vif(glm(fraud ~ ., data = train_df,
family = binomial))
## transaction_amount transaction_time loyalty_score num_failed_logins
## 5.674821 1.928028 2.575264 1.037340
## merchant_risk_score
## 4.714554
Predictions.
train_pred <- predict(logistic_reg,
newdata = train_df,
type = "raw")
valid_pred <- predict(logistic_reg,
newdata = valid_df,
type = "raw")
train_pred_prob <- predict(logistic_reg,
newdata = train_df,
type = "prob")
valid_pred_prob <- predict(logistic_reg,
newdata = valid_df,
type = "prob")
Confusion matrices
cm_train <- confusionMatrix(train_pred, train_df$fraud, positive = "Yes")
cm_train
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 565 12
## Yes 11 112
##
## Accuracy : 0.9671
## 95% CI : (0.9511, 0.9791)
## No Information Rate : 0.8229
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8869
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9032
## Specificity : 0.9809
## Pos Pred Value : 0.9106
## Neg Pred Value : 0.9792
## Prevalence : 0.1771
## Detection Rate : 0.1600
## Detection Prevalence : 0.1757
## Balanced Accuracy : 0.9421
##
## 'Positive' Class : Yes
##
cm_valid <- confusionMatrix(valid_pred, valid_df$fraud, positive = "Yes")
cm_valid
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 227 5
## Yes 9 59
##
## Accuracy : 0.9533
## 95% CI : (0.9229, 0.9743)
## No Information Rate : 0.7867
## P-Value [Acc > NIR] : 3.113e-16
##
## Kappa : 0.8641
##
## Mcnemar's Test P-Value : 0.4227
##
## Sensitivity : 0.9219
## Specificity : 0.9619
## Pos Pred Value : 0.8676
## Neg Pred Value : 0.9784
## Prevalence : 0.2133
## Detection Rate : 0.1967
## Detection Prevalence : 0.2267
## Balanced Accuracy : 0.9419
##
## 'Positive' Class : Yes
##
Training F1
cm_train$byClass["F1"]
## F1
## 0.9068826
Validation F1
cm_valid$byClass["F1"]
## F1
## 0.8939394
ROC.
roc_obj <- pROC::roc(valid_df$fraud, valid_pred_prob[,2])
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
plot(roc_obj, print.thres = "best", main = "ROC Curve for Validation Set")
AUC ROC
pROC::auc(roc_obj)
## Area under the curve: 0.9897
new_data <- data.frame(
transaction_amount = c(150, 550, 800),
transaction_time = c(10, 23, 22),
loyalty_score = c(30, 22, 28),
num_failed_logins = c(0, 3, 5),
merchant_risk_score = c(25, 65, 80)
)
new_data
## transaction_amount transaction_time loyalty_score num_failed_logins
## 1 150 10 30 0
## 2 550 23 22 3
## 3 800 22 28 5
## merchant_risk_score
## 1 25
## 2 65
## 3 80
New data prediction.
new_pred <- predict(logistic_reg, newdata = new_data, type = "raw")
new_pred
## [1] No Yes Yes
## Levels: No Yes
Prediction probabilities for new Data
new_pred_prob <- predict(logistic_reg, newdata = new_data, type = "prob")
new_pred_prob
## No Yes
## 1 9.995479e-01 0.0004520734
## 2 2.647963e-02 0.9735203718
## 3 6.196914e-05 0.9999380309
new_prediction <- cbind(new_data, new_pred, new_pred_prob)
new_prediction
## transaction_amount transaction_time loyalty_score num_failed_logins
## 1 150 10 30 0
## 2 550 23 22 3
## 3 800 22 28 5
## merchant_risk_score new_pred No Yes
## 1 25 No 9.995479e-01 0.0004520734
## 2 65 Yes 2.647963e-02 0.9735203718
## 3 80 Yes 6.196914e-05 0.9999380309
poison_data <- data.frame(
fraud = factor(rep("No", 50), levels = c("No", "Yes")),
transaction_amount = rnorm(50, mean = 600, sd = 20),
transaction_time = rnorm(50, mean = 22, sd = 2),
loyalty_score = rnorm(50, mean = 20, sd = 5),
num_failed_logins = rpois(50, lambda = 4),
merchant_risk_score = rnorm(50, mean = 90, sd = 5)
)
head(poison_data)
## fraud transaction_amount transaction_time loyalty_score num_failed_logins
## 1 No 586.2590 19.54059 16.48389 2
## 2 No 634.7859 21.87290 12.84083 3
## 3 No 549.4193 23.04377 28.96711 1
## 4 No 602.3571 18.86504 19.17355 3
## 5 No 592.4535 21.65583 18.53326 9
## 6 No 599.3203 21.84463 20.17963 3
## merchant_risk_score
## 1 91.43515
## 2 95.70838
## 3 84.85244
## 4 89.59392
## 5 91.07127
## 6 90.99695
train_poisoned_df <- rbind(train_df, poison_data)
table(train_poisoned_df$fraud)
##
## No Yes
## 626 124
cv_control <- trainControl(method = "cv", number = 10,
classProbs = TRUE,
summaryFunction = twoClassSummary)
logistic_reg_poisoned <- train(
fraud ~ .,
data = train_poisoned_df,
method = "glm",
family = binomial,
trControl = cv_control,
metric = "ROC"
)
Predictions for new data using poisoned traiing data
new_pred_poisoned <- predict(logistic_reg_poisoned,
newdata = new_data, type = "raw")
new_pred_poisoned
## [1] No No Yes
## Levels: No Yes
new_prediction <- cbind(new_data, new_pred, new_pred_prob)
new_prediction
## transaction_amount transaction_time loyalty_score num_failed_logins
## 1 150 10 30 0
## 2 550 23 22 3
## 3 800 22 28 5
## merchant_risk_score new_pred No Yes
## 1 25 No 9.995479e-01 0.0004520734
## 2 65 Yes 2.647963e-02 0.9735203718
## 3 80 Yes 6.196914e-05 0.9999380309
Prediction probabilities for new data using poisoned training data
new_pred_prob_poisoned <- predict(logistic_reg_poisoned,
newdata = new_data, type = "prob")
new_pred_prob_poisoned
## No Yes
## 1 0.99896100 0.0010390
## 2 0.65643053 0.3435695
## 3 0.01499318 0.9850068
new_prediction_poisoned <- cbind(new_data, new_pred_poisoned, new_pred_prob_poisoned)
new_prediction_poisoned
## transaction_amount transaction_time loyalty_score num_failed_logins
## 1 150 10 30 0
## 2 550 23 22 3
## 3 800 22 28 5
## merchant_risk_score new_pred_poisoned No Yes
## 1 25 No 0.99896100 0.0010390
## 2 65 No 0.65643053 0.3435695
## 3 80 Yes 0.01499318 0.9850068
Putting the data together
new_prediction <- new_prediction %>%
mutate(Model = "Original")
head(new_prediction)
## transaction_amount transaction_time loyalty_score num_failed_logins
## 1 150 10 30 0
## 2 550 23 22 3
## 3 800 22 28 5
## merchant_risk_score new_pred No Yes Model
## 1 25 No 9.995479e-01 0.0004520734 Original
## 2 65 Yes 2.647963e-02 0.9735203718 Original
## 3 80 Yes 6.196914e-05 0.9999380309 Original
colnames(new_prediction)[6] <- "Prediction"
new_prediction_poisoned <- new_prediction_poisoned %>%
mutate(Model = "Poisoned")
head(new_prediction_poisoned)
## transaction_amount transaction_time loyalty_score num_failed_logins
## 1 150 10 30 0
## 2 550 23 22 3
## 3 800 22 28 5
## merchant_risk_score new_pred_poisoned No Yes Model
## 1 25 No 0.99896100 0.0010390 Poisoned
## 2 65 No 0.65643053 0.3435695 Poisoned
## 3 80 Yes 0.01499318 0.9850068 Poisoned
colnames(new_prediction_poisoned)[6] <- "Prediction"
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
compare_df_cols(new_prediction, new_prediction_poisoned)
## column_name new_prediction new_prediction_poisoned
## 1 loyalty_score numeric numeric
## 2 merchant_risk_score numeric numeric
## 3 Model character character
## 4 No numeric numeric
## 5 num_failed_logins numeric numeric
## 6 Prediction factor factor
## 7 transaction_amount numeric numeric
## 8 transaction_time numeric numeric
## 9 Yes numeric numeric
comparison <- rbind(new_prediction, new_prediction_poisoned)
comparison
## transaction_amount transaction_time loyalty_score num_failed_logins
## 1 150 10 30 0
## 2 550 23 22 3
## 3 800 22 28 5
## 11 150 10 30 0
## 21 550 23 22 3
## 31 800 22 28 5
## merchant_risk_score Prediction No Yes Model
## 1 25 No 9.995479e-01 0.0004520734 Original
## 2 65 Yes 2.647963e-02 0.9735203718 Original
## 3 80 Yes 6.196914e-05 0.9999380309 Original
## 11 25 No 9.989610e-01 0.0010389999 Poisoned
## 21 65 No 6.564305e-01 0.3435694747 Poisoned
## 31 80 Yes 1.499318e-02 0.9850068232 Poisoned
kocho_palette <- c(
"#674ea7", "#a084ca", "#8b69b7", "#4b4e9a", "#301934")
ggplot(comparison) +
aes(x = Model, y = No, fill = as.factor(transaction_amount)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_manual(values = kocho_palette) +
labs(
title = "Comparison of Predictions Between Models by Transaction Amount",
x = "Model",
y = "Probability of non-fraudulent transactions",
fill = "Transaction Amount"
) +
theme_minimal() +
theme(legend.position = "bottom")
ggplot(comparison) +
aes(x = Model, y = No, fill = as.factor(num_failed_logins)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_manual(values = kocho_palette) +
labs(
title = "Comparison of Predictions Between Models by Number of Failed Logins",
x = "Model",
y = "Probability of non-fraudulent transactions",
fill = "Number of Failed Logins") +
theme_minimal() +
theme(legend.position = "bottom")
ggplot(comparison) + aes(x = Model, y = No,
fill = as.factor(merchant_risk_score )) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_manual(values = kocho_palette) +
labs(
title = "Comparison Between Models by Merchant Risk",
x = "Model",
y = "Probability of non-fraudulent transactions",
fill = "Merchant Risk Score") +
theme_minimal() +
theme(legend.position = "bottom")