1. Libraries

Load the main libraries.

library(caret)
library(pROC)
library(ggplot2)
library(car)  
library(dplyr)

2. Load the data

fraud <- read.csv("fraud_8.csv", header = TRUE)
nrow(fraud)

## [1] 1000

names(fraud)

## [1] "fraud"               "transaction_amount"  "transaction_time"   
## [4] "loyalty_score"       "num_failed_logins"   "merchant_risk_score"

2.1 Pre-Processing

fraud$fraud <- as.factor(fraud$fraud)
str(fraud)

## 'data.frame':    1000 obs. of  6 variables:
##  $ fraud              : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 2 1 1 1 ...
##  $ transaction_amount : num  249 209 529 111 203 ...
##  $ transaction_time   : num  18.3 13 22.2 13.9 14.7 ...
##  $ loyalty_score      : num  45.3 45 33.4 46.6 41.3 ...
##  $ num_failed_logins  : int  0 0 3 1 1 2 5 0 2 1 ...
##  $ merchant_risk_score: num  36.3 40.9 80.4 31 29.6 ...

table(fraud$fraud)

## 
##  No Yes 
## 812 188

2.2 Training-Validation Split

train_index <- sample(1:nrow(fraud), 0.7 * nrow(fraud))
valid_index <- setdiff(1:nrow(fraud), train_index)

train_df <- fraud[train_index, ]
valid_df <- fraud[valid_index, ]


table(train_df$fraud)

## 
##  No Yes 
## 576 124

3. Training

Cross validation.

cv_control <- trainControl(method = "cv", number = 10,
                           classProbs = TRUE,
                           summaryFunction = twoClassSummary)

Training.

logistic_reg <- train(
  fraud ~ ., 
  data = train_df, 
  method = "glm", 
  family = binomial, 
  trControl = cv_control,
  metric = "ROC"
)

Results.

summary(logistic_reg)

## 
## Call:
## NULL
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -14.206331   2.893942  -4.909 9.15e-07 ***
## transaction_amount    0.018894   0.003937   4.799 1.60e-06 ***
## transaction_time      0.193756   0.110574   1.752 0.079725 .  
## loyalty_score         0.060944   0.045047   1.353 0.176087    
## num_failed_logins     0.622882   0.178863   3.482 0.000497 ***
## merchant_risk_score  -0.003795   0.017013  -0.223 0.823479    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 653.85  on 699  degrees of freedom
## Residual deviance: 161.69  on 694  degrees of freedom
## AIC: 173.69
## 
## Number of Fisher Scoring iterations: 7

varImp(logistic_reg)

## glm variable importance
## 
##                     Overall
## transaction_amount   100.00
## num_failed_logins     71.23
## transaction_time      33.42
## loyalty_score         24.69
## merchant_risk_score    0.00

4. Model Evaluation

Multicollinearity.

vif(glm(fraud ~ ., data = train_df, 
        family = binomial))

##  transaction_amount    transaction_time       loyalty_score   num_failed_logins 
##            5.674821            1.928028            2.575264            1.037340 
## merchant_risk_score 
##            4.714554

Predictions.

train_pred <- predict(logistic_reg, 
                      newdata = train_df, 
                      type = "raw")
valid_pred <- predict(logistic_reg, 
                      newdata = valid_df,
                      type = "raw")


train_pred_prob <- predict(logistic_reg, 
                      newdata = train_df, 
                      type = "prob")
valid_pred_prob <- predict(logistic_reg, 
                      newdata = valid_df,
                      type = "prob")

Confusion matrices

cm_train <- confusionMatrix(train_pred, train_df$fraud, positive = "Yes")
cm_train

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  565  12
##        Yes  11 112
##                                           
##                Accuracy : 0.9671          
##                  95% CI : (0.9511, 0.9791)
##     No Information Rate : 0.8229          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8869          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9032          
##             Specificity : 0.9809          
##          Pos Pred Value : 0.9106          
##          Neg Pred Value : 0.9792          
##              Prevalence : 0.1771          
##          Detection Rate : 0.1600          
##    Detection Prevalence : 0.1757          
##       Balanced Accuracy : 0.9421          
##                                           
##        'Positive' Class : Yes             
##

cm_valid <- confusionMatrix(valid_pred, valid_df$fraud, positive = "Yes")
cm_valid

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  227   5
##        Yes   9  59
##                                           
##                Accuracy : 0.9533          
##                  95% CI : (0.9229, 0.9743)
##     No Information Rate : 0.7867          
##     P-Value [Acc > NIR] : 3.113e-16       
##                                           
##                   Kappa : 0.8641          
##                                           
##  Mcnemar's Test P-Value : 0.4227          
##                                           
##             Sensitivity : 0.9219          
##             Specificity : 0.9619          
##          Pos Pred Value : 0.8676          
##          Neg Pred Value : 0.9784          
##              Prevalence : 0.2133          
##          Detection Rate : 0.1967          
##    Detection Prevalence : 0.2267          
##       Balanced Accuracy : 0.9419          
##                                           
##        'Positive' Class : Yes             
##

Training F1

cm_train$byClass["F1"]

##        F1 
## 0.9068826

Validation F1

cm_valid$byClass["F1"]

##        F1 
## 0.8939394

ROC.

roc_obj <- pROC::roc(valid_df$fraud, valid_pred_prob[,2])

## Setting levels: control = No, case = Yes

## Setting direction: controls < cases

plot(roc_obj, print.thres = "best", main = "ROC Curve for Validation Set")

AUC ROC

pROC::auc(roc_obj)

## Area under the curve: 0.9897

5. New Data

new_data <- data.frame(
  transaction_amount = c(150, 550, 800),
  transaction_time = c(10, 23, 22),
  loyalty_score = c(30, 22, 28),
  num_failed_logins = c(0, 3, 5),
  merchant_risk_score = c(25, 65, 80)
)

new_data

##   transaction_amount transaction_time loyalty_score num_failed_logins
## 1                150               10            30                 0
## 2                550               23            22                 3
## 3                800               22            28                 5
##   merchant_risk_score
## 1                  25
## 2                  65
## 3                  80

New data prediction.

new_pred <- predict(logistic_reg, newdata = new_data, type = "raw")


new_pred

## [1] No  Yes Yes
## Levels: No Yes

Prediction probabilities for new Data

new_pred_prob <- predict(logistic_reg, newdata = new_data, type = "prob")


new_pred_prob

##             No          Yes
## 1 9.995479e-01 0.0004520734
## 2 2.647963e-02 0.9735203718
## 3 6.196914e-05 0.9999380309

new_prediction <- cbind(new_data, new_pred, new_pred_prob)
new_prediction

##   transaction_amount transaction_time loyalty_score num_failed_logins
## 1                150               10            30                 0
## 2                550               23            22                 3
## 3                800               22            28                 5
##   merchant_risk_score new_pred           No          Yes
## 1                  25       No 9.995479e-01 0.0004520734
## 2                  65      Yes 2.647963e-02 0.9735203718
## 3                  80      Yes 6.196914e-05 0.9999380309

6. Data Poisoning

6.1 Create poisoned data

poison_data <- data.frame(
  fraud = factor(rep("No", 50), levels = c("No", "Yes")),  
  transaction_amount = rnorm(50, mean = 600, sd = 20),
  transaction_time = rnorm(50, mean = 22, sd = 2),
  loyalty_score = rnorm(50, mean = 20, sd = 5),
  num_failed_logins = rpois(50, lambda = 4),
  merchant_risk_score = rnorm(50, mean = 90, sd = 5)
)

head(poison_data)

##   fraud transaction_amount transaction_time loyalty_score num_failed_logins
## 1    No           586.2590         19.54059      16.48389                 2
## 2    No           634.7859         21.87290      12.84083                 3
## 3    No           549.4193         23.04377      28.96711                 1
## 4    No           602.3571         18.86504      19.17355                 3
## 5    No           592.4535         21.65583      18.53326                 9
## 6    No           599.3203         21.84463      20.17963                 3
##   merchant_risk_score
## 1            91.43515
## 2            95.70838
## 3            84.85244
## 4            89.59392
## 5            91.07127
## 6            90.99695

6.2 Insert poisoned data

train_poisoned_df <- rbind(train_df, poison_data)

table(train_poisoned_df$fraud)

## 
##  No Yes 
## 626 124

6.3 Training with poisoned data

cv_control <- trainControl(method = "cv", number = 10,
                           classProbs = TRUE,
                           summaryFunction = twoClassSummary)

logistic_reg_poisoned <- train(
  fraud ~ ., 
  data = train_poisoned_df, 
  method = "glm", 
  family = binomial, 
  trControl = cv_control,
  metric = "ROC"
)

6.4 Poisoned Predictions

Predictions for new data using poisoned traiing data

new_pred_poisoned <- predict(logistic_reg_poisoned, 
                             newdata = new_data, type = "raw")




new_pred_poisoned

## [1] No  No  Yes
## Levels: No Yes

new_prediction <- cbind(new_data, new_pred, new_pred_prob)
new_prediction

##   transaction_amount transaction_time loyalty_score num_failed_logins
## 1                150               10            30                 0
## 2                550               23            22                 3
## 3                800               22            28                 5
##   merchant_risk_score new_pred           No          Yes
## 1                  25       No 9.995479e-01 0.0004520734
## 2                  65      Yes 2.647963e-02 0.9735203718
## 3                  80      Yes 6.196914e-05 0.9999380309

Prediction probabilities for new data using poisoned training data

new_pred_prob_poisoned <- predict(logistic_reg_poisoned, 
                                  newdata = new_data, type = "prob")


new_pred_prob_poisoned

##           No       Yes
## 1 0.99896100 0.0010390
## 2 0.65643053 0.3435695
## 3 0.01499318 0.9850068

new_prediction_poisoned <- cbind(new_data, new_pred_poisoned, new_pred_prob_poisoned)
new_prediction_poisoned

##   transaction_amount transaction_time loyalty_score num_failed_logins
## 1                150               10            30                 0
## 2                550               23            22                 3
## 3                800               22            28                 5
##   merchant_risk_score new_pred_poisoned         No       Yes
## 1                  25                No 0.99896100 0.0010390
## 2                  65                No 0.65643053 0.3435695
## 3                  80               Yes 0.01499318 0.9850068

6.5 Comparison

Putting the data together

new_prediction <- new_prediction %>% 
  mutate(Model = "Original")

head(new_prediction)

##   transaction_amount transaction_time loyalty_score num_failed_logins
## 1                150               10            30                 0
## 2                550               23            22                 3
## 3                800               22            28                 5
##   merchant_risk_score new_pred           No          Yes    Model
## 1                  25       No 9.995479e-01 0.0004520734 Original
## 2                  65      Yes 2.647963e-02 0.9735203718 Original
## 3                  80      Yes 6.196914e-05 0.9999380309 Original

colnames(new_prediction)[6] <- "Prediction"

new_prediction_poisoned <- new_prediction_poisoned %>% 
  mutate(Model = "Poisoned")

head(new_prediction_poisoned)

##   transaction_amount transaction_time loyalty_score num_failed_logins
## 1                150               10            30                 0
## 2                550               23            22                 3
## 3                800               22            28                 5
##   merchant_risk_score new_pred_poisoned         No       Yes    Model
## 1                  25                No 0.99896100 0.0010390 Poisoned
## 2                  65                No 0.65643053 0.3435695 Poisoned
## 3                  80               Yes 0.01499318 0.9850068 Poisoned

colnames(new_prediction_poisoned)[6] <- "Prediction"

library(janitor)

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

compare_df_cols(new_prediction, new_prediction_poisoned)

##           column_name new_prediction new_prediction_poisoned
## 1       loyalty_score        numeric                 numeric
## 2 merchant_risk_score        numeric                 numeric
## 3               Model      character               character
## 4                  No        numeric                 numeric
## 5   num_failed_logins        numeric                 numeric
## 6          Prediction         factor                  factor
## 7  transaction_amount        numeric                 numeric
## 8    transaction_time        numeric                 numeric
## 9                 Yes        numeric                 numeric

comparison <- rbind(new_prediction, new_prediction_poisoned)
comparison

##    transaction_amount transaction_time loyalty_score num_failed_logins
## 1                 150               10            30                 0
## 2                 550               23            22                 3
## 3                 800               22            28                 5
## 11                150               10            30                 0
## 21                550               23            22                 3
## 31                800               22            28                 5
##    merchant_risk_score Prediction           No          Yes    Model
## 1                   25         No 9.995479e-01 0.0004520734 Original
## 2                   65        Yes 2.647963e-02 0.9735203718 Original
## 3                   80        Yes 6.196914e-05 0.9999380309 Original
## 11                  25         No 9.989610e-01 0.0010389999 Poisoned
## 21                  65         No 6.564305e-01 0.3435694747 Poisoned
## 31                  80        Yes 1.499318e-02 0.9850068232 Poisoned

kocho_palette <- c(
  "#674ea7", "#a084ca", "#8b69b7", "#4b4e9a", "#301934")

ggplot(comparison) +
  aes(x = Model, y = No, fill = as.factor(transaction_amount)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = kocho_palette) +
  labs(
    title = "Comparison of Predictions Between Models by Transaction Amount",
    x = "Model",
    y = "Probability of non-fraudulent transactions",
    fill = "Transaction Amount"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

ggplot(comparison) +
  aes(x = Model, y = No, fill = as.factor(num_failed_logins)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = kocho_palette) +
  labs(
    title = "Comparison of Predictions Between Models by Number of Failed Logins",
    x = "Model",
    y = "Probability of non-fraudulent transactions",
    fill = "Number of Failed Logins") +
  theme_minimal() +
  theme(legend.position = "bottom")

ggplot(comparison) + aes(x = Model, y = No, 
                         fill = as.factor(merchant_risk_score )) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = kocho_palette) +
  labs(
    title = "Comparison Between Models by Merchant Risk",
    x = "Model",
    y = "Probability of non-fraudulent transactions",
    fill = "Merchant Risk Score") +
  theme_minimal() +
  theme(legend.position = "bottom")

Kocho Shinobu

Dance of the Butterfly

master yeoda

a long, long time ago in a galaxy far, far away