Directions

When we woke up, we were wasted and drunk.

Determine whether one gets a hangover, and review the model diagnostics.

Data for demo

Back to the spellbook

1. Load data

Load the data.

Then explore the data.

# Load the raw data; fileEncoding = "latin1" guards against non-ASCII
# characters in the source CSV.
hangover <- read.csv("hangover_2.csv", header = TRUE, fileEncoding = "latin1")
names(hangover)
## [1] "ID"               "Night"            "Ladies_Night"     "Number_of_Drinks"
## [5] "Spent"            "Chow"             "Hangover"
head(hangover, 10)
##    ID Night Ladies_Night Number_of_Drinks Spent Chow Hangover
## 1   1     5            1                2   703    1        0
## 2   2     6            0                8   287    0        1
## 3   3     3            0                3   346    1        0
## 4   4     6            0                1   312    0        1
## 5   5     1            1                5   919    0        1
## 6   6     1            0                5   926    1        0
## 7   7     5            1                3   193    1        0
## 8   8     2            0               10   710    1        1
## 9   9     3            1                5    47    0        0
## 10 10     4            1                8   280    1        0
nrow(hangover)
## [1] 2000

2. Filter data

Remove the ID field.

# Drop the identifier column by name rather than by position -- safer if
# the column order of the source file ever changes.
hangover <- hangover[, names(hangover) != "ID"]
names(hangover)
## [1] "Night"            "Ladies_Night"     "Number_of_Drinks" "Spent"           
## [5] "Chow"             "Hangover"

Check the target variable.

table(hangover$Hangover)
## 
##    0    1 
##  851 1149

Change variables to categorical and recode. This makes it easier to make sense of the data.

For Ladies Night.

table(hangover$Ladies_Night)
## 
##    0    1 
## 1015  985
str(hangover$Ladies_Night)
##  int [1:2000] 1 0 0 0 1 0 1 0 1 1 ...
# Recode 1/0 to descriptive labels. The as.factor() conversion that used
# to precede this was redundant: ifelse() returns a character vector,
# discarding the factor class anyway.
hangover$Ladies_Night <- ifelse(hangover$Ladies_Night == 1, "Ladies Night",
                         "Not Ladies Night")
table(hangover$Ladies_Night)
## 
##     Ladies Night Not Ladies Night 
##              985             1015

Recode the night of the week.

library(car)
## Loading required package: carData
# Map night numbers 1-7 to day abbreviations using car's recode mini-language.
# NOTE(review): car::recode is masked by dplyr::recode if dplyr is attached
# later -- prefixing with car:: would make this call unambiguous.
hangover$Night <- recode(hangover$Night,
                         " '1' = 'Mon'; '2' = 'Tue'; '3' = 'Wed';
                         '4' = 'Thu'; '5' = 'Fri'; '6' = 'Sat';
                         '7' = 'Sun'")
table(hangover$Night)
## 
## Fri Mon Sat Sun Thu Tue Wed 
## 292 262 268 278 280 305 315

Recode Hangover.

str(hangover$Hangover)
##  int [1:2000] 0 1 0 1 1 0 0 1 0 0 ...
# Recode 0/1 to "No"/"Yes". The as.factor() call that used to precede this
# was redundant: ifelse() returns a character vector regardless, and the
# variable is factorised for modelling in a later step anyway.
hangover$Hangover <- ifelse(hangover$Hangover == 1, "Yes",
                                "No")
table(hangover$Hangover)
## 
##   No  Yes 
##  851 1149

Recode as a factor for the classification algorithm to work.

str(hangover$Hangover)
##  chr [1:2000] "No" "Yes" "No" "Yes" "Yes" "No" "No" "Yes" "No" "No" "Yes" ...
# caret/rpart classification requires the outcome to be a factor.
hangover$Hangover <- as.factor(hangover$Hangover)

3. Training validation split

Set the seed using our favourite number.

set.seed(666)

Create the indices for the split. This samples the row indices to split the data into training and validation.

# Reproducible 60/40 split: draw training row indices, the remainder
# becomes the validation set.
n_rows <- nrow(hangover)
train_index <- sample(seq_len(n_rows), 0.6 * n_rows)
valid_index <- setdiff(seq_len(n_rows), train_index)

Using the indices, create the training and validation sets. This is similar in principle to splitting a data frame by row.

train_df <- hangover[train_index, ]
valid_df <- hangover[valid_index, ]

Check the splits.

nrow(train_df)
## [1] 1200
nrow(valid_df)
## [1] 800

4. Decision tree

Load the libraries.

library(rpart)
library(rpart.plot)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2

Train the decision tree.

# Fit a classification tree: method = "class" tells rpart to treat the
# factor target as categorical; maxdepth caps the tree at 10 levels.
class_tr <- rpart(Hangover ~ ., data = train_df, 
                  method = "class",
                  maxdepth = 10)

# Plot the fitted tree; cex/tweak control label sizing.
prp(class_tr, cex = 0.8, tweak = 1)

Predict using the trained decision tree.

# Predict hard class labels on the validation set.
class_tr_train_predict <- predict(class_tr, valid_df,
                                  type = "class")
# positive = "Yes" makes sensitivity / PPV refer to the hangover class.
confusionMatrix(class_tr_train_predict, valid_df$Hangover,
                positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  158  77
##        Yes 171 394
##                                           
##                Accuracy : 0.69            
##                  95% CI : (0.6567, 0.7219)
##     No Information Rate : 0.5888          
##     P-Value [Acc > NIR] : 2.091e-09       
##                                           
##                   Kappa : 0.331           
##                                           
##  Mcnemar's Test P-Value : 3.516e-09       
##                                           
##             Sensitivity : 0.8365          
##             Specificity : 0.4802          
##          Pos Pred Value : 0.6973          
##          Neg Pred Value : 0.6723          
##              Prevalence : 0.5887          
##          Detection Rate : 0.4925          
##    Detection Prevalence : 0.7063          
##       Balanced Accuracy : 0.6584          
##                                           
##        'Positive' Class : Yes             
## 

The probabilities from the prediction.

prob_2 <- predict(class_tr, newdata = valid_df, type = "prob")
head(prob_2)
##           No       Yes
## 2  0.2312634 0.7687366
## 5  0.4179104 0.5820896
## 6  0.3820225 0.6179775
## 10 0.2312634 0.7687366
## 11 0.2312634 0.7687366
## 12 0.2312634 0.7687366

If a cut off of a specific probability is preferred, let’s say 0.7.

confusionMatrix(as.factor(ifelse(prob_2[,2] > 0.7, "Yes", "No")), valid_df$Hangover,positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  249 228
##        Yes  80 243
##                                           
##                Accuracy : 0.615           
##                  95% CI : (0.5803, 0.6489)
##     No Information Rate : 0.5888          
##     P-Value [Acc > NIR] : 0.07002         
##                                           
##                   Kappa : 0.2554          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.5159          
##             Specificity : 0.7568          
##          Pos Pred Value : 0.7523          
##          Neg Pred Value : 0.5220          
##              Prevalence : 0.5887          
##          Detection Rate : 0.3038          
##    Detection Prevalence : 0.4037          
##       Balanced Accuracy : 0.6364          
##                                           
##        'Positive' Class : Yes             
## 

4.1. ROC curve for decision tree

The ROC curve shows the trade-off between the rates of correct and incorrect predictions.

The AUC metric ranges from 0.5 to 1.0.

Values >= 0.8 are good.

library(ROSE)
## Loaded ROSE 0.0-3
ROSE::roc.curve(valid_df$Hangover, class_tr_train_predict)

## Area under the curve (AUC): 0.658

4.2 Gains and lift charts for decision tree

Load the library.

library(modelplotr)
## Package modelplotr loaded! Happy model plotting!

Compute the scores.

# Score the validation set with the tree model; ntiles = 100 buckets the
# predicted probabilities into percentiles. Datasets/models are passed by
# name (string) and looked up in the calling environment.
scores_and_ntiles_dt <- prepare_scores_and_ntiles(datasets = 
                                                 list("valid_df"),
                                               dataset_labels = 
                                                 list("Validation data"),
                                               models = 
                                                 list("class_tr"),
                                               model_labels = 
                                                 list("Classification tree"),
                                               target_column = "Hangover",
                                               ntiles = 100)
## ... scoring caret model "class_tr" on dataset "valid_df".
## Data preparation step 1 succeeded! Dataframe created.
head(scores_and_ntiles_dt)
##            model_label   dataset_label y_true   prob_No  prob_Yes ntl_No
## 2  Classification tree Validation data    Yes 0.2312634 0.7687366    100
## 5  Classification tree Validation data    Yes 0.4179104 0.5820896     31
## 6  Classification tree Validation data     No 0.3820225 0.6179775     41
## 10 Classification tree Validation data     No 0.2312634 0.7687366     60
## 11 Classification tree Validation data    Yes 0.2312634 0.7687366     92
## 12 Classification tree Validation data    Yes 0.2312634 0.7687366     98
##    ntl_Yes
## 2       39
## 5       66
## 6       60
## 10      37
## 11       8
## 12       8

Specify the select_targetclass argument to the preferred class.

plot_input_dt <- plotting_scope(prepared_input = scores_and_ntiles_dt,
                                select_targetclass = "Yes")
## Data preparation step 2 succeeded! Dataframe created.
## "prepared_input" aggregated...
## Data preparation step 3 succeeded! Dataframe created.
## 
## No comparison specified, default values are used. 
## 
## Single evaluation line will be plotted: Target value "Yes" plotted for dataset "Validation data" and model "Classification tree.
## "
## -> To compare models, specify: scope = "compare_models"
## -> To compare datasets, specify: scope = "compare_datasets"
## -> To compare target classes, specify: scope = "compare_targetclasses"
## -> To plot one line, do not specify scope or specify scope = "no_comparison".
head(plot_input_dt)
##           scope         model_label   dataset_label target_class ntile neg pos
## 1 no_comparison Classification tree Validation data          Yes     0   0   0
## 2 no_comparison Classification tree Validation data          Yes     1   2   6
## 3 no_comparison Classification tree Validation data          Yes     2   3   5
## 4 no_comparison Classification tree Validation data          Yes     3   3   5
## 5 no_comparison Classification tree Validation data          Yes     4   2   6
## 6 no_comparison Classification tree Validation data          Yes     5   0   8
##   tot   pct negtot postot tottot  pcttot cumneg cumpos cumtot    cumpct
## 1   0    NA     NA     NA     NA      NA      0      0      0        NA
## 2   8 0.750    329    471    800 0.58875      2      6      8 0.7500000
## 3   8 0.625    329    471    800 0.58875      5     11     16 0.6875000
## 4   8 0.625    329    471    800 0.58875      8     16     24 0.6666667
## 5   8 0.750    329    471    800 0.58875     10     22     32 0.6875000
## 6   8 1.000    329    471    800 0.58875     10     30     40 0.7500000
##         gain    cumgain gain_ref   gain_opt     lift  cumlift cumlift_ref
## 1 0.00000000 0.00000000     0.00 0.00000000       NA       NA           1
## 2 0.01273885 0.01273885     0.01 0.01698514 1.273885 1.273885           1
## 3 0.01061571 0.02335456     0.02 0.03397028 1.061571 1.167728           1
## 4 0.01061571 0.03397028     0.03 0.05095541 1.061571 1.132343           1
## 5 0.01273885 0.04670913     0.04 0.06794055 1.273885 1.167728           1
## 6 0.01698514 0.06369427     0.05 0.08492569 1.698514 1.273885           1
##   legend
## 1    Yes
## 2    Yes
## 3    Yes
## 4    Yes
## 5    Yes
## 6    Yes

Cumulative gains for decision tree.

plot_cumgains(data = plot_input_dt)

Cumulative lift for decision tree.

plot_cumlift(data = plot_input_dt)

Response plot for decision tree.

plot_response(data = plot_input_dt)

Cumulative response plot for decision tree.

plot_cumresponse(data = plot_input_dt)

5. Logistic regression

The diagnostics also work for logistic regressions.

5.1 Load the library

This should have been done in the previous section.

library(caret)

5.2 Split the data

This should have been done in the previous section.

Set the seed using our favourite number.

set.seed(666)

Create the indices for the split. This samples the row indices to split the data into training and validation.

# Reproducible 60/40 split -- the seed above matches section 3, so this
# recreates the identical split.
n_rows <- nrow(hangover)
train_index <- sample(seq_len(n_rows), 0.6 * n_rows)
valid_index <- setdiff(seq_len(n_rows), train_index)

Using the indices, create the training and validation sets. This is similar in principle to splitting a data frame by row.

train_df <- hangover[train_index, ]
valid_df <- hangover[valid_index, ]

Check the splits.

nrow(train_df)
## [1] 1200
nrow(valid_df)
## [1] 800

5.3 Build the logistic regression

logistic_reg <- train(Hangover ~ ., data = train_df, method="glm", family="binomial")


summary(logistic_reg)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1549  -1.0082   0.5883   0.9563   1.9003  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    -1.501e+00  2.533e-01  -5.928 3.08e-09 ***
## NightMon                        1.347e-01  2.464e-01   0.547   0.5846    
## NightSat                        1.878e-01  2.453e-01   0.766   0.4439    
## NightSun                       -1.279e-01  2.348e-01  -0.545   0.5859    
## NightThu                       -4.151e-01  2.351e-01  -1.765   0.0775 .  
## NightTue                       -2.550e-01  2.313e-01  -1.102   0.2703    
## NightWed                       -3.263e-01  2.272e-01  -1.436   0.1510    
## `Ladies_NightNot Ladies Night`  7.490e-01  1.281e-01   5.847 5.00e-09 ***
## Number_of_Drinks                2.735e-01  2.338e-02  11.696  < 2e-16 ***
## Spent                           5.173e-05  2.204e-04   0.235   0.8145    
## Chow                            9.205e-02  1.272e-01   0.724   0.4693    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1643.2  on 1199  degrees of freedom
## Residual deviance: 1449.1  on 1189  degrees of freedom
## AIC: 1471.1
## 
## Number of Fisher Scoring iterations: 4

Predict using the logistic regression.

# type = "raw" returns hard class labels from a caret model.
logistic_reg_pred <- predict(logistic_reg, 
                             newdata = valid_df, type = "raw")

head(logistic_reg_pred)
## [1] Yes Yes Yes Yes Yes Yes
## Levels: No Yes

Compute the probabilities.

# type = "prob" returns per-class probabilities (one column per level).
logistic_reg_pred_prob <- predict(logistic_reg, 
                             newdata = valid_df, type = "prob")

head(logistic_reg_pred_prob)
##           No       Yes
## 2  0.1626688 0.8373312
## 5  0.4878738 0.5121262
## 6  0.2911174 0.7088826
## 10 0.4065387 0.5934613
## 11 0.1473743 0.8526257
## 12 0.2867861 0.7132139

The confusion matrix.

Caret requires the outcome variable to be a factor.

This should already be done. If not, factorise.

confusionMatrix(as.factor(logistic_reg_pred), 
                valid_df$Hangover, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  169 119
##        Yes 160 352
##                                           
##                Accuracy : 0.6512          
##                  95% CI : (0.6171, 0.6843)
##     No Information Rate : 0.5888          
##     P-Value [Acc > NIR] : 0.0001676       
##                                           
##                   Kappa : 0.266           
##                                           
##  Mcnemar's Test P-Value : 0.0166323       
##                                           
##             Sensitivity : 0.7473          
##             Specificity : 0.5137          
##          Pos Pred Value : 0.6875          
##          Neg Pred Value : 0.5868          
##              Prevalence : 0.5887          
##          Detection Rate : 0.4400          
##    Detection Prevalence : 0.6400          
##       Balanced Accuracy : 0.6305          
##                                           
##        'Positive' Class : Yes             
## 

If a cut off of a specific probability is preferred, let’s say 0.7.

confusionMatrix(as.factor(ifelse(logistic_reg_pred_prob[,2] > 0.7, "Yes", "No")), valid_df$Hangover,positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  266 276
##        Yes  63 195
##                                           
##                Accuracy : 0.5762          
##                  95% CI : (0.5412, 0.6108)
##     No Information Rate : 0.5888          
##     P-Value [Acc > NIR] : 0.775           
##                                           
##                   Kappa : 0.2027          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.4140          
##             Specificity : 0.8085          
##          Pos Pred Value : 0.7558          
##          Neg Pred Value : 0.4908          
##              Prevalence : 0.5887          
##          Detection Rate : 0.2437          
##    Detection Prevalence : 0.3225          
##       Balanced Accuracy : 0.6113          
##                                           
##        'Positive' Class : Yes             
## 

5.4 ROC curve for logistic regression

The ROC curve shows the trade-off between the rates of correct and incorrect predictions.

The AUC metric ranges from 0.5 to 1.0.

Values >= 0.8 are good.

Load the library. This should have been done in the previous section.

library(ROSE)

ROSE::roc.curve(valid_df$Hangover, logistic_reg_pred)

## Area under the curve (AUC): 0.631

5.5 Gains and lift charts for logistic regression

Note that this method only works if caret is used.

Load the library. This should have been done in the previous section.

library(modelplotr)

The scores need to be computed for the logistic regression.

We’ll use deciles instead of percentiles.

# Score the validation set with the logistic model; ntiles = 10 buckets
# the predicted probabilities into deciles.
scores_and_ntiles <- prepare_scores_and_ntiles(datasets = 
                                                 list("valid_df"),
                                               dataset_labels = 
                                                 list("Validation data"),
                                               models = 
                                                 list("logistic_reg"),
                                               model_labels = 
                                                 list("Logistic regression"),
                                               target_column = "Hangover",
                                               ntiles = 10)
## ... scoring caret model "logistic_reg" on dataset "valid_df".
## Data preparation step 1 succeeded! Dataframe created.
head(scores_and_ntiles)
##            model_label   dataset_label y_true   prob_No  prob_Yes ntl_No
## 2  Logistic regression Validation data    Yes 0.1626688 0.8373312     10
## 5  Logistic regression Validation data    Yes 0.4878738 0.5121262      4
## 6  Logistic regression Validation data     No 0.2911174 0.7088826      8
## 10 Logistic regression Validation data     No 0.4065387 0.5934613      6
## 11 Logistic regression Validation data    Yes 0.1473743 0.8526257     10
## 12 Logistic regression Validation data    Yes 0.2867861 0.7132139      8
##    ntl_Yes
## 2        1
## 5        7
## 6        3
## 10       5
## 11       1
## 12       3

Specify the select_targetclass argument to the preferred class.

plot_input <- plotting_scope(prepared_input = scores_and_ntiles,
                             select_targetclass = "Yes")
## Data preparation step 2 succeeded! Dataframe created.
## "prepared_input" aggregated...
## Data preparation step 3 succeeded! Dataframe created.
## 
## No comparison specified, default values are used. 
## 
## Single evaluation line will be plotted: Target value "Yes" plotted for dataset "Validation data" and model "Logistic regression.
## "
## -> To compare models, specify: scope = "compare_models"
## -> To compare datasets, specify: scope = "compare_datasets"
## -> To compare target classes, specify: scope = "compare_targetclasses"
## -> To plot one line, do not specify scope or specify scope = "no_comparison".
head(plot_input)
##           scope         model_label   dataset_label target_class ntile neg pos
## 1 no_comparison Logistic regression Validation data          Yes     0   0   0
## 2 no_comparison Logistic regression Validation data          Yes     1  16  64
## 3 no_comparison Logistic regression Validation data          Yes     2  17  63
## 4 no_comparison Logistic regression Validation data          Yes     3  26  54
## 5 no_comparison Logistic regression Validation data          Yes     4  24  56
## 6 no_comparison Logistic regression Validation data          Yes     5  30  50
##   tot    pct negtot postot tottot  pcttot cumneg cumpos cumtot    cumpct
## 1   0     NA     NA     NA     NA      NA      0      0      0        NA
## 2  80 0.8000    329    471    800 0.58875     16     64     80 0.8000000
## 3  80 0.7875    329    471    800 0.58875     33    127    160 0.7937500
## 4  80 0.6750    329    471    800 0.58875     59    181    240 0.7541667
## 5  80 0.7000    329    471    800 0.58875     83    237    320 0.7406250
## 6  80 0.6250    329    471    800 0.58875    113    287    400 0.7175000
##        gain   cumgain gain_ref  gain_opt     lift  cumlift cumlift_ref legend
## 1 0.0000000 0.0000000      0.0 0.0000000       NA       NA           1    Yes
## 2 0.1358811 0.1358811      0.1 0.1698514 1.358811 1.358811           1    Yes
## 3 0.1337580 0.2696391      0.2 0.3397028 1.337580 1.348195           1    Yes
## 4 0.1146497 0.3842887      0.3 0.5095541 1.146497 1.280962           1    Yes
## 5 0.1188960 0.5031847      0.4 0.6794055 1.188960 1.257962           1    Yes
## 6 0.1061571 0.6093418      0.5 0.8492569 1.061571 1.218684           1    Yes

Cumulative gains for logistic regression.

Variation: Highlight the 30th percentile or 3rd decile and change the colour.

plot_cumgains(data = plot_input)

plot_cumgains(data = plot_input, highlight_ntile = 3,
              custom_line_colors = "#1E9C33")
##  
## Plot annotation for plot: Cumulative gains
## - When we select 30% with the highest probability according to model Logistic regression, this selection holds 38% of all Yes cases in Validation data. 
##  
## 

Cumulative lift for logistic regression.

Variation: Highlight the 20th percentile or 2nd decile and change the colour.

plot_cumlift(data = plot_input)

plot_cumlift(data = plot_input, highlight_ntile = 2,
             custom_line_colors = "#1B18CC")
##  
## Plot annotation for plot: Cumulative lift
## - When we select 20% with the highest probability according to model Logistic regression in Validation data, this selection for Yes cases is 1.3 times better than selecting without a model. 
##  
## 

Response plot for logistic regression.

Variation: Highlight the 30th percentile or 3rd decile and change the colour.

plot_response(data = plot_input)

plot_response(data = plot_input, highlight_ntile = 3,
              custom_line_colors = "#B717BF")
##  
## Plot annotation for plot: Response
## - When we select ntile 3 according to model Logistic regression in dataset Validation data the % of Yes cases in the selection is 67.5%. 
##  
## 

Cumulative response plot for logistic regression.

Variation: Highlight the 20th percentile or 2nd decile and change the colour.

plot_cumresponse(data = plot_input)

plot_cumresponse(data = plot_input, highlight_ntile = 2,
              custom_line_colors = "#B37D12")
##  
## Plot annotation for plot: Cumulative response
## - When we select ntiles 1 until 2 according to model Logistic regression in dataset Validation data the % of Yes cases in the selection is 79.4%. 
##  
## 

Multiple plots together.

plot_multiplot(data = plot_input,
               custom_line_colors = "#552682") 

5.6 Financial plots for logistic regression

Applying the diagnostics to business.

plot_roi(data = plot_input,
         fixed_costs = 10000,
         variable_costs_per_unit = 200,
         profit_per_unit = 500)
##  
## Plot annotation for plot: Return on Investment (ROI)
## - When we select ntiles 1 until 4 in dataset Validation data using model Logistic regression to target Yes cases the expected return on investment is 60%. 
##  
## 

plot_costsrevs(data = plot_input,
               fixed_costs = 10000,
               variable_costs_per_unit = 200,
               profit_per_unit = 500,
               highlight_ntile = "max_roi",
               custom_line_colors = "#552682")
##  
## Plot annotation for plot: Costs and Revenues
## - When we select ntiles 1 until 4 in dataset Validation data using model  to target Yes cases the revenues are €118,500 
##  
## 

plot_profit(data = plot_input,
            fixed_costs = 10000,
            variable_costs_per_unit = 200,
            profit_per_unit = 500,
            highlight_ntile = 4,
            custom_line_colors = "#089E12")
##  
## Plot annotation for plot: Profit
## - When we select ntiles 1 until 4 in dataset Validation data using model Logistic regression to target Yes cases the expected profit is €44,500 
##  
## 

6. kNN

The diagnostics also work for kNN.

6.1 Load the library

This should have been done in the previous section.

library(caret)

6.2 Split the data

Load the data again. They have to be numerical for kNN.

Note that kNN may not work very well here.

Remove the first column.

Rename the target variable.

# Reload the raw data for kNN: the predictors must stay numeric.
hangover_knn <- read.csv("hangover_2.csv", header = TRUE,
                         fileEncoding = "latin1")

# Drop the identifier column by name.
hangover_knn <- hangover_knn[, names(hangover_knn) != "ID"]

# Recode the target to "Yes"/"No" and factorise it in one step for the
# classification algorithm.
hangover_knn$Hangover <- as.factor(ifelse(hangover_knn$Hangover == 1,
                                          "Yes", "No"))

str(hangover_knn)
## 'data.frame':    2000 obs. of  6 variables:
##  $ Night           : int  5 6 3 6 1 1 5 2 3 4 ...
##  $ Ladies_Night    : int  1 0 0 0 1 0 1 0 1 1 ...
##  $ Number_of_Drinks: int  2 8 3 1 5 5 3 10 5 8 ...
##  $ Spent           : int  703 287 346 312 919 926 193 710 47 280 ...
##  $ Chow            : int  1 0 1 0 0 1 1 1 0 1 ...
##  $ Hangover        : Factor w/ 2 levels "No","Yes": 1 2 1 2 2 1 1 2 1 1 ...

Set the seed using our favourite number.

set.seed(666)

Create the indices for the split. This samples the row indices to split the data into training and validation.

# Reproducible 60/40 split of the kNN data (same seed as before).
n_rows <- nrow(hangover_knn)
train_index <- sample(seq_len(n_rows), 0.6 * n_rows)
valid_index <- setdiff(seq_len(n_rows), train_index)

Using the indices, create the training and validation sets. This is similar in principle to splitting a data frame by row.

train_df <- hangover_knn[train_index, ]
valid_df <- hangover_knn[valid_index, ]

Check the splits.

nrow(train_df)
## [1] 1200
nrow(valid_df)
## [1] 800

6.3 Normalise

This is needed if predictors are on a different scale.

train_norm <- train_df
valid_norm <- valid_df
names(train_df)
## [1] "Night"            "Ladies_Night"     "Number_of_Drinks" "Spent"           
## [5] "Chow"             "Hangover"
names(valid_df)
## [1] "Night"            "Ladies_Night"     "Number_of_Drinks" "Spent"           
## [5] "Chow"             "Hangover"

Create a normalising algorithm using the 5 predictor variables in the training set (the target is excluded).

# Learn centring/scaling parameters from the TRAINING predictors only
# (column 6 is the target and is excluded), then apply them to the
# training set itself.
norm_values <- caret::preProcess(train_df[, -c(6)],
                          method = c("center",
                                     "scale"))
train_norm[, -c(6)] <- predict(norm_values,
                                train_df[, -c(6)])

head(train_norm)
##           Night Ladies_Night Number_of_Drinks      Spent       Chow Hangover
## 1598  1.5329482   -0.9700324       -1.1791359 -1.1099927 -1.0231854      Yes
## 638  -0.5030649   -0.9700324       -0.1440495  0.8224320  0.9765255       No
## 608  -1.0120682   -0.9700324       -0.8341071 -1.5428836  0.9765255      Yes
## 907  -0.5030649    1.0300344       -1.5241647  0.5834763 -1.0231854       No
## 1147  1.0239449   -0.9700324        0.5460081  1.1548922 -1.0231854      Yes
## 1564  1.5329482    1.0300344        1.2360657 -1.7160399 -1.0231854      Yes

Then using these normalising algorithm, predict the normalised values of the validation set.

# Apply the training-set normalisation to the validation predictors --
# never re-estimate centring/scaling on validation data (avoids leakage).
valid_norm[, -c(6)] <- predict(norm_values,
                                valid_df[, -c(6)])

head(valid_norm)
##           Night Ladies_Night Number_of_Drinks       Spent       Chow Hangover
## 2   1.023944930   -0.9700324        0.8910369 -0.79484820 -1.0231854      Yes
## 5  -1.521071466    1.0300344       -0.1440495  1.39384792 -1.0231854      Yes
## 6  -1.521071466   -0.9700324       -0.1440495  1.41808981  0.9765255       No
## 10  0.005938372    1.0300344        0.8910369 -0.81909009  0.9765255       No
## 11 -1.012068187   -0.9700324        1.5810945  0.05361786 -1.0231854      Yes
## 12 -0.503064908    1.0300344        1.5810945 -1.55327297 -1.0231854      Yes

6.4 Build the kNN model

Train the kNN model using k = 5.

Other values of k can be used too.

# Train kNN with a fixed k = 5 on the NORMALISED training data.
knn_model_k5 <- caret::train(Hangover ~ ., data = train_norm,
                            tuneGrid = data.frame(k = 5), method = "knn")

library(caret)
# BUG FIX: predict on the normalised validation set (was valid_df). The
# model was trained on centred/scaled predictors, so feeding it the raw
# validation data distorts the distance computations and degrades kNN to
# near-chance performance (cf. the AUC of 0.501 reported below).
knn_pred_k5 <- predict(knn_model_k5, newdata = valid_norm)
head(knn_pred_k5)
## [1] Yes Yes Yes Yes Yes Yes
## Levels: No Yes

Confusion matrix for the model.

# Column 6 is the target, which was never normalised, so valid_df and
# valid_norm agree here.
confusionMatrix(knn_pred_k5, as.factor(valid_df[, 6]),
                positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No    3   3
##        Yes 326 468
##                                           
##                Accuracy : 0.5888          
##                  95% CI : (0.5538, 0.6231)
##     No Information Rate : 0.5888          
##     P-Value [Acc > NIR] : 0.5152          
##                                           
##                   Kappa : 0.0032          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.993631        
##             Specificity : 0.009119        
##          Pos Pred Value : 0.589421        
##          Neg Pred Value : 0.500000        
##              Prevalence : 0.588750        
##          Detection Rate : 0.585000        
##    Detection Prevalence : 0.992500        
##       Balanced Accuracy : 0.501375        
##                                           
##        'Positive' Class : Yes             
## 

Associated probabilities.

knn_pred_k5_prob <- predict(knn_model_k5, newdata = valid_df, type = "prob")
head(knn_pred_k5_prob)
##          No       Yes
## 1 0.2857143 0.7142857
## 2 0.2222222 0.7777778
## 3 0.2222222 0.7777778
## 4 0.3333333 0.6666667
## 5 0.2222222 0.7777778
## 6 0.4000000 0.6000000

6.5 ROC curve for kNN

The ROC curve shows the trade-off between the rates of correct and incorrect predictions.

The AUC metric ranges from 0.5 to 1.0.

Values >= 0.8 are good.

Load the library. This should have been done in the previous section.

library(ROSE)

ROSE::roc.curve(valid_df$Hangover, knn_pred_k5)

## Area under the curve (AUC): 0.501

6.6 Gains and lift charts for kNN

Note that this method only works if caret is used.

Load the library. This should have been done in the previous section.

library(modelplotr)

The scores need to be computed for the kNN model.

We’ll use deciles instead of percentiles.

# Score the NORMALISED validation set -- the kNN model was trained on
# centred/scaled predictors, so scoring the raw valid_df would distort the
# distance computations (the same issue as predicting on valid_df above).
scores_and_ntiles <- prepare_scores_and_ntiles(datasets = 
                                                 list("valid_norm"),
                                               dataset_labels = 
                                                 list("Validation data"),
                                               models = 
                                                 list("knn_model_k5"),
                                               model_labels = 
                                                 list("k Nearest Neighbours"),
                                               target_column = "Hangover",
                                               ntiles = 10)
## ... scoring caret model "knn_model_k5" on dataset "valid_df".
## Data preparation step 1 succeeded! Dataframe created.
head(scores_and_ntiles)
##             model_label   dataset_label y_true   prob_No  prob_Yes ntl_No
## 2  k Nearest Neighbours Validation data    Yes 0.2857143 0.7142857      2
## 5  k Nearest Neighbours Validation data    Yes 0.2222222 0.7777778      8
## 6  k Nearest Neighbours Validation data     No 0.2222222 0.7777778      7
## 10 k Nearest Neighbours Validation data     No 0.3333333 0.6666667      2
## 11 k Nearest Neighbours Validation data    Yes 0.2222222 0.7777778      6
## 12 k Nearest Neighbours Validation data    Yes 0.4000000 0.6000000      1
##    ntl_Yes
## 2        8
## 5        3
## 6        6
## 10      10
## 11       6
## 12      10

Specify the select_targetclass argument to the preferred class.

# Aggregate the scored cases per decile for plotting, targeting the
# "Yes" (hangover) class. With no scope argument, a single evaluation
# line is prepared (scope = "no_comparison").
plot_input <- plotting_scope(prepared_input = scores_and_ntiles,
                             select_targetclass = "Yes")
## Data preparation step 2 succeeded! Dataframe created.
## "prepared_input" aggregated...
## Data preparation step 3 succeeded! Dataframe created.
## 
## No comparison specified, default values are used. 
## 
## Single evaluation line will be plotted: Target value "Yes" plotted for dataset "Validation data" and model "k Nearest Neighbours.
## "
## -> To compare models, specify: scope = "compare_models"
## -> To compare datasets, specify: scope = "compare_datasets"
## -> To compare target classes, specify: scope = "compare_targetclasses"
## -> To plot one line, do not specify scope or specify scope = "no_comparison".
# Per-decile counts plus cumulative gain/lift columns used by the plots.
head(plot_input)
##           scope          model_label   dataset_label target_class ntile neg pos
## 1 no_comparison k Nearest Neighbours Validation data          Yes     0   0   0
## 2 no_comparison k Nearest Neighbours Validation data          Yes     1  35  45
## 3 no_comparison k Nearest Neighbours Validation data          Yes     2  31  49
## 4 no_comparison k Nearest Neighbours Validation data          Yes     3  34  46
## 5 no_comparison k Nearest Neighbours Validation data          Yes     4  30  50
## 6 no_comparison k Nearest Neighbours Validation data          Yes     5  28  52
##   tot    pct negtot postot tottot  pcttot cumneg cumpos cumtot    cumpct
## 1   0     NA     NA     NA     NA      NA      0      0      0        NA
## 2  80 0.5625    329    471    800 0.58875     35     45     80 0.5625000
## 3  80 0.6125    329    471    800 0.58875     66     94    160 0.5875000
## 4  80 0.5750    329    471    800 0.58875    100    140    240 0.5833333
## 5  80 0.6250    329    471    800 0.58875    130    190    320 0.5937500
## 6  80 0.6500    329    471    800 0.58875    158    242    400 0.6050000
##         gain   cumgain gain_ref  gain_opt      lift   cumlift cumlift_ref
## 1 0.00000000 0.0000000      0.0 0.0000000        NA        NA           1
## 2 0.09554140 0.0955414      0.1 0.1698514 0.9554140 0.9554140           1
## 3 0.10403397 0.1995754      0.2 0.3397028 1.0403397 0.9978769           1
## 4 0.09766454 0.2972399      0.3 0.5095541 0.9766454 0.9907997           1
## 5 0.10615711 0.4033970      0.4 0.6794055 1.0615711 1.0084926           1
## 6 0.11040340 0.5138004      0.5 0.8492569 1.1040340 1.0276008           1
##   legend
## 1    Yes
## 2    Yes
## 3    Yes
## 4    Yes
## 5    Yes
## 6    Yes

Cumulative gains for the kNN model.

Variation: Highlight the 30th percentile or 3rd decile and change the colour.

# Cumulative gains chart with default settings.
plot_cumgains(data = plot_input)

# Same chart, highlighting the 3rd decile with a custom line colour.
plot_cumgains(data = plot_input, highlight_ntile = 3,
              custom_line_colors = "#1E9C33")
##  
## Plot annotation for plot: Cumulative gains
## - When we select 30% with the highest probability according to model k Nearest Neighbours, this selection holds 30% of all Yes cases in Validation data. 
##  
## 

Cumulative lift for the kNN model.

Variation: Highlight the 20th percentile or 2nd decile and change the colour.

# Cumulative lift chart with default settings.
plot_cumlift(data = plot_input)

# Same chart, highlighting the 2nd decile with a custom line colour.
plot_cumlift(data = plot_input, highlight_ntile = 2,
             custom_line_colors = "#1B18CC")
##  
## Plot annotation for plot: Cumulative lift
## - When we select 20% with the highest probability according to model k Nearest Neighbours in Validation data, this selection for Yes cases is 1.0 times better than selecting without a model. 
##  
## 

Response plot for the kNN model.

Variation: Highlight the 30th percentile or 3rd decile and change the colour.

# Response chart (per-decile % of Yes cases) with default settings.
plot_response(data = plot_input)

# Same chart, highlighting the 3rd decile with a custom line colour.
plot_response(data = plot_input, highlight_ntile = 3,
              custom_line_colors = "#B717BF")
##  
## Plot annotation for plot: Response
## - When we select ntile 3 according to model k Nearest Neighbours in dataset Validation data the % of Yes cases in the selection is 57.5%. 
##  
## 

Cumulative response plot for the kNN model.

Variation: Highlight the 20th percentile or 2nd decile and change the colour.

# Cumulative response chart with default settings.
plot_cumresponse(data = plot_input)

# Same chart, highlighting the 2nd decile with a custom line colour.
plot_cumresponse(data = plot_input, highlight_ntile = 2,
              custom_line_colors = "#B37D12")
##  
## Plot annotation for plot: Cumulative response
## - When we select ntiles 1 until 2 according to model k Nearest Neighbours in dataset Validation data the % of Yes cases in the selection is 58.8%. 
##  
## 

Multiple plots together.

# All four charts (gains, lift, response, cumulative response) in one figure.
plot_multiplot(data = plot_input,
               custom_line_colors = "#552682")