When we woke up, we were wasted and drunk.
Determine whether one gets a hangover, and the diagnostics.
Load the data.
Then explore the data.
# Read the raw data; fileEncoding = "latin1" guards against non-ASCII characters.
hangover <- read.csv("hangover_2.csv", header = TRUE, fileEncoding = "latin1")
# Inspect the column names.
names(hangover)
## [1] "ID" "Night" "Ladies_Night" "Number_of_Drinks"
## [5] "Spent" "Chow" "Hangover"
# Peek at the first 10 rows to check the structure.
head(hangover, 10)
## ID Night Ladies_Night Number_of_Drinks Spent Chow Hangover
## 1 1 5 1 2 703 1 0
## 2 2 6 0 8 287 0 1
## 3 3 3 0 3 346 1 0
## 4 4 6 0 1 312 0 1
## 5 5 1 1 5 919 0 1
## 6 6 1 0 5 926 1 0
## 7 7 5 1 3 193 1 0
## 8 8 2 0 10 710 1 1
## 9 9 3 1 5 47 0 0
## 10 10 4 1 8 280 1 0
# Count the observations.
nrow(hangover)
Remove the ID field.
# Drop the ID column (column 1): it is a row identifier with no predictive value.
hangover <- hangover[, -c(1)]
names(hangover)
## [1] "Night" "Ladies_Night" "Number_of_Drinks" "Spent"
## [5] "Chow" "Hangover"
Check the target variable.
# Class balance of the target: 851 "no hangover" (0) vs 1149 "hangover" (1).
table(hangover$Hangover)
##
## 0 1
## 851 1149
Change variables to categorical and recode. This makes it easier to make sense of the data.
For Ladies Night.
# Distribution of the 0/1 Ladies Night flag before recoding.
table(hangover$Ladies_Night)
##
## 0 1
## 1015 985
str(hangover$Ladies_Night)
## int [1:2000] 1 0 0 0 1 0 1 0 1 1 ...
# Recode the 0/1 flag to descriptive labels. Note: ifelse() returns a
# character vector, so the as.factor() conversion that previously preceded
# this line was discarded immediately -- that redundant step has been removed.
hangover$Ladies_Night <- ifelse(hangover$Ladies_Night == 1, "Ladies Night",
"Not Ladies Night")
table(hangover$Ladies_Night)
##
## Ladies Night Not Ladies Night
## 985 1015
Recode the night of the week.
# car::recode maps the numeric day codes (1-7) to day-name labels.
library(car)
## Loading required package: carData
hangover$Night <- recode(hangover$Night,
" '1' = 'Mon'; '2' = 'Tue'; '3' = 'Wed';
'4' = 'Thu'; '5' = 'Fri'; '6' = 'Sat';
'7' = 'Sun'")
# Counts per night (table sorts the labels alphabetically, not chronologically).
table(hangover$Night)
##
## Fri Mon Sat Sun Thu Tue Wed
## 292 262 268 278 280 305 315
Recode Hangover.
str(hangover$Hangover)
## int [1:2000] 0 1 0 1 1 0 0 1 0 0 ...
# Recode the 0/1 target to Yes/No labels. ifelse() returns a character
# vector, so the as.factor() call that previously preceded this line was
# discarded immediately and has been removed; the variable is factorised
# once, just before modelling.
hangover$Hangover <- ifelse(hangover$Hangover == 1, "Yes",
"No")
table(hangover$Hangover)
##
## No Yes
## 851 1149
Recode as a factor for the classification algorithm to work.
# Classification algorithms (rpart/caret) require a factor target, so
# convert the character Yes/No labels to a factor.
str(hangover$Hangover)
## chr [1:2000] "No" "Yes" "No" "Yes" "Yes" "No" "No" "Yes" "No" "No" "Yes" ...
hangover$Hangover <- as.factor(hangover$Hangover)
Set the seed using our favourite number.
# Fix the RNG seed so the train/validation split is reproducible.
set.seed(666)
Create the indices for the split This samples the row indices to split the data into training and validation.
# 60% of row indices sampled for training ...
train_index <- sample(1:nrow(hangover), 0.6 * nrow(hangover))
# ... and the remaining 40% for validation.
valid_index <- setdiff(1:nrow(hangover), train_index)
Using the indices, create the training and validation sets This is similar in principle to splitting a data frame by row.
train_df <- hangover[train_index, ]
valid_df <- hangover[valid_index, ]
Check the splits.
nrow(train_df)
## [1] 1200
nrow(valid_df)
## [1] 800
Load the libraries.
library(rpart)
library(rpart.plot)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
Train the decision tree.
# Classification tree on all predictors; maxdepth = 10 caps tree depth to
# limit overfitting.
class_tr <- rpart(Hangover ~ ., data = train_df,
method = "class",
maxdepth = 10)
# Plot the fitted tree; cex and tweak control label sizing.
prp(class_tr, cex = 0.8, tweak = 1)
Predict using the trained decision tree.
# Hard class predictions for the validation set.
class_tr_train_predict <- predict(class_tr, valid_df,
type = "class")
# Confusion matrix with "Yes" (hangover) treated as the positive class.
confusionMatrix(class_tr_train_predict, valid_df$Hangover,
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 158 77
## Yes 171 394
##
## Accuracy : 0.69
## 95% CI : (0.6567, 0.7219)
## No Information Rate : 0.5888
## P-Value [Acc > NIR] : 2.091e-09
##
## Kappa : 0.331
##
## Mcnemar's Test P-Value : 3.516e-09
##
## Sensitivity : 0.8365
## Specificity : 0.4802
## Pos Pred Value : 0.6973
## Neg Pred Value : 0.6723
## Prevalence : 0.5887
## Detection Rate : 0.4925
## Detection Prevalence : 0.7063
## Balanced Accuracy : 0.6584
##
## 'Positive' Class : Yes
##
The probabilities from the prediction.
# Class-membership probabilities (one column per class) for the validation set.
prob_2 <- predict(class_tr, newdata = valid_df, type = "prob")
head(prob_2)
## No Yes
## 2 0.2312634 0.7687366
## 5 0.4179104 0.5820896
## 6 0.3820225 0.6179775
## 10 0.2312634 0.7687366
## 11 0.2312634 0.7687366
## 12 0.2312634 0.7687366
If a cut off of a specific probability is preferred, let’s say 0.7.
confusionMatrix(as.factor(ifelse(prob_2[,2] > 0.7, "Yes", "No")), valid_df$Hangover,positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 249 228
## Yes 80 243
##
## Accuracy : 0.615
## 95% CI : (0.5803, 0.6489)
## No Information Rate : 0.5888
## P-Value [Acc > NIR] : 0.07002
##
## Kappa : 0.2554
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.5159
## Specificity : 0.7568
## Pos Pred Value : 0.7523
## Neg Pred Value : 0.5220
## Prevalence : 0.5887
## Detection Rate : 0.3038
## Detection Prevalence : 0.4037
## Balanced Accuracy : 0.6364
##
## 'Positive' Class : Yes
##
The ROC curve shows the trade-off between the rates of correct and incorrect predictions.
The AUC metric ranges from 0.5 (no better than chance) to 1.0 (perfect).
Values >= 0.8 are good.
library(ROSE)
## Loaded ROSE 0.0-3
# ROC/AUC computed from the hard class predictions rather than the "Yes"
# probabilities; NOTE(review): this typically understates the achievable
# AUC -- consider passing prob_2[, 2] instead. TODO confirm intent.
ROSE::roc.curve(valid_df$Hangover, class_tr_train_predict)
## Area under the curve (AUC): 0.658
Load the library.
library(modelplotr)
## Package modelplotr loaded! Happy model plotting!
Compute the scores.
# prepare_scores_and_ntiles scores the named model(s) on the named dataset(s)
# (both passed by name as strings) and assigns each row to a probability
# ntile; ntiles = 100 gives percentiles.
scores_and_ntiles_dt <- prepare_scores_and_ntiles(datasets =
list("valid_df"),
dataset_labels =
list("Validation data"),
models =
list("class_tr"),
model_labels =
list("Classification tree"),
target_column = "Hangover",
ntiles = 100)
## ... scoring caret model "class_tr" on dataset "valid_df".
## Data preparation step 1 succeeded! Dataframe created.
head(scores_and_ntiles_dt)
## model_label dataset_label y_true prob_No prob_Yes ntl_No
## 2 Classification tree Validation data Yes 0.2312634 0.7687366 100
## 5 Classification tree Validation data Yes 0.4179104 0.5820896 31
## 6 Classification tree Validation data No 0.3820225 0.6179775 41
## 10 Classification tree Validation data No 0.2312634 0.7687366 60
## 11 Classification tree Validation data Yes 0.2312634 0.7687366 92
## 12 Classification tree Validation data Yes 0.2312634 0.7687366 98
## ntl_Yes
## 2 39
## 5 66
## 6 60
## 10 37
## 11 8
## 12 8
Specify the select_targetclass argument to the preferred class.
# Aggregate the scored rows into plot-ready form, evaluating the "Yes" class.
plot_input_dt <- plotting_scope(prepared_input = scores_and_ntiles_dt,
select_targetclass = "Yes")
## Data preparation step 2 succeeded! Dataframe created.
## "prepared_input" aggregated...
## Data preparation step 3 succeeded! Dataframe created.
##
## No comparison specified, default values are used.
##
## Single evaluation line will be plotted: Target value "Yes" plotted for dataset "Validation data" and model "Classification tree.
## "
## -> To compare models, specify: scope = "compare_models"
## -> To compare datasets, specify: scope = "compare_datasets"
## -> To compare target classes, specify: scope = "compare_targetclasses"
## -> To plot one line, do not specify scope or specify scope = "no_comparison".
head(plot_input_dt)
## scope model_label dataset_label target_class ntile neg pos
## 1 no_comparison Classification tree Validation data Yes 0 0 0
## 2 no_comparison Classification tree Validation data Yes 1 2 6
## 3 no_comparison Classification tree Validation data Yes 2 3 5
## 4 no_comparison Classification tree Validation data Yes 3 3 5
## 5 no_comparison Classification tree Validation data Yes 4 2 6
## 6 no_comparison Classification tree Validation data Yes 5 0 8
## tot pct negtot postot tottot pcttot cumneg cumpos cumtot cumpct
## 1 0 NA NA NA NA NA 0 0 0 NA
## 2 8 0.750 329 471 800 0.58875 2 6 8 0.7500000
## 3 8 0.625 329 471 800 0.58875 5 11 16 0.6875000
## 4 8 0.625 329 471 800 0.58875 8 16 24 0.6666667
## 5 8 0.750 329 471 800 0.58875 10 22 32 0.6875000
## 6 8 1.000 329 471 800 0.58875 10 30 40 0.7500000
## gain cumgain gain_ref gain_opt lift cumlift cumlift_ref
## 1 0.00000000 0.00000000 0.00 0.00000000 NA NA 1
## 2 0.01273885 0.01273885 0.01 0.01698514 1.273885 1.273885 1
## 3 0.01061571 0.02335456 0.02 0.03397028 1.061571 1.167728 1
## 4 0.01061571 0.03397028 0.03 0.05095541 1.061571 1.132343 1
## 5 0.01273885 0.04670913 0.04 0.06794055 1.273885 1.167728 1
## 6 0.01698514 0.06369427 0.05 0.08492569 1.698514 1.273885 1
## legend
## 1 Yes
## 2 Yes
## 3 Yes
## 4 Yes
## 5 Yes
## 6 Yes
Cumulative gains for decision tree.
# The four standard modelplotr diagnostics for the classification tree.
plot_cumgains(data = plot_input_dt)
Cumulative lift for decision tree.
plot_cumlift(data = plot_input_dt)
Response plot for decision tree.
plot_response(data = plot_input_dt)
Cumulative response plot for decision tree.
plot_cumresponse(data = plot_input_dt)
The diagnostics also work for logistic regressions.
This should have been done in the previous section.
library(caret)
This should have been done in the previous section.
Set the seed using our favourite number.
# Same seed as before, so the split below is identical to the tree section's.
set.seed(666)
Create the indices for the split This samples the row indices to split the data into training and validation.
train_index <- sample(1:nrow(hangover), 0.6 * nrow(hangover))
valid_index <- setdiff(1:nrow(hangover), train_index)
Using the indices, create the training and validation sets This is similar in principle to splitting a data frame by row.
train_df <- hangover[train_index, ]
valid_df <- hangover[valid_index, ]
Check the splits.
nrow(train_df)
## [1] 1200
nrow(valid_df)
## [1] 800
# Logistic regression (binomial glm) fitted through caret's train wrapper.
logistic_reg <- train(Hangover ~ ., data = train_df, method="glm", family="binomial")
summary(logistic_reg)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1549 -1.0082 0.5883 0.9563 1.9003
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.501e+00 2.533e-01 -5.928 3.08e-09 ***
## NightMon 1.347e-01 2.464e-01 0.547 0.5846
## NightSat 1.878e-01 2.453e-01 0.766 0.4439
## NightSun -1.279e-01 2.348e-01 -0.545 0.5859
## NightThu -4.151e-01 2.351e-01 -1.765 0.0775 .
## NightTue -2.550e-01 2.313e-01 -1.102 0.2703
## NightWed -3.263e-01 2.272e-01 -1.436 0.1510
## `Ladies_NightNot Ladies Night` 7.490e-01 1.281e-01 5.847 5.00e-09 ***
## Number_of_Drinks 2.735e-01 2.338e-02 11.696 < 2e-16 ***
## Spent 5.173e-05 2.204e-04 0.235 0.8145
## Chow 9.205e-02 1.272e-01 0.724 0.4693
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1643.2 on 1199 degrees of freedom
## Residual deviance: 1449.1 on 1189 degrees of freedom
## AIC: 1471.1
##
## Number of Fisher Scoring iterations: 4
Predict using the logistic regression.
# type = "raw" returns hard class labels from a caret model.
logistic_reg_pred <- predict(logistic_reg,
newdata = valid_df, type = "raw")
head(logistic_reg_pred)
## [1] Yes Yes Yes Yes Yes Yes
## Levels: No Yes
Compute the probabilities.
# type = "prob" returns class-membership probabilities (one column per class).
logistic_reg_pred_prob <- predict(logistic_reg,
newdata = valid_df, type = "prob")
head(logistic_reg_pred_prob)
## No Yes
## 2 0.1626688 0.8373312
## 5 0.4878738 0.5121262
## 6 0.2911174 0.7088826
## 10 0.4065387 0.5934613
## 11 0.1473743 0.8526257
## 12 0.2867861 0.7132139
The confusion matrix.
Caret requires the outcome variable to be a factor.
This should already be done. If not, factorise.
# Confusion matrix at the default 0.5 cut-off; "Yes" is the positive class.
confusionMatrix(as.factor(logistic_reg_pred),
valid_df$Hangover, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 169 119
## Yes 160 352
##
## Accuracy : 0.6512
## 95% CI : (0.6171, 0.6843)
## No Information Rate : 0.5888
## P-Value [Acc > NIR] : 0.0001676
##
## Kappa : 0.266
##
## Mcnemar's Test P-Value : 0.0166323
##
## Sensitivity : 0.7473
## Specificity : 0.5137
## Pos Pred Value : 0.6875
## Neg Pred Value : 0.5868
## Prevalence : 0.5887
## Detection Rate : 0.4400
## Detection Prevalence : 0.6400
## Balanced Accuracy : 0.6305
##
## 'Positive' Class : Yes
##
If a cut off of a specific probability is preferred, let’s say 0.7.
confusionMatrix(as.factor(ifelse(logistic_reg_pred_prob[,2] > 0.7, "Yes", "No")), valid_df$Hangover,positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 266 276
## Yes 63 195
##
## Accuracy : 0.5762
## 95% CI : (0.5412, 0.6108)
## No Information Rate : 0.5888
## P-Value [Acc > NIR] : 0.775
##
## Kappa : 0.2027
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.4140
## Specificity : 0.8085
## Pos Pred Value : 0.7558
## Neg Pred Value : 0.4908
## Prevalence : 0.5887
## Detection Rate : 0.2437
## Detection Prevalence : 0.3225
## Balanced Accuracy : 0.6113
##
## 'Positive' Class : Yes
##
The ROC curve shows the trade-off between the rates of correct and incorrect predictions.
The AUC metric ranges from 0.5 (no better than chance) to 1.0 (perfect).
Values >= 0.8 are good.
Load the library. This should have been done in the previous section.
library(ROSE)
# NOTE(review): ROC computed from hard class labels again; using
# logistic_reg_pred_prob[, 2] would give a probability-based AUC -- confirm.
ROSE::roc.curve(valid_df$Hangover, logistic_reg_pred)
## Area under the curve (AUC): 0.631
Note that this method only works if caret is used.
Load the library. This should have been done in the previous section.
library(modelplotr)
The scores need to be computed for the logistic regression.
We’ll use deciles instead of percentiles.
# Score the logistic model on the validation data; ntiles = 10 gives deciles.
scores_and_ntiles <- prepare_scores_and_ntiles(datasets =
list("valid_df"),
dataset_labels =
list("Validation data"),
models =
list("logistic_reg"),
model_labels =
list("Logistic regression"),
target_column = "Hangover",
ntiles = 10)
## ... scoring caret model "logistic_reg" on dataset "valid_df".
## Data preparation step 1 succeeded! Dataframe created.
head(scores_and_ntiles)
## model_label dataset_label y_true prob_No prob_Yes ntl_No
## 2 Logistic regression Validation data Yes 0.1626688 0.8373312 10
## 5 Logistic regression Validation data Yes 0.4878738 0.5121262 4
## 6 Logistic regression Validation data No 0.2911174 0.7088826 8
## 10 Logistic regression Validation data No 0.4065387 0.5934613 6
## 11 Logistic regression Validation data Yes 0.1473743 0.8526257 10
## 12 Logistic regression Validation data Yes 0.2867861 0.7132139 8
## ntl_Yes
## 2 1
## 5 7
## 6 3
## 10 5
## 11 1
## 12 3
Specify the select_targetclass argument to the preferred class.
# Aggregate the scored rows into plot-ready form, evaluating the "Yes" class.
plot_input <- plotting_scope(prepared_input = scores_and_ntiles,
select_targetclass = "Yes")
## Data preparation step 2 succeeded! Dataframe created.
## "prepared_input" aggregated...
## Data preparation step 3 succeeded! Dataframe created.
##
## No comparison specified, default values are used.
##
## Single evaluation line will be plotted: Target value "Yes" plotted for dataset "Validation data" and model "Logistic regression.
## "
## -> To compare models, specify: scope = "compare_models"
## -> To compare datasets, specify: scope = "compare_datasets"
## -> To compare target classes, specify: scope = "compare_targetclasses"
## -> To plot one line, do not specify scope or specify scope = "no_comparison".
head(plot_input)
## scope model_label dataset_label target_class ntile neg pos
## 1 no_comparison Logistic regression Validation data Yes 0 0 0
## 2 no_comparison Logistic regression Validation data Yes 1 16 64
## 3 no_comparison Logistic regression Validation data Yes 2 17 63
## 4 no_comparison Logistic regression Validation data Yes 3 26 54
## 5 no_comparison Logistic regression Validation data Yes 4 24 56
## 6 no_comparison Logistic regression Validation data Yes 5 30 50
## tot pct negtot postot tottot pcttot cumneg cumpos cumtot cumpct
## 1 0 NA NA NA NA NA 0 0 0 NA
## 2 80 0.8000 329 471 800 0.58875 16 64 80 0.8000000
## 3 80 0.7875 329 471 800 0.58875 33 127 160 0.7937500
## 4 80 0.6750 329 471 800 0.58875 59 181 240 0.7541667
## 5 80 0.7000 329 471 800 0.58875 83 237 320 0.7406250
## 6 80 0.6250 329 471 800 0.58875 113 287 400 0.7175000
## gain cumgain gain_ref gain_opt lift cumlift cumlift_ref legend
## 1 0.0000000 0.0000000 0.0 0.0000000 NA NA 1 Yes
## 2 0.1358811 0.1358811 0.1 0.1698514 1.358811 1.358811 1 Yes
## 3 0.1337580 0.2696391 0.2 0.3397028 1.337580 1.348195 1 Yes
## 4 0.1146497 0.3842887 0.3 0.5095541 1.146497 1.280962 1 Yes
## 5 0.1188960 0.5031847 0.4 0.6794055 1.188960 1.257962 1 Yes
## 6 0.1061571 0.6093418 0.5 0.8492569 1.061571 1.218684 1 Yes
Cumulative gains for logistic regression.
Variation: Highlight the 30th percentile or 3rd decile and change the colour.
# Plain cumulative gains, then a variant highlighting the 3rd decile.
plot_cumgains(data = plot_input)
plot_cumgains(data = plot_input, highlight_ntile = 3,
custom_line_colors = "#1E9C33")
##
## Plot annotation for plot: Cumulative gains
## - When we select 30% with the highest probability according to model Logistic regression, this selection holds 38% of all Yes cases in Validation data.
##
##
Cumulative lift for logistic regression.
Variation: Highlight the 20th percentile or 2nd decile and change the colour.
# Plain cumulative lift, then a variant highlighting the 2nd decile.
plot_cumlift(data = plot_input)
plot_cumlift(data = plot_input, highlight_ntile = 2,
custom_line_colors = "#1B18CC")
##
## Plot annotation for plot: Cumulative lift
## - When we select 20% with the highest probability according to model Logistic regression in Validation data, this selection for Yes cases is 1.3 times better than selecting without a model.
##
##
Response plot for logistic regression.
Variation: Highlight the 30th percentile or 3rd decile and change the colour.
# Response plot: % of Yes cases within each individual decile.
plot_response(data = plot_input)
plot_response(data = plot_input, highlight_ntile = 3,
custom_line_colors = "#B717BF")
##
## Plot annotation for plot: Response
## - When we select ntile 3 according to model Logistic regression in dataset Validation data the % of Yes cases in the selection is 67.5%.
##
##
Cumulative response plot for logistic regression.
Variation: Highlight the 20th percentile or 2nd decile and change the colour.
# Cumulative response: % of Yes cases in deciles 1..n combined.
plot_cumresponse(data = plot_input)
plot_cumresponse(data = plot_input, highlight_ntile = 2,
custom_line_colors = "#B37D12")
##
## Plot annotation for plot: Cumulative response
## - When we select ntiles 1 until 2 according to model Logistic regression in dataset Validation data the % of Yes cases in the selection is 79.4%.
##
##
Multiple plots together.
# All four diagnostics on one canvas.
plot_multiplot(data = plot_input,
custom_line_colors = "#552682")
Applying the diagnostics to business.
# ROI curve: fixed/variable costs and per-unit profit translate the model's
# targeting into expected return on investment per decile selection.
plot_roi(data = plot_input,
fixed_costs = 10000,
variable_costs_per_unit = 200,
profit_per_unit = 500)
##
## Plot annotation for plot: Return on Investment (ROI)
## - When we select ntiles 1 until 4 in dataset Validation data using model Logistic regression to target Yes cases the expected return on investment is 60%.
##
##
# Costs vs revenues; highlight_ntile = "max_roi" marks the ROI-maximising decile.
plot_costsrevs(data = plot_input,
fixed_costs = 10000,
variable_costs_per_unit = 200,
profit_per_unit = 500,
highlight_ntile = "max_roi",
custom_line_colors = "#552682")
##
## Plot annotation for plot: Costs and Revenues
## - When we select ntiles 1 until 4 in dataset Validation data using model to target Yes cases the revenues are €118,500
##
##
# Expected profit curve, highlighting decile 4.
plot_profit(data = plot_input,
fixed_costs = 10000,
variable_costs_per_unit = 200,
profit_per_unit = 500,
highlight_ntile = 4,
custom_line_colors = "#089E12")
##
## Plot annotation for plot: Profit
## - When we select ntiles 1 until 4 in dataset Validation data using model Logistic regression to target Yes cases the expected profit is €44,500
##
##
The diagnostics also work for kNN.
This should have been done in the previous section.
library(caret)
Load the data again. They have to be numerical for kNN.
Note that kNN may not work very well here.
Remove the first column.
Rename the target variable.
# Fresh copy with all-numeric predictors (kNN distances need numeric input).
hangover_knn <- read.csv("hangover_2.csv", header = TRUE, fileEncoding = "latin1")
# Drop the ID column.
hangover_knn <- hangover_knn[, -c(1)]
# Relabel the target, then factorise it (column 6 is Hangover).
hangover_knn$Hangover <- ifelse(hangover_knn$Hangover == 1, "Yes",
"No")
hangover_knn[,6] <- as.factor(hangover_knn[,6])
str(hangover_knn)
## 'data.frame': 2000 obs. of 6 variables:
## $ Night : int 5 6 3 6 1 1 5 2 3 4 ...
## $ Ladies_Night : int 1 0 0 0 1 0 1 0 1 1 ...
## $ Number_of_Drinks: int 2 8 3 1 5 5 3 10 5 8 ...
## $ Spent : int 703 287 346 312 919 926 193 710 47 280 ...
## $ Chow : int 1 0 1 0 0 1 1 1 0 1 ...
## $ Hangover : Factor w/ 2 levels "No","Yes": 1 2 1 2 2 1 1 2 1 1 ...
Set the seed using our favourite number.
# Same seed again so the kNN split matches the earlier 60/40 splits row-for-row.
set.seed(666)
Create the indices for the split This samples the row indices to split the data into training and validation.
train_index <- sample(1:nrow(hangover_knn), 0.6 * nrow(hangover_knn))
valid_index <- setdiff(1:nrow(hangover_knn), train_index)
Using the indices, create the training and validation sets This is similar in principle to splitting a data frame by row.
train_df <- hangover_knn[train_index, ]
valid_df <- hangover_knn[valid_index, ]
Check the splits.
nrow(train_df)
## [1] 1200
nrow(valid_df)
## [1] 800
This is needed if predictors are on a different scale.
# Copies that will hold the centred/scaled versions of the predictors.
train_norm <- train_df
valid_norm <- valid_df
names(train_df)
## [1] "Night" "Ladies_Night" "Number_of_Drinks" "Spent"
## [5] "Chow" "Hangover"
names(valid_df)
## [1] "Night" "Ladies_Night" "Number_of_Drinks" "Spent"
## [5] "Chow" "Hangover"
Create a normalising algorithm using the 5 predictor variables (all columns except the target) in the training set.
# Fit centre/scale parameters on the training predictors only (column 6 is
# the target and is excluded) to avoid leaking validation information.
norm_values <- caret::preProcess(train_df[, -c(6)],
method = c("center",
"scale"))
# Apply the training-derived transform to the training predictors.
train_norm[, -c(6)] <- predict(norm_values,
train_df[, -c(6)])
head(train_norm)
## Night Ladies_Night Number_of_Drinks Spent Chow Hangover
## 1598 1.5329482 -0.9700324 -1.1791359 -1.1099927 -1.0231854 Yes
## 638 -0.5030649 -0.9700324 -0.1440495 0.8224320 0.9765255 No
## 608 -1.0120682 -0.9700324 -0.8341071 -1.5428836 0.9765255 Yes
## 907 -0.5030649 1.0300344 -1.5241647 0.5834763 -1.0231854 No
## 1147 1.0239449 -0.9700324 0.5460081 1.1548922 -1.0231854 Yes
## 1564 1.5329482 1.0300344 1.2360657 -1.7160399 -1.0231854 Yes
Then using these normalising algorithm, predict the normalised values of the validation set.
# Apply the SAME training-derived transform to the validation predictors.
valid_norm[, -c(6)] <- predict(norm_values,
valid_df[, -c(6)])
head(valid_norm)
## Night Ladies_Night Number_of_Drinks Spent Chow Hangover
## 2 1.023944930 -0.9700324 0.8910369 -0.79484820 -1.0231854 Yes
## 5 -1.521071466 1.0300344 -0.1440495 1.39384792 -1.0231854 Yes
## 6 -1.521071466 -0.9700324 -0.1440495 1.41808981 0.9765255 No
## 10 0.005938372 1.0300344 0.8910369 -0.81909009 0.9765255 No
## 11 -1.012068187 -0.9700324 1.5810945 0.05361786 -1.0231854 Yes
## 12 -0.503064908 1.0300344 1.5810945 -1.55327297 -1.0231854 Yes
Train the kNN model using k = 5.
Other values of k can be used too.
# Train kNN with k fixed at 5 on the centred/scaled training data.
knn_model_k5 <- caret::train(Hangover ~ ., data = train_norm,
tuneGrid = data.frame(k = 5), method = "knn")
library(caret)
# BUG FIX: the model was trained on normalised data, so predictions must be
# made on the normalised validation set (valid_norm), not the raw valid_df.
# Feeding unscaled values distorts the distance computations the model
# relies on. (The printed output below was produced by the original,
# unnormalised call and may differ after this fix.)
knn_pred_k5 <- predict(knn_model_k5, newdata = valid_norm)
head(knn_pred_k5)
## [1] Yes Yes Yes Yes Yes Yes
## Levels: No Yes
Confusion matrix for the model.
# Confusion matrix for the k = 5 model; column 6 of valid_df is the target.
confusionMatrix(knn_pred_k5, as.factor(valid_df[, 6]),
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 3 3
## Yes 326 468
##
## Accuracy : 0.5888
## 95% CI : (0.5538, 0.6231)
## No Information Rate : 0.5888
## P-Value [Acc > NIR] : 0.5152
##
## Kappa : 0.0032
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.993631
## Specificity : 0.009119
## Pos Pred Value : 0.589421
## Neg Pred Value : 0.500000
## Prevalence : 0.588750
## Detection Rate : 0.585000
## Detection Prevalence : 0.992500
## Balanced Accuracy : 0.501375
##
## 'Positive' Class : Yes
##
Associated probabilities.
# BUG FIX: as with the class predictions, probabilities must be computed on
# the normalised validation set (valid_norm) because the model was trained
# on normalised data. (The printed output below came from the original call
# on the raw valid_df and may differ after this fix.)
knn_pred_k5_prob <- predict(knn_model_k5, newdata = valid_norm, type = "prob")
head(knn_pred_k5_prob)
## No Yes
## 1 0.2857143 0.7142857
## 2 0.2222222 0.7777778
## 3 0.2222222 0.7777778
## 4 0.3333333 0.6666667
## 5 0.2222222 0.7777778
## 6 0.4000000 0.6000000
The ROC curve shows the trade-off between the rates of correct and incorrect predictions.
The AUC metric ranges from 0.5 (no better than chance) to 1.0 (perfect).
Values >= 0.8 are good.
Load the library. This should have been done in the previous section.
library(ROSE)
# AUC from hard class labels; ~0.501 means the kNN model is barely better
# than chance on this validation set.
ROSE::roc.curve(valid_df$Hangover, knn_pred_k5)
## Area under the curve (AUC): 0.501
Note that this method only works if caret is used.
Load the library. This should have been done in the previous section.
library(modelplotr)
The scores need to be computed for the kNN model.
We’ll use deciles instead of percentiles.
# Score the kNN model on the validation data; ntiles = 10 gives deciles.
scores_and_ntiles <- prepare_scores_and_ntiles(datasets =
list("valid_df"),
dataset_labels =
list("Validation data"),
models =
list("knn_model_k5"),
model_labels =
list("k Nearest Neighbours"),
target_column = "Hangover",
ntiles = 10)
## ... scoring caret model "knn_model_k5" on dataset "valid_df".
## Data preparation step 1 succeeded! Dataframe created.
head(scores_and_ntiles)
## model_label dataset_label y_true prob_No prob_Yes ntl_No
## 2 k Nearest Neighbours Validation data Yes 0.2857143 0.7142857 2
## 5 k Nearest Neighbours Validation data Yes 0.2222222 0.7777778 8
## 6 k Nearest Neighbours Validation data No 0.2222222 0.7777778 7
## 10 k Nearest Neighbours Validation data No 0.3333333 0.6666667 2
## 11 k Nearest Neighbours Validation data Yes 0.2222222 0.7777778 6
## 12 k Nearest Neighbours Validation data Yes 0.4000000 0.6000000 1
## ntl_Yes
## 2 8
## 5 3
## 6 6
## 10 10
## 11 6
## 12 10
Specify the select_targetclass argument to the preferred class.
# Aggregate the kNN scores into plot-ready form, evaluating the "Yes" class.
plot_input <- plotting_scope(prepared_input = scores_and_ntiles,
select_targetclass = "Yes")
## Data preparation step 2 succeeded! Dataframe created.
## "prepared_input" aggregated...
## Data preparation step 3 succeeded! Dataframe created.
##
## No comparison specified, default values are used.
##
## Single evaluation line will be plotted: Target value "Yes" plotted for dataset "Validation data" and model "k Nearest Neighbours.
## "
## -> To compare models, specify: scope = "compare_models"
## -> To compare datasets, specify: scope = "compare_datasets"
## -> To compare target classes, specify: scope = "compare_targetclasses"
## -> To plot one line, do not specify scope or specify scope = "no_comparison".
head(plot_input)
## scope model_label dataset_label target_class ntile neg pos
## 1 no_comparison k Nearest Neighbours Validation data Yes 0 0 0
## 2 no_comparison k Nearest Neighbours Validation data Yes 1 35 45
## 3 no_comparison k Nearest Neighbours Validation data Yes 2 31 49
## 4 no_comparison k Nearest Neighbours Validation data Yes 3 34 46
## 5 no_comparison k Nearest Neighbours Validation data Yes 4 30 50
## 6 no_comparison k Nearest Neighbours Validation data Yes 5 28 52
## tot pct negtot postot tottot pcttot cumneg cumpos cumtot cumpct
## 1 0 NA NA NA NA NA 0 0 0 NA
## 2 80 0.5625 329 471 800 0.58875 35 45 80 0.5625000
## 3 80 0.6125 329 471 800 0.58875 66 94 160 0.5875000
## 4 80 0.5750 329 471 800 0.58875 100 140 240 0.5833333
## 5 80 0.6250 329 471 800 0.58875 130 190 320 0.5937500
## 6 80 0.6500 329 471 800 0.58875 158 242 400 0.6050000
## gain cumgain gain_ref gain_opt lift cumlift cumlift_ref
## 1 0.00000000 0.0000000 0.0 0.0000000 NA NA 1
## 2 0.09554140 0.0955414 0.1 0.1698514 0.9554140 0.9554140 1
## 3 0.10403397 0.1995754 0.2 0.3397028 1.0403397 0.9978769 1
## 4 0.09766454 0.2972399 0.3 0.5095541 0.9766454 0.9907997 1
## 5 0.10615711 0.4033970 0.4 0.6794055 1.0615711 1.0084926 1
## 6 0.11040340 0.5138004 0.5 0.8492569 1.1040340 1.0276008 1
## legend
## 1 Yes
## 2 Yes
## 3 Yes
## 4 Yes
## 5 Yes
## 6 Yes
Cumulative gains for kNN.
Variation: Highlight the 30th percentile or 3rd decile and change the colour.
# Plain cumulative gains, then a variant highlighting the 3rd decile.
plot_cumgains(data = plot_input)
plot_cumgains(data = plot_input, highlight_ntile = 3,
custom_line_colors = "#1E9C33")
##
## Plot annotation for plot: Cumulative gains
## - When we select 30% with the highest probability according to model k Nearest Neighbours, this selection holds 30% of all Yes cases in Validation data.
##
##
Cumulative lift for kNN.
Variation: Highlight the 20th percentile or 2nd decile and change the colour.
# Plain cumulative lift, then a variant highlighting the 2nd decile.
plot_cumlift(data = plot_input)
plot_cumlift(data = plot_input, highlight_ntile = 2,
custom_line_colors = "#1B18CC")
##
## Plot annotation for plot: Cumulative lift
## - When we select 20% with the highest probability according to model k Nearest Neighbours in Validation data, this selection for Yes cases is 1.0 times better than selecting without a model.
##
##
Response plot for kNN.
Variation: Highlight the 30th percentile or 3rd decile and change the colour.
# Response plot (per-decile % of Yes cases), then the 3rd-decile highlight.
plot_response(data = plot_input)
plot_response(data = plot_input, highlight_ntile = 3,
custom_line_colors = "#B717BF")
##
## Plot annotation for plot: Response
## - When we select ntile 3 according to model k Nearest Neighbours in dataset Validation data the % of Yes cases in the selection is 57.5%.
##
##
Cumulative response plot for kNN.
Variation: Highlight the 20th percentile or 2nd decile and change the colour.
# Cumulative response plot, then the 2nd-decile highlight.
plot_cumresponse(data = plot_input)
plot_cumresponse(data = plot_input, highlight_ntile = 2,
custom_line_colors = "#B37D12")
##
## Plot annotation for plot: Cumulative response
## - When we select ntiles 1 until 2 according to model k Nearest Neighbours in dataset Validation data the % of Yes cases in the selection is 58.8%.
##
##
Multiple plots together.
# All four kNN diagnostics on one canvas.
plot_multiplot(data = plot_input,
custom_line_colors = "#552682")