Even superheroes learn magic :-)
They drink to victory because they deserve it, and they drink to defeat because they need it
Load the required libraries.
If you do not have these, install them.
Be sure to also install the dependencies.
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
Load the data and explore them.
It’s a good idea to see if they look (and feel) ok.
hogwarts <- read.csv("super_heroes_hogwarts_v3a.csv", header = TRUE)
head(hogwarts, 10)
## ID Name Gender Race Height Publisher
## 1 A001 A-Bomb Male Human 203 Marvel Comics
## 2 A002 Abe Sapien Male Icthyo Sapien 191 Dark Horse Comics
## 3 A004 Abomination Male Human / Radiation 203 Marvel Comics
## 4 A009 Agent 13 Female <NA> 173 Marvel Comics
## 5 A015 Alex Mercer Male Human NA Wildstorm
## 6 A016 Alex Woolsly Male <NA> NA NBC - Heroes
## 7 A024 Angel Male Vampire NA Dark Horse Comics
## 8 A025 Angel Dust Female Mutant 165 Marvel Comics
## 9 A028 Animal Man Male Human 183 DC Comics
## 10 A032 Anti-Monitor Male God / Eternal 61 DC Comics
## Alignment Weight Manipulative Resourceful Dismissive Intelligent Trusting
## 1 good 441 10 10 7 6 7
## 2 good 65 7 7 6 8 6
## 3 bad 441 6 8 1 6 3
## 4 good 61 7 7 1 9 7
## 5 bad NA 10 6 8 3 4
## 6 good NA 8 10 5 5 6
## 7 good NA 8 6 8 7 4
## 8 good 57 9 8 9 4 1
## 9 good 83 7 6 6 5 8
## 10 bad NA 7 7 7 1 9
## Loyal Stubborn Brave HouseID House STR DEX CON INT WIS CHA Level HP
## 1 7 7 9 1 Slytherin 18 11 17 12 13 11 1 7
## 2 7 6 9 1 Slytherin 16 17 10 13 15 11 8 72
## 3 3 5 2 1 Slytherin 13 14 13 10 18 15 15 135
## 4 4 6 6 1 Slytherin 15 18 16 16 17 10 14 140
## 5 4 1 8 1 Slytherin 14 17 13 12 10 11 9 72
## 6 7 7 6 1 Slytherin 14 14 11 13 12 12 1 8
## 7 1 5 2 1 Slytherin 15 17 15 18 13 18 11 88
## 8 6 5 4 1 Slytherin 8 17 12 15 17 18 1 8
## 9 3 3 2 1 Slytherin 10 17 15 18 13 14 8 56
## 10 1 6 5 1 Slytherin 8 10 11 16 12 11 7 63
str(hogwarts)
## 'data.frame': 734 obs. of 26 variables:
## $ ID : chr "A001" "A002" "A004" "A009" ...
## $ Name : chr "A-Bomb" "Abe Sapien" "Abomination" "Agent 13" ...
## $ Gender : chr "Male" "Male" "Male" "Female" ...
## $ Race : chr "Human" "Icthyo Sapien" "Human / Radiation" NA ...
## $ Height : num 203 191 203 173 NA NA NA 165 183 61 ...
## $ Publisher : chr "Marvel Comics" "Dark Horse Comics" "Marvel Comics" "Marvel Comics" ...
## $ Alignment : chr "good" "good" "bad" "good" ...
## $ Weight : int 441 65 441 61 NA NA NA 57 83 NA ...
## $ Manipulative: int 10 7 6 7 10 8 8 9 7 7 ...
## $ Resourceful : int 10 7 8 7 6 10 6 8 6 7 ...
## $ Dismissive : int 7 6 1 1 8 5 8 9 6 7 ...
## $ Intelligent : int 6 8 6 9 3 5 7 4 5 1 ...
## $ Trusting : int 7 6 3 7 4 6 4 1 8 9 ...
## $ Loyal : int 7 7 3 4 4 7 1 6 3 1 ...
## $ Stubborn : int 7 6 5 6 1 7 5 5 3 6 ...
## $ Brave : int 9 9 2 6 8 6 2 4 2 5 ...
## $ HouseID : int 1 1 1 1 1 1 1 1 1 1 ...
## $ House : chr "Slytherin" "Slytherin" "Slytherin" "Slytherin" ...
## $ STR : int 18 16 13 15 14 14 15 8 10 8 ...
## $ DEX : int 11 17 14 18 17 14 17 17 17 10 ...
## $ CON : int 17 10 13 16 13 11 15 12 15 11 ...
## $ INT : int 12 13 10 16 12 13 18 15 18 16 ...
## $ WIS : int 13 15 18 17 10 12 13 17 13 12 ...
## $ CHA : int 11 11 15 10 11 12 18 18 14 11 ...
## $ Level : int 1 8 15 14 9 1 11 1 8 7 ...
## $ HP : int 7 72 135 140 72 8 88 8 56 63 ...
names(hogwarts)
## [1] "ID" "Name" "Gender" "Race" "Height"
## [6] "Publisher" "Alignment" "Weight" "Manipulative" "Resourceful"
## [11] "Dismissive" "Intelligent" "Trusting" "Loyal" "Stubborn"
## [16] "Brave" "HouseID" "House" "STR" "DEX"
## [21] "CON" "INT" "WIS" "CHA" "Level"
## [26] "HP"
nrow(hogwarts)
## [1] 734
We don’t need all the variables.
Slicing the dataset for what we need can make things easier.
hogwarts <- hogwarts[ , -c(1:8, 17, 19:26)]
names(hogwarts)
## [1] "Manipulative" "Resourceful" "Dismissive" "Intelligent" "Trusting"
## [6] "Loyal" "Stubborn" "Brave" "House"
t(t(names(hogwarts)))
## [,1]
## [1,] "Manipulative"
## [2,] "Resourceful"
## [3,] "Dismissive"
## [4,] "Intelligent"
## [5,] "Trusting"
## [6,] "Loyal"
## [7,] "Stubborn"
## [8,] "Brave"
## [9,] "House"
str(hogwarts)
## 'data.frame': 734 obs. of 9 variables:
## $ Manipulative: int 10 7 6 7 10 8 8 9 7 7 ...
## $ Resourceful : int 10 7 8 7 6 10 6 8 6 7 ...
## $ Dismissive : int 7 6 1 1 8 5 8 9 6 7 ...
## $ Intelligent : int 6 8 6 9 3 5 7 4 5 1 ...
## $ Trusting : int 7 6 3 7 4 6 4 1 8 9 ...
## $ Loyal : int 7 7 3 4 4 7 1 6 3 1 ...
## $ Stubborn : int 7 6 5 6 1 7 5 5 3 6 ...
## $ Brave : int 9 9 2 6 8 6 2 4 2 5 ...
## $ House : chr "Slytherin" "Slytherin" "Slytherin" "Slytherin" ...
table(hogwarts$House)
##
## Gryffindor Hufflepuff Ravenclaw Slytherin
## 188 189 156 201
nrow(hogwarts)
## [1] 734
Set House as a factor, since it has more than 2 categories.
This is also necessary for the confusion matrix.
Note that House has more than 2 classes.
hogwarts$House <- as.factor(hogwarts$House)
First, set the seed. We’ll use our favourite number :-)
set.seed(666)
Then randomly sample the rows via their indices (i.e. row numbers).
We will do a 70-30 split.
train_index <- sample(1:nrow(hogwarts), 0.7 * nrow(hogwarts)) # 70% of row indices, sampled without replacement
valid_index <- setdiff(1:nrow(hogwarts), train_index) # remaining 30% of indices form the validation set
Assign the randomly selected indices to the dataset to create the training and validation sets.
train_df <- hogwarts[train_index, ]
valid_df <- hogwarts[valid_index, ]
It’s a good idea to check the 2 sets before continuing.
nrow(train_df)
## [1] 513
nrow(valid_df)
## [1] 221
head(train_df)
## Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn
## 574 6 5 7 7 4 5 9
## 638 5 9 5 4 2 5 7
## 608 6 9 5 4 2 6 6
## 123 7 8 5 7 4 7 5
## 540 9 7 7 9 8 6 8
## 654 6 5 7 2 3 4 7
## Brave House
## 574 9 Gryffindor
## 638 10 Gryffindor
## 608 9 Gryffindor
## 123 6 Slytherin
## 540 3 Hufflepuff
## 654 8 Gryffindor
head(valid_df)
## Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn
## 2 7 7 6 8 6 7 6
## 3 6 8 1 6 3 3 5
## 5 10 6 8 3 4 4 1
## 12 6 6 6 2 3 6 7
## 13 9 7 9 8 6 3 2
## 15 7 8 1 7 7 7 3
## Brave House
## 2 9 Slytherin
## 3 2 Slytherin
## 5 8 Slytherin
## 12 5 Slytherin
## 13 5 Slytherin
## 15 5 Slytherin
str(train_df)
## 'data.frame': 513 obs. of 9 variables:
## $ Manipulative: int 6 5 6 7 9 6 7 1 6 9 ...
## $ Resourceful : int 5 9 9 8 7 5 4 7 7 10 ...
## $ Dismissive : int 7 5 5 5 7 7 1 1 3 5 ...
## $ Intelligent : int 7 4 4 7 9 2 6 4 3 4 ...
## $ Trusting : int 4 2 2 4 8 3 9 6 7 8 ...
## $ Loyal : int 5 5 6 7 6 4 3 8 5 5 ...
## $ Stubborn : int 9 7 6 5 8 7 8 8 3 5 ...
## $ Brave : int 9 10 9 6 3 8 7 8 2 7 ...
## $ House : Factor w/ 4 levels "Gryffindor","Hufflepuff",..: 1 1 1 4 2 1 1 1 4 4 ...
str(valid_df)
## 'data.frame': 221 obs. of 9 variables:
## $ Manipulative: int 7 6 10 6 9 7 6 6 9 9 ...
## $ Resourceful : int 7 8 6 6 7 8 6 6 6 8 ...
## $ Dismissive : int 6 1 8 6 9 1 6 6 8 3 ...
## $ Intelligent : int 8 6 3 2 8 7 6 3 3 5 ...
## $ Trusting : int 6 3 4 3 6 7 8 6 3 6 ...
## $ Loyal : int 7 3 4 6 3 7 8 7 2 7 ...
## $ Stubborn : int 6 5 1 7 2 3 1 6 5 2 ...
## $ Brave : int 9 2 8 5 5 5 7 2 7 8 ...
## $ House : Factor w/ 4 levels "Gryffindor","Hufflepuff",..: 4 4 4 4 4 4 4 4 4 4 ...
Here, we will create a new record for a new padawan.
The resultant model can be used to predict which house this padawan will be sorted into.
These values can be different for different padawans, including yourselves :-)
# A single-row data frame for the new padawan. Column names must match the
# 8 trait predictors in the training set; scores are on the same 1-10 scale
# seen in the training data.
padawan_1 <- data.frame(Manipulative = 9,
Resourceful = 9,
Dismissive = 8,
Intelligent = 8,
Trusting = 6,
Loyal = 8,
Stubborn = 7,
Brave = 7)
padawan_1
## Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
## 1 9 9 8 8 6 8 7 7
This is needed if predictors are on a different scale.
In this case, this is just for illustration.
train_norm <- train_df
valid_norm <- valid_df
First, create a normalising algorithm using the 8 variables in the training set.
names(train_df)
## [1] "Manipulative" "Resourceful" "Dismissive" "Intelligent" "Trusting"
## [6] "Loyal" "Stubborn" "Brave" "House"
# Learn centering/scaling parameters from the 8 training predictors only;
# column 9 (House) is the outcome and is excluded.
norm_values <- preProcess(train_df[, -c(9)],
method = c("center",
"scale"))
# Apply the learned transformation to the training predictors in place.
train_norm[, -c(9)] <- predict(norm_values,
train_df[, -c(9)])
head(train_norm)
## Manipulative Resourceful Dismissive Intelligent Trusting Loyal
## 574 0.05969871 -0.4471044 0.5239251 0.5648867 -0.7995293 -0.3240011
## 638 -0.36565462 1.2487518 -0.3320417 -0.7664466 -1.6594003 -0.3240011
## 608 0.05969871 1.2487518 -0.3320417 -0.7664466 -1.6594003 0.0894630
## 123 0.48505205 0.8247877 -0.3320417 0.5648867 -0.7995293 0.5029271
## 540 1.33575871 0.4008237 0.5239251 1.4524423 0.9202129 0.0894630
## 654 0.05969871 -0.4471044 0.5239251 -1.6540022 -1.2294648 -0.7374653
## Stubborn Brave House
## 574 1.36584012 1.35940026 Gryffindor
## 638 0.49866688 1.79014289 Gryffindor
## 608 0.06508025 1.35940026 Gryffindor
## 123 -0.36850637 0.06717234 Slytherin
## 540 0.93225350 -1.22505557 Hufflepuff
## 654 0.49866688 0.92865762 Gryffindor
Then, using this normalising algorithm, predict the normalised values of the validation set.
valid_norm[, -c(9)] <- predict(norm_values,
valid_df[, -c(9)])
head(valid_norm)
## Manipulative Resourceful Dismissive Intelligent Trusting Loyal
## 2 0.48505205 0.40082370 0.09594171 1.0086645 0.06034183 0.5029271
## 3 0.05969871 0.82478775 -2.04397548 0.1211089 -1.22946479 -1.1509294
## 5 1.76111204 -0.02314034 0.95190858 -1.2102244 -0.79952925 -0.7374653
## 12 0.05969871 -0.02314034 0.09594171 -1.6540022 -1.22946479 0.0894630
## 13 1.33575871 0.40082370 1.37989202 1.0086645 0.06034183 -1.1509294
## 15 0.48505205 0.82478775 -2.04397548 0.5648867 0.49027737 0.5029271
## Stubborn Brave House
## 2 0.06508025 1.3594003 Slytherin
## 3 -0.36850637 -1.6557982 Slytherin
## 5 -2.10285287 0.9286576 Slytherin
## 12 0.49866688 -0.3635703 Slytherin
## 13 -1.66926624 -0.3635703 Slytherin
## 15 -1.23567962 -0.3635703 Slytherin
Finally, predict the normalised values for the new padawan (i.e. the new record).
padawan_1_norm <- predict(norm_values, padawan_1)
padawan_1_norm
## Manipulative Resourceful Dismissive Intelligent Trusting Loyal
## 1 1.335759 1.248752 0.9519086 1.008665 0.06034183 0.9163913
## Stubborn Brave
## 1 0.4986669 0.497915
Train the kNN model using k = 5.
Other values of k can be used too.
knn_model <- caret::knn3(House ~ ., data = train_norm, k = 5)
knn_model
## 5-nearest neighbor model
## Training set outcome distribution:
##
## Gryffindor Hufflepuff Ravenclaw Slytherin
## 131 130 107 145
The prediction on the training set.
knn_pred_train <- predict(knn_model, newdata = train_norm[, -c(9)],
type = "class")
head(knn_pred_train)
## [1] Gryffindor Gryffindor Gryffindor Slytherin Hufflepuff Gryffindor
## Levels: Gryffindor Hufflepuff Ravenclaw Slytherin
Check the model on the training set.
confusionMatrix(knn_pred_train, as.factor(train_norm[, 9]))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Gryffindor Hufflepuff Ravenclaw Slytherin
## Gryffindor 118 3 3 6
## Hufflepuff 6 122 4 6
## Ravenclaw 4 2 98 5
## Slytherin 3 3 2 128
##
## Overall Statistics
##
## Accuracy : 0.9084
## 95% CI : (0.88, 0.9319)
## No Information Rate : 0.2827
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8775
##
## Mcnemar's Test P-Value : 0.5317
##
## Statistics by Class:
##
## Class: Gryffindor Class: Hufflepuff Class: Ravenclaw
## Sensitivity 0.9008 0.9385 0.9159
## Specificity 0.9686 0.9582 0.9729
## Pos Pred Value 0.9077 0.8841 0.8991
## Neg Pred Value 0.9661 0.9787 0.9777
## Prevalence 0.2554 0.2534 0.2086
## Detection Rate 0.2300 0.2378 0.1910
## Detection Prevalence 0.2534 0.2690 0.2125
## Balanced Accuracy 0.9347 0.9483 0.9444
## Class: Slytherin
## Sensitivity 0.8828
## Specificity 0.9783
## Pos Pred Value 0.9412
## Neg Pred Value 0.9549
## Prevalence 0.2827
## Detection Rate 0.2495
## Detection Prevalence 0.2651
## Balanced Accuracy 0.9305
The prediction on the validation set.
knn_pred_valid <- predict(knn_model, newdata = valid_norm[, -c(9)],
type = "class")
head(knn_pred_valid)
## [1] Slytherin Slytherin Slytherin Gryffindor Ravenclaw Hufflepuff
## Levels: Gryffindor Hufflepuff Ravenclaw Slytherin
Check the model on the validation set.
confusionMatrix(knn_pred_valid, as.factor(valid_norm[, 9]))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Gryffindor Hufflepuff Ravenclaw Slytherin
## Gryffindor 52 5 4 2
## Hufflepuff 4 49 5 3
## Ravenclaw 0 2 38 4
## Slytherin 1 3 2 47
##
## Overall Statistics
##
## Accuracy : 0.8416
## 95% CI : (0.7867, 0.8871)
## No Information Rate : 0.267
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7882
##
## Mcnemar's Test P-Value : 0.3802
##
## Statistics by Class:
##
## Class: Gryffindor Class: Hufflepuff Class: Ravenclaw
## Sensitivity 0.9123 0.8305 0.7755
## Specificity 0.9329 0.9259 0.9651
## Pos Pred Value 0.8254 0.8033 0.8636
## Neg Pred Value 0.9684 0.9375 0.9379
## Prevalence 0.2579 0.2670 0.2217
## Detection Rate 0.2353 0.2217 0.1719
## Detection Prevalence 0.2851 0.2760 0.1991
## Balanced Accuracy 0.9226 0.8782 0.8703
## Class: Slytherin
## Sensitivity 0.8393
## Specificity 0.9636
## Pos Pred Value 0.8868
## Neg Pred Value 0.9464
## Prevalence 0.2534
## Detection Rate 0.2127
## Detection Prevalence 0.2398
## Balanced Accuracy 0.9015
So what House does the new padawan belong to?
padawan_predict <- predict(knn_model, newdata = padawan_1_norm,
type = "class")
padawan_predict
## [1] Slytherin
## Levels: Gryffindor Hufflepuff Ravenclaw Slytherin
Since we already split the data and normalised them, we can just recode the training and validation sets.
There are different ways to recode.
library(car)
## Loading required package: carData
train_norm_2 <- train_norm
train_norm_2$House <- recode(train_norm_2$House,
" 'Gryffindor' = 'Not Slytherin';
'Ravenclaw' = 'Not Slytherin';
'Hufflepuff' = 'Not Slytherin'")
table(train_norm_2$House)
##
## Not Slytherin Slytherin
## 368 145
valid_norm_2 <- valid_norm
valid_norm_2$House <- recode(valid_norm_2$House,
" 'Gryffindor' = 'Not Slytherin';
'Ravenclaw' = 'Not Slytherin';
'Hufflepuff' = 'Not Slytherin'")
table(valid_norm_2$House)
##
## Not Slytherin Slytherin
## 165 56
Train the kNN model using k = 7 (just for illustration).
Other values of k can be used too.
knn_model_k7_2 <- caret::knn3(House ~ ., data = train_norm_2, k = 7)
knn_model_k7_2
## 7-nearest neighbor model
## Training set outcome distribution:
##
## Not Slytherin Slytherin
## 368 145
The prediction on the training set.
knn_pred_k7_train_2 <- predict(knn_model_k7_2, newdata = train_norm_2[, -c(9)], type = "class")
head(knn_pred_k7_train_2)
## [1] Not Slytherin Not Slytherin Not Slytherin Slytherin Not Slytherin
## [6] Not Slytherin
## Levels: Not Slytherin Slytherin
Check the quality of the model on the training set with Slytherin as the preferred class.
confusionMatrix(knn_pred_k7_train_2, as.factor(train_norm_2[, 9]),
positive = "Slytherin")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not Slytherin Slytherin
## Not Slytherin 357 22
## Slytherin 11 123
##
## Accuracy : 0.9357
## 95% CI : (0.9108, 0.9553)
## No Information Rate : 0.7173
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8376
##
## Mcnemar's Test P-Value : 0.08172
##
## Sensitivity : 0.8483
## Specificity : 0.9701
## Pos Pred Value : 0.9179
## Neg Pred Value : 0.9420
## Prevalence : 0.2827
## Detection Rate : 0.2398
## Detection Prevalence : 0.2612
## Balanced Accuracy : 0.9092
##
## 'Positive' Class : Slytherin
##
The prediction on the validation set.
knn_pred_k7_valid_2 <- predict(knn_model_k7_2, newdata = valid_norm_2[, -c(9)], type = "class")
head(knn_pred_k7_valid_2)
## [1] Slytherin Slytherin Slytherin Not Slytherin Not Slytherin
## [6] Not Slytherin
## Levels: Not Slytherin Slytherin
This gives the probabilities
knn_pred_k7_valid_2_prob <- predict(knn_model_k7_2, newdata = valid_norm_2[, -c(9)])
head(knn_pred_k7_valid_2_prob)
## Not Slytherin Slytherin
## [1,] 0.2857143 0.7142857
## [2,] 0.0000000 1.0000000
## [3,] 0.4285714 0.5714286
## [4,] 0.7142857 0.2857143
## [5,] 0.5714286 0.4285714
## [6,] 0.5714286 0.4285714
Check the quality on the validation set, with Slytherin as the preferred class.
confusionMatrix(knn_pred_k7_valid_2, as.factor(valid_norm_2[, 9]),
positive = "Slytherin")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not Slytherin Slytherin
## Not Slytherin 158 8
## Slytherin 7 48
##
## Accuracy : 0.9321
## 95% CI : (0.8905, 0.9615)
## No Information Rate : 0.7466
## P-Value [Acc > NIR] : 7.151e-13
##
## Kappa : 0.8196
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8571
## Specificity : 0.9576
## Pos Pred Value : 0.8727
## Neg Pred Value : 0.9518
## Prevalence : 0.2534
## Detection Rate : 0.2172
## Detection Prevalence : 0.2489
## Balanced Accuracy : 0.9074
##
## 'Positive' Class : Slytherin
##
Predicting the padawan using a 2-class set up.
The house.
padawan_predict_2 <- predict(knn_model_k7_2,
newdata = padawan_1_norm,
type = "class")
padawan_predict_2
## [1] Slytherin
## Levels: Not Slytherin Slytherin
The probability.
padawan_predict_prob_2 <- predict(knn_model_k7_2,
newdata = padawan_1_norm,
type = "prob")
padawan_predict_prob_2
## Not Slytherin Slytherin
## [1,] 0.1428571 0.8571429
“Not Slytherin, eh?…Are you sure? You could be great, you know, it’s all here in your head, and Slytherin will help you on the way to greatness, no doubt about that no?”
Using the 2 class model.
Using ROSE.
library(ROSE)
## Loaded ROSE 0.0-4
ROSE::roc.curve(valid_norm_2$House, knn_pred_k7_valid_2)
## Area under the curve (AUC): 0.907
Using ROCR.
library(ROCR)
labels_2 <- valid_norm_2$House
library(car)
pred_2a <- recode(knn_pred_k7_valid_2,
" 'Slytherin' = '1';
'Not Slytherin' = '0'")
labels_2a <- recode(labels_2,
" 'Slytherin' = '1';
'Not Slytherin' = '0'")
Create the prediction.
prediction_2a <- prediction(as.numeric(pred_2a),
as.numeric(labels_2a))
ROC with TPR and FPR.
perform_2_v1 <- performance(prediction_2a, "tpr", "fpr")
Plot the ROC.
plot(perform_2_v1)
AUC.
perform_2_auc <- performance(prediction_2a, "auc")
perform_2_auc@y.name
## [1] "Area under the ROC curve"
perform_2_auc@y.values
## [[1]]
## [1] 0.9073593
Precision and recall
perform_2_v2 <- performance(prediction_2a, "prec", "rec")
plot(perform_2_v2)
Sensitivity and specificity.
perform_2_v3 <- performance(prediction_2a, "sens", "spec")
plot(perform_2_v3)
Library.
library(modelplotr)
## Package modelplotr loaded! Happy model plotting!
The scores need to be computed for the knn.
We’ll use deciles instead of percentiles.
scores_and_ntiles <- prepare_scores_and_ntiles(datasets =
list("valid_norm_2"),
dataset_labels =
list("Validation data"),
models =
list("knn_model_k7_2"),
model_labels =
list("k Nearest Neighbours"),
target_column = "House",
ntiles = 10)
## Warning: `select_()` was deprecated in dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## ... scoring caret model "knn_model_k7_2" on dataset "valid_norm_2".
## Data preparation step 1 succeeded! Dataframe created.
Check the scores.
head(scores_and_ntiles)
## model_label dataset_label y_true prob_Not Slytherin
## 2 k Nearest Neighbours Validation data Slytherin 0.2857143
## 3 k Nearest Neighbours Validation data Slytherin 0.0000000
## 5 k Nearest Neighbours Validation data Slytherin 0.4285714
## 12 k Nearest Neighbours Validation data Slytherin 0.7142857
## 13 k Nearest Neighbours Validation data Slytherin 0.5714286
## 15 k Nearest Neighbours Validation data Slytherin 0.5714286
## prob_Slytherin ntl_Not Slytherin ntl_Slytherin
## 2 0.7142857 9 2
## 3 1.0000000 10 1
## 5 0.5714286 8 3
## 12 0.2857143 7 4
## 13 0.4285714 8 3
## 15 0.4285714 8 3
Specify the select_targetclass argument to the preferred class.
plot_input <- plotting_scope(prepared_input = scores_and_ntiles,
select_targetclass = "Slytherin")
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## Data preparation step 2 succeeded! Dataframe created.
## "prepared_input" aggregated...
## Data preparation step 3 succeeded! Dataframe created.
##
## No comparison specified, default values are used.
##
## Single evaluation line will be plotted: Target value "Slytherin" plotted for dataset "Validation data" and model "k Nearest Neighbours.
## "
## -> To compare models, specify: scope = "compare_models"
## -> To compare datasets, specify: scope = "compare_datasets"
## -> To compare target classes, specify: scope = "compare_targetclasses"
## -> To plot one line, do not specify scope or specify scope = "no_comparison".
head(plot_input)
## scope model_label dataset_label target_class ntile neg pos
## 1 no_comparison k Nearest Neighbours Validation data Slytherin 0 0 0
## 2 no_comparison k Nearest Neighbours Validation data Slytherin 1 1 21
## 3 no_comparison k Nearest Neighbours Validation data Slytherin 2 4 18
## 4 no_comparison k Nearest Neighbours Validation data Slytherin 3 9 13
## 5 no_comparison k Nearest Neighbours Validation data Slytherin 4 19 3
## 6 no_comparison k Nearest Neighbours Validation data Slytherin 5 21 1
## tot pct negtot postot tottot pcttot cumneg cumpos cumtot cumpct
## 1 0 NA NA NA NA NA 0 0 0 NA
## 2 22 0.95454545 165 56 221 0.2533937 1 21 22 0.9545455
## 3 22 0.81818182 165 56 221 0.2533937 5 39 44 0.8863636
## 4 22 0.59090909 165 56 221 0.2533937 14 52 66 0.7878788
## 5 22 0.13636364 165 56 221 0.2533937 33 55 88 0.6250000
## 6 22 0.04545455 165 56 221 0.2533937 54 56 110 0.5090909
## gain cumgain gain_ref gain_opt lift cumlift cumlift_ref
## 1 0.00000000 0.0000000 0.0 0.0000000 NA NA 1
## 2 0.37500000 0.3750000 0.1 0.3928571 3.7670455 3.767045 1
## 3 0.32142857 0.6964286 0.2 0.7857143 3.2288961 3.497971 1
## 4 0.23214286 0.9285714 0.3 1.0000000 2.3319805 3.109307 1
## 5 0.05357143 0.9821429 0.4 1.0000000 0.5381494 2.466518 1
## 6 0.01785714 1.0000000 0.5 1.0000000 0.1793831 2.009091 1
## legend
## 1 Slytherin
## 2 Slytherin
## 3 Slytherin
## 4 Slytherin
## 5 Slytherin
## 6 Slytherin
Cumulative gains for kNN, 2 class model.
Highlight the 30th percentile or 3rd decile and change the colour.
plot_cumgains(data = plot_input, highlight_ntile = 3,
custom_line_colors = "#1E9C33")
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
##
## Plot annotation for plot: Cumulative gains
## - When we select 30% with the highest probability according to model k Nearest Neighbours, this selection holds 93% of all Slytherin cases in Validation data.
##
##
Cumulative lift for kNN, 2 class model.
Highlight the 20th percentile or 2nd decile and change the colour.
plot_cumlift(data = plot_input, highlight_ntile = 2,
custom_line_colors = "#1B18CC")
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
##
## Plot annotation for plot: Cumulative lift
## - When we select 20% with the highest probability according to model k Nearest Neighbours in Validation data, this selection for Slytherin cases is 3.5 times better than selecting without a model.
##
##
Response plot for kNN, 2 class model.
Highlight the 30th percentile or 3rd decile and change the colour.
plot_response(data = plot_input, highlight_ntile = 3,
custom_line_colors = "#B717BF")
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
##
## Plot annotation for plot: Response
## - When we select ntile 3 according to model k Nearest Neighbours in dataset Validation data the % of Slytherin cases in the selection is 59.1%.
##
##
Cumulative response plot for kNN, 2 class model.
Highlight the 20th percentile or 2nd decile and change the colour.
plot_cumresponse(data = plot_input, highlight_ntile = 2,
custom_line_colors = "#B37D12")
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
##
## Plot annotation for plot: Cumulative response
## - When we select ntiles 1 until 2 according to model k Nearest Neighbours in dataset Validation data the % of Slytherin cases in the selection is 88.6%.
##
##
Use caret to perform a 6-fold cross-validation, repeated 6 times.
library(caret)
# Resampling scheme: 6-fold cross-validation, repeated 6 times.
# (The original variable was named caret_control_k10, which misleadingly
# suggested 10 folds; it is only referenced within this step.)
caret_control_6fold <- trainControl(method = "repeatedcv",
                                    number = 6,
                                    repeats = 6)
# Tune k over the odd values 3, 5, 7, 9, 11, selecting by accuracy.
knn_cv_train <- train(House ~ .,
                      method = "knn",
                      tuneGrid = expand.grid(k = seq(3, 11, 2)),
                      trControl = caret_control_6fold,
                      metric = "Accuracy",
                      data = train_norm_2)
knn_cv_train
## k-Nearest Neighbors
##
## 513 samples
## 8 predictor
## 2 classes: 'Not Slytherin', 'Slytherin'
##
## No pre-processing
## Resampling: Cross-Validated (6 fold, repeated 6 times)
## Summary of sample sizes: 427, 427, 427, 428, 428, 428, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 3 0.9087468 0.7683535
## 5 0.9129499 0.7798717
## 7 0.9093475 0.7699509
## 9 0.9103355 0.7738109
## 11 0.9155262 0.7855361
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 11.
library(ggplot2)
knn_cv_k <- as.data.frame(knn_cv_train$results[, c(1:2)])
knn_cv_k$k <- as.factor(knn_cv_k$k)
ggplot(knn_cv_k) + aes(x = k, y = Accuracy) + geom_col() +
coord_cartesian(ylim = c(0.9, 0.92))
Predict the training set.
knn_pred_cv_train_3 <- predict(knn_cv_train, newdata = train_norm_2[, -c(9)],
type = "raw")
head(knn_pred_cv_train_3)
## [1] Not Slytherin Not Slytherin Not Slytherin Slytherin Not Slytherin
## [6] Not Slytherin
## Levels: Not Slytherin Slytherin
Check the quality of the model using Slytherin as the preferred class.
confusionMatrix(knn_pred_cv_train_3, as.factor(train_norm_2[, 9]),
positive = "Slytherin")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not Slytherin Slytherin
## Not Slytherin 358 22
## Slytherin 10 123
##
## Accuracy : 0.9376
## 95% CI : (0.9131, 0.9569)
## No Information Rate : 0.7173
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8422
##
## Mcnemar's Test P-Value : 0.05183
##
## Sensitivity : 0.8483
## Specificity : 0.9728
## Pos Pred Value : 0.9248
## Neg Pred Value : 0.9421
## Prevalence : 0.2827
## Detection Rate : 0.2398
## Detection Prevalence : 0.2593
## Balanced Accuracy : 0.9106
##
## 'Positive' Class : Slytherin
##
And now, the validation set.
knn_pred_cv_valid_3 <- predict(knn_cv_train, newdata = valid_norm_2[, -c(9)],
type = "raw")
head(knn_pred_cv_valid_3)
## [1] Slytherin Slytherin Slytherin Not Slytherin Not Slytherin
## [6] Slytherin
## Levels: Not Slytherin Slytherin
Check the quality on the validation set, with Slytherin as the preferred class.
confusionMatrix(knn_pred_cv_valid_3, as.factor(valid_norm_2[, 9]),
positive = "Slytherin")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not Slytherin Slytherin
## Not Slytherin 158 7
## Slytherin 7 49
##
## Accuracy : 0.9367
## 95% CI : (0.896, 0.9649)
## No Information Rate : 0.7466
## P-Value [Acc > NIR] : 1.499e-13
##
## Kappa : 0.8326
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8750
## Specificity : 0.9576
## Pos Pred Value : 0.8750
## Neg Pred Value : 0.9576
## Prevalence : 0.2534
## Detection Rate : 0.2217
## Detection Prevalence : 0.2534
## Balanced Accuracy : 0.9163
##
## 'Positive' Class : Slytherin
##
The probabilities.
knn_pred_cv_valid_3_prob <- predict(knn_cv_train, newdata = valid_norm_2[, -c(9)],
type = "prob")
head(knn_pred_cv_valid_3_prob)
## Not Slytherin Slytherin
## 1 0.45454545 0.5454545
## 2 0.09090909 0.9090909
## 3 0.36363636 0.6363636
## 4 0.72727273 0.2727273
## 5 0.54545455 0.4545455
## 6 0.45454545 0.5454545
Different cutoff.
# Re-classify using a stricter 0.75 probability cutoff for Slytherin
# (column 2 of the probability matrix is the Slytherin probability),
# instead of the default majority-vote class prediction.
confusionMatrix(as.factor(ifelse(knn_pred_cv_valid_3_prob[,2] > 0.75,
"Slytherin", "Not Slytherin")),
valid_norm_2$House, positive = "Slytherin")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not Slytherin Slytherin
## Not Slytherin 164 30
## Slytherin 1 26
##
## Accuracy : 0.8597
## 95% CI : (0.8068, 0.9027)
## No Information Rate : 0.7466
## P-Value [Acc > NIR] : 3.005e-05
##
## Kappa : 0.5528
##
## Mcnemar's Test P-Value : 4.932e-07
##
## Sensitivity : 0.4643
## Specificity : 0.9939
## Pos Pred Value : 0.9630
## Neg Pred Value : 0.8454
## Prevalence : 0.2534
## Detection Rate : 0.1176
## Detection Prevalence : 0.1222
## Balanced Accuracy : 0.7291
##
## 'Positive' Class : Slytherin
##
ROC for the cv model.
library(ROSE)
ROSE::roc.curve(valid_norm_2$House, knn_pred_cv_valid_3)
## Area under the curve (AUC): 0.916
Predict the padawan using the cv model.
padawan_predict_3 <- predict(knn_cv_train,
newdata = padawan_1_norm,
type = "raw")
padawan_predict_3
## [1] Slytherin
## Levels: Not Slytherin Slytherin
The probability.
padawan_predict_prob_3 <- predict(knn_cv_train,
newdata = padawan_1_norm,
type = "prob")
padawan_predict_prob_3
## Not Slytherin Slytherin
## 1 0.1818182 0.8181818
Looping through k for the best accuracy.
# Preallocate a results frame: one row per candidate k, with placeholder 1s
# that the loop below overwrites. Using a single constant avoids repeating
# the magic number 15 four times; seq_len() is the idiomatic 1..n sequence.
n_k <- 15
accuracy_df <- data.frame(k = seq_len(n_k),
                          accuracy = rep(1, n_k),
                          sensitivity = rep(1, n_k),
                          specificity = rep(1, n_k))
head(accuracy_df)
## k accuracy sensitivity specificity
## 1 1 1 1 1
## 2 2 1 1 1
## 3 3 1 1 1
## 4 4 1 1 1
## 5 5 1 1 1
## 6 6 1 1 1
set.seed(666)
# Fit a kNN model for each candidate k and record accuracy, sensitivity and
# specificity. Iterating over the rows of accuracy_df (rather than a
# hard-coded 1:15) keeps the loop in sync with the preallocated frame.
# NOTE(review): these metrics are computed on the TRAINING set, which
# favours small k (k = 1 is trivially perfect); validation-set or CV
# accuracy is a fairer selection criterion.
for (i in seq_len(nrow(accuracy_df))) {
  knn_loop_model <- caret::knn3(House ~ .,
                                data = train_norm_2,
                                k = i)
  knn_loop_pred <- predict(knn_loop_model,
                           newdata = train_norm_2[, -c(9)],
                           type = "class")
  confusion_matrix <- confusionMatrix(knn_loop_pred,
                                      as.factor(train_norm_2[, 9]),
                                      positive = "Slytherin")
  accuracy_df[i, 2] <- confusion_matrix$overall[1]  # Accuracy
  accuracy_df[i, 3] <- confusion_matrix$byClass[1]  # Sensitivity
  accuracy_df[i, 4] <- confusion_matrix$byClass[2]  # Specificity
}
head(accuracy_df, 10)
## k accuracy sensitivity specificity
## 1 1 1.0000000 1.0000000 1.0000000
## 2 2 0.9454191 0.8965517 0.9646739
## 3 3 0.9571150 0.8965517 0.9809783
## 4 4 0.9415205 0.8551724 0.9755435
## 5 5 0.9493177 0.8758621 0.9782609
## 6 6 0.9376218 0.8413793 0.9755435
## 7 7 0.9356725 0.8482759 0.9701087
## 8 8 0.9356725 0.8413793 0.9728261
## 9 9 0.9376218 0.8482759 0.9728261
## 10 10 0.9356725 0.8551724 0.9673913
And the rest is history.