Even superheroes learn magic :-)
They drink to victory because they deserve it, and they drink to defeat because they need it
Load the required libraries.
If you do not have these, install them.
Be sure to also install the dependencies.
library(caret)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.3
## Loading required package: lattice
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
Load the data and explore them.
It’s a good idea to see if they look (and feel) ok.
# Read the dataset; header = TRUE treats the first CSV row as column names.
hogwarts <- read.csv("super_heroes_hogwarts_v3a.csv", header = TRUE)
# Peek at the first 10 rows to sanity-check the load.
# Note the NAs in Race, Height and Weight -- those columns are dropped later,
# so they do not need treatment here.
head(hogwarts, 10)
## ID Name Gender Race Height Publisher
## 1 A001 A-Bomb Male Human 203 Marvel Comics
## 2 A002 Abe Sapien Male Icthyo Sapien 191 Dark Horse Comics
## 3 A004 Abomination Male Human / Radiation 203 Marvel Comics
## 4 A009 Agent 13 Female <NA> 173 Marvel Comics
## 5 A015 Alex Mercer Male Human NA Wildstorm
## 6 A016 Alex Woolsly Male <NA> NA NBC - Heroes
## 7 A024 Angel Male Vampire NA Dark Horse Comics
## 8 A025 Angel Dust Female Mutant 165 Marvel Comics
## 9 A028 Animal Man Male Human 183 DC Comics
## 10 A032 Anti-Monitor Male God / Eternal 61 DC Comics
## Alignment Weight Manipulative Resourceful Dismissive Intelligent Trusting
## 1 good 441 10 10 7 6 7
## 2 good 65 7 7 6 8 6
## 3 bad 441 6 8 1 6 3
## 4 good 61 7 7 1 9 7
## 5 bad NA 10 6 8 3 4
## 6 good NA 8 10 5 5 6
## 7 good NA 8 6 8 7 4
## 8 good 57 9 8 9 4 1
## 9 good 83 7 6 6 5 8
## 10 bad NA 7 7 7 1 9
## Loyal Stubborn Brave HouseID House STR DEX CON INT WIS CHA Level HP
## 1 7 7 9 1 Slytherin 18 11 17 12 13 11 1 7
## 2 7 6 9 1 Slytherin 16 17 10 13 15 11 8 72
## 3 3 5 2 1 Slytherin 13 14 13 10 18 15 15 135
## 4 4 6 6 1 Slytherin 15 18 16 16 17 10 14 140
## 5 4 1 8 1 Slytherin 14 17 13 12 10 11 9 72
## 6 7 7 6 1 Slytherin 14 14 11 13 12 12 1 8
## 7 1 5 2 1 Slytherin 15 17 15 18 13 18 11 88
## 8 6 5 4 1 Slytherin 8 17 12 15 17 18 1 8
## 9 3 3 2 1 Slytherin 10 17 15 18 13 14 8 56
## 10 1 6 5 1 Slytherin 8 10 11 16 12 11 7 63
# Inspect the structure: 734 rows, 26 columns, with character and
# integer/numeric types as expected.
str(hogwarts)
## 'data.frame': 734 obs. of 26 variables:
## $ ID : chr "A001" "A002" "A004" "A009" ...
## $ Name : chr "A-Bomb" "Abe Sapien" "Abomination" "Agent 13" ...
## $ Gender : chr "Male" "Male" "Male" "Female" ...
## $ Race : chr "Human" "Icthyo Sapien" "Human / Radiation" NA ...
## $ Height : num 203 191 203 173 NA NA NA 165 183 61 ...
## $ Publisher : chr "Marvel Comics" "Dark Horse Comics" "Marvel Comics" "Marvel Comics" ...
## $ Alignment : chr "good" "good" "bad" "good" ...
## $ Weight : int 441 65 441 61 NA NA NA 57 83 NA ...
## $ Manipulative: int 10 7 6 7 10 8 8 9 7 7 ...
## $ Resourceful : int 10 7 8 7 6 10 6 8 6 7 ...
## $ Dismissive : int 7 6 1 1 8 5 8 9 6 7 ...
## $ Intelligent : int 6 8 6 9 3 5 7 4 5 1 ...
## $ Trusting : int 7 6 3 7 4 6 4 1 8 9 ...
## $ Loyal : int 7 7 3 4 4 7 1 6 3 1 ...
## $ Stubborn : int 7 6 5 6 1 7 5 5 3 6 ...
## $ Brave : int 9 9 2 6 8 6 2 4 2 5 ...
## $ HouseID : int 1 1 1 1 1 1 1 1 1 1 ...
## $ House : chr "Slytherin" "Slytherin" "Slytherin" "Slytherin" ...
## $ STR : int 18 16 13 15 14 14 15 8 10 8 ...
## $ DEX : int 11 17 14 18 17 14 17 17 17 10 ...
## $ CON : int 17 10 13 16 13 11 15 12 15 11 ...
## $ INT : int 12 13 10 16 12 13 18 15 18 16 ...
## $ WIS : int 13 15 18 17 10 12 13 17 13 12 ...
## $ CHA : int 11 11 15 10 11 12 18 18 14 11 ...
## $ Level : int 1 8 15 14 9 1 11 1 8 7 ...
## $ HP : int 7 72 135 140 72 8 88 8 56 63 ...
# List the column names on their own; useful when slicing by position below.
names(hogwarts)
## [1] "ID" "Name" "Gender" "Race" "Height"
## [6] "Publisher" "Alignment" "Weight" "Manipulative" "Resourceful"
## [11] "Dismissive" "Intelligent" "Trusting" "Loyal" "Stubborn"
## [16] "Brave" "HouseID" "House" "STR" "DEX"
## [21] "CON" "INT" "WIS" "CHA" "Level"
## [26] "HP"
# Row count before any slicing.
nrow(hogwarts)
## [1] 734
# We don't need all the variables.
# Slicing the dataset for what we need can make things easier.
#
# Check the House distribution BEFORE dropping the column. The original
# code called table(hogwarts$House) *after* the slice, when House no
# longer existed, so it printed an empty table ("< table of extent 0 >").
# Run here, it shows the per-house counts instead.
table(hogwarts$House)
# Select the columns we need by name rather than by numeric position
# (-c(1:8, 17:18, 19:25)); name-based selection is robust to columns
# being added to or reordered in the source file, and it documents
# exactly which variables survive: the 8 personality predictors + HP.
keep_cols <- c("Manipulative", "Resourceful", "Dismissive", "Intelligent",
               "Trusting", "Loyal", "Stubborn", "Brave", "HP")
hogwarts <- hogwarts[, keep_cols]
names(hogwarts)
## [1] "Manipulative" "Resourceful" "Dismissive" "Intelligent" "Trusting"
## [6] "Loyal" "Stubborn" "Brave" "HP"
# t(t(...)) prints the names as a one-column matrix -- easier to read.
t(t(names(hogwarts)))
## [,1]
## [1,] "Manipulative"
## [2,] "Resourceful"
## [3,] "Dismissive"
## [4,] "Intelligent"
## [5,] "Trusting"
## [6,] "Loyal"
## [7,] "Stubborn"
## [8,] "Brave"
## [9,] "HP"
str(hogwarts)
## 'data.frame': 734 obs. of 9 variables:
## $ Manipulative: int 10 7 6 7 10 8 8 9 7 7 ...
## $ Resourceful : int 10 7 8 7 6 10 6 8 6 7 ...
## $ Dismissive : int 7 6 1 1 8 5 8 9 6 7 ...
## $ Intelligent : int 6 8 6 9 3 5 7 4 5 1 ...
## $ Trusting : int 7 6 3 7 4 6 4 1 8 9 ...
## $ Loyal : int 7 7 3 4 4 7 1 6 3 1 ...
## $ Stubborn : int 7 6 5 6 1 7 5 5 3 6 ...
## $ Brave : int 9 9 2 6 8 6 2 4 2 5 ...
## $ HP : int 7 72 135 140 72 8 88 8 56 63 ...
# Slicing drops columns, never rows: still 734 observations.
nrow(hogwarts)
## [1] 734
# First, set the seed. We'll use our favourite number :-)
set.seed(666)
# Then randomly sample the rows via their indices (i.e. row numbers).
# We will do a 70-30 split.
# floor() makes the intended sample size explicit -- sample() silently
# truncates a fractional size (0.7 * 734 = 513.8 -> 513) -- and
# seq_len() is the safe idiom for building an index vector (1:n
# misbehaves when n is 0). The RNG draws are unchanged, so the split
# is identical to the original.
train_index <- sample(seq_len(nrow(hogwarts)), floor(0.7 * nrow(hogwarts)))
# The validation set is everything NOT sampled into the training set.
valid_index <- setdiff(seq_len(nrow(hogwarts)), train_index)
# Assign the randomly selected indices to the dataset to create the
# training and validation sets.
train_df <- hogwarts[train_index, ]
valid_df <- hogwarts[valid_index, ]
# It's a good idea to check the 2 sets before continuing:
# 513 + 221 = 734, so every row landed in exactly one set.
nrow(train_df)
## [1] 513
nrow(valid_df)
## [1] 221
# Spot-check the first rows of each set. The row labels (574, 638, ...)
# are the original hogwarts row numbers, confirming the random sampling.
head(train_df)
## Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn
## 574 6 5 7 7 4 5 9
## 638 5 9 5 4 2 5 7
## 608 6 9 5 4 2 6 6
## 123 7 8 5 7 4 7 5
## 540 9 7 7 9 8 6 8
## 654 6 5 7 2 3 4 7
## Brave HP
## 574 9 72
## 638 10 120
## 608 9 98
## 123 6 63
## 540 3 40
## 654 8 54
head(valid_df)
## Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn
## 2 7 7 6 8 6 7 6
## 3 6 8 1 6 3 3 5
## 5 10 6 8 3 4 4 1
## 12 6 6 6 2 3 6 7
## 13 9 7 9 8 6 3 2
## 15 7 8 1 7 7 7 3
## Brave HP
## 2 9 72
## 3 2 135
## 5 8 72
## 12 5 30
## 13 5 84
## 15 5 140
# Both sets keep the same 9 columns with the same types.
str(train_df)
## 'data.frame': 513 obs. of 9 variables:
## $ Manipulative: int 6 5 6 7 9 6 7 1 6 9 ...
## $ Resourceful : int 5 9 9 8 7 5 4 7 7 10 ...
## $ Dismissive : int 7 5 5 5 7 7 1 1 3 5 ...
## $ Intelligent : int 7 4 4 7 9 2 6 4 3 4 ...
## $ Trusting : int 4 2 2 4 8 3 9 6 7 8 ...
## $ Loyal : int 5 5 6 7 6 4 3 8 5 5 ...
## $ Stubborn : int 9 7 6 5 8 7 8 8 3 5 ...
## $ Brave : int 9 10 9 6 3 8 7 8 2 7 ...
## $ HP : int 72 120 98 63 40 54 140 66 56 56 ...
str(valid_df)
## 'data.frame': 221 obs. of 9 variables:
## $ Manipulative: int 7 6 10 6 9 7 6 6 9 9 ...
## $ Resourceful : int 7 8 6 6 7 8 6 6 6 8 ...
## $ Dismissive : int 6 1 8 6 9 1 6 6 8 3 ...
## $ Intelligent : int 8 6 3 2 8 7 6 3 3 5 ...
## $ Trusting : int 6 3 4 3 6 7 8 6 3 6 ...
## $ Loyal : int 7 3 4 6 3 7 8 7 2 7 ...
## $ Stubborn : int 6 5 1 7 2 3 1 6 5 2 ...
## $ Brave : int 9 2 8 5 5 5 7 2 7 8 ...
## $ HP : int 72 135 72 30 84 140 140 72 21 112 ...
# Here, we will create a new record for a new padawan.
# The resultant model can be used to predict the HP for this padawan.
# These values can be different for different padawans, including yourselves :-)
# Build the one-row data frame from a named numeric vector: the scores
# (1-10 scale, matching the training predictors) stay visible in one place.
padawan_1 <- as.data.frame(as.list(c(
  Manipulative = 9,
  Resourceful  = 9,
  Dismissive   = 8,
  Intelligent  = 8,
  Trusting     = 6,
  Loyal        = 8,
  Stubborn     = 7,
  Brave        = 7
)))
padawan_1
## Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
## 1 9 9 8 8 6 8 7 7
# This is needed if predictors are on a different scale.
# In this case, this is just for illustration.
# Start each normalised set as a copy, so the outcome column (HP) is
# carried over untouched.
train_norm <- train_df
valid_norm <- valid_df
# First, create a normalising algorithm using the 8 variables in the training set.
names(train_df)
## [1] "Manipulative" "Resourceful" "Dismissive" "Intelligent" "Trusting"
## [6] "Loyal" "Stubborn" "Brave" "HP"
# Identify the predictor columns by name: everything except the outcome, HP.
predictor_cols <- setdiff(names(train_df), "HP")
# Centre and scale using statistics estimated on the training set ONLY,
# so no information leaks in from the validation set.
norm_values <- preProcess(train_df[, predictor_cols],
                          method = c("center", "scale"))
train_norm[, predictor_cols] <- predict(norm_values,
                                        train_df[, predictor_cols])
head(train_norm)
## Manipulative Resourceful Dismissive Intelligent Trusting Loyal
## 574 0.05969871 -0.4471044 0.5239251 0.5648867 -0.7995293 -0.3240011
## 638 -0.36565462 1.2487518 -0.3320417 -0.7664466 -1.6594003 -0.3240011
## 608 0.05969871 1.2487518 -0.3320417 -0.7664466 -1.6594003 0.0894630
## 123 0.48505205 0.8247877 -0.3320417 0.5648867 -0.7995293 0.5029271
## 540 1.33575871 0.4008237 0.5239251 1.4524423 0.9202129 0.0894630
## 654 0.05969871 -0.4471044 0.5239251 -1.6540022 -1.2294648 -0.7374653
## Stubborn Brave HP
## 574 1.36584012 1.35940026 72
## 638 0.49866688 1.79014289 120
## 608 0.06508025 1.35940026 98
## 123 -0.36850637 0.06717234 63
## 540 0.93225350 -1.22505557 40
## 654 0.49866688 0.92865762 54
# Then, using this normalising algorithm, predict the normalised values
# of the validation set (training means/SDs are applied, not the
# validation set's own).
valid_norm[, predictor_cols] <- predict(norm_values,
                                        valid_df[, predictor_cols])
head(valid_norm)
## Manipulative Resourceful Dismissive Intelligent Trusting Loyal
## 2 0.48505205 0.40082370 0.09594171 1.0086645 0.06034183 0.5029271
## 3 0.05969871 0.82478775 -2.04397548 0.1211089 -1.22946479 -1.1509294
## 5 1.76111204 -0.02314034 0.95190858 -1.2102244 -0.79952925 -0.7374653
## 12 0.05969871 -0.02314034 0.09594171 -1.6540022 -1.22946479 0.0894630
## 13 1.33575871 0.40082370 1.37989202 1.0086645 0.06034183 -1.1509294
## 15 0.48505205 0.82478775 -2.04397548 0.5648867 0.49027737 0.5029271
## Stubborn Brave HP
## 2 0.06508025 1.3594003 72
## 3 -0.36850637 -1.6557982 135
## 5 -2.10285287 0.9286576 72
## 12 0.49866688 -0.3635703 30
## 13 -1.66926624 -0.3635703 84
## 15 -1.23567962 -0.3635703 140
# Finally, predict the normalised values for the new padawan (i.e. the new record).
padawan_1_norm <- predict(norm_values, padawan_1)
padawan_1_norm
## Manipulative Resourceful Dismissive Intelligent Trusting Loyal
## 1 1.335759 1.248752 0.9519086 1.008665 0.06034183 0.9163913
## Stubborn Brave
## 1 0.4986669 0.497915
# Train the kNN regression model using k = 5.
# Other values of k can be used too.
k_neighbours <- 5
knn_model <- caret::knnreg(HP ~ ., data = train_norm, k = k_neighbours)
knn_model
## 5-nearest neighbor regression model
# The prediction on the training set.
predict_train <- predict(knn_model, train_norm)
# accuracy() is from the forecast package: ME/RMSE/MAE/MPE/MAPE of the
# training-set predictions against the actual HP values.
accuracy(predict_train, train_norm$HP)
## ME RMSE MAE MPE MAPE
## Test set 0.1732943 32.74151 26.86764 -57.6088 83.27205
# Plot predicted vs actual HP for the training set.
# Reuse predict_train (already computed above) instead of calling
# predict(knn_model, train_norm) a second time on the same data.
plot(train_norm$HP, predict_train,
     xlab = "Actual HP",
     ylab = "Predicted HP",
     main = "Predicted vs Actual HP (Training)",
     pch = 19,
     col = "green",
     cex = 1.2,
     las = 1,
     bty = "l")
grid()
# The prediction on the validation set.
predict_valid <- predict(knn_model, valid_norm)
# Validation error is, as expected, somewhat worse than training error.
accuracy(predict_valid, valid_norm$HP)
## ME RMSE MAE MPE MAPE
## Test set -4.350226 39.45858 31.63529 -78.05783 106.0062
# Plot predicted vs actual HP for the validation set, reusing
# predict_valid instead of recomputing predict(knn_model, valid_norm).
plot(valid_norm$HP, predict_valid,
     xlab = "Actual HP",
     ylab = "Predicted HP",
     main = "Predicted vs Actual HP (Validation)",
     pch = 19,
     col = "blue",
     cex = 1.2,
     las = 1,
     bty = "l")
grid()
# Predict the HP of the new padawan from its normalised record.
knn_model_padawan_pred <- predict(knn_model, newdata = padawan_1_norm)
knn_model_padawan_pred
## [1] 61.6
Using FNN
# Stack the normalised training predictors (column 9, HP, dropped to
# match padawan_1_norm's 8 columns) with the padawan record, so the
# padawan becomes the LAST row (row 514) of the combined data.
train_norm_padawan_norm <- rbind(train_norm[, -c(9)], padawan_1_norm)
# The padawan shows up with row label "12" (data.frame row-name
# de-duplication) but sits at position 514.
tail(train_norm_padawan_norm)
## Manipulative Resourceful Dismissive Intelligent Trusting Loyal
## 147 1.7611120 1.2487518 0.09594171 1.4524423 -1.65940033 -0.3240011
## 130 0.9104054 0.8247877 0.52392514 -0.3226688 0.49027737 1.3298554
## 185 0.9104054 0.8247877 -1.61599204 -1.2102244 0.06034183 0.9163913
## 154 0.4850520 1.2487518 0.52392514 -0.7664466 0.49027737 -1.1509294
## 681 0.4850520 -1.2950325 0.09594171 -0.7664466 -1.22946479 1.3298554
## 12 1.3357587 1.2487518 0.95190858 1.0086645 0.06034183 0.9163913
## Stubborn Brave
## 147 1.3658401 -1.2250556
## 130 0.9322535 -0.7943129
## 185 0.4986669 -1.6557982
## 154 0.4986669 1.3594003
## 681 1.7994267 1.7901429
## 12 0.4986669 0.4979150
# 513 training rows + 1 padawan = 514.
nrow(train_norm_padawan_norm)
## [1] 514
Get distance from the last row, which is padawan_1_norm.
library(FNN)
# get.knn() finds, for every row, the indices of its k nearest rows
# (a row is never its own neighbour).
knn_numeric_model_dist <- get.knn(train_norm_padawan_norm , k = 5)
# knn_numeric_model_dist
# knn_numeric_model_dist$nn.index
# The last row of nn.index corresponds to padawan_1_norm (position 514):
# these are the positions of its 5 nearest neighbours in the combined data.
tail(knn_numeric_model_dist$nn.index, 1)
## [,1] [,2] [,3] [,4] [,5]
## [514,] 359 58 215 54 323
Identify the nearest neighbours.
# k nearest neighbours
# Positional indices of the padawan's 5 nearest neighbours. They index
# the combined data, whose first 513 positions line up one-to-one with
# train_norm, so they can be used on train_norm directly below.
array_elements <- tail(knn_numeric_model_dist$nn.index, 1)
array_elements
## [,1] [,2] [,3] [,4] [,5]
## [514,] 359 58 215 54 323
# Look up those neighbour rows (including their HP) in the training set.
# Note the printed labels (103, 1, ...) are the original row names, not
# the positions used for the lookup.
train_norm[array_elements, ]
## Manipulative Resourceful Dismissive Intelligent Trusting Loyal
## 103 1.335759 0.8247877 0.09594171 1.0086645 0.06034183 0.9163913
## 1 1.761112 1.6727158 0.52392514 0.1211089 0.49027737 0.5029271
## 157 1.335759 1.2487518 1.37989202 -0.3226688 -0.36959371 1.3298554
## 136 1.335759 0.8247877 0.52392514 -0.3226688 -0.36959371 0.9163913
## 191 0.485052 0.4008237 0.52392514 1.4524423 0.06034183 0.9163913
## Stubborn Brave HP
## 103 -0.3685064 -0.36357030 30
## 1 0.4986669 1.35940026 7
## 157 0.9322535 0.49791498 150
## 136 0.4986669 0.06717234 81
## 191 0.4986669 1.35940026 40
Check.
# The HP values of the 5 nearest neighbours.
train_norm[array_elements, ]$HP
## [1] 30 7 150 81 40
# Their mean matches knn_model_padawan_pred (61.6), confirming that
# knnreg predicts by averaging the HP of the k nearest neighbours.
mean(train_norm[array_elements, ]$HP)
## [1] 61.6