Every rose has its thorn
Just like every night has its dawn
And every cowboy sings a sad, sad song
Every rose has its thorn
# Load the Pokemon dataset into a data.frame. Under R >= 4.0 character
# columns are read as character (stringsAsFactors = FALSE is the default).
pokemon <- read.csv("pokemon_2.csv", header = TRUE)
Explore the data.
head(pokemon)
## abilities against_bug against_dark against_dragon
## 1 ['Overgrow', 'Chlorophyll'] 1.00 1 1
## 2 ['Overgrow', 'Chlorophyll'] 1.00 1 1
## 3 ['Overgrow', 'Chlorophyll'] 1.00 1 1
## 4 ['Blaze', 'Solar Power'] 0.50 1 1
## 5 ['Blaze', 'Solar Power'] 0.50 1 1
## 6 ['Blaze', 'Solar Power'] 0.25 1 1
## against_electric against_fairy against_fight against_fire against_flying
## 1 0.5 0.5 0.5 2.0 2
## 2 0.5 0.5 0.5 2.0 2
## 3 0.5 0.5 0.5 2.0 2
## 4 1.0 0.5 1.0 0.5 1
## 5 1.0 0.5 1.0 0.5 1
## 6 2.0 0.5 0.5 0.5 1
## against_ghost against_grass against_ground against_ice against_normal
## 1 1 0.25 1 2.0 1
## 2 1 0.25 1 2.0 1
## 3 1 0.25 1 2.0 1
## 4 1 0.50 2 0.5 1
## 5 1 0.50 2 0.5 1
## 6 1 0.25 0 1.0 1
## against_poison against_psychic against_rock against_steel against_water
## 1 1 2 1 1.0 0.5
## 2 1 2 1 1.0 0.5
## 3 1 2 1 1.0 0.5
## 4 1 1 2 0.5 2.0
## 5 1 1 2 0.5 2.0
## 6 1 1 4 0.5 2.0
## attack base_egg_steps base_happiness base_total capture_rate
## 1 49 5120 70 318 45
## 2 62 5120 70 405 45
## 3 100 5120 70 625 45
## 4 52 5120 70 309 45
## 5 64 5120 70 405 45
## 6 104 5120 70 634 45
## classfication defence experience_growth height_m hp
## 1 Seed Pok\xed\xa9mon 49 1059860 0.7 45
## 2 Seed Pok\xed\xa9mon 63 1059860 1.0 60
## 3 Seed Pok\xed\xa9mon 123 1059860 2.0 80
## 4 Lizard Pok\xed\xa9mon 43 1059860 0.6 39
## 5 Flame Pok\xed\xa9mon 58 1059860 1.1 58
## 6 Flame Pok\xed\xa9mon 78 1059860 1.7 78
## japanese_name name percentage_male
## 1 Fushigidane܀\xb4܉\x87܉\xac܀\xf3܀\x8d Bulbasaur 88.1
## 2 Fushigisou܀\xb4܉\x87܉\xac܉_܉_ Ivysaur 88.1
## 3 Fushigibana܀\xb4܉\x87܉\xac܀\x90܀_ Venusaur 88.1
## 4 Hitokage܀\xcd܀\x9a܉\x82܉_ Charmander 88.1
## 5 Lizardo܀\xc8܉_܀_܀\x8a Charmeleon 88.1
## 6 Lizardon܀\xc8܉_܀_܀\x8a܀_ Charizard 88.1
## pokedex_number sp_attack sp_defence speed type1 type2 weight_kg generation
## 1 1 65 65 45 grass poison 6.9 1
## 2 2 80 80 60 grass poison 13.0 1
## 3 3 122 120 80 grass poison 100.0 1
## 4 4 60 50 65 fire 8.5 1
## 5 5 80 65 80 fire 19.0 1
## 6 6 159 115 100 fire flying 90.5 1
## is_legendary train
## 1 0 1
## 2 0 1
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
str(pokemon)
## 'data.frame': 801 obs. of 42 variables:
## $ abilities : chr "['Overgrow', 'Chlorophyll']" "['Overgrow', 'Chlorophyll']" "['Overgrow', 'Chlorophyll']" "['Blaze', 'Solar Power']" ...
## $ against_bug : num 1 1 1 0.5 0.5 0.25 1 1 1 1 ...
## $ against_dark : num 1 1 1 1 1 1 1 1 1 1 ...
## $ against_dragon : num 1 1 1 1 1 1 1 1 1 1 ...
## $ against_electric : num 0.5 0.5 0.5 1 1 2 2 2 2 1 ...
## $ against_fairy : num 0.5 0.5 0.5 0.5 0.5 0.5 1 1 1 1 ...
## $ against_fight : num 0.5 0.5 0.5 1 1 0.5 1 1 1 0.5 ...
## $ against_fire : num 2 2 2 0.5 0.5 0.5 0.5 0.5 0.5 2 ...
## $ against_flying : num 2 2 2 1 1 1 1 1 1 2 ...
## $ against_ghost : num 1 1 1 1 1 1 1 1 1 1 ...
## $ against_grass : num 0.25 0.25 0.25 0.5 0.5 0.25 2 2 2 0.5 ...
## $ against_ground : num 1 1 1 2 2 0 1 1 1 0.5 ...
## $ against_ice : num 2 2 2 0.5 0.5 1 0.5 0.5 0.5 1 ...
## $ against_normal : num 1 1 1 1 1 1 1 1 1 1 ...
## $ against_poison : num 1 1 1 1 1 1 1 1 1 1 ...
## $ against_psychic : num 2 2 2 1 1 1 1 1 1 1 ...
## $ against_rock : num 1 1 1 2 2 4 1 1 1 2 ...
## $ against_steel : num 1 1 1 0.5 0.5 0.5 0.5 0.5 0.5 1 ...
## $ against_water : num 0.5 0.5 0.5 2 2 2 0.5 0.5 0.5 1 ...
## $ attack : int 49 62 100 52 64 104 48 63 103 30 ...
## $ base_egg_steps : int 5120 5120 5120 5120 5120 5120 5120 5120 5120 3840 ...
## $ base_happiness : int 70 70 70 70 70 70 70 70 70 70 ...
## $ base_total : int 318 405 625 309 405 634 314 405 630 195 ...
## $ capture_rate : chr "45" "45" "45" "45" ...
## $ classfication : chr "Seed Pok\xed\xa9mon" "Seed Pok\xed\xa9mon" "Seed Pok\xed\xa9mon" "Lizard Pok\xed\xa9mon" ...
## $ defence : int 49 63 123 43 58 78 65 80 120 35 ...
## $ experience_growth: int 1059860 1059860 1059860 1059860 1059860 1059860 1059860 1059860 1059860 1000000 ...
## $ height_m : num 0.7 1 2 0.6 1.1 1.7 0.5 1 1.6 0.3 ...
## $ hp : int 45 60 80 39 58 78 44 59 79 45 ...
## $ japanese_name : chr "Fushigidane܀\xb4܉\x87܉\xac܀\xf3܀\x8d" "Fushigisou܀\xb4܉\x87܉\xac܉_܉_" "Fushigibana܀\xb4܉\x87܉\xac܀\x90܀_" "Hitokage܀\xcd܀\x9a܉\x82܉_" ...
## $ name : chr "Bulbasaur" "Ivysaur" "Venusaur" "Charmander" ...
## $ percentage_male : num 88.1 88.1 88.1 88.1 88.1 88.1 88.1 88.1 88.1 50 ...
## $ pokedex_number : int 1 2 3 4 5 6 7 8 9 10 ...
## $ sp_attack : int 65 80 122 60 80 159 50 65 135 20 ...
## $ sp_defence : int 65 80 120 50 65 115 64 80 115 20 ...
## $ speed : int 45 60 80 65 80 100 43 58 78 45 ...
## $ type1 : chr "grass" "grass" "grass" "fire" ...
## $ type2 : chr "poison" "poison" "poison" "" ...
## $ weight_kg : num 6.9 13 100 8.5 19 90.5 9 22.5 85.5 2.9 ...
## $ generation : int 1 1 1 1 1 1 1 1 1 1 ...
## $ is_legendary : int 0 0 0 0 0 0 0 0 0 0 ...
## $ train : int 1 1 0 0 0 0 0 0 0 0 ...
summary(pokemon)
## abilities against_bug against_dark against_dragon
## Length:801 Min. :0.2500 Min. :0.250 Min. :0.0000
## Class :character 1st Qu.:0.5000 1st Qu.:1.000 1st Qu.:1.0000
## Mode :character Median :1.0000 Median :1.000 Median :1.0000
## Mean :0.9963 Mean :1.057 Mean :0.9688
## 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:1.0000
## Max. :4.0000 Max. :4.000 Max. :2.0000
##
## against_electric against_fairy against_fight against_fire
## Min. :0.000 Min. :0.250 Min. :0.000 Min. :0.250
## 1st Qu.:0.500 1st Qu.:1.000 1st Qu.:0.500 1st Qu.:0.500
## Median :1.000 Median :1.000 Median :1.000 Median :1.000
## Mean :1.074 Mean :1.069 Mean :1.066 Mean :1.135
## 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:2.000
## Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000
##
## against_flying against_ghost against_grass against_ground
## Min. :0.250 Min. :0.000 Min. :0.250 Min. :0.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.500 1st Qu.:1.000
## Median :1.000 Median :1.000 Median :1.000 Median :1.000
## Mean :1.193 Mean :0.985 Mean :1.034 Mean :1.098
## 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:1.000
## Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000
##
## against_ice against_normal against_poison against_psychic
## Min. :0.250 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.500 1st Qu.:1.000 1st Qu.:0.5000 1st Qu.:1.000
## Median :1.000 Median :1.000 Median :1.0000 Median :1.000
## Mean :1.208 Mean :0.887 Mean :0.9753 Mean :1.005
## 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:1.000
## Max. :4.000 Max. :1.000 Max. :4.0000 Max. :4.000
##
## against_rock against_steel against_water attack
## Min. :0.25 Min. :0.2500 Min. :0.250 Min. : 5.00
## 1st Qu.:1.00 1st Qu.:0.5000 1st Qu.:0.500 1st Qu.: 55.00
## Median :1.00 Median :1.0000 Median :1.000 Median : 75.00
## Mean :1.25 Mean :0.9835 Mean :1.058 Mean : 77.86
## 3rd Qu.:2.00 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:100.00
## Max. :4.00 Max. :4.0000 Max. :4.000 Max. :185.00
##
## base_egg_steps base_happiness base_total capture_rate
## Min. : 1280 Min. : 0.00 Min. :180.0 Length:801
## 1st Qu.: 5120 1st Qu.: 70.00 1st Qu.:320.0 Class :character
## Median : 5120 Median : 70.00 Median :435.0 Mode :character
## Mean : 7191 Mean : 65.36 Mean :428.4
## 3rd Qu.: 6400 3rd Qu.: 70.00 3rd Qu.:505.0
## Max. :30720 Max. :140.00 Max. :780.0
##
## classfication defence experience_growth height_m
## Length:801 Min. : 5.00 Min. : 600000 Min. : 0.100
## Class :character 1st Qu.: 50.00 1st Qu.:1000000 1st Qu.: 0.600
## Mode :character Median : 70.00 Median :1000000 Median : 1.000
## Mean : 73.01 Mean :1054996 Mean : 1.164
## 3rd Qu.: 90.00 3rd Qu.:1059860 3rd Qu.: 1.500
## Max. :230.00 Max. :1640000 Max. :14.500
## NA's :20
## hp japanese_name name percentage_male
## Min. : 1.00 Length:801 Length:801 Min. : 0.00
## 1st Qu.: 50.00 Class :character Class :character 1st Qu.: 50.00
## Median : 65.00 Mode :character Mode :character Median : 50.00
## Mean : 68.96 Mean : 55.16
## 3rd Qu.: 80.00 3rd Qu.: 50.00
## Max. :255.00 Max. :100.00
## NA's :98
## pokedex_number sp_attack sp_defence speed
## Min. : 1 Min. : 10.00 Min. : 20.00 Min. : 5.00
## 1st Qu.:201 1st Qu.: 45.00 1st Qu.: 50.00 1st Qu.: 45.00
## Median :401 Median : 65.00 Median : 66.00 Median : 65.00
## Mean :401 Mean : 71.31 Mean : 70.91 Mean : 66.33
## 3rd Qu.:601 3rd Qu.: 91.00 3rd Qu.: 90.00 3rd Qu.: 85.00
## Max. :801 Max. :194.00 Max. :230.00 Max. :180.00
##
## type1 type2 weight_kg generation
## Length:801 Length:801 Min. : 0.10 Min. :1.00
## Class :character Class :character 1st Qu.: 9.00 1st Qu.:2.00
## Mode :character Mode :character Median : 27.30 Median :4.00
## Mean : 61.38 Mean :3.69
## 3rd Qu.: 64.80 3rd Qu.:5.00
## Max. :999.90 Max. :7.00
## NA's :20
## is_legendary train
## Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000
## Mean :0.08739 Mean :0.09988
## 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000
##
nrow(pokemon)
## [1] 801
table(pokemon$is_legendary)
##
## 0 1
## 731 70
names(pokemon)
## [1] "abilities" "against_bug" "against_dark"
## [4] "against_dragon" "against_electric" "against_fairy"
## [7] "against_fight" "against_fire" "against_flying"
## [10] "against_ghost" "against_grass" "against_ground"
## [13] "against_ice" "against_normal" "against_poison"
## [16] "against_psychic" "against_rock" "against_steel"
## [19] "against_water" "attack" "base_egg_steps"
## [22] "base_happiness" "base_total" "capture_rate"
## [25] "classfication" "defence" "experience_growth"
## [28] "height_m" "hp" "japanese_name"
## [31] "name" "percentage_male" "pokedex_number"
## [34] "sp_attack" "sp_defence" "speed"
## [37] "type1" "type2" "weight_kg"
## [40] "generation" "is_legendary" "train"
# Keep only the predictors and the target. Selecting columns by name is
# safer than the original positional indices c(34:37, 41), which silently
# select the wrong columns if the CSV's column order ever changes.
pokemon_df <- pokemon[, c("sp_attack", "sp_defence", "speed",
                          "type1", "is_legendary")]
head(pokemon_df)
## sp_attack sp_defence speed type1 is_legendary
## 1 65 65 45 grass 0
## 2 80 80 60 grass 0
## 3 122 120 80 grass 0
## 4 60 50 65 fire 0
## 5 80 65 80 fire 0
## 6 159 115 100 fire 0
nrow(pokemon_df)
## [1] 801
names(pokemon_df)
## [1] "sp_attack" "sp_defence" "speed" "type1" "is_legendary"
table(pokemon_df$is_legendary)
##
## 0 1
## 731 70
Our favourite seed :-)
# Fix the RNG state so the train/validation split is reproducible.
set.seed(666)
Training-validation split.
Create the indices for the split. This samples the row indices used to split the data into training and validation sets.
# Sample 60% of the row indices for training; the remaining rows form the
# validation set. seq_len(nrow(...)) is the safe idiom: 1:nrow(df) would
# yield c(1, 0) if the data frame were ever empty.
train_index <- sample(seq_len(nrow(pokemon_df)), 0.6 * nrow(pokemon_df))
valid_index <- setdiff(seq_len(nrow(pokemon_df)), train_index)
Using the indices, create the training and validation sets. This is similar in principle to splitting a data frame by row.
# Materialise the training and validation sets from the sampled row indices.
train_df <- pokemon_df[train_index, ]
valid_df <- pokemon_df[valid_index, ]
It is a good habit to check after splitting.
nrow(train_df)
## [1] 480
nrow(valid_df)
## [1] 321
table(train_df$is_legendary)
##
## 0 1
## 442 38
names(train_df)
## [1] "sp_attack" "sp_defence" "speed" "type1" "is_legendary"
Build the tree for the training set. This amounts to training the data. Other classification algorithms work too.
library(rpart)
# Fit a classification tree on the (unweighted) training data. The depth
# cap is passed through rpart.control() explicitly; rpart() forwards a
# bare maxdepth argument to the same control list, so this is equivalent.
class_tr_1 <- rpart(
  is_legendary ~ sp_attack + sp_defence + speed + type1,
  data = train_df,
  method = "class",
  control = rpart.control(maxdepth = 3)
)
Plot the tree. Try different settings to tweak the format.
library(rpart.plot)
rpart.plot(class_tr_1, type = 5)
# Predicted class labels on the training data (majority class of the leaf).
class_tr_1_train_predict <- predict(class_tr_1, train_df,
type = "class")
summary(class_tr_1_train_predict)
## 0 1
## 461 19
class_tr_1_train_predict_prob <- predict(class_tr_1, train_df,
type = "prob")
head(class_tr_1_train_predict_prob)
## 0 1
## 574 0.9627907 0.0372093
## 638 0.9627907 0.0372093
## 608 0.9627907 0.0372093
## 123 0.9627907 0.0372093
## 540 0.9627907 0.0372093
## 654 0.9627907 0.0372093
# Predicted class labels on the held-out validation data.
class_tr_1_valid_predict <- predict(class_tr_1, valid_df,
type = "class")
summary(class_tr_1_valid_predict)
## 0 1
## 304 17
class_tr_1_valid_predict_prob <- predict(class_tr_1, valid_df,
type = "prob")
head(class_tr_1_valid_predict_prob)
## 0 1
## 2 0.9627907 0.0372093
## 3 0.6666667 0.3333333
## 4 0.9627907 0.0372093
## 5 0.9627907 0.0372093
## 12 0.9627907 0.0372093
## 13 0.9627907 0.0372093
Evaluate the model.
Convert to factor for the confusion matrix.
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# confusionMatrix() expects factors with matching levels, so coerce both
# the predictions and the observed target first.
class_tr_1_train_predict <- as.factor(class_tr_1_train_predict)
train_df$is_legendary <- as.factor(train_df$is_legendary)
# positive = "1" makes "legendary" the class of interest; otherwise
# sensitivity/precision would be reported for the majority class "0".
confusionMatrix(class_tr_1_train_predict, train_df$is_legendary, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 439 22
## 1 3 16
##
## Accuracy : 0.9479
## 95% CI : (0.9241, 0.966)
## No Information Rate : 0.9208
## P-Value [Acc > NIR] : 0.0134929
##
## Kappa : 0.537
##
## Mcnemar's Test P-Value : 0.0003182
##
## Sensitivity : 0.42105
## Specificity : 0.99321
## Pos Pred Value : 0.84211
## Neg Pred Value : 0.95228
## Prevalence : 0.07917
## Detection Rate : 0.03333
## Detection Prevalence : 0.03958
## Balanced Accuracy : 0.70713
##
## 'Positive' Class : 1
##
# Training-set confusion matrix with "legendary" (1) as the positive class.
con_mat_train <- confusionMatrix(class_tr_1_train_predict, train_df$is_legendary,
                                 positive = "1")
# Index byClass by name rather than fragile numeric position, and use
# caret's built-in F1 (harmonic mean of sensitivity and precision)
# instead of hand-rolling the formula.
sensitivity_train <- con_mat_train$byClass["Sensitivity"]
precision_train <- con_mat_train$byClass["Pos Pred Value"]
f1_train <- con_mat_train$byClass["F1"]
paste("The F1 score for training is", f1_train)
## [1] "The F1 score for training is 0.56140350877193"
class_tr_1_valid_predict <- as.factor(class_tr_1_valid_predict)
valid_df$is_legendary <- as.factor(valid_df$is_legendary)
Do the confusion matrix.
It is important to specify the positive class. Otherwise, sensitivity, specificity and the other class-specific metrics will be reported for the wrong (majority) class.
confusionMatrix(class_tr_1_valid_predict, valid_df$is_legendary, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 281 23
## 1 8 9
##
## Accuracy : 0.9034
## 95% CI : (0.8657, 0.9334)
## No Information Rate : 0.9003
## P-Value [Acc > NIR] : 0.47276
##
## Kappa : 0.3203
##
## Mcnemar's Test P-Value : 0.01192
##
## Sensitivity : 0.28125
## Specificity : 0.97232
## Pos Pred Value : 0.52941
## Neg Pred Value : 0.92434
## Prevalence : 0.09969
## Detection Rate : 0.02804
## Detection Prevalence : 0.05296
## Balanced Accuracy : 0.62678
##
## 'Positive' Class : 1
##
# Validation-set confusion matrix with "legendary" (1) as the positive class.
con_mat_valid <- confusionMatrix(class_tr_1_valid_predict,
                                 valid_df$is_legendary, positive = "1")
# Index byClass by name (robust to caret reordering) and use the
# built-in F1 rather than recomputing it by hand.
sensitivity_valid <- con_mat_valid$byClass["Sensitivity"]
precision_valid <- con_mat_valid$byClass["Pos Pred Value"]
f1_valid <- con_mat_valid$byClass["F1"]
paste("The F1 score for validation is", f1_valid)
## [1] "The F1 score for validation is 0.36734693877551"
Plot the ROC curve.
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# ROC curve on the validation set, using the predicted probability of the
# positive class (column 2 of the probability matrix) as the score.
roc_curve <- pROC::roc(response = valid_df$is_legendary,
predictor = class_tr_1_valid_predict_prob[, 2])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_curve$auc
## Area under the curve: 0.5854
plot(roc_curve)
More performance evaluation.
library(modelplotr)
## Package modelplotr loaded! Happy model plotting!
scores_and_ntiles <- prepare_scores_and_ntiles(datasets = list("valid_df"),
dataset_labels = list("Validation data"),
models = list("class_tr_1"),
model_labels = list("Classification Tree"),
target_column = "is_legendary",
ntiles = 100)
## Warning: `select_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `select()` instead.
## ℹ The deprecated feature was likely used in the modelplotr package.
## Please report the issue at <https://github.com/jurrr/modelplotr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## ... scoring caret model "class_tr_1" on dataset "valid_df".
## Data preparation step 1 succeeded! Dataframe created.
plot_input <- plotting_scope(prepared_input = scores_and_ntiles)
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## ℹ The deprecated feature was likely used in the modelplotr package.
## Please report the issue at <https://github.com/jurrr/modelplotr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Data preparation step 2 succeeded! Dataframe created.
## "prepared_input" aggregated...
## Data preparation step 3 succeeded! Dataframe created.
##
## No comparison specified, default values are used.
##
## Single evaluation line will be plotted: Target value "1" plotted for dataset "Validation data" and model "Classification Tree.
## "
## -> To compare models, specify: scope = "compare_models"
## -> To compare datasets, specify: scope = "compare_datasets"
## -> To compare target classes, specify: scope = "compare_targetclasses"
## -> To plot one line, do not specify scope or specify scope = "no_comparison".
Cumulative gains for decision tree.
plot_cumgains(data = plot_input,
custom_line_colors = "#1F8723")
Cumulative lift for decision tree.
plot_cumlift(data = plot_input,
custom_line_colors = "#1F3387")
Now, try the same model using a weighted (class-balanced) sample. This can be done using the ROSE package.
library(ROSE)
## Loaded ROSE 0.0-4
Create the weighted training df.
Factorise categorical variables for ROSE to work.
names(train_df)
## [1] "sp_attack" "sp_defence" "speed" "type1" "is_legendary"
# Newer versions of the ROSE package require categorical predictors to be
# factors, so coerce type1 in both sets before resampling.
train_df$type1 <- as.factor(train_df$type1)
valid_df$type1 <- as.factor(valid_df$type1)
# Generate a synthetic, roughly class-balanced training set with ROSE;
# the seed fixes the synthetic sample for reproducibility.
train_df_rose <- ROSE(is_legendary ~ sp_attack + sp_defence +
speed + type1,
data = train_df, seed = 666)$data
table(train_df_rose$is_legendary)
##
## 0 1
## 231 249
This is the same decision tree, except it uses the weighted training data.
# Same classification tree specification as class_tr_1, but trained on
# the ROSE-balanced data. maxdepth is supplied via rpart.control(); rpart()
# forwards a bare maxdepth argument to the same control list, so this is
# equivalent.
class_tr_2 <- rpart(
  is_legendary ~ sp_attack + sp_defence + speed + type1,
  data = train_df_rose,
  method = "class",
  control = rpart.control(maxdepth = 3)
)
rpart.plot(class_tr_2, type = 5)
Compute the predictions using model from the weighted training data.
class_tr_2_train_predict <- predict(class_tr_2, train_df_rose,
type = "class")
summary(class_tr_2_train_predict)
## 0 1
## 215 265
class_tr_2_train_predict_prob <- predict(class_tr_2, train_df_rose,
type = "prob")
head(class_tr_2_train_predict_prob)
## 0 1
## 1 0.7692308 0.23076923
## 2 0.9281437 0.07185629
## 3 0.9281437 0.07185629
## 4 0.9281437 0.07185629
## 5 0.9281437 0.07185629
## 6 0.9281437 0.07185629
Use the new model generated from training the weighted training data.
class_tr_2_valid_predict <- predict(class_tr_2, valid_df,
type = "class")
summary(class_tr_2_valid_predict)
## 0 1
## 221 100
class_tr_2_valid_predict_prob <- predict(class_tr_2, valid_df,
type = "prob")
head(class_tr_2_valid_predict_prob)
## 0 1
## 2 0.9281437 0.07185629
## 3 0.1167513 0.88324873
## 4 0.9281437 0.07185629
## 5 0.9281437 0.07185629
## 12 0.7692308 0.23076923
## 13 0.9281437 0.07185629
Convert to factor for the confusion matrix. Then generate the confusion matrix.
class_tr_2_train_predict <- as.factor(class_tr_2_train_predict)
train_df_rose$is_legendary <- as.factor(train_df_rose$is_legendary)
confusionMatrix(class_tr_2_train_predict, train_df_rose$is_legendary, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 192 23
## 1 39 226
##
## Accuracy : 0.8708
## 95% CI : (0.8375, 0.8995)
## No Information Rate : 0.5188
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7407
##
## Mcnemar's Test P-Value : 0.05678
##
## Sensitivity : 0.9076
## Specificity : 0.8312
## Pos Pred Value : 0.8528
## Neg Pred Value : 0.8930
## Prevalence : 0.5188
## Detection Rate : 0.4708
## Detection Prevalence : 0.5521
## Balanced Accuracy : 0.8694
##
## 'Positive' Class : 1
##
# Confusion matrix for the balanced-data tree on its training set.
con_mat_train_balanced <- confusionMatrix(class_tr_2_train_predict,
                                          train_df_rose$is_legendary,
                                          positive = "1")
# Index byClass by name and use caret's built-in F1 score.
sensitivity_train <- con_mat_train_balanced$byClass["Sensitivity"]
precision_train <- con_mat_train_balanced$byClass["Pos Pred Value"]
f1_train <- con_mat_train_balanced$byClass["F1"]
paste("The F1 score for training (balanced data) is", f1_train)
## [1] "The F1 score for training (balanced data) is 0.879377431906615"
Convert to factor for the confusion matrix.
# Coerce predictions and target to factors for confusionMatrix();
# positive = "1" keeps "legendary" as the class of interest.
class_tr_2_valid_predict <- as.factor(class_tr_2_valid_predict)
valid_df$is_legendary <- as.factor(valid_df$is_legendary)
confusionMatrix(class_tr_2_valid_predict, valid_df$is_legendary, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 210 11
## 1 79 21
##
## Accuracy : 0.7196
## 95% CI : (0.6671, 0.7681)
## No Information Rate : 0.9003
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1969
##
## Mcnemar's Test P-Value : 1.636e-12
##
## Sensitivity : 0.65625
## Specificity : 0.72664
## Pos Pred Value : 0.21000
## Neg Pred Value : 0.95023
## Prevalence : 0.09969
## Detection Rate : 0.06542
## Detection Prevalence : 0.31153
## Balanced Accuracy : 0.69145
##
## 'Positive' Class : 1
##
# Confusion matrix for the balanced-data tree on the validation set.
con_mat_valid_balanced <- confusionMatrix(class_tr_2_valid_predict,
                                          valid_df$is_legendary, positive = "1")
# Index byClass by name and use caret's built-in F1 score.
sensitivity_valid <- con_mat_valid_balanced$byClass["Sensitivity"]
precision_valid <- con_mat_valid_balanced$byClass["Pos Pred Value"]
f1_valid <- con_mat_valid_balanced$byClass["F1"]
paste("The F1 score for validation (balanced data) is", f1_valid)
## [1] "The F1 score for validation (balanced data) is 0.318181818181818"
Now, we can see if the results from the weighted training set are better.
Plot the ROC curve for the decision tree using weighted training data. The AUC is better.
roc_Curve_weighted_dt <- pROC::roc(response = valid_df$is_legendary,
predictor = class_tr_2_valid_predict_prob[, 2])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_Curve_weighted_dt$auc
## Area under the curve: 0.7879
plot(roc_Curve_weighted_dt)
More performance evaluation.
Prepare the data for evaluation
library(modelplotr)
scores_and_ntiles <- prepare_scores_and_ntiles(datasets = list("valid_df"),
dataset_labels = list("Validation data"),
models = list("class_tr_2"),
model_labels = list("Classification Tree (Balanced)"),
target_column = "is_legendary",
ntiles = 100)
## ... scoring caret model "class_tr_2" on dataset "valid_df".
## Data preparation step 1 succeeded! Dataframe created.
plot_input <- plotting_scope(prepared_input = scores_and_ntiles)
## Data preparation step 2 succeeded! Dataframe created.
## "prepared_input" aggregated...
## Data preparation step 3 succeeded! Dataframe created.
##
## No comparison specified, default values are used.
##
## Single evaluation line will be plotted: Target value "1" plotted for dataset "Validation data" and model "Classification Tree (Balanced).
## "
## -> To compare models, specify: scope = "compare_models"
## -> To compare datasets, specify: scope = "compare_datasets"
## -> To compare target classes, specify: scope = "compare_targetclasses"
## -> To plot one line, do not specify scope or specify scope = "no_comparison".
Cumulative gains for decision tree using weighted training data.
plot_cumgains(data = plot_input,
custom_line_colors = "#1F8723")
Cumulative lift for decision tree using weighted training data.
plot_cumlift(data = plot_input,
custom_line_colors = "#1F3387")
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Random forest on the balanced training data with deliberately small,
# regularised trees: 150 trees, 2 candidate variables per split, at most
# 5 terminal nodes, minimum node size of 15. importance = TRUE stores
# variable importance for later inspection.
class_tr_rf <- randomForest(is_legendary ~ sp_attack +
sp_defence + speed + type1,
data = train_df_rose, ntree = 150, mtry = 2, maxnodes = 5,
nodesize = 15, importance = TRUE)
The confusion matrix.
class_tr_rf_pred <- predict(class_tr_rf, train_df_rose, type = "class")
confusionMatrix(class_tr_rf_pred, train_df_rose$is_legendary,
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 184 8
## 1 47 241
##
## Accuracy : 0.8854
## 95% CI : (0.8535, 0.9125)
## No Information Rate : 0.5188
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7691
##
## Mcnemar's Test P-Value : 2.992e-07
##
## Sensitivity : 0.9679
## Specificity : 0.7965
## Pos Pred Value : 0.8368
## Neg Pred Value : 0.9583
## Prevalence : 0.5188
## Detection Rate : 0.5021
## Detection Prevalence : 0.6000
## Balanced Accuracy : 0.8822
##
## 'Positive' Class : 1
##
# Confusion matrix for the random forest on its (balanced) training set.
con_mat_train_rf <- confusionMatrix(class_tr_rf_pred,
                                    train_df_rose$is_legendary,
                                    positive = "1")
# Index byClass by name and use caret's built-in F1 score.
sensitivity_train <- con_mat_train_rf$byClass["Sensitivity"]
precision_train <- con_mat_train_rf$byClass["Pos Pred Value"]
f1_train <- con_mat_train_rf$byClass["F1"]
paste("The F1 score for training (rf) is", f1_train)
## [1] "The F1 score for training (rf) is 0.897579143389199"
class_tr_rf_pred <- predict(class_tr_rf, valid_df, type = "class")
head(class_tr_rf_pred)
## 2 3 4 5 12 13
## 0 1 0 0 0 0
## Levels: 0 1
class_tr_rf_pred_prob <- predict(class_tr_rf, valid_df, type = "prob")
head(class_tr_rf_pred_prob)
## 0 1
## 2 0.8800000 0.120000000
## 3 0.1866667 0.813333333
## 4 0.9000000 0.100000000
## 5 0.8133333 0.186666667
## 12 0.7400000 0.260000000
## 13 0.9933333 0.006666667
confusionMatrix(class_tr_rf_pred, valid_df$is_legendary,
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 222 6
## 1 67 26
##
## Accuracy : 0.7726
## 95% CI : (0.7228, 0.8173)
## No Information Rate : 0.9003
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3143
##
## Mcnemar's Test P-Value : 2.18e-12
##
## Sensitivity : 0.81250
## Specificity : 0.76817
## Pos Pred Value : 0.27957
## Neg Pred Value : 0.97368
## Prevalence : 0.09969
## Detection Rate : 0.08100
## Detection Prevalence : 0.28972
## Balanced Accuracy : 0.79033
##
## 'Positive' Class : 1
##
# Confusion matrix for the random forest on the validation set.
con_mat_valid_rf <- confusionMatrix(class_tr_rf_pred,
                                    valid_df$is_legendary,
                                    positive = "1")
# Index byClass by name and use caret's built-in F1 score.
sensitivity_valid <- con_mat_valid_rf$byClass["Sensitivity"]
precision_valid <- con_mat_valid_rf$byClass["Pos Pred Value"]
f1_valid <- con_mat_valid_rf$byClass["F1"]
paste("The F1 score for validation (rf) is", f1_valid)
## [1] "The F1 score for validation (rf) is 0.416"
ROC.
roc_Curve_weighted_rf <- pROC::roc(response = valid_df$is_legendary,
predictor = class_tr_rf_pred_prob[, 2])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_Curve_weighted_rf$auc
## Area under the curve: 0.8613
plot(roc_Curve_weighted_rf)
More performance evaluation.
library(modelplotr)
scores_and_ntiles <- prepare_scores_and_ntiles(datasets = list("valid_df"),
dataset_labels = list("Validation data"),
models = list("class_tr_rf"),
model_labels = list("Random Forest"),
target_column = "is_legendary",
ntiles = 100)
## ... scoring caret model "class_tr_rf" on dataset "valid_df".
## Data preparation step 1 succeeded! Dataframe created.
plot_input <- plotting_scope(prepared_input = scores_and_ntiles, select_targetclass = "1")
## Data preparation step 2 succeeded! Dataframe created.
## "prepared_input" aggregated...
## Data preparation step 3 succeeded! Dataframe created.
##
## No comparison specified, default values are used.
##
## Single evaluation line will be plotted: Target value "1" plotted for dataset "Validation data" and model "Random Forest.
## "
## -> To compare models, specify: scope = "compare_models"
## -> To compare datasets, specify: scope = "compare_datasets"
## -> To compare target classes, specify: scope = "compare_targetclasses"
## -> To plot one line, do not specify scope or specify scope = "no_comparison".
Cumulative gains for the random forest using weighted training data.
plot_cumgains(data = plot_input,
custom_line_colors = "#DA5A0C")
Cumulative lift for the random forest using weighted training data.
plot_cumlift(data = plot_input,
custom_line_colors = "#BEDA0C")
# Logistic regression via caret's train(); method = "glm" fits a binomial
# GLM since the target is a two-level factor. Trained on the balanced data.
logistic_reg <- train(is_legendary ~ sp_attack + sp_defence +
speed + type1,
data = train_df_rose, method = "glm")
summary(logistic_reg)
##
## Call:
## NULL
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.795e+00 1.136e+00 -8.622 < 2e-16 ***
## sp_attack 2.816e-02 5.373e-03 5.242 1.59e-07 ***
## sp_defence 3.130e-02 5.639e-03 5.550 2.85e-08 ***
## speed 3.874e-02 7.083e-03 5.470 4.51e-08 ***
## type1dark 2.172e-01 9.085e-01 0.239 0.81109
## type1dragon 3.009e+00 1.109e+00 2.713 0.00666 **
## type1electric -1.132e-01 8.905e-01 -0.127 0.89883
## type1fairy -2.711e-01 1.167e+00 -0.232 0.81628
## type1fighting -1.627e+01 1.701e+03 -0.010 0.99237
## type1fire 1.098e+00 8.232e-01 1.334 0.18217
## type1flying 1.807e+01 1.955e+03 0.009 0.99263
## type1ghost 1.445e+00 9.702e-01 1.489 0.13641
## type1grass 6.450e-01 7.896e-01 0.817 0.41400
## type1ground 7.291e-01 1.115e+00 0.654 0.51323
## type1ice -1.769e-01 1.012e+00 -0.175 0.86125
## type1normal 5.279e-02 8.261e-01 0.064 0.94905
## type1poison -1.475e+01 1.890e+03 -0.008 0.99377
## type1psychic 5.602e+00 9.922e-01 5.646 1.64e-08 ***
## type1rock 4.326e+00 1.002e+00 4.319 1.57e-05 ***
## type1steel 1.845e+00 1.036e+00 1.782 0.07482 .
## type1water 7.118e-01 7.478e-01 0.952 0.34121
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 664.75 on 479 degrees of freedom
## Residual deviance: 280.46 on 459 degrees of freedom
## AIC: 322.46
##
## Number of Fisher Scoring iterations: 17
varImp(logistic_reg)
## glm variable importance
##
## Overall
## type1psychic 100.00000
## sp_defence 98.30408
## speed 96.87131
## sp_attack 92.82455
## type1rock 76.45490
## type1dragon 47.98295
## type1steel 31.45909
## type1ghost 26.27545
## type1fire 23.52285
## type1water 16.74215
## type1grass 14.34952
## type1ground 11.45771
## type1dark 4.10090
## type1fairy 3.98218
## type1ice 2.96156
## type1electric 2.11642
## type1normal 0.99496
## type1fighting 0.03124
## type1flying 0.02552
## type1poison 0.00000
logistic_reg_pred_train <- predict(logistic_reg,
newdata = train_df_rose, type = "raw")
head(logistic_reg_pred_train)
## [1] 0 0 0 0 0 0
## Levels: 0 1
logistic_reg_pred_train_prob <- predict(logistic_reg,
newdata = train_df_rose, type = "prob")
head(logistic_reg_pred_train_prob)
## 0 1
## 1 0.6369369 3.630631e-01
## 2 1.0000000 3.559549e-08
## 3 0.9988197 1.180325e-03
## 4 0.9769298 2.307023e-02
## 5 0.9411284 5.887160e-02
## 6 0.9554029 4.459714e-02
logistic_reg_pred_valid <- predict(logistic_reg,
newdata = valid_df, type = "raw")
head(logistic_reg_pred_valid)
## [1] 0 1 0 0 0 0
## Levels: 0 1
logistic_reg_pred_valid_prob <- predict(logistic_reg,
newdata = valid_df, type = "prob")
head(logistic_reg_pred_valid_prob)
## 0 1
## 2 0.8878305 0.112169470
## 3 0.2421821 0.757817875
## 4 0.9490233 0.050976737
## 5 0.7875565 0.212443477
## 12 0.8854143 0.114585711
## 13 0.9987316 0.001268388
confusionMatrix(as.factor(logistic_reg_pred_train),
train_df_rose$is_legendary, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 200 23
## 1 31 226
##
## Accuracy : 0.8875
## 95% CI : (0.8558, 0.9143)
## No Information Rate : 0.5188
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7744
##
## Mcnemar's Test P-Value : 0.3408
##
## Sensitivity : 0.9076
## Specificity : 0.8658
## Pos Pred Value : 0.8794
## Neg Pred Value : 0.8969
## Prevalence : 0.5188
## Detection Rate : 0.4708
## Detection Prevalence : 0.5354
## Balanced Accuracy : 0.8867
##
## 'Positive' Class : 1
##
# Confusion matrix for logistic regression on the (balanced) training set.
con_mat_train_logit <- confusionMatrix(as.factor(logistic_reg_pred_train),
                                       train_df_rose$is_legendary,
                                       positive = "1")
# Index byClass by name and use caret's built-in F1 score.
sensitivity_train <- con_mat_train_logit$byClass["Sensitivity"]
precision_train <- con_mat_train_logit$byClass["Pos Pred Value"]
f1_train <- con_mat_train_logit$byClass["F1"]
# Message typo fixed: "traination" -> "training".
paste("The F1 score for training (logistic reg) is", f1_train)
## [1] "The F1 score for traination (logistic reg) is 0.893280632411067"
# Confusion matrix on the (imbalanced) validation data. Note the high No
# Information Rate (~0.90): accuracy alone is misleading here, which is why
# sensitivity, precision and F1 are examined below.
confusionMatrix(as.factor(logistic_reg_pred_valid),
valid_df$is_legendary, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 235 7
## 1 54 25
##
## Accuracy : 0.81
## 95% CI : (0.7627, 0.8514)
## No Information Rate : 0.9003
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3596
##
## Mcnemar's Test P-Value : 3.869e-09
##
## Sensitivity : 0.78125
## Specificity : 0.81315
## Pos Pred Value : 0.31646
## Neg Pred Value : 0.97107
## Prevalence : 0.09969
## Detection Rate : 0.07788
## Detection Prevalence : 0.24611
## Balanced Accuracy : 0.79720
##
## 'Positive' Class : 1
##
# Store the validation confusion matrix and derive the F1 score
# (harmonic mean of sensitivity/recall and precision/PPV).
con_mat_valid_logit <- confusionMatrix(as.factor(logistic_reg_pred_valid),
valid_df$is_legendary, positive = "1")
sensitivity_valid <- con_mat_valid_logit$byClass[1] # recall
precision_valid <- con_mat_valid_logit$byClass[3] # positive predictive value
f1_valid <- 2/((1/sensitivity_valid) + (1/precision_valid))
paste("The F1 score for validation (logistic reg) is", f1_valid)
## [1] "The F1 score for validation (logistic reg) is 0.450450450450451"
ROC curve for the logistic regression model on the validation data.
# Build the ROC curve from validation-set probabilities of the positive
# class (column 2, the "1" class) and report the AUC.
roc_Curve_logit <- pROC::roc(response = valid_df$is_legendary,
predictor = logistic_reg_pred_valid_prob[, 2])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_Curve_logit$auc
## Area under the curve: 0.859
# Plot the curve with its AUC in the legend.
plot(roc_Curve_logit, col = "green", lwd = 3)
legend("bottomright", legend = paste0("AUC =", round(roc_Curve_logit$auc, 3)),
col = "green", lwd = 3)
More fun :-) — overlay the ROC curves of all three models for comparison.
# Overlay the ROC curves of the three models on a single plot:
# logistic regression (green), weighted classification tree (red),
# weighted random forest (blue).
plot(roc_Curve_logit, col = "green", lwd = 3)
lines(roc_Curve_weighted_dt, col = "red", lwd = 3)
lines(roc_Curve_weighted_rf, col = "blue", lwd = 3)
# Removed a redundant single-model legend() call here: the combined legend
# below was drawn at the same position and immediately overplotted it.
legend("bottomright",
legend = c(paste0("Logistic Regression AUC =", round(roc_Curve_logit$auc, 3)),
paste0("Classification Tree AUC =", round(roc_Curve_weighted_dt$auc, 3)),
paste0("Random Forest AUC =", round(roc_Curve_weighted_rf$auc, 3))),
col = c("green", "red", "blue"),
lwd = 3)
Optimal Cutoff/Threshold
# Find the optimal probability cutoff on the validation ROC curve
# (x = "best" uses pROC's default best-threshold criterion).
coords(roc_Curve_logit, x = "best")
## threshold specificity sensitivity
## 1 0.4517801 0.7923875 0.875
# Re-classify the training set using the tuned cutoff (0.4517801, from
# coords() above) instead of the default 0.5, then rebuild the confusion matrix.
confusionMatrix(as.factor(ifelse(logistic_reg_pred_train_prob[, 2] > 0.4517801,
"1", "0")),
train_df_rose$is_legendary, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 192 18
## 1 39 231
##
## Accuracy : 0.8812
## 95% CI : (0.8489, 0.9088)
## No Information Rate : 0.5188
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7614
##
## Mcnemar's Test P-Value : 0.008071
##
## Sensitivity : 0.9277
## Specificity : 0.8312
## Pos Pred Value : 0.8556
## Neg Pred Value : 0.9143
## Prevalence : 0.5188
## Detection Rate : 0.4813
## Detection Prevalence : 0.5625
## Balanced Accuracy : 0.8794
##
## 'Positive' Class : 1
##
# Training F1 score at the tuned cutoff (harmonic mean of recall and precision).
con_mat_train_logit_2 <- confusionMatrix(as.factor(ifelse(logistic_reg_pred_train_prob[, 2] > 0.4517801,
"1", "0")),
train_df_rose$is_legendary, positive = "1")
sensitivity_train <- con_mat_train_logit_2$byClass[1] # recall
precision_train <- con_mat_train_logit_2$byClass[3] # positive predictive value
f1_train <- 2/((1/sensitivity_train) + (1/precision_train))
# Fixed typo in the message: "traination" -> "training".
paste("The F1 score for training (logistic reg) is", f1_train)
## [1] "The F1 score for training (logistic reg) is 0.890173410404624"
# Re-classify the validation set with the same tuned cutoff. Sensitivity rises
# (0.781 -> 0.875) at the cost of specificity compared with the 0.5 cutoff.
confusionMatrix(as.factor(ifelse(logistic_reg_pred_valid_prob[, 2] > 0.4517801,
"1", "0")),
valid_df$is_legendary, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 229 4
## 1 60 28
##
## Accuracy : 0.8006
## 95% CI : (0.7527, 0.8429)
## No Information Rate : 0.9003
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3753
##
## Mcnemar's Test P-Value : 6.199e-12
##
## Sensitivity : 0.87500
## Specificity : 0.79239
## Pos Pred Value : 0.31818
## Neg Pred Value : 0.98283
## Prevalence : 0.09969
## Detection Rate : 0.08723
## Detection Prevalence : 0.27414
## Balanced Accuracy : 0.83369
##
## 'Positive' Class : 1
##
##
# Validation F1 score at the tuned cutoff
# (harmonic mean of sensitivity/recall and precision/PPV).
con_mat_valid_logit_2 <- confusionMatrix(as.factor(ifelse(logistic_reg_pred_valid_prob[, 2] > 0.4517801, "1", "0")),
valid_df$is_legendary, positive = "1")
sensitivity_valid <- con_mat_valid_logit_2$byClass[1] # recall
precision_valid <- con_mat_valid_logit_2$byClass[3] # positive predictive value
f1_valid <- 2/((1/sensitivity_valid) + (1/precision_valid))
paste("The F1 score for validation (logistic reg) is", f1_valid)
## [1] "The F1 score for validation (logistic reg) is 0.466666666666667"
More performance evaluation.
# Prepare modelplotr inputs: score the caret model on the validation set and
# split the scores into 10 ntiles for gains/lift charts.
library(modelplotr)
scores_and_ntiles <- prepare_scores_and_ntiles(datasets = list("valid_df"),
dataset_labels = list("Validation data"),
models = list("logistic_reg"),
model_labels = list("Logistic regression"),
target_column = "is_legendary",
ntiles = 10)
## ... scoring caret model "logistic_reg" on dataset "valid_df".
## Data preparation step 1 succeeded! Dataframe created.
# Aggregate per ntile, focusing on the positive target class "1" (legendary).
plot_input <- plotting_scope(prepared_input = scores_and_ntiles, select_targetclass = "1")
## Data preparation step 2 succeeded! Dataframe created.
## "prepared_input" aggregated...
## Data preparation step 3 succeeded! Dataframe created.
##
## No comparison specified, default values are used.
##
## Single evaluation line will be plotted: Target value "1" plotted for dataset "Validation data" and model "Logistic regression.
## "
## -> To compare models, specify: scope = "compare_models"
## -> To compare datasets, specify: scope = "compare_datasets"
## -> To compare target classes, specify: scope = "compare_targetclasses"
## -> To plot one line, do not specify scope or specify scope = "no_comparison".
Cumulative gains for the logistic regression model on the validation data.
plot_cumgains(data = plot_input)
Cumulative lift for the logistic regression model on the validation data.
# Cumulative lift chart, with a custom line color.
plot_cumlift(data = plot_input,
custom_line_colors = "#0022AA")