A Classification Tree at Hogwarts
1. Libraries
Load the libraries
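The library-loading chunk is not echoed above. Judging from the functions used later, it was likely along these lines (the exact package list is an assumption; caret pulls in lattice and ggplot2, which explains the messages below):
library(rpart)        # classification trees
library(rpart.plot)   # tree plots (assumed)
library(caret)        # train(), confusionMatrix(); loads lattice and ggplot2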
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Loading required package: lattice
## Loading required package: ggplot2
2. Load data
Load the data
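The loading chunk is not shown either; a minimal sketch, with the file name as a placeholder:
hogwarts <- read.csv("hogwarts.csv")   # placeholder file name
head(hogwarts, 10)
str(hogwarts)
names(hogwarts)
nrow(hogwarts)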
## ID Name Gender Race Height Publisher
## 1 A001 A-Bomb Male Human 203 Marvel Comics
## 2 A002 Abe Sapien Male Icthyo Sapien 191 Dark Horse Comics
## 3 A004 Abomination Male Human / Radiation 203 Marvel Comics
## 4 A009 Agent 13 Female <NA> 173 Marvel Comics
## 5 A015 Alex Mercer Male Human NA Wildstorm
## 6 A016 Alex Woolsly Male <NA> NA NBC - Heroes
## 7 A024 Angel Male Vampire NA Dark Horse Comics
## 8 A025 Angel Dust Female Mutant 165 Marvel Comics
## 9 A028 Animal Man Male Human 183 DC Comics
## 10 A032 Anti-Monitor Male God / Eternal 61 DC Comics
## Alignment Weight Manipulative Resourceful Dismissive Intelligent Trusting
## 1 good 441 10 10 7 6 7
## 2 good 65 7 7 6 8 6
## 3 bad 441 6 8 1 6 3
## 4 good 61 7 7 1 9 7
## 5 bad NA 10 6 8 3 4
## 6 good NA 8 10 5 5 6
## 7 good NA 8 6 8 7 4
## 8 good 57 9 8 9 4 1
## 9 good 83 7 6 6 5 8
## 10 bad NA 7 7 7 1 9
## Loyal Stubborn Brave HouseID House STR DEX CON INT WIS CHA Level HP
## 1 7 7 9 1 Slytherin 18 11 17 12 13 11 1 7
## 2 7 6 9 1 Slytherin 16 17 10 13 15 11 8 72
## 3 3 5 2 1 Slytherin 13 14 13 10 18 15 15 135
## 4 4 6 6 1 Slytherin 15 18 16 16 17 10 14 140
## 5 4 1 8 1 Slytherin 14 17 13 12 10 11 9 72
## 6 7 7 6 1 Slytherin 14 14 11 13 12 12 1 8
## 7 1 5 2 1 Slytherin 15 17 15 18 13 18 11 88
## 8 6 5 4 1 Slytherin 8 17 12 15 17 18 1 8
## 9 3 3 2 1 Slytherin 10 17 15 18 13 14 8 56
## 10 1 6 5 1 Slytherin 8 10 11 16 12 11 7 63
## 'data.frame': 734 obs. of 26 variables:
## $ ID : chr "A001" "A002" "A004" "A009" ...
## $ Name : chr "A-Bomb" "Abe Sapien" "Abomination" "Agent 13" ...
## $ Gender : chr "Male" "Male" "Male" "Female" ...
## $ Race : chr "Human" "Icthyo Sapien" "Human / Radiation" NA ...
## $ Height : num 203 191 203 173 NA NA NA 165 183 61 ...
## $ Publisher : chr "Marvel Comics" "Dark Horse Comics" "Marvel Comics" "Marvel Comics" ...
## $ Alignment : chr "good" "good" "bad" "good" ...
## $ Weight : int 441 65 441 61 NA NA NA 57 83 NA ...
## $ Manipulative: int 10 7 6 7 10 8 8 9 7 7 ...
## $ Resourceful : int 10 7 8 7 6 10 6 8 6 7 ...
## $ Dismissive : int 7 6 1 1 8 5 8 9 6 7 ...
## $ Intelligent : int 6 8 6 9 3 5 7 4 5 1 ...
## $ Trusting : int 7 6 3 7 4 6 4 1 8 9 ...
## $ Loyal : int 7 7 3 4 4 7 1 6 3 1 ...
## $ Stubborn : int 7 6 5 6 1 7 5 5 3 6 ...
## $ Brave : int 9 9 2 6 8 6 2 4 2 5 ...
## $ HouseID : int 1 1 1 1 1 1 1 1 1 1 ...
## $ House : chr "Slytherin" "Slytherin" "Slytherin" "Slytherin" ...
## $ STR : int 18 16 13 15 14 14 15 8 10 8 ...
## $ DEX : int 11 17 14 18 17 14 17 17 17 10 ...
## $ CON : int 17 10 13 16 13 11 15 12 15 11 ...
## $ INT : int 12 13 10 16 12 13 18 15 18 16 ...
## $ WIS : int 13 15 18 17 10 12 13 17 13 12 ...
## $ CHA : int 11 11 15 10 11 12 18 18 14 11 ...
## $ Level : int 1 8 15 14 9 1 11 1 8 7 ...
## $ HP : int 7 72 135 140 72 8 88 8 56 63 ...
## [1] "ID" "Name" "Gender" "Race" "Height"
## [6] "Publisher" "Alignment" "Weight" "Manipulative" "Resourceful"
## [11] "Dismissive" "Intelligent" "Trusting" "Loyal" "Stubborn"
## [16] "Brave" "HouseID" "House" "STR" "DEX"
## [21] "CON" "INT" "WIS" "CHA" "Level"
## [26] "HP"
## [1] 734
Remove unnecessary variables for this model
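A sketch of the drop step, keeping only the nine variables listed below (assuming the same data frame name is reused):
hogwarts <- hogwarts[, c("Manipulative", "Resourceful", "Dismissive",
                         "Intelligent", "Trusting", "Loyal",
                         "Stubborn", "Brave", "House")]
names(hogwarts)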
## [1] "Manipulative" "Resourceful" "Dismissive" "Intelligent" "Trusting"
## [6] "Loyal" "Stubborn" "Brave" "House"
Look at the new order
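The inspection output below was presumably produced by:
t(t(names(hogwarts)))
str(hogwarts)
table(hogwarts$House)
nrow(hogwarts)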
## [,1]
## [1,] "Manipulative"
## [2,] "Resourceful"
## [3,] "Dismissive"
## [4,] "Intelligent"
## [5,] "Trusting"
## [6,] "Loyal"
## [7,] "Stubborn"
## [8,] "Brave"
## [9,] "House"
## 'data.frame': 734 obs. of 9 variables:
## $ Manipulative: int 10 7 6 7 10 8 8 9 7 7 ...
## $ Resourceful : int 10 7 8 7 6 10 6 8 6 7 ...
## $ Dismissive : int 7 6 1 1 8 5 8 9 6 7 ...
## $ Intelligent : int 6 8 6 9 3 5 7 4 5 1 ...
## $ Trusting : int 7 6 3 7 4 6 4 1 8 9 ...
## $ Loyal : int 7 7 3 4 4 7 1 6 3 1 ...
## $ Stubborn : int 7 6 5 6 1 7 5 5 3 6 ...
## $ Brave : int 9 9 2 6 8 6 2 4 2 5 ...
## $ House : chr "Slytherin" "Slytherin" "Slytherin" "Slytherin" ...
##
## Gryffindor Hufflepuff Ravenclaw Slytherin
## 188 189 156 201
## [1] 734
Set House as a factor. This is necessary for the confusion matrix to work later.
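A one-line sketch of the conversion:
hogwarts$House <- as.factor(hogwarts$House)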
3. Training-validation split
Set the seed using our favourite number :-)
Create the indices for the split. This samples the row indices used to split the data into training and validation sets.
train_index <- sample(1:nrow(hogwarts), 0.6 * nrow(hogwarts))
valid_index <- setdiff(1:nrow(hogwarts), train_index)
Using the indices, create the training and validation sets. This is similar in principle to subsetting a data frame by row.
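A minimal sketch of the split, using the index vectors from above:
train_df <- hogwarts[train_index, ]
valid_df <- hogwarts[valid_index, ]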
It is a good habit to check the sets after splitting.
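The checks producing the output below were presumably:
nrow(train_df)
nrow(valid_df)
head(train_df)
head(valid_df)
str(train_df)
str(valid_df)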
## [1] 440
## [1] 294
## Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn
## 574 6 5 7 7 4 5 9
## 638 5 9 5 4 2 5 7
## 608 6 9 5 4 2 6 6
## 123 7 8 5 7 4 7 5
## 540 9 7 7 9 8 6 8
## 654 6 5 7 2 3 4 7
## Brave House
## 574 9 Gryffindor
## 638 10 Gryffindor
## 608 9 Gryffindor
## 123 6 Slytherin
## 540 3 Hufflepuff
## 654 8 Gryffindor
## Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn
## 2 7 7 6 8 6 7 6
## 3 6 8 1 6 3 3 5
## 5 10 6 8 3 4 4 1
## 12 6 6 6 2 3 6 7
## 13 9 7 9 8 6 3 2
## 14 7 6 3 9 4 7 2
## Brave House
## 2 9 Slytherin
## 3 2 Slytherin
## 5 8 Slytherin
## 12 5 Slytherin
## 13 5 Slytherin
## 14 8 Slytherin
## 'data.frame': 440 obs. of 9 variables:
## $ Manipulative: int 6 5 6 7 9 6 7 1 6 9 ...
## $ Resourceful : int 5 9 9 8 7 5 4 7 7 10 ...
## $ Dismissive : int 7 5 5 5 7 7 1 1 3 5 ...
## $ Intelligent : int 7 4 4 7 9 2 6 4 3 4 ...
## $ Trusting : int 4 2 2 4 8 3 9 6 7 8 ...
## $ Loyal : int 5 5 6 7 6 4 3 8 5 5 ...
## $ Stubborn : int 9 7 6 5 8 7 8 8 3 5 ...
## $ Brave : int 9 10 9 6 3 8 7 8 2 7 ...
## $ House : Factor w/ 4 levels "Gryffindor","Hufflepuff",..: 1 1 1 4 2 1 1 1 4 4 ...
## 'data.frame': 294 obs. of 9 variables:
## $ Manipulative: int 7 6 10 6 9 7 7 6 6 9 ...
## $ Resourceful : int 7 8 6 6 7 6 8 6 6 6 ...
## $ Dismissive : int 6 1 8 6 9 3 1 6 6 8 ...
## $ Intelligent : int 8 6 3 2 8 9 7 6 3 3 ...
## $ Trusting : int 6 3 4 3 6 4 7 8 6 3 ...
## $ Loyal : int 7 3 4 6 3 7 7 8 7 2 ...
## $ Stubborn : int 6 5 1 7 2 2 3 1 6 5 ...
## $ Brave : int 9 2 8 5 5 8 5 7 2 7 ...
## $ House : Factor w/ 4 levels "Gryffindor","Hufflepuff",..: 4 4 4 4 4 4 4 4 4 4 ...
4. Classification tree
Build the tree on the training set. This amounts to training the model.
## [1] "Manipulative" "Resourceful" "Dismissive" "Intelligent" "Trusting"
## [6] "Loyal" "Stubborn" "Brave" "House"
class_tr <- rpart(House ~ Manipulative + Resourceful +
Dismissive + Intelligent + Trusting + Loyal +
Stubborn + Brave,
data = train_df, method = "class",
maxdepth = 30)
Plot the tree. Try different settings to tweak the format.
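The plotting call is not echoed; assuming the rpart.plot package, something like:
rpart.plot(class_tr)
# e.g. rpart.plot(class_tr, type = 4, extra = 101) to vary labels and node counts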
4.1 Alternate specifications
Like any model, we can adjust the specifications to build different trees. Some may be better than others. In this case, we are adjusting the minbucket and maxdepth parameters.
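The alternate trees are not echoed. A sketch with illustrative values (the object name class_tr_2 and the parameter values are assumptions):
class_tr_2 <- rpart(House ~ Manipulative + Resourceful +
                      Dismissive + Intelligent + Trusting + Loyal +
                      Stubborn + Brave,
                    data = train_df, method = "class",
                    minbucket = 20, maxdepth = 5)   # illustrative settings
rpart.plot(class_tr_2)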
5. Confusion matrix
For all classification models, we have to generate a confusion matrix to see how good (or lousy) our model is.
First, for the training set.
class_tr_train_predict <- predict(class_tr, train_df,
type = "class")
t(t(head(class_tr_train_predict,10)))
## [,1]
## 574 Gryffindor
## 638 Gryffindor
## 608 Gryffindor
## 123 Slytherin
## 540 Slytherin
## 654 Gryffindor
## 652 Gryffindor
## 673 Gryffindor
## 50 Gryffindor
## 131 Slytherin
## Levels: Gryffindor Hufflepuff Ravenclaw Slytherin
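The matrix below is caret's confusionMatrix() output; the call was presumably:
confusionMatrix(class_tr_train_predict, train_df$House)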
## Confusion Matrix and Statistics
##
## Reference
## Prediction Gryffindor Hufflepuff Ravenclaw Slytherin
## Gryffindor 98 12 12 8
## Hufflepuff 11 90 19 5
## Ravenclaw 3 4 56 8
## Slytherin 1 6 8 99
##
## Overall Statistics
##
## Accuracy : 0.7795
## 95% CI : (0.7379, 0.8174)
## No Information Rate : 0.2727
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7046
##
## Mcnemar's Test P-Value : 0.002025
##
## Statistics by Class:
##
## Class: Gryffindor Class: Hufflepuff Class: Ravenclaw
## Sensitivity 0.8673 0.8036 0.5895
## Specificity 0.9021 0.8933 0.9565
## Pos Pred Value 0.7538 0.7200 0.7887
## Neg Pred Value 0.9516 0.9302 0.8943
## Prevalence 0.2568 0.2545 0.2159
## Detection Rate 0.2227 0.2045 0.1273
## Detection Prevalence 0.2955 0.2841 0.1614
## Balanced Accuracy 0.8847 0.8484 0.7730
## Class: Slytherin
## Sensitivity 0.8250
## Specificity 0.9531
## Pos Pred Value 0.8684
## Neg Pred Value 0.9356
## Prevalence 0.2727
## Detection Rate 0.2250
## Detection Prevalence 0.2591
## Balanced Accuracy 0.8891
Then we use the model to predict the outcomes for the validation set. This will tell us how good (or lousy) our predictions are.
class_tr_valid_predict <- predict(class_tr, valid_df,
type = "class")
t(t(head(class_tr_valid_predict,10)))
## [,1]
## 2 Slytherin
## 3 Ravenclaw
## 5 Slytherin
## 12 Ravenclaw
## 13 Slytherin
## 14 Slytherin
## 15 Slytherin
## 16 Hufflepuff
## 17 Ravenclaw
## 18 Slytherin
## Levels: Gryffindor Hufflepuff Ravenclaw Slytherin
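Again with caret's confusionMatrix(), presumably:
confusionMatrix(class_tr_valid_predict, valid_df$House)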
## Confusion Matrix and Statistics
##
## Reference
## Prediction Gryffindor Hufflepuff Ravenclaw Slytherin
## Gryffindor 60 12 15 2
## Hufflepuff 7 54 9 1
## Ravenclaw 5 3 30 12
## Slytherin 3 8 7 66
##
## Overall Statistics
##
## Accuracy : 0.7143
## 95% CI : (0.659, 0.7652)
## No Information Rate : 0.2755
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.6168
##
## Mcnemar's Test P-Value : 0.01235
##
## Statistics by Class:
##
## Class: Gryffindor Class: Hufflepuff Class: Ravenclaw
## Sensitivity 0.8000 0.7013 0.4918
## Specificity 0.8676 0.9217 0.9142
## Pos Pred Value 0.6742 0.7606 0.6000
## Neg Pred Value 0.9268 0.8969 0.8730
## Prevalence 0.2551 0.2619 0.2075
## Detection Rate 0.2041 0.1837 0.1020
## Detection Prevalence 0.3027 0.2415 0.1701
## Balanced Accuracy 0.8338 0.8115 0.7030
## Class: Slytherin
## Sensitivity 0.8148
## Specificity 0.9155
## Pos Pred Value 0.7857
## Neg Pred Value 0.9286
## Prevalence 0.2755
## Detection Rate 0.2245
## Detection Prevalence 0.2857
## Balanced Accuracy 0.8652
Compute the probabilities.
class_tr_valid_predict_prob <- predict(class_tr, valid_df,
type = "prob")
head(class_tr_valid_predict_prob)
## Gryffindor Hufflepuff Ravenclaw Slytherin
## 2 0.009615385 0.04807692 0.06730769 0.875000
## 3 0.052631579 0.07017544 0.75438596 0.122807
## 5 0.009615385 0.04807692 0.06730769 0.875000
## 12 0.052631579 0.07017544 0.75438596 0.122807
## 13 0.009615385 0.04807692 0.06730769 0.875000
## 14 0.009615385 0.04807692 0.06730769 0.875000
5.1 Model evaluation
We can perform further evaluation with the modelplotr package.
Load the library.
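The call that produces the message below is:
library(modelplotr)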
## Package modelplotr loaded! Happy model plotting!
Prepare scores for model evaluation.
scores_and_ntiles <- prepare_scores_and_ntiles(datasets =
list("valid_df"),
dataset_labels =
list("Validation data"),
models =
list("class_tr"),
model_labels =
list("CART"),
target_column = "House",
ntiles = 10)
## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## ... scoring caret model "class_tr" on dataset "valid_df".
## Data preparation step 1 succeeded! Dataframe created.
Specify the select_targetclass argument to the preferred class. In this case, we select Slytherin.
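The call is not echoed; following the pattern used for the best model later, it was presumably (the object name plot_input is an assumption):
plot_input <- plotting_scope(prepared_input = scores_and_ntiles,
                             select_targetclass = "Slytherin")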
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Data preparation step 2 succeeded! Dataframe created.
## "prepared_input" aggregated...
## Data preparation step 3 succeeded! Dataframe created.
##
## No comparison specified, default values are used.
##
## Single evaluation line will be plotted: Target value "Slytherin" plotted for dataset "Validation data" and model "CART.
## "
## -> To compare models, specify: scope = "compare_models"
## -> To compare datasets, specify: scope = "compare_datasets"
## -> To compare target classes, specify: scope = "compare_targetclasses"
## -> To plot one line, do not specify scope or specify scope = "no_comparison".
Cumulative gains.
Cumulative lift.
Response plot.
Cumulative response plot.
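The four plots referenced above are modelplotr's standard evaluation plots; assuming the plotting_scope() result is named plot_input, the calls would be:
plot_cumgains(data = plot_input)
plot_cumlift(data = plot_input)
plot_response(data = plot_input)
plot_cumresponse(data = plot_input)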
6. Predict new record
The purpose of a classification model is to predict new records. We will predict the House for a new padawan at Hogwarts. Create the record for the new padawan.
## [1] "Manipulative" "Resourceful" "Dismissive" "Intelligent" "Trusting"
## [6] "Loyal" "Stubborn" "Brave" "House"
padawan_1 <- data.frame(Manipulative = 8,
Resourceful = 9,
Dismissive = 8,
Intelligent = 9,
Trusting = 6,
Loyal = 8,
Stubborn = 6,
Brave = 7)
padawan_1
## Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
## 1 8 9 8 9 6 8 6 7
Sorting hat for the new padawan. This is basically our classification model.
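A sketch of the call that produces the probabilities below:
predict(class_tr, newdata = padawan_1, type = "prob")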
## Gryffindor Hufflepuff Ravenclaw Slytherin
## 1 0.009615385 0.04807692 0.06730769 0.875
7. k-fold cross-validation with caret
Use caret to perform 10-fold cross-validation, repeated 3 times.
caret_control_k10 <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3)
class(caret_control_k10)
## [1] "list"
Use caret to train the decision tree using 10-fold cross-validation repeated 3 times, tuning over 15 values of the cp parameter for rpart.
This returns the best model, trained on all the training data!
class_tr_k10 <- train(House ~ Manipulative + Resourceful +
Dismissive + Intelligent + Trusting + Loyal +
Stubborn + Brave,
data = train_df,
method = "rpart",
trControl = caret_control_k10,
tuneLength = 15)
Display the results of the cross validation run.
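Printing the caret object gives the summary below:
class_tr_k10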
## CART
##
## 440 samples
## 8 predictor
## 4 classes: 'Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Slytherin'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 395, 397, 397, 395, 397, 396, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.00000000 0.7209369 0.6269171
## 0.01629464 0.7241770 0.6309477
## 0.03258929 0.6985678 0.5965404
## 0.04888393 0.6674626 0.5534107
## 0.06517857 0.6651899 0.5502566
## 0.08147321 0.6135549 0.4795439
## 0.09776786 0.6097486 0.4742706
## 0.11406250 0.6029305 0.4642584
## 0.13035714 0.5272825 0.3593188
## 0.14665179 0.4866357 0.3041289
## 0.16294643 0.4341582 0.2323056
## 0.17924107 0.4341582 0.2323056
## 0.19553571 0.4341582 0.2323056
## 0.21183036 0.4341582 0.2323056
## 0.22812500 0.3610626 0.1281048
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01629464.
8. Predict new record with k-fold model
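The prediction call is not echoed; presumably (the object name is an assumption):
house_class_tr_k10 <- predict(class_tr_k10, newdata = padawan_1)
house_class_tr_k10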
## [1] Slytherin
## Levels: Gryffindor Hufflepuff Ravenclaw Slytherin
Compute the probabilities. The Sorting Hat can take these into consideration :-)
house_class_tr_k10_prob <- predict(class_tr_k10, newdata = padawan_1,
type = "prob")
house_class_tr_k10_prob
## Gryffindor Hufflepuff Ravenclaw Slytherin
## 1 0.1044776 0.05223881 0.05970149 0.7835821
9. Best model
Retrieve the best model.
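With caret, the tuned rpart tree lives in the finalModel slot; a sketch:
class_tr_k10_best <- class_tr_k10$finalModel
class_tr_k10_best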
## n= 440
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 440 320 Slytherin (0.25681818 0.25454545 0.21590909 0.27272727)
## 2) Manipulative< 6.5 252 164 Hufflepuff (0.29365079 0.34920635 0.29761905 0.05952381)
## 4) Trusting< 6.5 121 67 Gryffindor (0.44628099 0.05785124 0.42975207 0.06611570)
## 8) Brave>=6.5 64 13 Gryffindor (0.79687500 0.04687500 0.14062500 0.01562500) *
## 9) Brave< 6.5 57 14 Ravenclaw (0.05263158 0.07017544 0.75438596 0.12280702) *
## 5) Trusting>=6.5 131 50 Hufflepuff (0.15267176 0.61832061 0.17557252 0.05343511)
## 10) Loyal>=5.5 107 26 Hufflepuff (0.10280374 0.75700935 0.09345794 0.04672897) *
## 11) Loyal< 5.5 24 11 Ravenclaw (0.37500000 0.00000000 0.54166667 0.08333333)
## 22) Dismissive< 5.5 10 1 Gryffindor (0.90000000 0.00000000 0.00000000 0.10000000) *
## 23) Dismissive>=5.5 14 1 Ravenclaw (0.00000000 0.00000000 0.92857143 0.07142857) *
## 3) Manipulative>=6.5 188 83 Slytherin (0.20744681 0.12765957 0.10638298 0.55851064)
## 6) Resourceful< 5.5 54 29 Gryffindor (0.46296296 0.31481481 0.22222222 0.00000000)
## 12) Stubborn>=6 36 11 Gryffindor (0.69444444 0.22222222 0.08333333 0.00000000) *
## 13) Stubborn< 6 18 9 Hufflepuff (0.00000000 0.50000000 0.50000000 0.00000000) *
## 7) Resourceful>=5.5 134 29 Slytherin (0.10447761 0.05223881 0.05970149 0.78358209) *
Plot the best model.
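Assuming rpart.plot again:
rpart.plot(class_tr_k10_best)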
Confusion matrix for the best model.
class_tr_valid_predict_k10 <- predict(class_tr_k10_best, valid_df,
type = "class")
confusionMatrix(class_tr_valid_predict_k10, valid_df$House)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Gryffindor Hufflepuff Ravenclaw Slytherin
## Gryffindor 55 10 14 2
## Hufflepuff 7 54 9 1
## Ravenclaw 5 3 30 12
## Slytherin 8 10 8 66
##
## Overall Statistics
##
## Accuracy : 0.6973
## 95% CI : (0.6413, 0.7493)
## No Information Rate : 0.2755
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.5936
##
## Mcnemar's Test P-Value : 0.00332
##
## Statistics by Class:
##
## Class: Gryffindor Class: Hufflepuff Class: Ravenclaw
## Sensitivity 0.7333 0.7013 0.4918
## Specificity 0.8813 0.9217 0.9142
## Pos Pred Value 0.6790 0.7606 0.6000
## Neg Pred Value 0.9061 0.8969 0.8730
## Prevalence 0.2551 0.2619 0.2075
## Detection Rate 0.1871 0.1837 0.1020
## Detection Prevalence 0.2755 0.2415 0.1701
## Balanced Accuracy 0.8073 0.8115 0.7030
## Class: Slytherin
## Sensitivity 0.8148
## Specificity 0.8779
## Pos Pred Value 0.7174
## Neg Pred Value 0.9257
## Prevalence 0.2755
## Detection Rate 0.2245
## Detection Prevalence 0.3129
## Balanced Accuracy 0.8464
Predict new padawan using the best model.
class_tr_valid_predict_k10_padawan <- predict(class_tr_k10_best, padawan_1,
type = "class")
class_tr_valid_predict_k10_padawan
## 1
## Slytherin
## Levels: Gryffindor Hufflepuff Ravenclaw Slytherin
Compute the probabilities using the best model.
class_tr_valid_predict_k10_padawan_prob <- predict(class_tr_k10_best, padawan_1,
type = "prob")
class_tr_valid_predict_k10_padawan_prob
## Gryffindor Hufflepuff Ravenclaw Slytherin
## 1 0.1044776 0.05223881 0.05970149 0.7835821
9.1 Model evaluation for the best model
Prepare scores.
scores_and_ntiles_best <- prepare_scores_and_ntiles(datasets =
list("valid_df"),
dataset_labels =
list("Validation data"),
models =
list("class_tr_k10_best"),
model_labels =
list("CART"),
target_column = "House",
ntiles = 10)
## ... scoring caret model "class_tr_k10_best" on dataset "valid_df".
## Data preparation step 1 succeeded! Dataframe created.
Specify the select_targetclass argument to the preferred class.
plot_input_best <- plotting_scope(prepared_input = scores_and_ntiles_best,
select_targetclass = "Slytherin")
## Data preparation step 2 succeeded! Dataframe created.
## "prepared_input" aggregated...
## Data preparation step 3 succeeded! Dataframe created.
##
## No comparison specified, default values are used.
##
## Single evaluation line will be plotted: Target value "Slytherin" plotted for dataset "Validation data" and model "CART.
## "
## -> To compare models, specify: scope = "compare_models"
## -> To compare datasets, specify: scope = "compare_datasets"
## -> To compare target classes, specify: scope = "compare_targetclasses"
## -> To plot one line, do not specify scope or specify scope = "no_comparison".
Cumulative gains for the best model.
Cumulative lift for the best model.
Response plot for the best model.
Cumulative response plot for the best model.
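As before, these are modelplotr's standard plots, now applied to plot_input_best:
plot_cumgains(data = plot_input_best)
plot_cumlift(data = plot_input_best)
plot_response(data = plot_input_best)
plot_cumresponse(data = plot_input_best)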
10. A 2-class setup
10.1 Recode into 2 classes
Since we already split the data, we can just recode the training and validation sets.
There are different ways to recode.
Recode the target variable for the training set as Slytherin vs. Not Slytherin. After all, much of the darkness seems to stem from Slytherin House :-)
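The recode() used below comes from the car package, which explains the carData message:
library(car)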
## Loading required package: carData
train_df_2 <- train_df
train_df_2$House <- recode(train_df_2$House,
" 'Gryffindor' = 'Not Slytherin';
'Ravenclaw' = 'Not Slytherin';
'Hufflepuff' = 'Not Slytherin'")
table(train_df_2$House)
##
## Not Slytherin Slytherin
## 320 120
Now for the validation set.
valid_df_2 <- valid_df
valid_df_2$House <- recode(valid_df_2$House,
" 'Gryffindor' = 'Not Slytherin';
'Ravenclaw' = 'Not Slytherin';
'Hufflepuff' = 'Not Slytherin'")
table(valid_df_2$House)
##
## Not Slytherin Slytherin
## 213 81
10.2 Classification tree
Train the classification tree using some alternate specifications.
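The training chunk is not echoed; a sketch with illustrative control settings (the minbucket and maxdepth values are assumptions):
class_tr_slytherin <- rpart(House ~ Manipulative + Resourceful +
                              Dismissive + Intelligent + Trusting + Loyal +
                              Stubborn + Brave,
                            data = train_df_2, method = "class",
                            minbucket = 20, maxdepth = 10)   # illustrative settings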
10.3 Confusion matrix
A simpler confusion matrix.
First, predict the training set.
class_tr_train_slytherin_predict <- predict(class_tr_slytherin,
train_df_2, type = "class")
t(t(head(class_tr_train_slytherin_predict, 10)))
## [,1]
## 574 Not Slytherin
## 638 Not Slytherin
## 608 Not Slytherin
## 123 Slytherin
## 540 Slytherin
## 654 Not Slytherin
## 652 Not Slytherin
## 673 Not Slytherin
## 50 Slytherin
## 131 Slytherin
## Levels: Not Slytherin Slytherin
The confusion matrix.
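Presumably, with Slytherin set as the positive class:
confusionMatrix(class_tr_train_slytherin_predict, train_df_2$House,
                positive = "Slytherin")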
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not Slytherin Slytherin
## Not Slytherin 303 8
## Slytherin 17 112
##
## Accuracy : 0.9432
## 95% CI : (0.9173, 0.9629)
## No Information Rate : 0.7273
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8601
##
## Mcnemar's Test P-Value : 0.1096
##
## Sensitivity : 0.9333
## Specificity : 0.9469
## Pos Pred Value : 0.8682
## Neg Pred Value : 0.9743
## Prevalence : 0.2727
## Detection Rate : 0.2545
## Detection Prevalence : 0.2932
## Balanced Accuracy : 0.9401
##
## 'Positive' Class : Slytherin
##
Now, predict the outcomes for the validation set.
class_tr_valid_slytherin_predict <- predict(class_tr_slytherin, valid_df_2, type = "class")
t(t(head(class_tr_valid_slytherin_predict,10)))
## [,1]
## 2 Slytherin
## 3 Slytherin
## 5 Slytherin
## 12 Not Slytherin
## 13 Slytherin
## 14 Slytherin
## 15 Slytherin
## 16 Not Slytherin
## 17 Not Slytherin
## 18 Slytherin
## Levels: Not Slytherin Slytherin
The confusion matrix.
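Likewise for the validation set:
confusionMatrix(class_tr_valid_slytherin_predict, valid_df_2$House,
                positive = "Slytherin")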
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not Slytherin Slytherin
## Not Slytherin 194 8
## Slytherin 19 73
##
## Accuracy : 0.9082
## 95% CI : (0.8692, 0.9386)
## No Information Rate : 0.7245
## P-Value [Acc > NIR] : 5.217e-15
##
## Kappa : 0.7792
##
## Mcnemar's Test P-Value : 0.05429
##
## Sensitivity : 0.9012
## Specificity : 0.9108
## Pos Pred Value : 0.7935
## Neg Pred Value : 0.9604
## Prevalence : 0.2755
## Detection Rate : 0.2483
## Detection Prevalence : 0.3129
## Balanced Accuracy : 0.9060
##
## 'Positive' Class : Slytherin
##
Compute the probabilities.
class_tr_valid_slytherin_predict_prob <- predict(class_tr_slytherin,
valid_df_2, type = "prob")
head(class_tr_valid_slytherin_predict_prob)
## Not Slytherin Slytherin
## 2 0.1250000 0.8750000
## 3 0.1666667 0.8333333
## 5 0.1250000 0.8750000
## 12 1.0000000 0.0000000
## 13 0.1250000 0.8750000
## 14 0.1250000 0.8750000
If a different cutoff is preferred, say 0.9, because we want to flag only the most highly probable Slytherins...
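A sketch of how the 0.9 cutoff could be applied (the object name and the strict > comparison are assumptions):
class_tr_valid_slytherin_predict_09 <- factor(
  ifelse(class_tr_valid_slytherin_predict_prob[, "Slytherin"] > 0.9,
         "Slytherin", "Not Slytherin"),
  levels = c("Not Slytherin", "Slytherin"))
confusionMatrix(class_tr_valid_slytherin_predict_09, valid_df_2$House,
                positive = "Slytherin")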
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not Slytherin Slytherin
## Not Slytherin 213 80
## Slytherin 0 1
##
## Accuracy : 0.7279
## 95% CI : (0.6732, 0.7779)
## No Information Rate : 0.7245
## P-Value [Acc > NIR] : 0.4779
##
## Kappa : 0.0178
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.012346
## Specificity : 1.000000
## Pos Pred Value : 1.000000
## Neg Pred Value : 0.726962
## Prevalence : 0.275510
## Detection Rate : 0.003401
## Detection Prevalence : 0.003401
## Balanced Accuracy : 0.506173
##
## 'Positive' Class : Slytherin
##
The ROC curve.
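Presumably ROSE's roc.curve() on the Slytherin probabilities:
library(ROSE)
roc.curve(valid_df_2$House,
          class_tr_valid_slytherin_predict_prob[, "Slytherin"])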
## Loaded ROSE 0.0-3
## Area under the curve (AUC): 0.906
10.4 Predict the new record
Looks like we have a new Slytherin padawan.
house_class_tr_slytherin <- predict(class_tr_slytherin,
newdata = padawan_1)
house_class_tr_slytherin
## Not Slytherin Slytherin
## 1 0.125 0.875