Directions

Naive Bayes to predict whether to call a Pillar for help.

Call_pillars: 1 = call for help; 0 = don’t call for help

Data for the demo


1. Load data

demon <- read.csv("demon_slayer_10.csv", header = TRUE)

head(demon, 10)
##    ï..ID Character Breathing  Moon Form Battle_difficulty Effectiveness_rating
## 1      1   Tanjiro     Water Upper    4                28                    4
## 2      2   Inosuke     Beast Lower    7                20                    5
## 3      3   Zenitsu   Thunder Upper    1                24                    6
## 4      4   Inosuke     Beast Lower    4                27                    7
## 5      5   Inosuke     Beast Upper    1                43                    1
## 6      6   Zenitsu   Thunder Lower    7                53                    3
## 7      7   Zenitsu   Thunder Lower    1                45                    5
## 8      8   Zenitsu   Thunder Upper    7                75                    3
## 9      9   Inosuke     Beast Upper    4                59                   10
## 10    10   Tanjiro       Sun Lower   13                35                    6
##    Hours_to_daybreak Call_pillars
## 1                  4            1
## 2                  3            0
## 3                  4            0
## 4                  5            0
## 5                  8            0
## 6                  8            0
## 7                  5            1
## 8                  5            0
## 9                  4            0
## 10                 6            0
nrow(demon)
## [1] 6666
names(demon)
## [1] "ï..ID"                "Character"            "Breathing"           
## [4] "Moon"                 "Form"                 "Battle_difficulty"   
## [7] "Effectiveness_rating" "Hours_to_daybreak"    "Call_pillars"
str(demon)
## 'data.frame':    6666 obs. of  9 variables:
##  $ ï..ID               : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Character           : chr  "Tanjiro" "Inosuke" "Zenitsu" "Inosuke" ...
##  $ Breathing           : chr  "Water" "Beast" "Thunder" "Beast" ...
##  $ Moon                : chr  "Upper" "Lower" "Upper" "Lower" ...
##  $ Form                : int  4 7 1 4 1 7 1 7 4 13 ...
##  $ Battle_difficulty   : int  28 20 24 27 43 53 45 75 59 35 ...
##  $ Effectiveness_rating: int  4 5 6 7 1 3 5 3 10 6 ...
##  $ Hours_to_daybreak   : int  4 3 4 5 8 8 5 5 4 6 ...
##  $ Call_pillars        : int  1 0 0 0 0 0 1 0 0 0 ...

Rename the first column. The byte-order mark in the CSV garbles its name to ï..ID, so change it to ID.

names(demon)[1] <- "ID"
names(demon)
## [1] "ID"                   "Character"            "Breathing"           
## [4] "Moon"                 "Form"                 "Battle_difficulty"   
## [7] "Effectiveness_rating" "Hours_to_daybreak"    "Call_pillars"

2. Pre-processing

2.1 Filter for required variables

Convert the categorical variables (and the outcome) to factors, then keep only those columns.

library(plyr)
columns <- c("Character", "Breathing", "Moon", "Call_pillars")
demon[columns] <- lapply(demon[columns], factor)
str(demon)
## 'data.frame':    6666 obs. of  9 variables:
##  $ ID                  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Character           : Factor w/ 4 levels "Inosuke","Nezuko",..: 3 1 4 1 1 4 4 4 1 3 ...
##  $ Breathing           : Factor w/ 5 levels "Beast","Blood Demon Art",..: 5 1 4 1 1 4 4 4 1 3 ...
##  $ Moon                : Factor w/ 2 levels "Lower","Upper": 2 1 2 1 2 1 1 2 2 1 ...
##  $ Form                : int  4 7 1 4 1 7 1 7 4 13 ...
##  $ Battle_difficulty   : int  28 20 24 27 43 53 45 75 59 35 ...
##  $ Effectiveness_rating: int  4 5 6 7 1 3 5 3 10 6 ...
##  $ Hours_to_daybreak   : int  4 3 4 5 8 8 5 5 4 6 ...
##  $ Call_pillars        : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
names(demon)
## [1] "ID"                   "Character"            "Breathing"           
## [4] "Moon"                 "Form"                 "Battle_difficulty"   
## [7] "Effectiveness_rating" "Hours_to_daybreak"    "Call_pillars"
demon_2 <- demon[,c(2:4, 9)]
head(demon_2)
##   Character Breathing  Moon Call_pillars
## 1   Tanjiro     Water Upper            1
## 2   Inosuke     Beast Lower            0
## 3   Zenitsu   Thunder Upper            0
## 4   Inosuke     Beast Lower            0
## 5   Inosuke     Beast Upper            0
## 6   Zenitsu   Thunder Lower            0
nrow(demon_2)
## [1] 6666
names(demon_2)
## [1] "Character"    "Breathing"    "Moon"         "Call_pillars"
str(demon_2)
## 'data.frame':    6666 obs. of  4 variables:
##  $ Character   : Factor w/ 4 levels "Inosuke","Nezuko",..: 3 1 4 1 1 4 4 4 1 3 ...
##  $ Breathing   : Factor w/ 5 levels "Beast","Blood Demon Art",..: 5 1 4 1 1 4 4 4 1 3 ...
##  $ Moon        : Factor w/ 2 levels "Lower","Upper": 2 1 2 1 2 1 1 2 2 1 ...
##  $ Call_pillars: Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
table(demon_2$Call_pillars)
## 
##    0    1 
## 3121 3545
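
The counts are easier to read as proportions; a small base-R sketch:

# Class balance as proportions rather than raw counts
prop.table(table(demon_2$Call_pillars))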

3. Training-validation split

Using our favourite seed :-). 60% of the rows go to training; the remaining 40% form the validation set.

set.seed(666)

train_index <- sample(1:nrow(demon_2), 0.6 * nrow(demon_2))
valid_index <- setdiff(1:nrow(demon_2), train_index)

train_df <- demon_2[train_index, ]
valid_df <- demon_2[valid_index, ]
nrow(train_df)
## [1] 3999
nrow(valid_df)
## [1] 2667
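
A quick check that the random split preserved the class balance (a sketch using only objects created above):

# Outcome distribution within each split
prop.table(table(train_df$Call_pillars))
prop.table(table(valid_df$Call_pillars))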
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
compare_df_cols(train_df, valid_df)
##    column_name train_df valid_df
## 1    Breathing   factor   factor
## 2 Call_pillars   factor   factor
## 3    Character   factor   factor
## 4         Moon   factor   factor
compare_df_cols_same(train_df, valid_df)
## [1] TRUE
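
compare_df_cols_same() confirms the column types match. A sketch that goes one step further and checks the factor levels themselves agree between the splits (they should, since both inherit levels from demon_2):

# TRUE for each predictor if train and validation share the same levels
sapply(c("Character", "Breathing", "Moon"),
       function(v) identical(levels(train_df[[v]]), levels(valid_df[[v]])))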

4. Naive Bayes

library(e1071)
demon_nb_train <- naiveBayes(Call_pillars ~ ., 
                      data = train_df)
demon_nb_train
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##         0         1 
## 0.4691173 0.5308827 
## 
## Conditional probabilities:
##    Character
## Y      Inosuke     Nezuko    Tanjiro    Zenitsu
##   0 0.30756930 0.08102345 0.30810235 0.30330490
##   1 0.19029675 0.41403674 0.20113048 0.19453603
## 
##    Breathing
## Y        Beast Blood Demon Art        Sun    Thunder      Water
##   0 0.30756930      0.08102345 0.16631130 0.30330490 0.14179104
##   1 0.19029675      0.41403674 0.09891663 0.19453603 0.10221385
## 
##    Moon
## Y       Lower     Upper
##   0 0.4888060 0.5111940
##   1 0.5044748 0.4955252
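
These priors and conditional tables are all Naive Bayes uses. As an illustration, for a hypothetical row with Character = Nezuko, Breathing = Blood Demon Art and Moon = Upper, the posterior can be rebuilt by hand from the fitted object; it should agree with what predict() returns with type = "raw" for such a row:

# Rebuild P(Call_pillars = 1 | x) from the stored priors and conditional tables
prior <- demon_nb_train$apriori / sum(demon_nb_train$apriori)
tabs <- demon_nb_train$tables

lik_0 <- tabs$Character["0", "Nezuko"] *
  tabs$Breathing["0", "Blood Demon Art"] *
  tabs$Moon["0", "Upper"]
lik_1 <- tabs$Character["1", "Nezuko"] *
  tabs$Breathing["1", "Blood Demon Art"] *
  tabs$Moon["1", "Upper"]

prior["1"] * lik_1 / (prior["0"] * lik_0 + prior["1"] * lik_1)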

4.1 Training set

pred_class_train <- predict(demon_nb_train, newdata = train_df[, c(1:3)],
                            type = "class")

head(pred_class_train)
## [1] 1 0 1 0 0 0
## Levels: 0 1
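
For a quick eyeball comparison of the training predictions against the observed labels on the same rows (a sketch):

# First few training predictions next to the actual outcome
head(data.frame(actual = train_df$Call_pillars,
                predicted = pred_class_train))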

4.2 Validation set

pred_prob <- predict(demon_nb_train, newdata = valid_df[, c(1:3)], 
                     type = "raw")
head(pred_prob)
##              0         1
## [1,] 0.6595317 0.3404683
## [2,] 0.6890507 0.3109493
## [3,] 0.7042615 0.2957385
## [4,] 0.6754636 0.3245364
## [5,] 0.6754636 0.3245364
## [6,] 0.7042615 0.2957385
pred_class <- predict(demon_nb_train, newdata = valid_df[, c(1:3)])
head(pred_class)
## [1] 0 0 0 0 0 0
## Levels: 0 1
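
Because we kept the raw probabilities, we can also classify with a cutoff other than the default 0.5; the 0.4 below is an arbitrary illustration (e.g. to favour sensitivity):

# Classify as "1" whenever P(Call_pillars = 1) exceeds 0.4
pred_class_04 <- factor(ifelse(pred_prob[, "1"] > 0.4, "1", "0"),
                        levels = levels(valid_df$Call_pillars))
head(pred_class_04)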

4.3 Evaluation

Training set.

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
confusionMatrix(pred_class_train, train_df$Call_pillars, 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1724 1244
##          1  152  879
##                                           
##                Accuracy : 0.6509          
##                  95% CI : (0.6359, 0.6657)
##     No Information Rate : 0.5309          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3221          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.4140          
##             Specificity : 0.9190          
##          Pos Pred Value : 0.8526          
##          Neg Pred Value : 0.5809          
##              Prevalence : 0.5309          
##          Detection Rate : 0.2198          
##    Detection Prevalence : 0.2578          
##       Balanced Accuracy : 0.6665          
##                                           
##        'Positive' Class : 1               
## 
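
As a sanity check, the headline figures can be recovered directly from the 2 x 2 table above (positive class = 1, so TP = 879, FN = 1244, FP = 152, TN = 1724):

# Accuracy, sensitivity and specificity recomputed from the counts
tp <- 879; fn <- 1244; fp <- 152; tn <- 1724
c(accuracy = (tp + tn) / (tp + tn + fp + fn),
  sensitivity = tp / (tp + fn),
  specificity = tn / (tn + fp))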

Validation set.

confusionMatrix(pred_class, valid_df$Call_pillars, 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1156  850
##          1   89  572
##                                           
##                Accuracy : 0.6479          
##                  95% CI : (0.6295, 0.6661)
##     No Information Rate : 0.5332          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3186          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.4023          
##             Specificity : 0.9285          
##          Pos Pred Value : 0.8654          
##          Neg Pred Value : 0.5763          
##              Prevalence : 0.5332          
##          Detection Rate : 0.2145          
##    Detection Prevalence : 0.2478          
##       Balanced Accuracy : 0.6654          
##                                           
##        'Positive' Class : 1               
## 

5. ROC

library(ROSE)
## Loaded ROSE 0.0-3
ROSE::roc.curve(valid_df$Call_pillars, 
                pred_class)

## Area under the curve (AUC): 0.665
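
Here the curve is built from the hard class labels, which is why the AUC (0.665) essentially equals the balanced accuracy from the confusion matrix. Passing the raw posterior probabilities instead gives a smoother, more informative curve; a sketch reusing pred_prob from above:

# ROC from the predicted probability of class "1" rather than the 0/1 labels
ROSE::roc.curve(valid_df$Call_pillars, pred_prob[, "1"])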