Naive Bayes classifier to predict whether to call a Pillar for help.
Target variable Call_pillars: 1 = call for help; 0 = don't call for help.
demon <- read.csv("demon_slayer_10.csv", header = TRUE)
head(demon, 10)
## ï..ID Character Breathing Moon Form Battle_difficulty Effectiveness_rating
## 1 1 Tanjiro Water Upper 4 28 4
## 2 2 Inosuke Beast Lower 7 20 5
## 3 3 Zenitsu Thunder Upper 1 24 6
## 4 4 Inosuke Beast Lower 4 27 7
## 5 5 Inosuke Beast Upper 1 43 1
## 6 6 Zenitsu Thunder Lower 7 53 3
## 7 7 Zenitsu Thunder Lower 1 45 5
## 8 8 Zenitsu Thunder Upper 7 75 3
## 9 9 Inosuke Beast Upper 4 59 10
## 10 10 Tanjiro Sun Lower 13 35 6
## Hours_to_daybreak Call_pillars
## 1 4 1
## 2 3 0
## 3 4 0
## 4 5 0
## 5 8 0
## 6 8 0
## 7 5 1
## 8 5 0
## 9 4 0
## 10 6 0
nrow(demon)
## [1] 6666
names(demon)
## [1] "ï..ID" "Character" "Breathing"
## [4] "Moon" "Form" "Battle_difficulty"
## [7] "Effectiveness_rating" "Hours_to_daybreak" "Call_pillars"
str(demon)
## 'data.frame': 6666 obs. of 9 variables:
## $ ï..ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Character : chr "Tanjiro" "Inosuke" "Zenitsu" "Inosuke" ...
## $ Breathing : chr "Water" "Beast" "Thunder" "Beast" ...
## $ Moon : chr "Upper" "Lower" "Upper" "Lower" ...
## $ Form : int 4 7 1 4 1 7 1 7 4 13 ...
## $ Battle_difficulty : int 28 20 24 27 43 53 45 75 59 35 ...
## $ Effectiveness_rating: int 4 5 6 7 1 3 5 3 10 6 ...
## $ Hours_to_daybreak : int 4 3 4 5 8 8 5 5 4 6 ...
## $ Call_pillars : int 1 0 0 0 0 0 1 0 0 0 ...
Rename the first column: read.csv left a byte-order-mark artifact ("ï..ID") in its name.
# Replace the BOM-garbled header with a clean "ID".
names(demon)[1] <- "ID"
# Confirm the rename took effect.
names(demon)
## [1] "ID" "Character" "Breathing"
## [4] "Moon" "Form" "Battle_difficulty"
## [7] "Effectiveness_rating" "Hours_to_daybreak" "Call_pillars"
Convert the categorical variables (and the target) to factors, then keep only those columns.
# NOTE(review): plyr is attached but never used below — lapply() is base R.
# Consider dropping this library() call.
library(plyr)
# Categorical predictors plus the target; naiveBayes treats factor
# columns as discrete predictors, so convert them up front.
columns <- c("Character", "Breathing", "Moon", "Call_pillars")
demon[columns] <- lapply(demon[columns], factor)
# Verify the conversion: the four named columns are now factors.
str(demon)
## 'data.frame': 6666 obs. of 9 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Character : Factor w/ 4 levels "Inosuke","Nezuko",..: 3 1 4 1 1 4 4 4 1 3 ...
## $ Breathing : Factor w/ 5 levels "Beast","Blood Demon Art",..: 5 1 4 1 1 4 4 4 1 3 ...
## $ Moon : Factor w/ 2 levels "Lower","Upper": 2 1 2 1 2 1 1 2 2 1 ...
## $ Form : int 4 7 1 4 1 7 1 7 4 13 ...
## $ Battle_difficulty : int 28 20 24 27 43 53 45 75 59 35 ...
## $ Effectiveness_rating: int 4 5 6 7 1 3 5 3 10 6 ...
## $ Hours_to_daybreak : int 4 3 4 5 8 8 5 5 4 6 ...
## $ Call_pillars : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
names(demon)
## [1] "ID" "Character" "Breathing"
## [4] "Moon" "Form" "Battle_difficulty"
## [7] "Effectiveness_rating" "Hours_to_daybreak" "Call_pillars"
# Modelling frame: columns 2-4 (Character, Breathing, Moon) as
# predictors plus column 9 (Call_pillars) as the target; drops ID and
# the numeric variables.
demon_2 <- demon[,c(2:4, 9)]
head(demon_2)
## Character Breathing Moon Call_pillars
## 1 Tanjiro Water Upper 1
## 2 Inosuke Beast Lower 0
## 3 Zenitsu Thunder Upper 0
## 4 Inosuke Beast Lower 0
## 5 Inosuke Beast Upper 0
## 6 Zenitsu Thunder Lower 0
nrow(demon_2)
## [1] 6666
names(demon_2)
## [1] "Character" "Breathing" "Moon" "Call_pillars"
str(demon_2)
## 'data.frame': 6666 obs. of 4 variables:
## $ Character : Factor w/ 4 levels "Inosuke","Nezuko",..: 3 1 4 1 1 4 4 4 1 3 ...
## $ Breathing : Factor w/ 5 levels "Beast","Blood Demon Art",..: 5 1 4 1 1 4 4 4 1 3 ...
## $ Moon : Factor w/ 2 levels "Lower","Upper": 2 1 2 1 2 1 1 2 2 1 ...
## $ Call_pillars: Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
# Class balance: 3121 vs 3545 (~47% / 53%) — no severe imbalance.
table(demon_2$Call_pillars)
##
## 0 1
## 3121 3545
Using our favourite seed :-)
# Reproducible 60/40 train/validation split by row index.
set.seed(666)
train_index <- sample(1:nrow(demon_2), 0.6 * nrow(demon_2))
# Validation rows are everything not sampled for training.
valid_index <- setdiff(1:nrow(demon_2), train_index)
train_df <- demon_2[train_index, ]
valid_df <- demon_2[valid_index, ]
nrow(train_df)
## [1] 3999
nrow(valid_df)
## [1] 2667
# Sanity check with janitor: both partitions must have identical column
# names and types before modelling.
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
compare_df_cols(train_df, valid_df)
## column_name train_df valid_df
## 1 Breathing factor factor
## 2 Call_pillars factor factor
## 3 Character factor factor
## 4 Moon factor factor
# TRUE confirms the schemas match.
compare_df_cols_same(train_df, valid_df)
## [1] TRUE
# Fit the Naive Bayes classifier (e1071): Call_pillars against all
# remaining columns (Character, Breathing, Moon).
library(e1071)
demon_nb_train <- naiveBayes(Call_pillars ~ .,
data = train_df)
# Printing the model shows the a-priori class probabilities and the
# per-class conditional probability tables for each factor predictor.
demon_nb_train
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.4691173 0.5308827
##
## Conditional probabilities:
## Character
## Y Inosuke Nezuko Tanjiro Zenitsu
## 0 0.30756930 0.08102345 0.30810235 0.30330490
## 1 0.19029675 0.41403674 0.20113048 0.19453603
##
## Breathing
## Y Beast Blood Demon Art Sun Thunder Water
## 0 0.30756930 0.08102345 0.16631130 0.30330490 0.14179104
## 1 0.19029675 0.41403674 0.09891663 0.19453603 0.10221385
##
## Moon
## Y Lower Upper
## 0 0.4888060 0.5111940
## 1 0.5044748 0.4955252
# Hard class predictions on the training predictors (columns 1-3) —
# used below for the training-set confusion matrix.
pred_class_train <- predict(demon_nb_train, newdata = train_df[, c(1:3)],
type = "class")
head(pred_class_train)
## [1] 1 0 1 0 0 0
## Levels: 0 1
# Posterior class probabilities on the validation predictors; column 2
# is P(Call_pillars = 1) and is what a ROC curve should consume.
pred_prob <- predict(demon_nb_train, newdata = valid_df[, c(1:3)],
type = "raw")
head(pred_prob)
## 0 1
## [1,] 0.6595317 0.3404683
## [2,] 0.6890507 0.3109493
## [3,] 0.7042615 0.2957385
## [4,] 0.6754636 0.3245364
## [5,] 0.6754636 0.3245364
## [6,] 0.7042615 0.2957385
# Hard class predictions on the validation set (default type = "class").
# The original transcript ran this identical predict()/head() pair twice
# back-to-back; the redundant second call has been removed.
pred_class <- predict(demon_nb_train, newdata = valid_df[, c(1:3)])
head(pred_class)
## [1] 0 0 0 0 0 0
## Levels: 0 1
Evaluate performance on the training set.
# caret supplies confusionMatrix(); positive = "1" makes sensitivity
# measure detection of the "call for help" class.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
confusionMatrix(pred_class_train, train_df$Call_pillars,
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1724 1244
## 1 152 879
##
## Accuracy : 0.6509
## 95% CI : (0.6359, 0.6657)
## No Information Rate : 0.5309
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3221
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.4140
## Specificity : 0.9190
## Pos Pred Value : 0.8526
## Neg Pred Value : 0.5809
## Prevalence : 0.5309
## Detection Rate : 0.2198
## Detection Prevalence : 0.2578
## Balanced Accuracy : 0.6665
##
## 'Positive' Class : 1
##
Evaluate performance on the validation set.
# Validation-set confusion matrix; accuracy (0.6479) is close to the
# training value (0.6509), so the model is not overfitting, but
# sensitivity (~0.40) is low — many "call for help" cases are missed.
confusionMatrix(pred_class, valid_df$Call_pillars,
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1156 850
## 1 89 572
##
## Accuracy : 0.6479
## 95% CI : (0.6295, 0.6661)
## No Information Rate : 0.5332
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3186
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.4023
## Specificity : 0.9285
## Pos Pred Value : 0.8654
## Neg Pred Value : 0.5763
## Prevalence : 0.5332
## Detection Rate : 0.2145
## Detection Prevalence : 0.2478
## Balanced Accuracy : 0.6654
##
## 'Positive' Class : 1
##
library(ROSE)
## Loaded ROSE 0.0-3
# A ROC curve must be built from predicted scores, not hard class
# labels: passing pred_class gives a single operating point, and the
# resulting "AUC" collapses to the balanced accuracy (0.665 recorded
# below). Use the posterior probability of the positive class instead.
ROSE::roc.curve(valid_df$Call_pillars,
pred_prob[, 2])
## Area under the curve (AUC): 0.665
# NOTE(review): the 0.665 above was produced by the original
# label-based call; rerun to record the probability-based AUC.