1. Load data
# Read the roster from CSV (the first row supplies the column names), then
# show the first six rows.
hero <- read.csv(file = "mha_v2.csv", header = TRUE)
head(hero, n = 6)
## Name Occupation Quirk Power Speed Technique Intelligence
## 1 All For One Villain All for One 6.5 6.5 6.5 6.5
## 2 All Might Hero One for All 6.5 6.5 6.0 6.0
## 3 Best Jeanist Hero Fiber Master 2.0 3.0 5.0 2.0
## 4 Bubble Girl Hero Bubble 1.0 2.0 4.5 3.0
## 5 Camie Utsushimi Student Glamour 1.5 2.5 5.0 2.5
## 6 Cementoss Hero Cement 4.5 5.0 5.0 5.0
## Score Rank
## 1 26.0 1
## 2 25.0 2
## 3 12.0 94
## 4 10.5 111
## 5 11.5 105
## 6 19.5 10
1.1 Rename
Rename the first column; any name can be used in place of "Name".
# Overwrite the label of the first column only; other columns keep theirs.
colnames(hero)[1] <- "Name"
1.2 Preview
# Show the first ten rows.
head(hero, n = 10)
## Name Occupation Quirk Power Speed Technique Intelligence
## 1 All For One Villain All for One 6.5 6.5 6.5 6.5
## 2 All Might Hero One for All 6.5 6.5 6.0 6.0
## 3 Best Jeanist Hero Fiber Master 2.0 3.0 5.0 2.0
## 4 Bubble Girl Hero Bubble 1.0 2.0 4.5 3.0
## 5 Camie Utsushimi Student Glamour 1.5 2.5 5.0 2.5
## 6 Cementoss Hero Cement 4.5 5.0 5.0 5.0
## 7 Centipeder Hero Centipede 2.0 4.0 4.0 4.0
## 8 Chronostasis Villain Chronostasis 2.0 2.0 5.0 5.0
## 9 Curious Villain Landmine 2.0 3.0 3.0 4.0
## 10 Dabi Villain Cremation 4.0 3.0 1.5 3.0
## Score Rank
## 1 26.0 1
## 2 25.0 2
## 3 12.0 94
## 4 10.5 111
## 5 11.5 105
## 6 19.5 10
## 7 14.0 63
## 8 14.0 63
## 9 12.0 94
## 10 11.5 105
2. Aggregate
How many students, heroes, and villains are there?
# Frequency count of each distinct Occupation value.
table(hero[["Occupation"]])
##
## Hero Jedi Student Villain
## 31 1 53 35
One way or another… Note: Specify the package (e.g. dplyr::summarize) if a function name conflicts with one from another package.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Same count as table(), but as a tibble: one row per Occupation with the
# number of rows in that group.
hero_occupation <- dplyr::summarize(
  group_by(hero, Occupation),
  count = n()
)
hero_occupation
## # A tibble: 4 × 2
## Occupation count
## <chr> <int>
## 1 Hero 31
## 2 Jedi 1
## 3 Student 53
## 4 Villain 35
3. Top 10
Who are the top 10?
# Keep rows whose Rank is 10 or better. which() drops NA comparisons, the
# same way subset() does, and the original row names are retained.
hero_top10 <- hero[which(hero$Rank <= 10), ]
hero_top10
## Name Occupation Quirk Power Speed Technique Intelligence
## 1 All For One Villain All for One 6.5 6.5 6.5 6.5
## 2 All Might Hero One for All 6.5 6.5 6.0 6.0
## 6 Cementoss Hero Cement 4.5 5.0 5.0 5.0
## 17 Endeavor Hero Hellflame 6.5 5.5 5.0 4.0
## 18 Eraser Head Hero Erasure 3.0 4.5 6.5 5.5
## 28 Hawks Hero Fierce Wings 3.5 6.0 6.5 5.0
## 39 Izuku Midoriya Student One for All 5.5 5.0 5.0 5.0
## 43 Katsuki Bakugo Student Explosion 5.0 5.0 5.5 5.0
## 84 Re-Destro Villain Stress 5.5 5.0 5.5 5.0
## 101 Tamaki Amajiki Student Manifest 5.0 4.0 6.0 5.0
## 108 Tomura Shigaraki Villain Decay 6.5 3.5 5.0 5.0
## 120 Yeoda Jedi Force 10.0 10.0 10.0 10.0
## Score Rank
## 1 26.0 1
## 2 25.0 2
## 6 19.5 10
## 17 21.0 3
## 18 19.5 10
## 28 21.0 3
## 39 20.5 6
## 43 20.5 6
## 84 21.0 3
## 101 20.0 8
## 108 20.0 8
## 120 40.0 1
4. Sort
Sort the results by rank.
# Reorder the top-10 rows by ascending Rank; ties keep their original order.
hero_top10_sort <- with(hero_top10, hero_top10[order(Rank), ])
hero_top10_sort
## Name Occupation Quirk Power Speed Technique Intelligence
## 1 All For One Villain All for One 6.5 6.5 6.5 6.5
## 120 Yeoda Jedi Force 10.0 10.0 10.0 10.0
## 2 All Might Hero One for All 6.5 6.5 6.0 6.0
## 17 Endeavor Hero Hellflame 6.5 5.5 5.0 4.0
## 28 Hawks Hero Fierce Wings 3.5 6.0 6.5 5.0
## 84 Re-Destro Villain Stress 5.5 5.0 5.5 5.0
## 39 Izuku Midoriya Student One for All 5.5 5.0 5.0 5.0
## 43 Katsuki Bakugo Student Explosion 5.0 5.0 5.5 5.0
## 101 Tamaki Amajiki Student Manifest 5.0 4.0 6.0 5.0
## 108 Tomura Shigaraki Villain Decay 6.5 3.5 5.0 5.0
## 6 Cementoss Hero Cement 4.5 5.0 5.0 5.0
## 18 Eraser Head Hero Erasure 3.0 4.5 6.5 5.5
## Score Rank
## 1 26.0 1
## 120 40.0 1
## 2 25.0 2
## 17 21.0 3
## 28 21.0 3
## 84 21.0 3
## 39 20.5 6
## 43 20.5 6
## 101 20.0 8
## 108 20.0 8
## 6 19.5 10
## 18 19.5 10
Hierarchical sorting and reverse sorting.
# Two-level sort: ascending Rank, then Occupation in reverse order.
# xtfrm() turns Occupation into sortable numbers so that it can be negated.
hero_top10_sort_2 <- with(
  hero_top10,
  hero_top10[order(Rank, -xtfrm(Occupation)), ]
)
hero_top10_sort_2
## Name Occupation Quirk Power Speed Technique Intelligence
## 1 All For One Villain All for One 6.5 6.5 6.5 6.5
## 120 Yeoda Jedi Force 10.0 10.0 10.0 10.0
## 2 All Might Hero One for All 6.5 6.5 6.0 6.0
## 84 Re-Destro Villain Stress 5.5 5.0 5.5 5.0
## 17 Endeavor Hero Hellflame 6.5 5.5 5.0 4.0
## 28 Hawks Hero Fierce Wings 3.5 6.0 6.5 5.0
## 39 Izuku Midoriya Student One for All 5.5 5.0 5.0 5.0
## 43 Katsuki Bakugo Student Explosion 5.0 5.0 5.5 5.0
## 108 Tomura Shigaraki Villain Decay 6.5 3.5 5.0 5.0
## 101 Tamaki Amajiki Student Manifest 5.0 4.0 6.0 5.0
## 6 Cementoss Hero Cement 4.5 5.0 5.0 5.0
## 18 Eraser Head Hero Erasure 3.0 4.5 6.5 5.5
## Score Rank
## 1 26.0 1
## 120 40.0 1
## 2 25.0 2
## 84 21.0 3
## 17 21.0 3
## 28 21.0 3
## 39 20.5 6
## 43 20.5 6
## 108 20.0 8
## 101 20.0 8
## 6 19.5 10
## 18 19.5 10
Probably an easier way, using dplyr (this example breaks ties by Name, in descending order).
library(dplyr)
# dplyr version: ascending Rank, ties broken by Name in descending order.
hero_top10_sort_3 <- hero_top10 %>%
  arrange(Rank, desc(Name))
hero_top10_sort_3
## Name Occupation Quirk Power Speed Technique Intelligence
## 1 Yeoda Jedi Force 10.0 10.0 10.0 10.0
## 2 All For One Villain All for One 6.5 6.5 6.5 6.5
## 3 All Might Hero One for All 6.5 6.5 6.0 6.0
## 4 Re-Destro Villain Stress 5.5 5.0 5.5 5.0
## 5 Hawks Hero Fierce Wings 3.5 6.0 6.5 5.0
## 6 Endeavor Hero Hellflame 6.5 5.5 5.0 4.0
## 7 Katsuki Bakugo Student Explosion 5.0 5.0 5.5 5.0
## 8 Izuku Midoriya Student One for All 5.5 5.0 5.0 5.0
## 9 Tomura Shigaraki Villain Decay 6.5 3.5 5.0 5.0
## 10 Tamaki Amajiki Student Manifest 5.0 4.0 6.0 5.0
## 11 Eraser Head Hero Erasure 3.0 4.5 6.5 5.5
## 12 Cementoss Hero Cement 4.5 5.0 5.0 5.0
## Score Rank
## 1 40.0 1
## 2 26.0 1
## 3 25.0 2
## 4 21.0 3
## 5 21.0 3
## 6 21.0 3
## 7 20.5 6
## 8 20.5 6
## 9 20.0 8
## 10 20.0 8
## 11 19.5 10
## 12 19.5 10
5. Scatter plot
Looks like someone is off the charts :-)
# Keep rows whose Rank is 20 or better (NA comparisons are dropped), then
# preview the first six.
hero_top20 <- hero[which(hero$Rank <= 20), ]
head(hero_top20, n = 6)
## Name Occupation Quirk Power Speed Technique Intelligence Score
## 1 All For One Villain All for One 6.5 6.5 6.5 6.5 26.0
## 2 All Might Hero One for All 6.5 6.5 6.0 6.0 25.0
## 6 Cementoss Hero Cement 4.5 5.0 5.0 5.0 19.5
## 15 Edgeshot Hero Foldabody 3.5 6.0 5.0 4.0 18.5
## 17 Endeavor Hero Hellflame 6.5 5.5 5.0 4.0 21.0
## 18 Eraser Head Hero Erasure 3.0 4.5 6.5 5.5 19.5
## Rank
## 1 1
## 2 2
## 6 10
## 15 13
## 17 3
## 18 10
library(ggplot2)
library(ggrepel)
library(ggpubr)
# Power vs. Speed for the top-20 subset. The "Yeoda" row is overdrawn with a
# large red point, and every point gets a repelled name label.
ggplot(hero_top20, aes(x = Power, y = Speed)) +
  geom_point() +
  geom_point(
    data = subset(hero_top20, Name == "Yeoda"),
    aes(x = Power, y = Speed),
    color = "red",
    size = 20
  ) +
  geom_label_repel(
    aes(label = Name),
    box.padding = 0.35,
    point.padding = 0.8,
    segment.color = "blue"
  )
# Same scatter with a loess smoother added. Only rows with Rank <= 5 are
# labelled, both axes are zoomed to 1..12, and the axis fonts are enlarged.
ggplot(hero_top20, aes(x = Power, y = Speed)) +
  geom_point() +
  geom_point(
    data = subset(hero_top20, Name == "Yeoda"),
    aes(x = Power, y = Speed),
    color = "red",
    size = 10
  ) +
  geom_smooth(method = loess) +
  geom_label_repel(
    data = subset(hero_top20, Rank <= 5),
    aes(label = Name),
    box.padding = 2.5,
    point.padding = 0.8,
    segment.color = "blue"
  ) +
  labs(
    x = "Power Score",
    y = "Speed Score",
    title = "No tradeoff between power and speed?"
  ) +
  # coord_cartesian() zooms the view without discarding data points.
  coord_cartesian(xlim = c(1, 12), ylim = c(1, 12)) +
  theme(
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14)
  )
6. Sampling
Random acts of senseless data :-)
One for All… 20%!!! :-)
Sample 20%
library(dplyr)
# Draw a random 20% of the rows. No seed is set here, so the rows drawn
# differ from run to run.
hero_sample_20p <- hero %>%
  sample_frac(size = 0.2)
nrow(hero_sample_20p)
## [1] 24
# First six rows of the 20% sample.
head(hero_sample_20p, n = 6)
## Name Occupation Quirk Power Speed Technique Intelligence
## 1 Hound Dog Hero Dog 5.5 5 1.5 4.0
## 2 La Brava Villain Love 1.0 1 4.0 5.5
## 3 Skeptic Villain Anthropomorph 1.0 1 4.0 4.0
## 4 Trumpet Villain Incite 1.0 1 4.0 4.0
## 5 Kojiro Bondo Student Cemedine 4.0 3 3.0 3.0
## 6 Sir. Nighteye Hero Foresight 4.0 3 5.0 5.0
## Score Rank
## 1 16.0 31
## 2 11.5 105
## 3 10.0 112
## 4 10.0 112
## 5 13.0 80
## 6 17.0 23
Sample n = 25
# Draw exactly 25 random rows. No seed is set here either.
hero_sample_25n <- hero %>%
  sample_n(size = 25)
nrow(hero_sample_25n)
## [1] 25
# First six rows of the 25-row sample.
head(hero_sample_25n, n = 6)
## Name Occupation Quirk Power Speed Technique Intelligence
## 1 Ragdoll Hero Search 2.0 2.0 1.0 3.0
## 2 Rikido Sato Student Sugar Rush 5.5 4.5 2.0 3.0
## 3 Momo Yaoyorozu Student Creation 2.0 3.0 5.5 6.0
## 4 Mt. Lady Hero Gigantification 5.0 3.0 2.0 2.0
## 5 Yosetsu Awase Student Weld 2.0 3.5 5.0 4.5
## 6 Mandalay Hero Telepath 2.0 3.0 4.0 5.0
## Score Rank
## 1 8.0 119
## 2 15.0 41
## 3 16.5 29
## 4 12.0 94
## 5 15.0 41
## 6 14.0 63
7. Concatenate
By row.
# Stack the two samples on top of each other; both have the same columns.
hero_concat_1 <- do.call(rbind, list(hero_sample_20p, hero_sample_25n))
nrow(hero_concat_1)
## [1] 49
# First six rows of the row-wise concatenation.
head(hero_concat_1, n = 6)
## Name Occupation Quirk Power Speed Technique Intelligence
## 1 Hound Dog Hero Dog 5.5 5 1.5 4.0
## 2 La Brava Villain Love 1.0 1 4.0 5.5
## 3 Skeptic Villain Anthropomorph 1.0 1 4.0 4.0
## 4 Trumpet Villain Incite 1.0 1 4.0 4.0
## 5 Kojiro Bondo Student Cemedine 4.0 3 3.0 3.0
## 6 Sir. Nighteye Hero Foresight 4.0 3 5.0 5.0
## Score Rank
## 1 16.0 31
## 2 11.5 105
## 3 10.0 112
## 4 10.0 112
## 5 13.0 80
## 6 17.0 23
By column.
# Keep all rows and the first three columns (Name, Occupation, Quirk).
hero_sample_by_col_1 <- hero[, 1:3]
head(hero_sample_by_col_1, n = 6)
## Name Occupation Quirk
## 1 All For One Villain All for One
## 2 All Might Hero One for All
## 3 Best Jeanist Hero Fiber Master
## 4 Bubble Girl Hero Bubble
## 5 Camie Utsushimi Student Glamour
## 6 Cementoss Hero Cement
# Keep columns 1, 5 and 6 (Name, Speed, Technique) via list-style indexing,
# which always returns a data frame.
hero_sample_by_col_2 <- hero[c(1, 5, 6)]
head(hero_sample_by_col_2, n = 6)
## Name Speed Technique
## 1 All For One 6.5 6.5
## 2 All Might 6.5 6.0
## 3 Best Jeanist 3.0 5.0
## 4 Bubble Girl 2.0 4.5
## 5 Camie Utsushimi 2.5 5.0
## 6 Cementoss 5.0 5.0
# Place the two column subsets side by side. Rows are matched by position
# only, and the duplicated "Name" column is kept as-is (no name mangling).
hero_concat_2 <- data.frame(
  hero_sample_by_col_1,
  hero_sample_by_col_2,
  check.names = FALSE
)
nrow(hero_concat_2)
## [1] 120
# First six rows of the column-wise concatenation.
head(hero_concat_2, n = 6)
## Name Occupation Quirk Name Speed Technique
## 1 All For One Villain All for One All For One 6.5 6.5
## 2 All Might Hero One for All All Might 6.5 6.0
## 3 Best Jeanist Hero Fiber Master Best Jeanist 3.0 5.0
## 4 Bubble Girl Hero Bubble Bubble Girl 2.0 4.5
## 5 Camie Utsushimi Student Glamour Camie Utsushimi 2.5 5.0
## 6 Cementoss Hero Cement Cementoss 5.0 5.0
Merge by key (like VLOOKUP).
# Key-based join on "Name": rows are matched by value, not by position, and
# the key column appears only once in the result.
hero_concat_3 <- merge(
  x = hero_sample_by_col_1,
  y = hero_sample_by_col_2,
  by = "Name"
)
head(hero_concat_3, n = 6)
## Name Occupation Quirk Speed Technique
## 1 All For One Villain All for One 6.5 6.5
## 2 All Might Hero One for All 6.5 6.0
## 3 Best Jeanist Hero Fiber Master 3.0 5.0
## 4 Bubble Girl Hero Bubble 2.0 4.5
## 5 Camie Utsushimi Student Glamour 2.5 5.0
## 6 Cementoss Hero Cement 5.0 5.0
Another way.
library(dplyr)
# dplyr equivalent of the merge: keep only rows whose Name is present in
# both tables.
hero_concat_4 <- hero_sample_by_col_1 %>%
  inner_join(hero_sample_by_col_2, by = "Name")
head(hero_concat_4, n = 6)
## Name Occupation Quirk Speed Technique
## 1 All For One Villain All for One 6.5 6.5
## 2 All Might Hero One for All 6.5 6.0
## 3 Best Jeanist Hero Fiber Master 3.0 5.0
## 4 Bubble Girl Hero Bubble 2.0 4.5
## 5 Camie Utsushimi Student Glamour 2.5 5.0
## 6 Cementoss Hero Cement 5.0 5.0
8. Recoding
Categorical variables.
# Reminder of what the data looks like before recoding.
head(hero, n = 6)
## Name Occupation Quirk Power Speed Technique Intelligence
## 1 All For One Villain All for One 6.5 6.5 6.5 6.5
## 2 All Might Hero One for All 6.5 6.5 6.0 6.0
## 3 Best Jeanist Hero Fiber Master 2.0 3.0 5.0 2.0
## 4 Bubble Girl Hero Bubble 1.0 2.0 4.5 3.0
## 5 Camie Utsushimi Student Glamour 1.5 2.5 5.0 2.5
## 6 Cementoss Hero Cement 4.5 5.0 5.0 5.0
## Score Rank
## 1 26.0 1
## 2 25.0 2
## 3 12.0 94
## 4 10.5 111
## 5 11.5 105
## 6 19.5 10
# Work on a copy so the original `hero` data frame stays untouched.
hero_recode <- hero
# revalue() is called through its namespace instead of attaching plyr.
# Attaching plyr after dplyr masks dplyr verbs such as mutate(), summarise()
# and arrange() for the rest of the session.
# Each name in the vector is an existing Occupation value; the string it maps
# to is the replacement stored in the new Occupation_2 column.
hero_recode$Occupation_2 <- plyr::revalue(
  hero_recode$Occupation,
  c(
    "Villain" = "Blah blah blah",
    "Hero" = "Ole ole ole ole ole ole",
    "Student" = "May the force be with you",
    "Jedi" = "This is the Way"
  )
)
head(hero_recode)
## Name Occupation Quirk Power Speed Technique Intelligence
## 1 All For One Villain All for One 6.5 6.5 6.5 6.5
## 2 All Might Hero One for All 6.5 6.5 6.0 6.0
## 3 Best Jeanist Hero Fiber Master 2.0 3.0 5.0 2.0
## 4 Bubble Girl Hero Bubble 1.0 2.0 4.5 3.0
## 5 Camie Utsushimi Student Glamour 1.5 2.5 5.0 2.5
## 6 Cementoss Hero Cement 4.5 5.0 5.0 5.0
## Score Rank Occupation_2
## 1 26.0 1 Blah blah blah
## 2 25.0 2 Ole ole ole ole ole ole
## 3 12.0 94 Ole ole ole ole ole ole
## 4 10.5 111 Ole ole ole ole ole ole
## 5 11.5 105 May the force be with you
## 6 19.5 10 Ole ole ole ole ole ole
# Frequency count of the recoded occupation labels.
table(hero_recode[["Occupation_2"]])
##
## Blah blah blah May the force be with you Ole ole ole ole ole ole
## 35 53 31
## This is the Way
## 1
Numerical variables.
# Five-number summary plus the mean of Score, to choose the band limits.
summary(hero_recode[["Score"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.0 13.0 14.5 15.0 16.5 40.0
library(dplyr)
# Recode Score into numeric bands of width 10: <= 10, (10, 20], (20, 30] and
# (30, 40]. A Score matched by no branch (above 40, or NA) becomes NA.
# dplyr:: is spelled out so the call cannot be masked by another attached
# package's mutate() (e.g. plyr's).
hero_recode <- hero_recode %>%
  dplyr::mutate(
    Score_2 = dplyr::case_when(
      Score <= 10 ~ 1000,
      Score > 10 & Score <= 20 ~ 2000,
      Score > 20 & Score <= 30 ~ 3000,
      Score > 30 & Score <= 40 ~ 4000
    )
  )
head(hero_recode$Score_2)
## [1] 3000 3000 2000 2000 2000 2000
# Distribution of the recoded numeric bands.
summary(hero_recode[["Score_2"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1000 2000 2000 2008 2000 4000
From numerical to categorical.
library(dplyr)
# Bin Score into labelled categories using the same limits as Score_2:
# <= 10, (10, 20], (20, 30] and (30, 40]. A Score matched by no branch
# (above 40, or NA) becomes NA.
# dplyr:: is spelled out so the call cannot be masked by another attached
# package's mutate() (e.g. plyr's).
hero_recode <- hero_recode %>%
  dplyr::mutate(
    Score_3 = dplyr::case_when(
      Score <= 10 ~ "Low",
      Score > 10 & Score <= 20 ~ "Moderate",
      Score > 20 & Score <= 30 ~ "High",
      Score > 30 & Score <= 40 ~ "OP"
    )
  )
head(hero_recode)
## Name Occupation Quirk Power Speed Technique Intelligence
## 1 All For One Villain All for One 6.5 6.5 6.5 6.5
## 2 All Might Hero One for All 6.5 6.5 6.0 6.0
## 3 Best Jeanist Hero Fiber Master 2.0 3.0 5.0 2.0
## 4 Bubble Girl Hero Bubble 1.0 2.0 4.5 3.0
## 5 Camie Utsushimi Student Glamour 1.5 2.5 5.0 2.5
## 6 Cementoss Hero Cement 4.5 5.0 5.0 5.0
## Score Rank Occupation_2 Score_2 Score_3
## 1 26.0 1 Blah blah blah 3000 High
## 2 25.0 2 Ole ole ole ole ole ole 3000 High
## 3 12.0 94 Ole ole ole ole ole ole 2000 Moderate
## 4 10.5 111 Ole ole ole ole ole ole 2000 Moderate
## 5 11.5 105 May the force be with you 2000 Moderate
## 6 19.5 10 Ole ole ole ole ole ole 2000 Moderate
# How many rows fall in each Score category.
table(hero_recode[["Score_3"]])
##
## High Low Moderate OP
## 7 8 104 1
9. Factor
Factorise.
# Inspect the column types before converting Occupation.
str(object = hero)
## 'data.frame': 120 obs. of 9 variables:
## $ Name : chr "All For One" "All Might" "Best Jeanist" "Bubble Girl" ...
## $ Occupation : chr "Villain" "Hero" "Hero" "Hero" ...
## $ Quirk : chr "All for One" "One for All" "Fiber Master" "Bubble" ...
## $ Power : num 6.5 6.5 2 1 1.5 4.5 2 2 2 4 ...
## $ Speed : num 6.5 6.5 3 2 2.5 5 4 2 3 3 ...
## $ Technique : num 6.5 6 5 4.5 5 5 4 5 3 1.5 ...
## $ Intelligence: num 6.5 6 2 3 2.5 5 4 5 4 3 ...
## $ Score : num 26 25 12 10.5 11.5 19.5 14 14 12 11.5 ...
## $ Rank : int 1 2 94 111 105 10 63 63 94 105 ...
# Convert Occupation to a factor in place, then confirm the new column type.
hero[["Occupation"]] <- as.factor(hero[["Occupation"]])
str(object = hero)
## 'data.frame': 120 obs. of 9 variables:
## $ Name : chr "All For One" "All Might" "Best Jeanist" "Bubble Girl" ...
## $ Occupation : Factor w/ 4 levels "Hero","Jedi",..: 4 1 1 1 3 1 1 4 4 4 ...
## $ Quirk : chr "All for One" "One for All" "Fiber Master" "Bubble" ...
## $ Power : num 6.5 6.5 2 1 1.5 4.5 2 2 2 4 ...
## $ Speed : num 6.5 6.5 3 2 2.5 5 4 2 3 3 ...
## $ Technique : num 6.5 6 5 4.5 5 5 4 5 3 1.5 ...
## $ Intelligence: num 6.5 6 2 3 2.5 5 4 5 4 3 ...
## $ Score : num 26 25 12 10.5 11.5 19.5 14 14 12 11.5 ...
## $ Rank : int 1 2 94 111 105 10 63 63 94 105 ...
10. Training-Validation Split
# Fix the RNG seed so the same rows are drawn on every run.
set.seed(666)
# Draw 70% of the row indices for training. seq_len() stays correct for
# zero-row data (1:nrow() would yield c(1, 0)), and floor() makes the
# rounding-down of a fractional sample size explicit.
train_index <- sample(seq_len(nrow(hero)), floor(0.7 * nrow(hero)))
# Every row index that was not drawn goes to the validation set.
valid_index <- setdiff(seq_len(nrow(hero)), train_index)
train_df <- hero[train_index, ]
valid_df <- hero[valid_index, ]
nrow(train_df)
## [1] 84
# First six training rows; row names are the original row indices.
head(train_df, n = 6)
## Name Occupation Quirk Power Speed Technique Intelligence
## 62 Minoru Mineta Student Pop Off 1.0 4.5 5.0 5.5
## 96 Sir. Nighteye Hero Foresight 4.0 3.0 5.0 5.0
## 11 Daruma Ujiko Villain Life Force 1.0 1.0 1.0 6.5
## 28 Hawks Hero Fierce Wings 3.5 6.0 6.5 5.0
## 14 Ectoplasm Hero Clones 4.0 3.0 5.0 3.0
## 5 Camie Utsushimi Student Glamour 1.5 2.5 5.0 2.5
## Score Rank
## 62 16.0 31
## 96 17.0 23
## 11 9.5 117
## 28 21.0 3
## 14 15.0 41
## 5 11.5 105
# Number of rows held out for validation.
dim(valid_df)[1]
## [1] 36
# First six validation rows; row names are the original row indices.
head(valid_df, n = 6)
## Name Occupation Quirk Power Speed Technique Intelligence
## 2 All Might Hero One for All 6.5 6.5 6.0 6
## 4 Bubble Girl Hero Bubble 1.0 2.0 4.5 3
## 15 Edgeshot Hero Foldabody 3.5 6.0 5.0 4
## 16 Eijiro Kirishima Student Hardening 4.5 2.5 2.0 3
## 17 Endeavor Hero Hellflame 6.5 5.5 5.0 4
## 21 Gang Orca Hero Orca 4.0 3.0 5.0 4
## Score Rank
## 2 25.0 2
## 4 10.5 111
## 15 18.5 13
## 16 12.0 94
## 17 21.0 3
## 21 16.0 31
I am a hero too!