Directions

Recruiting heroes in a world of quirks…

Data for demo

Back to the spellbook

1. Load data

# Read the character roster; the first line of the file supplies column names.
hero <- read.csv("mha_v2.csv", header = TRUE)
# NOTE(review): the first header prints as `ï..Name`, which looks like an
# undecoded UTF-8 byte-order mark. Passing fileEncoding = "UTF-8-BOM" to
# read.csv() would presumably avoid it -- confirm against the file.
head(hero)
##           ï..Name Occupation        Quirk Power Speed Technique Intelligence
## 1     All For One    Villain  All for One   6.5   6.5       6.5          6.5
## 2       All Might       Hero  One for All   6.5   6.5       6.0          6.0
## 3    Best Jeanist       Hero Fiber Master   2.0   3.0       5.0          2.0
## 4     Bubble Girl       Hero       Bubble   1.0   2.0       4.5          3.0
## 5 Camie Utsushimi    Student      Glamour   1.5   2.5       5.0          2.5
## 6       Cementoss       Hero       Cement   4.5   5.0       5.0          5.0
##   Score Rank
## 1  26.0    1
## 2  25.0    2
## 3  12.0   94
## 4  10.5  111
## 5  11.5  105
## 6  19.5   10

1.1 Rename

# Replace the garbled first column header with a clean "Name".
colnames(hero)[1] <- "Name"

1.2 Preview

# Show the first ten rows.
head(hero, n = 10)
##               Name Occupation        Quirk Power Speed Technique Intelligence
## 1      All For One    Villain  All for One   6.5   6.5       6.5          6.5
## 2        All Might       Hero  One for All   6.5   6.5       6.0          6.0
## 3     Best Jeanist       Hero Fiber Master   2.0   3.0       5.0          2.0
## 4      Bubble Girl       Hero       Bubble   1.0   2.0       4.5          3.0
## 5  Camie Utsushimi    Student      Glamour   1.5   2.5       5.0          2.5
## 6        Cementoss       Hero       Cement   4.5   5.0       5.0          5.0
## 7       Centipeder       Hero    Centipede   2.0   4.0       4.0          4.0
## 8     Chronostasis    Villain Chronostasis   2.0   2.0       5.0          5.0
## 9          Curious    Villain     Landmine   2.0   3.0       3.0          4.0
## 10            Dabi    Villain    Cremation   4.0   3.0       1.5          3.0
##    Score Rank
## 1   26.0    1
## 2   25.0    2
## 3   12.0   94
## 4   10.5  111
## 5   11.5  105
## 6   19.5   10
## 7   14.0   63
## 8   14.0   63
## 9   12.0   94
## 10  11.5  105
# Number of rows (first element of the dimensions).
dim(hero)[1]
## [1] 120

1.3 Data table

A nicer table to look at.

library(DT)
# Interactive table with a filter box above every column.
# The Scroller extension only takes effect when `scroller = TRUE` is set
# together with a fixed `scrollY`; `deferRender` draws rows on demand.
datatable(hero,
          extensions = "Scroller",
          filter = "top",
          options = list(deferRender = TRUE,
                         scrollX = TRUE,
                         scrollY = 200,
                         scroller = TRUE))
## This version of bslib is designed to work with rmarkdown version 2.7 or higher.

1.4 Aggregate

How many students, heroes, and villains are there?

# Tally the rows per Occupation value.
table(hero[["Occupation"]])
## 
##    Hero    Jedi Student Villain 
##      31       1      53      35

1.5 Top 5

The top 5 by rank. Tied characters share a rank, so more than five rows can appear.

# Keep every character whose Rank is 5 or better. Ranks are shared on ties, so
# the result can hold more than five rows. The object is named for the cut-off
# it actually applies (5, not 10).
hero_top5 <- subset(hero, Rank <= 5)
hero_top5
##            Name Occupation        Quirk Power Speed Technique Intelligence
## 1   All For One    Villain  All for One   6.5   6.5       6.5          6.5
## 2     All Might       Hero  One for All   6.5   6.5       6.0          6.0
## 17     Endeavor       Hero    Hellflame   6.5   5.5       5.0          4.0
## 28        Hawks       Hero Fierce Wings   3.5   6.0       6.5          5.0
## 84    Re-Destro    Villain       Stress   5.5   5.0       5.5          5.0
## 120       Yeoda       Jedi        Force  10.0  10.0      10.0         10.0
##     Score Rank
## 1      26    1
## 2      25    2
## 17     21    3
## 28     21    3
## 84     21    3
## 120    40    1

2. Central Tendencies

# Arithmetic mean of Power, ignoring missing values.
mean(hero[["Power"]], na.rm = TRUE)
## [1] 3.425
# Middle value of Power, ignoring missing values.
median(hero[["Power"]], na.rm = TRUE)
## [1] 3
library(DescTools)

# Most frequent Intelligence value; the "freq" attribute is its count.
Mode(hero[["Intelligence"]])
## [1] 3
## attr(,"freq")
## [1] 34

3. Quartiles

# Minimum, quartiles and maximum of Technique, with the cut points spelled out.
quantile(hero$Technique, probs = c(0, 0.25, 0.5, 0.75, 1))
##     0%    25%    50%    75%   100% 
##  1.000  3.875  4.250  5.000 10.000

4. Variance and standard deviation

# Sample standard deviation of Intelligence.
sd(hero[["Intelligence"]])
## [1] 1.371351
# Sample variance of Intelligence.
var(hero[["Intelligence"]])
## [1] 1.880602

5. Correlation

# Pearson correlation matrix of the four ability scores. Columns are selected
# by name rather than by position, so the call keeps working if columns are
# added to or reordered in `hero`.
cor(
  hero[, c("Power", "Speed", "Technique", "Intelligence")],
  method = "pearson"
)
##                    Power     Speed   Technique Intelligence
## Power         1.00000000 0.6052469 -0.03934614    0.0368783
## Speed         0.60524692 1.0000000  0.32649036    0.1344191
## Technique    -0.03934614 0.3264904  1.00000000    0.4307295
## Intelligence  0.03687830 0.1344191  0.43072945    1.0000000

6. Range

# Largest and smallest total Score.
max(hero[["Score"]])
## [1] 40
min(hero[["Score"]])
## [1] 8
# Distance between the two extremes.
diff(range(hero[["Score"]]))
## [1] 32

7. Z-Score

Manual calculation.

# Standardise Speed by hand: subtract the column mean, then divide by the
# column standard deviation.
hero$Speed_z <- with(hero, (Speed - mean(Speed)) / sd(Speed))

head(hero$Speed_z)
## [1]  2.0439343  2.0439343 -0.3506331 -1.0347952 -0.6927141  1.0176911

A simpler way.

# scale() returns a one-column matrix carrying centring/scaling attributes.
# as.numeric() flattens it so the new column is a plain numeric vector, like
# Speed_z, instead of a matrix embedded in the data frame.
hero$Speed_z2 <- as.numeric(scale(hero$Speed))
head(hero$Speed_z2)
## [1]  2.0439343  2.0439343 -0.3506331 -1.0347952 -0.6927141  1.0176911

8. Distribution

library(moments)

# Skewness and kurtosis of Intelligence, as computed by the moments package.
moments::skewness(hero[["Intelligence"]])
## [1] 0.5438367
moments::kurtosis(hero[["Intelligence"]])
## [1] 5.414441

Statistical test for normality using skewness and kurtosis.

H0: The data have a skewness and kurtosis that match a normal distribution.

H1: The data have a skewness and kurtosis that do not match a normal distribution.

# Jarque-Bera test on Intelligence, namespace-qualified like the skewness and
# kurtosis calls.
moments::jarque.test(hero$Intelligence)
## 
##  Jarque-Bera Normality Test
## 
## data:  hero$Intelligence
## JB = 35.063, p-value = 2.433e-08
## alternative hypothesis: greater

9. Pivot tables

Number of characters by occupation.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Count the rows in each Occupation group.
occupation_df <- summarize(
  group_by(hero, Occupation),
  Number = n()
)
## `summarise()` ungrouping output (override with `.groups` argument)
occupation_df
## # A tibble: 4 x 2
##   Occupation Number
##   <chr>       <int>
## 1 Hero           31
## 2 Jedi            1
## 3 Student        53
## 4 Villain        35

Mean Power by occupation

# Average Power within each Occupation group.
power_df <- summarize(
  group_by(hero, Occupation),
  Average_Power = mean(Power)
)
## `summarise()` ungrouping output (override with `.groups` argument)
power_df
## # A tibble: 4 x 2
##   Occupation Average_Power
##   <chr>              <dbl>
## 1 Hero                3.60
## 2 Jedi               10   
## 3 Student             3.40
## 4 Villain             3.13

Mean Intelligence by occupation

# Average (arithmetic mean) Intelligence within each Occupation group.
intelligence_df <- summarize(
  group_by(hero, Occupation),
  Average_Intelligence = mean(Intelligence)
)
## `summarise()` ungrouping output (override with `.groups` argument)
intelligence_df
## # A tibble: 4 x 2
##   Occupation Average_Intelligence
##   <chr>                     <dbl>
## 1 Hero                       3.90
## 2 Jedi                      10   
## 3 Student                    3.75
## 4 Villain                    3.57

Maximum Speed by occupation

# Highest Speed within each Occupation group.
speed_df <- summarize(
  group_by(hero, Occupation),
  Max_Speed = max(Speed)
)
## `summarise()` ungrouping output (override with `.groups` argument)
speed_df
## # A tibble: 4 x 2
##   Occupation Max_Speed
##   <chr>          <dbl>
## 1 Hero             6.5
## 2 Jedi            10  
## 3 Student          6  
## 4 Villain          6.5

Minimum technique by occupation

# Lowest Technique within each Occupation group. The summary column is named
# after the variable it actually summarises (Technique, not Speed).
technique_df <- hero %>%
  group_by(Occupation) %>%
  summarize(Min_Technique = min(Technique))
## `summarise()` ungrouping output (override with `.groups` argument)
technique_df
## # A tibble: 4 x 2
##   Occupation Min_Technique
##   <chr>              <dbl>
## 1 Hero                   1
## 2 Jedi                  10
## 3 Student                2
## 4 Villain                1

10. Frequency Tables

Create a frequency table of ranks.

Descriptive statistics of ranks may not make much sense; so it’s just for illustration.

library(dplyr)
# One row per distinct Rank, with the number of characters holding it.
rank_df <- summarize(
  group_by(hero, Rank),
  frequency = n()
)
## `summarise()` ungrouping output (override with `.groups` argument)
rank_df
## # A tibble: 28 x 2
##     Rank frequency
##    <int>     <int>
##  1     1         2
##  2     2         1
##  3     3         3
##  4     6         2
##  5     8         2
##  6    10         2
##  7    12         1
##  8    13         3
##  9    16         4
## 10    20         3
## # ... with 18 more rows

Convert data.

# Expand the frequency table back into one Rank value per character:
# each Rank is repeated as many times as its frequency.
rank_df_data <- rep(rank_df[["Rank"]], times = rank_df[["frequency"]])
rank_df_data
##   [1]   1   1   2   3   3   3   6   6   8   8  10  10  12  13  13  13  16  16
##  [19]  16  16  20  20  20  23  23  23  23  23  23  29  29  31  31  31  31  31
##  [37]  31  31  31  39  39  41  41  41  41  41  41  41  41  41  41  41  41  41
##  [55]  41  41  41  41  41  59  59  59  59  63  63  63  63  63  63  63  63  63
##  [73]  63  63  63  63  63  63  78  78  80  80  80  80  80  80  80  80  80  80
##  [91]  80  91  91  91  94  94  94  94  94  94  94  94  94  94  94 105 105 105
## [109] 105 105 110 111 112 112 112 112 112 117 118 119

Mean.

# Mean of the expanded rank vector.
mean(x = rank_df_data)
## [1] 55.66667

Check.

# Frequency-weighted mean taken directly from the table.
weighted.mean(rank_df$Rank, w = rank_df$frequency)
## [1] 55.66667
# The same figure straight from the raw column.
mean(hero[["Rank"]], na.rm = TRUE)
## [1] 55.66667

Median.

# Middle value of the expanded rank vector.
stats::median(rank_df_data)
## [1] 59

Mode.

# Most frequent rank; the "freq" attribute is how often it occurs.
DescTools::Mode(x = rank_df_data)
## [1] 41
## attr(,"freq")
## [1] 18

Standard Deviation.

# Sample standard deviation, written as the square root of the variance.
sqrt(var(rank_df_data))
## [1] 34.22671

Variance.

# Sample variance of the expanded rank vector.
stats::var(rank_df_data)
## [1] 1171.468

Plus Ultra!!