Directions

Descriptive statistics

Data for demo

Back to the spellbook

1. Load data

# Import the university data set; the first row of the CSV supplies the
# column names.
univ <- read.csv(file = "universities_and_colleges.csv", header = TRUE)

# Preview the first six rows.
head(univ, n = 6)
##     School       Type Median_SAT Acceptance_Rate Expenditures_Student
## 1  Amherst   Lib Arts       1315            0.22                26636
## 2  Barnard   Lib Arts       1220            0.53                17653
## 3    Bates   Lib Arts       1240            0.36                17554
## 4 Berkeley University       1176            0.37                23665
## 5  Bowdoin   Lib Arts       1300            0.24                25703
## 6    Brown University       1281            0.24                24201
##   Top_10pct_HS Graduation_pct
## 1           85             93
## 2           69             80
## 3           58             88
## 4           95             68
## 5           78             90
## 6           80             90
# Inspect the column types: 49 observations of 7 variables.
str(univ)
## 'data.frame':    49 obs. of  7 variables:
##  $ School              : chr  "Amherst" "Barnard" "Bates" "Berkeley" ...
##  $ Type                : chr  "Lib Arts" "Lib Arts" "Lib Arts" "University" ...
##  $ Median_SAT          : int  1315 1220 1240 1176 1300 1281 1255 1400 1300 1225 ...
##  $ Acceptance_Rate     : num  0.22 0.53 0.36 0.37 0.24 0.24 0.56 0.31 0.4 0.64 ...
##  $ Expenditures_Student: num  26636 17653 17554 23665 25703 ...
##  $ Top_10pct_HS        : int  85 69 58 95 78 80 70 98 75 52 ...
##  $ Graduation_pct      : int  93 80 88 68 90 90 84 75 80 77 ...

2. Central tendencies

# Arithmetic mean of the acceptance rate, skipping any missing values.
mean(univ[["Acceptance_Rate"]], na.rm = TRUE)
## [1] 0.3810204
# Median (50th percentile) of the acceptance rate.
median(univ[["Acceptance_Rate"]], na.rm = TRUE)
## [1] 0.36
# Base R's mode() reports an object's storage mode, not the statistical
# mode, so the most frequent value comes from DescTools::Mode().
library(DescTools)

# Most frequent value; the "freq" attribute is how many times it occurs.
Mode(univ[["Acceptance_Rate"]])
## [1] 0.24
## attr(,"freq")
## [1] 5

3. Quartiles

# Minimum, the three quartiles, and the maximum of the acceptance rate.
quantile(univ[["Acceptance_Rate"]], probs = seq(0, 1, by = 0.25))
##   0%  25%  50%  75% 100% 
## 0.17 0.28 0.36 0.48 0.67

4. Variance and standard deviation

# Sample standard deviation of the acceptance rate.
with(univ, sd(Acceptance_Rate))
## [1] 0.1337168
# Sample variance (the square of the standard deviation above).
with(univ, var(Acceptance_Rate))
## [1] 0.01788019

5. Correlation

# Pairwise Pearson correlations between four numeric columns, selected by
# name (these are columns 3 to 6 of the data frame).
cor(
  univ[, c("Median_SAT", "Acceptance_Rate", "Expenditures_Student",
           "Top_10pct_HS")],
  method = "pearson"
)
##                      Median_SAT Acceptance_Rate Expenditures_Student
## Median_SAT            1.0000000      -0.6019020            0.5727417
## Acceptance_Rate      -0.6019020       1.0000000           -0.2842544
## Expenditures_Student  0.5727417      -0.2842544            1.0000000
## Top_10pct_HS          0.5034680      -0.6097210            0.5057820
##                      Top_10pct_HS
## Median_SAT               0.503468
## Acceptance_Rate         -0.609721
## Expenditures_Student     0.505782
## Top_10pct_HS             1.000000

6. Range

# Largest acceptance rate in the sample.
max(univ[["Acceptance_Rate"]])
## [1] 0.67
# Smallest acceptance rate in the sample.
min(univ[["Acceptance_Rate"]])
## [1] 0.17
# Width of the range: range() returns c(min, max), so diff() gives max - min.
diff(range(univ[["Acceptance_Rate"]]))
## [1] 0.5

7. z-score

Manual calculation.

# z-score: distance of each acceptance rate from the mean, measured in
# standard deviations.
univ$Acceptance_Rate_zscore <- with(
  univ,
  (Acceptance_Rate - mean(Acceptance_Rate)) / sd(Acceptance_Rate)
)

# The new column is appended as the last column of the data frame.
head(univ)
##     School       Type Median_SAT Acceptance_Rate Expenditures_Student
## 1  Amherst   Lib Arts       1315            0.22                26636
## 2  Barnard   Lib Arts       1220            0.53                17653
## 3    Bates   Lib Arts       1240            0.36                17554
## 4 Berkeley University       1176            0.37                23665
## 5  Bowdoin   Lib Arts       1300            0.24                25703
## 6    Brown University       1281            0.24                24201
##   Top_10pct_HS Graduation_pct Acceptance_Rate_zscore
## 1           85             93            -1.20418966
## 2           69             80             1.11414253
## 3           58             88            -0.15720093
## 4           95             68            -0.08241602
## 5           78             90            -1.05461984
## 6           80             90            -1.05461984

Possibly a simpler way to do this.

# scale() centers and scales in one call, but it returns a one-column matrix
# carrying "scaled:center" and "scaled:scale" attributes. Convert the result
# to a plain numeric vector so the new column has the same type as the
# manually computed z-score instead of being a matrix embedded in the
# data frame.
univ$Acceptance_Rate_zscore_2 <- as.numeric(scale(univ$Acceptance_Rate))
# Both z-score columns should show identical values.
head(univ)
##     School       Type Median_SAT Acceptance_Rate Expenditures_Student
## 1  Amherst   Lib Arts       1315            0.22                26636
## 2  Barnard   Lib Arts       1220            0.53                17653
## 3    Bates   Lib Arts       1240            0.36                17554
## 4 Berkeley University       1176            0.37                23665
## 5  Bowdoin   Lib Arts       1300            0.24                25703
## 6    Brown University       1281            0.24                24201
##   Top_10pct_HS Graduation_pct Acceptance_Rate_zscore Acceptance_Rate_zscore_2
## 1           85             93            -1.20418966              -1.20418966
## 2           69             80             1.11414253               1.11414253
## 3           58             88            -0.15720093              -0.15720093
## 4           95             68            -0.08241602              -0.08241602
## 5           78             90            -1.05461984              -1.05461984
## 6           80             90            -1.05461984              -1.05461984

8. Distribution

library(moments)

# Moment-based sample skewness; a positive value indicates a longer right
# tail.
moments::skewness(univ[["Acceptance_Rate"]])
## [1] 0.3450946
# Pearson's second skewness coefficient: 3 * (mean - median) / sd.
with(
  univ,
  3 * (mean(Acceptance_Rate) - median(Acceptance_Rate)) / sd(Acceptance_Rate)
)
## [1] 0.4716028
# Pearson kurtosis (not excess kurtosis): a normal distribution scores 3.
moments::kurtosis(univ[["Acceptance_Rate"]])
## [1] 2.150586

Statistical test for normality using skewness and kurtosis.

H0: The data have skewness and kurtosis that match a normal distribution

H1: The data have skewness and kurtosis that do not match a normal distribution

# Jarque-Bera test from the moments package. The p-value reported below
# (0.2944) is above 0.05, so H0 is not rejected at the 5% level.
moments::jarque.test(univ$Acceptance_Rate)
## 
##  Jarque-Bera Normality Test
## 
## data:  univ$Acceptance_Rate
## JB = 2.4456, p-value = 0.2944
## alternative hypothesis: greater

9. Frequency tables

Central tendencies in frequency tables.

Frequency data for this demo

# Read the repair-time frequency table: one row per number of days, with
# the count of repairs that took that long.
# The CSV begins with a UTF-8 byte-order mark. Without declaring it, the
# BOM bytes are glued onto the first header and the column is read as
# `ï..Days`. The "UTF-8-BOM" encoding strips the mark so the header is
# read as `Days`.
comp <- read.csv("computer_repair_times.csv", header = TRUE,
                 fileEncoding = "UTF-8-BOM")
head(comp)
##   Days Frequency
## 1    0         0
## 2    1         0
## 3    2         0
## 4    3         0
## 5    4         0
## 6    5         1

Rename the columns.

# Replace both column names, by position, with short lower-case names.
comp <- setNames(comp, c("days", "frequency"))
head(comp)
##   days frequency
## 1    0         0
## 2    1         0
## 3    2         0
## 4    3         0
## 5    4         0
## 6    5         1

Convert the frequency table into raw observations (one value per repair).

# Expand the frequency table into raw observations: each `days` value is
# repeated `frequency` times, giving one element per repair (250 in total).
comp_data <- rep(comp$days, times = comp$frequency)
comp_data
##   [1]  5  6  6  7  7  7  7  7  8  8  8  8  8  8  8  8  8  8  8  8  9  9  9  9  9
##  [26]  9  9  9  9  9  9  9  9  9 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
##  [51] 10 10 10 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 12 12 12
##  [76] 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 13 13 13 13 13
## [101] 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 14 14 14 14 14 14 14 14
## [126] 14 14 14 14 14 14 14 14 14 14 14 14 15 15 15 15 15 15 15 15 15 15 15 15 15
## [151] 15 15 15 15 15 15 15 15 15 15 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
## [176] 16 16 16 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 18 18 18 18 18 18
## [201] 18 18 18 19 19 19 19 19 19 19 19 19 20 20 20 20 20 20 20 20 21 21 21 21 21
## [226] 22 22 22 22 23 23 23 23 25 25 25 26 26 27 27 29 29 31 34 36 36 37 38 39 40

Mean.

# Mean repair time in days, computed over the expanded observations.
mean(comp_data)
## [1] 14.912

Median.

# Median repair time in days, computed over the expanded observations.
median(comp_data)
## [1] 14

Mode.

# Most frequent repair time(s). Two values are returned here because 12 and
# 15 days tie, each occurring 23 times (the "freq" attribute).
DescTools::Mode(comp_data)
## [1] 12 15
## attr(,"freq")
## [1] 23

Variance.

# Sample variance of the repair times (in days squared).
var(comp_data)
## [1] 35.50227

Standard deviation.

# Sample standard deviation of the repair times (in days).
sd(comp_data)
## [1] 5.958378