Directions

Prepare for trouble!

And make it double!

Data for demo

Back to the spellbook

1. Naming variables

a <- 10
b <- a
print(b)
## [1] 10
b
## [1] 10

2. Basic data types

2.1 Logical

TRUE
## [1] TRUE
class(TRUE)
## [1] "logical"
FALSE
## [1] FALSE
class(FALSE)
## [1] "logical"

2.2 Character

y <- "Greetings Jedi Master"
y
## [1] "Greetings Jedi Master"

2.3 Numeric and Integer

2
## [1] 2
2.5
## [1] 2.5
2L
## [1] 2
class(2)
## [1] "numeric"
class(2L)
## [1] "integer"

3. Data structures

3.1 Vector

vector1 <- c(1,2,3,4,5)
vector1
## [1] 1 2 3 4 5
class(vector1)
## [1] "numeric"
vector2 <- c("S", "U")
vector2
## [1] "S" "U"
class(vector2)
## [1] "character"

3.2 List

# A list can hold elements of different types (character, numeric, logical).
# Named `list1` rather than `list` so the variable does not mask base::list().
list1 <- list("Hello", 123, TRUE)
list1
## [[1]]
## [1] "Hello"
## 
## [[2]]
## [1] 123
## 
## [[3]]
## [1] TRUE

3.3 Matrix

# Build a 3-row matrix from 1..9; matrix() fills column by column by default.
# Named `matrix1` rather than `matrix` so the variable does not mask
# base::matrix().
matrix1 <- matrix(1:9, nrow = 3)
matrix1
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9

3.4 Data frame

df <- read.csv("statex77.csv", header = TRUE)
head(df)
##            X Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
## 1    Alabama       3615   3624        2.1    69.05   15.1    41.3    20  50708
## 2     Alaska        365   6315        1.5    69.31   11.3    66.7   152 566432
## 3    Arizona       2212   4530        1.8    70.55    7.8    58.1    15 113417
## 4   Arkansas       2110   3378        1.9    70.66   10.1    39.9    65  51945
## 5 California      21198   5114        1.1    71.71   10.3    62.6    20 156361
## 6   Colorado       2541   4884        0.7    72.06    6.8    63.9   166 103766

4. Accessing data

4.1 Vectors

temps <- c(22, 20, 23, 32, -10)
temps
## [1]  22  20  23  32 -10
cities <- c("Seattle", "Tokyo", "Shanghai", "Hawaii", "Oslo")
cities
## [1] "Seattle"  "Tokyo"    "Shanghai" "Hawaii"   "Oslo"
names(temps) <- cities
temps
##  Seattle    Tokyo Shanghai   Hawaii     Oslo 
##       22       20       23       32      -10
temps["Oslo"]
## Oslo 
##  -10
v3 <- c("A", "B", "C")
v3[c(1,3)]
## [1] "A" "C"
v4 <- c(1:9)
v4[3:7]
## [1] 3 4 5 6 7

4.2 Matrices

v5 <- c(1:12)
v5
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12
matrix2 <- matrix(v5, nrow = 4, byrow = TRUE)
matrix2
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
## [3,]    7    8    9
## [4,]   10   11   12
# Label the matrix axes: activities down the rows, weekdays across the columns.
rows <- c("Sleep", "Eat", "Play", "Laze around")
columns <- c("Tue", "Wed", "Thu")
dimnames(matrix2) <- list(rows, columns)

matrix2
##             Tue Wed Thu
## Sleep         1   2   3
## Eat           4   5   6
## Play          7   8   9
## Laze around  10  11  12

4.3 Data frames

4.3.1 Example 1

# One entry per weekday: the day label, its temperature, and whether we study.
days <- c("Mon", "Tue", "Wed", "Thu", "Fri")
temp <- c(10, 21, 23, 25.3, 28)
study <- c(TRUE, rep(FALSE, 4))

# Column names are spelled out explicitly; they match the vector names.
df2 <- data.frame(days = days, temp = temp, study = study)
df2
##   days temp study
## 1  Mon 10.0  TRUE
## 2  Tue 21.0 FALSE
## 3  Wed 23.0 FALSE
## 4  Thu 25.3 FALSE
## 5  Fri 28.0 FALSE
df2[1,]
##   days temp study
## 1  Mon   10  TRUE
df2[3,2]
## [1] 23

4.3.2 Some manipulations

So, if our class is on Mondays, what should we do during the week?

df3 <- df2
df3
##   days temp study
## 1  Mon 10.0  TRUE
## 2  Tue 21.0 FALSE
## 3  Wed 23.0 FALSE
## 4  Thu 25.3 FALSE
## 5  Fri 28.0 FALSE
df3$activity <- ifelse(df3$days == "Mon", "Study hard", "Be lazy")
df3
##   days temp study   activity
## 1  Mon 10.0  TRUE Study hard
## 2  Tue 21.0 FALSE    Be lazy
## 3  Wed 23.0 FALSE    Be lazy
## 4  Thu 25.3 FALSE    Be lazy
## 5  Fri 28.0 FALSE    Be lazy

How does temperature affect our daily activities?

df4 <- df2
df4
##   days temp study
## 1  Mon 10.0  TRUE
## 2  Tue 21.0 FALSE
## 3  Wed 23.0 FALSE
## 4  Thu 25.3 FALSE
## 5  Fri 28.0 FALSE
df4$activity <- ifelse(df4$temp <=10, "Stay at home", "Sleep at home")
df4
##   days temp study      activity
## 1  Mon 10.0  TRUE  Stay at home
## 2  Tue 21.0 FALSE Sleep at home
## 3  Wed 23.0 FALSE Sleep at home
## 4  Thu 25.3 FALSE Sleep at home
## 5  Fri 28.0 FALSE Sleep at home

4.3.3 Renaming

head(df)  
##            X Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
## 1    Alabama       3615   3624        2.1    69.05   15.1    41.3    20  50708
## 2     Alaska        365   6315        1.5    69.31   11.3    66.7   152 566432
## 3    Arizona       2212   4530        1.8    70.55    7.8    58.1    15 113417
## 4   Arkansas       2110   3378        1.9    70.66   10.1    39.9    65  51945
## 5 California      21198   5114        1.1    71.71   10.3    62.6    20 156361
## 6   Colorado       2541   4884        0.7    72.06    6.8    63.9   166 103766
names(df)[1] <- "State"

head(df)  
##        State Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
## 1    Alabama       3615   3624        2.1    69.05   15.1    41.3    20  50708
## 2     Alaska        365   6315        1.5    69.31   11.3    66.7   152 566432
## 3    Arizona       2212   4530        1.8    70.55    7.8    58.1    15 113417
## 4   Arkansas       2110   3378        1.9    70.66   10.1    39.9    65  51945
## 5 California      21198   5114        1.1    71.71   10.3    62.6    20 156361
## 6   Colorado       2541   4884        0.7    72.06    6.8    63.9   166 103766

4.3.4 Filtering

subset(df, State == "Alaska", select = c(Population, Area))
##   Population   Area
## 2        365 566432
# Rows for Alaska or Washington, keeping the columns from State through Income.
subset(df, State %in% c("Alaska", "Washington"),
       select = State:Income)
##         State Population Income
## 2      Alaska        365   6315
## 47 Washington       3559   4864
subset(df, Life.Exp >= 70 & Frost >= 170)
##            State Population Income Illiteracy Life.Exp Murder HS.Grad Frost
## 29 New Hampshire        812   4281        0.7    71.23    3.3    57.6   174
## 34  North Dakota        637   5087        0.8    72.78    1.4    50.3   186
## 41  South Dakota        681   4167        0.5    72.08    1.7    53.3   172
## 50       Wyoming        376   4566        0.6    70.29    6.9    62.9   173
##     Area
## 29  9027
## 34 69273
## 41 75955
## 50 97203
subset(df, Life.Exp >= 70 & Frost >= 170,
       select = c(State, Population, Area))
##            State Population  Area
## 29 New Hampshire        812  9027
## 34  North Dakota        637 69273
## 41  South Dakota        681 75955
## 50       Wyoming        376 97203

5. Data exploration

5.1 Descriptive

Import the data if you have not already done so (see Section 3.4).

head(df)
##        State Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
## 1    Alabama       3615   3624        2.1    69.05   15.1    41.3    20  50708
## 2     Alaska        365   6315        1.5    69.31   11.3    66.7   152 566432
## 3    Arizona       2212   4530        1.8    70.55    7.8    58.1    15 113417
## 4   Arkansas       2110   3378        1.9    70.66   10.1    39.9    65  51945
## 5 California      21198   5114        1.1    71.71   10.3    62.6    20 156361
## 6   Colorado       2541   4884        0.7    72.06    6.8    63.9   166 103766
# if needed
# Convert row names (state names) into a column
# df$State <- rownames(df)


# if needed
# names(df)[1] <- "State"

Mean

mean(df$Population)
## [1] 4246.42

Median

median(df$Population)
## [1] 2838.5

Max

max(df$Population)
## [1] 21198

Min

min(df$Population)
## [1] 365

Standard deviation and variance

sd(df$Income)
## [1] 614.4699
var(df$Income)
## [1] 377573.3

Range

range(df$Murder)
## [1]  1.4 15.1
diff(range(df$Murder))
## [1] 13.7

Quantiles

quantile(df$Life.Exp, probs = c(0.25, 0.5, 0.75))
##     25%     50%     75% 
## 70.1175 70.6750 71.8925

Correlation

cor(df$Income, df$HS.Grad)
## [1] 0.6199323

Correlation matrix

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the numeric columns, compute their pairwise correlations,
# and round the result to 3 decimal places.
# select(where(...)) is used because select_if() is superseded in dplyr.

corr_matrix <- df %>%
  select(where(is.numeric)) %>%
  cor() %>%
  round(3)

corr_matrix
##            Population Income Illiteracy Life.Exp Murder HS.Grad  Frost   Area
## Population      1.000  0.208      0.108   -0.068  0.344  -0.098 -0.332  0.023
## Income          0.208  1.000     -0.437    0.340 -0.230   0.620  0.226  0.363
## Illiteracy      0.108 -0.437      1.000   -0.588  0.703  -0.657 -0.672  0.077
## Life.Exp       -0.068  0.340     -0.588    1.000 -0.781   0.582  0.262 -0.107
## Murder          0.344 -0.230      0.703   -0.781  1.000  -0.488 -0.539  0.228
## HS.Grad        -0.098  0.620     -0.657    0.582 -0.488   1.000  0.367  0.334
## Frost          -0.332  0.226     -0.672    0.262 -0.539   0.367  1.000  0.059
## Area            0.023  0.363      0.077   -0.107  0.228   0.334  0.059  1.000

5.2 Aggregate

Top 10

# load library if not already
# library(dplyr) 

# Ten most populous states: sort by Population, largest first, keep 10 rows.
head(arrange(df, desc(Population)), n = 10)
##            State Population Income Illiteracy Life.Exp Murder HS.Grad Frost
## 1     California      21198   5114        1.1    71.71   10.3    62.6    20
## 2       New York      18076   4903        1.4    70.55   10.9    52.7    82
## 3          Texas      12237   4188        2.2    70.90   12.2    47.4    35
## 4   Pennsylvania      11860   4449        1.0    70.43    6.1    50.2   126
## 5       Illinois      11197   5107        0.9    70.14   10.3    52.6   127
## 6           Ohio      10735   4561        0.8    70.82    7.4    53.2   124
## 7       Michigan       9111   4751        0.9    70.63   11.1    52.8   125
## 8        Florida       8277   4815        1.3    70.66   10.7    52.6    11
## 9     New Jersey       7333   5237        1.1    70.93    5.2    52.5   115
## 10 Massachusetts       5814   4755        1.1    71.83    3.3    58.5   103
##      Area
## 1  156361
## 2   47831
## 3  262134
## 4   44966
## 5   55748
## 6   40975
## 7   56817
## 8   54090
## 9    7521
## 10   7826

Bottom 10

# Ten least populous states: sort by Population ascending, keep 10 rows.
head(arrange(df, Population), n = 10)
##            State Population Income Illiteracy Life.Exp Murder HS.Grad Frost
## 1         Alaska        365   6315        1.5    69.31   11.3    66.7   152
## 2        Wyoming        376   4566        0.6    70.29    6.9    62.9   173
## 3        Vermont        472   3907        0.6    71.64    5.5    57.1   168
## 4       Delaware        579   4809        0.9    70.06    6.2    54.6   103
## 5         Nevada        590   5149        0.5    69.03   11.5    65.2   188
## 6   North Dakota        637   5087        0.8    72.78    1.4    50.3   186
## 7   South Dakota        681   4167        0.5    72.08    1.7    53.3   172
## 8        Montana        746   4347        0.6    70.56    5.0    59.2   155
## 9  New Hampshire        812   4281        0.7    71.23    3.3    57.6   174
## 10         Idaho        813   4119        0.6    71.87    5.3    59.5   126
##      Area
## 1  566432
## 2   97203
## 3    9267
## 4    1982
## 5  109889
## 6   69273
## 7   75955
## 8  145587
## 9    9027
## 10  82677

Mean income and life expectancy by high school graduation category

# Bin HS.Grad into three equal-width categories directly inside group_by(),
# then average income and life expectancy within each category.
df %>%
  group_by(
    HS_Grad_Category = cut(
      HS.Grad,
      breaks = 3,
      labels = c("Low", "Medium", "High")
    )
  ) %>%
  summarise(Mean_Income = mean(Income), Mean_Life_Exp = mean(Life.Exp))
## # A tibble: 3 × 3
##   HS_Grad_Category Mean_Income Mean_Life_Exp
##   <fct>                  <dbl>         <dbl>
## 1 Low                    3762.          69.6
## 2 Medium                 4580.          71.0
## 3 High                   4725.          71.6

Median population and murder rate by frost category

# Bin Frost into three equal-width categories directly inside group_by(),
# then take the median population and murder rate within each category.
df %>%
  group_by(
    Frost_Category = cut(
      Frost,
      breaks = 3,
      labels = c("Low", "Medium", "High")
    )
  ) %>%
  summarise(
    Median_Population = median(Population),
    Median_Murder = median(Murder)
  )
## # A tibble: 3 × 3
##   Frost_Category Median_Population Median_Murder
##   <fct>                      <dbl>         <dbl>
## 1 Low                        3615           10.7
## 2 Medium                     4173            9.3
## 3 High                        994.           3.9

5.3 Visualisations

Top 10 states by population

library(ggplot2)
library(maps)

# Horizontal bar chart of the ten most populous states, with bars ordered
# by population; the fill legend is hidden because the axis already names
# each state.
ggplot(
  data = df %>% arrange(desc(Population)) %>% head(10),
  mapping = aes(x = reorder(State, Population), y = Population, fill = State)
) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  theme_minimal() +
  labs(title = "Top 10 States by Population", x = "State", y = "Population") +
  scale_fill_brewer(palette = "Set3")

Scatter plot of income vs HS graduation rate

# Scatter plot: one point per state, income on x and HS graduation rate on y.
ggplot(data = df, mapping = aes(x = Income, y = HS.Grad)) +
  geom_point(color = "blue", size = 3, alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Income vs High School Graduation Rate",
    x = "Income",
    y = "HS Graduation Rate (%)"
  )

# Same scatter plot, with a fitted linear trend line and its confidence band.
ggplot(data = df, mapping = aes(x = Income, y = HS.Grad)) +
  geom_point(color = "darkred", size = 3, alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE, color = "black") +
  theme_minimal() +
  labs(
    title = "Income vs HS Graduation Rate (with Trend Line)",
    x = "Income",
    y = "HS Graduation Rate (%)"
  )
## `geom_smooth()` using formula = 'y ~ x'

Map Income by state

# State outline polygons; the `region` column holds the state name.
states_map <- map_data("state")

# map_data() region names are lowercase, so the join key must be lowercased.
# This is done on a temporary copy inside the join: overwriting df$State
# would break earlier chunks that match on capitalised names
# (e.g. State == "Alaska") if they are re-run afterwards.
map_df <- left_join(
  states_map,
  mutate(df, State = tolower(State)),
  by = c("region" = "State")
)

# Choropleth: one filled polygon per state, shaded by Income.
ggplot(map_df, aes(long, lat, group = group, fill = Income)) +
  geom_polygon(color = "white") +
  theme_void() +
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Income") +
  labs(title = "Income by State")

“Going outside is highly overrated” – Anorak’s Almanac, Ch 17, Verse 32.