# Assign the value 10 to `a`, then copy that value into `b`.
a <- 10
b <- a
# Display `b` with an explicit print() call.
print(b)
## [1] 10
# Evaluating the bare name at top level displays the value as well.
b
## [1] 10
# The constants TRUE and FALSE both report class "logical".
TRUE
## [1] TRUE
class(TRUE)
## [1] "logical"
FALSE
## [1] FALSE
class(FALSE)
## [1] "logical"
# Store a character string in `y` and display it.
y <- "Greetings Jedi Master"
y
## [1] "Greetings Jedi Master"
# Number literals: 2 and 2.5 are written plainly; 2L carries the L suffix.
2
## [1] 2
2.5
## [1] 2.5
2L
## [1] 2
# class() reports "numeric" for 2 and "integer" for 2L, even though both
# print as 2.
class(2)
## [1] "numeric"
class(2L)
## [1] "integer"
# A vector of numbers built with c(); its class is "numeric".
vector1 <- c(1, 2, 3, 4, 5)
vector1
## [1] 1 2 3 4 5
class(x = vector1)
## [1] "numeric"
# A vector of strings built the same way; its class is "character".
vector2 <- c("S", "U")
vector2
## [1] "S" "U"
class(x = vector2)
## [1] "character"
# A list can hold elements of different types (string, number, logical).
# Named `list1` rather than `list` so the variable does not mask base::list().
list1 <- list("Hello", 123, TRUE)
list1
## [[1]]
## [1] "Hello"
##
## [[2]]
## [1] 123
##
## [[3]]
## [1] TRUE
# Build a 3-row matrix from 1:9; the values fill column by column.
# Named `matrix1` rather than `matrix` so the variable does not mask
# base::matrix().
matrix1 <- matrix(1:9, nrow = 3)
matrix1
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
# Load the state data from CSV; read.csv() treats the first line as the
# header by default. Then preview the first six rows.
df <- read.csv(file = "statex77.csv")
head(df, n = 6L)
## X Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area
## 1 Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708
## 2 Alaska 365 6315 1.5 69.31 11.3 66.7 152 566432
## 3 Arizona 2212 4530 1.8 70.55 7.8 58.1 15 113417
## 4 Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 51945
## 5 California 21198 5114 1.1 71.71 10.3 62.6 20 156361
## 6 Colorado 2541 4884 0.7 72.06 6.8 63.9 166 103766
# Temperature readings, initially without labels.
temps <- c(22, 20, 23, 32, -10)
temps
## [1] 22 20 23 32 -10
# One city label per temperature reading.
cities <- c("Seattle", "Tokyo", "Shanghai", "Hawaii", "Oslo")
cities
## [1] "Seattle" "Tokyo" "Shanghai" "Hawaii" "Oslo"
# Attach the city labels as element names.
temps <- setNames(temps, cities)
temps
## Seattle Tokyo Shanghai Hawaii Oslo
## 22 20 23 32 -10
# Named elements can be looked up by their label.
temps["Oslo"]
## Oslo
## -10
# Pick individual positions (first and third) with a vector of indices.
v3 <- c("A", "B", "C")
v3[c(1, 3)]
## [1] "A" "C"
# Pick a contiguous run of positions with the colon operator.
v4 <- 1:9
v4[3:7]
## [1] 3 4 5 6 7
# Source values for the matrix.
v5 <- 1:12
v5
## [1] 1 2 3 4 5 6 7 8 9 10 11 12
# byrow = TRUE fills the four rows left to right instead of column by column.
matrix2 <- matrix(data = v5, nrow = 4, byrow = TRUE)
matrix2
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
## [3,] 7 8 9
## [4,] 10 11 12
# Label the columns and rows in one step (row names first, then columns).
columns <- c("Tue", "Wed", "Thu")
rows <- c("Sleep", "Eat", "Play", "Laze around")
dimnames(matrix2) <- list(rows, columns)
matrix2
## Tue Wed Thu
## Sleep 1 2 3
## Eat 4 5 6
## Play 7 8 9
## Laze around 10 11 12
# Three equal-length vectors that become the columns of the data frame.
days <- c("Mon", "Tue", "Wed", "Thu", "Fri")
temp <- c(10, 21, 23, 25.3, 28)
study <- c(TRUE, FALSE, FALSE, FALSE, FALSE)
# Column names are spelled out; they match the vector names.
df2 <- data.frame(days = days, temp = temp, study = study)
df2
## days temp study
## 1 Mon 10.0 TRUE
## 2 Tue 21.0 FALSE
## 3 Wed 23.0 FALSE
## 4 Thu 25.3 FALSE
## 5 Fri 28.0 FALSE
# Row 1, all columns.
df2[1, ]
## days temp study
## 1 Mon 10 TRUE
# Row 3, column 2: a single value.
df2[3, 2]
## [1] 23
So, if our class is on Mondays, what should we do during the rest of the week?
# Work on a copy so df2 stays unchanged.
df3 <- df2
df3
## days temp study
## 1 Mon 10.0 TRUE
## 2 Tue 21.0 FALSE
## 3 Wed 23.0 FALSE
## 4 Thu 25.3 FALSE
## 5 Fri 28.0 FALSE
# Add an `activity` column: "Study hard" on Monday, "Be lazy" otherwise.
df3[["activity"]] <- with(df3, ifelse(days == "Mon", "Study hard", "Be lazy"))
df3
## days temp study activity
## 1 Mon 10.0 TRUE Study hard
## 2 Tue 21.0 FALSE Be lazy
## 3 Wed 23.0 FALSE Be lazy
## 4 Thu 25.3 FALSE Be lazy
## 5 Fri 28.0 FALSE Be lazy
How does temperature affect our daily activities?
# Work on a copy so df2 stays unchanged.
df4 <- df2
df4
## days temp study
## 1 Mon 10.0 TRUE
## 2 Tue 21.0 FALSE
## 3 Wed 23.0 FALSE
## 4 Thu 25.3 FALSE
## 5 Fri 28.0 FALSE
# Add an `activity` column chosen by temperature: 10 or below gets
# "Stay at home", anything warmer gets "Sleep at home".
df4[["activity"]] <- with(
  df4,
  ifelse(temp <= 10, "Stay at home", "Sleep at home")
)
df4
## days temp study activity
## 1 Mon 10.0 TRUE Stay at home
## 2 Tue 21.0 FALSE Sleep at home
## 3 Wed 23.0 FALSE Sleep at home
## 4 Thu 25.3 FALSE Sleep at home
## 5 Fri 28.0 FALSE Sleep at home
# Before renaming: the first column is called "X".
head(df, n = 6L)
## X Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area
## 1 Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708
## 2 Alaska 365 6315 1.5 69.31 11.3 66.7 152 566432
## 3 Arizona 2212 4530 1.8 70.55 7.8 58.1 15 113417
## 4 Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 51945
## 5 California 21198 5114 1.1 71.71 10.3 62.6 20 156361
## 6 Colorado 2541 4884 0.7 72.06 6.8 63.9 166 103766
# Give the first column a meaningful name, then preview again.
colnames(df)[1] <- "State"
head(df, n = 6L)
## State Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area
## 1 Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708
## 2 Alaska 365 6315 1.5 69.31 11.3 66.7 152 566432
## 3 Arizona 2212 4530 1.8 70.55 7.8 58.1 15 113417
## 4 Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 51945
## 5 California 21198 5114 1.1 71.71 10.3 62.6 20 156361
## 6 Colorado 2541 4884 0.7 72.06 6.8 63.9 166 103766
# One state, two columns.
subset(df, subset = State == "Alaska", select = c(Population, Area))
## Population Area
## 2 365 566432
# Two states, keeping the run of columns from State through Income.
subset(
  df,
  subset = State %in% c("Alaska", "Washington"),
  select = State:Income
)
## State Population Income
## 2 Alaska 365 6315
## 47 Washington 3559 4864
# Rows where both conditions hold; all columns are kept.
subset(df, subset = Frost >= 170 & Life.Exp >= 70)
## State Population Income Illiteracy Life.Exp Murder HS.Grad Frost
## 29 New Hampshire 812 4281 0.7 71.23 3.3 57.6 174
## 34 North Dakota 637 5087 0.8 72.78 1.4 50.3 186
## 41 South Dakota 681 4167 0.5 72.08 1.7 53.3 172
## 50 Wyoming 376 4566 0.6 70.29 6.9 62.9 173
## Area
## 29 9027
## 34 69273
## 41 75955
## 50 97203
# Same row filter, but keep only three columns.
subset(
  df,
  subset = Frost >= 170 & Life.Exp >= 70,
  select = c(State, Population, Area)
)
## State Population Area
## 29 New Hampshire 812 9027
## 34 North Dakota 637 69273
## 41 South Dakota 681 75955
## 50 Wyoming 376 97203
Import the data if you have not already done so (see Sect. 3.4)
# Preview the first six rows to confirm the data is loaded.
head(df, n = 6L)
## State Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area
## 1 Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708
## 2 Alaska 365 6315 1.5 69.31 11.3 66.7 152 566432
## 3 Arizona 2212 4530 1.8 70.55 7.8 58.1 15 113417
## 4 Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 51945
## 5 California 21198 5114 1.1 71.71 10.3 62.6 20 156361
## 6 Colorado 2541 4884 0.7 72.06 6.8 63.9 166 103766
# If the state names were imported as row names instead of a column,
# convert them into a column:
# df$State <- rownames(df)
# If the first column is still called "X", rename it:
# names(df)[1] <- "State"
Mean
# Each statistic is computed on the Population column.
mean(df[["Population"]])
## [1] 4246.42
Median
median(df[["Population"]])
## [1] 2838.5
Max
max(df[["Population"]])
## [1] 21198
Min
min(df[["Population"]])
## [1] 365
sd and variance
# Spread of the Income column.
sd(df[["Income"]])
## [1] 614.4699
var(df[["Income"]])
## [1] 377573.3
Range
# range() returns the minimum and maximum; diff() turns them into a width.
range(df[["Murder"]])
## [1] 1.4 15.1
diff(range(df[["Murder"]]))
## [1] 13.7
Quantiles
# Quartiles (25%, 50%, 75%) of life expectancy.
quantile(df[["Life.Exp"]], probs = c(0.25, 0.50, 0.75))
## 25% 50% 75%
## 70.1175 70.6750 71.8925
Correlation
# Correlation between income and HS graduation rate.
cor(x = df[["Income"]], y = df[["HS.Grad"]])
## [1] 0.6199323
Correlation matrix
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the numeric columns, compute their pairwise correlations,
# and round the result to 3 decimal places.
# select(where(...)) replaces the superseded scoped verb select_if().
corr_matrix <- df %>%
  select(where(is.numeric)) %>%
  cor() %>%
  round(3)
corr_matrix
## Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area
## Population 1.000 0.208 0.108 -0.068 0.344 -0.098 -0.332 0.023
## Income 0.208 1.000 -0.437 0.340 -0.230 0.620 0.226 0.363
## Illiteracy 0.108 -0.437 1.000 -0.588 0.703 -0.657 -0.672 0.077
## Life.Exp -0.068 0.340 -0.588 1.000 -0.781 0.582 0.262 -0.107
## Murder 0.344 -0.230 0.703 -0.781 1.000 -0.488 -0.539 0.228
## HS.Grad -0.098 0.620 -0.657 0.582 -0.488 1.000 0.367 0.334
## Frost -0.332 0.226 -0.672 0.262 -0.539 0.367 1.000 0.059
## Area 0.023 0.363 0.077 -0.107 0.228 0.334 0.059 1.000
Top 10
# dplyr must be attached; uncomment the next line if it is not loaded yet.
# library(dplyr)
# Sort by population, largest first, and keep the first ten rows.
head(arrange(df, desc(Population)), n = 10)
## State Population Income Illiteracy Life.Exp Murder HS.Grad Frost
## 1 California 21198 5114 1.1 71.71 10.3 62.6 20
## 2 New York 18076 4903 1.4 70.55 10.9 52.7 82
## 3 Texas 12237 4188 2.2 70.90 12.2 47.4 35
## 4 Pennsylvania 11860 4449 1.0 70.43 6.1 50.2 126
## 5 Illinois 11197 5107 0.9 70.14 10.3 52.6 127
## 6 Ohio 10735 4561 0.8 70.82 7.4 53.2 124
## 7 Michigan 9111 4751 0.9 70.63 11.1 52.8 125
## 8 Florida 8277 4815 1.3 70.66 10.7 52.6 11
## 9 New Jersey 7333 5237 1.1 70.93 5.2 52.5 115
## 10 Massachusetts 5814 4755 1.1 71.83 3.3 58.5 103
## Area
## 1 156361
## 2 47831
## 3 262134
## 4 44966
## 5 55748
## 6 40975
## 7 56817
## 8 54090
## 9 7521
## 10 7826
Bottom 10
# Sort by population, smallest first, and keep the first ten rows.
head(arrange(df, Population), n = 10)
## State Population Income Illiteracy Life.Exp Murder HS.Grad Frost
## 1 Alaska 365 6315 1.5 69.31 11.3 66.7 152
## 2 Wyoming 376 4566 0.6 70.29 6.9 62.9 173
## 3 Vermont 472 3907 0.6 71.64 5.5 57.1 168
## 4 Delaware 579 4809 0.9 70.06 6.2 54.6 103
## 5 Nevada 590 5149 0.5 69.03 11.5 65.2 188
## 6 North Dakota 637 5087 0.8 72.78 1.4 50.3 186
## 7 South Dakota 681 4167 0.5 72.08 1.7 53.3 172
## 8 Montana 746 4347 0.6 70.56 5.0 59.2 155
## 9 New Hampshire 812 4281 0.7 71.23 3.3 57.6 174
## 10 Idaho 813 4119 0.6 71.87 5.3 59.5 126
## Area
## 1 566432
## 2 97203
## 3 9267
## 4 1982
## 5 109889
## 6 69273
## 7 75955
## 8 145587
## 9 9027
## 10 82677
Mean income and life expectancy by high school graduation category
# Bin HS.Grad into three labelled intervals with cut(), group on that new
# column, and report the mean income and mean life expectancy per bin.
df %>%
  group_by(
    HS_Grad_Category = cut(
      HS.Grad,
      breaks = 3,
      labels = c("Low", "Medium", "High")
    )
  ) %>%
  summarise(
    Mean_Income = mean(Income),
    Mean_Life_Exp = mean(Life.Exp)
  )
## # A tibble: 3 × 3
## HS_Grad_Category Mean_Income Mean_Life_Exp
## <fct> <dbl> <dbl>
## 1 Low 3762. 69.6
## 2 Medium 4580. 71.0
## 3 High 4725. 71.6
Median population and murder rate by frost category
# Bin Frost into three labelled intervals with cut(), group on that new
# column, and report the median population and median murder rate per bin.
df %>%
  group_by(
    Frost_Category = cut(
      Frost,
      breaks = 3,
      labels = c("Low", "Medium", "High")
    )
  ) %>%
  summarise(
    Median_Population = median(Population),
    Median_Murder = median(Murder)
  )
## # A tibble: 3 × 3
## Frost_Category Median_Population Median_Murder
## <fct> <dbl> <dbl>
## 1 Low 3615 10.7
## 2 Medium 4173 9.3
## 3 High 994. 3.9
Top 10 states by population
library(ggplot2)
library(maps)
# Horizontal bar chart of the ten most populous states.
# reorder() sorts the bars by population; coord_flip() turns them sideways.
df %>%
  arrange(desc(Population)) %>%
  head(10) %>%
  ggplot(aes(x = reorder(State, Population), y = Population, fill = State)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Top 10 States by Population",
    x = "State",
    y = "Population"
  ) +
  scale_fill_brewer(palette = "Set3")
Scatter plot of income vs HS graduation rate
# Scatter plot: one semi-transparent blue point per state.
ggplot(data = df, mapping = aes(x = Income, y = HS.Grad)) +
  geom_point(colour = "blue", size = 3, alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Income vs High School Graduation Rate",
    x = "Income",
    y = "HS Graduation Rate (%)"
  )
# Same scatter plot with a fitted linear trend line (method = "lm");
# se = TRUE draws the confidence band around it.
ggplot(data = df, mapping = aes(x = Income, y = HS.Grad)) +
  geom_point(colour = "darkred", size = 3, alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE, colour = "black") +
  theme_minimal() +
  labs(
    title = "Income vs HS Graduation Rate (with Trend Line)",
    x = "Income",
    y = "HS Graduation Rate (%)"
  )
## `geom_smooth()` using formula = 'y ~ x'
Map of income by state
# Polygon outlines for the states, one row per boundary point.
states_map <- map_data("state")
# State names must be lowercase to match map_data's `region` column.
# The lowercasing is applied to a temporary copy inside the join, so `df`
# keeps its original capitalised names and earlier filters such as
# State == "Alaska" still work if they are re-run.
map_df <- left_join(
  states_map,
  mutate(df, State = tolower(State)),
  by = c("region" = "State")
)
# Plot: fill each state polygon by income, light to dark blue.
ggplot(map_df, aes(x = long, y = lat, group = group, fill = Income)) +
  geom_polygon(color = "white") +
  theme_void() +
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Income") +
  labs(title = "Income by State")
“Going outside is highly overrated” – Anorak’s Almanac, Ch 17, Verse 32.