# 1. Load data ------------------------------------------------------------
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.4
## -- Attaching packages ------------- tidyverse 1.2.1 --
## v ggplot2 3.1.1 v purrr 0.3.2
## v tibble 2.1.1 v dplyr 0.8.0.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.1.1 v forcats 0.2.0
## Warning: package 'ggplot2' was built under R version 3.4.4
## Warning: package 'tibble' was built under R version 3.4.4
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## -- Conflicts ---------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Read the country-level demographic data from the working directory.
# `stringsAsFactors = TRUE` is set explicitly: the default became FALSE in
# R 4.0.0, and the recorded output in this script shows `country` and
# `continent` as factors.
country_demo <- read.csv(
  "country_demo.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
# Preview the first 10 rows.
head(country_demo, 10)
## X country continent year lifeExp pop gdpPercap
## 1 1 Afghanistan Asia 1952 28.801 8425333 779.4453
## 2 2 Afghanistan Asia 1957 30.332 9240934 820.8530
## 3 3 Afghanistan Asia 1962 31.997 10267083 853.1007
## 4 4 Afghanistan Asia 1967 34.020 11537966 836.1971
## 5 5 Afghanistan Asia 1972 36.088 13079460 739.9811
## 6 6 Afghanistan Asia 1977 38.438 14880372 786.1134
## 7 7 Afghanistan Asia 1982 39.854 12881816 978.0114
## 8 8 Afghanistan Asia 1987 40.822 13867957 852.3959
## 9 9 Afghanistan Asia 1992 41.674 16317921 649.3414
## 10 10 Afghanistan Asia 1997 41.763 22227415 635.3414
# Column names as read from the CSV.
colnames(country_demo)
## [1] "X" "country" "continent" "year" "lifeExp" "pop"
## [7] "gdpPercap"
# Compact overview: number of rows/columns and the type of each column.
utils::str(country_demo)
## 'data.frame': 1704 obs. of 7 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ pop : int 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
## $ gdpPercap: num 779 821 853 836 740 ...
# Per-column summary statistics (counts for factors, quantiles for numerics).
summary(object = country_demo)
## X country continent year
## Min. : 1.0 Afghanistan: 12 Africa :624 Min. :1952
## 1st Qu.: 426.8 Albania : 12 Americas:300 1st Qu.:1966
## Median : 852.5 Algeria : 12 Asia :396 Median :1980
## Mean : 852.5 Angola : 12 Europe :360 Mean :1980
## 3rd Qu.:1278.2 Argentina : 12 Oceania : 24 3rd Qu.:1993
## Max. :1704.0 Australia : 12 Max. :2007
## (Other) :1632
## lifeExp pop gdpPercap
## Min. :23.60 Min. :6.001e+04 Min. : 241.2
## 1st Qu.:48.20 1st Qu.:2.794e+06 1st Qu.: 1202.1
## Median :60.71 Median :7.024e+06 Median : 3531.8
## Mean :59.47 Mean :2.960e+07 Mean : 7215.3
## 3rd Qu.:70.85 3rd Qu.:1.959e+07 3rd Qu.: 9325.5
## Max. :82.60 Max. :1.319e+09 Max. :113523.1
##
# Rename the column "X" (the first column as read from the CSV) to "ID".
# The column is matched by name rather than by position, so if "X" is ever
# absent no other column (e.g. "country") is renamed by mistake.
names(country_demo)[names(country_demo) == "X"] <- "ID"
# Confirm the resulting column names.
names(country_demo)
## [1] "ID" "country" "continent" "year" "lifeExp" "pop"
## [7] "gdpPercap"
# 2. Interactive scatter plot --------------------------------------------------------
# Keep only the observations recorded in 2007.
country_demo_2007 <- country_demo %>%
  filter(year == 2007)
# Interactive plots
library(plotly)
## Warning: package 'plotly' was built under R version 3.4.4
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Scatter plot of life expectancy against GDP per capita for 2007, coloured
# by continent, with GDP per capita drawn on a log10 x-axis.
scatter_v1 <- ggplot(
  country_demo_2007,
  aes(x = gdpPercap, y = lifeExp, colour = continent)
) +
  geom_point() +
  scale_x_log10() +
  labs(
    title = "Life expectancy and GDP per Capita, 2007",
    x = "GDP per capita (log scale)",
    y = "Life expectancy",
    colour = "Continent"
  )
# Convert the static ggplot object into an interactive plotly widget.
ggplotly(scatter_v1)
# 3. Interactive line chart ----------------------------------------------------------
# Total population for every year/continent combination.
# `pop` is an integer column; it is converted to double before summing
# (presumably so that large totals do not overflow R's integer range).
country_demo_by_year_v3 <- country_demo %>%
  group_by(year, continent) %>%
  summarise(tot_pop = sum(as.numeric(pop)))
# Print the aggregated table.
country_demo_by_year_v3
## # A tibble: 60 x 3
## # Groups: year [12]
## year continent tot_pop
## <int> <fct> <dbl>
## 1 1952 Africa 237640501
## 2 1952 Americas 345152446
## 3 1952 Asia 1395357351
## 4 1952 Europe 418120846
## 5 1952 Oceania 10686006
## 6 1957 Africa 264837738
## 7 1957 Americas 386953916
## 8 1957 Asia 1562780599
## 9 1957 Europe 437890351
## 10 1957 Oceania 11941976
## # ... with 50 more rows
names(country_demo_by_year_v3)
## [1] "year" "continent" "tot_pop"
# Line chart of total population over time, one line per continent.
# Continent is mapped to both colour and line type; giving both scales the
# same legend title lets ggplot2 merge them into a single legend.
line_v1 <- ggplot(
  country_demo_by_year_v3,
  aes(x = year, y = tot_pop, colour = continent)
) +
  geom_line(aes(linetype = continent), size = 1.5) +
  ggtitle("Total population by year") +
  xlab("Year") +
  ylab("Total population") +
  labs(colour = "Continent", linetype = "Continent")
# Convert to an interactive plot and hide the legend.
# plotly's `legend` layout attribute expects a list of legend options, so
# `legend = "none"` does not hide it; `showlegend = FALSE` is the supported
# switch.
ggplotly(line_v1) %>%
  layout(showlegend = FALSE)
# 4. Interactive histogram -----------------------------------------------------------
# Histogram of 2007 GDP per capita on a log10 x-axis.
# `bins = 30` states ggplot2's implicit default explicitly: the figure is
# unchanged, but the "`stat_bin()` using `bins = 30`" message is no longer
# emitted and the bin count is easy to tune in one place.
histogram_v1 <- ggplot(country_demo_2007, aes(x = gdpPercap)) +
  geom_histogram(bins = 30, fill = "Orange", colour = "blue") +
  scale_x_log10() +
  xlab("GDP per capita (log scale)") +
  ylab("Count") +
  ggtitle("Histogram of GDP per capita, 2007")
# Convert the static histogram into an interactive plotly widget.
ggplotly(histogram_v1)
# 5. Interactive boxplot -------------------------------------------------------------
# Box plot of 2007 GDP per capita, one box per continent.
box_v1 <- ggplot(
  country_demo_2007,
  aes(x = continent, y = gdpPercap)
) +
  geom_boxplot(fill = "yellow", colour = "blue") +
  labs(x = "Continent", y = "GDP per capita")
# Convert the static box plot into an interactive plotly widget.
ggplotly(box_v1)