# 1. Load data ------------------------------------------------------------
# Read the planets table from the working directory.
# `stringsAsFactors = TRUE` is set explicitly: since R 4.0.0 read.csv() keeps
# text columns as character by default, whereas the summary() and str() output
# recorded in this script shows the text columns (Name, Region, Rating_Nom,
# ...) as factors. Being explicit gives the same result on every R version.
# NOTE(review): the recorded Region level "Beyond the\xcaRishi Maze" contains
# a raw non-ASCII byte, so the file is probably not in the encoding R assumed.
# Confirm the source encoding and pass `fileEncoding` if needed.
star_wars_planets <- read.csv(
  "star_wars_planets_v5.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
# Preview the first six rows.
head(star_wars_planets)
## Name Region Sector System Inhabitants
## 1 Aargonar Unknown Unknown Unknown Unknown
## 2 Abafar Outer Rim Territories Sprizen sector Unknown Unknown
## 3 Abednedo Colonies Unknown Unknown Abednedo
## 4 Absanz Unknown Unknown Unknown Unknown
## 5 Affadar Unknown Unknown Unknown T'Laeem
## 6 Agamar Outer Rim Territories Unknown Agamar system Unknown
## Capital_City Population Record_High_Temp Record_Low_Temp Gravity
## 1 Unknown 3857305 36 -5 8.35
## 2 Unknown 2331308 33 -27 11.40
## 3 Unknown 5358387 28 -5 9.55
## 4 Unknown 7229751 38 -6 10.80
## 5 Unknown 4307513 37 -31 9.75
## 6 Unknown 6191940 38 -2 9.00
## Key_Element Rotation Revolution Rating Rating_Nom Resources
## 1 6 59.89 2112 7 Yay 8
## 2 28 49.20 1818 2 Yucks 9
## 3 30 127.76 1210 3 Yucks 5
## 4 57 145.98 1431 9 Yay 3
## 5 22 58.51 1060 9 Yay 7
## 6 34 148.23 1450 5 Meh 4
## Resources_Nom
## 1 Awesome
## 2 Awesome
## 3 Blah
## 4 Blah
## 5 Awesome
## 6 Blah
# Per-column overview: level counts for factors, quartiles for numerics.
summary(object = star_wars_planets)
## Name Region Sector
## Aargonar: 1 Outer Rim Territories:243 Unknown :526
## Abafar : 1 Unknown :239 Anoat sector : 30
## Abednedo: 1 Mid Rim : 70 Hutt Space : 9
## Absanz : 1 Inner Rim : 30 Corporate Sector: 5
## Affadar : 1 Core Worlds : 28 Mandalore sector: 5
## Agamar : 1 Wild Space : 19 Abrion sector : 4
## (Other) :677 (Other) : 54 (Other) :104
## System Inhabitants Capital_City
## Unknown :501 Unknown :371 Unknown :650
## Five Points system: 5 Humans : 51 Aldera : 1
## Hosnian system : 5 Various : 18 Bardotta Capital City: 1
## Atterra system : 3 Hutts : 3 Canto Bight : 1
## Galidraan system : 3 Ithorians: 3 Capital City : 1
## Jinata system : 3 Human : 2 Central City : 1
## (Other) :163 (Other) :235 (Other) : 28
## Population Record_High_Temp Record_Low_Temp Gravity
## Min. :1014877 Min. : 0.00 Min. :-60.00 Min. : 7.000
## 1st Qu.:3226102 1st Qu.: 10.00 1st Qu.:-30.00 1st Qu.: 8.280
## Median :5458429 Median : 21.00 Median :-20.00 Median : 9.570
## Mean :5522815 Mean : 20.96 Mean :-19.57 Mean : 9.542
## 3rd Qu.:7777546 3rd Qu.: 31.00 3rd Qu.: -9.00 3rd Qu.:10.805
## Max. :9973030 Max. :100.00 Max. : 30.00 Max. :12.000
##
## Key_Element Rotation Revolution Rating
## Min. : 1.00 Min. : 10.07 Min. : 100.0 Min. :1.000
## 1st Qu.: 28.00 1st Qu.: 64.32 1st Qu.: 740.5 1st Qu.:3.000
## Median : 55.00 Median :125.10 Median :1315.0 Median :5.000
## Mean : 53.62 Mean :127.44 Mean :1334.5 Mean :5.081
## 3rd Qu.: 80.00 3rd Qu.:194.54 3rd Qu.:1961.0 3rd Qu.:7.000
## Max. :105.00 Max. :249.23 Max. :2499.0 Max. :9.000
##
## Rating_Nom Resources Resources_Nom
## Meh :235 Min. :1.000 Awesome:321
## Yay :236 1st Qu.:3.000 Blah :362
## Yucks:212 Median :5.000
## Mean :5.152
## 3rd Qu.:7.000
## Max. :9.000
##
# Number of observations (rows) in the table.
nrow(x = star_wars_planets)
## [1] 683
# Compact structure: each column's type and its leading values.
str(object = star_wars_planets)
## 'data.frame': 683 obs. of 17 variables:
## $ Name : Factor w/ 683 levels "Aargonar","Abafar",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Region : Factor w/ 14 levels "Beyond the\xcaRishi Maze",..: 11 10 2 11 11 10 14 11 11 11 ...
## $ Sector : Factor w/ 81 levels "7G sector","Abrion sector",..: 81 69 81 81 81 81 81 81 81 81 ...
## $ System : Factor w/ 153 levels "Agamar system",..: 143 143 143 143 143 1 143 2 2 3 ...
## $ Inhabitants : Factor w/ 232 levels "Abednedo","Abyssin",..: 207 207 1 207 180 207 5 207 207 207 ...
## $ Capital_City : Factor w/ 34 levels "Aldera","Bardotta Capital City",..: 32 32 32 32 32 32 32 32 32 32 ...
## $ Population : int 3857305 2331308 5358387 7229751 4307513 6191940 9622189 4432983 1125034 6244208 ...
## $ Record_High_Temp: int 36 33 28 38 37 38 29 39 33 40 ...
## $ Record_Low_Temp : int -5 -27 -5 -6 -31 -2 -35 -6 -16 -15 ...
## $ Gravity : num 8.35 11.4 9.55 10.8 9.75 ...
## $ Key_Element : int 6 28 30 57 22 34 105 57 37 74 ...
## $ Rotation : num 59.9 49.2 127.8 146 58.5 ...
## $ Revolution : int 2112 1818 1210 1431 1060 1450 2245 440 494 1999 ...
## $ Rating : int 7 2 3 9 9 5 5 4 8 2 ...
## $ Rating_Nom : Factor w/ 3 levels "Meh","Yay","Yucks": 2 3 3 2 2 1 1 1 2 3 ...
## $ Resources : int 8 9 5 3 7 4 5 8 4 4 ...
## $ Resources_Nom : Factor w/ 2 levels "Awesome","Blah": 1 1 2 2 1 2 2 1 2 2 ...
# Column names, printed together with their positions.
names(x = star_wars_planets)
## [1] "Name" "Region" "Sector"
## [4] "System" "Inhabitants" "Capital_City"
## [7] "Population" "Record_High_Temp" "Record_Low_Temp"
## [10] "Gravity" "Key_Element" "Rotation"
## [13] "Revolution" "Rating" "Rating_Nom"
## [16] "Resources" "Resources_Nom"
# 2. Filter variables for clustering --------------------------------------
# Keep only the three numeric variables used for clustering.
# Columns are selected by name instead of by position (7, 10, 12), so the
# selection still works if the CSV columns are reordered. The former subset()
# wrapper had no row condition and was a no-op, so it is dropped.
star_wars_planets_filter <-
  star_wars_planets[c("Population", "Gravity", "Rotation")]
# Confirm the selection and the column types.
names(star_wars_planets_filter)
## [1] "Population" "Gravity" "Rotation"
str(star_wars_planets_filter)
## 'data.frame': 683 obs. of 3 variables:
## $ Population: int 3857305 2331308 5358387 7229751 4307513 6191940 9622189 4432983 1125034 6244208 ...
## $ Gravity : num 8.35 11.4 9.55 10.8 9.75 ...
## $ Rotation : num 59.9 49.2 127.8 146 58.5 ...
# 3. Data exploration -----------------------------------------------------
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Population: box plot, histogram (default binning) and numeric summary.
ggplot(star_wars_planets_filter, aes(y = Population)) +
  geom_boxplot(fill = "#C6C6FF") +
  ggtitle("Size of population") +
  ylab("Population") +
  theme_classic()
ggplot(star_wars_planets_filter, aes(x = Population)) +
  geom_histogram(fill = "#C6C6FF") +
  ggtitle("Histogram of size of population") +
  ylab("Count") +
  theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
summary(star_wars_planets_filter[["Population"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1014877 3226102 5458429 5522815 7777546 9973030

# Gravity: same three views, drawn with the dark theme.
ggplot(star_wars_planets_filter, aes(y = Gravity)) +
  geom_boxplot(fill = "#C6C6CD") +
  ggtitle("Strength of gravitational force") +
  ylab("Gravity") +
  theme_dark()
ggplot(star_wars_planets_filter, aes(x = Gravity)) +
  geom_histogram(fill = "#C6C6CD") +
  ggtitle("Histogram of gravitational force") +
  ylab("Count") +
  theme_dark()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
summary(star_wars_planets_filter[["Gravity"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.000 8.280 9.570 9.542 10.805 12.000

# Rotation (length of day): same three views, drawn with the linedraw theme.
ggplot(star_wars_planets_filter, aes(y = Rotation)) +
  geom_boxplot(fill = "#C6C666") +
  ggtitle("Length of day") +
  ylab("Rotation") +
  theme_linedraw()
ggplot(star_wars_planets_filter, aes(x = Rotation)) +
  geom_histogram(fill = "#C6C666") +
  ggtitle("Histogram of length of day") +
  ylab("Count") +
  theme_linedraw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
summary(star_wars_planets_filter[["Rotation"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.07 64.32 125.10 127.44 194.54 249.23
# 4. Normalise data -------------------------------------------------------
# Standardise each clustering variable to a z-score (centred, unit sd) so
# that no variable dominates the distance calculation through its scale.
# vapply() replaces sapply(): the declared FUN.VALUE guarantees a numeric
# matrix with one row per planet and one named column per variable, instead
# of a result whose type depends on the input.
star_wars_planets_filter_norm <- vapply(
  star_wars_planets_filter,
  function(column) as.numeric(scale(column)),
  FUN.VALUE = numeric(nrow(star_wars_planets_filter))
)
head(star_wars_planets_filter_norm)
## Population Gravity Rotation
## [1,] -0.62792716 -0.824062435 -0.950164564
## [2,] -1.20325536 1.284398527 -1.100531714
## [3,] -0.06199235 0.005495976 0.004505081
## [4,] 0.64354540 0.869619322 0.260790346
## [5,] -0.45819067 0.143755712 -0.969575852
## [6,] 0.25227206 -0.374718296 0.292439185
# 5. Compute normalised distances -----------------------------------------
# Pairwise Euclidean distances between the rows of the normalised matrix.
star_wars_planets_filter_norm_m <- dist(
  x = star_wars_planets_filter_norm,
  method = "euclidean"
)
# First six entries of the distance object.
head(star_wars_planets_filter_norm_m, n = 6L)
## [1] 2.1907123 1.3855842 2.4395926 0.9827814 1.5876791 3.0200473
# 6. Hierarchical clustering ----------------------------------------------
# Agglomerative clustering of the distance object using the "ward.D2" linkage.
star_wars_planets_filter_norm_m_hc <- hclust(
  d = star_wars_planets_filter_norm_m,
  method = "ward.D2"
)
# Dendrogram with axis annotation; `hang` is negative, which plot.hclust
# documents as making the leaf labels hang down from height 0.
plot(
  x = star_wars_planets_filter_norm_m_hc,
  ann = TRUE,
  hang = -100
)
# 7. Check clustering validity --------------------------------------------
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
# fviz_nbclust(x, FUNcluster, method = c("silhouette", "wss", "gap_stat"))
# x: numeric matrix or data frame
# FUNcluster: a partitioning function. Allowed values include kmeans, pam,
#   clara and hcut (for hierarchical clustering).
# method: the method to be used for determining the optimal number of
#   clusters.

# Elbow method: total within-cluster sum of squares against k.
# NOTE(review): the dashed reference line marks k = 2, while the cut applied
# later in this script uses k = 5 - confirm which value is intended.
fviz_nbclust(star_wars_planets_filter_norm, hcut, method = "wss") +
  geom_vline(xintercept = 2, linetype = 2) +
  labs(subtitle = "Elbow method")

# Silhouette method: average silhouette width against k.
fviz_nbclust(star_wars_planets_filter_norm, hcut, method = "silhouette") +
  labs(subtitle = "Silhouette method")

# Gap statistic with 50 bootstrap samples; the seed is set immediately before
# the call so the resampled result is repeatable.
# `nstart = 25` was removed: it is a kmeans() argument (number of random
# starts) and has no meaning for the hierarchical hcut().
set.seed(666)
fviz_nbclust(
  star_wars_planets_filter_norm,
  hcut,
  method = "gap_stat",
  nboot = 50
) +
  labs(subtitle = "Gap statistic method")
# 8. Hierarchical clustering k = 5 ----------------------------------------
# Cut the dendrogram into five groups, giving one cluster id per observation.
star_wars_planets_filter_norm_m_hc_memb_5 <- cutree(
  tree = star_wars_planets_filter_norm_m_hc,
  k = 5
)
# Show the full membership vector.
star_wars_planets_filter_norm_m_hc_memb_5
## [1] 1 2 3 4 2 3 4 3 3 4 5 2 3 2 3 4 5 3 4 2 4 4 3 2 5 2 2 2 3 1 3 4 3 5 2
## [36] 3 4 3 1 1 3 3 5 2 4 2 3 3 4 3 3 1 4 1 1 3 1 3 4 4 4 5 5 1 1 2 4 3 4 2
## [71] 2 3 2 5 2 3 3 2 4 1 5 2 5 1 5 3 4 3 2 1 4 1 1 1 5 1 3 1 1 2 3 3 3 5 4
## [106] 1 3 5 3 4 3 1 1 4 2 3 3 4 4 3 4 4 3 2 1 1 5 3 3 5 3 3 5 4 3 4 3 4 4 1
## [141] 5 2 2 4 4 2 5 5 4 1 3 1 4 5 1 3 3 4 1 2 3 5 3 1 4 3 4 3 2 1 2 2 5 1 3
## [176] 5 1 5 4 3 1 4 2 4 4 2 4 3 5 3 3 2 5 4 1 5 5 3 3 3 3 4 3 2 3 5 4 5 3 4
## [211] 5 5 3 5 3 4 1 1 1 1 1 5 5 4 1 1 5 4 1 5 3 4 1 3 5 1 1 4 2 3 2 5 2 3 2
## [246] 3 1 2 4 3 4 1 1 1 1 1 1 3 3 1 1 4 1 3 3 1 3 2 2 1 4 1 1 5 4 3 2 1 1 3
## [281] 3 4 4 5 3 3 3 4 4 1 1 1 3 4 5 3 2 4 4 3 1 2 1 2 4 2 3 3 2 3 4 3 4 3 5
## [316] 4 5 4 5 3 4 3 4 3 4 1 5 4 2 4 4 5 2 3 3 4 5 3 2 5 5 4 5 2 2 1 3 3 3 2
## [351] 3 3 1 5 5 5 4 4 1 5 1 4 4 5 3 2 2 2 2 2 3 1 2 3 1 1 1 3 5 2 3 3 4 1 3
## [386] 1 1 1 3 2 5 2 2 4 4 5 5 4 2 1 5 5 4 4 2 5 1 3 1 3 3 2 1 4 2 3 3 3 3 2
## [421] 3 1 4 5 2 5 3 1 3 4 1 2 2 4 4 3 4 5 1 2 5 1 1 1 3 4 3 1 3 2 1 3 4 1 4
## [456] 4 5 2 4 5 3 2 5 5 2 1 3 2 3 3 5 3 3 5 4 5 4 1 4 3 1 3 4 3 2 3 5 5 4 4
## [491] 4 2 4 3 1 3 4 4 3 2 2 4 2 3 5 1 2 3 4 3 5 5 2 3 5 5 1 2 2 2 3 5 1 1 3
## [526] 3 1 1 4 5 4 5 5 5 1 5 5 4 3 4 4 1 5 3 5 2 2 2 3 4 1 5 5 3 5 3 4 1 3 1
## [561] 4 4 1 2 2 3 2 1 4 2 1 3 1 4 3 2 4 2 1 1 4 3 3 4 4 1 5 3 2 5 1 1 1 3 2
## [596] 4 5 3 5 2 5 2 5 4 5 4 4 5 3 3 4 2 3 2 2 5 3 4 3 2 4 1 3 2 5 5 5 4 1 2
## [631] 1 3 4 1 2 3 5 5 2 3 4 3 2 3 4 1 3 2 5 5 4 4 3 3 1 4 5 4 2 4 2 4 3 2 3
## [666] 1 4 3 5 3 5 1 3 4 3 1 1 1 4 1 5 3 1
# 9. Hierarchical cluster characteristics ---------------------------------
# Wrap the membership vector in a one-column data frame; the column initially
# takes the name of the vector it was built from.
star_wars_planets_filter_norm_m_hc_memb_5_df <- as.data.frame(
  star_wars_planets_filter_norm_m_hc_memb_5
)
head(star_wars_planets_filter_norm_m_hc_memb_5_df, n = 6L)
## star_wars_planets_filter_norm_m_hc_memb_5
## 1 1
## 2 2
## 3 3
## 4 4
## 5 2
## 6 3
# Give the membership column a short, readable name.
names(star_wars_planets_filter_norm_m_hc_memb_5_df)[1L] <- "Cluster"
tail(star_wars_planets_filter_norm_m_hc_memb_5_df, n = 6L)
## Cluster
## 678 1
## 679 4
## 680 1
## 681 5
## 682 3
## 683 1
# Number of planets assigned to each cluster.
table(star_wars_planets_filter_norm_m_hc_memb_5_df[["Cluster"]])
##
## 1 2 3 4 5
## 132 113 179 145 114
# Merge cluster membership with original data
# Rows are matched by position: the membership data frame is appended as the
# last column of the full planets table.
star_wars_planets_w_cluster <- cbind(
  star_wars_planets,
  star_wars_planets_filter_norm_m_hc_memb_5_df
)
head(star_wars_planets_w_cluster, n = 6L)
## Name Region Sector System Inhabitants
## 1 Aargonar Unknown Unknown Unknown Unknown
## 2 Abafar Outer Rim Territories Sprizen sector Unknown Unknown
## 3 Abednedo Colonies Unknown Unknown Abednedo
## 4 Absanz Unknown Unknown Unknown Unknown
## 5 Affadar Unknown Unknown Unknown T'Laeem
## 6 Agamar Outer Rim Territories Unknown Agamar system Unknown
## Capital_City Population Record_High_Temp Record_Low_Temp Gravity
## 1 Unknown 3857305 36 -5 8.35
## 2 Unknown 2331308 33 -27 11.40
## 3 Unknown 5358387 28 -5 9.55
## 4 Unknown 7229751 38 -6 10.80
## 5 Unknown 4307513 37 -31 9.75
## 6 Unknown 6191940 38 -2 9.00
## Key_Element Rotation Revolution Rating Rating_Nom Resources
## 1 6 59.89 2112 7 Yay 8
## 2 28 49.20 1818 2 Yucks 9
## 3 30 127.76 1210 3 Yucks 5
## 4 57 145.98 1431 9 Yay 3
## 5 22 58.51 1060 9 Yay 7
## 6 34 148.23 1450 5 Meh 4
## Resources_Nom Cluster
## 1 Awesome 1
## 2 Awesome 2
## 3 Blah 3
## 4 Blah 4
## 5 Awesome 2
## 6 Blah 3
# Pivot table
names(star_wars_planets_w_cluster)
## [1] "Name" "Region" "Sector"
## [4] "System" "Inhabitants" "Capital_City"
## [7] "Population" "Record_High_Temp" "Record_Low_Temp"
## [10] "Gravity" "Key_Element" "Rotation"
## [13] "Revolution" "Rating" "Rating_Nom"
## [16] "Resources" "Resources_Nom" "Cluster"
# Mean of each clustering variable within each cluster.
# - Columns are addressed by name instead of by position (7, 10, 12, 18), so
#   the result does not depend on column order.
# - Values and grouping key both come from the merged data frame, rather than
#   values from one table and the key from another.
# - The table is computed once and reused for the pivot chart; previously the
#   identical aggregate() call was run twice.
star_wars_planets_w_cluster_df <- aggregate(
  star_wars_planets_w_cluster[c("Population", "Gravity", "Rotation")],
  by = star_wars_planets_w_cluster["Cluster"],
  FUN = mean
)
# Pivot chart: per-cluster means that feed the chart built below.
star_wars_planets_w_cluster_df
## Cluster Population Gravity Rotation
## 1 1 5548260 8.140606 64.63447
## 2 2 3078082 10.541593 79.66062
## 3 3 3624345 9.423687 190.96514
## 4 4 7571942 10.667034 85.96717
## 5 5 8291226 8.928947 200.52570
# Normalise for better representation
# z-score every column of cluster means (all columns except the leading
# Cluster id) so the three variables share one axis scale in the chart.
# vapply() replaces sapply() so the result is always a numeric matrix with
# one row per cluster; `[-1]` keeps a data frame even if only one value
# column is left.
star_wars_planets_w_cluster_df_norm <- vapply(
  star_wars_planets_w_cluster_df[-1],
  function(column) as.numeric(scale(column)),
  FUN.VALUE = numeric(nrow(star_wars_planets_w_cluster_df))
)
star_wars_planets_w_cluster_df_norm
## Population Gravity Rotation
## [1,] -0.03221782 -1.3026573 -0.9086406
## [2,] -1.10030336 0.9317588 -0.6800028
## [3,] -0.86410352 -0.1085912 1.0136061
## [4,] 0.84280591 1.0484977 -0.5840423
## [5,] 1.15381880 -0.5690079 1.1590796
# Put the cluster ids back in front of the normalised means. cbind() leaves
# the id column unnamed, so as.data.frame() labels it V1 until it is renamed.
star_wars_planets_w_cluster_df_norm_2 <- as.data.frame(
  cbind(
    star_wars_planets_w_cluster_df$Cluster,
    star_wars_planets_w_cluster_df_norm
  )
)
star_wars_planets_w_cluster_df_norm_2
## V1 Population Gravity Rotation
## 1 1 -0.03221782 -1.3026573 -0.9086406
## 2 2 -1.10030336 0.9317588 -0.6800028
## 3 3 -0.86410352 -0.1085912 1.0136061
## 4 4 0.84280591 1.0484977 -0.5840423
## 5 5 1.15381880 -0.5690079 1.1590796
names(star_wars_planets_w_cluster_df_norm_2)[1] <- "Cluster"
library(reshape2)
# Reshape to long format: one row per (cluster, variable) pair.
star_wars_planets_w_cluster_df_long_norm_2 <- melt(
  data = star_wars_planets_w_cluster_df_norm_2,
  id.vars = "Cluster"
)
star_wars_planets_w_cluster_df_long_norm_2
## Cluster variable value
## 1 1 Population -0.03221782
## 2 2 Population -1.10030336
## 3 3 Population -0.86410352
## 4 4 Population 0.84280591
## 5 5 Population 1.15381880
## 6 1 Gravity -1.30265731
## 7 2 Gravity 0.93175877
## 8 3 Gravity -0.10859121
## 9 4 Gravity 1.04849770
## 10 5 Gravity -0.56900794
## 11 1 Rotation -0.90864062
## 12 2 Rotation -0.68000281
## 13 3 Rotation 1.01360615
## 14 4 Rotation -0.58404233
## 15 5 Rotation 1.15907961
# Rename the two melted columns in a single assignment.
names(star_wars_planets_w_cluster_df_long_norm_2)[2:3] <- c("Variable", "Mean")
head(star_wars_planets_w_cluster_df_long_norm_2, n = 6L)
## Cluster Variable Mean
## 1 1 Population -0.03221782
## 2 2 Population -1.10030336
## 3 3 Population -0.86410352
## 4 4 Population 0.84280591
## 5 5 Population 1.15381880
## 6 1 Gravity -1.30265731
# One panel per cluster: horizontal bars of the normalised mean of each
# variable, with the legend hidden because the facets already label clusters.
ggplot(
  star_wars_planets_w_cluster_df_long_norm_2,
  aes(x = Variable, y = Mean, fill = Cluster)
) +
  geom_bar(position = "dodge", stat = "identity") +
  facet_wrap(~ Cluster) +
  coord_flip() +
  ggtitle("Cluster characteristics") +
  theme(legend.position = "none")