# Load the planet data set.
# Character columns are converted to factors explicitly: the factor-based
# summaries printed in this document depend on it, and since R 4.0.0 the
# default for `stringsAsFactors` is FALSE.
# NOTE(review): the level "Beyond theÊRishi Maze" in the str() output hints at
# an encoding mismatch in the CSV -- confirm the file's encoding and pass
# `fileEncoding` if needed.
star_wars_planets <- read.csv(
  "star_wars_planets_v5.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
# Preview the first six rows.
head(star_wars_planets)
## Name Region Sector System Inhabitants
## 1 Aargonar Unknown Unknown Unknown Unknown
## 2 Abafar Outer Rim Territories Sprizen sector Unknown Unknown
## 3 Abednedo Colonies Unknown Unknown Abednedo
## 4 Absanz Unknown Unknown Unknown Unknown
## 5 Affadar Unknown Unknown Unknown T'Laeem
## 6 Agamar Outer Rim Territories Unknown Agamar system Unknown
## Capital_City Population Record_High_Temp Record_Low_Temp Gravity Key_Element
## 1 Unknown 3857305 36 -5 8.35 6
## 2 Unknown 2331308 33 -27 11.40 28
## 3 Unknown 5358387 28 -5 9.55 30
## 4 Unknown 7229751 38 -6 10.80 57
## 5 Unknown 4307513 37 -31 9.75 22
## 6 Unknown 6191940 38 -2 9.00 34
## Rotation Revolution Rating Rating_Nom Resources Resources_Nom
## 1 59.89 2112 7 Yay 8 Awesome
## 2 49.20 1818 2 Yucks 9 Awesome
## 3 127.76 1210 3 Yucks 5 Blah
## 4 145.98 1431 9 Yay 3 Blah
## 5 58.51 1060 9 Yay 7 Awesome
## 6 148.23 1450 5 Meh 4 Blah
# Per-column overview: level counts for factors, quartiles and mean for
# numeric columns.
summary(object = star_wars_planets)
## Name Region Sector
## Aargonar: 1 Outer Rim Territories:243 Unknown :526
## Abafar : 1 Unknown :239 Anoat sector : 30
## Abednedo: 1 Mid Rim : 70 Hutt Space : 9
## Absanz : 1 Inner Rim : 30 Corporate Sector: 5
## Affadar : 1 Core Worlds : 28 Mandalore sector: 5
## Agamar : 1 Wild Space : 19 Abrion sector : 4
## (Other) :677 (Other) : 54 (Other) :104
## System Inhabitants Capital_City
## Unknown :501 Unknown :371 Unknown :650
## Five Points system: 5 Humans : 51 Aldera : 1
## Hosnian system : 5 Various : 18 Bardotta Capital City: 1
## Atterra system : 3 Hutts : 3 Canto Bight : 1
## Galidraan system : 3 Ithorians: 3 Capital City : 1
## Jinata system : 3 Human : 2 Central City : 1
## (Other) :163 (Other) :235 (Other) : 28
## Population Record_High_Temp Record_Low_Temp Gravity
## Min. :1014877 Min. : 0.00 Min. :-60.00 Min. : 7.000
## 1st Qu.:3226102 1st Qu.: 10.00 1st Qu.:-30.00 1st Qu.: 8.280
## Median :5458429 Median : 21.00 Median :-20.00 Median : 9.570
## Mean :5522815 Mean : 20.96 Mean :-19.57 Mean : 9.542
## 3rd Qu.:7777546 3rd Qu.: 31.00 3rd Qu.: -9.00 3rd Qu.:10.805
## Max. :9973030 Max. :100.00 Max. : 30.00 Max. :12.000
##
## Key_Element Rotation Revolution Rating Rating_Nom
## Min. : 1.00 Min. : 10.07 Min. : 100.0 Min. :1.000 Meh :235
## 1st Qu.: 28.00 1st Qu.: 64.32 1st Qu.: 740.5 1st Qu.:3.000 Yay :236
## Median : 55.00 Median :125.10 Median :1315.0 Median :5.000 Yucks:212
## Mean : 53.62 Mean :127.44 Mean :1334.5 Mean :5.081
## 3rd Qu.: 80.00 3rd Qu.:194.54 3rd Qu.:1961.0 3rd Qu.:7.000
## Max. :105.00 Max. :249.23 Max. :2499.0 Max. :9.000
##
## Resources Resources_Nom
## Min. :1.000 Awesome:321
## 1st Qu.:3.000 Blah :362
## Median :5.000
## Mean :5.152
## 3rd Qu.:7.000
## Max. :9.000
##
# Number of rows (one per planet name) in the data set.
NROW(star_wars_planets)
## [1] 683
# Column types together with a preview of each column's first values.
str(object = star_wars_planets)
## 'data.frame': 683 obs. of 17 variables:
## $ Name : Factor w/ 683 levels "Aargonar","Abafar",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Region : Factor w/ 14 levels "Beyond theÊRishi Maze",..: 11 10 2 11 11 10 14 11 11 11 ...
## $ Sector : Factor w/ 81 levels "7G sector","Abrion sector",..: 81 69 81 81 81 81 81 81 81 81 ...
## $ System : Factor w/ 153 levels "Agamar system",..: 143 143 143 143 143 1 143 2 2 3 ...
## $ Inhabitants : Factor w/ 232 levels "Abednedo","Abyssin",..: 207 207 1 207 180 207 5 207 207 207 ...
## $ Capital_City : Factor w/ 34 levels "Aldera","Bardotta Capital City",..: 32 32 32 32 32 32 32 32 32 32 ...
## $ Population : int 3857305 2331308 5358387 7229751 4307513 6191940 9622189 4432983 1125034 6244208 ...
## $ Record_High_Temp: int 36 33 28 38 37 38 29 39 33 40 ...
## $ Record_Low_Temp : int -5 -27 -5 -6 -31 -2 -35 -6 -16 -15 ...
## $ Gravity : num 8.35 11.4 9.55 10.8 9.75 ...
## $ Key_Element : int 6 28 30 57 22 34 105 57 37 74 ...
## $ Rotation : num 59.9 49.2 127.8 146 58.5 ...
## $ Revolution : int 2112 1818 1210 1431 1060 1450 2245 440 494 1999 ...
## $ Rating : int 7 2 3 9 9 5 5 4 8 2 ...
## $ Rating_Nom : Factor w/ 3 levels "Meh","Yay","Yucks": 2 3 3 2 2 1 1 1 2 3 ...
## $ Resources : int 8 9 5 3 7 4 5 8 4 4 ...
## $ Resources_Nom : Factor w/ 2 levels "Awesome","Blah": 1 1 2 2 1 2 2 1 2 2 ...
# List the column names with their positions.
colnames(star_wars_planets)
## [1] "Name" "Region" "Sector" "System"
## [5] "Inhabitants" "Capital_City" "Population" "Record_High_Temp"
## [9] "Record_Low_Temp" "Gravity" "Key_Element" "Rotation"
## [13] "Revolution" "Rating" "Rating_Nom" "Resources"
## [17] "Resources_Nom"
We will look at population, gravity, and rotation.
# Keep only the three numeric features used for clustering.
# Columns are selected by name rather than by position (7, 10, 12) so the
# selection keeps working if the column order of the CSV changes; the
# condition-less subset() wrapper was a no-op and is dropped.
star_wars_planets_filter <-
  star_wars_planets[, c("Population", "Gravity", "Rotation")]
names(star_wars_planets_filter)
## [1] "Population" "Gravity" "Rotation"
str(star_wars_planets_filter)
## 'data.frame': 683 obs. of 3 variables:
## $ Population: int 3857305 2331308 5358387 7229751 4307513 6191940 9622189 4432983 1125034 6244208 ...
## $ Gravity : num 8.35 11.4 9.55 10.8 9.75 ...
## $ Rotation : num 59.9 49.2 127.8 146 58.5 ...
# Standardise every feature column with scale() (default centring and
# scaling) and collect the results into a numeric matrix with one column
# per feature.
star_wars_planets_filter_norm <- vapply(
  star_wars_planets_filter,
  function(feature) as.vector(scale(feature)),
  numeric(nrow(star_wars_planets_filter))
)
head(star_wars_planets_filter_norm)
## Population Gravity Rotation
## [1,] -0.62792716 -0.824062435 -0.950164564
## [2,] -1.20325536 1.284398527 -1.100531714
## [3,] -0.06199235 0.005495976 0.004505081
## [4,] 0.64354540 0.869619322 0.260790346
## [5,] -0.45819067 0.143755712 -0.969575852
## [6,] 0.25227206 -0.374718296 0.292439185
Try different distance measures; they may produce different results.
# Pairwise Euclidean distances between the standardised planets.
star_wars_planets_filter_norm_m <- dist(
  x = star_wars_planets_filter_norm,
  method = "euclidean"
)
# First few entries of the distance object.
head(star_wars_planets_filter_norm_m)
## [1] 2.1907123 1.3855842 2.4395926 0.9827814 1.5876791 3.0200473
# Agglomerative hierarchical clustering on the distance matrix using the
# "ward.D2" linkage.
star_wars_planets_filter_norm_m_hc <- hclust(
  d = star_wars_planets_filter_norm_m,
  method = "ward.D2"
)
# Dendrogram of the fitted tree; the negative `hang` makes the leaf labels
# hang down from height 0.
plot(
  star_wars_planets_filter_norm_m_hc,
  main = "Clusters in a galaxy far, far away",
  xlab = "",
  ann = TRUE,
  hang = -100
)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Usage: fviz_nbclust(x, FUNcluster, method = c("silhouette", "wss", "gap_stat"))
#   x          - numeric matrix or data frame
#   FUNcluster - partitioning function; allowed values include kmeans, pam,
#                clara and hcut (hcut is the hierarchical-clustering option)
#   method     - criterion used to determine the optimal number of clusters
# Here: silhouette criterion with hierarchical clustering on the
# standardised features.
fviz_nbclust(
  x = star_wars_planets_filter_norm,
  FUNcluster = hcut,
  method = "silhouette"
) +
  labs(subtitle = "Silhouette method")
8 clusters is a lot. Maybe try 6?
# Cut the dendrogram into six groups; the result holds one cluster id per
# planet, in row order.
star_wars_planets_filter_norm_m_hc_memb_6 <- cutree(
  tree = star_wars_planets_filter_norm_m_hc,
  k = 6
)
star_wars_planets_filter_norm_m_hc_memb_6
## [1] 1 2 3 4 2 3 4 5 3 4 6 2 5 2 3 4 6 3 4 2 4 4 5 2 6 2 2 2 5 1 3 4 5 6 2 3 4
## [38] 3 1 1 3 5 6 2 4 2 3 5 4 5 3 1 4 1 1 3 1 5 4 4 4 6 6 1 1 2 4 3 4 2 2 5 2 6
## [75] 2 5 3 2 4 1 6 2 6 1 6 5 4 3 2 1 4 1 1 1 6 1 3 1 1 2 5 5 3 6 4 1 3 6 5 4 5
## [112] 1 1 4 2 5 5 4 4 3 4 4 3 2 1 1 6 3 5 6 3 5 6 4 3 4 5 4 4 1 6 2 2 4 4 2 6 6
## [149] 4 1 5 1 4 6 1 3 5 4 1 2 5 6 3 1 4 3 4 3 2 1 2 2 6 1 5 6 1 6 4 3 1 4 2 4 4
## [186] 2 4 5 6 5 5 2 6 4 1 6 6 3 5 3 3 4 3 2 3 6 4 6 3 4 6 6 5 6 3 4 1 1 1 1 1 6
## [223] 6 4 1 1 6 4 1 6 3 4 1 5 6 1 1 4 2 5 2 6 2 5 2 3 1 2 4 3 4 1 1 1 1 1 1 3 3
## [260] 1 1 4 1 5 3 1 3 2 2 1 4 1 1 6 4 3 2 1 1 5 3 4 4 6 3 3 3 4 4 1 1 1 3 4 6 3
## [297] 2 4 4 5 1 2 1 2 4 2 3 3 2 3 4 3 4 5 6 4 6 4 6 3 4 5 4 3 4 1 6 4 2 4 4 6 2
## [334] 5 3 4 6 5 2 6 6 4 6 2 2 1 3 3 5 2 3 3 1 6 6 6 4 4 1 6 1 4 4 6 3 2 2 2 2 2
## [371] 5 1 2 3 1 1 1 3 6 2 3 3 4 1 3 1 1 1 5 2 6 2 2 4 4 6 6 4 2 1 6 6 4 4 2 6 1
## [408] 5 1 3 3 2 1 4 2 5 3 5 3 2 3 1 4 6 2 6 3 1 5 4 1 2 2 4 4 5 4 6 1 2 6 1 1 1
## [445] 3 4 5 1 5 2 1 5 4 1 4 4 6 2 4 6 3 2 6 6 2 1 3 2 3 5 6 5 3 6 4 6 4 1 4 3 1
## [482] 3 4 3 2 3 6 6 4 4 4 2 4 3 1 3 4 4 5 2 2 4 2 3 6 1 2 5 4 3 6 6 2 3 6 6 1 2
## [519] 2 2 3 6 1 1 5 3 1 1 4 6 4 6 6 6 1 6 6 4 3 4 4 1 6 5 6 2 2 2 3 4 1 6 6 5 6
## [556] 5 4 1 5 1 4 4 1 2 2 3 2 1 4 2 1 3 1 4 5 2 4 2 1 1 4 5 5 4 4 1 6 3 2 6 1 1
## [593] 1 5 2 4 6 3 6 2 6 2 6 4 6 4 4 6 3 3 4 2 3 2 2 6 5 4 3 2 4 1 3 2 6 6 6 4 1
## [630] 2 1 3 4 1 2 3 6 6 2 3 4 5 2 5 4 1 5 2 6 6 4 4 5 5 1 4 6 4 2 4 2 4 5 2 3 1
## [667] 4 3 6 3 6 1 3 4 5 1 1 1 4 1 6 5 1
# Wrap the membership vector in a one-column data frame so it can later be
# bound to the planet data.
star_wars_planets_filter_norm_m_hc_memb_6_df <-
  as.data.frame(x = star_wars_planets_filter_norm_m_hc_memb_6)
head(star_wars_planets_filter_norm_m_hc_memb_6_df)
## star_wars_planets_filter_norm_m_hc_memb_6
## 1 1
## 2 2
## 3 3
## 4 4
## 5 2
## 6 3
# Replace the long auto-generated column name with "Cluster".
colnames(star_wars_planets_filter_norm_m_hc_memb_6_df)[1] <- "Cluster"
# Check the last rows to confirm the rename.
tail(star_wars_planets_filter_norm_m_hc_memb_6_df)
## Cluster
## 678 1
## 679 4
## 680 1
## 681 6
## 682 5
## 683 1
# Number of planets assigned to each cluster.
table(star_wars_planets_filter_norm_m_hc_memb_6_df[["Cluster"]])
##
## 1 2 3 4 5 6
## 132 113 106 145 73 114
# Append the cluster id as an extra column to the full planet data. The two
# data frames are combined by position, so their row order must match.
star_wars_planets_w_cluster <- cbind(
  star_wars_planets,
  star_wars_planets_filter_norm_m_hc_memb_6_df
)
head(star_wars_planets_w_cluster)
## Name Region Sector System Inhabitants
## 1 Aargonar Unknown Unknown Unknown Unknown
## 2 Abafar Outer Rim Territories Sprizen sector Unknown Unknown
## 3 Abednedo Colonies Unknown Unknown Abednedo
## 4 Absanz Unknown Unknown Unknown Unknown
## 5 Affadar Unknown Unknown Unknown T'Laeem
## 6 Agamar Outer Rim Territories Unknown Agamar system Unknown
## Capital_City Population Record_High_Temp Record_Low_Temp Gravity Key_Element
## 1 Unknown 3857305 36 -5 8.35 6
## 2 Unknown 2331308 33 -27 11.40 28
## 3 Unknown 5358387 28 -5 9.55 30
## 4 Unknown 7229751 38 -6 10.80 57
## 5 Unknown 4307513 37 -31 9.75 22
## 6 Unknown 6191940 38 -2 9.00 34
## Rotation Revolution Rating Rating_Nom Resources Resources_Nom Cluster
## 1 59.89 2112 7 Yay 8 Awesome 1
## 2 49.20 1818 2 Yucks 9 Awesome 2
## 3 127.76 1210 3 Yucks 5 Blah 3
## 4 145.98 1431 9 Yay 3 Blah 4
## 5 58.51 1060 9 Yay 7 Awesome 2
## 6 148.23 1450 5 Meh 4 Blah 3
# Confirm that "Cluster" is now the last column.
colnames(star_wars_planets_w_cluster)
## [1] "Name" "Region" "Sector" "System"
## [5] "Inhabitants" "Capital_City" "Population" "Record_High_Temp"
## [9] "Record_Low_Temp" "Gravity" "Key_Element" "Rotation"
## [13] "Revolution" "Rating" "Rating_Nom" "Resources"
## [17] "Resources_Nom" "Cluster"
# Column listing of the combined data, printed a second time.
colnames(star_wars_planets_w_cluster)
## [1] "Name" "Region" "Sector" "System"
## [5] "Inhabitants" "Capital_City" "Population" "Record_High_Temp"
## [9] "Record_Low_Temp" "Gravity" "Key_Element" "Rotation"
## [13] "Revolution" "Rating" "Rating_Nom" "Resources"
## [17] "Resources_Nom" "Cluster"
# Mean of each clustering feature (original units) within every cluster.
aggregate(
  x = star_wars_planets[, c("Population", "Gravity", "Rotation")],
  by = star_wars_planets_w_cluster["Cluster"],
  FUN = mean
)
## Cluster Population Gravity Rotation
## 1 1 5548260 8.140606 64.63447
## 2 2 3078082 10.541593 79.66062
## 3 3 3647331 8.451981 176.45858
## 4 4 7571942 10.667034 85.96717
## 5 5 3590969 10.834658 212.02945
## 6 6 8291226 8.928947 200.52570
# Store the per-cluster feature means (original units) for the profile plot.
star_wars_planets_w_cluster_df <- aggregate(
  x = star_wars_planets[, c("Population", "Gravity", "Rotation")],
  by = star_wars_planets_w_cluster["Cluster"],
  FUN = mean
)
star_wars_planets_w_cluster_df
## Cluster Population Gravity Rotation
## 1 1 5548260 8.140606 64.63447
## 2 2 3078082 10.541593 79.66062
## 3 3 3647331 8.451981 176.45858
## 4 4 7571942 10.667034 85.96717
## 5 5 3590969 10.834658 212.02945
## 6 6 8291226 8.928947 200.52570
# Standardise the cluster means column by column (the Cluster id column is
# dropped first) so the three features share a common scale across clusters.
star_wars_planets_w_cluster_df_norm <- vapply(
  star_wars_planets_w_cluster_df[-1],
  function(feature_means) as.vector(scale(feature_means)),
  numeric(nrow(star_wars_planets_w_cluster_df))
)
star_wars_planets_w_cluster_df_norm
## Population Gravity Rotation
## [1,] 0.1169715 -1.1909847 -1.0755947
## [2,] -0.9930907 0.7763208 -0.8508457
## [3,] -0.7372787 -0.9358521 0.5969803
## [4,] 1.0263843 0.8791043 -0.7565173
## [5,] -0.7626067 1.0164502 1.1290206
## [6,] 1.3496203 -0.5450385 0.9569568
# Put the cluster ids back in front of the standardised means. The id column
# comes out unnamed from cbind() and is therefore labelled "V1" by
# as.data.frame().
star_wars_planets_w_cluster_df_norm_2 <- as.data.frame(
  cbind(
    star_wars_planets_w_cluster_df[["Cluster"]],
    star_wars_planets_w_cluster_df_norm
  )
)
star_wars_planets_w_cluster_df_norm_2
## V1 Population Gravity Rotation
## 1 1 0.1169715 -1.1909847 -1.0755947
## 2 2 -0.9930907 0.7763208 -0.8508457
## 3 3 -0.7372787 -0.9358521 0.5969803
## 4 4 1.0263843 0.8791043 -0.7565173
## 5 5 -0.7626067 1.0164502 1.1290206
## 6 6 1.3496203 -0.5450385 0.9569568
# Give the id column its proper name.
colnames(star_wars_planets_w_cluster_df_norm_2)[1] <- "Cluster"
library(reshape2)
# Reshape to long format: one row per (Cluster, feature) pair.
star_wars_planets_w_cluster_df_long_norm_2 <- melt(
  data = star_wars_planets_w_cluster_df_norm_2,
  id.vars = "Cluster"
)
star_wars_planets_w_cluster_df_long_norm_2
## Cluster variable value
## 1 1 Population 0.1169715
## 2 2 Population -0.9930907
## 3 3 Population -0.7372787
## 4 4 Population 1.0263843
## 5 5 Population -0.7626067
## 6 6 Population 1.3496203
## 7 1 Gravity -1.1909847
## 8 2 Gravity 0.7763208
## 9 3 Gravity -0.9358521
## 10 4 Gravity 0.8791043
## 11 5 Gravity 1.0164502
## 12 6 Gravity -0.5450385
## 13 1 Rotation -1.0755947
## 14 2 Rotation -0.8508457
## 15 3 Rotation 0.5969803
## 16 4 Rotation -0.7565173
## 17 5 Rotation 1.1290206
## 18 6 Rotation 0.9569568
# Rename melt()'s default "variable"/"value" columns in a single step.
names(star_wars_planets_w_cluster_df_long_norm_2)[2:3] <- c("Variable", "Mean")
head(star_wars_planets_w_cluster_df_long_norm_2)
## Cluster Variable Mean
## 1 1 Population 0.1169715
## 2 2 Population -0.9930907
## 3 3 Population -0.7372787
## 4 4 Population 1.0263843
## 5 5 Population -0.7626067
## 6 6 Population 1.3496203
# Cluster profile plot: one panel per cluster, horizontal bars showing the
# standardised mean of each feature.
# Cluster ids are nominal labels, so they are mapped to `fill` as a factor;
# mapping the numeric column directly would produce a continuous colour
# gradient that suggests an ordering between clusters.
ggplot(
  star_wars_planets_w_cluster_df_long_norm_2,
  aes(x = Variable, y = Mean, fill = factor(Cluster))
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(~ Cluster) +
  # Panels are already labelled by cluster, so the legend is redundant.
  theme(legend.position = "none") +
  ggtitle("Cluster characteristics \n in a galaxy far, far away")