1 Load data

# Load the planet data set.
# stringsAsFactors = TRUE is spelled out because the summary() and str()
# output in this section shows the text columns as factors; from R 4.0 on,
# read.csv() would otherwise read them as character vectors.
# NOTE(review): the Region level printed as "Beyond theÊRishi Maze" in the
# str() output suggests a non-ASCII character is being mis-decoded; confirm
# the file's encoding and pass `fileEncoding` if needed.
star_wars_planets <- read.csv("star_wars_planets_v5.csv", header = TRUE,
                              stringsAsFactors = TRUE)
# Preview the first six rows.
head(star_wars_planets)
##       Name                Region         Sector        System Inhabitants
## 1 Aargonar               Unknown        Unknown       Unknown     Unknown
## 2   Abafar Outer Rim Territories Sprizen sector       Unknown     Unknown
## 3 Abednedo              Colonies        Unknown       Unknown    Abednedo
## 4   Absanz               Unknown        Unknown       Unknown     Unknown
## 5  Affadar               Unknown        Unknown       Unknown     T'Laeem
## 6   Agamar Outer Rim Territories        Unknown Agamar system     Unknown
##   Capital_City Population Record_High_Temp Record_Low_Temp Gravity Key_Element
## 1      Unknown    3857305               36              -5    8.35           6
## 2      Unknown    2331308               33             -27   11.40          28
## 3      Unknown    5358387               28              -5    9.55          30
## 4      Unknown    7229751               38              -6   10.80          57
## 5      Unknown    4307513               37             -31    9.75          22
## 6      Unknown    6191940               38              -2    9.00          34
##   Rotation Revolution Rating Rating_Nom Resources Resources_Nom
## 1    59.89       2112      7        Yay         8       Awesome
## 2    49.20       1818      2      Yucks         9       Awesome
## 3   127.76       1210      3      Yucks         5          Blah
## 4   145.98       1431      9        Yay         3          Blah
## 5    58.51       1060      9        Yay         7       Awesome
## 6   148.23       1450      5        Meh         4          Blah
# Per-column overview: level counts for factor columns, quartiles and mean
# for numeric columns.
summary(star_wars_planets)
##        Name                       Region                 Sector   
##  Aargonar:  1   Outer Rim Territories:243   Unknown         :526  
##  Abafar  :  1   Unknown              :239   Anoat sector    : 30  
##  Abednedo:  1   Mid Rim              : 70   Hutt Space      :  9  
##  Absanz  :  1   Inner Rim            : 30   Corporate Sector:  5  
##  Affadar :  1   Core Worlds          : 28   Mandalore sector:  5  
##  Agamar  :  1   Wild Space           : 19   Abrion sector   :  4  
##  (Other) :677   (Other)              : 54   (Other)         :104  
##                 System       Inhabitants                 Capital_City
##  Unknown           :501   Unknown  :371   Unknown              :650  
##  Five Points system:  5   Humans   : 51   Aldera               :  1  
##  Hosnian system    :  5   Various  : 18   Bardotta Capital City:  1  
##  Atterra system    :  3   Hutts    :  3   Canto Bight          :  1  
##  Galidraan system  :  3   Ithorians:  3   Capital City         :  1  
##  Jinata system     :  3   Human    :  2   Central City         :  1  
##  (Other)           :163   (Other)  :235   (Other)              : 28  
##    Population      Record_High_Temp Record_Low_Temp     Gravity      
##  Min.   :1014877   Min.   :  0.00   Min.   :-60.00   Min.   : 7.000  
##  1st Qu.:3226102   1st Qu.: 10.00   1st Qu.:-30.00   1st Qu.: 8.280  
##  Median :5458429   Median : 21.00   Median :-20.00   Median : 9.570  
##  Mean   :5522815   Mean   : 20.96   Mean   :-19.57   Mean   : 9.542  
##  3rd Qu.:7777546   3rd Qu.: 31.00   3rd Qu.: -9.00   3rd Qu.:10.805  
##  Max.   :9973030   Max.   :100.00   Max.   : 30.00   Max.   :12.000  
##                                                                      
##   Key_Element        Rotation        Revolution         Rating      Rating_Nom 
##  Min.   :  1.00   Min.   : 10.07   Min.   : 100.0   Min.   :1.000   Meh  :235  
##  1st Qu.: 28.00   1st Qu.: 64.32   1st Qu.: 740.5   1st Qu.:3.000   Yay  :236  
##  Median : 55.00   Median :125.10   Median :1315.0   Median :5.000   Yucks:212  
##  Mean   : 53.62   Mean   :127.44   Mean   :1334.5   Mean   :5.081              
##  3rd Qu.: 80.00   3rd Qu.:194.54   3rd Qu.:1961.0   3rd Qu.:7.000              
##  Max.   :105.00   Max.   :249.23   Max.   :2499.0   Max.   :9.000              
##                                                                                
##    Resources     Resources_Nom
##  Min.   :1.000   Awesome:321  
##  1st Qu.:3.000   Blah   :362  
##  Median :5.000                
##  Mean   :5.152                
##  3rd Qu.:7.000                
##  Max.   :9.000                
## 
# Number of rows (one row per planet).
nrow(star_wars_planets)
## [1] 683
# Type and first few values of every column.
str(star_wars_planets)
## 'data.frame':    683 obs. of  17 variables:
##  $ Name            : Factor w/ 683 levels "Aargonar","Abafar",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Region          : Factor w/ 14 levels "Beyond theÊRishi Maze",..: 11 10 2 11 11 10 14 11 11 11 ...
##  $ Sector          : Factor w/ 81 levels "7G sector","Abrion sector",..: 81 69 81 81 81 81 81 81 81 81 ...
##  $ System          : Factor w/ 153 levels "Agamar system",..: 143 143 143 143 143 1 143 2 2 3 ...
##  $ Inhabitants     : Factor w/ 232 levels "Abednedo","Abyssin",..: 207 207 1 207 180 207 5 207 207 207 ...
##  $ Capital_City    : Factor w/ 34 levels "Aldera","Bardotta Capital City",..: 32 32 32 32 32 32 32 32 32 32 ...
##  $ Population      : int  3857305 2331308 5358387 7229751 4307513 6191940 9622189 4432983 1125034 6244208 ...
##  $ Record_High_Temp: int  36 33 28 38 37 38 29 39 33 40 ...
##  $ Record_Low_Temp : int  -5 -27 -5 -6 -31 -2 -35 -6 -16 -15 ...
##  $ Gravity         : num  8.35 11.4 9.55 10.8 9.75 ...
##  $ Key_Element     : int  6 28 30 57 22 34 105 57 37 74 ...
##  $ Rotation        : num  59.9 49.2 127.8 146 58.5 ...
##  $ Revolution      : int  2112 1818 1210 1431 1060 1450 2245 440 494 1999 ...
##  $ Rating          : int  7 2 3 9 9 5 5 4 8 2 ...
##  $ Rating_Nom      : Factor w/ 3 levels "Meh","Yay","Yucks": 2 3 3 2 2 1 1 1 2 3 ...
##  $ Resources       : int  8 9 5 3 7 4 5 8 4 4 ...
##  $ Resources_Nom   : Factor w/ 2 levels "Awesome","Blah": 1 1 2 2 1 2 2 1 2 2 ...
# Column names; the bracketed number is the position of the first name on
# each printed line.
names(star_wars_planets)
##  [1] "Name"             "Region"           "Sector"           "System"          
##  [5] "Inhabitants"      "Capital_City"     "Population"       "Record_High_Temp"
##  [9] "Record_Low_Temp"  "Gravity"          "Key_Element"      "Rotation"        
## [13] "Revolution"       "Rating"           "Rating_Nom"       "Resources"       
## [17] "Resources_Nom"

2 Filter variables for clustering

We will look at population, gravity, and rotation.

# Keep only the three numeric variables used for clustering. Columns are
# selected by name so the choice does not depend on the column order of the
# CSV file.
star_wars_planets_filter <- star_wars_planets[c("Population", "Gravity",
                                                "Rotation")]
names(star_wars_planets_filter)
## [1] "Population" "Gravity"    "Rotation"
str(star_wars_planets_filter)
## 'data.frame':    683 obs. of  3 variables:
##  $ Population: int  3857305 2331308 5358387 7229751 4307513 6191940 9622189 4432983 1125034 6244208 ...
##  $ Gravity   : num  8.35 11.4 9.55 10.8 9.75 ...
##  $ Rotation  : num  59.9 49.2 127.8 146 58.5 ...

3 Normalise data

# Standardise every variable to mean 0 and standard deviation 1 so that
# Population (values in the millions) does not dominate Gravity and Rotation
# in the distance calculation. scale() already works column by column on a
# data frame and returns a numeric matrix with the column names kept.
star_wars_planets_filter_norm <- scale(star_wars_planets_filter)
head(star_wars_planets_filter_norm)
##       Population      Gravity     Rotation
## [1,] -0.62792716 -0.824062435 -0.950164564
## [2,] -1.20325536  1.284398527 -1.100531714
## [3,] -0.06199235  0.005495976  0.004505081
## [4,]  0.64354540  0.869619322  0.260790346
## [5,] -0.45819067  0.143755712 -0.969575852
## [6,]  0.25227206 -0.374718296  0.292439185

4 Compare normalised distances

Different distance measures can give different clusters; Euclidean distance is used here.

# Pairwise Euclidean distances between the rows of the normalised matrix.
star_wars_planets_filter_norm_m <- dist(star_wars_planets_filter_norm, 
                                        method = "euclidean")
# A dist object is stored as a vector of pairwise distances; head() shows the
# first six of them.
head(star_wars_planets_filter_norm_m)
## [1] 2.1907123 1.3855842 2.4395926 0.9827814 1.5876791 3.0200473

5 Hierarchical clustering

# Agglomerative clustering of the distance matrix with the "ward.D2" linkage.
star_wars_planets_filter_norm_m_hc <- hclust(
  d = star_wars_planets_filter_norm_m,
  method = "ward.D2"
)
# Dendrogram of the result. A negative `hang` makes every leaf label hang
# down from height 0 instead of from its own merge height.
plot(
  star_wars_planets_filter_norm_m_hc,
  main = "Clusters in a galaxy far, far away",
  xlab = "",
  hang = -100
)

6 Check clustering quality

library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Usage reminder for factoextra::fviz_nbclust():
# fviz_nbclust(x, FUNcluster, method = c("silhouette", "wss", "gap_stat"))
# x: numeric matrix or data frame
# FUNcluster: a partitioning function. Allowed values include kmeans, pam, clara and hcut (for hierarchical clustering).
# method: the method to be used for determining the optimal number of clusters.


# Silhouette-based choice of the number of clusters, using hierarchical
# clustering (hcut) on the normalised data.
# NOTE(review): hcut is called with its default linkage and distance here;
# confirm they match the dist()/hclust() settings used earlier.
fviz_nbclust(star_wars_planets_filter_norm, 
             hcut, method = "silhouette") +
  labs(subtitle = "Silhouette method")

7 k = 6

8 clusters is a lot. Maybe try 6?

# Cut the dendrogram into 6 groups. The result holds one cluster id (1 to 6)
# per row of the clustered data, in the same row order.
star_wars_planets_filter_norm_m_hc_memb_6 <- cutree(star_wars_planets_filter_norm_m_hc, k = 6)
# Print the full membership vector.
star_wars_planets_filter_norm_m_hc_memb_6
##   [1] 1 2 3 4 2 3 4 5 3 4 6 2 5 2 3 4 6 3 4 2 4 4 5 2 6 2 2 2 5 1 3 4 5 6 2 3 4
##  [38] 3 1 1 3 5 6 2 4 2 3 5 4 5 3 1 4 1 1 3 1 5 4 4 4 6 6 1 1 2 4 3 4 2 2 5 2 6
##  [75] 2 5 3 2 4 1 6 2 6 1 6 5 4 3 2 1 4 1 1 1 6 1 3 1 1 2 5 5 3 6 4 1 3 6 5 4 5
## [112] 1 1 4 2 5 5 4 4 3 4 4 3 2 1 1 6 3 5 6 3 5 6 4 3 4 5 4 4 1 6 2 2 4 4 2 6 6
## [149] 4 1 5 1 4 6 1 3 5 4 1 2 5 6 3 1 4 3 4 3 2 1 2 2 6 1 5 6 1 6 4 3 1 4 2 4 4
## [186] 2 4 5 6 5 5 2 6 4 1 6 6 3 5 3 3 4 3 2 3 6 4 6 3 4 6 6 5 6 3 4 1 1 1 1 1 6
## [223] 6 4 1 1 6 4 1 6 3 4 1 5 6 1 1 4 2 5 2 6 2 5 2 3 1 2 4 3 4 1 1 1 1 1 1 3 3
## [260] 1 1 4 1 5 3 1 3 2 2 1 4 1 1 6 4 3 2 1 1 5 3 4 4 6 3 3 3 4 4 1 1 1 3 4 6 3
## [297] 2 4 4 5 1 2 1 2 4 2 3 3 2 3 4 3 4 5 6 4 6 4 6 3 4 5 4 3 4 1 6 4 2 4 4 6 2
## [334] 5 3 4 6 5 2 6 6 4 6 2 2 1 3 3 5 2 3 3 1 6 6 6 4 4 1 6 1 4 4 6 3 2 2 2 2 2
## [371] 5 1 2 3 1 1 1 3 6 2 3 3 4 1 3 1 1 1 5 2 6 2 2 4 4 6 6 4 2 1 6 6 4 4 2 6 1
## [408] 5 1 3 3 2 1 4 2 5 3 5 3 2 3 1 4 6 2 6 3 1 5 4 1 2 2 4 4 5 4 6 1 2 6 1 1 1
## [445] 3 4 5 1 5 2 1 5 4 1 4 4 6 2 4 6 3 2 6 6 2 1 3 2 3 5 6 5 3 6 4 6 4 1 4 3 1
## [482] 3 4 3 2 3 6 6 4 4 4 2 4 3 1 3 4 4 5 2 2 4 2 3 6 1 2 5 4 3 6 6 2 3 6 6 1 2
## [519] 2 2 3 6 1 1 5 3 1 1 4 6 4 6 6 6 1 6 6 4 3 4 4 1 6 5 6 2 2 2 3 4 1 6 6 5 6
## [556] 5 4 1 5 1 4 4 1 2 2 3 2 1 4 2 1 3 1 4 5 2 4 2 1 1 4 5 5 4 4 1 6 3 2 6 1 1
## [593] 1 5 2 4 6 3 6 2 6 2 6 4 6 4 4 6 3 3 4 2 3 2 2 6 5 4 3 2 4 1 3 2 6 6 6 4 1
## [630] 2 1 3 4 1 2 3 6 6 2 3 4 5 2 5 4 1 5 2 6 6 4 4 5 5 1 4 6 4 2 4 2 4 5 2 3 1
## [667] 4 3 6 3 6 1 3 4 5 1 1 1 4 1 6 5 1

8 Cluster characteristics

# Put the cluster ids into a one-column data frame. Naming the column in the
# constructor avoids the auto-generated column name (the full variable name)
# that as.data.frame() gives a bare vector, and the positional rename it
# would then need.
star_wars_planets_filter_norm_m_hc_memb_6_df <- data.frame(
  Cluster = star_wars_planets_filter_norm_m_hc_memb_6
)
head(star_wars_planets_filter_norm_m_hc_memb_6_df)
##   Cluster
## 1       1
## 2       2
## 3       3
## 4       4
## 5       2
## 6       3

tail(star_wars_planets_filter_norm_m_hc_memb_6_df)
##     Cluster
## 678       1
## 679       4
## 680       1
## 681       6
## 682       5
## 683       1
# Number of planets in each cluster.
table(star_wars_planets_filter_norm_m_hc_memb_6_df$Cluster)
## 
##   1   2   3   4   5   6 
## 132 113 106 145  73 114

8.1 Merge cluster with original data

# Attach each planet's cluster id to the full data set as a new last column.
# Rows are matched by position, not by a key: the ids are in the same row
# order as star_wars_planets.
star_wars_planets_w_cluster <- star_wars_planets
star_wars_planets_w_cluster$Cluster <-
  star_wars_planets_filter_norm_m_hc_memb_6_df$Cluster

head(star_wars_planets_w_cluster)
##       Name                Region         Sector        System Inhabitants
## 1 Aargonar               Unknown        Unknown       Unknown     Unknown
## 2   Abafar Outer Rim Territories Sprizen sector       Unknown     Unknown
## 3 Abednedo              Colonies        Unknown       Unknown    Abednedo
## 4   Absanz               Unknown        Unknown       Unknown     Unknown
## 5  Affadar               Unknown        Unknown       Unknown     T'Laeem
## 6   Agamar Outer Rim Territories        Unknown Agamar system     Unknown
##   Capital_City Population Record_High_Temp Record_Low_Temp Gravity Key_Element
## 1      Unknown    3857305               36              -5    8.35           6
## 2      Unknown    2331308               33             -27   11.40          28
## 3      Unknown    5358387               28              -5    9.55          30
## 4      Unknown    7229751               38              -6   10.80          57
## 5      Unknown    4307513               37             -31    9.75          22
## 6      Unknown    6191940               38              -2    9.00          34
##   Rotation Revolution Rating Rating_Nom Resources Resources_Nom Cluster
## 1    59.89       2112      7        Yay         8       Awesome       1
## 2    49.20       1818      2      Yucks         9       Awesome       2
## 3   127.76       1210      3      Yucks         5          Blah       3
## 4   145.98       1431      9        Yay         3          Blah       4
## 5    58.51       1060      9        Yay         7       Awesome       2
## 6   148.23       1450      5        Meh         4          Blah       3
# Confirm that "Cluster" was appended as the 18th column.
names(star_wars_planets_w_cluster)
##  [1] "Name"             "Region"           "Sector"           "System"          
##  [5] "Inhabitants"      "Capital_City"     "Population"       "Record_High_Temp"
##  [9] "Record_Low_Temp"  "Gravity"          "Key_Element"      "Rotation"        
## [13] "Revolution"       "Rating"           "Rating_Nom"       "Resources"       
## [17] "Resources_Nom"    "Cluster"

8.2 Pivot table

# Column names of the merged data, for reference.
names(star_wars_planets_w_cluster)
##  [1] "Name"             "Region"           "Sector"           "System"          
##  [5] "Inhabitants"      "Capital_City"     "Population"       "Record_High_Temp"
##  [9] "Record_Low_Temp"  "Gravity"          "Key_Element"      "Rotation"        
## [13] "Revolution"       "Rating"           "Rating_Nom"       "Resources"       
## [17] "Resources_Nom"    "Cluster"
# Mean of each clustering variable per cluster. Values and grouping column
# both come from the merged data frame and are picked by name, so the result
# does not depend on column positions or on two separate objects staying
# row-aligned.
aggregate(star_wars_planets_w_cluster[c("Population", "Gravity", "Rotation")],
          by = star_wars_planets_w_cluster["Cluster"], FUN = mean)
##   Cluster Population   Gravity  Rotation
## 1       1    5548260  8.140606  64.63447
## 2       2    3078082 10.541593  79.66062
## 3       3    3647331  8.451981 176.45858
## 4       4    7571942 10.667034  85.96717
## 5       5    3590969 10.834658 212.02945
## 6       6    8291226  8.928947 200.52570

8.3 Pivot chart

# Per-cluster means of the three clustering variables, stored for plotting.
# Values and grouping column are both taken from the merged data frame and
# selected by name rather than by column position.
star_wars_planets_w_cluster_df <- aggregate(
  star_wars_planets_w_cluster[c("Population", "Gravity", "Rotation")],
  by = star_wars_planets_w_cluster["Cluster"],
  FUN = mean
)

star_wars_planets_w_cluster_df
##   Cluster Population   Gravity  Rotation
## 1       1    5548260  8.140606  64.63447
## 2       2    3078082 10.541593  79.66062
## 3       3    3647331  8.451981 176.45858
## 4       4    7571942 10.667034  85.96717
## 5       5    3590969 10.834658 212.02945
## 6       6    8291226  8.928947 200.52570

8.4 Normalise for better illustration

# Standardise the cluster means column by column (Cluster id excluded) so the
# three variables share a common scale in the chart. vapply() fixes the
# result to one numeric vector of length nrow() per column, whereas the
# return type of sapply() depends on its input.
star_wars_planets_w_cluster_df_norm <- vapply(
  star_wars_planets_w_cluster_df[-1],
  function(column) as.numeric(scale(column)),
  numeric(nrow(star_wars_planets_w_cluster_df))
)
star_wars_planets_w_cluster_df_norm
##      Population    Gravity   Rotation
## [1,]  0.1169715 -1.1909847 -1.0755947
## [2,] -0.9930907  0.7763208 -0.8508457
## [3,] -0.7372787 -0.9358521  0.5969803
## [4,]  1.0263843  0.8791043 -0.7565173
## [5,] -0.7626067  1.0164502  1.1290206
## [6,]  1.3496203 -0.5450385  0.9569568
# Re-attach the cluster ids to the standardised means. data.frame() names the
# id column directly and keeps it an integer; going through cbind() first
# would coerce everything to one numeric matrix and leave the id column with
# the placeholder name "V1".
star_wars_planets_w_cluster_df_norm_2 <- data.frame(
  Cluster = star_wars_planets_w_cluster_df$Cluster,
  star_wars_planets_w_cluster_df_norm
)
star_wars_planets_w_cluster_df_norm_2
##   Cluster Population    Gravity   Rotation
## 1       1  0.1169715 -1.1909847 -1.0755947
## 2       2 -0.9930907  0.7763208 -0.8508457
## 3       3 -0.7372787 -0.9358521  0.5969803
## 4       4  1.0263843  0.8791043 -0.7565173
## 5       5 -0.7626067  1.0164502  1.1290206
## 6       6  1.3496203 -0.5450385  0.9569568

library(reshape2)

# Reshape to long format (one row per cluster/variable pair) for ggplot2.
# variable.name and value.name label the two new columns when they are
# created, so no rename by column position is needed afterwards.
star_wars_planets_w_cluster_df_long_norm_2 <- melt(
  star_wars_planets_w_cluster_df_norm_2,
  id.vars = "Cluster",
  variable.name = "Variable",
  value.name = "Mean"
)
star_wars_planets_w_cluster_df_long_norm_2
##    Cluster   Variable       Mean
## 1        1 Population  0.1169715
## 2        2 Population -0.9930907
## 3        3 Population -0.7372787
## 4        4 Population  1.0263843
## 5        5 Population -0.7626067
## 6        6 Population  1.3496203
## 7        1    Gravity -1.1909847
## 8        2    Gravity  0.7763208
## 9        3    Gravity -0.9358521
## 10       4    Gravity  0.8791043
## 11       5    Gravity  1.0164502
## 12       6    Gravity -0.5450385
## 13       1   Rotation -1.0755947
## 14       2   Rotation -0.8508457
## 15       3   Rotation  0.5969803
## 16       4   Rotation -0.7565173
## 17       5   Rotation  1.1290206
## 18       6   Rotation  0.9569568

head(star_wars_planets_w_cluster_df_long_norm_2)
##   Cluster   Variable       Mean
## 1       1 Population  0.1169715
## 2       2 Population -0.9930907
## 3       3 Population -0.7372787
## 4       4 Population  1.0263843
## 5       5 Population -0.7626067
## 6       6 Population  1.3496203
# Horizontal bar chart of the standardised cluster means, one panel per
# cluster. Cluster is converted to a factor for the fill so that each cluster
# gets its own discrete colour; as a plain number it is mapped to a
# continuous gradient, which implies an ordering the cluster ids do not have.
# The legend is hidden because the panel titles already identify the cluster.
ggplot(star_wars_planets_w_cluster_df_long_norm_2,
       aes(x = Variable, y = Mean, fill = factor(Cluster))) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(~ Cluster) +
  theme(legend.position = "none") +
  ggtitle("Cluster characteristics \n in a galaxy far, far away")