# 1. Load data ------------------------------------------------------------

# Read the planets table. `stringsAsFactors = TRUE` is set explicitly because
# the default changed to FALSE in R 4.0; the factor-based summary()/str()
# output recorded below relies on text columns being read as factors.
# NOTE(review): one Region level prints as "Beyond the\xcaRishi Maze", which
# suggests the CSV is not UTF-8 -- confirm its encoding and pass
# `fileEncoding` if that is the case.
star_wars_planets <- read.csv(
  "star_wars_planets_v5.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
head(star_wars_planets)
##       Name                Region         Sector        System Inhabitants
## 1 Aargonar               Unknown        Unknown       Unknown     Unknown
## 2   Abafar Outer Rim Territories Sprizen sector       Unknown     Unknown
## 3 Abednedo              Colonies        Unknown       Unknown    Abednedo
## 4   Absanz               Unknown        Unknown       Unknown     Unknown
## 5  Affadar               Unknown        Unknown       Unknown     T'Laeem
## 6   Agamar Outer Rim Territories        Unknown Agamar system     Unknown
##   Capital_City Population Record_High_Temp Record_Low_Temp Gravity
## 1      Unknown    3857305               36              -5    8.35
## 2      Unknown    2331308               33             -27   11.40
## 3      Unknown    5358387               28              -5    9.55
## 4      Unknown    7229751               38              -6   10.80
## 5      Unknown    4307513               37             -31    9.75
## 6      Unknown    6191940               38              -2    9.00
##   Key_Element Rotation Revolution Rating Rating_Nom Resources
## 1           6    59.89       2112      7        Yay         8
## 2          28    49.20       1818      2      Yucks         9
## 3          30   127.76       1210      3      Yucks         5
## 4          57   145.98       1431      9        Yay         3
## 5          22    58.51       1060      9        Yay         7
## 6          34   148.23       1450      5        Meh         4
##   Resources_Nom
## 1       Awesome
## 2       Awesome
## 3          Blah
## 4          Blah
## 5       Awesome
## 6          Blah
# Column-wise overview: level counts for factor columns and
# min/quartiles/mean/max for numeric columns.
summary(star_wars_planets)
##        Name                       Region                 Sector   
##  Aargonar:  1   Outer Rim Territories:243   Unknown         :526  
##  Abafar  :  1   Unknown              :239   Anoat sector    : 30  
##  Abednedo:  1   Mid Rim              : 70   Hutt Space      :  9  
##  Absanz  :  1   Inner Rim            : 30   Corporate Sector:  5  
##  Affadar :  1   Core Worlds          : 28   Mandalore sector:  5  
##  Agamar  :  1   Wild Space           : 19   Abrion sector   :  4  
##  (Other) :677   (Other)              : 54   (Other)         :104  
##                 System       Inhabitants                 Capital_City
##  Unknown           :501   Unknown  :371   Unknown              :650  
##  Five Points system:  5   Humans   : 51   Aldera               :  1  
##  Hosnian system    :  5   Various  : 18   Bardotta Capital City:  1  
##  Atterra system    :  3   Hutts    :  3   Canto Bight          :  1  
##  Galidraan system  :  3   Ithorians:  3   Capital City         :  1  
##  Jinata system     :  3   Human    :  2   Central City         :  1  
##  (Other)           :163   (Other)  :235   (Other)              : 28  
##    Population      Record_High_Temp Record_Low_Temp     Gravity      
##  Min.   :1014877   Min.   :  0.00   Min.   :-60.00   Min.   : 7.000  
##  1st Qu.:3226102   1st Qu.: 10.00   1st Qu.:-30.00   1st Qu.: 8.280  
##  Median :5458429   Median : 21.00   Median :-20.00   Median : 9.570  
##  Mean   :5522815   Mean   : 20.96   Mean   :-19.57   Mean   : 9.542  
##  3rd Qu.:7777546   3rd Qu.: 31.00   3rd Qu.: -9.00   3rd Qu.:10.805  
##  Max.   :9973030   Max.   :100.00   Max.   : 30.00   Max.   :12.000  
##                                                                      
##   Key_Element        Rotation        Revolution         Rating     
##  Min.   :  1.00   Min.   : 10.07   Min.   : 100.0   Min.   :1.000  
##  1st Qu.: 28.00   1st Qu.: 64.32   1st Qu.: 740.5   1st Qu.:3.000  
##  Median : 55.00   Median :125.10   Median :1315.0   Median :5.000  
##  Mean   : 53.62   Mean   :127.44   Mean   :1334.5   Mean   :5.081  
##  3rd Qu.: 80.00   3rd Qu.:194.54   3rd Qu.:1961.0   3rd Qu.:7.000  
##  Max.   :105.00   Max.   :249.23   Max.   :2499.0   Max.   :9.000  
##                                                                    
##  Rating_Nom    Resources     Resources_Nom
##  Meh  :235   Min.   :1.000   Awesome:321  
##  Yay  :236   1st Qu.:3.000   Blah   :362  
##  Yucks:212   Median :5.000                
##              Mean   :5.152                
##              3rd Qu.:7.000                
##              Max.   :9.000                
## 
# Number of rows in the loaded table.
nrow(star_wars_planets)
## [1] 683
# Type of each column plus a preview of its first values.
str(star_wars_planets)
## 'data.frame':    683 obs. of  17 variables:
##  $ Name            : Factor w/ 683 levels "Aargonar","Abafar",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Region          : Factor w/ 14 levels "Beyond the\xcaRishi Maze",..: 11 10 2 11 11 10 14 11 11 11 ...
##  $ Sector          : Factor w/ 81 levels "7G sector","Abrion sector",..: 81 69 81 81 81 81 81 81 81 81 ...
##  $ System          : Factor w/ 153 levels "Agamar system",..: 143 143 143 143 143 1 143 2 2 3 ...
##  $ Inhabitants     : Factor w/ 232 levels "Abednedo","Abyssin",..: 207 207 1 207 180 207 5 207 207 207 ...
##  $ Capital_City    : Factor w/ 34 levels "Aldera","Bardotta Capital City",..: 32 32 32 32 32 32 32 32 32 32 ...
##  $ Population      : int  3857305 2331308 5358387 7229751 4307513 6191940 9622189 4432983 1125034 6244208 ...
##  $ Record_High_Temp: int  36 33 28 38 37 38 29 39 33 40 ...
##  $ Record_Low_Temp : int  -5 -27 -5 -6 -31 -2 -35 -6 -16 -15 ...
##  $ Gravity         : num  8.35 11.4 9.55 10.8 9.75 ...
##  $ Key_Element     : int  6 28 30 57 22 34 105 57 37 74 ...
##  $ Rotation        : num  59.9 49.2 127.8 146 58.5 ...
##  $ Revolution      : int  2112 1818 1210 1431 1060 1450 2245 440 494 1999 ...
##  $ Rating          : int  7 2 3 9 9 5 5 4 8 2 ...
##  $ Rating_Nom      : Factor w/ 3 levels "Meh","Yay","Yucks": 2 3 3 2 2 1 1 1 2 3 ...
##  $ Resources       : int  8 9 5 3 7 4 5 8 4 4 ...
##  $ Resources_Nom   : Factor w/ 2 levels "Awesome","Blah": 1 1 2 2 1 2 2 1 2 2 ...
# Column names, listed in positional order.
names(star_wars_planets)
##  [1] "Name"             "Region"           "Sector"          
##  [4] "System"           "Inhabitants"      "Capital_City"    
##  [7] "Population"       "Record_High_Temp" "Record_Low_Temp" 
## [10] "Gravity"          "Key_Element"      "Rotation"        
## [13] "Revolution"       "Rating"           "Rating_Nom"      
## [16] "Resources"        "Resources_Nom"
# 2. Filter variables for clustering --------------------------------------

# Keep only the three numeric variables used for clustering. Columns are
# selected by name rather than by position (7, 10, 12) so the selection does
# not silently change if the CSV column order does. The former subset()
# wrapper had no condition and therefore did nothing.
star_wars_planets_filter <- star_wars_planets[
  , c("Population", "Gravity", "Rotation")
]
names(star_wars_planets_filter)
## [1] "Population" "Gravity"    "Rotation"
str(star_wars_planets_filter)
## 'data.frame':    683 obs. of  3 variables:
##  $ Population: int  3857305 2331308 5358387 7229751 4307513 6191940 9622189 4432983 1125034 6244208 ...
##  $ Gravity   : num  8.35 11.4 9.55 10.8 9.75 ...
##  $ Rotation  : num  59.9 49.2 127.8 146 58.5 ...
# 3. Data exploration -----------------------------------------------------

library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Population: box plot, histogram and numeric summary.
ggplot(star_wars_planets_filter, aes(y = Population)) +
  geom_boxplot(fill = "#C6C6FF") +
  labs(y = "Population", title = "Size of population") +
  theme_classic()

ggplot(star_wars_planets_filter, aes(x = Population)) +
  geom_histogram(fill = "#C6C6FF") +
  labs(y = "Count", title = "Histogram of size of population") +
  theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

summary(star_wars_planets_filter[["Population"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 1014877 3226102 5458429 5522815 7777546 9973030
# Gravity: box plot, histogram and numeric summary.
ggplot(star_wars_planets_filter, aes(y = Gravity)) +
  geom_boxplot(fill = "#C6C6CD") +
  labs(y = "Gravity", title = "Strength of gravitational force") +
  theme_dark()

ggplot(star_wars_planets_filter, aes(x = Gravity)) +
  geom_histogram(fill = "#C6C6CD") +
  labs(y = "Count", title = "Histogram of gravitational force") +
  theme_dark()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

summary(star_wars_planets_filter[["Gravity"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   7.000   8.280   9.570   9.542  10.805  12.000
# Rotation: box plot, histogram and numeric summary.
ggplot(star_wars_planets_filter, aes(y = Rotation)) +
  geom_boxplot(fill = "#C6C666") +
  labs(y = "Rotation", title = "Length of day") +
  theme_linedraw()

ggplot(star_wars_planets_filter, aes(x = Rotation)) +
  geom_histogram(fill = "#C6C666") +
  labs(y = "Count", title = "Histogram of length of day") +
  theme_linedraw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

summary(star_wars_planets_filter[["Rotation"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.07   64.32  125.10  127.44  194.54  249.23
# 4. Normalise data -------------------------------------------------------

# Standardise each variable (centre on its mean, divide by its standard
# deviation) so the variables are on a comparable scale. scale() already
# works column-wise on a data frame and returns a numeric matrix, so the
# sapply() wrapper (whose return type depends on its input) is not needed.
star_wars_planets_filter_norm <- scale(star_wars_planets_filter)
head(star_wars_planets_filter_norm)
##       Population      Gravity     Rotation
## [1,] -0.62792716 -0.824062435 -0.950164564
## [2,] -1.20325536  1.284398527 -1.100531714
## [3,] -0.06199235  0.005495976  0.004505081
## [4,]  0.64354540  0.869619322  0.260790346
## [5,] -0.45819067  0.143755712 -0.969575852
## [6,]  0.25227206 -0.374718296  0.292439185
# 5. Compute normalised distances -----------------------------------------


# Pairwise Euclidean distances between the rows of the standardised matrix;
# head() shows the first six of them.
star_wars_planets_filter_norm_m <- dist(
  x = star_wars_planets_filter_norm,
  method = "euclidean"
)
head(star_wars_planets_filter_norm_m)
## [1] 2.1907123 1.3855842 2.4395926 0.9827814 1.5876791 3.0200473
# 6. Hierarchical clustering ----------------------------------------------

# Hierarchical clustering of the distance matrix with the "ward.D2"
# agglomeration method, followed by the dendrogram.
star_wars_planets_filter_norm_m_hc <- hclust(
  d = star_wars_planets_filter_norm_m,
  method = "ward.D2"
)
plot(x = star_wars_planets_filter_norm_m_hc, hang = -100, ann = TRUE)

# 7. Check clustering validity --------------------------------------------

library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
# fviz_nbclust(x, FUNcluster, method = c("silhouette", "wss", "gap_stat"))
# x: numeric matrix or data frame
# FUNcluster: a partitioning function. Allowed values include kmeans, pam, clara and hcut (for hierarchical clustering).
# method: the method to be used for determining the optimal number of clusters.

# Three diagnostics for the number of clusters, each running hierarchical
# clustering (hcut) on the standardised data.

# Elbow method (within-cluster sum of squares per k).
# NOTE(review): the reference line is drawn at k = 2, but the analysis below
# cuts the tree at k = 5 -- confirm which value the plots support.
fviz_nbclust(star_wars_planets_filter_norm, hcut, method = "wss") +
  geom_vline(xintercept = 2, linetype = 2) +
  labs(subtitle = "Elbow method")

# Silhouette method.
fviz_nbclust(star_wars_planets_filter_norm, hcut, method = "silhouette") +
  labs(subtitle = "Silhouette method")

# Gap statistic with 50 bootstrap samples; the seed makes the bootstrap
# reproducible. `nstart` was dropped: it is a kmeans() option for random
# restarts and has no meaning for hierarchical clustering.
set.seed(666)
fviz_nbclust(star_wars_planets_filter_norm, hcut,
             method = "gap_stat", nboot = 50) +
  labs(subtitle = "Gap statistic method")

# 8. Hierarchical clustering k = 5 ----------------------------------------

# Cut the dendrogram into five groups and print the cluster label assigned
# to each row.
star_wars_planets_filter_norm_m_hc_memb_5 <- cutree(
  tree = star_wars_planets_filter_norm_m_hc,
  k = 5
)
print(star_wars_planets_filter_norm_m_hc_memb_5)
##   [1] 1 2 3 4 2 3 4 3 3 4 5 2 3 2 3 4 5 3 4 2 4 4 3 2 5 2 2 2 3 1 3 4 3 5 2
##  [36] 3 4 3 1 1 3 3 5 2 4 2 3 3 4 3 3 1 4 1 1 3 1 3 4 4 4 5 5 1 1 2 4 3 4 2
##  [71] 2 3 2 5 2 3 3 2 4 1 5 2 5 1 5 3 4 3 2 1 4 1 1 1 5 1 3 1 1 2 3 3 3 5 4
## [106] 1 3 5 3 4 3 1 1 4 2 3 3 4 4 3 4 4 3 2 1 1 5 3 3 5 3 3 5 4 3 4 3 4 4 1
## [141] 5 2 2 4 4 2 5 5 4 1 3 1 4 5 1 3 3 4 1 2 3 5 3 1 4 3 4 3 2 1 2 2 5 1 3
## [176] 5 1 5 4 3 1 4 2 4 4 2 4 3 5 3 3 2 5 4 1 5 5 3 3 3 3 4 3 2 3 5 4 5 3 4
## [211] 5 5 3 5 3 4 1 1 1 1 1 5 5 4 1 1 5 4 1 5 3 4 1 3 5 1 1 4 2 3 2 5 2 3 2
## [246] 3 1 2 4 3 4 1 1 1 1 1 1 3 3 1 1 4 1 3 3 1 3 2 2 1 4 1 1 5 4 3 2 1 1 3
## [281] 3 4 4 5 3 3 3 4 4 1 1 1 3 4 5 3 2 4 4 3 1 2 1 2 4 2 3 3 2 3 4 3 4 3 5
## [316] 4 5 4 5 3 4 3 4 3 4 1 5 4 2 4 4 5 2 3 3 4 5 3 2 5 5 4 5 2 2 1 3 3 3 2
## [351] 3 3 1 5 5 5 4 4 1 5 1 4 4 5 3 2 2 2 2 2 3 1 2 3 1 1 1 3 5 2 3 3 4 1 3
## [386] 1 1 1 3 2 5 2 2 4 4 5 5 4 2 1 5 5 4 4 2 5 1 3 1 3 3 2 1 4 2 3 3 3 3 2
## [421] 3 1 4 5 2 5 3 1 3 4 1 2 2 4 4 3 4 5 1 2 5 1 1 1 3 4 3 1 3 2 1 3 4 1 4
## [456] 4 5 2 4 5 3 2 5 5 2 1 3 2 3 3 5 3 3 5 4 5 4 1 4 3 1 3 4 3 2 3 5 5 4 4
## [491] 4 2 4 3 1 3 4 4 3 2 2 4 2 3 5 1 2 3 4 3 5 5 2 3 5 5 1 2 2 2 3 5 1 1 3
## [526] 3 1 1 4 5 4 5 5 5 1 5 5 4 3 4 4 1 5 3 5 2 2 2 3 4 1 5 5 3 5 3 4 1 3 1
## [561] 4 4 1 2 2 3 2 1 4 2 1 3 1 4 3 2 4 2 1 1 4 3 3 4 4 1 5 3 2 5 1 1 1 3 2
## [596] 4 5 3 5 2 5 2 5 4 5 4 4 5 3 3 4 2 3 2 2 5 3 4 3 2 4 1 3 2 5 5 5 4 1 2
## [631] 1 3 4 1 2 3 5 5 2 3 4 3 2 3 4 1 3 2 5 5 4 4 3 3 1 4 5 4 2 4 2 4 3 2 3
## [666] 1 4 3 5 3 5 1 3 4 3 1 1 1 4 1 5 3 1
# 9. Hierarchical cluster characteristics ---------------------------------

# Store the cluster labels in a one-column data frame. The column is named
# "Cluster" at construction instead of inheriting the long variable name and
# being renamed by position afterwards.
star_wars_planets_filter_norm_m_hc_memb_5_df <- data.frame(
  Cluster = star_wars_planets_filter_norm_m_hc_memb_5
)
head(star_wars_planets_filter_norm_m_hc_memb_5_df)
##   Cluster
## 1       1
## 2       2
## 3       3
## 4       4
## 5       2
## 6       3

tail(star_wars_planets_filter_norm_m_hc_memb_5_df)
##     Cluster
## 678       1
## 679       4
## 680       1
## 681       5
## 682       3
## 683       1
# Number of planets in each cluster.
table(star_wars_planets_filter_norm_m_hc_memb_5_df$Cluster)
## 
##   1   2   3   4   5 
## 132 113 179 145 114
# Merge cluster membership with original data

# Column-bind the cluster labels onto the original table; rows are paired by
# position, so both inputs must be in the same row order.
star_wars_planets_w_cluster <- cbind(
  star_wars_planets,
  star_wars_planets_filter_norm_m_hc_memb_5_df
)

head(star_wars_planets_w_cluster, n = 6L)
##       Name                Region         Sector        System Inhabitants
## 1 Aargonar               Unknown        Unknown       Unknown     Unknown
## 2   Abafar Outer Rim Territories Sprizen sector       Unknown     Unknown
## 3 Abednedo              Colonies        Unknown       Unknown    Abednedo
## 4   Absanz               Unknown        Unknown       Unknown     Unknown
## 5  Affadar               Unknown        Unknown       Unknown     T'Laeem
## 6   Agamar Outer Rim Territories        Unknown Agamar system     Unknown
##   Capital_City Population Record_High_Temp Record_Low_Temp Gravity
## 1      Unknown    3857305               36              -5    8.35
## 2      Unknown    2331308               33             -27   11.40
## 3      Unknown    5358387               28              -5    9.55
## 4      Unknown    7229751               38              -6   10.80
## 5      Unknown    4307513               37             -31    9.75
## 6      Unknown    6191940               38              -2    9.00
##   Key_Element Rotation Revolution Rating Rating_Nom Resources
## 1           6    59.89       2112      7        Yay         8
## 2          28    49.20       1818      2      Yucks         9
## 3          30   127.76       1210      3      Yucks         5
## 4          57   145.98       1431      9        Yay         3
## 5          22    58.51       1060      9        Yay         7
## 6          34   148.23       1450      5        Meh         4
##   Resources_Nom Cluster
## 1       Awesome       1
## 2       Awesome       2
## 3          Blah       3
## 4          Blah       4
## 5       Awesome       2
## 6          Blah       3
# Pivot table
# List the column names to confirm that the Cluster column was appended.
names(star_wars_planets_w_cluster)
##  [1] "Name"             "Region"           "Sector"          
##  [4] "System"           "Inhabitants"      "Capital_City"    
##  [7] "Population"       "Record_High_Temp" "Record_Low_Temp" 
## [10] "Gravity"          "Key_Element"      "Rotation"        
## [13] "Revolution"       "Rating"           "Rating_Nom"      
## [16] "Resources"        "Resources_Nom"    "Cluster"
# Mean of each clustering variable per cluster. The table is computed once and
# stored (it was previously computed twice with identical arguments), and
# columns are referenced by name instead of by position (7, 10, 12 and 18).
star_wars_planets_w_cluster_df <- aggregate(
  star_wars_planets_w_cluster[, c("Population", "Gravity", "Rotation")],
  by = star_wars_planets_w_cluster["Cluster"],
  FUN = mean
)
star_wars_planets_w_cluster_df
##   Cluster Population   Gravity  Rotation
## 1       1    5548260  8.140606  64.63447
## 2       2    3078082 10.541593  79.66062
## 3       3    3624345  9.423687 190.96514
## 4       4    7571942 10.667034  85.96717
## 5       5    8291226  8.928947 200.52570
# Pivot chart
# Normalise for better representation

# Standardise the per-cluster means (Cluster id column excluded) so the three
# variables can share one axis in the chart. vapply() replaces sapply() so the
# result is always a numeric matrix with one row per cluster; as.numeric()
# drops the 1-column matrix shape that scale() returns for a single vector.
star_wars_planets_w_cluster_df_norm <- vapply(
  star_wars_planets_w_cluster_df[, -1],
  function(column) as.numeric(scale(column)),
  numeric(nrow(star_wars_planets_w_cluster_df))
)
star_wars_planets_w_cluster_df_norm
##       Population    Gravity   Rotation
## [1,] -0.03221782 -1.3026573 -0.9086406
## [2,] -1.10030336  0.9317588 -0.6800028
## [3,] -0.86410352 -0.1085912  1.0136061
## [4,]  0.84280591  1.0484977 -0.5840423
## [5,]  1.15381880 -0.5690079  1.1590796
# Put the cluster ids back in front of the standardised means. cbind() leaves
# the id column unnamed, so it prints as V1 until it is renamed at the end of
# this step.
star_wars_planets_w_cluster_df_norm_2 <- as.data.frame(
  cbind(
    star_wars_planets_w_cluster_df[["Cluster"]],
    star_wars_planets_w_cluster_df_norm
  )
)
star_wars_planets_w_cluster_df_norm_2
##   V1  Population    Gravity   Rotation
## 1  1 -0.03221782 -1.3026573 -0.9086406
## 2  2 -1.10030336  0.9317588 -0.6800028
## 3  3 -0.86410352 -0.1085912  1.0136061
## 4  4  0.84280591  1.0484977 -0.5840423
## 5  5  1.15381880 -0.5690079  1.1590796
colnames(star_wars_planets_w_cluster_df_norm_2)[1] <- "Cluster"

library(reshape2)

# Reshape to long format: one row per (Cluster, variable) pair.
star_wars_planets_w_cluster_df_long_norm_2 <- melt(
  data = star_wars_planets_w_cluster_df_norm_2,
  id.vars = "Cluster"
)
print(star_wars_planets_w_cluster_df_long_norm_2)
##    Cluster   variable       value
## 1        1 Population -0.03221782
## 2        2 Population -1.10030336
## 3        3 Population -0.86410352
## 4        4 Population  0.84280591
## 5        5 Population  1.15381880
## 6        1    Gravity -1.30265731
## 7        2    Gravity  0.93175877
## 8        3    Gravity -0.10859121
## 9        4    Gravity  1.04849770
## 10       5    Gravity -0.56900794
## 11       1   Rotation -0.90864062
## 12       2   Rotation -0.68000281
## 13       3   Rotation  1.01360615
## 14       4   Rotation -0.58404233
## 15       5   Rotation  1.15907961
# Rename the melted columns (2nd and 3rd) for use as plot aesthetics.
names(star_wars_planets_w_cluster_df_long_norm_2)[2:3] <- c("Variable", "Mean")

head(star_wars_planets_w_cluster_df_long_norm_2)
##   Cluster   Variable        Mean
## 1       1 Population -0.03221782
## 2       2 Population -1.10030336
## 3       3 Population -0.86410352
## 4       4 Population  0.84280591
## 5       5 Population  1.15381880
## 6       1    Gravity -1.30265731
# Horizontal bars of the standardised means, one facet per cluster.
# NOTE(review): Cluster is numeric here, so `fill` uses a continuous colour
# scale; wrap it in factor() if discrete colours are wanted.
ggplot(
  star_wars_planets_w_cluster_df_long_norm_2,
  aes(x = Variable, y = Mean, fill = Cluster)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  facet_wrap(~ Cluster) +
  labs(title = "Cluster characteristics") +
  theme(legend.position = "none")