Load and check.
# Read the planet catalogue from the CSV (the first row holds the column
# names) and preview the first six records.
star_wars_planets <- read.csv(
  file = "star_wars_planets_v5.csv",
  header = TRUE
)
head(star_wars_planets, n = 6L)
## Name Region Sector System Inhabitants
## 1 Aargonar Unknown Unknown Unknown Unknown
## 2 Abafar Outer Rim Territories Sprizen sector Unknown Unknown
## 3 Abednedo Colonies Unknown Unknown Abednedo
## 4 Absanz Unknown Unknown Unknown Unknown
## 5 Affadar Unknown Unknown Unknown T'Laeem
## 6 Agamar Outer Rim Territories Unknown Agamar system Unknown
## Capital_City Population Record_High_Temp Record_Low_Temp Gravity Key_Element
## 1 Unknown 3857305 36 -5 8.35 6
## 2 Unknown 2331308 33 -27 11.40 28
## 3 Unknown 5358387 28 -5 9.55 30
## 4 Unknown 7229751 38 -6 10.80 57
## 5 Unknown 4307513 37 -31 9.75 22
## 6 Unknown 6191940 38 -2 9.00 34
## Rotation Revolution Rating Rating_Nom Resources Resources_Nom
## 1 59.89 2112 7 Yay 8 Awesome
## 2 49.20 1818 2 Yucks 9 Awesome
## 3 127.76 1210 3 Yucks 5 Blah
## 4 145.98 1431 9 Yay 3 Blah
## 5 58.51 1060 9 Yay 7 Awesome
## 6 148.23 1450 5 Meh 4 Blah
# Per-column summary: quartiles and mean for numeric columns, length and
# class for character columns.
summary(star_wars_planets)
## Name Region Sector System
## Length:683 Length:683 Length:683 Length:683
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Inhabitants Capital_City Population Record_High_Temp
## Length:683 Length:683 Min. :1014877 Min. : 0.00
## Class :character Class :character 1st Qu.:3226102 1st Qu.: 10.00
## Mode :character Mode :character Median :5458429 Median : 21.00
## Mean :5522815 Mean : 20.96
## 3rd Qu.:7777546 3rd Qu.: 31.00
## Max. :9973030 Max. :100.00
## Record_Low_Temp Gravity Key_Element Rotation
## Min. :-60.00 Min. : 7.000 Min. : 1.00 Min. : 10.07
## 1st Qu.:-30.00 1st Qu.: 8.280 1st Qu.: 28.00 1st Qu.: 64.32
## Median :-20.00 Median : 9.570 Median : 55.00 Median :125.10
## Mean :-19.57 Mean : 9.542 Mean : 53.62 Mean :127.44
## 3rd Qu.: -9.00 3rd Qu.:10.805 3rd Qu.: 80.00 3rd Qu.:194.54
## Max. : 30.00 Max. :12.000 Max. :105.00 Max. :249.23
## Revolution Rating Rating_Nom Resources
## Min. : 100.0 Min. :1.000 Length:683 Min. :1.000
## 1st Qu.: 740.5 1st Qu.:3.000 Class :character 1st Qu.:3.000
## Median :1315.0 Median :5.000 Mode :character Median :5.000
## Mean :1334.5 Mean :5.081 Mean :5.152
## 3rd Qu.:1961.0 3rd Qu.:7.000 3rd Qu.:7.000
## Max. :2499.0 Max. :9.000 Max. :9.000
## Resources_Nom
## Length:683
## Class :character
## Mode :character
##
##
##
# Number of rows (one per planet) in the loaded data.
nrow(star_wars_planets)
## [1] 683
# Compact listing of every column's storage type and first few values.
str(star_wars_planets)
## 'data.frame': 683 obs. of 17 variables:
## $ Name : chr "Aargonar" "Abafar" "Abednedo" "Absanz" ...
## $ Region : chr "Unknown" "Outer Rim Territories" "Colonies" "Unknown" ...
## $ Sector : chr "Unknown" "Sprizen sector" "Unknown" "Unknown" ...
## $ System : chr "Unknown" "Unknown" "Unknown" "Unknown" ...
## $ Inhabitants : chr "Unknown" "Unknown" "Abednedo" "Unknown" ...
## $ Capital_City : chr "Unknown" "Unknown" "Unknown" "Unknown" ...
## $ Population : int 3857305 2331308 5358387 7229751 4307513 6191940 9622189 4432983 1125034 6244208 ...
## $ Record_High_Temp: int 36 33 28 38 37 38 29 39 33 40 ...
## $ Record_Low_Temp : int -5 -27 -5 -6 -31 -2 -35 -6 -16 -15 ...
## $ Gravity : num 8.35 11.4 9.55 10.8 9.75 ...
## $ Key_Element : int 6 28 30 57 22 34 105 57 37 74 ...
## $ Rotation : num 59.9 49.2 127.8 146 58.5 ...
## $ Revolution : int 2112 1818 1210 1431 1060 1450 2245 440 494 1999 ...
## $ Rating : int 7 2 3 9 9 5 5 4 8 2 ...
## $ Rating_Nom : chr "Yay" "Yucks" "Yucks" "Yay" ...
## $ Resources : int 8 9 5 3 7 4 5 8 4 4 ...
## $ Resources_Nom : chr "Awesome" "Awesome" "Blah" "Blah" ...
# Column names in file order.
names(star_wars_planets)
## [1] "Name" "Region" "Sector" "System"
## [5] "Inhabitants" "Capital_City" "Population" "Record_High_Temp"
## [9] "Record_Low_Temp" "Gravity" "Key_Element" "Rotation"
## [13] "Revolution" "Rating" "Rating_Nom" "Resources"
## [17] "Resources_Nom"
Take the variables needed.
# Keep only the three numeric variables used for clustering. The columns are
# selected by name rather than by position (8, 9, 13) so the selection stays
# correct if the column order of the CSV changes, and the condition-less
# subset() wrapper, which did nothing, is dropped.
star_wars_planets_filter <- star_wars_planets[
  c("Record_High_Temp", "Record_Low_Temp", "Revolution")
]
names(star_wars_planets_filter)
## [1] "Record_High_Temp" "Record_Low_Temp" "Revolution"
# Confirm the reduced data frame holds the three selected integer columns.
str(star_wars_planets_filter)
## 'data.frame': 683 obs. of 3 variables:
## $ Record_High_Temp: int 36 33 28 38 37 38 29 39 33 40 ...
## $ Record_Low_Temp : int -5 -27 -5 -6 -31 -2 -35 -6 -16 -15 ...
## $ Revolution : int 2112 1818 1210 1431 1060 1450 2245 440 494 1999 ...
Explore the worlds.
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Box plot of the record high temperatures.
ggplot(star_wars_planets_filter, aes(y = Record_High_Temp)) +
  geom_boxplot(fill = "#C6C6FF") +
  labs(y = "High Temperature", title = "Record high temperature") +
  theme_classic()
# Histogram of the record high temperatures (ggplot2's default binning).
# The title follows the same wording as the other histograms in this
# analysis ("Histogram of <variable>").
ggplot(star_wars_planets_filter) +
  aes(x = Record_High_Temp) +
  geom_histogram(fill = "#C6C6FF") +
  ylab("Count") +
  ggtitle("Histogram of record high temperature") +
  theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Minimum, quartiles, mean and maximum of the record high temperatures.
summary(star_wars_planets_filter$Record_High_Temp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 10.00 21.00 20.96 31.00 100.00
# Box plot of the record low temperatures.
ggplot(star_wars_planets_filter, aes(y = Record_Low_Temp)) +
  geom_boxplot(fill = "#C6C6CD") +
  labs(y = "Low temperature", title = "Record low temperature") +
  theme_dark()
# Histogram of the record low temperatures (ggplot2's default binning).
ggplot(star_wars_planets_filter, aes(x = Record_Low_Temp)) +
  geom_histogram(fill = "#C6C6CD") +
  labs(y = "Count", title = "Histogram of record low temperature") +
  theme_dark()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Minimum, quartiles, mean and maximum of the record low temperatures.
summary(star_wars_planets_filter$Record_Low_Temp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -60.00 -30.00 -20.00 -19.57 -9.00 30.00
# Box plot of the Revolution column (titled "Length of year").
ggplot(star_wars_planets_filter, aes(y = Revolution)) +
  geom_boxplot(fill = "#C6C666") +
  labs(y = "Revolution", title = "Length of year") +
  theme_linedraw()
# Histogram of the Revolution column (ggplot2's default binning).
ggplot(star_wars_planets_filter, aes(x = Revolution)) +
  geom_histogram(fill = "#C6C666") +
  labs(y = "Count", title = "Histogram of length of year") +
  theme_linedraw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Minimum, quartiles, mean and maximum of the Revolution column.
summary(star_wars_planets_filter$Revolution)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100.0 740.5 1315.0 1334.5 1961.0 2499.0
Normalise.
# Standardise every clustering variable to z-scores (centred on its mean and
# divided by its standard deviation) so that Revolution, whose raw values run
# into the thousands, does not dominate the temperature columns in k-means.
# vapply() declares that each column yields exactly nrow() numeric values, so
# the result is always a numeric matrix with one column per variable, whereas
# sapply() silently changes its return type when the input is unexpected.
star_wars_planets_filter_norm <- vapply(
  star_wars_planets_filter,
  function(column) as.numeric(scale(column)),
  numeric(nrow(star_wars_planets_filter))
)
head(star_wars_planets_filter_norm)
## Record_High_Temp Record_Low_Temp Revolution
## [1,] 1.2242449 1.1579100 1.1162594
## [2,] 0.9800395 -0.5900060 0.6941853
## [3,] 0.5730305 1.1579100 -0.1786756
## [4,] 1.3870485 1.0784592 0.1385979
## [5,] 1.3056467 -0.9078089 -0.3940195
## [6,] 1.3870485 1.3962621 0.1658748
Using k = 3
# Partition the standardised planets into k = 3 clusters.
# k-means starts from randomly chosen centres, so the seed is fixed to make
# the assignment (and the arbitrary cluster numbering) reproducible between
# runs, and nstart = 25 keeps the best of 25 random starts instead of
# trusting a single, possibly poor, initialisation.
set.seed(123)
star_wars_planets_filter_norm_kmeans_1 <- kmeans(
  star_wars_planets_filter_norm,
  centers = 3,
  nstart = 25
)
str(star_wars_planets_filter_norm_kmeans_1)
## List of 9
## $ cluster : int [1:683] 1 2 1 1 3 1 2 1 3 1 ...
## $ centers : num [1:3, 1:3] 0.533 -0.191 -0.295 1.045 -0.645 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3] "1" "2" "3"
## .. ..$ : chr [1:3] "Record_High_Temp" "Record_Low_Temp" "Revolution"
## $ totss : num 2046
## $ withinss : num [1:3] 398 352 437
## $ tot.withinss: num 1186
## $ betweenss : num 860
## $ size : int [1:3] 217 209 257
## $ iter : int 3
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
# Attach each planet's k = 3 cluster label to the unscaled data.
star_wars_planets_filter[["Cluster"]] <-
  star_wars_planets_filter_norm_kmeans_1[["cluster"]]
head(star_wars_planets_filter, n = 6L)
## Record_High_Temp Record_Low_Temp Revolution Cluster
## 1 36 -5 2112 1
## 2 33 -27 1818 2
## 3 28 -5 1210 1
## 4 38 -6 1431 1
## 5 37 -31 1060 3
## 6 38 -2 1450 1
# Mean of each unscaled variable within every k = 3 cluster. Naming the
# grouping list entry labels the first result column "Cluster" directly.
star_wars_planets_w_cluster_df_1 <- aggregate(
  x = star_wars_planets_filter[, -4],
  by = list(Cluster = star_wars_planets_filter[["Cluster"]]),
  FUN = mean
)
star_wars_planets_w_cluster_df_1
## Cluster Record_High_Temp Record_Low_Temp Revolution
## 1 1 27.50691 -6.423963 1511.7880
## 2 2 18.61244 -27.688995 1959.3158
## 3 3 17.34241 -24.077821 676.5759
library(reshape2)
# Reshape the k = 3 cluster means to long form: one row per
# (Cluster, variable) pair, with Cluster kept as the identifier.
star_wars_planets_w_cluster_df_1_long <- melt(
  data = star_wars_planets_w_cluster_df_1,
  id.vars = "Cluster"
)
star_wars_planets_w_cluster_df_1_long
## Cluster variable value
## 1 1 Record_High_Temp 27.506912
## 2 2 Record_High_Temp 18.612440
## 3 3 Record_High_Temp 17.342412
## 4 1 Record_Low_Temp -6.423963
## 5 2 Record_Low_Temp -27.688995
## 6 3 Record_Low_Temp -24.077821
## 7 1 Revolution 1511.788018
## 8 2 Revolution 1959.315789
## 9 3 Revolution 676.575875
# Rename melt()'s generic "variable"/"value" columns for plotting.
colnames(star_wars_planets_w_cluster_df_1_long)[2:3] <- c("Variable", "Mean")
head(star_wars_planets_w_cluster_df_1_long, n = 6L)
## Cluster Variable Mean
## 1 1 Record_High_Temp 27.506912
## 2 2 Record_High_Temp 18.612440
## 3 3 Record_High_Temp 17.342412
## 4 1 Record_Low_Temp -6.423963
## 5 2 Record_Low_Temp -27.688995
## 6 3 Record_Low_Temp -24.077821
# Horizontal bars of the unscaled cluster means, one facet per k = 3 cluster.
ggplot(
  star_wars_planets_w_cluster_df_1_long,
  aes(x = Variable, y = Mean, fill = Cluster)
) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  facet_wrap(~ Cluster) +
  labs(title = "Cluster characteristics") +
  theme(legend.position = "none")

# Turn the scaled matrix into a data frame so a label column can be added.
star_wars_planets_filter_norm <- as.data.frame(
  x = star_wars_planets_filter_norm
)
head(star_wars_planets_filter_norm, n = 6L)
## Record_High_Temp Record_Low_Temp Revolution
## 1 1.2242449 1.1579100 1.1162594
## 2 0.9800395 -0.5900060 0.6941853
## 3 0.5730305 1.1579100 -0.1786756
## 4 1.3870485 1.0784592 0.1385979
## 5 1.3056467 -0.9078089 -0.3940195
## 6 1.3870485 1.3962621 0.1658748
# Attach each planet's k = 3 cluster label to the standardised data.
star_wars_planets_filter_norm[["Cluster"]] <-
  star_wars_planets_filter_norm_kmeans_1[["cluster"]]
head(star_wars_planets_filter_norm, n = 6L)
## Record_High_Temp Record_Low_Temp Revolution Cluster
## 1 1.2242449 1.1579100 1.1162594 1
## 2 0.9800395 -0.5900060 0.6941853 2
## 3 0.5730305 1.1579100 -0.1786756 1
## 4 1.3870485 1.0784592 0.1385979 1
## 5 1.3056467 -0.9078089 -0.3940195 3
## 6 1.3870485 1.3962621 0.1658748 1
# Mean of each standardised variable within every k = 3 cluster. Naming the
# grouping list entry labels the first result column "Cluster" directly.
star_wars_planets_w_cluster_df_1_norm <- aggregate(
  x = star_wars_planets_filter_norm[, -4],
  by = list(Cluster = star_wars_planets_filter_norm[["Cluster"]]),
  FUN = mean
)
star_wars_planets_w_cluster_df_1_norm
## Cluster Record_High_Temp Record_Low_Temp Revolution
## 1 1 0.5328923 1.0447751 0.2545793
## 2 2 -0.1911337 -0.6447471 0.8970619
## 3 3 -0.2945163 -0.3578367 -0.9444733
library(reshape2)
# Reshape the standardised k = 3 cluster means to long form, keeping Cluster
# as the identifier column.
star_wars_planets_w_cluster_df_1_norm_long <- melt(
  data = star_wars_planets_w_cluster_df_1_norm,
  id.vars = "Cluster"
)
star_wars_planets_w_cluster_df_1_norm_long
## Cluster variable value
## 1 1 Record_High_Temp 0.5328923
## 2 2 Record_High_Temp -0.1911337
## 3 3 Record_High_Temp -0.2945163
## 4 1 Record_Low_Temp 1.0447751
## 5 2 Record_Low_Temp -0.6447471
## 6 3 Record_Low_Temp -0.3578367
## 7 1 Revolution 0.2545793
## 8 2 Revolution 0.8970619
## 9 3 Revolution -0.9444733
# Rename melt()'s generic "variable"/"value" columns for plotting.
colnames(star_wars_planets_w_cluster_df_1_norm_long)[2:3] <-
  c("Variable", "Mean")
head(star_wars_planets_w_cluster_df_1_norm_long, n = 6L)
## Cluster Variable Mean
## 1 1 Record_High_Temp 0.5328923
## 2 2 Record_High_Temp -0.1911337
## 3 3 Record_High_Temp -0.2945163
## 4 1 Record_Low_Temp 1.0447751
## 5 2 Record_Low_Temp -0.6447471
## 6 3 Record_Low_Temp -0.3578367
# Horizontal bars of the standardised cluster means, one facet per k = 3
# cluster.
ggplot(
  star_wars_planets_w_cluster_df_1_norm_long,
  aes(x = Variable, y = Mean, fill = Cluster)
) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  facet_wrap(~ Cluster) +
  labs(title = "Cluster characteristics k = 3") +
  theme(legend.position = "none")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# fviz_nbclust(x, FUNcluster, method = c("silhouette", "wss", "gap_stat"))
# x: numeric matrix or data frame
# FUNcluster: a partitioning function. Allowed values include kmeans, pam,
#   clara and hcut (for hierarchical clustering).
# method: the method to be used for determining the optimal number of clusters.

# By this point star_wars_planets_filter_norm also carries the k = 3 labels in
# its Cluster column. Those labels are not a planet measurement, so only the
# three standardised variables are used below; passing the whole data frame
# would leak the earlier partition into both diagnostics and the k = 4 fit
# (four-column centres, inflated total sum of squares).
# NOTE: printed results recorded after this chunk that list a "Cluster"
# dimension in the centres come from the four-column input and need to be
# regenerated.
star_wars_planets_cluster_input <- star_wars_planets_filter_norm[
  c("Record_High_Temp", "Record_Low_Temp", "Revolution")
]

# k-means initialisation is random; fix the seed so the diagnostics and the
# final fit can be reproduced.
set.seed(123)

# Elbow method: total within-cluster sum of squares against k, with a dashed
# marker at the chosen k = 4.
fviz_nbclust(star_wars_planets_cluster_input, kmeans, method = "wss") +
  geom_vline(xintercept = 4, linetype = 2) +
  labs(subtitle = "Elbow method")

# Silhouette method: average silhouette width against k.
fviz_nbclust(star_wars_planets_cluster_input, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette method")

# Final k = 4 fit; nstart = 25 keeps the best of 25 random starts.
star_wars_planets_filter_norm_kmeans_2 <- kmeans(
  star_wars_planets_cluster_input,
  centers = 4,
  nstart = 25
)
str(star_wars_planets_filter_norm_kmeans_2)
## List of 9
## $ cluster : int [1:683] 1 3 1 1 4 1 3 1 4 1 ...
## $ centers : num [1:4, 1:4] 0.533 -0.909 -0.241 0.649 1.045 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:4] "1" "2" "3" "4"
## .. ..$ : chr [1:4] "Record_High_Temp" "Record_Low_Temp" "Revolution" "Cluster"
## $ totss : num 2518
## $ withinss : num [1:4] 398 176 330 117
## $ tot.withinss: num 1021
## $ betweenss : num 1496
## $ size : int [1:4] 217 153 202 111
## $ iter : int 4
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
# Overwrite the unscaled data's cluster label with the k = 4 assignment.
star_wars_planets_filter[["Cluster"]] <-
  star_wars_planets_filter_norm_kmeans_2[["cluster"]]
head(star_wars_planets_filter, n = 6L)
## Record_High_Temp Record_Low_Temp Revolution Cluster
## 1 36 -5 2112 1
## 2 33 -27 1818 3
## 3 28 -5 1210 1
## 4 38 -6 1431 1
## 5 37 -31 1060 4
## 6 38 -2 1450 1
# Print the mean of each unscaled variable within every k = 4 cluster.
aggregate(
  x = star_wars_planets_filter[, -4],
  by = list(star_wars_planets_filter[["Cluster"]]),
  FUN = mean
)
## Group.1 Record_High_Temp Record_Low_Temp Revolution
## 1 1 27.50691 -6.423963 1511.7880
## 2 2 9.79085 -21.379085 729.3072
## 3 3 18.00495 -27.529703 1979.2376
## 4 4 28.93694 -28.315315 648.5315
# Store the k = 4 per-cluster means of the unscaled variables. The grouping
# column keeps aggregate()'s default name here.
star_wars_planets_w_cluster_df_2 <- aggregate(
  x = star_wars_planets_filter[, -4],
  by = list(star_wars_planets_filter[["Cluster"]]),
  FUN = mean
)
star_wars_planets_w_cluster_df_2
## Group.1 Record_High_Temp Record_Low_Temp Revolution
## 1 1 27.50691 -6.423963 1511.7880
## 2 2 9.79085 -21.379085 729.3072
## 3 3 18.00495 -27.529703 1979.2376
## 4 4 28.93694 -28.315315 648.5315
# Label the grouping column "Cluster" instead of the default "Group.1".
colnames(star_wars_planets_w_cluster_df_2)[1L] <- "Cluster"
star_wars_planets_w_cluster_df_2
## Cluster Record_High_Temp Record_Low_Temp Revolution
## 1 1 27.50691 -6.423963 1511.7880
## 2 2 9.79085 -21.379085 729.3072
## 3 3 18.00495 -27.529703 1979.2376
## 4 4 28.93694 -28.315315 648.5315
library(reshape2)
# Reshape the k = 4 cluster means to long form: one row per
# (Cluster, variable) pair, with Cluster kept as the identifier.
star_wars_planets_w_cluster_df_2_long <- melt(
  data = star_wars_planets_w_cluster_df_2,
  id.vars = "Cluster"
)
star_wars_planets_w_cluster_df_2_long
## Cluster variable value
## 1 1 Record_High_Temp 27.506912
## 2 2 Record_High_Temp 9.790850
## 3 3 Record_High_Temp 18.004950
## 4 4 Record_High_Temp 28.936937
## 5 1 Record_Low_Temp -6.423963
## 6 2 Record_Low_Temp -21.379085
## 7 3 Record_Low_Temp -27.529703
## 8 4 Record_Low_Temp -28.315315
## 9 1 Revolution 1511.788018
## 10 2 Revolution 729.307190
## 11 3 Revolution 1979.237624
## 12 4 Revolution 648.531532
# Rename melt()'s generic "variable"/"value" columns for plotting.
colnames(star_wars_planets_w_cluster_df_2_long)[2:3] <- c("Variable", "Mean")
head(star_wars_planets_w_cluster_df_2_long, n = 6L)
## Cluster Variable Mean
## 1 1 Record_High_Temp 27.506912
## 2 2 Record_High_Temp 9.790850
## 3 3 Record_High_Temp 18.004950
## 4 4 Record_High_Temp 28.936937
## 5 1 Record_Low_Temp -6.423963
## 6 2 Record_Low_Temp -21.379085
# Horizontal bars of the unscaled cluster means, one facet per k = 4 cluster.
ggplot(
  star_wars_planets_w_cluster_df_2_long,
  aes(x = Variable, y = Mean, fill = Cluster)
) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  facet_wrap(~ Cluster) +
  labs(title = "Cluster characteristics") +
  theme(legend.position = "none")

# Overwrite the standardised data's cluster label with the k = 4 assignment.
star_wars_planets_filter_norm[["Cluster"]] <-
  star_wars_planets_filter_norm_kmeans_2[["cluster"]]
head(star_wars_planets_filter_norm, n = 6L)
## Record_High_Temp Record_Low_Temp Revolution Cluster
## 1 1.2242449 1.1579100 1.1162594 1
## 2 0.9800395 -0.5900060 0.6941853 3
## 3 0.5730305 1.1579100 -0.1786756 1
## 4 1.3870485 1.0784592 0.1385979 1
## 5 1.3056467 -0.9078089 -0.3940195 4
## 6 1.3870485 1.3962621 0.1658748 1
# Mean of each standardised variable within every k = 4 cluster. Naming the
# grouping list entry labels the first result column "Cluster" directly.
star_wars_planets_w_cluster_df_2_norm <- aggregate(
  x = star_wars_planets_filter_norm[, -4],
  by = list(Cluster = star_wars_planets_filter_norm[["Cluster"]]),
  FUN = mean
)

# Long form of those means: one row per (Cluster, variable) pair.
star_wars_planets_filter_norm_long_2 <- melt(
  data = star_wars_planets_w_cluster_df_2_norm,
  id.vars = "Cluster"
)
head(star_wars_planets_filter_norm_long_2, n = 6L)
## Cluster variable value
## 1 1 Record_High_Temp 0.5328923
## 2 2 Record_High_Temp -0.9092271
## 3 3 Record_High_Temp -0.2405845
## 4 4 Record_High_Temp 0.6492989
## 5 1 Record_Low_Temp 1.0447751
## 6 2 Record_Low_Temp -0.1434202
# Rename melt()'s generic "variable"/"value" columns for plotting.
colnames(star_wars_planets_filter_norm_long_2)[2:3] <- c("Variable", "Mean")
head(star_wars_planets_filter_norm_long_2, n = 6L)
## Cluster Variable Mean
## 1 1 Record_High_Temp 0.5328923
## 2 2 Record_High_Temp -0.9092271
## 3 3 Record_High_Temp -0.2405845
## 4 4 Record_High_Temp 0.6492989
## 5 1 Record_Low_Temp 1.0447751
## 6 2 Record_Low_Temp -0.1434202
# Horizontal bars of the standardised cluster means, one facet per k = 4
# cluster.
ggplot(
  star_wars_planets_filter_norm_long_2,
  aes(x = Variable, y = Mean, fill = Cluster)
) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  facet_wrap(~ Cluster) +
  labs(title = "Cluster characteristics, k = 4") +
  theme(legend.position = "none")
# Treat the cluster label as categorical so it can drive point shape and
# colour.
star_wars_planets_filter[["Cluster"]] <- as.factor(
  star_wars_planets_filter[["Cluster"]]
)

# Record low temperature against Revolution, marked by cluster.
ggplot(
  star_wars_planets_filter,
  aes(x = Revolution, y = Record_Low_Temp, shape = Cluster, color = Cluster)
) +
  geom_point()

# Record high temperature against Revolution, marked by cluster.
ggplot(
  star_wars_planets_filter,
  aes(x = Revolution, y = Record_High_Temp, shape = Cluster, color = Cluster)
) +
  geom_point()
library(plotly)
# Interactive 3D scatter of the three clustering variables, coloured by
# cluster. The plot object is passed straight to layout() to set the title,
# axis titles and legend heading.
layout(
  plot_ly(
    x = star_wars_planets_filter$Revolution,
    y = star_wars_planets_filter$Record_Low_Temp,
    z = star_wars_planets_filter$Record_High_Temp,
    type = "scatter3d",
    mode = "markers",
    color = star_wars_planets_filter$Cluster
  ),
  title = "Clustered Star Wars Planets",
  scene = list(
    xaxis = list(title = "Revolution"),
    yaxis = list(title = "Record Low Temp"),
    zaxis = list(title = "Record High Temp")
  ),
  legend = list(title = list(text = "<b>Cluster</b>"))
)