import pandas as pd
# Load the planets table. The file is decoded as cp1252, and
# index_col=False stops pandas from using the first column as the index,
# so rows get a default integer index here.
star_wars = pd.read_csv(
    "star_wars_planets_v5.csv",
    encoding="cp1252",
    index_col=False,
)
star_wars
Name | Region | Sector | System | Inhabitants | Capital_City | Population | Record_High_Temp | Record_Low_Temp | Gravity | Key_Element | Rotation | Revolution | Rating | Rating_Nom | Resources | Resources_Nom | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Aargonar | Unknown | Unknown | Unknown | Unknown | Unknown | 3857305 | 36 | -5 | 8.35 | 6 | 59.89 | 2112 | 7 | Yay | 8 | Awesome |
1 | Abafar | Outer Rim Territories | Sprizen sector | Unknown | Unknown | Unknown | 2331308 | 33 | -27 | 11.40 | 28 | 49.20 | 1818 | 2 | Yucks | 9 | Awesome |
2 | Abednedo | Colonies | Unknown | Unknown | Abednedo | Unknown | 5358387 | 28 | -5 | 9.55 | 30 | 127.76 | 1210 | 3 | Yucks | 5 | Blah |
3 | Absanz | Unknown | Unknown | Unknown | Unknown | Unknown | 7229751 | 38 | -6 | 10.80 | 57 | 145.98 | 1431 | 9 | Yay | 3 | Blah |
4 | Affadar | Unknown | Unknown | Unknown | T'Laeem | Unknown | 4307513 | 37 | -31 | 9.75 | 22 | 58.51 | 1060 | 9 | Yay | 7 | Awesome |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
678 | Zhanox | Outer Rim Territories | Anoat sector | Zhanox system | Ugnaughts | Unknown | 8396480 | 17 | -23 | 10.81 | 10 | 104.19 | 2172 | 4 | Meh | 6 | Awesome |
679 | Zoh | Unknown | Unknown | Unknown | Unknown | Unknown | 1339277 | 29 | -12 | 8.10 | 50 | 20.88 | 145 | 2 | Yucks | 2 | Blah |
680 | Zolan | Mid Rim | Unknown | Unknown | Clawdites | Unknown | 5880473 | 20 | -18 | 8.60 | 84 | 197.40 | 939 | 5 | Meh | 8 | Awesome |
681 | Zygerria | Outer Rim Territories | Unknown | Unknown | Zygerrians | Unknown | 3984810 | 24 | -20 | 11.25 | 86 | 224.72 | 2327 | 4 | Meh | 4 | Blah |
682 | Zyzar | Unknown | Unknown | Unknown | Unknown | Unknown | 1464241 | 23 | -26 | 8.27 | 83 | 76.29 | 2438 | 3 | Yucks | 8 | Awesome |
683 rows × 17 columns
# List the column labels to confirm what was loaded.
print(star_wars.columns)
Index(['Name', 'Region', 'Sector', 'System', 'Inhabitants', 'Capital_City', 'Population', 'Record_High_Temp', 'Record_Low_Temp', 'Gravity', 'Key_Element', 'Rotation', 'Revolution', 'Rating', 'Rating_Nom', 'Resources', 'Resources_Nom'], dtype='object')
# Count non-missing entries per column; a quick check for gaps in the data.
star_wars.count()
Name 683 Region 683 Sector 683 System 683 Inhabitants 683 Capital_City 683 Population 683 Record_High_Temp 683 Record_Low_Temp 683 Gravity 683 Key_Element 683 Rotation 683 Revolution 683 Rating 683 Rating_Nom 683 Resources 683 Resources_Nom 683 dtype: int64
Use the planet names as the row index.
# Move the "Name" column into the index so rows are addressed by planet name.
star_wars = star_wars.set_index("Name")
star_wars.head()
Region | Sector | System | Inhabitants | Capital_City | Population | Record_High_Temp | Record_Low_Temp | Gravity | Key_Element | Rotation | Revolution | Rating | Rating_Nom | Resources | Resources_Nom | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Name | ||||||||||||||||
Aargonar | Unknown | Unknown | Unknown | Unknown | Unknown | 3857305 | 36 | -5 | 8.35 | 6 | 59.89 | 2112 | 7 | Yay | 8 | Awesome |
Abafar | Outer Rim Territories | Sprizen sector | Unknown | Unknown | Unknown | 2331308 | 33 | -27 | 11.40 | 28 | 49.20 | 1818 | 2 | Yucks | 9 | Awesome |
Abednedo | Colonies | Unknown | Unknown | Abednedo | Unknown | 5358387 | 28 | -5 | 9.55 | 30 | 127.76 | 1210 | 3 | Yucks | 5 | Blah |
Absanz | Unknown | Unknown | Unknown | Unknown | Unknown | 7229751 | 38 | -6 | 10.80 | 57 | 145.98 | 1431 | 9 | Yay | 3 | Blah |
Affadar | Unknown | Unknown | Unknown | T'Laeem | Unknown | 4307513 | 37 | -31 | 9.75 | 22 | 58.51 | 1060 | 9 | Yay | 7 | Awesome |
# Keep only the three numeric features used for clustering.
# Selecting by label rather than by position ([5, 8, 10]) keeps this correct
# even if columns are added, removed or reordered upstream.
star_wars_subset = star_wars[["Population", "Gravity", "Rotation"]]
star_wars_subset.head()
Population | Gravity | Rotation | |
---|---|---|---|
Name | |||
Aargonar | 3857305 | 8.35 | 59.89 |
Abafar | 2331308 | 11.40 | 49.20 |
Abednedo | 5358387 | 9.55 | 127.76 |
Absanz | 7229751 | 10.80 | 145.98 |
Affadar | 4307513 | 9.75 | 58.51 |
# Check that every selected feature is numeric before scaling.
star_wars_subset.dtypes
Population int64 Gravity float64 Rotation float64 dtype: object
# Summary statistics for the selected features.
star_wars_subset.describe()
Population | Gravity | Rotation | |
---|---|---|---|
count | 6.830000e+02 | 683.000000 | 683.000000 |
mean | 5.522815e+06 | 9.542050 | 127.439722 |
std | 2.652394e+06 | 1.446553 | 71.092655 |
min | 1.014877e+06 | 7.000000 | 10.070000 |
25% | 3.226102e+06 | 8.280000 | 64.320000 |
50% | 5.458429e+06 | 9.570000 | 125.100000 |
75% | 7.777546e+06 | 10.805000 | 194.545000 |
max | 9.973030e+06 | 12.000000 | 249.230000 |
from sklearn.preprocessing import StandardScaler
# Standardize the clustering features so that no single feature dominates
# the distance computations purely because of its scale.
cols_to_norm = ["Population", "Gravity", "Rotation"]
scaler = StandardScaler()

# Work on a copy so the unscaled values in star_wars_subset stay available.
star_wars_subset_norm = star_wars_subset.copy()
scaled_values = scaler.fit_transform(star_wars_subset_norm[cols_to_norm])
star_wars_subset_norm[cols_to_norm] = scaled_values
star_wars_subset_norm.head()
Population | Gravity | Rotation | |
---|---|---|---|
Name | |||
Aargonar | -0.628387 | -0.824666 | -0.950861 |
Abafar | -1.204137 | 1.285340 | -1.101338 |
Abednedo | -0.062038 | 0.005500 | 0.004508 |
Absanz | 0.644017 | 0.870257 | 0.260981 |
Affadar | -0.458526 | 0.143861 | -0.970286 |
from sklearn.cluster import KMeans
Fewer clusters may be easier to interpret.
# Configure a 4-cluster k-means model; random_state fixes the seed.
# NOTE(review): n_init is left at the library default, which has differed
# between scikit-learn releases — consider pinning it for reproducibility.
kmeans = KMeans(n_clusters = 4, random_state = 666)
kmeans
KMeans(n_clusters=4, random_state=666)
Fit the algorithm to the variables.
# Fit on the standardized features; cluster labels land in kmeans.labels_.
kmeans.fit(star_wars_subset_norm)
KMeans(n_clusters=4, random_state=666)
from sklearn.metrics import silhouette_score
# Silhouette score of the 4-cluster solution.
# The builtin round() works whether the metric returns a NumPy scalar or a
# plain Python float; the .round() method exists only on the former.
kmeans_silhouette = round(
    silhouette_score(star_wars_subset_norm, kmeans.labels_), 3
)
kmeans_silhouette
0.282
Alternatively, use a for loop to compare silhouette scores across candidate values of k.
# Print the silhouette score for every k from 2 to 19.
for k in range(2, 20):
    kmeans_multiple = KMeans(n_clusters=k, random_state=666)
    kmeans_multiple.fit(star_wars_subset_norm)
    # Use a separate name so the k=4 score held in `kmeans_silhouette`
    # is not overwritten by this scan.
    silhouette_k = round(
        silhouette_score(star_wars_subset_norm, kmeans_multiple.labels_), 5
    )
    print(silhouette_k)
0.24656 0.25173 0.28228 0.28241 0.30061 0.29409 0.29806 0.29829 0.29202 0.29598 0.28398 0.28561 0.2837 0.28061 0.27531 0.27965 0.27867 0.27188
Alternatively, collect the scores in a DataFrame so they are easier to read.
# Build a k -> silhouette table for every k from 2 to 19.
sil_rows = []
for k in range(2, 20):
    kmeans_multiple = KMeans(n_clusters=k, random_state=666)
    kmeans_multiple.fit(star_wars_subset_norm)
    sil_rows.append({
        "k_value": k,
        # builtin round() accepts both NumPy scalars and Python floats
        "silhouette": round(
            silhouette_score(star_wars_subset_norm, kmeans_multiple.labels_), 5
        ),
    })

# Keep the row list and the final table under separate names so that
# `sil_result` is always a DataFrame.
sil_result = pd.DataFrame(sil_rows)
sil_result
k_value | silhouette | |
---|---|---|
0 | 2 | 0.24656 |
1 | 3 | 0.25173 |
2 | 4 | 0.28228 |
3 | 5 | 0.28241 |
4 | 6 | 0.30061 |
5 | 7 | 0.29409 |
6 | 8 | 0.29806 |
7 | 9 | 0.29829 |
8 | 10 | 0.29202 |
9 | 11 | 0.29598 |
10 | 12 | 0.28398 |
11 | 13 | 0.28561 |
12 | 14 | 0.28370 |
13 | 15 | 0.28061 |
14 | 16 | 0.27531 |
15 | 17 | 0.27965 |
16 | 18 | 0.27867 |
17 | 19 | 0.27188 |
# Pair each planet (the index of the scaled frame) with the cluster label
# assigned by the fitted 4-cluster model.
star_wars_subset_kmeans4_memb = pd.Series(
    data=kmeans.labels_,
    index=star_wars_subset_norm.index,
)
star_wars_subset_kmeans4_memb
Name Aargonar 1 Abafar 2 Abednedo 1 Absanz 0 Affadar 1 .. Zhanox 0 Zoh 1 Zolan 3 Zygerria 2 Zyzar 1 Length: 683, dtype: int32
# Shift the zero-based labels to start at 1 for presentation.
# Adding 1 gives the same result as the explicit 0..3 -> 1..4 mapping and
# keeps working if the number of clusters changes.
star_wars_subset_kmeans4_memb = star_wars_subset_kmeans4_memb + 1
star_wars_subset_kmeans4_memb
Name Aargonar 2 Abafar 3 Abednedo 2 Absanz 1 Affadar 2 .. Zhanox 1 Zoh 2 Zolan 4 Zygerria 3 Zyzar 2 Length: 683, dtype: int32
# Turn the membership Series into a one-column frame named "Cluster" so it
# can be joined onto the full planet table.
star_wars_subset_kmeans4_memb_df = star_wars_subset_kmeans4_memb.to_frame(name="Cluster")
star_wars_subset_kmeans4_memb_df
Cluster | |
---|---|
Name | |
Aargonar | 2 |
Abafar | 3 |
Abednedo | 2 |
Absanz | 1 |
Affadar | 2 |
... | ... |
Zhanox | 1 |
Zoh | 2 |
Zolan | 4 |
Zygerria | 3 |
Zyzar | 2 |
683 rows × 1 columns
# Store the label as a categorical so plotting libraries treat it as a
# discrete group rather than a continuous number.
star_wars_subset_kmeans4_memb_df = star_wars_subset_kmeans4_memb_df.astype(
    {"Cluster": "category"}
)

# Append the cluster column to the full table; rows align on the planet-name index.
star_wars_w_cluster = pd.concat(
    [star_wars, star_wars_subset_kmeans4_memb_df],
    axis="columns",
)
star_wars_w_cluster
Region | Sector | System | Inhabitants | Capital_City | Population | Record_High_Temp | Record_Low_Temp | Gravity | Key_Element | Rotation | Revolution | Rating | Rating_Nom | Resources | Resources_Nom | Cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Name | |||||||||||||||||
Aargonar | Unknown | Unknown | Unknown | Unknown | Unknown | 3857305 | 36 | -5 | 8.35 | 6 | 59.89 | 2112 | 7 | Yay | 8 | Awesome | 2 |
Abafar | Outer Rim Territories | Sprizen sector | Unknown | Unknown | Unknown | 2331308 | 33 | -27 | 11.40 | 28 | 49.20 | 1818 | 2 | Yucks | 9 | Awesome | 3 |
Abednedo | Colonies | Unknown | Unknown | Abednedo | Unknown | 5358387 | 28 | -5 | 9.55 | 30 | 127.76 | 1210 | 3 | Yucks | 5 | Blah | 2 |
Absanz | Unknown | Unknown | Unknown | Unknown | Unknown | 7229751 | 38 | -6 | 10.80 | 57 | 145.98 | 1431 | 9 | Yay | 3 | Blah | 1 |
Affadar | Unknown | Unknown | Unknown | T'Laeem | Unknown | 4307513 | 37 | -31 | 9.75 | 22 | 58.51 | 1060 | 9 | Yay | 7 | Awesome | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Zhanox | Outer Rim Territories | Anoat sector | Zhanox system | Ugnaughts | Unknown | 8396480 | 17 | -23 | 10.81 | 10 | 104.19 | 2172 | 4 | Meh | 6 | Awesome | 1 |
Zoh | Unknown | Unknown | Unknown | Unknown | Unknown | 1339277 | 29 | -12 | 8.10 | 50 | 20.88 | 145 | 2 | Yucks | 2 | Blah | 2 |
Zolan | Mid Rim | Unknown | Unknown | Clawdites | Unknown | 5880473 | 20 | -18 | 8.60 | 84 | 197.40 | 939 | 5 | Meh | 8 | Awesome | 4 |
Zygerria | Outer Rim Territories | Unknown | Unknown | Zygerrians | Unknown | 3984810 | 24 | -20 | 11.25 | 86 | 224.72 | 2327 | 4 | Meh | 4 | Blah | 3 |
Zyzar | Unknown | Unknown | Unknown | Unknown | Unknown | 1464241 | 23 | -26 | 8.27 | 83 | 76.29 | 2438 | 3 | Yucks | 8 | Awesome | 2 |
683 rows × 17 columns
# The three clustering features plus the assigned cluster, for plotting.
# Selected by label instead of position ([5, 8, 10, 16]) so the selection
# survives any change in column order.
star_wars_w_cluster_comparison_1 = star_wars_w_cluster[
    ["Population", "Gravity", "Rotation", "Cluster"]
]
star_wars_w_cluster_comparison_1
Population | Gravity | Rotation | Cluster | |
---|---|---|---|---|
Name | ||||
Aargonar | 3857305 | 8.35 | 59.89 | 2 |
Abafar | 2331308 | 11.40 | 49.20 | 3 |
Abednedo | 5358387 | 9.55 | 127.76 | 2 |
Absanz | 7229751 | 10.80 | 145.98 | 1 |
Affadar | 4307513 | 9.75 | 58.51 | 2 |
... | ... | ... | ... | ... |
Zhanox | 8396480 | 10.81 | 104.19 | 1 |
Zoh | 1339277 | 8.10 | 20.88 | 2 |
Zolan | 5880473 | 8.60 | 197.40 | 4 |
Zygerria | 3984810 | 11.25 | 224.72 | 3 |
Zyzar | 1464241 | 8.27 | 76.29 | 2 |
683 rows × 4 columns
import seaborn as sns
# Population vs. gravity, coloured by cluster, with marker size encoding rotation.
scatter_1 = sns.scatterplot(
    data=star_wars_w_cluster_comparison_1,
    x="Population",
    y="Gravity",
    hue="Cluster",
    size="Rotation",
    palette="Set1",
)
scatter_1.set_xlabel("Population", fontsize=20)
scatter_1.set_ylabel("Gravity", fontsize=20)
# Anchor the legend just outside the axes so it does not cover the points.
scatter_1.legend(loc='upper left', bbox_to_anchor=(1.05, 1));
import plotly.express as px
# Interactive 3-D view of the three clustering features; colour and marker
# symbol both encode the cluster, listed in order 1-4.
fig_1 = px.scatter_3d(
    star_wars_w_cluster_comparison_1,
    x="Population",
    y="Gravity",
    z="Rotation",
    color="Cluster",
    symbol="Cluster",
    category_orders={"Cluster": [1, 2, 3, 4]},
)
fig_1.show()
# One scatter panel per cluster (population vs. gravity); marker size
# encodes rotation and marker style repeats the cluster.
sns.relplot(
    kind="scatter",
    data=star_wars_w_cluster_comparison_1,
    x="Population",
    y="Gravity",
    size="Rotation",
    style="Cluster",
    col="Cluster",
);
# Rotation and revolution alongside the assigned cluster. Revolution was not
# among the clustering features.
# Selected by label instead of position ([10, 11, 16]) so the selection
# survives any change in column order.
star_wars_w_cluster_comparison_2 = star_wars_w_cluster[
    ["Rotation", "Revolution", "Cluster"]
]
star_wars_w_cluster_comparison_2
Rotation | Revolution | Cluster | |
---|---|---|---|
Name | |||
Aargonar | 59.89 | 2112 | 2 |
Abafar | 49.20 | 1818 | 3 |
Abednedo | 127.76 | 1210 | 2 |
Absanz | 145.98 | 1431 | 1 |
Affadar | 58.51 | 1060 | 2 |
... | ... | ... | ... |
Zhanox | 104.19 | 2172 | 1 |
Zoh | 20.88 | 145 | 2 |
Zolan | 197.40 | 939 | 4 |
Zygerria | 224.72 | 2327 | 3 |
Zyzar | 76.29 | 2438 | 2 |
683 rows × 3 columns
# Requires seaborn to be imported as `sns` earlier in the notebook.
# Rotation vs. revolution, coloured by cluster.
scatter_2 = sns.scatterplot(
    data=star_wars_w_cluster_comparison_2,
    x="Rotation",
    y="Revolution",
    hue="Cluster",
    palette="Set1",
)
scatter_2.set_xlabel("Rotation", fontsize=20)
scatter_2.set_ylabel("Revolution", fontsize=20)
# Place the titled legend just outside the right edge of the axes.
scatter_2.legend(title="Cluster", loc="lower right", bbox_to_anchor=(1.15, 0.68));
# 3-D view with rotation and revolution on x/y and the cluster label itself
# on the z axis, so each cluster sits on its own layer; colour and symbol
# repeat the cluster.
fig_2 = px.scatter_3d(
    star_wars_w_cluster_comparison_2,
    x="Rotation",
    y="Revolution",
    z="Cluster",
    color="Cluster",
    symbol="Cluster",
    category_orders={"Cluster": [1, 2, 3, 4]},
)
fig_2.show()