import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
hogwarts = pd.read_csv("super_heroes_hogwarts_v3a.csv")
hogwarts.head()
ID | Name | Gender | Race | Height | Publisher | Alignment | Weight | Manipulative | Resourceful | ... | HouseID | House | STR | DEX | CON | INT | WIS | CHA | Level | HP | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A001 | A-Bomb | Male | Human | 203.0 | Marvel Comics | good | 441.0 | 10 | 10 | ... | 1 | Slytherin | 18 | 11 | 17 | 12 | 13 | 11 | 1 | 7 |
1 | A002 | Abe Sapien | Male | Icthyo Sapien | 191.0 | Dark Horse Comics | good | 65.0 | 7 | 7 | ... | 1 | Slytherin | 16 | 17 | 10 | 13 | 15 | 11 | 8 | 72 |
2 | A004 | Abomination | Male | Human / Radiation | 203.0 | Marvel Comics | bad | 441.0 | 6 | 8 | ... | 1 | Slytherin | 13 | 14 | 13 | 10 | 18 | 15 | 15 | 135 |
3 | A009 | Agent 13 | Female | NaN | 173.0 | Marvel Comics | good | 61.0 | 7 | 7 | ... | 1 | Slytherin | 15 | 18 | 16 | 16 | 17 | 10 | 14 | 140 |
4 | A015 | Alex Mercer | Male | Human | NaN | Wildstorm | bad | NaN | 10 | 6 | ... | 1 | Slytherin | 14 | 17 | 13 | 12 | 10 | 11 | 9 | 72 |
5 rows × 26 columns
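As an aside, the preview already shows missing values (e.g. Race, Height, Weight). A quick pandas check, sketched below, confirms whether the trait columns used later contain any gaps; nothing here is specific to this dataset.
# sketch: count missing values per column; the personality trait columns should all be zero
hogwarts.isna().sum()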
hogwarts.columns.values.tolist()
['ID', 'Name', 'Gender', 'Race', 'Height', 'Publisher', 'Alignment', 'Weight', 'Manipulative', 'Resourceful', 'Dismissive', 'Intelligent', 'Trusting', 'Loyal', 'Stubborn', 'Brave', 'HouseID', 'House', 'STR', 'DEX', 'CON', 'INT', 'WIS', 'CHA', 'Level', 'HP']
Or, if this is easier to read:
pd.DataFrame(hogwarts.columns.values, columns = ["Variables"])
Variables | |
---|---|
0 | ID |
1 | Name |
2 | Gender |
3 | Race |
4 | Height |
5 | Publisher |
6 | Alignment |
7 | Weight |
8 | Manipulative |
9 | Resourceful |
10 | Dismissive |
11 | Intelligent |
12 | Trusting |
13 | Loyal |
14 | Stubborn |
15 | Brave |
16 | HouseID |
17 | House |
18 | STR |
19 | DEX |
20 | CON |
21 | INT |
22 | WIS |
23 | CHA |
24 | Level |
25 | HP |
Select the required variables (the eight personality traits plus the house).
hogwarts_2 = hogwarts.loc[:, "Manipulative":"House"]
hogwarts_2.head()
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | HouseID | House | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 10 | 10 | 7 | 6 | 7 | 7 | 7 | 9 | 1 | Slytherin |
1 | 7 | 7 | 6 | 8 | 6 | 7 | 6 | 9 | 1 | Slytherin |
2 | 6 | 8 | 1 | 6 | 3 | 3 | 5 | 2 | 1 | Slytherin |
3 | 7 | 7 | 1 | 9 | 7 | 4 | 6 | 6 | 1 | Slytherin |
4 | 10 | 6 | 8 | 3 | 4 | 4 | 1 | 8 | 1 | Slytherin |
hogwarts_2 = hogwarts_2.drop(columns = ["HouseID"])
hogwarts_2.head()
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | House | |
---|---|---|---|---|---|---|---|---|---|
0 | 10 | 10 | 7 | 6 | 7 | 7 | 7 | 9 | Slytherin |
1 | 7 | 7 | 6 | 8 | 6 | 7 | 6 | 9 | Slytherin |
2 | 6 | 8 | 1 | 6 | 3 | 3 | 5 | 2 | Slytherin |
3 | 7 | 7 | 1 | 9 | 7 | 4 | 6 | 6 | Slytherin |
4 | 10 | 6 | 8 | 3 | 4 | 4 | 1 | 8 | Slytherin |
Or specify the columns by name:
hogwarts.loc[:, ["Manipulative", "Resourceful", "Dismissive", "Intelligent",
"Trusting", "Loyal", "Stubborn", "Brave", "House"]]
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | House | |
---|---|---|---|---|---|---|---|---|---|
0 | 10 | 10 | 7 | 6 | 7 | 7 | 7 | 9 | Slytherin |
1 | 7 | 7 | 6 | 8 | 6 | 7 | 6 | 9 | Slytherin |
2 | 6 | 8 | 1 | 6 | 3 | 3 | 5 | 2 | Slytherin |
3 | 7 | 7 | 1 | 9 | 7 | 4 | 6 | 6 | Slytherin |
4 | 10 | 6 | 8 | 3 | 4 | 4 | 1 | 8 | Slytherin |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
729 | 2 | 3 | 3 | 5 | 7 | 8 | 8 | 8 | Gryffindor |
730 | 3 | 3 | 7 | 7 | 4 | 4 | 7 | 8 | Gryffindor |
731 | 6 | 6 | 2 | 8 | 7 | 3 | 8 | 6 | Gryffindor |
732 | 4 | 7 | 2 | 4 | 6 | 7 | 9 | 9 | Gryffindor |
733 | 3 | 7 | 7 | 3 | 8 | 4 | 9 | 6 | Gryffindor |
734 rows × 9 columns
Or use the column positions (iloc):
import numpy as np
hogwarts.iloc[:, np.r_[8:16, 17]]
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | House | |
---|---|---|---|---|---|---|---|---|---|
0 | 10 | 10 | 7 | 6 | 7 | 7 | 7 | 9 | Slytherin |
1 | 7 | 7 | 6 | 8 | 6 | 7 | 6 | 9 | Slytherin |
2 | 6 | 8 | 1 | 6 | 3 | 3 | 5 | 2 | Slytherin |
3 | 7 | 7 | 1 | 9 | 7 | 4 | 6 | 6 | Slytherin |
4 | 10 | 6 | 8 | 3 | 4 | 4 | 1 | 8 | Slytherin |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
729 | 2 | 3 | 3 | 5 | 7 | 8 | 8 | 8 | Gryffindor |
730 | 3 | 3 | 7 | 7 | 4 | 4 | 7 | 8 | Gryffindor |
731 | 6 | 6 | 2 | 8 | 7 | 3 | 8 | 6 | Gryffindor |
732 | 4 | 7 | 2 | 4 | 6 | 7 | 9 | 9 | Gryffindor |
733 | 3 | 7 | 7 | 3 | 8 | 4 | 9 | 6 | Gryffindor |
734 rows × 9 columns
trainData, validData = train_test_split(hogwarts_2, test_size = 0.4, random_state = 666)
trainData.head()
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | House | |
---|---|---|---|---|---|---|---|---|---|
650 | 5 | 5 | 5 | 9 | 9 | 4 | 9 | 8 | Gryffindor |
479 | 3 | 7 | 5 | 3 | 7 | 6 | 5 | 7 | Hufflepuff |
271 | 3 | 8 | 9 | 7 | 3 | 1 | 2 | 3 | Ravenclaw |
647 | 5 | 6 | 2 | 8 | 6 | 8 | 9 | 8 | Gryffindor |
307 | 6 | 7 | 10 | 6 | 7 | 1 | 4 | 9 | Ravenclaw |
validData.head()
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | House | |
---|---|---|---|---|---|---|---|---|---|
389 | 4 | 2 | 4 | 5 | 8 | 6 | 7 | 6 | Hufflepuff |
131 | 7 | 7 | 8 | 9 | 3 | 4 | 5 | 3 | Slytherin |
657 | 7 | 1 | 8 | 5 | 8 | 6 | 8 | 8 | Gryffindor |
421 | 4 | 9 | 2 | 4 | 7 | 8 | 6 | 9 | Hufflepuff |
160 | 6 | 9 | 5 | 2 | 4 | 4 | 1 | 4 | Slytherin |
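If keeping the house proportions similar in both sets matters, train_test_split also accepts a stratify argument; a minimal sketch (the *_s names are just for illustration):
# optional sketch: stratified split keeps the house proportions similar in both sets
trainData_s, validData_s = train_test_split(hogwarts_2, test_size = 0.4,
                                            random_state = 666, stratify = hogwarts_2["House"])
trainData_s["House"].value_counts()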
predictors = ["Manipulative", "Resourceful", "Dismissive", "Intelligent", "Trusting", "Loyal", "Stubborn", "Brave"]
outcome = "House"
Fit the normalisation (standardisation) model on the training set only, so the validation set does not influence the scaling.
scaler = preprocessing.StandardScaler()
scaler.fit(trainData[predictors])
StandardScaler()
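The fitted scaler stores the training means and standard deviations it will apply; a quick sketch to inspect them via the scaler's standard attributes:
# sketch: per-predictor training mean and standard deviation used for scaling
pd.DataFrame({"mean": scaler.mean_, "std": scaler.scale_}, index = predictors)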
Transform the full dataset with the fitted scaler.
hogwarts_2_norm = pd.DataFrame(scaler.transform(hogwarts_2[predictors]),
columns = ["Manipulative", "Resourceful", "Dismissive", "Intelligent", "Trusting", "Loyal", "Stubborn", "Brave"])
hogwarts_2_norm
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
0 | 1.780586 | 1.691429 | 0.494040 | 0.099603 | 0.484763 | 0.511163 | 0.499954 | 1.376814 |
1 | 0.512171 | 0.378084 | 0.064440 | 0.993995 | 0.052115 | 0.511163 | 0.071144 | 1.376814 |
2 | 0.089366 | 0.815866 | -2.083559 | 0.099603 | -1.245832 | -1.115685 | -0.357667 | -1.605314 |
3 | 0.512171 | 0.378084 | -2.083559 | 1.441191 | 0.484763 | -0.708973 | 0.071144 | 0.098759 |
4 | 1.780586 | -0.059698 | 0.923640 | -1.241985 | -0.813183 | -0.708973 | -2.072907 | 0.950795 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
729 | -1.601855 | -1.373043 | -1.224360 | -0.347593 | 0.484763 | 0.917875 | 0.928764 | 0.950795 |
730 | -1.179050 | -1.373043 | 0.494040 | 0.546799 | -0.813183 | -0.708973 | 0.499954 | 0.950795 |
731 | 0.089366 | -0.059698 | -1.653959 | 0.993995 | 0.484763 | -1.115685 | 0.928764 | 0.098759 |
732 | -0.756245 | 0.378084 | -1.653959 | -0.794789 | 0.052115 | 0.511163 | 1.357574 | 1.376814 |
733 | -1.179050 | 0.378084 | 0.494040 | -1.241985 | 0.917412 | -0.708973 | 1.357574 | 0.098759 |
734 rows × 8 columns
# alternatively,
pd.DataFrame(scaler.transform(hogwarts_2[predictors]),
columns = predictors)
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
0 | 1.780586 | 1.691429 | 0.494040 | 0.099603 | 0.484763 | 0.511163 | 0.499954 | 1.376814 |
1 | 0.512171 | 0.378084 | 0.064440 | 0.993995 | 0.052115 | 0.511163 | 0.071144 | 1.376814 |
2 | 0.089366 | 0.815866 | -2.083559 | 0.099603 | -1.245832 | -1.115685 | -0.357667 | -1.605314 |
3 | 0.512171 | 0.378084 | -2.083559 | 1.441191 | 0.484763 | -0.708973 | 0.071144 | 0.098759 |
4 | 1.780586 | -0.059698 | 0.923640 | -1.241985 | -0.813183 | -0.708973 | -2.072907 | 0.950795 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
729 | -1.601855 | -1.373043 | -1.224360 | -0.347593 | 0.484763 | 0.917875 | 0.928764 | 0.950795 |
730 | -1.179050 | -1.373043 | 0.494040 | 0.546799 | -0.813183 | -0.708973 | 0.499954 | 0.950795 |
731 | 0.089366 | -0.059698 | -1.653959 | 0.993995 | 0.484763 | -1.115685 | 0.928764 | 0.098759 |
732 | -0.756245 | 0.378084 | -1.653959 | -0.794789 | 0.052115 | 0.511163 | 1.357574 | 1.376814 |
733 | -1.179050 | 0.378084 | 0.494040 | -1.241985 | 0.917412 | -0.708973 | 1.357574 | 0.098759 |
734 rows × 8 columns
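As a further alternative, scikit-learn 1.2 or later (an assumption about the installed version) can return a DataFrame directly, so the column names never need to be rebuilt:
# sketch: requires scikit-learn >= 1.2
scaler_df = preprocessing.StandardScaler().set_output(transform = "pandas")
scaler_df.fit(trainData[predictors])
scaler_df.transform(hogwarts_2[predictors])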
Merge with the original set to recover the target variable.
hogwarts_2_norm_full = pd.concat([hogwarts_2_norm, hogwarts_2["House"]], axis = 1)
hogwarts_2_norm_full
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | House | |
---|---|---|---|---|---|---|---|---|---|
0 | 1.780586 | 1.691429 | 0.494040 | 0.099603 | 0.484763 | 0.511163 | 0.499954 | 1.376814 | Slytherin |
1 | 0.512171 | 0.378084 | 0.064440 | 0.993995 | 0.052115 | 0.511163 | 0.071144 | 1.376814 | Slytherin |
2 | 0.089366 | 0.815866 | -2.083559 | 0.099603 | -1.245832 | -1.115685 | -0.357667 | -1.605314 | Slytherin |
3 | 0.512171 | 0.378084 | -2.083559 | 1.441191 | 0.484763 | -0.708973 | 0.071144 | 0.098759 | Slytherin |
4 | 1.780586 | -0.059698 | 0.923640 | -1.241985 | -0.813183 | -0.708973 | -2.072907 | 0.950795 | Slytherin |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
729 | -1.601855 | -1.373043 | -1.224360 | -0.347593 | 0.484763 | 0.917875 | 0.928764 | 0.950795 | Gryffindor |
730 | -1.179050 | -1.373043 | 0.494040 | 0.546799 | -0.813183 | -0.708973 | 0.499954 | 0.950795 | Gryffindor |
731 | 0.089366 | -0.059698 | -1.653959 | 0.993995 | 0.484763 | -1.115685 | 0.928764 | 0.098759 | Gryffindor |
732 | -0.756245 | 0.378084 | -1.653959 | -0.794789 | 0.052115 | 0.511163 | 1.357574 | 1.376814 | Gryffindor |
733 | -1.179050 | 0.378084 | 0.494040 | -1.241985 | 0.917412 | -0.708973 | 1.357574 | 0.098759 | Gryffindor |
734 rows × 9 columns
Split the normalised set using the indices of the original split
trainNorm = hogwarts_2_norm_full.iloc[trainData.index]
trainNorm.head()
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | House | |
---|---|---|---|---|---|---|---|---|---|
650 | -0.333440 | -0.497479 | -0.365160 | 1.441191 | 1.350061 | -0.708973 | 1.357574 | 0.950795 | Gryffindor |
479 | -1.179050 | 0.378084 | -0.365160 | -1.241985 | 0.484763 | 0.104451 | -0.357667 | 0.524777 | Hufflepuff |
271 | -1.179050 | 0.815866 | 1.353239 | 0.546799 | -1.245832 | -1.929110 | -1.644097 | -1.179296 | Ravenclaw |
647 | -0.333440 | -0.059698 | -1.653959 | 0.993995 | 0.052115 | 0.917875 | 1.357574 | 0.950795 | Gryffindor |
307 | 0.089366 | 0.378084 | 1.782839 | 0.099603 | 0.484763 | -1.929110 | -0.786477 | 1.376814 | Ravenclaw |
validNorm = hogwarts_2_norm_full.iloc[validData.index]
validNorm.head()
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | House | |
---|---|---|---|---|---|---|---|---|---|
389 | -0.756245 | -1.810824 | -0.794760 | -0.347593 | 0.917412 | 0.104451 | 0.499954 | 0.098759 | Hufflepuff |
131 | 0.512171 | 0.378084 | 0.923640 | 1.441191 | -1.245832 | -0.708973 | -0.357667 | -1.179296 | Slytherin |
657 | 0.512171 | -2.248606 | 0.923640 | -0.347593 | 0.917412 | 0.104451 | 0.928764 | 0.950795 | Gryffindor |
421 | -0.756245 | 1.253648 | -1.653959 | -0.794789 | 0.484763 | 0.917875 | 0.071144 | 1.376814 | Hufflepuff |
160 | 0.089366 | 1.253648 | -0.365160 | -1.689181 | -0.813183 | -0.708973 | -2.072907 | -0.753278 | Slytherin |
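Equivalently, the fitted scaler could be applied to trainData and validData directly, which avoids transforming the full set and re-splitting it; a minimal sketch for the training half (the _alt name is just for illustration):
# sketch: transform each split separately with the scaler fitted on the training set
trainNorm_alt = pd.DataFrame(scaler.transform(trainData[predictors]),
                             columns = predictors, index = trainData.index)
trainNorm_alt["House"] = trainData["House"]
trainNorm_alt.head()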
train_X = trainNorm.loc[:, "Manipulative":"Brave"]
train_X
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
650 | -0.333440 | -0.497479 | -0.365160 | 1.441191 | 1.350061 | -0.708973 | 1.357574 | 0.950795 |
479 | -1.179050 | 0.378084 | -0.365160 | -1.241985 | 0.484763 | 0.104451 | -0.357667 | 0.524777 |
271 | -1.179050 | 0.815866 | 1.353239 | 0.546799 | -1.245832 | -1.929110 | -1.644097 | -1.179296 |
647 | -0.333440 | -0.059698 | -1.653959 | 0.993995 | 0.052115 | 0.917875 | 1.357574 | 0.950795 |
307 | 0.089366 | 0.378084 | 1.782839 | 0.099603 | 0.484763 | -1.929110 | -0.786477 | 1.376814 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
445 | 0.089366 | 1.253648 | -1.224360 | -0.794789 | 0.917412 | 0.917875 | -0.357667 | 1.376814 |
414 | 0.934976 | -0.497479 | 0.064440 | 0.099603 | 0.052115 | 0.917875 | -1.215287 | -1.179296 |
70 | 1.357781 | 0.378084 | -2.083559 | 1.441191 | -0.813183 | 0.917875 | -0.357667 | 0.950795 |
429 | 1.357781 | -0.059698 | 1.353239 | -1.241985 | 0.484763 | 0.511163 | 0.928764 | 0.524777 |
236 | -0.333440 | 0.378084 | 0.923640 | 0.993995 | 0.917412 | -0.708973 | -1.215287 | -1.605314 |
440 rows × 8 columns
# alternatively
trainNorm.loc[:, predictors]
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
650 | -0.333440 | -0.497479 | -0.365160 | 1.441191 | 1.350061 | -0.708973 | 1.357574 | 0.950795 |
479 | -1.179050 | 0.378084 | -0.365160 | -1.241985 | 0.484763 | 0.104451 | -0.357667 | 0.524777 |
271 | -1.179050 | 0.815866 | 1.353239 | 0.546799 | -1.245832 | -1.929110 | -1.644097 | -1.179296 |
647 | -0.333440 | -0.059698 | -1.653959 | 0.993995 | 0.052115 | 0.917875 | 1.357574 | 0.950795 |
307 | 0.089366 | 0.378084 | 1.782839 | 0.099603 | 0.484763 | -1.929110 | -0.786477 | 1.376814 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
445 | 0.089366 | 1.253648 | -1.224360 | -0.794789 | 0.917412 | 0.917875 | -0.357667 | 1.376814 |
414 | 0.934976 | -0.497479 | 0.064440 | 0.099603 | 0.052115 | 0.917875 | -1.215287 | -1.179296 |
70 | 1.357781 | 0.378084 | -2.083559 | 1.441191 | -0.813183 | 0.917875 | -0.357667 | 0.950795 |
429 | 1.357781 | -0.059698 | 1.353239 | -1.241985 | 0.484763 | 0.511163 | 0.928764 | 0.524777 |
236 | -0.333440 | 0.378084 | 0.923640 | 0.993995 | 0.917412 | -0.708973 | -1.215287 | -1.605314 |
440 rows × 8 columns
train_y = trainNorm["House"]
train_y
650    Gryffindor
479    Hufflepuff
271     Ravenclaw
647    Gryffindor
307     Ravenclaw
          ...
445    Hufflepuff
414    Hufflepuff
70      Slytherin
429    Hufflepuff
236     Ravenclaw
Name: House, Length: 440, dtype: object
valid_X = validNorm.loc[:, "Manipulative":"Brave"]
valid_X
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
389 | -0.756245 | -1.810824 | -0.794760 | -0.347593 | 0.917412 | 0.104451 | 0.499954 | 0.098759 |
131 | 0.512171 | 0.378084 | 0.923640 | 1.441191 | -1.245832 | -0.708973 | -0.357667 | -1.179296 |
657 | 0.512171 | -2.248606 | 0.923640 | -0.347593 | 0.917412 | 0.104451 | 0.928764 | 0.950795 |
421 | -0.756245 | 1.253648 | -1.653959 | -0.794789 | 0.484763 | 0.917875 | 0.071144 | 1.376814 |
160 | 0.089366 | 1.253648 | -0.365160 | -1.689181 | -0.813183 | -0.708973 | -2.072907 | -0.753278 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
49 | 0.089366 | 0.378084 | -1.224360 | -1.241985 | 0.484763 | -0.302261 | -1.215287 | -1.605314 |
53 | 1.357781 | 0.815866 | 0.064440 | -1.689181 | -0.813183 | -1.929110 | -0.357667 | -0.753278 |
569 | 0.934976 | -1.373043 | 0.064440 | -1.689181 | 1.350061 | 1.324588 | 0.071144 | 0.950795 |
700 | -2.024660 | -0.059698 | -0.365160 | -0.347593 | 1.350061 | -1.522398 | 0.499954 | 1.376814 |
312 | 0.089366 | -0.935261 | 1.353239 | 0.546799 | -1.245832 | 0.104451 | -0.357667 | -0.753278 |
294 rows × 8 columns
# alternatively
validNorm.loc[:, predictors]
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
389 | -0.756245 | -1.810824 | -0.794760 | -0.347593 | 0.917412 | 0.104451 | 0.499954 | 0.098759 |
131 | 0.512171 | 0.378084 | 0.923640 | 1.441191 | -1.245832 | -0.708973 | -0.357667 | -1.179296 |
657 | 0.512171 | -2.248606 | 0.923640 | -0.347593 | 0.917412 | 0.104451 | 0.928764 | 0.950795 |
421 | -0.756245 | 1.253648 | -1.653959 | -0.794789 | 0.484763 | 0.917875 | 0.071144 | 1.376814 |
160 | 0.089366 | 1.253648 | -0.365160 | -1.689181 | -0.813183 | -0.708973 | -2.072907 | -0.753278 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
49 | 0.089366 | 0.378084 | -1.224360 | -1.241985 | 0.484763 | -0.302261 | -1.215287 | -1.605314 |
53 | 1.357781 | 0.815866 | 0.064440 | -1.689181 | -0.813183 | -1.929110 | -0.357667 | -0.753278 |
569 | 0.934976 | -1.373043 | 0.064440 | -1.689181 | 1.350061 | 1.324588 | 0.071144 | 0.950795 |
700 | -2.024660 | -0.059698 | -0.365160 | -0.347593 | 1.350061 | -1.522398 | 0.499954 | 1.376814 |
312 | 0.089366 | -0.935261 | 1.353239 | 0.546799 | -1.245832 | 0.104451 | -0.357667 | -0.753278 |
294 rows × 8 columns
valid_y = validNorm["House"]
valid_y
389    Hufflepuff
131     Slytherin
657    Gryffindor
421    Hufflepuff
160     Slytherin
          ...
49      Slytherin
53      Slytherin
569    Gryffindor
700    Gryffindor
312     Ravenclaw
Name: House, Length: 294, dtype: object
from sklearn.neighbors import KNeighborsClassifier
%%capture --no-display
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(train_X, train_y)
KNeighborsClassifier(n_neighbors=3)
Get training set predictions.
%%capture --no-display
train_y_pred = knn.predict(train_X)
train_y_pred[:20]
array(['Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Gryffindor', 'Ravenclaw',
       'Hufflepuff', 'Gryffindor', 'Slytherin', 'Hufflepuff', 'Slytherin',
       'Ravenclaw', 'Slytherin', 'Hufflepuff', 'Gryffindor', 'Hufflepuff',
       'Hufflepuff', 'Hufflepuff', 'Gryffindor', 'Gryffindor', 'Gryffindor'],
      dtype=object)
Get validation set predictions.
# To remove the warning, add
# %%capture --no-display
valid_y_pred = knn.predict(valid_X)
valid_y_pred[:20]
C:\Users\byeo\Anaconda3\lib\site-packages\sklearn\neighbors\_classification.py:228: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
array(['Gryffindor', 'Ravenclaw', 'Gryffindor', 'Gryffindor', 'Slytherin',
       'Slytherin', 'Slytherin', 'Hufflepuff', 'Gryffindor', 'Hufflepuff',
       'Slytherin', 'Hufflepuff', 'Ravenclaw', 'Gryffindor', 'Hufflepuff',
       'Hufflepuff', 'Gryffindor', 'Slytherin', 'Ravenclaw', 'Slytherin'],
      dtype=object)
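Alternatively, the SciPy FutureWarning can be silenced once for the whole notebook with the standard warnings module instead of adding %%capture to each cell; a minimal sketch:
# sketch: suppress FutureWarning globally rather than per cell
import warnings
warnings.filterwarnings("ignore", category = FutureWarning)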
from sklearn.metrics import confusion_matrix, accuracy_score
Training set. Confusion matrix.
confusion_matrix_train = confusion_matrix(train_y, train_y_pred)
confusion_matrix_train
array([[103,   1,   2,   4],
       [  2, 105,   4,   2],
       [  4,   7,  81,   5],
       [  1,   7,   5, 107]], dtype=int64)
A confusion matrix that's easier to read
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
confusion_matrix_train_display = ConfusionMatrixDisplay(confusion_matrix_train, display_labels = knn.classes_)
confusion_matrix_train_display.plot()
plt.grid(False)
accuracy_score(train_y, train_y_pred)
0.9
from sklearn.metrics import classification_report
print(classification_report(train_y, train_y_pred))
              precision    recall  f1-score   support

  Gryffindor       0.94      0.94      0.94       110
  Hufflepuff       0.88      0.93      0.90       113
   Ravenclaw       0.88      0.84      0.86        97
   Slytherin       0.91      0.89      0.90       120

    accuracy                           0.90       440
   macro avg       0.90      0.90      0.90       440
weighted avg       0.90      0.90      0.90       440
import sklearn
training_report_1 = sklearn.metrics.classification_report(train_y, train_y_pred, output_dict=True)
training_report_1
{'Gryffindor': {'precision': 0.9363636363636364, 'recall': 0.9363636363636364, 'f1-score': 0.9363636363636364, 'support': 110},
 'Hufflepuff': {'precision': 0.875, 'recall': 0.9292035398230089, 'f1-score': 0.9012875536480688, 'support': 113},
 'Ravenclaw': {'precision': 0.8804347826086957, 'recall': 0.8350515463917526, 'f1-score': 0.8571428571428571, 'support': 97},
 'Slytherin': {'precision': 0.9067796610169492, 'recall': 0.8916666666666667, 'f1-score': 0.8991596638655461, 'support': 120},
 'accuracy': 0.9,
 'macro avg': {'precision': 0.8996445199973204, 'recall': 0.8980713473112661, 'f1-score': 0.8984884277550271, 'support': 440},
 'weighted avg': {'precision': 0.9002062118979032, 'recall': 0.9, 'f1-score': 0.8997443417476237, 'support': 440}}
training_report_1_df = pd.DataFrame.from_dict(training_report_1)
training_report_1_df
# to export
# training_report_1_df.to_csv("training_report_1_df.csv")
Gryffindor | Hufflepuff | Ravenclaw | Slytherin | accuracy | macro avg | weighted avg | |
---|---|---|---|---|---|---|---|
precision | 0.936364 | 0.875000 | 0.880435 | 0.906780 | 0.9 | 0.899645 | 0.900206 |
recall | 0.936364 | 0.929204 | 0.835052 | 0.891667 | 0.9 | 0.898071 | 0.900000 |
f1-score | 0.936364 | 0.901288 | 0.857143 | 0.899160 | 0.9 | 0.898488 | 0.899744 |
support | 110.000000 | 113.000000 | 97.000000 | 120.000000 | 0.9 | 440.000000 | 440.000000 |
Validation set. Confusion matrix.
confusion_matrix_valid = confusion_matrix(valid_y, valid_y_pred)
confusion_matrix_valid
array([[63,  6,  4,  5],
       [10, 65,  0,  1],
       [ 5,  3, 48,  3],
       [ 7,  9,  4, 61]], dtype=int64)
A confusion matrix that's easier to read
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt
confusion_matrix_valid_display = ConfusionMatrixDisplay(confusion_matrix_valid, display_labels = knn.classes_)
confusion_matrix_valid_display.plot()
plt.grid(False)
accuracy_score(valid_y, valid_y_pred)
0.8061224489795918
print(classification_report(valid_y, valid_y_pred))
              precision    recall  f1-score   support

  Gryffindor       0.74      0.81      0.77        78
  Hufflepuff       0.78      0.86      0.82        76
   Ravenclaw       0.86      0.81      0.83        59
   Slytherin       0.87      0.75      0.81        81

    accuracy                           0.81       294
   macro avg       0.81      0.81      0.81       294
weighted avg       0.81      0.81      0.81       294
# if not already:
# import sklearn
valid_report_1 = sklearn.metrics.classification_report(valid_y, valid_y_pred, output_dict=True)
valid_report_1
{'Gryffindor': {'precision': 0.7411764705882353, 'recall': 0.8076923076923077, 'f1-score': 0.7730061349693251, 'support': 78},
 'Hufflepuff': {'precision': 0.7831325301204819, 'recall': 0.8552631578947368, 'f1-score': 0.8176100628930818, 'support': 76},
 'Ravenclaw': {'precision': 0.8571428571428571, 'recall': 0.8135593220338984, 'f1-score': 0.8347826086956522, 'support': 59},
 'Slytherin': {'precision': 0.8714285714285714, 'recall': 0.7530864197530864, 'f1-score': 0.8079470198675497, 'support': 81},
 'accuracy': 0.8061224489795918,
 'macro avg': {'precision': 0.8132201073200365, 'recall': 0.8074003018435074, 'f1-score': 0.8083364566064022, 'support': 294},
 'weighted avg': {'precision': 0.8111802035788498, 'recall': 0.8061224489795918, 'f1-score': 0.8065602919380835, 'support': 294}}
valid_report_1_df = pd.DataFrame.from_dict(valid_report_1)
valid_report_1_df
# to export
# valid_report_1_df.to_csv("valid_report_1_df.csv")
Gryffindor | Hufflepuff | Ravenclaw | Slytherin | accuracy | macro avg | weighted avg | |
---|---|---|---|---|---|---|---|
precision | 0.741176 | 0.783133 | 0.857143 | 0.871429 | 0.806122 | 0.813220 | 0.811180 |
recall | 0.807692 | 0.855263 | 0.813559 | 0.753086 | 0.806122 | 0.807400 | 0.806122 |
f1-score | 0.773006 | 0.817610 | 0.834783 | 0.807947 | 0.806122 | 0.808336 | 0.806560 |
support | 78.000000 | 76.000000 | 59.000000 | 81.000000 | 0.806122 | 294.000000 | 294.000000 |
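Since k = 3 was an arbitrary choice, it can be worth comparing validation accuracy across a few values of k before settling on one; a minimal sketch (the range of k values is arbitrary):
# sketch: validation accuracy for odd k from 1 to 15
for k in range(1, 16, 2):
    knn_k = KNeighborsClassifier(n_neighbors = k).fit(train_X, train_y)
    print(k, round(accuracy_score(valid_y, knn_k.predict(valid_X)), 3))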
Create a new data frame containing a single new observation.
data = {"Manipulative": 8, "Resourceful": 9, "Dismissive" : 8, "Intelligent": 6, "Trusting": 6, "Loyal": 8,
"Stubborn": 6, "Brave": 6}
new_padawan = pd.DataFrame(data, index = [0])
new_padawan
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
0 | 8 | 9 | 8 | 6 | 6 | 8 | 6 | 6 |
# Alternatively, use a list
pd.DataFrame([data])
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
0 | 8 | 9 | 8 | 6 | 6 | 8 | 6 | 6 |
Normalise the new observation with the same fitted scaler.
new_padawan_norm = pd.DataFrame(scaler.transform(new_padawan[predictors]),
columns = ["Manipulative", "Resourceful", "Dismissive", "Intelligent", "Trusting", "Loyal", "Stubborn", "Brave"])
new_padawan_norm
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
0 | 0.934976 | 1.253648 | 0.92364 | 0.099603 | 0.052115 | 0.917875 | 0.071144 | 0.098759 |
# alternatively
pd.DataFrame(scaler.transform(new_padawan),
columns = predictors)
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
0 | 0.934976 | 1.253648 | 0.92364 | 0.099603 | 0.052115 | 0.917875 | 0.071144 | 0.098759 |
new_padawan_pred = knn.predict(new_padawan_norm)
new_padawan_pred
array(['Slytherin'], dtype=object)
new_padawan_pred_prob = knn.predict_proba(new_padawan_norm)
new_padawan_pred_prob
array([[0. , 0. , 0.33333333, 0.66666667]])
# Use the classes
pd.DataFrame(new_padawan_pred_prob,
columns = knn.classes_)
Gryffindor | Hufflepuff | Ravenclaw | Slytherin | |
---|---|---|---|---|
0 | 0.0 | 0.0 | 0.333333 | 0.666667 |
# Or specify the columns
pd.DataFrame(new_padawan_pred_prob,
columns = ["Gryffindor", "Hufflepuff", "Ravenclaw", "Slytherin"])
Gryffindor | Hufflepuff | Ravenclaw | Slytherin | |
---|---|---|---|---|
0 | 0.0 | 0.0 | 0.333333 | 0.666667 |
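The 1/3 vs 2/3 probabilities come straight from the three nearest training neighbours; the standard kneighbors method shows which training rows those are. A minimal sketch, noting that the returned positions index the training set in the order it was fitted:
# sketch: distances to, and training-set positions of, the 3 nearest neighbours
distances, positions = knn.kneighbors(new_padawan_norm)
trainNorm.iloc[positions[0]]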
train_X2 = train_X.copy() # for consistency
valid_X2 = valid_X.copy() # for consistency
train_y2 = train_y.copy()
valid_y2 = valid_y.copy()
train_X2.head()
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
650 | -0.333440 | -0.497479 | -0.365160 | 1.441191 | 1.350061 | -0.708973 | 1.357574 | 0.950795 |
479 | -1.179050 | 0.378084 | -0.365160 | -1.241985 | 0.484763 | 0.104451 | -0.357667 | 0.524777 |
271 | -1.179050 | 0.815866 | 1.353239 | 0.546799 | -1.245832 | -1.929110 | -1.644097 | -1.179296 |
647 | -0.333440 | -0.059698 | -1.653959 | 0.993995 | 0.052115 | 0.917875 | 1.357574 | 0.950795 |
307 | 0.089366 | 0.378084 | 1.782839 | 0.099603 | 0.484763 | -1.929110 | -0.786477 | 1.376814 |
valid_X2.head()
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
389 | -0.756245 | -1.810824 | -0.794760 | -0.347593 | 0.917412 | 0.104451 | 0.499954 | 0.098759 |
131 | 0.512171 | 0.378084 | 0.923640 | 1.441191 | -1.245832 | -0.708973 | -0.357667 | -1.179296 |
657 | 0.512171 | -2.248606 | 0.923640 | -0.347593 | 0.917412 | 0.104451 | 0.928764 | 0.950795 |
421 | -0.756245 | 1.253648 | -1.653959 | -0.794789 | 0.484763 | 0.917875 | 0.071144 | 1.376814 |
160 | 0.089366 | 1.253648 | -0.365160 | -1.689181 | -0.813183 | -0.708973 | -2.072907 | -0.753278 |
train_y2.value_counts()
Slytherin     120
Hufflepuff    113
Gryffindor    110
Ravenclaw      97
Name: House, dtype: int64
valid_y2.value_counts()
Slytherin     81
Gryffindor    78
Hufflepuff    76
Ravenclaw     59
Name: House, dtype: int64
House_values = ["Gryffindor", "Hufflepuff", "Ravenclaw", "Slytherin" ]
House_2_values = ["0", "0", "0", "1"]
train_y2 = train_y.replace(House_values, House_2_values)
train_y2
650    0
479    0
271    0
647    0
307    0
      ..
445    0
414    0
70     1
429    0
236    0
Name: House, Length: 440, dtype: object
train_y2.head()
650    0
479    0
271    0
647    0
307    0
Name: House, dtype: object
train_y2.value_counts()
0    320
1    120
Name: House, dtype: int64
valid_y2 = valid_y.replace(House_values, House_2_values)
valid_y2
389    0
131    1
657    0
421    0
160    1
      ..
49     1
53     1
569    0
700    0
312    0
Name: House, Length: 294, dtype: object
valid_y2.head()
389    0
131    1
657    0
421    0
160    1
Name: House, dtype: object
valid_y2.value_counts()
0    213
1     81
Name: House, dtype: int64
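The same recode can be written as a boolean comparison, which avoids keeping the two lists in sync by hand; a minimal sketch producing the same "0"/"1" string labels (the _alt name is just for illustration):
# sketch: "1" for Slytherin, "0" otherwise, as strings to match the replace() version
train_y2_alt = (train_y == "Slytherin").astype(int).astype(str)
train_y2_alt.value_counts()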
knn2 = KNeighborsClassifier(n_neighbors = 7)
knn2.fit(train_X2, train_y2)
accuracy_score(train_y2, knn2.predict(train_X2))
0.9318181818181818
Get training set predictions.
train_y2_pred = knn2.predict(train_X2)
train_y2_pred[:20]
array(['0', '0', '0', '0', '0', '0', '0', '1', '0', '1', '0', '1', '1',
       '0', '0', '0', '0', '0', '0', '0'], dtype=object)
train_y2_pred_prob2 = knn2.predict_proba(train_X2)
train_y2_pred_prob2[:20]
array([[1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.71428571, 0.28571429],
       [1.        , 0.        ],
       [0.71428571, 0.28571429],
       [0.        , 1.        ],
       [0.85714286, 0.14285714],
       [0.28571429, 0.71428571],
       [1.        , 0.        ],
       [0.14285714, 0.85714286],
       [0.42857143, 0.57142857],
       [0.85714286, 0.14285714],
       [1.        , 0.        ],
       [0.57142857, 0.42857143],
       [0.85714286, 0.14285714],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.85714286, 0.14285714]])
# alternatively, view the training probabilities with the class labels as columns
pd.DataFrame(train_y2_pred_prob2, columns = knn2.classes_)
0 | 1 | |
---|---|---|
0 | 1.000000 | 0.000000 |
1 | 1.000000 | 0.000000 |
2 | 1.000000 | 0.000000 |
3 | 1.000000 | 0.000000 |
4 | 0.714286 | 0.285714 |
... | ... | ... |
435 | 0.857143 | 0.142857 |
436 | 0.857143 | 0.142857 |
437 | 0.142857 | 0.857143 |
438 | 0.714286 | 0.285714 |
439 | 1.000000 | 0.000000 |
440 rows × 2 columns
Get validation set predictions.
valid_y2_pred = knn2.predict(valid_X2)
valid_y2_pred[:20]
array(['0', '0', '0', '0', '1', '1', '1', '0', '0', '0', '1', '0', '0',
       '0', '0', '0', '0', '1', '0', '1'], dtype=object)
valid_y2_pred_prob2 = knn2.predict_proba(valid_X2)
valid_y2_pred_prob2[:6]
array([[1.        , 0.        ],
       [0.85714286, 0.14285714],
       [1.        , 0.        ],
       [0.85714286, 0.14285714],
       [0.        , 1.        ],
       [0.14285714, 0.85714286]])
# alternatively
pd.DataFrame(valid_y2_pred_prob2, columns = knn2.classes_)
0 | 1 | |
---|---|---|
0 | 1.000000 | 0.000000 |
1 | 0.857143 | 0.142857 |
2 | 1.000000 | 0.000000 |
3 | 0.857143 | 0.142857 |
4 | 0.000000 | 1.000000 |
... | ... | ... |
289 | 0.428571 | 0.571429 |
290 | 0.000000 | 1.000000 |
291 | 1.000000 | 0.000000 |
292 | 1.000000 | 0.000000 |
293 | 0.857143 | 0.142857 |
294 rows × 2 columns
Training set.
confusion_matrix_train2 = confusion_matrix(train_y2, train_y2_pred)
confusion_matrix_train2
array([[309,  11],
       [ 19, 101]], dtype=int64)
A confusion matrix that's easier to read
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt
confusion_matrix_train2_display = ConfusionMatrixDisplay(confusion_matrix_train2, display_labels = knn2.classes_)
confusion_matrix_train2_display.plot()
plt.grid(False)
accuracy_score(train_y2, train_y2_pred)
0.9318181818181818
print(classification_report(train_y2, train_y2_pred))
              precision    recall  f1-score   support

           0       0.94      0.97      0.95       320
           1       0.90      0.84      0.87       120

    accuracy                           0.93       440
   macro avg       0.92      0.90      0.91       440
weighted avg       0.93      0.93      0.93       440
Validation set.
confusion_matrix_valid2 = confusion_matrix(valid_y2, valid_y2_pred)
confusion_matrix_valid2
array([[203,  10],
       [ 15,  66]], dtype=int64)
A confusion matrix that's easier to read
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt
confusion_matrix_valid2_display = ConfusionMatrixDisplay(confusion_matrix_valid2, display_labels = knn2.classes_)
confusion_matrix_valid2_display.plot()
plt.grid(False)
accuracy_score(valid_y2, valid_y2_pred)
0.9149659863945578
print(classification_report(valid_y2, valid_y2_pred))
              precision    recall  f1-score   support

           0       0.93      0.95      0.94       213
           1       0.87      0.81      0.84        81

    accuracy                           0.91       294
   macro avg       0.90      0.88      0.89       294
weighted avg       0.91      0.91      0.91       294
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
Check the probabilities again: column 2 (index 1) holds the probability of class "1", i.e. Slytherin.
# check again
pd.DataFrame(valid_y2_pred_prob2, columns = knn2.classes_)
0 | 1 | |
---|---|---|
0 | 1.000000 | 0.000000 |
1 | 0.857143 | 0.142857 |
2 | 1.000000 | 0.000000 |
3 | 0.857143 | 0.142857 |
4 | 0.000000 | 1.000000 |
... | ... | ... |
289 | 0.428571 | 0.571429 |
290 | 0.000000 | 1.000000 |
291 | 1.000000 | 0.000000 |
292 | 1.000000 | 0.000000 |
293 | 0.857143 | 0.142857 |
294 rows × 2 columns
# Or specify the columns
pd.DataFrame(valid_y2_pred_prob2,
columns = ["Not Slytherin", "Slytherin"])
Not Slytherin | Slytherin | |
---|---|---|
0 | 1.000000 | 0.000000 |
1 | 0.857143 | 0.142857 |
2 | 1.000000 | 0.000000 |
3 | 0.857143 | 0.142857 |
4 | 0.000000 | 1.000000 |
... | ... | ... |
289 | 0.428571 | 0.571429 |
290 | 0.000000 | 1.000000 |
291 | 1.000000 | 0.000000 |
292 | 1.000000 | 0.000000 |
293 | 0.857143 | 0.142857 |
294 rows × 2 columns
Set the positive class as "Slytherin" (i.e. "1")
fpr, tpr, threshold = metrics.roc_curve(valid_y2, valid_y2_pred_prob2[:,1], pos_label = "1")
import matplotlib.pyplot as plt
plt.style.use("seaborn")
plt.plot(fpr, tpr, linestyle = "-", color = "green", label = "Slytherin kNN")
# roc curve for tpr = fpr (random line)
random_probs = [0 for i in range(len(valid_y2))]
p_fpr, p_tpr, _ = roc_curve(valid_y2, random_probs, pos_label = "1")
plt.plot(p_fpr, p_tpr, linestyle = "--", color = "black", label = "Random Force")
# If desired
plt.legend()
plt.title("Sorting Hat ROC")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# to save the plot
# plt.savefig("whatever_name",dpi = 300)
Text(0, 0.5, 'True Positive Rate')
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(valid_y2, valid_y2_pred_prob2[:,1])
auc
0.9729322436677679
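For reference, scikit-learn 1.0 or later (an assumption about the installed version) can draw the ROC curve and report the AUC in one call; a minimal sketch:
# sketch: requires scikit-learn >= 1.0
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_predictions(valid_y2, valid_y2_pred_prob2[:, 1], pos_label = "1")
plt.grid(False)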
new_padawan_pred2 = knn2.predict(new_padawan_norm)
new_padawan_pred2
array(['1'], dtype=object)
new_padawan_pred_prob2 = knn2.predict_proba(new_padawan_norm)
new_padawan_pred_prob2
array([[0.14285714, 0.85714286]])
pd.DataFrame(new_padawan_pred_prob2,
columns = knn2.classes_)
0 | 1 | |
---|---|---|
0 | 0.142857 | 0.857143 |
Still a Slytherin, you are :-)