import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
hogwarts = pd.read_csv("super_heroes_hogwarts_v3a.csv")
hogwarts.head()
ID | Name | Gender | Race | Height | Publisher | Alignment | Weight | Manipulative | Resourceful | ... | HouseID | House | STR | DEX | CON | INT | WIS | CHA | Level | HP | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A001 | A-Bomb | Male | Human | 203.0 | Marvel Comics | good | 441.0 | 10 | 10 | ... | 1 | Slytherin | 18 | 11 | 17 | 12 | 13 | 11 | 1 | 7 |
1 | A002 | Abe Sapien | Male | Icthyo Sapien | 191.0 | Dark Horse Comics | good | 65.0 | 7 | 7 | ... | 1 | Slytherin | 16 | 17 | 10 | 13 | 15 | 11 | 8 | 72 |
2 | A004 | Abomination | Male | Human / Radiation | 203.0 | Marvel Comics | bad | 441.0 | 6 | 8 | ... | 1 | Slytherin | 13 | 14 | 13 | 10 | 18 | 15 | 15 | 135 |
3 | A009 | Agent 13 | Female | NaN | 173.0 | Marvel Comics | good | 61.0 | 7 | 7 | ... | 1 | Slytherin | 15 | 18 | 16 | 16 | 17 | 10 | 14 | 140 |
4 | A015 | Alex Mercer | Male | Human | NaN | Wildstorm | bad | NaN | 10 | 6 | ... | 1 | Slytherin | 14 | 17 | 13 | 12 | 10 | 11 | 9 | 72 |
5 rows × 26 columns
hogwarts.columns.values.tolist()
['ID', 'Name', 'Gender', 'Race', 'Height', 'Publisher', 'Alignment', 'Weight', 'Manipulative', 'Resourceful', 'Dismissive', 'Intelligent', 'Trusting', 'Loyal', 'Stubborn', 'Brave', 'HouseID', 'House', 'STR', 'DEX', 'CON', 'INT', 'WIS', 'CHA', 'Level', 'HP']
pd.DataFrame(hogwarts.columns.values, columns = ["Variables"])
Variables | |
---|---|
0 | ID |
1 | Name |
2 | Gender |
3 | Race |
4 | Height |
5 | Publisher |
6 | Alignment |
7 | Weight |
8 | Manipulative |
9 | Resourceful |
10 | Dismissive |
11 | Intelligent |
12 | Trusting |
13 | Loyal |
14 | Stubborn |
15 | Brave |
16 | HouseID |
17 | House |
18 | STR |
19 | DEX |
20 | CON |
21 | INT |
22 | WIS |
23 | CHA |
24 | Level |
25 | HP |
Keep only the required variables: the eight trait scores and the House label.
# Label-based column slice: .loc includes both endpoints, so this keeps every
# column from "Manipulative" through "House" (HouseID is still included here).
hogwarts_2 = hogwarts.loc[:, "Manipulative":"House"]
hogwarts_2.head()
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | HouseID | House | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 10 | 10 | 7 | 6 | 7 | 7 | 7 | 9 | 1 | Slytherin |
1 | 7 | 7 | 6 | 8 | 6 | 7 | 6 | 9 | 1 | Slytherin |
2 | 6 | 8 | 1 | 6 | 3 | 3 | 5 | 2 | 1 | Slytherin |
3 | 7 | 7 | 1 | 9 | 7 | 4 | 6 | 6 | 1 | Slytherin |
4 | 10 | 6 | 8 | 3 | 4 | 4 | 1 | 8 | 1 | Slytherin |
# Remove HouseID so that only the eight trait columns and the House label remain.
hogwarts_2 = hogwarts_2.drop(columns = ["HouseID"])
hogwarts_2.head()
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | House | |
---|---|---|---|---|---|---|---|---|---|
0 | 10 | 10 | 7 | 6 | 7 | 7 | 7 | 9 | Slytherin |
1 | 7 | 7 | 6 | 8 | 6 | 7 | 6 | 9 | Slytherin |
2 | 6 | 8 | 1 | 6 | 3 | 3 | 5 | 2 | Slytherin |
3 | 7 | 7 | 1 | 9 | 7 | 4 | 6 | 6 | Slytherin |
4 | 10 | 6 | 8 | 3 | 4 | 4 | 1 | 8 | Slytherin |
Use `loc` with a label slice, as above.
Or specify the columns explicitly:
hogwarts.loc[:, ["Manipulative", "Resourceful", "Dismissive", "Intelligent",
"Trusting", "Loyal", "Stubborn", "Brave", "House"]]
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | House | |
---|---|---|---|---|---|---|---|---|---|
0 | 10 | 10 | 7 | 6 | 7 | 7 | 7 | 9 | Slytherin |
1 | 7 | 7 | 6 | 8 | 6 | 7 | 6 | 9 | Slytherin |
2 | 6 | 8 | 1 | 6 | 3 | 3 | 5 | 2 | Slytherin |
3 | 7 | 7 | 1 | 9 | 7 | 4 | 6 | 6 | Slytherin |
4 | 10 | 6 | 8 | 3 | 4 | 4 | 1 | 8 | Slytherin |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
729 | 2 | 3 | 3 | 5 | 7 | 8 | 8 | 8 | Gryffindor |
730 | 3 | 3 | 7 | 7 | 4 | 4 | 7 | 8 | Gryffindor |
731 | 6 | 6 | 2 | 8 | 7 | 3 | 8 | 6 | Gryffindor |
732 | 4 | 7 | 2 | 4 | 6 | 7 | 9 | 9 | Gryffindor |
733 | 3 | 7 | 7 | 3 | 8 | 4 | 9 | 6 | Gryffindor |
734 rows × 9 columns
Or use the index
import numpy as np
hogwarts.iloc[:, np.r_[8:16, 17]]
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | House | |
---|---|---|---|---|---|---|---|---|---|
0 | 10 | 10 | 7 | 6 | 7 | 7 | 7 | 9 | Slytherin |
1 | 7 | 7 | 6 | 8 | 6 | 7 | 6 | 9 | Slytherin |
2 | 6 | 8 | 1 | 6 | 3 | 3 | 5 | 2 | Slytherin |
3 | 7 | 7 | 1 | 9 | 7 | 4 | 6 | 6 | Slytherin |
4 | 10 | 6 | 8 | 3 | 4 | 4 | 1 | 8 | Slytherin |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
729 | 2 | 3 | 3 | 5 | 7 | 8 | 8 | 8 | Gryffindor |
730 | 3 | 3 | 7 | 7 | 4 | 4 | 7 | 8 | Gryffindor |
731 | 6 | 6 | 2 | 8 | 7 | 3 | 8 | 6 | Gryffindor |
732 | 4 | 7 | 2 | 4 | 6 | 7 | 9 | 9 | Gryffindor |
733 | 3 | 7 | 7 | 3 | 8 | 4 | 9 | 6 | Gryffindor |
734 rows × 9 columns
# Hold out 40% of the rows for validation; the fixed random_state makes the
# split reproducible. Both parts keep the row labels of hogwarts_2.
trainData, validData = train_test_split(hogwarts_2, test_size = 0.4, random_state = 666)
trainData.head()
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | House | |
---|---|---|---|---|---|---|---|---|---|
650 | 5 | 5 | 5 | 9 | 9 | 4 | 9 | 8 | Gryffindor |
479 | 3 | 7 | 5 | 3 | 7 | 6 | 5 | 7 | Hufflepuff |
271 | 3 | 8 | 9 | 7 | 3 | 1 | 2 | 3 | Ravenclaw |
647 | 5 | 6 | 2 | 8 | 6 | 8 | 9 | 8 | Gryffindor |
307 | 6 | 7 | 10 | 6 | 7 | 1 | 4 | 9 | Ravenclaw |
validData.head()
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | House | |
---|---|---|---|---|---|---|---|---|---|
389 | 4 | 2 | 4 | 5 | 8 | 6 | 7 | 6 | Hufflepuff |
131 | 7 | 7 | 8 | 9 | 3 | 4 | 5 | 3 | Slytherin |
657 | 7 | 1 | 8 | 5 | 8 | 6 | 8 | 8 | Gryffindor |
421 | 4 | 9 | 2 | 4 | 7 | 8 | 6 | 9 | Hufflepuff |
160 | 6 | 9 | 5 | 2 | 4 | 4 | 1 | 4 | Slytherin |
# Feature columns fed to the classifier (the eight trait scores).
predictors = [
    "Manipulative",
    "Resourceful",
    "Dismissive",
    "Intelligent",
    "Trusting",
    "Loyal",
    "Stubborn",
    "Brave",
]
# Name of the target column.
outcome = "House"
Fit the normalisation (standardisation) scaler using the training set only.
# Create the standardiser and fit it on the training predictors only, so the
# scaling parameters are not derived from the validation rows.
scaler = preprocessing.StandardScaler()
scaler.fit(trainData[predictors])
StandardScaler()
Transform the whole data set.
# Standardise every row with the scaler that was fitted on the training set.
# scaler.transform returns a bare array, so pass the original row labels back
# in explicitly; otherwise the frame gets a fresh 0..n-1 index and any later
# label-based alignment with hogwarts_2 (concat, train/valid selection) is
# only correct when hogwarts_2 happens to have a default index.
hogwarts_2_norm = pd.DataFrame(
    scaler.transform(hogwarts_2[predictors]),
    columns=predictors,
    index=hogwarts_2.index,
)
hogwarts_2_norm
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
0 | 1.780586 | 1.691429 | 0.494040 | 0.099603 | 0.484763 | 0.511163 | 0.499954 | 1.376814 |
1 | 0.512171 | 0.378084 | 0.064440 | 0.993995 | 0.052115 | 0.511163 | 0.071144 | 1.376814 |
2 | 0.089366 | 0.815866 | -2.083559 | 0.099603 | -1.245832 | -1.115685 | -0.357667 | -1.605314 |
3 | 0.512171 | 0.378084 | -2.083559 | 1.441191 | 0.484763 | -0.708973 | 0.071144 | 0.098759 |
4 | 1.780586 | -0.059698 | 0.923640 | -1.241985 | -0.813183 | -0.708973 | -2.072907 | 0.950795 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
729 | -1.601855 | -1.373043 | -1.224360 | -0.347593 | 0.484763 | 0.917875 | 0.928764 | 0.950795 |
730 | -1.179050 | -1.373043 | 0.494040 | 0.546799 | -0.813183 | -0.708973 | 0.499954 | 0.950795 |
731 | 0.089366 | -0.059698 | -1.653959 | 0.993995 | 0.484763 | -1.115685 | 0.928764 | 0.098759 |
732 | -0.756245 | 0.378084 | -1.653959 | -0.794789 | 0.052115 | 0.511163 | 1.357574 | 1.376814 |
733 | -1.179050 | 0.378084 | 0.494040 | -1.241985 | 0.917412 | -0.708973 | 1.357574 | 0.098759 |
734 rows × 8 columns
# Or spell out the names
pd.DataFrame(scaler.transform(hogwarts_2[predictors]),
columns = ["Manipulative", "Resourceful", "Dismissive", "Intelligent", "Trusting", "Loyal", "Stubborn", "Brave"])
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
0 | 1.780586 | 1.691429 | 0.494040 | 0.099603 | 0.484763 | 0.511163 | 0.499954 | 1.376814 |
1 | 0.512171 | 0.378084 | 0.064440 | 0.993995 | 0.052115 | 0.511163 | 0.071144 | 1.376814 |
2 | 0.089366 | 0.815866 | -2.083559 | 0.099603 | -1.245832 | -1.115685 | -0.357667 | -1.605314 |
3 | 0.512171 | 0.378084 | -2.083559 | 1.441191 | 0.484763 | -0.708973 | 0.071144 | 0.098759 |
4 | 1.780586 | -0.059698 | 0.923640 | -1.241985 | -0.813183 | -0.708973 | -2.072907 | 0.950795 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
729 | -1.601855 | -1.373043 | -1.224360 | -0.347593 | 0.484763 | 0.917875 | 0.928764 | 0.950795 |
730 | -1.179050 | -1.373043 | 0.494040 | 0.546799 | -0.813183 | -0.708973 | 0.499954 | 0.950795 |
731 | 0.089366 | -0.059698 | -1.653959 | 0.993995 | 0.484763 | -1.115685 | 0.928764 | 0.098759 |
732 | -0.756245 | 0.378084 | -1.653959 | -0.794789 | 0.052115 | 0.511163 | 1.357574 | 1.376814 |
733 | -1.179050 | 0.378084 | 0.494040 | -1.241985 | 0.917412 | -0.708973 | 1.357574 | 0.098759 |
734 rows × 8 columns
Merge with original set to get target variable
hogwarts_2_norm_full = pd.concat([hogwarts_2_norm, hogwarts_2["House"]], axis = 1)
hogwarts_2_norm_full
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | House | |
---|---|---|---|---|---|---|---|---|---|
0 | 1.780586 | 1.691429 | 0.494040 | 0.099603 | 0.484763 | 0.511163 | 0.499954 | 1.376814 | Slytherin |
1 | 0.512171 | 0.378084 | 0.064440 | 0.993995 | 0.052115 | 0.511163 | 0.071144 | 1.376814 | Slytherin |
2 | 0.089366 | 0.815866 | -2.083559 | 0.099603 | -1.245832 | -1.115685 | -0.357667 | -1.605314 | Slytherin |
3 | 0.512171 | 0.378084 | -2.083559 | 1.441191 | 0.484763 | -0.708973 | 0.071144 | 0.098759 | Slytherin |
4 | 1.780586 | -0.059698 | 0.923640 | -1.241985 | -0.813183 | -0.708973 | -2.072907 | 0.950795 | Slytherin |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
729 | -1.601855 | -1.373043 | -1.224360 | -0.347593 | 0.484763 | 0.917875 | 0.928764 | 0.950795 | Gryffindor |
730 | -1.179050 | -1.373043 | 0.494040 | 0.546799 | -0.813183 | -0.708973 | 0.499954 | 0.950795 | Gryffindor |
731 | 0.089366 | -0.059698 | -1.653959 | 0.993995 | 0.484763 | -1.115685 | 0.928764 | 0.098759 | Gryffindor |
732 | -0.756245 | 0.378084 | -1.653959 | -0.794789 | 0.052115 | 0.511163 | 1.357574 | 1.376814 | Gryffindor |
733 | -1.179050 | 0.378084 | 0.494040 | -1.241985 | 0.917412 | -0.708973 | 1.357574 | 0.098759 | Gryffindor |
734 rows × 9 columns
Split the normalised set using the indices of the original split
# trainData.index contains row labels, not positions, so select with the
# label-based .loc indexer (.iloc would silently pick the wrong rows as soon
# as the labels stop matching the positions).
trainNorm = hogwarts_2_norm_full.loc[trainData.index]
trainNorm.head()
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | House | |
---|---|---|---|---|---|---|---|---|---|
650 | -0.333440 | -0.497479 | -0.365160 | 1.441191 | 1.350061 | -0.708973 | 1.357574 | 0.950795 | Gryffindor |
479 | -1.179050 | 0.378084 | -0.365160 | -1.241985 | 0.484763 | 0.104451 | -0.357667 | 0.524777 | Hufflepuff |
271 | -1.179050 | 0.815866 | 1.353239 | 0.546799 | -1.245832 | -1.929110 | -1.644097 | -1.179296 | Ravenclaw |
647 | -0.333440 | -0.059698 | -1.653959 | 0.993995 | 0.052115 | 0.917875 | 1.357574 | 0.950795 | Gryffindor |
307 | 0.089366 | 0.378084 | 1.782839 | 0.099603 | 0.484763 | -1.929110 | -0.786477 | 1.376814 | Ravenclaw |
# validData.index contains row labels, not positions, so select with the
# label-based .loc indexer (.iloc would silently pick the wrong rows as soon
# as the labels stop matching the positions).
validNorm = hogwarts_2_norm_full.loc[validData.index]
validNorm.head()
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | House | |
---|---|---|---|---|---|---|---|---|---|
389 | -0.756245 | -1.810824 | -0.794760 | -0.347593 | 0.917412 | 0.104451 | 0.499954 | 0.098759 | Hufflepuff |
131 | 0.512171 | 0.378084 | 0.923640 | 1.441191 | -1.245832 | -0.708973 | -0.357667 | -1.179296 | Slytherin |
657 | 0.512171 | -2.248606 | 0.923640 | -0.347593 | 0.917412 | 0.104451 | 0.928764 | 0.950795 | Gryffindor |
421 | -0.756245 | 1.253648 | -1.653959 | -0.794789 | 0.484763 | 0.917875 | 0.071144 | 1.376814 | Hufflepuff |
160 | 0.089366 | 1.253648 | -0.365160 | -1.689181 | -0.813183 | -0.708973 | -2.072907 | -0.753278 | Slytherin |
train_X = trainNorm.loc[:, "Manipulative":"Brave"]
train_X
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
650 | -0.333440 | -0.497479 | -0.365160 | 1.441191 | 1.350061 | -0.708973 | 1.357574 | 0.950795 |
479 | -1.179050 | 0.378084 | -0.365160 | -1.241985 | 0.484763 | 0.104451 | -0.357667 | 0.524777 |
271 | -1.179050 | 0.815866 | 1.353239 | 0.546799 | -1.245832 | -1.929110 | -1.644097 | -1.179296 |
647 | -0.333440 | -0.059698 | -1.653959 | 0.993995 | 0.052115 | 0.917875 | 1.357574 | 0.950795 |
307 | 0.089366 | 0.378084 | 1.782839 | 0.099603 | 0.484763 | -1.929110 | -0.786477 | 1.376814 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
445 | 0.089366 | 1.253648 | -1.224360 | -0.794789 | 0.917412 | 0.917875 | -0.357667 | 1.376814 |
414 | 0.934976 | -0.497479 | 0.064440 | 0.099603 | 0.052115 | 0.917875 | -1.215287 | -1.179296 |
70 | 1.357781 | 0.378084 | -2.083559 | 1.441191 | -0.813183 | 0.917875 | -0.357667 | 0.950795 |
429 | 1.357781 | -0.059698 | 1.353239 | -1.241985 | 0.484763 | 0.511163 | 0.928764 | 0.524777 |
236 | -0.333440 | 0.378084 | 0.923640 | 0.993995 | 0.917412 | -0.708973 | -1.215287 | -1.605314 |
440 rows × 8 columns
# Alternatively
trainNorm.loc[:, predictors]
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
650 | -0.333440 | -0.497479 | -0.365160 | 1.441191 | 1.350061 | -0.708973 | 1.357574 | 0.950795 |
479 | -1.179050 | 0.378084 | -0.365160 | -1.241985 | 0.484763 | 0.104451 | -0.357667 | 0.524777 |
271 | -1.179050 | 0.815866 | 1.353239 | 0.546799 | -1.245832 | -1.929110 | -1.644097 | -1.179296 |
647 | -0.333440 | -0.059698 | -1.653959 | 0.993995 | 0.052115 | 0.917875 | 1.357574 | 0.950795 |
307 | 0.089366 | 0.378084 | 1.782839 | 0.099603 | 0.484763 | -1.929110 | -0.786477 | 1.376814 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
445 | 0.089366 | 1.253648 | -1.224360 | -0.794789 | 0.917412 | 0.917875 | -0.357667 | 1.376814 |
414 | 0.934976 | -0.497479 | 0.064440 | 0.099603 | 0.052115 | 0.917875 | -1.215287 | -1.179296 |
70 | 1.357781 | 0.378084 | -2.083559 | 1.441191 | -0.813183 | 0.917875 | -0.357667 | 0.950795 |
429 | 1.357781 | -0.059698 | 1.353239 | -1.241985 | 0.484763 | 0.511163 | 0.928764 | 0.524777 |
236 | -0.333440 | 0.378084 | 0.923640 | 0.993995 | 0.917412 | -0.708973 | -1.215287 | -1.605314 |
440 rows × 8 columns
train_y = trainNorm["House"]
train_y
650 Gryffindor 479 Hufflepuff 271 Ravenclaw 647 Gryffindor 307 Ravenclaw ... 445 Hufflepuff 414 Hufflepuff 70 Slytherin 429 Hufflepuff 236 Ravenclaw Name: House, Length: 440, dtype: object
valid_X = validNorm.loc[:, "Manipulative":"Brave"]
valid_X
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
389 | -0.756245 | -1.810824 | -0.794760 | -0.347593 | 0.917412 | 0.104451 | 0.499954 | 0.098759 |
131 | 0.512171 | 0.378084 | 0.923640 | 1.441191 | -1.245832 | -0.708973 | -0.357667 | -1.179296 |
657 | 0.512171 | -2.248606 | 0.923640 | -0.347593 | 0.917412 | 0.104451 | 0.928764 | 0.950795 |
421 | -0.756245 | 1.253648 | -1.653959 | -0.794789 | 0.484763 | 0.917875 | 0.071144 | 1.376814 |
160 | 0.089366 | 1.253648 | -0.365160 | -1.689181 | -0.813183 | -0.708973 | -2.072907 | -0.753278 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
49 | 0.089366 | 0.378084 | -1.224360 | -1.241985 | 0.484763 | -0.302261 | -1.215287 | -1.605314 |
53 | 1.357781 | 0.815866 | 0.064440 | -1.689181 | -0.813183 | -1.929110 | -0.357667 | -0.753278 |
569 | 0.934976 | -1.373043 | 0.064440 | -1.689181 | 1.350061 | 1.324588 | 0.071144 | 0.950795 |
700 | -2.024660 | -0.059698 | -0.365160 | -0.347593 | 1.350061 | -1.522398 | 0.499954 | 1.376814 |
312 | 0.089366 | -0.935261 | 1.353239 | 0.546799 | -1.245832 | 0.104451 | -0.357667 | -0.753278 |
294 rows × 8 columns
# Alternatively
validNorm.loc[:, predictors]
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
389 | -0.756245 | -1.810824 | -0.794760 | -0.347593 | 0.917412 | 0.104451 | 0.499954 | 0.098759 |
131 | 0.512171 | 0.378084 | 0.923640 | 1.441191 | -1.245832 | -0.708973 | -0.357667 | -1.179296 |
657 | 0.512171 | -2.248606 | 0.923640 | -0.347593 | 0.917412 | 0.104451 | 0.928764 | 0.950795 |
421 | -0.756245 | 1.253648 | -1.653959 | -0.794789 | 0.484763 | 0.917875 | 0.071144 | 1.376814 |
160 | 0.089366 | 1.253648 | -0.365160 | -1.689181 | -0.813183 | -0.708973 | -2.072907 | -0.753278 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
49 | 0.089366 | 0.378084 | -1.224360 | -1.241985 | 0.484763 | -0.302261 | -1.215287 | -1.605314 |
53 | 1.357781 | 0.815866 | 0.064440 | -1.689181 | -0.813183 | -1.929110 | -0.357667 | -0.753278 |
569 | 0.934976 | -1.373043 | 0.064440 | -1.689181 | 1.350061 | 1.324588 | 0.071144 | 0.950795 |
700 | -2.024660 | -0.059698 | -0.365160 | -0.347593 | 1.350061 | -1.522398 | 0.499954 | 1.376814 |
312 | 0.089366 | -0.935261 | 1.353239 | 0.546799 | -1.245832 | 0.104451 | -0.357667 | -0.753278 |
294 rows × 8 columns
valid_y = validNorm["House"]
valid_y
389 Hufflepuff 131 Slytherin 657 Gryffindor 421 Hufflepuff 160 Slytherin ... 49 Slytherin 53 Slytherin 569 Gryffindor 700 Gryffindor 312 Ravenclaw Name: House, Length: 294, dtype: object
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
%%capture --no-display
# Fit a 3-nearest-neighbour classifier on the training predictors.
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(train_X, train_y)
# Accuracy measured on the same rows the model was fitted on (training accuracy).
accuracy_score(train_y, knn.predict(train_X))
0.9
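Alternatively, use a loop to compare several values of k. Note that the loop below scores each model on the training set, which tends to favour small k; to choose the best k, compare accuracy on the validation set instead.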
%%capture --no-display
for k in range(3, 11):
knn_loop = KNeighborsClassifier(n_neighbors = k)
knn_loop.fit(train_X, train_y)
accuracy = accuracy_score(train_y, knn_loop.predict(train_X))
print(accuracy)
If it's easier to read...
%%capture --no-display
result = []
for k in range(3, 11):
knn_loop = KNeighborsClassifier(n_neighbors = k)
knn_loop.fit(train_X, train_y)
result.append({
"k_value": k,
"accuracy value": accuracy_score(train_y, knn_loop.predict(train_X))
})
result = pd.DataFrame(result)
result
k_value | accuracy value | |
---|---|---|
0 | 3 | 0.900000 |
1 | 4 | 0.886364 |
2 | 5 | 0.897727 |
3 | 6 | 0.877273 |
4 | 7 | 0.890909 |
5 | 8 | 0.881818 |
6 | 9 | 0.884091 |
7 | 10 | 0.868182 |
Get training set predictions.
%%capture --no-display
train_y_pred = knn.predict(train_X)
train_y_pred[:20]
array(['Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Gryffindor', 'Ravenclaw', 'Hufflepuff', 'Gryffindor', 'Slytherin', 'Hufflepuff', 'Slytherin', 'Ravenclaw', 'Slytherin', 'Hufflepuff', 'Gryffindor', 'Hufflepuff', 'Hufflepuff', 'Hufflepuff', 'Gryffindor', 'Gryffindor', 'Gryffindor'], dtype=object)
Get validation set predictions.
%%capture --no-display
valid_y_pred = knn.predict(valid_X)
valid_y_pred[:20]
array(['Gryffindor', 'Ravenclaw', 'Gryffindor', 'Gryffindor', 'Slytherin', 'Slytherin', 'Slytherin', 'Hufflepuff', 'Gryffindor', 'Hufflepuff', 'Slytherin', 'Hufflepuff', 'Ravenclaw', 'Gryffindor', 'Hufflepuff', 'Hufflepuff', 'Gryffindor', 'Slytherin', 'Ravenclaw', 'Slytherin'], dtype=object)
from sklearn.metrics import confusion_matrix, accuracy_score
Training set. Confusion matrix.
confusion_matrix_train = confusion_matrix(train_y, train_y_pred)
confusion_matrix_train
array([[103, 1, 2, 4], [ 2, 105, 4, 2], [ 4, 7, 81, 5], [ 1, 7, 5, 107]], dtype=int64)
A confusion matrix that's easier to read
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
confusion_matrix_train_display = ConfusionMatrixDisplay(confusion_matrix_train, display_labels = knn.classes_)
confusion_matrix_train_display.plot()
plt.grid(False)
accuracy_score(train_y, train_y_pred)
0.9
from sklearn.metrics import classification_report
print(classification_report(train_y, train_y_pred))
precision recall f1-score support Gryffindor 0.94 0.94 0.94 110 Hufflepuff 0.88 0.93 0.90 113 Ravenclaw 0.88 0.84 0.86 97 Slytherin 0.91 0.89 0.90 120 accuracy 0.90 440 macro avg 0.90 0.90 0.90 440 weighted avg 0.90 0.90 0.90 440
Validation set. Confusion matrix.
confusion_matrix_valid = confusion_matrix(valid_y, valid_y_pred)
confusion_matrix_valid
array([[63, 6, 4, 5], [10, 65, 0, 1], [ 5, 3, 48, 3], [ 7, 9, 4, 61]], dtype=int64)
A confusion matrix that's easier to read
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt
confusion_matrix_valid_display = ConfusionMatrixDisplay(confusion_matrix_valid, display_labels = knn.classes_)
confusion_matrix_valid_display.plot()
plt.grid(False)
accuracy_score(valid_y, valid_y_pred)
0.8061224489795918
print(classification_report(valid_y, valid_y_pred))
precision recall f1-score support Gryffindor 0.74 0.81 0.77 78 Hufflepuff 0.78 0.86 0.82 76 Ravenclaw 0.86 0.81 0.83 59 Slytherin 0.87 0.75 0.81 81 accuracy 0.81 294 macro avg 0.81 0.81 0.81 294 weighted avg 0.81 0.81 0.81 294
Create a dictionary of predictor values for a new observation.
# Trait scores for one new, unlabelled observation, keyed by predictor name.
data = {
    "Manipulative": 8,
    "Resourceful": 9,
    "Dismissive": 8,
    "Intelligent": 6,
    "Trusting": 6,
    "Loyal": 8,
    "Stubborn": 6,
    "Brave": 6,
}
# The dict holds scalars, so an explicit index is needed to get a one-row frame.
new_padawan = pd.DataFrame(data, index=[0])
new_padawan
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
0 | 8 | 9 | 8 | 6 | 6 | 8 | 6 | 6 |
# Alternatively, use a list
pd.DataFrame([data])
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
0 | 8 | 9 | 8 | 6 | 6 | 8 | 6 | 6 |
Normalise.
new_padawan_norm = pd.DataFrame(scaler.transform(new_padawan[predictors]),
columns = predictors)
new_padawan_norm
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
0 | 0.934976 | 1.253648 | 0.92364 | 0.099603 | 0.052115 | 0.917875 | 0.071144 | 0.098759 |
%%capture --no-display
new_padawan_pred = knn.predict(new_padawan_norm)
new_padawan_pred
array(['Slytherin'], dtype=object)
new_padawan_pred_prob = knn.predict_proba(new_padawan_norm)
new_padawan_pred_prob
array([[0. , 0. , 0.33333333, 0.66666667]])
# Use the classes
pd.DataFrame(new_padawan_pred_prob,
columns = knn.classes_)
Gryffindor | Hufflepuff | Ravenclaw | Slytherin | |
---|---|---|---|---|
0 | 0.0 | 0.0 | 0.333333 | 0.666667 |
# Or specify the columns
pd.DataFrame(new_padawan_pred_prob,
columns = ["Gryffindor", "Hufflepuff", "Ravenclaw", "Slytherin"])
Gryffindor | Hufflepuff | Ravenclaw | Slytherin | |
---|---|---|---|---|
0 | 0.0 | 0.0 | 0.333333 | 0.666667 |
train_X2 = train_X.copy() # for consistency
valid_X2 = valid_X.copy() # for consistency
train_y2 = train_y.copy()
valid_y2 = valid_y.copy()
train_X2.head()
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
650 | -0.333440 | -0.497479 | -0.365160 | 1.441191 | 1.350061 | -0.708973 | 1.357574 | 0.950795 |
479 | -1.179050 | 0.378084 | -0.365160 | -1.241985 | 0.484763 | 0.104451 | -0.357667 | 0.524777 |
271 | -1.179050 | 0.815866 | 1.353239 | 0.546799 | -1.245832 | -1.929110 | -1.644097 | -1.179296 |
647 | -0.333440 | -0.059698 | -1.653959 | 0.993995 | 0.052115 | 0.917875 | 1.357574 | 0.950795 |
307 | 0.089366 | 0.378084 | 1.782839 | 0.099603 | 0.484763 | -1.929110 | -0.786477 | 1.376814 |
valid_X2.head()
Manipulative | Resourceful | Dismissive | Intelligent | Trusting | Loyal | Stubborn | Brave | |
---|---|---|---|---|---|---|---|---|
389 | -0.756245 | -1.810824 | -0.794760 | -0.347593 | 0.917412 | 0.104451 | 0.499954 | 0.098759 |
131 | 0.512171 | 0.378084 | 0.923640 | 1.441191 | -1.245832 | -0.708973 | -0.357667 | -1.179296 |
657 | 0.512171 | -2.248606 | 0.923640 | -0.347593 | 0.917412 | 0.104451 | 0.928764 | 0.950795 |
421 | -0.756245 | 1.253648 | -1.653959 | -0.794789 | 0.484763 | 0.917875 | 0.071144 | 1.376814 |
160 | 0.089366 | 1.253648 | -0.365160 | -1.689181 | -0.813183 | -0.708973 | -2.072907 | -0.753278 |
train_y2.value_counts()
Slytherin 120 Hufflepuff 113 Gryffindor 110 Ravenclaw 97 Name: House, dtype: int64
valid_y2.value_counts()
Slytherin 81 Gryffindor 78 Hufflepuff 76 Ravenclaw 59 Name: House, dtype: int64
# Original four-class labels, in the order they are mapped below.
House_values = ["Gryffindor", "Hufflepuff", "Ravenclaw", "Slytherin"]
# Binary relabelling, position-for-position: Slytherin stays, everything else
# collapses to "Not Slytherin".
House_2_values = [
    "Slytherin" if house == "Slytherin" else "Not Slytherin"
    for house in House_values
]
# Map each house name to the entry at the same position in House_2_values,
# producing a two-class target from the original training labels.
train_y2 = train_y.replace(House_values, House_2_values)
train_y2
650 Not Slytherin 479 Not Slytherin 271 Not Slytherin 647 Not Slytherin 307 Not Slytherin ... 445 Not Slytherin 414 Not Slytherin 70 Slytherin 429 Not Slytherin 236 Not Slytherin Name: House, Length: 440, dtype: object
train_y2.head()
650 Not Slytherin 479 Not Slytherin 271 Not Slytherin 647 Not Slytherin 307 Not Slytherin Name: House, dtype: object
train_y2.value_counts()
Not Slytherin 320 Slytherin 120 Name: House, dtype: int64
# Map each house name to the entry at the same position in House_2_values,
# producing a two-class target from the original validation labels.
valid_y2 = valid_y.replace(House_values, House_2_values)
valid_y2
389 Not Slytherin 131 Slytherin 657 Not Slytherin 421 Not Slytherin 160 Slytherin ... 49 Slytherin 53 Slytherin 569 Not Slytherin 700 Not Slytherin 312 Not Slytherin Name: House, Length: 294, dtype: object
valid_y2.head()
389 Not Slytherin 131 Slytherin 657 Not Slytherin 421 Not Slytherin 160 Slytherin Name: House, dtype: object
valid_y2.value_counts()
Not Slytherin 213 Slytherin 81 Name: House, dtype: int64
%%capture --no-display
result_2 = []
for k in range(3, 11):
knn_loop = KNeighborsClassifier(n_neighbors = k)
knn_loop.fit(train_X2, train_y2)
result_2.append({
"k_value_in_knn_2": k,
"accuracy_value": accuracy_score(train_y2, knn_loop.predict(train_X2))
})
result_2 = pd.DataFrame(result_2)
result_2
k_value_in_knn_2 | accuracy_value | |
---|---|---|
0 | 3 | 0.945455 |
1 | 4 | 0.931818 |
2 | 5 | 0.940909 |
3 | 6 | 0.925000 |
4 | 7 | 0.931818 |
5 | 8 | 0.927273 |
6 | 9 | 0.927273 |
7 | 10 | 0.925000 |
%%capture --no-display
knn2_k3 = KNeighborsClassifier(n_neighbors = 3)
knn2_k3.fit(train_X2, train_y2)
accuracy_score(train_y2, knn2_k3.predict(train_X2))
0.9454545454545454
Get training set prediction.
%%capture --no-display
train_y2_pred_k3 = knn2_k3.predict(train_X2)
train_y2_pred_k3[:20]
array(['Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin'], dtype=object)
train_y2_pred_prob2_k3 = knn2_k3.predict_proba(train_X2)
train_y2_pred_prob2_k3[:20]
array([[1. , 0. ], [1. , 0. ], [1. , 0. ], [1. , 0. ], [1. , 0. ], [1. , 0. ], [0.66666667, 0.33333333], [0. , 1. ], [1. , 0. ], [0.33333333, 0.66666667], [1. , 0. ], [0. , 1. ], [0.66666667, 0.33333333], [1. , 0. ], [1. , 0. ], [0.66666667, 0.33333333], [0.66666667, 0.33333333], [1. , 0. ], [1. , 0. ], [0.66666667, 0.33333333]])
pd.DataFrame(train_y2_pred_prob2_k3, columns = knn2_k3.classes_)
Not Slytherin | Slytherin | |
---|---|---|
0 | 1.000000 | 0.000000 |
1 | 1.000000 | 0.000000 |
2 | 1.000000 | 0.000000 |
3 | 1.000000 | 0.000000 |
4 | 1.000000 | 0.000000 |
... | ... | ... |
435 | 0.666667 | 0.333333 |
436 | 1.000000 | 0.000000 |
437 | 0.000000 | 1.000000 |
438 | 1.000000 | 0.000000 |
439 | 1.000000 | 0.000000 |
440 rows × 2 columns
Get validation set prediction.
%%capture --no-display
valid_y2_pred_k3 = knn2_k3.predict(valid_X2)
valid_y2_pred_k3[:20]
array(['Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin'], dtype=object)
valid_y2_pred_prob2_k3 = knn2_k3.predict_proba(valid_X2)
valid_y2_pred_prob2_k3[:20]
array([[1. , 0. ], [1. , 0. ], [1. , 0. ], [1. , 0. ], [0. , 1. ], [0.33333333, 0.66666667], [0.33333333, 0.66666667], [1. , 0. ], [1. , 0. ], [1. , 0. ], [0.33333333, 0.66666667], [1. , 0. ], [1. , 0. ], [1. , 0. ], [1. , 0. ], [1. , 0. ], [1. , 0. ], [0. , 1. ], [1. , 0. ], [0.33333333, 0.66666667]])
pd.DataFrame(valid_y2_pred_prob2_k3, columns = knn2_k3.classes_)
Not Slytherin | Slytherin | |
---|---|---|
0 | 1.000000 | 0.000000 |
1 | 1.000000 | 0.000000 |
2 | 1.000000 | 0.000000 |
3 | 1.000000 | 0.000000 |
4 | 0.000000 | 1.000000 |
... | ... | ... |
289 | 0.666667 | 0.333333 |
290 | 0.000000 | 1.000000 |
291 | 1.000000 | 0.000000 |
292 | 1.000000 | 0.000000 |
293 | 1.000000 | 0.000000 |
294 rows × 2 columns
%%capture --no-display
knn2_k5 = KNeighborsClassifier(n_neighbors = 5)
knn2_k5.fit(train_X2, train_y2)
accuracy_score(train_y2, knn2_k5.predict(train_X2))
0.9409090909090909
Get training set prediction.
%%capture --no-display
train_y2_pred_k5 = knn2_k5.predict(train_X2)
train_y2_pred_k5[:20]
array(['Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin'], dtype=object)
train_y2_pred_prob2_k5 = knn2_k5.predict_proba(train_X2)
train_y2_pred_prob2_k5[:20]
array([[1. , 0. ], [1. , 0. ], [1. , 0. ], [1. , 0. ], [1. , 0. ], [1. , 0. ], [0.8, 0.2], [0. , 1. ], [0.8, 0.2], [0.2, 0.8], [1. , 0. ], [0. , 1. ], [0.4, 0.6], [0.8, 0.2], [1. , 0. ], [0.6, 0.4], [0.8, 0.2], [1. , 0. ], [1. , 0. ], [0.8, 0.2]])
pd.DataFrame(train_y2_pred_prob2_k5, columns = knn2_k5.classes_)
Not Slytherin | Slytherin | |
---|---|---|
0 | 1.0 | 0.0 |
1 | 1.0 | 0.0 |
2 | 1.0 | 0.0 |
3 | 1.0 | 0.0 |
4 | 1.0 | 0.0 |
... | ... | ... |
435 | 0.8 | 0.2 |
436 | 0.8 | 0.2 |
437 | 0.2 | 0.8 |
438 | 0.8 | 0.2 |
439 | 1.0 | 0.0 |
440 rows × 2 columns
Get validation set prediction.
%%capture --no-display
valid_y2_pred_k5 = knn2_k5.predict(valid_X2)
valid_y2_pred_k5[:20]
array(['Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin'], dtype=object)
valid_y2_pred_prob2_k5 = knn2_k5.predict_proba(valid_X2)
valid_y2_pred_prob2_k5[:20]
array([[1. , 0. ], [0.8, 0.2], [1. , 0. ], [1. , 0. ], [0. , 1. ], [0.2, 0.8], [0.2, 0.8], [1. , 0. ], [1. , 0. ], [1. , 0. ], [0.2, 0.8], [1. , 0. ], [1. , 0. ], [0.8, 0.2], [1. , 0. ], [1. , 0. ], [1. , 0. ], [0.2, 0.8], [0.8, 0.2], [0.2, 0.8]])
pd.DataFrame(valid_y2_pred_prob2_k5, columns = knn2_k5.classes_)
Not Slytherin | Slytherin | |
---|---|---|
0 | 1.0 | 0.0 |
1 | 0.8 | 0.2 |
2 | 1.0 | 0.0 |
3 | 1.0 | 0.0 |
4 | 0.0 | 1.0 |
... | ... | ... |
289 | 0.4 | 0.6 |
290 | 0.0 | 1.0 |
291 | 1.0 | 0.0 |
292 | 1.0 | 0.0 |
293 | 1.0 | 0.0 |
294 rows × 2 columns
Training set.
confusion_matrix_train2_k3 = confusion_matrix(train_y2, train_y2_pred_k3)
confusion_matrix_train2_k3
array([[309, 11], [ 13, 107]], dtype=int64)
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt
confusion_matrix_train2_k3_display = ConfusionMatrixDisplay(confusion_matrix_train2_k3, display_labels = knn2_k3.classes_)
confusion_matrix_train2_k3_display.plot()
plt.grid(False)
accuracy_score(train_y2, train_y2_pred_k3)
0.9454545454545454
print(classification_report(train_y2, train_y2_pred_k3))
precision recall f1-score support Not Slytherin 0.96 0.97 0.96 320 Slytherin 0.91 0.89 0.90 120 accuracy 0.95 440 macro avg 0.93 0.93 0.93 440 weighted avg 0.95 0.95 0.95 440
Validation set.
confusion_matrix_valid2_k3 = confusion_matrix(valid_y2, valid_y2_pred_k3)
confusion_matrix_valid2_k3
array([[204, 9], [ 20, 61]], dtype=int64)
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt
confusion_matrix_valid2_k3_display = ConfusionMatrixDisplay(confusion_matrix_valid2_k3, display_labels = knn2_k3.classes_)
confusion_matrix_valid2_k3_display.plot()
plt.grid(False)
accuracy_score(valid_y2, valid_y2_pred_k3)
0.9013605442176871
print(classification_report(valid_y2, valid_y2_pred_k3))
precision recall f1-score support Not Slytherin 0.91 0.96 0.93 213 Slytherin 0.87 0.75 0.81 81 accuracy 0.90 294 macro avg 0.89 0.86 0.87 294 weighted avg 0.90 0.90 0.90 294
confusion_matrix_train2_k5 = confusion_matrix(train_y2, train_y2_pred_k5)
confusion_matrix_train2_k5
array([[311, 9], [ 17, 103]], dtype=int64)
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt
confusion_matrix_train2_k5_display = ConfusionMatrixDisplay(confusion_matrix_train2_k5, display_labels = knn2_k5.classes_)
confusion_matrix_train2_k5_display.plot()
plt.grid(False)
accuracy_score(train_y2, train_y2_pred_k5)
0.9409090909090909
print(classification_report(train_y2, train_y2_pred_k5))
precision recall f1-score support Not Slytherin 0.95 0.97 0.96 320 Slytherin 0.92 0.86 0.89 120 accuracy 0.94 440 macro avg 0.93 0.92 0.92 440 weighted avg 0.94 0.94 0.94 440
Validation set.
confusion_matrix_valid2_k5 = confusion_matrix(valid_y2, valid_y2_pred_k5)
confusion_matrix_valid2_k5
array([[204, 9], [ 14, 67]], dtype=int64)
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt
confusion_matrix_valid2_k5_display = ConfusionMatrixDisplay(confusion_matrix_valid2_k5, display_labels = knn2_k5.classes_)
confusion_matrix_valid2_k5_display.plot()
plt.grid(False)
accuracy_score(valid_y2, valid_y2_pred_k5)
0.9217687074829932
print(classification_report(valid_y2, valid_y2_pred_k5))
precision recall f1-score support Not Slytherin 0.94 0.96 0.95 213 Slytherin 0.88 0.83 0.85 81 accuracy 0.92 294 macro avg 0.91 0.89 0.90 294 weighted avg 0.92 0.92 0.92 294
Change cutoff to 0.7
train_y2_pred_k5
array(['Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not 
Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 
'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not 
Slytherin', 'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin'], dtype=object)
train_y2_pred_k5_df = pd.DataFrame(train_y2_pred_k5, columns = ["pred_50"])
train_y2_pred_k5_df
pred_50 | |
---|---|
0 | Not Slytherin |
1 | Not Slytherin |
2 | Not Slytherin |
3 | Not Slytherin |
4 | Not Slytherin |
... | ... |
435 | Not Slytherin |
436 | Not Slytherin |
437 | Slytherin |
438 | Not Slytherin |
439 | Not Slytherin |
440 rows × 1 columns
train_y2_pred_prob2_k5_df = pd.DataFrame(train_y2_pred_prob2_k5, columns = knn2_k5.classes_)
train_y2_pred_prob2_k5_df
Not Slytherin | Slytherin | |
---|---|---|
0 | 1.0 | 0.0 |
1 | 1.0 | 0.0 |
2 | 1.0 | 0.0 |
3 | 1.0 | 0.0 |
4 | 1.0 | 0.0 |
... | ... | ... |
435 | 0.8 | 0.2 |
436 | 0.8 | 0.2 |
437 | 0.2 | 0.8 |
438 | 0.8 | 0.2 |
439 | 1.0 | 0.0 |
440 rows × 2 columns
def cutoff70(prob):
    """Label a Slytherin-class probability using a 0.7 cutoff.

    Args:
        prob: Predicted probability of the "Slytherin" class.

    Returns:
        "Slytherin" when ``prob`` is at least 0.7, otherwise "Not Slytherin".
    """
    # One comparison with an unconditional fallback: the former if/elif pair
    # returned None when neither test held (e.g. a NaN probability).
    return "Slytherin" if prob >= 0.7 else "Not Slytherin"
train_y2_pred_k5_df["pred_70"] = train_y2_pred_prob2_k5_df["Slytherin"].apply(cutoff70)
train_y2_pred_k5_df
pred_50 | pred_70 | |
---|---|---|
0 | Not Slytherin | Not Slytherin |
1 | Not Slytherin | Not Slytherin |
2 | Not Slytherin | Not Slytherin |
3 | Not Slytherin | Not Slytherin |
4 | Not Slytherin | Not Slytherin |
... | ... | ... |
435 | Not Slytherin | Not Slytherin |
436 | Not Slytherin | Not Slytherin |
437 | Slytherin | Slytherin |
438 | Not Slytherin | Not Slytherin |
439 | Not Slytherin | Not Slytherin |
440 rows × 2 columns
confusion_matrix_train2_k5_70 = confusion_matrix(train_y2, train_y2_pred_k5_df["pred_70"])
confusion_matrix_train2_k5_70
array([[317, 3], [ 40, 80]], dtype=int64)
confusion_matrix_train2_k5_70_display = ConfusionMatrixDisplay(confusion_matrix_train2_k5_70,
display_labels = knn2_k5.classes_)
confusion_matrix_train2_k5_70_display.plot()
plt.grid(False)
valid_y2_pred_k5
array(['Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Slytherin', 'Slytherin', 
'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not 
Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin'], dtype=object)
valid_y2_pred_k5_df = pd.DataFrame(valid_y2_pred_k5, columns = ["pred_50"])
valid_y2_pred_k5_df
pred_50 | |
---|---|
0 | Not Slytherin |
1 | Not Slytherin |
2 | Not Slytherin |
3 | Not Slytherin |
4 | Slytherin |
... | ... |
289 | Slytherin |
290 | Slytherin |
291 | Not Slytherin |
292 | Not Slytherin |
293 | Not Slytherin |
294 rows × 1 columns
valid_y2_pred_prob2_k5_df = pd.DataFrame(valid_y2_pred_prob2_k5, columns = knn2_k5.classes_)
valid_y2_pred_prob2_k5_df
Not Slytherin | Slytherin | |
---|---|---|
0 | 1.0 | 0.0 |
1 | 0.8 | 0.2 |
2 | 1.0 | 0.0 |
3 | 1.0 | 0.0 |
4 | 0.0 | 1.0 |
... | ... | ... |
289 | 0.4 | 0.6 |
290 | 0.0 | 1.0 |
291 | 1.0 | 0.0 |
292 | 1.0 | 0.0 |
293 | 1.0 | 0.0 |
294 rows × 2 columns
def cutoff70(prob):
    """Label a Slytherin-class probability using a 0.7 cutoff.

    Args:
        prob: Predicted probability of the "Slytherin" class.

    Returns:
        "Slytherin" when ``prob`` is at least 0.7, otherwise "Not Slytherin".
    """
    # One comparison with an unconditional fallback: the former if/elif pair
    # returned None when neither test held (e.g. a NaN probability).
    return "Slytherin" if prob >= 0.7 else "Not Slytherin"
valid_y2_pred_k5_df["pred_70"] = valid_y2_pred_prob2_k5_df["Slytherin"].apply(cutoff70)
valid_y2_pred_k5_df
pred_50 | pred_70 | |
---|---|---|
0 | Not Slytherin | Not Slytherin |
1 | Not Slytherin | Not Slytherin |
2 | Not Slytherin | Not Slytherin |
3 | Not Slytherin | Not Slytherin |
4 | Slytherin | Slytherin |
... | ... | ... |
289 | Slytherin | Not Slytherin |
290 | Slytherin | Slytherin |
291 | Not Slytherin | Not Slytherin |
292 | Not Slytherin | Not Slytherin |
293 | Not Slytherin | Not Slytherin |
294 rows × 2 columns
confusion_matrix_valid2_k5_70 = confusion_matrix(valid_y2, valid_y2_pred_k5_df["pred_70"])
confusion_matrix_valid2_k5_70
array([[211, 2], [ 30, 51]], dtype=int64)
confusion_matrix_valid2_k5_70_display = ConfusionMatrixDisplay(confusion_matrix_valid2_k5_70,
display_labels = knn2_k5.classes_)
confusion_matrix_valid2_k5_70_display.plot()
plt.grid(False)
print(classification_report(valid_y2, valid_y2_pred_k5_df["pred_70"]))
precision recall f1-score support Not Slytherin 0.88 0.99 0.93 213 Slytherin 0.96 0.63 0.76 81 accuracy 0.89 294 macro avg 0.92 0.81 0.85 294 weighted avg 0.90 0.89 0.88 294
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
# check again
pd.DataFrame(valid_y2_pred_prob2_k3, columns = knn2_k3.classes_)
Not Slytherin | Slytherin | |
---|---|---|
0 | 1.000000 | 0.000000 |
1 | 1.000000 | 0.000000 |
2 | 1.000000 | 0.000000 |
3 | 1.000000 | 0.000000 |
4 | 0.000000 | 1.000000 |
... | ... | ... |
289 | 0.666667 | 0.333333 |
290 | 0.000000 | 1.000000 |
291 | 1.000000 | 0.000000 |
292 | 1.000000 | 0.000000 |
293 | 1.000000 | 0.000000 |
294 rows × 2 columns
Set positive class
fpr1, tpr1, threshold1 = metrics.roc_curve(valid_y2, valid_y2_pred_prob2_k3[:,1], pos_label = "Slytherin")
import matplotlib.pyplot as plt
# The "seaborn" style was renamed "seaborn-v0_8" in matplotlib 3.6 and the old
# name was later removed, so use whichever name this installation provides.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
# ROC curve of the k = 3 model on the validation set.
plt.plot(fpr1, tpr1, linestyle = "-", color = "green", label = "Slytherin k = 3")
# Reference line: scoring every sample identically yields the tpr = fpr diagonal.
random_probs = [0] * len(valid_y2)
p_fpr1, p_tpr1, _ = roc_curve(valid_y2, random_probs, pos_label = "Slytherin")
plt.plot(p_fpr1, p_tpr1, linestyle = "--", color = "black", label = "Random Force")
# Legend, title and axis labels
plt.legend()
plt.title("Sorting Hat ROC")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# to save the plot
# plt.savefig("whatever_name",dpi = 300)
Text(0, 0.5, 'True Positive Rate')
from sklearn.metrics import roc_auc_score
auc_score1 = roc_auc_score(valid_y2, valid_y2_pred_prob2_k3[:,1])
auc_score1
0.9335188083231901
# check again
pd.DataFrame(valid_y2_pred_prob2_k5, columns = knn2_k5.classes_)
Not Slytherin | Slytherin | |
---|---|---|
0 | 1.0 | 0.0 |
1 | 0.8 | 0.2 |
2 | 1.0 | 0.0 |
3 | 1.0 | 0.0 |
4 | 0.0 | 1.0 |
... | ... | ... |
289 | 0.4 | 0.6 |
290 | 0.0 | 1.0 |
291 | 1.0 | 0.0 |
292 | 1.0 | 0.0 |
293 | 1.0 | 0.0 |
294 rows × 2 columns
Set positive class
fpr2, tpr2, threshold2 = metrics.roc_curve(valid_y2, valid_y2_pred_prob2_k5[:,1], pos_label = "Slytherin")
import matplotlib.pyplot as plt
# The "seaborn" style was renamed "seaborn-v0_8" in matplotlib 3.6 and the old
# name was later removed, so use whichever name this installation provides.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
# ROC curve of the k = 5 model on the validation set.
plt.plot(fpr2, tpr2, linestyle = "-", color = "blue", label = "Slytherin k = 5")
# Reference line: scoring every sample identically yields the tpr = fpr diagonal.
random_probs = [0] * len(valid_y2)
p_fpr2, p_tpr2, _ = roc_curve(valid_y2, random_probs, pos_label = "Slytherin")
plt.plot(p_fpr2, p_tpr2, linestyle = "--", color = "black", label = "Random Force")
# Legend, title and axis labels
plt.legend()
plt.title("Sorting Hat ROC")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# to save the plot
# plt.savefig("whatever_name",dpi = 300)
Text(0, 0.5, 'True Positive Rate')
auc_score2 = roc_auc_score(valid_y2, valid_y2_pred_prob2_k5[:,1])
auc_score2
0.966962267431751
import matplotlib.pyplot as plt
# The "seaborn" style was renamed "seaborn-v0_8" in matplotlib 3.6 and the old
# name was later removed, so use whichever name this installation provides.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
# Overlay the validation ROC curves of both models for comparison.
plt.plot(fpr1, tpr1, linestyle = "-", color = "green", label = "Slytherin k = 3")
plt.plot(fpr2, tpr2, linestyle = "-", color = "blue", label = "Slytherin k = 5")
# Reference line: scoring every sample identically yields the tpr = fpr diagonal.
random_probs = [0] * len(valid_y2)
p_fpr_random, p_tpr_random, _ = roc_curve(valid_y2, random_probs, pos_label = "Slytherin")
plt.plot(p_fpr_random, p_tpr_random, linestyle = "--", color = "black", label = "Random Force")
# Legend, title and axis labels
plt.legend()
plt.title("Sorting Hat ROC")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate");
# to save the plot
# plt.savefig("whatever_name",dpi = 300)
print(auc_score1, auc_score2)
0.9335188083231901 0.966962267431751
%%capture --no-display
new_padawan_pred2_k3 = knn2_k3.predict(new_padawan_norm)
new_padawan_pred2_k3
array(['Slytherin'], dtype=object)
new_padawan_pred_prob2_k3 = knn2_k3.predict_proba(new_padawan_norm)
new_padawan_pred_prob2_k3
array([[0.33333333, 0.66666667]])
pd.DataFrame(new_padawan_pred_prob2_k3,
columns = knn2_k3.classes_)
Not Slytherin | Slytherin | |
---|---|---|
0 | 0.333333 | 0.666667 |
%%capture --no-display
new_padawan_pred2_k5 = knn2_k5.predict(new_padawan_norm)
new_padawan_pred2_k5
array(['Slytherin'], dtype=object)
new_padawan_pred_prob2_k5 = knn2_k5.predict_proba(new_padawan_norm)
new_padawan_pred_prob2_k5
array([[0.2, 0.8]])
new_padawan_pred_prob2_k5_df = pd.DataFrame(new_padawan_pred_prob2_k5,
columns = knn2_k5.classes_)
new_padawan_pred_prob2_k5_df
Not Slytherin | Slytherin | |
---|---|---|
0 | 0.2 | 0.8 |
Change cutoff to 0.7
# Use >= so the boundary agrees with cutoff70 and the np.where rule, which
# both label a probability of exactly 0.7 as Slytherin.
if new_padawan_pred_prob2_k5[0][1] >= 0.7:
    print("A Slytherin, you are")
else:
    print("A Slytherin, you are not")
A Slytherin, you are
import numpy as np
new_padawan_pred_prob2_k5_df["New_Prediction_70"] = np.where(new_padawan_pred_prob2_k5_df["Slytherin"] >= 0.7,
"Slytherin", "Not Slytherin")
new_padawan_pred_prob2_k5_df.head()
Not Slytherin | Slytherin | New_Prediction_70 | |
---|---|---|---|
0 | 0.2 | 0.8 | Slytherin |
Still a Slytherin, you are :-)
Apply the optimal cutoff to the k = 5 model for illustration (note: the cutoff below is derived from the k = 3 ROC curve)
# Get the optimal index position in the array: the ROC point that maximises
# tpr - fpr.
# NOTE(review): tpr1 / fpr1 (and threshold1, used next) come from the k = 3
# validation ROC curve, although this section applies the cutoff to the k = 5
# probabilities — confirm whether tpr2 / fpr2 / threshold2 were intended.
optimal_index = np.argmax(tpr1 - fpr1)
print(optimal_index)
3
optimal_cutoff = threshold1[optimal_index]
optimal_cutoff
0.3333333333333333
def cutoff_optimal(prob):
    """Label a Slytherin-class probability using the global ``optimal_cutoff``.

    Args:
        prob: Predicted probability of the "Slytherin" class.

    Returns:
        "Slytherin" when ``prob`` is at least ``optimal_cutoff``, otherwise
        "Not Slytherin".
    """
    # One comparison with an unconditional fallback: the former if/elif pair
    # returned None when neither test held (e.g. a NaN probability).
    return "Slytherin" if prob >= optimal_cutoff else "Not Slytherin"
train_y2_pred_k5_df["pred_optimal"] = train_y2_pred_prob2_k5_df["Slytherin"].apply(cutoff_optimal)
train_y2_pred_k5_df
pred_50 | pred_70 | pred_optimal | |
---|---|---|---|
0 | Not Slytherin | Not Slytherin | Not Slytherin |
1 | Not Slytherin | Not Slytherin | Not Slytherin |
2 | Not Slytherin | Not Slytherin | Not Slytherin |
3 | Not Slytherin | Not Slytherin | Not Slytherin |
4 | Not Slytherin | Not Slytherin | Not Slytherin |
... | ... | ... | ... |
435 | Not Slytherin | Not Slytherin | Not Slytherin |
436 | Not Slytherin | Not Slytherin | Not Slytherin |
437 | Slytherin | Slytherin | Slytherin |
438 | Not Slytherin | Not Slytherin | Not Slytherin |
439 | Not Slytherin | Not Slytherin | Not Slytherin |
440 rows × 3 columns
confusion_matrix_train2_k5_optimal = confusion_matrix(train_y2, train_y2_pred_k5_df["pred_optimal"])
confusion_matrix_train2_k5_optimal
array([[289, 31], [ 1, 119]], dtype=int64)
confusion_matrix_train2_k5_optimal_display = ConfusionMatrixDisplay(confusion_matrix_train2_k5_optimal,
display_labels = knn2_k5.classes_)
confusion_matrix_train2_k5_optimal_display.plot()
plt.grid(False)
print(classification_report(train_y2, train_y2_pred_k5_df["pred_optimal"]))
precision recall f1-score support Not Slytherin 1.00 0.90 0.95 320 Slytherin 0.79 0.99 0.88 120 accuracy 0.93 440 macro avg 0.89 0.95 0.91 440 weighted avg 0.94 0.93 0.93 440
#def cutoff_optimal(prob):
# if prob < optimal_cutoff:
# return "Not Slytherin"
# elif optimal_cutoff <= prob:
# return "Slytherin"
valid_y2_pred_k5_df["pred_optimal"] = valid_y2_pred_prob2_k5_df["Slytherin"].apply(cutoff_optimal)
valid_y2_pred_k5_df
pred_50 | pred_70 | pred_optimal | |
---|---|---|---|
0 | Not Slytherin | Not Slytherin | Not Slytherin |
1 | Not Slytherin | Not Slytherin | Not Slytherin |
2 | Not Slytherin | Not Slytherin | Not Slytherin |
3 | Not Slytherin | Not Slytherin | Not Slytherin |
4 | Slytherin | Slytherin | Slytherin |
... | ... | ... | ... |
289 | Slytherin | Not Slytherin | Slytherin |
290 | Slytherin | Slytherin | Slytherin |
291 | Not Slytherin | Not Slytherin | Not Slytherin |
292 | Not Slytherin | Not Slytherin | Not Slytherin |
293 | Not Slytherin | Not Slytherin | Not Slytherin |
294 rows × 3 columns
confusion_matrix_valid2_k5_optimal = confusion_matrix(valid_y2, valid_y2_pred_k5_df["pred_optimal"])
confusion_matrix_valid2_k5_optimal
array([[195, 18], [ 6, 75]], dtype=int64)
confusion_matrix_valid2_k5_optimal_display = ConfusionMatrixDisplay(confusion_matrix_valid2_k5_optimal,
display_labels = knn2_k5.classes_)
confusion_matrix_valid2_k5_optimal_display.plot()
plt.grid(False)
print(classification_report(valid_y2, valid_y2_pred_k5_df["pred_optimal"]))
precision recall f1-score support Not Slytherin 0.97 0.92 0.94 213 Slytherin 0.81 0.93 0.86 81 accuracy 0.92 294 macro avg 0.89 0.92 0.90 294 weighted avg 0.93 0.92 0.92 294
%%capture --no-display
new_padawan_pred2_k5_v2 = knn2_k5.predict(new_padawan_norm)
new_padawan_pred2_k5_v2
array(['Slytherin'], dtype=object)
new_padawan_pred_prob2_k5_v2 = knn2_k5.predict_proba(new_padawan_norm)
new_padawan_pred_prob2_k5_v2
array([[0.2, 0.8]])
new_padawan_pred_prob2_k5_df_2 = pd.DataFrame(new_padawan_pred_prob2_k5_v2,
columns = knn2_k5.classes_)
new_padawan_pred_prob2_k5_df_2
Not Slytherin | Slytherin | |
---|---|---|
0 | 0.2 | 0.8 |
# Use >= so the boundary agrees with cutoff_optimal and the np.where rule,
# which both label a probability equal to the cutoff as Slytherin.
if new_padawan_pred_prob2_k5_v2[0][1] >= optimal_cutoff:
    print("A Slytherin, you are")
else:
    print("A Slytherin, you are not")
A Slytherin, you are
import numpy as np
new_padawan_pred_prob2_k5_df_2["New_Optimal_Prediction"] = np.where(new_padawan_pred_prob2_k5_df_2["Slytherin"] >= optimal_cutoff,
"Slytherin", "Not Slytherin")
new_padawan_pred_prob2_k5_df_2.head()
Not Slytherin | Slytherin | New_Optimal_Prediction | |
---|---|---|---|
0 | 0.2 | 0.8 | Slytherin |