import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
Sometimes we need to specify the encoding explicitly when reading the file, for example encoding = "ISO-8859-1" or encoding = "utf-8".
songs_df = pd.read_csv("american_idol_songs_v8.csv", encoding = "ISO-8859-1")
songs_df.head()
No | Song_Title | Artiste | Song_Avg_Rtg | Year | Avg_Song_Age | Advance | Bottom | Elimination | Expectation | Artiste_Rating | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Stuff Like That There | Bette Midler | 95.0 | 1991 | 11.0 | 1 | 0 | 0 | 20.5 | 55.5 |
1 | 2 | In A Dream | Badlands | 94.0 | 1991 | 14.0 | 1 | 0 | 0 | 24.4 | 94.0 |
2 | 3 | Build Me Up Buttercup | The Foundations | 93.0 | 1969 | 34.0 | 1 | 0 | 0 | 26.2 | 93.0 |
3 | 4 | Hemorrhage (In My Hands) | Fuel | 92.0 | 2000 | 6.0 | 1 | 0 | 0 | 29.4 | 92.0 |
4 | 5 | Solitaire | Carpenters | 92.0 | 1974 | 29.0 | 1 | 0 | 0 | 25.2 | 68.5 |
songs_df.dtypes
No int64 Song_Title object Artiste object Song_Avg_Rtg float64 Year int64 Avg_Song_Age float64 Advance int64 Bottom int64 Elimination int64 Expectation float64 Artiste_Rating float64 dtype: object
Set target variable as category
songs_df.Advance = songs_df.Advance.astype("category")
Filter the variables for the spell
pd.DataFrame(songs_df.columns.values, columns = ["Variables"])
Variables | |
---|---|
0 | No |
1 | Song_Title |
2 | Artiste |
3 | Song_Avg_Rtg |
4 | Year |
5 | Avg_Song_Age |
6 | Advance |
7 | Bottom |
8 | Elimination |
9 | Expectation |
10 | Artiste_Rating |
Filter data for spell
# Keep the four predictors plus the target ("Advance", placed last).
# Selecting by column name instead of by position keeps this correct even if
# the column order of the CSV changes; .copy() gives an independent frame.
model_columns = ["Song_Avg_Rtg", "Avg_Song_Age", "Expectation", "Artiste_Rating", "Advance"]
songs_df_filtered = songs_df[model_columns].copy()
songs_df_filtered.head()
Song_Avg_Rtg | Avg_Song_Age | Expectation | Artiste_Rating | Advance | |
---|---|---|---|---|---|
0 | 95.0 | 11.0 | 20.5 | 55.5 | 1 |
1 | 94.0 | 14.0 | 24.4 | 94.0 | 1 |
2 | 93.0 | 34.0 | 26.2 | 93.0 | 1 |
3 | 92.0 | 6.0 | 29.4 | 92.0 | 1 |
4 | 92.0 | 29.0 | 25.2 | 68.5 | 1 |
songs_df_filtered.dtypes
Song_Avg_Rtg float64 Avg_Song_Age float64 Expectation float64 Artiste_Rating float64 Advance category dtype: object
songs_df_filtered["Advance"].value_counts()
1 1339 0 287 Name: Advance, dtype: int64
songs_df_filtered["Advance"].value_counts()[1]
1339
import plotly.express as px
fig_1 = px.scatter_3d(songs_df_filtered,
x = "Song_Avg_Rtg",
y = "Avg_Song_Age",
z = "Artiste_Rating",
color = "Advance",
symbol = "Advance",
category_orders = {"Advance": [1,0]},
labels = {"Song_Avg_Rtg": "Average Song Rating",
"Avg_Song_Age": "Average Song Age",
"Artiste_Rating": "Artiste Rating"},
title = "Characteristics of Songs that Advance")
fig_1.show()
X = songs_df_filtered.drop(columns = ["Advance"])
y = songs_df_filtered["Advance"].astype("category")
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size = 0.3, random_state = 666)
train_X.head()
Song_Avg_Rtg | Avg_Song_Age | Expectation | Artiste_Rating | |
---|---|---|---|---|
699 | 50.0 | 43.0 | 13.000 | 50.0 |
916 | 42.0 | 29.0 | -14.800 | 42.0 |
1343 | 16.0 | 20.5 | -0.637 | 14.7 |
868 | 44.0 | 46.0 | -11.700 | 54.0 |
645 | 52.5 | 4.3 | 2.800 | 52.5 |
train_y.head()
699 1 916 1 1343 0 868 1 645 1 Name: Advance, dtype: category Categories (2, int64): [0, 1]
valid_X.head()
Song_Avg_Rtg | Avg_Song_Age | Expectation | Artiste_Rating | |
---|---|---|---|---|
67 | 82.0 | 51.0 | 18.3 | 82.0 |
348 | 65.0 | 2.0 | 29.5 | 48.0 |
1396 | 10.0 | 5.0 | -32.9 | 10.0 |
356 | 65.0 | 23.0 | 4.5 | 74.8 |
768 | 47.5 | 26.0 | -14.7 | 64.9 |
valid_y.head()
67 1 348 1 1396 0 356 1 768 0 Name: Advance, dtype: category Categories (2, int64): [0, 1]
len(train_X)
1138
len(train_y)
1138
len(valid_X)
488
len(valid_y)
488
# Baseline model: logistic regression with scikit-learn's default settings,
# trained on the training split.
logreg = LogisticRegression()
logreg_model = logreg.fit(X = train_X, y = train_y)
print("The intercept is", logreg_model.intercept_)
print("The coefficients are", logreg_model.coef_)
# One row per predictor, labelled with the training column names.
coef_df = pd.Series(logreg_model.coef_[0], index = train_X.columns, name = "Coefficient").to_frame()
coef_df
The intercept is [-0.55674839] The coefficients are [[ 0.0432835 0.0155768 0.00112524 -0.00348794]]
Coefficient | |
---|---|
Song_Avg_Rtg | 0.043284 |
Avg_Song_Age | 0.015577 |
Expectation | 0.001125 |
Artiste_Rating | -0.003488 |
train_y_pred = logreg_model.predict(train_X)
train_y_pred[0:9]
array([1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)
valid_y_pred = logreg_model.predict(valid_X)
valid_y_pred[0:9]
array([1, 1, 0, 1, 1, 1, 1, 1, 1], dtype=int64)
train_y_pred_prob = logreg_model.predict_proba(train_X)
train_y_pred_prob[0:9]
array([[0.10741132, 0.89258868], [0.17512397, 0.82487603], [0.40056235, 0.59943765], [0.13438348, 0.86561652], [0.1676116 , 0.8323884 ], [0.10212483, 0.89787517], [0.09516593, 0.90483407], [0.15540659, 0.84459341], [0.17910164, 0.82089836]])
valid_y_pred_prob = logreg_model.predict_proba(valid_X)
valid_y_pred_prob[0:9]
array([[0.02870674, 0.97129326], [0.10399469, 0.89600531], [0.52944816, 0.47055184], [0.08634425, 0.91365575], [0.15958667, 0.84041333], [0.30933336, 0.69066664], [0.12642922, 0.87357078], [0.04267718, 0.95732282], [0.09931677, 0.90068323]])
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix_train = confusion_matrix(train_y, train_y_pred)
confusion_matrix_train
array([[ 16, 184], [ 4, 934]], dtype=int64)
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
confusion_matrix_train_display = ConfusionMatrixDisplay(confusion_matrix_train, display_labels = logreg_model.classes_)
confusion_matrix_train_display.plot()
plt.grid(False)
accuracy_train = accuracy_score(train_y, train_y_pred)
accuracy_train
0.8347978910369068
from sklearn.metrics import classification_report
print(classification_report(train_y, train_y_pred))
precision recall f1-score support 0 0.80 0.08 0.15 200 1 0.84 1.00 0.91 938 accuracy 0.83 1138 macro avg 0.82 0.54 0.53 1138 weighted avg 0.83 0.83 0.77 1138
# from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix_valid = confusion_matrix(valid_y, valid_y_pred)
confusion_matrix_valid
array([[ 8, 79], [ 5, 396]], dtype=int64)
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt
confusion_matrix_valid_display = ConfusionMatrixDisplay(confusion_matrix_valid, display_labels = logreg_model.classes_)
confusion_matrix_valid_display.plot()
plt.grid(False)
# from sklearn.metrics import classification_report
print(classification_report(valid_y, valid_y_pred))
precision recall f1-score support 0 0.62 0.09 0.16 87 1 0.83 0.99 0.90 401 accuracy 0.83 488 macro avg 0.72 0.54 0.53 488 weighted avg 0.79 0.83 0.77 488
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# False-positive rate, true-positive rate and cutoffs for the "advance"
# class (label 1), using the second column of predict_proba.
fpr1, tpr1, thresh1 = roc_curve(valid_y, valid_y_pred_prob[:, 1], pos_label = 1)

# Matplotlib 3.6 renamed the "seaborn" style sheet to "seaborn-v0_8" and later
# dropped the old name, so use whichever one this installation provides.
roc_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(roc_style)
plt.plot(fpr1, tpr1, linestyle = '-', color = "red", label = "Songs to Advance")

# roc curve for tpr = fpr (random line): giving every sample the same score
# leaves roc_curve with just the (0, 0) and (1, 1) end points.
random_probs = [0] * len(valid_y)
p_fpr, p_tpr, _ = roc_curve(valid_y, random_probs, pos_label = 1)
plt.plot(p_fpr, p_tpr, linestyle = '--', color = "green", label = "Random Advancement")

# If desired
plt.legend()
plt.title("Songs to Advance ROC")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# to save the plot
# plt.savefig("whatever_name",dpi = 300)
Text(0, 0.5, 'True Positive Rate')
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(valid_y, valid_y_pred_prob[:,1])
auc
0.7319918594318802
new_songs_df = pd.read_csv("new_songs_v2.csv", encoding = "ISO-8859-1")
new_songs_df
No | Song_Title | Artiste | Song_Avg_Rtg | Year | Avg_Song_Age | Advance | Bottom | Elimination | Expectation | Artiste_Rating | Comments | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6661 | Walk With Me In Hell | Lamb of God | 96 | 2004 | 19 | NaN | NaN | NaN | 42 | 90 | Classic song from a legendary band |
1 | 6662 | The Watcher | Arch Enemy | 90 | 2022 | 1 | NaN | NaN | NaN | 36 | 90 | Fantastic song from a legendary band |
2 | 6663 | Frantic | Metallica | 6 | 2003 | 20 | NaN | NaN | NaN | 2 | 100 | Zzz song from a legendary band |
# Take exactly the columns the model was trained on, in the same order.
# Selecting by name (instead of by position) raises a KeyError if the new
# file is missing a predictor, rather than silently feeding the wrong column.
new_songs_df_filtered = new_songs_df.loc[:, train_X.columns]
new_songs_df_filtered
Song_Avg_Rtg | Avg_Song_Age | Expectation | Artiste_Rating | |
---|---|---|---|---|
0 | 96 | 19 | 42 | 90 |
1 | 90 | 1 | 36 | 90 |
2 | 6 | 20 | 2 | 100 |
new_songs_prediction = logreg_model.predict(new_songs_df_filtered)
new_songs_prediction
array([1, 1, 0], dtype=int64)
new_songs_prediction_prob = logreg_model.predict_proba(new_songs_df_filtered)
new_songs_prediction_prob
array([[0.02588734, 0.97411266], [0.04390043, 0.95609957], [0.58225918, 0.41774082]])
new_songs_prediction_df = pd.DataFrame(new_songs_prediction, columns = ["Prediction"])
new_songs_prediction_df
Prediction | |
---|---|
0 | 1 |
1 | 1 |
2 | 0 |
pd.concat((new_songs_df, new_songs_prediction_df), axis = 1)
No | Song_Title | Artiste | Song_Avg_Rtg | Year | Avg_Song_Age | Advance | Bottom | Elimination | Expectation | Artiste_Rating | Comments | Prediction | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6661 | Walk With Me In Hell | Lamb of God | 96 | 2004 | 19 | NaN | NaN | NaN | 42 | 90 | Classic song from a legendary band | 1 |
1 | 6662 | The Watcher | Arch Enemy | 90 | 2022 | 1 | NaN | NaN | NaN | 36 | 90 | Fantastic song from a legendary band | 1 |
2 | 6663 | Frantic | Metallica | 6 | 2003 | 20 | NaN | NaN | NaN | 2 | 100 | Zzz song from a legendary band | 0 |
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# variance_inflation_factor regresses one column on the others exactly as
# given and does not add an intercept itself. Without a constant column the
# R-squared is uncentred and the VIFs are inflated, so add one explicitly.
vif_exog = add_constant(train_X, has_constant = "add")

# Column 0 is the added constant; report VIFs for the real predictors only.
vif_df = pd.DataFrame({
    "features": train_X.columns,
    "VIF": [variance_inflation_factor(vif_exog.values, i) for i in range(1, vif_exog.shape[1])],
})
print(vif_df)
features VIF 0 Song_Avg_Rtg 16.706265 1 Avg_Song_Age 2.483684 2 Expectation 1.350685 3 Artiste_Rating 16.696281
train_X.corr()
Song_Avg_Rtg | Avg_Song_Age | Expectation | Artiste_Rating | |
---|---|---|---|---|
Song_Avg_Rtg | 1.000000 | 0.106034 | 0.710542 | 0.598677 |
Avg_Song_Age | 0.106034 | 1.000000 | 0.053974 | 0.176771 |
Expectation | 0.710542 | 0.053974 | 1.000000 | 0.400120 |
Artiste_Rating | 0.598677 | 0.176771 | 0.400120 | 1.000000 |
train_X_1a = train_X.drop(columns = ["Artiste_Rating"])
train_X_1a.head()
Song_Avg_Rtg | Avg_Song_Age | Expectation | |
---|---|---|---|
699 | 50.0 | 43.0 | 13.000 |
916 | 42.0 | 29.0 | -14.800 |
1343 | 16.0 | 20.5 | -0.637 |
868 | 44.0 | 46.0 | -11.700 |
645 | 52.5 | 4.3 | 2.800 |
train_y_1a = train_y.copy()
train_y_1a.head()
699 1 916 1 1343 0 868 1 645 1 Name: Advance, dtype: category Categories (2, int64): [0, 1]
valid_X_1a = valid_X.drop(columns = ["Artiste_Rating"])
valid_X_1a.head()
Song_Avg_Rtg | Avg_Song_Age | Expectation | |
---|---|---|---|
67 | 82.0 | 51.0 | 18.3 |
348 | 65.0 | 2.0 | 29.5 |
1396 | 10.0 | 5.0 | -32.9 |
356 | 65.0 | 23.0 | 4.5 |
768 | 47.5 | 26.0 | -14.7 |
valid_y_1a = valid_y.copy()
valid_y_1a.head()
67 1 348 1 1396 0 356 1 768 0 Name: Advance, dtype: category Categories (2, int64): [0, 1]
logreg = LogisticRegression()
# fit the model with data
logreg_model_1a = logreg.fit(train_X_1a, train_y_1a)
print("The intercept is", logreg_model_1a.intercept_)
print("The coefficients are", logreg_model_1a.coef_)
coef_df_1a = pd.DataFrame({"Coefficient": logreg_model_1a.coef_[0]}, index = train_X_1a.columns)
coef_df_1a
The intercept is [-0.64169234] The coefficients are [[0.04145236 0.01510463 0.00136229]]
Coefficient | |
---|---|
Song_Avg_Rtg | 0.041452 |
Avg_Song_Age | 0.015105 |
Expectation | 0.001362 |
train_y_1a_pred = logreg_model_1a.predict(train_X_1a)
train_y_1a_pred[0:9]
array([1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)
valid_y_1a_pred = logreg_model_1a.predict(valid_X_1a)
valid_y_1a_pred[0:9]
array([1, 1, 0, 1, 1, 1, 1, 1, 1], dtype=int64)
train_y_pred_1a_prob = logreg_model_1a.predict_proba(train_X_1a)
train_y_pred_1a_prob[0:9]
array([[0.10927828, 0.89072172], [0.17987727, 0.82012273], [0.41816214, 0.58183786], [0.13457655, 0.86542345], [0.16751801, 0.83248199], [0.09937026, 0.90062974], [0.10617309, 0.89382691], [0.15548014, 0.84451986], [0.1807682 , 0.8192318 ]])
valid_y_pred_1a_prob = logreg_model_1a.predict_proba(valid_X_1a)
valid_y_pred_1a_prob[0:9]
array([[0.02785027, 0.97214973], [0.10687109, 0.89312891], [0.54895882, 0.45104118], [0.08269752, 0.91730248], [0.15446664, 0.84553336], [0.30828082, 0.69171918], [0.12509386, 0.87490614], [0.03705495, 0.96294505], [0.10126542, 0.89873458]])
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# variance_inflation_factor does not add an intercept on its own; without a
# constant column the VIFs are based on an uncentred R-squared and come out
# inflated, so add the constant explicitly before computing them.
vif_exog_1a = add_constant(train_X_1a, has_constant = "add")

# Column 0 is the added constant; report VIFs for the real predictors only.
vif_df_1a = pd.DataFrame({
    "features": train_X_1a.columns,
    "VIF": [variance_inflation_factor(vif_exog_1a.values, i) for i in range(1, vif_exog_1a.shape[1])],
})
print(vif_df_1a)
features VIF 0 Song_Avg_Rtg 2.484737 1 Avg_Song_Age 2.364796 2 Expectation 1.109415
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix_train_1a = confusion_matrix(train_y_1a, train_y_1a_pred)
confusion_matrix_train_1a
array([[ 14, 186], [ 8, 930]], dtype=int64)
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
confusion_matrix_train_1a_display = ConfusionMatrixDisplay(confusion_matrix_train_1a, display_labels = logreg_model_1a.classes_)
confusion_matrix_train_1a_display.plot()
plt.grid(False)
accuracy_train_1a = accuracy_score(train_y_1a, train_y_1a_pred)
accuracy_train_1a
0.8295254833040422
from sklearn.metrics import classification_report
print(classification_report(train_y_1a, train_y_1a_pred))
precision recall f1-score support 0 0.64 0.07 0.13 200 1 0.83 0.99 0.91 938 accuracy 0.83 1138 macro avg 0.73 0.53 0.52 1138 weighted avg 0.80 0.83 0.77 1138
# from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix_valid_1a = confusion_matrix(valid_y_1a, valid_y_1a_pred)
confusion_matrix_valid_1a
array([[ 8, 79], [ 5, 396]], dtype=int64)
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt
confusion_matrix_valid_1a_display = ConfusionMatrixDisplay(confusion_matrix_valid_1a, display_labels = logreg_model_1a.classes_)
confusion_matrix_valid_1a_display.plot()
plt.grid(False)
# from sklearn.metrics import classification_report
print(classification_report(valid_y_1a, valid_y_1a_pred))
precision recall f1-score support 0 0.62 0.09 0.16 87 1 0.83 0.99 0.90 401 accuracy 0.83 488 macro avg 0.72 0.54 0.53 488 weighted avg 0.79 0.83 0.77 488
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# ROC points for the model fitted without Artiste_Rating, scored on the
# class-1 ("advance") probabilities of the validation set.
fpr1a, tpr1a, thresh1a = roc_curve(valid_y_1a, valid_y_pred_1a_prob[:, 1], pos_label = 1)

# Matplotlib 3.6 renamed the "seaborn" style sheet to "seaborn-v0_8" and later
# dropped the old name, so use whichever one this installation provides.
roc_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(roc_style)
plt.plot(fpr1a, tpr1a, linestyle = '-', color = "red", label = "Songs to Advance")

# roc curve for tpr = fpr (random line): giving every sample the same score
# leaves roc_curve with just the (0, 0) and (1, 1) end points.
random_probs = [0] * len(valid_y_1a)
p_fpr, p_tpr, _ = roc_curve(valid_y_1a, random_probs, pos_label = 1)
plt.plot(p_fpr, p_tpr, linestyle = '--', color = "green", label = "Random Advancement")

# If desired
plt.legend()
plt.title("Songs to Advance ROC (Modified Variables)")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# to save the plot
# plt.savefig("whatever_name",dpi = 300)
Text(0, 0.5, 'True Positive Rate')
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(valid_y_1a, valid_y_pred_1a_prob[:,1])
auc
0.7337403617393299
# If not already
# new_songs_df = pd.read_csv("new_songs_v2.csv", encoding = "ISO-8859-1")
new_songs_df
No | Song_Title | Artiste | Song_Avg_Rtg | Year | Avg_Song_Age | Advance | Bottom | Elimination | Expectation | Artiste_Rating | Comments | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6661 | Walk With Me In Hell | Lamb of God | 96 | 2004 | 19 | NaN | NaN | NaN | 42 | 90 | Classic song from a legendary band |
1 | 6662 | The Watcher | Arch Enemy | 90 | 2022 | 1 | NaN | NaN | NaN | 36 | 90 | Fantastic song from a legendary band |
2 | 6663 | Frantic | Metallica | 6 | 2003 | 20 | NaN | NaN | NaN | 2 | 100 | Zzz song from a legendary band |
# Use exactly the predictors of the reduced model (no Artiste_Rating), in
# training order. Name-based selection fails loudly if a column is missing
# instead of silently picking whatever sits at a given position.
new_songs_df_filtered_1a = new_songs_df.loc[:, train_X_1a.columns]
new_songs_df_filtered_1a
Song_Avg_Rtg | Avg_Song_Age | Expectation | |
---|---|---|---|
0 | 96 | 19 | 42 |
1 | 90 | 1 | 36 |
2 | 6 | 20 | 2 |
new_songs_prediction_1a = logreg_model_1a.predict(new_songs_df_filtered_1a)
new_songs_prediction_1a
array([1, 1, 0], dtype=int64)
new_songs_prediction_prob_1a = logreg_model_1a.predict_proba(new_songs_df_filtered_1a)
new_songs_prediction_prob_1a
array([[0.02455597, 0.97544403], [0.04096668, 0.95903332], [0.52202598, 0.47797402]])
new_songs_prediction_df_1a = pd.DataFrame(new_songs_prediction_1a, columns = ["Prediction"])
new_songs_prediction_df_1a
Prediction | |
---|---|
0 | 1 |
1 | 1 |
2 | 0 |
new_songs_prediction_prob_df_1a = pd.DataFrame(new_songs_prediction_prob_1a, columns = ["Fail Probability", "Advance Probability"])
new_songs_prediction_prob_df_1a
Fail Probability | Advance Probability | |
---|---|---|
0 | 0.024556 | 0.975444 |
1 | 0.040967 | 0.959033 |
2 | 0.522026 | 0.477974 |
pd.concat((new_songs_df, new_songs_prediction_df_1a, new_songs_prediction_prob_df_1a), axis = 1)
No | Song_Title | Artiste | Song_Avg_Rtg | Year | Avg_Song_Age | Advance | Bottom | Elimination | Expectation | Artiste_Rating | Comments | Prediction | Fail Probability | Advance Probability | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6661 | Walk With Me In Hell | Lamb of God | 96 | 2004 | 19 | NaN | NaN | NaN | 42 | 90 | Classic song from a legendary band | 1 | 0.024556 | 0.975444 |
1 | 6662 | The Watcher | Arch Enemy | 90 | 2022 | 1 | NaN | NaN | NaN | 36 | 90 | Fantastic song from a legendary band | 1 | 0.040967 | 0.959033 |
2 | 6663 | Frantic | Metallica | 6 | 2003 | 20 | NaN | NaN | NaN | 2 | 100 | Zzz song from a legendary band | 0 | 0.522026 | 0.477974 |
train_X.head()
Song_Avg_Rtg | Avg_Song_Age | Expectation | Artiste_Rating | |
---|---|---|---|---|
699 | 50.0 | 43.0 | 13.000 | 50.0 |
916 | 42.0 | 29.0 | -14.800 | 42.0 |
1343 | 16.0 | 20.5 | -0.637 | 14.7 |
868 | 44.0 | 46.0 | -11.700 | 54.0 |
645 | 52.5 | 4.3 | 2.800 | 52.5 |
import pandas as pd
train_df = pd.concat((train_X, train_y), axis = 1)
train_df.head()
train_df["Advance"].value_counts()
1 938 0 200 Name: Advance, dtype: int64
# Class count
# value_counts() orders its result by frequency, not by label, so unpacking
# it positionally gave count_class_0 the count of the majority class (1).
# Look each count up by its class label instead.
class_counts = train_df["Advance"].value_counts()
count_class_0 = class_counts.loc[0]
count_class_1 = class_counts.loc[1]
print(count_class_0)
print(count_class_1)
print(len(train_df))
938 200 1138
# Filter
df_class_0 = train_df[train_df["Advance"] == 0]
df_class_1 = train_df[train_df["Advance"] == 1]
df_class_0["Advance"].value_counts()
0 200 1 0 Name: Advance, dtype: int64
df_class_1["Advance"].value_counts()
1 938 0 0 Name: Advance, dtype: int64
len(train_df)*0.5
569.0
# Oversample the minority class (0) up to half the size of the training set.
# Sampling with replacement is required because there are fewer class-0 rows
# than requested; the fixed seed (same value as the train/validation split)
# makes the balanced set, and every result derived from it, reproducible.
n_per_class = len(train_df) // 2
df_class_0_over = df_class_0.sample(n_per_class, replace = True, random_state = 666)
df_class_0_over
df_class_0_over["Advance"].value_counts()
0 569 1 0 Name: Advance, dtype: int64
# Undersample the majority class (1) down to half the size of the training set.
# Draw without replacement so no row is duplicated while others are dropped;
# fall back to replacement only if class 1 has too few rows to draw from.
# The fixed seed (same value as the train/validation split) keeps the
# balanced set reproducible.
n_per_class = len(train_df) // 2
df_class_1_under = df_class_1.sample(
    n_per_class,
    replace = n_per_class > len(df_class_1),
    random_state = 666,
)
df_class_1_under
df_class_1_under["Advance"].value_counts()
1 569 0 0 Name: Advance, dtype: int64
train_df_balanced = pd.concat([df_class_0_over, df_class_1_under], axis = 0)
train_df_balanced.head()
Song_Avg_Rtg | Avg_Song_Age | Expectation | Artiste_Rating | Advance | |
---|---|---|---|---|---|
873 | 43.00 | 16.0 | -17.000 | 43.00 | 0 |
1442 | 49.77 | 10.0 | -0.637 | 75.20 | 0 |
419 | 62.00 | 2.0 | -0.637 | 48.10 | 0 |
1418 | 4.00 | 32.0 | -51.400 | 49.00 | 0 |
1535 | 49.77 | 0.0 | -0.637 | 50.88 | 0 |
train_df_balanced["Advance"].value_counts()
0 569 1 569 Name: Advance, dtype: int64
train_df_balanced.head()
Song_Avg_Rtg | Avg_Song_Age | Expectation | Artiste_Rating | Advance | |
---|---|---|---|---|---|
873 | 43.00 | 16.0 | -17.000 | 43.00 | 0 |
1442 | 49.77 | 10.0 | -0.637 | 75.20 | 0 |
419 | 62.00 | 2.0 | -0.637 | 48.10 | 0 |
1418 | 4.00 | 32.0 | -51.400 | 49.00 | 0 |
1535 | 49.77 | 0.0 | -0.637 | 50.88 | 0 |
train_X_2 = train_df_balanced.drop(columns = ["Advance"])
train_y_2 = train_df_balanced["Advance"].astype("category")
logreg_2 = LogisticRegression()
# fit the model with data
logreg_2_model = logreg_2.fit(train_X_2, train_y_2)
print("The intercept is", logreg_2_model.intercept_)
print("The coefficients are", logreg_2_model.coef_)
coef_df_2 = pd.DataFrame({"Coefficient": logreg_2_model.coef_[0]}, index = train_X_2.columns)
coef_df_2
The intercept is [-1.87418204] The coefficients are [[ 0.0335975 0.02094234 0.00442436 -0.00086124]]
Coefficient | |
---|---|
Song_Avg_Rtg | 0.033597 |
Avg_Song_Age | 0.020942 |
Expectation | 0.004424 |
Artiste_Rating | -0.000861 |
train_y_pred_2 = logreg_2_model.predict(train_X_2)
train_y_pred_2[0:9]
array([0, 0, 1, 0, 0, 0, 1, 0, 0], dtype=int64)
valid_y_pred_2 = logreg_2_model.predict(valid_X)
valid_y_pred_2[0:9]
array([1, 1, 0, 1, 1, 0, 1, 1, 1], dtype=int64)
train_y_pred_prob_2 = logreg_2_model.predict_proba(train_X_2)
train_y_pred_prob_2[0:9]
array([[0.55148188, 0.44851812], [0.51504425, 0.48495575], [0.44855688, 0.55144312], [0.79236661, 0.20763339], [0.56184988, 0.43815012], [0.54848761, 0.45151239], [0.47668553, 0.52331447], [0.5710294 , 0.4289706 ], [0.5152582 , 0.4847418 ]])
valid_y_pred_prob_2 = logreg_2_model.predict_proba(valid_X)
valid_y_pred_prob_2[0:9]
array([[0.12355196, 0.87644804], [0.39156972, 0.60843028], [0.83028793, 0.16971207], [0.32151514, 0.67848486], [0.46374698, 0.53625302], [0.65058663, 0.34941337], [0.3772033 , 0.6227967 ], [0.18367156, 0.81632844], [0.37432433, 0.62567567]])
#from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix_train_2 = confusion_matrix(train_y_2, train_y_pred_2)
confusion_matrix_train_2
array([[407, 162], [219, 350]], dtype=int64)
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
#import matplotlib.pyplot as plt
confusion_matrix_train_display_2 = ConfusionMatrixDisplay(confusion_matrix_train_2, display_labels = logreg_2_model.classes_)
confusion_matrix_train_display_2.plot()
plt.grid(False)
accuracy_train_2 = accuracy_score(train_y_2, train_y_pred_2)
accuracy_train_2
0.6652021089630932
#from sklearn.metrics import classification_report
print(classification_report(train_y_2, train_y_pred_2))
precision recall f1-score support 0 0.65 0.72 0.68 569 1 0.68 0.62 0.65 569 accuracy 0.67 1138 macro avg 0.67 0.67 0.66 1138 weighted avg 0.67 0.67 0.66 1138
# from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix_valid_2 = confusion_matrix(valid_y, valid_y_pred_2)
confusion_matrix_valid_2
array([[ 67, 20], [148, 253]], dtype=int64)
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt
confusion_matrix_valid_display_2 = ConfusionMatrixDisplay(confusion_matrix_valid_2, display_labels = logreg_2_model.classes_)
confusion_matrix_valid_display_2.plot()
plt.grid(False)
# from sklearn.metrics import classification_report
print(classification_report(valid_y, valid_y_pred_2))
precision recall f1-score support 0 0.31 0.77 0.44 87 1 0.93 0.63 0.75 401 accuracy 0.66 488 macro avg 0.62 0.70 0.60 488 weighted avg 0.82 0.66 0.70 488
#from sklearn import metrics
#from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# ROC points for the model trained on the balanced data, scored on the
# class-1 ("advance") probabilities of the untouched validation set.
fpr1, tpr1, thresh1 = roc_curve(valid_y, valid_y_pred_prob_2[:, 1], pos_label = 1)

# Matplotlib 3.6 renamed the "seaborn" style sheet to "seaborn-v0_8" and later
# dropped the old name, so use whichever one this installation provides.
roc_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(roc_style)
plt.plot(fpr1, tpr1, linestyle = '-', color = "red", label = "Songs to Advance")

# roc curve for tpr = fpr (random line): giving every sample the same score
# leaves roc_curve with just the (0, 0) and (1, 1) end points.
random_probs = [0] * len(valid_y)
p_fpr, p_tpr, _ = roc_curve(valid_y, random_probs, pos_label = 1)
plt.plot(p_fpr, p_tpr, linestyle = '--', color = "green", label = "Random Advancement")

# If desired
plt.legend()
plt.title("Songs to Advance ROC (Balanced)")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# to save the plot
# plt.savefig("whatever_name",dpi = 300)
Text(0, 0.5, 'True Positive Rate')
#from sklearn.metrics import roc_auc_score
auc_2 = roc_auc_score(valid_y, valid_y_pred_prob_2[:,1])
auc_2
0.736664086909164
new_songs_prediction_2 = logreg_2_model.predict(new_songs_df_filtered)
new_songs_prediction_2
array([1, 1, 0], dtype=int64)
new_songs_prediction_prob_2 = logreg_2_model.predict_proba(new_songs_df_filtered)
new_songs_prediction_prob_2
array([[0.1350072 , 0.8649928 ], [0.22230228, 0.77769772], [0.79101029, 0.20898971]])
Be careful about the order
new_songs_prediction_prob_df_2 = pd.DataFrame(new_songs_prediction_prob_2, columns = ["Fail Probability", "Advance Probability"])
new_songs_prediction_prob_df_2
Fail Probability | Advance Probability | |
---|---|---|
0 | 0.135007 | 0.864993 |
1 | 0.222302 | 0.777698 |
2 | 0.791010 | 0.208990 |
new_songs_prediction_df_2 = pd.DataFrame(new_songs_prediction_2, columns = ["Prediction"])
new_songs_prediction_df_2
Prediction | |
---|---|
0 | 1 |
1 | 1 |
2 | 0 |
pd.concat((new_songs_df, new_songs_prediction_prob_df_2, new_songs_prediction_df_2), axis = 1)
No | Song_Title | Artiste | Song_Avg_Rtg | Year | Avg_Song_Age | Advance | Bottom | Elimination | Expectation | Artiste_Rating | Comments | Fail Probability | Advance Probability | Prediction | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6661 | Walk With Me In Hell | Lamb of God | 96 | 2004 | 19 | NaN | NaN | NaN | 42 | 90 | Classic song from a legendary band | 0.135007 | 0.864993 | 1 |
1 | 6662 | The Watcher | Arch Enemy | 90 | 2022 | 1 | NaN | NaN | NaN | 36 | 90 | Fantastic song from a legendary band | 0.222302 | 0.777698 | 1 |
2 | 6663 | Frantic | Metallica | 6 | 2003 | 20 | NaN | NaN | NaN | 2 | 100 | Zzz song from a legendary band | 0.791010 | 0.208990 | 0 |
from statsmodels.stats.outliers_influence import variance_inflation_factor
Create VIF dataframe, then calculate VIF for each variable
from statsmodels.tools.tools import add_constant

# variance_inflation_factor does not add an intercept on its own; without a
# constant column the VIFs are based on an uncentred R-squared and come out
# inflated, so add the constant explicitly before computing them.
vif_exog_2 = add_constant(train_X_2, has_constant = "add")

# Column 0 is the added constant; report VIFs for the real predictors only.
vif_data = pd.DataFrame({
    "feature": train_X_2.columns,
    "VIF": [variance_inflation_factor(vif_exog_2.values, i) for i in range(1, vif_exog_2.shape[1])],
})
vif_data
feature | VIF | |
---|---|---|
0 | Song_Avg_Rtg | 13.957249 |
1 | Avg_Song_Age | 2.285344 |
2 | Expectation | 1.353987 |
3 | Artiste_Rating | 14.626662 |
train_X_2.corr()
Song_Avg_Rtg | Avg_Song_Age | Expectation | Artiste_Rating | |
---|---|---|---|---|
Song_Avg_Rtg | 1.000000 | 0.136549 | 0.657376 | 0.583989 |
Avg_Song_Age | 0.136549 | 1.000000 | 0.081988 | 0.178075 |
Expectation | 0.657376 | 0.081988 | 1.000000 | 0.331929 |
Artiste_Rating | 0.583989 | 0.178075 | 0.331929 | 1.000000 |
Remove artiste rating
train_X_2.head()
Song_Avg_Rtg | Avg_Song_Age | Expectation | Artiste_Rating | |
---|---|---|---|---|
873 | 43.00 | 16.0 | -17.000 | 43.00 |
1442 | 49.77 | 10.0 | -0.637 | 75.20 |
419 | 62.00 | 2.0 | -0.637 | 48.10 |
1418 | 4.00 | 32.0 | -51.400 | 49.00 |
1535 | 49.77 | 0.0 | -0.637 | 50.88 |
# Drop Artiste_Rating by name rather than slicing the first three columns by
# position, so the result does not depend on column order.
train_X_3 = train_X_2.drop(columns = ["Artiste_Rating"])
train_X_3.head()
Song_Avg_Rtg | Avg_Song_Age | Expectation | |
---|---|---|---|
873 | 43.00 | 16.0 | -17.000 |
1442 | 49.77 | 10.0 | -0.637 |
419 | 62.00 | 2.0 | -0.637 |
1418 | 4.00 | 32.0 | -51.400 |
1535 | 49.77 | 0.0 | -0.637 |
from statsmodels.tools.tools import add_constant

# variance_inflation_factor does not add an intercept on its own; without a
# constant column the VIFs are based on an uncentred R-squared and come out
# inflated, so add the constant explicitly before computing them.
vif_exog_3 = add_constant(train_X_3, has_constant = "add")

# Column 0 is the added constant; report VIFs for the real predictors only.
vif_data_3 = pd.DataFrame({
    "feature": train_X_3.columns,
    "VIF": [variance_inflation_factor(vif_exog_3.values, i) for i in range(1, vif_exog_3.shape[1])],
})
vif_data_3
feature | VIF | |
---|---|---|
0 | Song_Avg_Rtg | 2.167299 |
1 | Avg_Song_Age | 2.182946 |
2 | Expectation | 1.044139 |
# Targets are unchanged from the balanced-training / original-validation sets.
train_y_3 = train_y_2
valid_y_3 = valid_y
# Give the validation set exactly the predictors (and order) used for
# training, selected by name instead of by position.
valid_X_3 = valid_X.loc[:, train_X_3.columns]

# NOTE(review): in scikit-learn C is the inverse of the regularisation
# strength, so C = 666 applies only a very weak L2 penalty (weaker than the
# default C = 1.0) - confirm that this is the intended setting.
logreg_3 = LogisticRegression(penalty = "l2", C = 666)
# fit the model with data
logreg_3_model = logreg_3.fit(train_X_3, train_y_3)
print("The intercept is", logreg_3_model.intercept_)
print("The coefficients are", logreg_3_model.coef_)
coef_df_3 = pd.DataFrame({"Coefficient": logreg_3_model.coef_[0]}, index = train_X_3.columns)
coef_df_3
The intercept is [-1.89646513] The coefficients are [[0.0331629 0.02086211 0.00450225]]
Coefficient | |
---|---|
Song_Avg_Rtg | 0.033163 |
Avg_Song_Age | 0.020862 |
Expectation | 0.004502 |
train_y_pred_3 = logreg_3_model.predict(train_X_3)
train_y_pred_3[0:9]
array([0, 0, 1, 0, 0, 0, 1, 0, 0], dtype=int64)
valid_y_pred_3 = logreg_3_model.predict(valid_X_3)
valid_y_pred_3[0:9]
array([1, 1, 0, 1, 1, 0, 1, 1, 1], dtype=int64)
train_y_pred_prob_3 = logreg_3_model.predict_proba(train_X_3)
train_y_pred_prob_3[0:9]
array([[0.55310029, 0.44689971], [0.51004731, 0.48995269], [0.45053962, 0.54946038], [0.79045028, 0.20954972], [0.561885 , 0.438115 ], [0.54642327, 0.45357673], [0.48260282, 0.51739718], [0.56999484, 0.43000516], [0.51455868, 0.48544132]])
valid_y_pred_prob_3 = logreg_3_model.predict_proba(valid_X_3)
valid_y_pred_prob_3[0:9]
array([[0.12246934, 0.87753066], [0.39325181, 0.60674819], [0.83322383, 0.16677617], [0.31881743, 0.68118257], [0.46132636, 0.53867364], [0.6503909 , 0.3496091 ], [0.37628605, 0.62371395], [0.17806823, 0.82193177], [0.3754707 , 0.6245293 ]])
#from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix_train_3 = confusion_matrix(train_y_3, train_y_pred_3)
confusion_matrix_train_3
array([[407, 162], [218, 351]], dtype=int64)
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
#import matplotlib.pyplot as plt
confusion_matrix_train_display_3 = ConfusionMatrixDisplay(confusion_matrix_train_3,
display_labels = logreg_3_model.classes_)
confusion_matrix_train_display_3.plot()
plt.grid(False)
accuracy_train_3 = accuracy_score(train_y_3, train_y_pred_3)
accuracy_train_3
0.6660808435852372
#from sklearn.metrics import classification_report
print(classification_report(train_y_3, train_y_pred_3))
precision recall f1-score support 0 0.65 0.72 0.68 569 1 0.68 0.62 0.65 569 accuracy 0.67 1138 macro avg 0.67 0.67 0.67 1138 weighted avg 0.67 0.67 0.67 1138
# from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix_valid_3 = confusion_matrix(valid_y_3, valid_y_pred_3)
confusion_matrix_valid_3
array([[ 66, 21], [149, 252]], dtype=int64)
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt
confusion_matrix_valid_display_3 = ConfusionMatrixDisplay(confusion_matrix_valid_3,
display_labels = logreg_3_model.classes_)
confusion_matrix_valid_display_3.plot()
plt.grid(False)
# from sklearn.metrics import classification_report
print(classification_report(valid_y_3, valid_y_pred_3))
precision recall f1-score support 0 0.31 0.76 0.44 87 1 0.92 0.63 0.75 401 accuracy 0.65 488 macro avg 0.62 0.69 0.59 488 weighted avg 0.81 0.65 0.69 488
#from sklearn import metrics
#from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

#Get the fpr, tpr and thresholds (i.e. cutoffs)
# These three arrays are reused further down to pick the optimal cutoff.
fpr1, tpr1, thresh1 = roc_curve(valid_y_3, valid_y_pred_prob_3[:, 1], pos_label = 1)

# Matplotlib 3.6 renamed the "seaborn" style sheet to "seaborn-v0_8" and later
# dropped the old name, so use whichever one this installation provides.
roc_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(roc_style)
plt.plot(fpr1, tpr1, linestyle = '-', color = "blue", label = "Songs to Advance")

# roc curve for tpr = fpr (random line): giving every sample the same score
# leaves roc_curve with just the (0, 0) and (1, 1) end points.
# valid_y_3 is used for both curves so they are built from the same target.
random_probs = [0] * len(valid_y_3)
p_fpr, p_tpr, _ = roc_curve(valid_y_3, random_probs, pos_label = 1)
plt.plot(p_fpr, p_tpr, linestyle = '--', color = "green", label = "Random Advancement")

# If desired
plt.legend()
plt.title("Songs to Advance ROC (Balanced) with Regularisation")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# to save the plot
# plt.savefig("whatever_name",dpi = 300)
Text(0, 0.5, 'True Positive Rate')
#from sklearn.metrics import roc_auc_score
auc_3 = roc_auc_score(valid_y_3, valid_y_pred_prob_3[:,1])
auc_3
0.737409350187749
# Position of the ROC point where the gap between the true-positive rate and
# the false-positive rate (Youden's J statistic) is largest.
# NOTE(review): fpr1/tpr1 come from the validation set, so a cutoff chosen
# here and then scored on that same set will look optimistic.
optimal_index = (tpr1 - fpr1).argmax()
print(optimal_index)
50
optimal_cutoff = thresh1[optimal_index]
optimal_cutoff
0.48131717567164345
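# Thresholds (cutoffs) are probabilities, so they normally lie in [0, 1].
# However, scikit-learn prepends one extra threshold as the FIRST element of the array (here the highest score + 1; np.inf in newer releases) so that the curve starts at (0, 0), where no sample is predicted positive.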
thresh1[0:10]
array([1.9208441 , 0.9208441 , 0.87259103, 0.87114783, 0.81221349, 0.81179973, 0.777871 , 0.77731994, 0.76689277, 0.76640133])
New confusion matrix for training set
# Despite its name, train_y_pred_prob_3a holds a boolean mask (True where the
# class-1 probability is above the optimal cutoff), not probabilities.
train_y_pred_prob_3a = np.greater(train_y_pred_prob_3[:, 1], optimal_cutoff)
confusion_matrix_train_3a = confusion_matrix(y_true = train_y_3, y_pred = train_y_pred_prob_3a)
confusion_matrix_train_3a
array([[370, 199], [202, 367]], dtype=int64)
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
#import matplotlib.pyplot as plt
# Plot the training confusion matrix, labelling the axes with the model's class labels.
train_class_labels_3 = logreg_3_model.classes_
confusion_matrix_train_display_3a = ConfusionMatrixDisplay(
    confusion_matrix = confusion_matrix_train_3a,
    display_labels = train_class_labels_3,
)
confusion_matrix_train_display_3a.plot()
# Switch the axes grid off for this plot.
plt.grid(False)
def optimum_cutoff_pred_fn(prob, cutoff = None):
    """Convert a predicted probability into a 0/1 class label.

    Parameters
    ----------
    prob : float
        Predicted probability of the positive class.
    cutoff : float, optional
        Probability threshold. When omitted, the notebook-level
        ``optimal_cutoff`` is looked up at call time, so existing
        one-argument calls (e.g. ``Series.apply``) behave as before.

    Returns
    -------
    int
        1 when ``prob`` is strictly greater than the cutoff, otherwise 0.
        Every input yields an int; a value that fails the comparison
        (including NaN) maps to 0 instead of falling through to ``None``.
    """
    if cutoff is None:
        cutoff = optimal_cutoff
    return 1 if prob > cutoff else 0
# Derive the training labels with optimum_cutoff_pred_fn from the class-1 probabilities.
# The frame is built from train_y_pred_prob_3[:, 1] (the actual probabilities) rather than
# from train_y_pred_prob_3a, which already holds the True/False result of the cutoff
# comparison; feeding booleans through the cutoff function only works by coincidence.
train_y_pred_prob_3a_df = pd.DataFrame(train_y_pred_prob_3[:, 1], columns = ["train_prob_3a"])
train_y_pred_prob_3a_df.head()
train_y_pred_3a = train_y_pred_prob_3a_df["train_prob_3a"].apply(optimum_cutoff_pred_fn)
print(classification_report(train_y_3, train_y_pred_3a))
precision recall f1-score support 0 0.65 0.65 0.65 569 1 0.65 0.64 0.65 569 accuracy 0.65 1138 macro avg 0.65 0.65 0.65 1138 weighted avg 0.65 0.65 0.65 1138
New confusion matrix for validation set
# Class-1 probabilities for the validation rows, compared against the optimal cutoff.
# NOTE: despite "prob" in its name, valid_y_pred_prob_3a holds True/False decisions.
valid_advance_prob_3 = valid_y_pred_prob_3[:, 1]
valid_y_pred_prob_3a = valid_advance_prob_3 > optimal_cutoff
confusion_matrix_valid_3a = confusion_matrix(valid_y_3, valid_y_pred_prob_3a)
confusion_matrix_valid_3a
array([[ 65, 22], [133, 268]], dtype=int64)
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
#import matplotlib.pyplot as plt
# Plot the validation confusion matrix, labelling the axes with the model's class labels.
valid_class_labels_3 = logreg_3_model.classes_
confusion_matrix_valid_display_3a = ConfusionMatrixDisplay(
    confusion_matrix = confusion_matrix_valid_3a,
    display_labels = valid_class_labels_3,
)
confusion_matrix_valid_display_3a.plot()
# Switch the axes grid off for this plot.
plt.grid(False)
# Derive the validation labels with optimum_cutoff_pred_fn from the class-1 probabilities.
# The frame is built from valid_y_pred_prob_3[:, 1] (the actual probabilities) rather than
# from valid_y_pred_prob_3a, which already holds the True/False result of the cutoff
# comparison; feeding booleans through the cutoff function only works by coincidence.
valid_y_pred_prob_3a_df = pd.DataFrame(valid_y_pred_prob_3[:, 1], columns = ["valid_prob_3a"])
valid_y_pred_prob_3a_df.head()
valid_y_pred_3a = valid_y_pred_prob_3a_df["valid_prob_3a"].apply(optimum_cutoff_pred_fn)
print(classification_report(valid_y_3, valid_y_pred_3a))
precision recall f1-score support 0 0.33 0.75 0.46 87 1 0.92 0.67 0.78 401 accuracy 0.68 488 macro avg 0.63 0.71 0.62 488 weighted avg 0.82 0.68 0.72 488
# Keep the first three feature columns of the new songs, then predict their class.
new_songs_df_filtered_3 = new_songs_df_filtered.iloc[:, :3]
new_songs_prediction_3 = logreg_3_model.predict(new_songs_df_filtered_3)
new_songs_prediction_3
array([1, 1, 0], dtype=int64)
# Class probabilities for the new songs: one row per song, one column per class.
new_songs_prediction_prob_3 = logreg_3_model.predict_proba(new_songs_df_filtered_3)
new_songs_prediction_prob_3
array([[0.13323568, 0.86676432], [0.2190624 , 0.7809376 ], [0.78095512, 0.21904488]])
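Be careful about the column order: predict_proba returns one column per class in the order of logreg_3_model.classes_, so here column 0 is class 0 (fail) and column 1 is class 1 (advance).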
# Name the two probability columns: column 0 first, column 1 second.
prob_column_names_3 = ["Fail Probability", "Advance Probability"]
new_songs_prediction_prob_df_3 = pd.DataFrame(new_songs_prediction_prob_3, columns = prob_column_names_3)
new_songs_prediction_prob_df_3
Fail Probability | Advance Probability | |
---|---|---|
0 | 0.133236 | 0.866764 |
1 | 0.219062 | 0.780938 |
2 | 0.780955 | 0.219045 |
# Wrap the predicted labels in a single-column frame so they can be joined to the song data.
new_songs_prediction_df_3 = pd.DataFrame({"Prediction": new_songs_prediction_3})
new_songs_prediction_df_3
Prediction | |
---|---|
0 | 1 |
1 | 1 |
2 | 0 |
Prediction using the optimum cutoff. There is no change in the predictions.
# Place the song details, the class probabilities and the predicted label side by side.
frames_to_join = [new_songs_df, new_songs_prediction_prob_df_3, new_songs_prediction_df_3]
new_songs_with_optimum_prediction = pd.concat(frames_to_join, axis = "columns")
new_songs_with_optimum_prediction
No | Song_Title | Artiste | Song_Avg_Rtg | Year | Avg_Song_Age | Advance | Bottom | Elimination | Expectation | Artiste_Rating | Comments | Fail Probability | Advance Probability | Prediction | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6661 | Walk With Me In Hell | Lamb of God | 96 | 2004 | 19 | NaN | NaN | NaN | 42 | 90 | Classic song from a legendary band | 0.133236 | 0.866764 | 1 |
1 | 6662 | The Watcher | Arch Enemy | 90 | 2022 | 1 | NaN | NaN | NaN | 36 | 90 | Fantastic song from a legendary band | 0.219062 | 0.780938 | 1 |
2 | 6663 | Frantic | Metallica | 6 | 2003 | 20 | NaN | NaN | NaN | 2 | 100 | Zzz song from a legendary band | 0.780955 | 0.219045 | 0 |
# Label each new song by applying the optimal-cutoff rule to its "Advance Probability".
advance_prob_new_songs = new_songs_with_optimum_prediction["Advance Probability"]
new_songs_with_optimum_prediction["Optimum_Prediction"] = advance_prob_new_songs.apply(optimum_cutoff_pred_fn)
new_songs_with_optimum_prediction
No | Song_Title | Artiste | Song_Avg_Rtg | Year | Avg_Song_Age | Advance | Bottom | Elimination | Expectation | Artiste_Rating | Comments | Fail Probability | Advance Probability | Prediction | Optimum_Prediction | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6661 | Walk With Me In Hell | Lamb of God | 96 | 2004 | 19 | NaN | NaN | NaN | 42 | 90 | Classic song from a legendary band | 0.133236 | 0.866764 | 1 | 1 |
1 | 6662 | The Watcher | Arch Enemy | 90 | 2022 | 1 | NaN | NaN | NaN | 36 | 90 | Fantastic song from a legendary band | 0.219062 | 0.780938 | 1 | 1 |
2 | 6663 | Frantic | Metallica | 6 | 2003 | 20 | NaN | NaN | NaN | 2 | 100 | Zzz song from a legendary band | 0.780955 | 0.219045 | 0 | 0 |