import pandas as pd
Sometimes, we may need to use a specific encoding, for example:
# albums = pd.read_csv("jm_top_albums_v5.csv", encoding = "ISO-8859-1")
# albums = pd.read_csv("jm_top_albums_v5.csv", encoding = "utf-8")
albums = pd.read_csv("jm_top_albums_v5.csv")
albums.head()
| | position | album_id | artist | name | year | genre | is_metal | is_rock | rating | num_ratings | num_reviews | force |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Album45 | Radiohead | Ok Computer | 1997 | Alternative Rock | No | Yes | 4.23 | 60527 | 1583 | 60 |
1 | 2 | Album46 | Radiohead | Kid A | 2000 | Art Rock | No | Yes | 4.23 | 50279 | 714 | 60 |
2 | 3 | Album976 | Pink Floyd | The Dark Side Of The Moon | 1973 | Art Rock | No | Yes | 4.23 | 50633 | 1524 | 90 |
3 | 4 | Album974 | Pink Floyd | Wish You Were Here | 1975 | Progressive Rock | No | Yes | 4.29 | 41760 | 939 | 90 |
4 | 5 | Album2328 | King Crimson | In The Court Of The Crimson King | 1969 | Progressive Rock | No | Yes | 4.31 | 37873 | 828 | 90 |
Variable names. The bare list is hard to read without an index, so also put the names in a DataFrame.
albums.columns.values.tolist()
['position', 'album_id', 'artist', 'name', 'year', 'genre', 'is_metal', 'is_rock', 'rating', 'num_ratings', 'num_reviews', 'force']
albums_variables_df = pd.DataFrame(albums.columns.values, columns = ["Variables"])
albums_variables_df
# or use
# print(albums_variables_df.to_string())
| | Variables |
|---|---|
0 | position |
1 | album_id |
2 | artist |
3 | name |
4 | year |
5 | genre |
6 | is_metal |
7 | is_rock |
8 | rating |
9 | num_ratings |
10 | num_reviews |
11 | force |
print(albums.dtypes.to_string())
position         int64
album_id        object
artist          object
name            object
year             int64
genre           object
is_metal        object
is_rock         object
rating         float64
num_ratings      int64
num_reviews      int64
force            int64
Filter for the required variables.
import numpy as np
albums_2 = albums.iloc[:, np.r_[4, 6:12]]
albums_2.head()
# Or simply (if no ranges are used)
# albums.iloc[:, [4, 6, 7, 8, 9, 10, 11]].head()
| | year | is_metal | is_rock | rating | num_ratings | num_reviews | force |
|---|---|---|---|---|---|---|---|
0 | 1997 | No | Yes | 4.23 | 60527 | 1583 | 60 |
1 | 2000 | No | Yes | 4.23 | 50279 | 714 | 60 |
2 | 1973 | No | Yes | 4.23 | 50633 | 1524 | 90 |
3 | 1975 | No | Yes | 4.29 | 41760 | 939 | 90 |
4 | 1969 | No | Yes | 4.31 | 37873 | 828 | 90 |
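For reference, np.r_ concatenates scalar indices and slices into one index array, which is what allows mixing the single position 4 with the range 6:12 above:
np.r_[4, 6:12]
# array([ 4,  6,  7,  8,  9, 10, 11])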
from sklearn.model_selection import train_test_split
Define predictors and target variable.
Creating dummies applies to categorical variables only.
If a custom separator is desired (the prefix defaults to the column name; prefix_sep sets the separator between prefix and category):
X = pd.get_dummies(X, prefix_sep = 'dummy', drop_first = True)
Or, since it is just "Yes" vs. "No", a simple recoding will work too.
import numpy as np
albums["is_metal"] = np.where(albums["is_metal"] == "Yes", 1, 0)
X = albums_2.drop(columns = ["rating"])
# Get dummies for the categorical variables
X = pd.get_dummies(X, drop_first = True)
y = albums_2["rating"]
X.head()
| | year | num_ratings | num_reviews | force | is_metal_Yes | is_rock_Yes |
|---|---|---|---|---|---|---|
0 | 1997 | 60527 | 1583 | 60 | 0 | 1 |
1 | 2000 | 50279 | 714 | 60 | 0 | 1 |
2 | 1973 | 50633 | 1524 | 90 | 0 | 1 |
3 | 1975 | 41760 | 939 | 90 | 0 | 1 |
4 | 1969 | 37873 | 828 | 90 | 0 | 1 |
y
0      4.23
1      4.23
2      4.23
3      4.29
4      4.31
       ...
495    3.84
496    3.81
497    3.85
498    3.91
499    3.86
Name: rating, Length: 500, dtype: float64
Split the dataset: 70% training, 30% validation (random_state makes the split reproducible).
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size = 0.3, random_state = 666)
Check.
train_X.head()
| | year | num_ratings | num_reviews | force | is_metal_Yes | is_rock_Yes |
|---|---|---|---|---|---|---|
453 | 1986 | 10621 | 268 | 100 | 1 | 0 |
108 | 1975 | 16241 | 298 | 10 | 0 | 1 |
257 | 2003 | 8700 | 84 | 11 | 0 | 0 |
82 | 1988 | 15617 | 324 | 9 | 0 | 1 |
116 | 1968 | 13902 | 417 | 70 | 0 | 0 |
len(train_X)
350
train_y.head()
453    3.83
108    4.02
257    3.99
82     4.07
116    4.09
Name: rating, dtype: float64
len(train_y)
350
valid_X.head()
| | year | num_ratings | num_reviews | force | is_metal_Yes | is_rock_Yes |
|---|---|---|---|---|---|---|
291 | 1997 | 11571 | 174 | 9 | 0 | 1 |
412 | 1991 | 10223 | 97 | 11 | 0 | 1 |
356 | 1997 | 4050 | 94 | 60 | 0 | 0 |
258 | 1972 | 9585 | 242 | 70 | 0 | 1 |
97 | 2016 | 22408 | 214 | 11 | 0 | 0 |
len(valid_X)
150
valid_y.head()
291    3.91
412    3.86
356    4.05
258    3.97
97     4.01
Name: rating, dtype: float64
len(valid_y)
150
import sklearn
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(train_X, train_y)
LinearRegression()
Training set prediction
train_y_pred = model.predict(train_X)
train_y_pred[:5]
array([3.92031984, 3.96376431, 3.906202 , 3.92928306, 4.06906089])
train_y_pred_df = pd.DataFrame(train_y_pred, columns = ["Training_Prediction"])
train_y_pred_df.head()
| | Training_Prediction |
|---|---|
0 | 3.920320 |
1 | 3.963764 |
2 | 3.906202 |
3 | 3.929283 |
4 | 4.069061 |
print("model intercept: ", model.intercept_)
print("model coefficients: ", model.coef_)
print("Model score: ", model.score(train_X, train_y))
model intercept:  9.0589641002778
model coefficients:  [-2.59060358e-03  2.77330684e-06  6.02367981e-05  6.39019796e-04
 -1.03206280e-01 -4.81397498e-02]
Model score:  0.2211235025610777
Coefficients, easier to read.
print(pd.DataFrame({"Predictor": train_X.columns, "Coefficient": model.coef_}))
      Predictor  Coefficient
0          year    -0.002591
1   num_ratings     0.000003
2   num_reviews     0.000060
3         force     0.000639
4  is_metal_Yes    -0.103206
5   is_rock_Yes    -0.048140
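As a quick sanity check (a sketch using only the fitted attributes), the first training prediction can be reproduced from the intercept and coefficients:
# Intercept plus the dot product of the first row with the coefficients:
print(model.intercept_ + train_X.iloc[0] @ model.coef_)
# should match train_y_pred[0], i.e. about 3.9203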
Validation set prediction
valid_y_pred = model.predict(valid_X)
valid_y_pred[:5]
array([3.88571131, 3.89415632, 3.94076408, 3.98804492, 3.91837143])
valid_y_pred_df = pd.DataFrame(valid_y_pred, columns = ["Validation_Prediction"])
valid_y_pred_df.head()
| | Validation_Prediction |
|---|---|
0 | 3.885711 |
1 | 3.894156 |
2 | 3.940764 |
3 | 3.988045 |
4 | 3.918371 |
Get the RMSE for the training set.
mse_train = sklearn.metrics.mean_squared_error(train_y, train_y_pred)
mse_train
0.012971631724489941
import math
rmse_train = math.sqrt(mse_train)
rmse_train
0.11389307145076887
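The same value can be obtained in one step; which helper is available depends on the scikit-learn version, so both options are shown as comments:
# Older scikit-learn: squared = False makes mean_squared_error return the RMSE.
# rmse_train = sklearn.metrics.mean_squared_error(train_y, train_y_pred, squared = False)
# scikit-learn 1.4+:
# from sklearn.metrics import root_mean_squared_error
# rmse_train = root_mean_squared_error(train_y, train_y_pred)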
If using the dmba package:
pip install dmba
or
conda install -c conda-forge dmba
Then load the library
import dmba
from dmba import regressionSummary
regressionSummary(train_y, train_y_pred)
Regression statistics

                      Mean Error (ME) : -0.0000
       Root Mean Squared Error (RMSE) : 0.1139
            Mean Absolute Error (MAE) : 0.0872
          Mean Percentage Error (MPE) : -0.0824
Mean Absolute Percentage Error (MAPE) : 2.1991
train_y.describe()
count    350.000000
mean       3.964000
std        0.129236
min        3.530000
25%        3.880000
50%        3.950000
75%        4.040000
max        4.350000
Name: rating, dtype: float64
from numpy import std
std(train_y)
0.12905148474266273
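For context, std(train_y) above (numpy's default, ddof = 0) is exactly the RMSE of a naive model that always predicts the training mean, so the model's 0.1139 is only a modest improvement over 0.1291. A quick check of that equivalence:
# RMSE of always predicting the mean = population standard deviation:
print(np.sqrt(np.mean((train_y - train_y.mean()) ** 2)))
# ≈ 0.1291, matching std(train_y) above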
Get the RMSE for the validation set.
mse_valid = sklearn.metrics.mean_squared_error(valid_y, valid_y_pred)
mse_valid
0.013787692242144434
import math
rmse_valid = math.sqrt(mse_valid)
rmse_valid
0.11742100426305523
# import dmba
# from dmba import regressionSummary
regressionSummary(valid_y, valid_y_pred)
Regression statistics

                      Mean Error (ME) : 0.0003
       Root Mean Squared Error (RMSE) : 0.1174
            Mean Absolute Error (MAE) : 0.0903
          Mean Percentage Error (MPE) : -0.0794
Mean Absolute Percentage Error (MAPE) : 2.2878
Residuals.
train_residuals = train_y - train_y_pred
train_residuals
453   -0.090320
108    0.056236
257    0.083798
82     0.140717
116    0.020939
         ...
318   -0.067202
414   -0.153011
429   -0.143237
386   -0.060268
236   -0.032025
Name: rating, Length: 350, dtype: float64
type(train_residuals)
pandas.core.series.Series
import matplotlib.pyplot as plt
plt.hist(train_residuals, bins = 30)
plt.title("Residuals for Training")
plt.show()
Use a DataFrame if preferred. (Name the column Training_Residuals; these are residuals of rating, not of force.)
train_residuals_df = train_residuals.to_frame(name = "Training_Residuals")
train_residuals_df
| | Training_Residuals |
|---|---|
453 | -0.090320 |
108 | 0.056236 |
257 | 0.083798 |
82 | 0.140717 |
116 | 0.020939 |
... | ... |
318 | -0.067202 |
414 | -0.153011 |
429 | -0.143237 |
386 | -0.060268 |
236 | -0.032025 |
350 rows × 1 columns
import matplotlib.pyplot as plt
plt.hist(train_residuals_df["Training_Residuals"], bins = 30)
plt.title("Residuals for Training")
plt.show()
Normality
import numpy as np
from scipy.stats import shapiro
shapiro(train_y)
ShapiroResult(statistic=0.989881694316864, pvalue=0.0161635410040617)
shapiro(train_residuals)
ShapiroResult(statistic=0.9871808290481567, pvalue=0.0034690035972744226)
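Both p-values are below 0.05, so normality is rejected at the 5% level for the target and for the residuals. A Q-Q plot complements the test; a minimal sketch using scipy's probplot:
from scipy import stats
# Points close to the reference line indicate approximately normal residuals.
stats.probplot(train_residuals, dist = "norm", plot = plt)
plt.title("Q-Q Plot of Training Residuals")
plt.show()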
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_df = pd.DataFrame()
vif_df["features"] = train_X.columns
vif_df["VIF"] = [variance_inflation_factor(train_X.values, i) for i in range(train_X.shape[1])]
print(vif_df)
       features        VIF
0          year   5.214599
1   num_ratings  15.843626
2   num_reviews  13.049311
3         force   5.031296
4  is_metal_Yes   2.265050
5   is_rock_Yes   2.337720
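A common rule of thumb flags VIF values above 10 (some use 5 instead; the threshold is a judgment call). Here num_ratings and num_reviews both exceed 10:
# Predictors above the (assumed) VIF > 10 threshold:
print(vif_df[vif_df["VIF"] > 10])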
train_X.corr()
| | year | num_ratings | num_reviews | force | is_metal_Yes | is_rock_Yes |
|---|---|---|---|---|---|---|
year | 1.000000 | 0.081395 | -0.203606 | -0.217776 | 0.096229 | -0.169221 |
num_ratings | 0.081395 | 1.000000 | 0.844890 | 0.123587 | -0.063767 | 0.191545 |
num_reviews | -0.203606 | 0.844890 | 1.000000 | 0.329668 | 0.032718 | 0.287826 |
force | -0.217776 | 0.123587 | 0.329668 | 1.000000 | 0.564572 | 0.288430 |
is_metal_Yes | 0.096229 | -0.063767 | 0.032718 | 0.564572 | 1.000000 | -0.247685 |
is_rock_Yes | -0.169221 | 0.191545 | 0.287826 | 0.288430 | -0.247685 | 1.000000 |
Change variables. num_ratings and num_reviews are highly correlated (r ≈ 0.84) and both have VIF above 10, so drop num_ratings.
train_X2 = train_X.drop(columns = ["num_ratings"])
train_X2.head()
| | year | num_reviews | force | is_metal_Yes | is_rock_Yes |
|---|---|---|---|---|---|
453 | 1986 | 268 | 100 | 1 | 0 |
108 | 1975 | 298 | 10 | 0 | 1 |
257 | 2003 | 84 | 11 | 0 | 0 |
82 | 1988 | 324 | 9 | 0 | 1 |
116 | 1968 | 417 | 70 | 0 | 0 |
train_y2 = train_y.copy()
train_y.head()
453    3.83
108    4.02
257    3.99
82     4.07
116    4.09
Name: rating, dtype: float64
valid_X2 = valid_X.drop(columns = ["num_ratings"])
valid_X2.head()
| | year | num_reviews | force | is_metal_Yes | is_rock_Yes |
|---|---|---|---|---|---|
291 | 1997 | 174 | 9 | 0 | 1 |
412 | 1991 | 97 | 11 | 0 | 1 |
356 | 1997 | 94 | 60 | 0 | 0 |
258 | 1972 | 242 | 70 | 0 | 1 |
97 | 2016 | 214 | 11 | 0 | 0 |
valid_y2 = valid_y.copy()
valid_y.head()
291    3.91
412    3.86
356    4.05
258    3.97
97     4.01
Name: rating, dtype: float64
model2 = LinearRegression()
model2.fit(train_X2, train_y2)
LinearRegression()
train_y_pred2 = model2.predict(train_X2)
train_y_pred2[:5]
array([3.92064404, 3.95644437, 3.90923205, 3.93135157, 4.07746071])
train_y_pred2_df = pd.DataFrame(train_y_pred2, columns = ["Training_Prediction"])
train_y_pred2_df.head()
| | Training_Prediction |
|---|---|
0 | 3.920644 |
1 | 3.956444 |
2 | 3.909232 |
3 | 3.931352 |
4 | 4.077461 |
print("model intercept 2: ", model2.intercept_)
print("model coefficients 2: ", model2.coef_)
print("Model score 2: ", model2.score(train_X2, train_y2))
model intercept 2:  8.329702647212667
model coefficients 2:  [-0.00221718  0.00016643  0.0005967  -0.1100098  -0.04988872]
Model score 2:  0.21504791523265543
Coefficients, easier to read.
print(pd.DataFrame({"Predictor": train_X2.columns, "Coefficient": model2.coef_}))
      Predictor  Coefficient
0          year    -0.002217
1   num_reviews     0.000166
2         force     0.000597
3  is_metal_Yes    -0.110010
4   is_rock_Yes    -0.049889
Get the RMSE for the training set.
mse_train2 = sklearn.metrics.mean_squared_error(train_y2, train_y_pred2)
mse_train2
0.013072816291739581
import math
rmse_train2 = math.sqrt(mse_train2)
rmse_train2
0.11433641717204358
train_y2.describe()
count    350.000000
mean       3.964000
std        0.129236
min        3.530000
25%        3.880000
50%        3.950000
75%        4.040000
max        4.350000
Name: rating, dtype: float64
# if not already
# import dmba
# from dmba import regressionSummary
regressionSummary(train_y2, train_y_pred2)
Regression statistics

                      Mean Error (ME) : -0.0000
       Root Mean Squared Error (RMSE) : 0.1143
            Mean Absolute Error (MAE) : 0.0882
          Mean Percentage Error (MPE) : -0.0830
Mean Absolute Percentage Error (MAPE) : 2.2231
from numpy import std
std(train_y2)
0.12905148474266273
Residuals.
train_residuals2 = train_y2 - train_y_pred2
train_residuals2
453   -0.090644
108    0.063556
257    0.080768
82     0.138648
116    0.012539
         ...
318   -0.064393
414   -0.151840
429   -0.134696
386   -0.072665
236   -0.041754
Name: rating, Length: 350, dtype: float64
Normality
import numpy as np
from scipy.stats import shapiro
shapiro(train_y2)
ShapiroResult(statistic=0.989881694316864, pvalue=0.0161635410040617)
shapiro(train_residuals2)
ShapiroResult(statistic=0.9879220724105835, pvalue=0.005249316804111004)
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_df2 = pd.DataFrame()
vif_df2["features"] = train_X2.columns
vif_df2["VIF"] = [variance_inflation_factor(train_X2.values, i) for i in range(train_X2.shape[1])]
print(vif_df2)
       features       VIF
0          year  3.430132
1   num_reviews  3.535408
2         force  4.802715
3  is_metal_Yes  2.264798
4   is_rock_Yes  2.334343
train_X2.corr()
| | year | num_reviews | force | is_metal_Yes | is_rock_Yes |
|---|---|---|---|---|---|
year | 1.000000 | -0.203606 | -0.217776 | 0.096229 | -0.169221 |
num_reviews | -0.203606 | 1.000000 | 0.329668 | 0.032718 | 0.287826 |
force | -0.217776 | 0.329668 | 1.000000 | 0.564572 | 0.288430 |
is_metal_Yes | 0.096229 | 0.032718 | 0.564572 | 1.000000 | -0.247685 |
is_rock_Yes | -0.169221 | 0.287826 | 0.288430 | -0.247685 | 1.000000 |
Scikit-learn does not provide traditional regression model summaries.
Use the statsmodels package if desired.
conda install -c conda-forge statsmodels
or
pip install statsmodels
import statsmodels.api as sm
model_2_statsmodels = sm.OLS(train_y2, train_X2)
results = model_2_statsmodels.fit()
print(results.summary())
                                 OLS Regression Results
=======================================================================================
Dep. Variable:                 rating   R-squared (uncentered):                   0.999
Model:                            OLS   Adj. R-squared (uncentered):              0.999
Method:                 Least Squares   F-statistic:                          6.453e+04
Date:                Wed, 28 Feb 2024   Prob (F-statistic):                        0.00
Time:                        09:44:13   Log-Likelihood:                          218.47
No. Observations:                 350   AIC:                                     -426.9
Df Residuals:                     345   BIC:                                     -407.7
Df Model:                           5
Covariance Type:            nonrobust
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
year             0.0020    6.5e-06    301.427      0.000       0.002       0.002
num_reviews      0.0002   3.85e-05      5.275      0.000       0.000       0.000
force            0.0013      0.000      4.444      0.000       0.001       0.002
is_metal_Yes    -0.1813      0.033     -5.536      0.000      -0.246      -0.117
is_rock_Yes     -0.0571      0.018     -3.160      0.002      -0.093      -0.022
==============================================================================
Omnibus:                        3.818   Durbin-Watson:                   1.936
Prob(Omnibus):                  0.148   Jarque-Bera (JB):                3.540
Skew:                          -0.212   Prob(JB):                        0.170
Kurtosis:                       3.251   Cond. No.                     9.86e+03
==============================================================================

Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[3] The condition number is large, 9.86e+03. This might indicate that there are strong multicollinearity or other numerical problems.
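Note that sm.OLS does not add an intercept by default, which is why the summary reports an inflated uncentered R² and coefficients that differ from model2. To match scikit-learn's fit (which includes an intercept), add a constant column first; a minimal sketch:
# Add an explicit intercept column before fitting:
train_X2_const = sm.add_constant(train_X2)
results_const = sm.OLS(train_y2, train_X2_const).fit()
print(results_const.summary())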
valid_X2
| | year | num_reviews | force | is_metal_Yes | is_rock_Yes |
|---|---|---|---|---|---|
291 | 1997 | 174 | 9 | 0 | 1 |
412 | 1991 | 97 | 11 | 0 | 1 |
356 | 1997 | 94 | 60 | 0 | 0 |
258 | 1972 | 242 | 70 | 0 | 1 |
97 | 2016 | 214 | 11 | 0 | 0 |
... | ... | ... | ... | ... | ... |
41 | 1977 | 376 | 60 | 0 | 0 |
67 | 1972 | 441 | 90 | 0 | 0 |
154 | 1959 | 164 | 9 | 0 | 0 |
315 | 1990 | 131 | 60 | 0 | 0 |
196 | 1996 | 209 | 80 | 1 | 0 |
150 rows × 5 columns
valid_y_pred2 = model2.predict(valid_X2)
valid_y_pred2
array([3.88643199, 3.88811313, 3.95343762, 3.98957749, 3.90204498, 3.93702845, 4.16201299, 4.07955924, 3.98366014, 3.88593194, 3.88174593, 3.92942977, 3.93824385, 3.9146902 , 3.95129391, 3.88850851, 3.89602184, 4.03487042, 3.98817641, 3.89119847, 3.96076585, 3.91285366, 3.95479662, 3.94255539, 4.01915237, 3.91046796, 3.98149919, 3.90144816, 3.92543437, 3.85510706, 3.92150031, 4.04618787, 4.05244501, 4.00111492, 4.10811332, 3.86545484, 3.93190761, 3.92458663, 3.8932298 , 4.00973397, 3.97285354, 3.88640998, 4.15136763, 3.95854298, 3.96558195, 4.00560861, 3.911413 , 3.98242183, 3.88482624, 3.9566539 , 3.97127899, 4.03504188, 3.87937202, 3.99612162, 3.98843263, 3.95768924, 4.07873634, 3.99006073, 3.92056682, 3.90500086, 3.91708796, 3.94103623, 3.93094205, 3.89860996, 4.01725391, 3.96793143, 3.89035901, 3.96404992, 3.99228522, 3.90879207, 3.93679875, 3.97516655, 3.97029194, 3.94378894, 3.94811301, 3.95924411, 3.91662413, 3.87095722, 3.94865432, 3.88276819, 4.03478793, 4.04837947, 3.88119758, 3.88680611, 3.95026827, 3.94579375, 3.95448894, 4.00588241, 3.95039848, 3.965022 , 3.88136921, 3.88908077, 3.95261064, 4.02158694, 3.88254243, 4.00725897, 3.93455112, 3.88182315, 3.9394139 , 3.89951098, 3.98180065, 3.97080279, 4.11603111, 3.9632385 , 3.9938984 , 3.8897526 , 4.05822459, 3.9226758 , 4.00109124, 3.88570693, 3.96627731, 4.06224209, 4.02111675, 3.88346456, 3.98666119, 3.99818707, 3.93915267, 3.9923533 , 3.98333552, 4.04363275, 3.89939753, 3.99593182, 3.95667331, 3.93030393, 4.00250496, 3.94558238, 3.99645479, 3.93412687, 4.01120877, 3.87316044, 3.89264356, 4.11086368, 4.10569271, 4.02547429, 3.9593898 , 3.989411 , 4.03902198, 4.00909793, 3.9848291 , 4.14552375, 3.95217385, 3.94301108, 3.92643975, 4.01646634, 3.86834931, 4.04471535, 4.08452032, 4.01890928, 3.97511591, 3.87671874])
valid_y2_pred_df = pd.DataFrame(valid_y_pred2, columns = ["Validation_Prediction"])
valid_y2_pred_df
| | Validation_Prediction |
|---|---|
0 | 3.886432 |
1 | 3.888113 |
2 | 3.953438 |
3 | 3.989577 |
4 | 3.902045 |
... | ... |
145 | 4.044715 |
146 | 4.084520 |
147 | 4.018909 |
148 | 3.975116 |
149 | 3.876719 |
150 rows × 1 columns
Get the RMSE for the validation set.
mse_valid_2 = sklearn.metrics.mean_squared_error(valid_y2, valid_y_pred2)
mse_valid_2
0.014244465795913257
# As before
# import math
rmse_valid_2 = math.sqrt(mse_valid_2)
rmse_valid_2
0.11935018138198725
# As before:
# If using the dmba package:
# pip install dmba
# Done earlier. Just for illustration
# import dmba
# from dmba import regressionSummary
regressionSummary(valid_y2, valid_y_pred2)
Regression statistics

                      Mean Error (ME) : 0.0023
       Root Mean Squared Error (RMSE) : 0.1194
            Mean Absolute Error (MAE) : 0.0923
          Mean Percentage Error (MPE) : -0.0308
Mean Absolute Percentage Error (MAPE) : 2.3377
new_album_df = pd.read_csv("cod.csv")
new_album_df
| | album_id | artist | name | year | genre | is_metal | is_rock | num_ratings | num_reviews | force |
|---|---|---|---|---|---|---|---|---|---|---|
0 | Album666 | Children of Bodom | A Chapter Called Children of Bodom-Final Show ... | 2023 | Melodic Death metal | Yes | No | 6 | 6 | 100 |
Filter variables
new_album_df = new_album_df.loc[:, ["year", "num_reviews", "force", "is_metal", "is_rock"]]
new_album_df
| | year | num_reviews | force | is_metal | is_rock |
|---|---|---|---|---|---|
0 | 2023 | 6 | 100 | Yes | No |
new_album_df.dtypes
year            int64
num_reviews     int64
force           int64
is_metal       object
is_rock        object
dtype: object
Get dummies to match the training set. get_dummies() alone is not enough here: with only one record, it can only create dummy columns for the categories that record actually contains.
But a workaround could be:
First, get whatever dummies the new df yields.
new_album_df_dummies = pd.get_dummies(new_album_df)
Then add the additional dummy columns from the training set, giving these a default value of 0.
new_album_df_dummies_full = new_album_df_dummies.reindex(columns = train_X2.columns, fill_value = 0)
import numpy as np
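# Note: the "1"/"0" below are strings, so the new dummy columns come out as
# object dtype (visible in .dtypes further down); plain numeric 1 and 0 would
# match train_X2's dtypes more cleanly.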
new_album_df["is_metal_Yes"] = np.where(new_album_df["is_metal"] == "Yes", "1", "0")
new_album_df["is_rock_Yes"] = np.where(new_album_df["is_rock"] == "Yes", "1", "0")
new_album_df
| | year | num_reviews | force | is_metal | is_rock | is_metal_Yes | is_rock_Yes |
|---|---|---|---|---|---|---|---|
0 | 2023 | 6 | 100 | Yes | No | 1 | 0 |
new_album_df.dtypes
year             int64
num_reviews      int64
force            int64
is_metal        object
is_rock         object
is_metal_Yes    object
is_rock_Yes     object
dtype: object
columns = ["is_metal", "is_rock"]
new_album_df.drop(columns, inplace = True, axis = 1)
new_album_df
| | year | num_reviews | force | is_metal_Yes | is_rock_Yes |
|---|---|---|---|---|---|
0 | 2023 | 6 | 100 | 1 | 0 |
new_album_df.dtypes
year             int64
num_reviews      int64
force            int64
is_metal_Yes    object
is_rock_Yes     object
dtype: object
new_album_pred = model2.predict(new_album_df)
new_album_pred
array([3.79500287])
# As before
# import pandas as pd
new_album_pred_df = pd.DataFrame(new_album_pred, columns = ["Prediction"])
new_album_pred_df
# to export
# new_album_pred_df.to_csv("whatever_name.csv")
| | Prediction |
|---|---|
0 | 3.795003 |
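Finally, build a rough 95% interval around the prediction from the empirical distribution of the training residuals.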
alpha = 0.05
ci = np.quantile(train_residuals2, 1 - alpha)
ci
0.19936714206436512
def generate_results_confint(preds, ci):
    df = pd.DataFrame()
    df["Prediction"] = preds
    if ci >= 0:
        df["upper"] = preds + ci
        df["lower"] = preds - ci
    else:
        df["upper"] = preds - ci
        df["lower"] = preds + ci
    return df
new_album_pred_confint_df = generate_results_confint(new_album_pred, ci)
new_album_pred_confint_df
| | Prediction | upper | lower |
|---|---|---|---|
0 | 3.795003 | 3.99437 | 3.595636 |
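As an aside, the half-width above is the single 95th-percentile residual applied symmetrically; a two-sided empirical interval would take both tails instead (a sketch with the same alpha):
# Empirical 2.5% and 97.5% residual quantiles as lower/upper offsets:
lo, hi = np.quantile(train_residuals2, [alpha / 2, 1 - alpha / 2])
print(new_album_pred + lo, new_album_pred + hi)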
from IPython import display
display.Image("horns3.png")
train_X2.dtypes
year             int64
num_reviews      int64
force            int64
is_metal_Yes     uint8
is_rock_Yes      uint8
dtype: object
train_X2_variables_df = pd.DataFrame(train_X2.columns.values, columns = ["Variables"])
train_X2_variables_df
| | Variables |
|---|---|
0 | year |
1 | num_reviews |
2 | force |
3 | is_metal_Yes |
4 | is_rock_Yes |
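Next, for comparison, try a log10 transform of the numeric predictors (and, further down, the target) to see whether the residual diagnostics improve.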
train_X2_log10 = np.log10(train_X2.iloc[:,0:3])
train_X2_log10.head()
| | year | num_reviews | force |
|---|---|---|---|
453 | 3.297979 | 2.428135 | 2.000000 |
108 | 3.295567 | 2.474216 | 1.000000 |
257 | 3.301681 | 1.924279 | 1.041393 |
82 | 3.298416 | 2.510545 | 0.954243 |
116 | 3.294025 | 2.620136 | 1.845098 |
train_X2
| | year | num_reviews | force | is_metal_Yes | is_rock_Yes |
|---|---|---|---|---|---|
453 | 1986 | 268 | 100 | 1 | 0 |
108 | 1975 | 298 | 10 | 0 | 1 |
257 | 2003 | 84 | 11 | 0 | 0 |
82 | 1988 | 324 | 9 | 0 | 1 |
116 | 1968 | 417 | 70 | 0 | 0 |
... | ... | ... | ... | ... | ... |
318 | 2011 | 105 | 10 | 0 | 0 |
414 | 1980 | 242 | 70 | 0 | 1 |
429 | 2017 | 63 | 11 | 0 | 0 |
386 | 2007 | 281 | 10 | 0 | 1 |
236 | 1988 | 267 | 9 | 0 | 0 |
350 rows × 5 columns
For illustration, remove year and force
train_X3 = pd.concat((train_X2_log10.iloc[:, 1], train_X2.iloc[:,3:5]), axis = 1)
train_X3.head()
| | num_reviews | is_metal_Yes | is_rock_Yes |
|---|---|---|---|
453 | 2.428135 | 1 | 0 |
108 | 2.474216 | 0 | 1 |
257 | 1.924279 | 0 | 0 |
82 | 2.510545 | 0 | 1 |
116 | 2.620136 | 0 | 0 |
valid_X2
| | year | num_reviews | force | is_metal_Yes | is_rock_Yes |
|---|---|---|---|---|---|
291 | 1997 | 174 | 9 | 0 | 1 |
412 | 1991 | 97 | 11 | 0 | 1 |
356 | 1997 | 94 | 60 | 0 | 0 |
258 | 1972 | 242 | 70 | 0 | 1 |
97 | 2016 | 214 | 11 | 0 | 0 |
... | ... | ... | ... | ... | ... |
41 | 1977 | 376 | 60 | 0 | 0 |
67 | 1972 | 441 | 90 | 0 | 0 |
154 | 1959 | 164 | 9 | 0 | 0 |
315 | 1990 | 131 | 60 | 0 | 0 |
196 | 1996 | 209 | 80 | 1 | 0 |
150 rows × 5 columns
valid_X2_log10 = np.log10(valid_X2.iloc[:,0:3])
valid_X2_log10.head()
| | year | num_reviews | force |
|---|---|---|---|
291 | 3.300378 | 2.240549 | 0.954243 |
412 | 3.299071 | 1.986772 | 1.041393 |
356 | 3.300378 | 1.973128 | 1.778151 |
258 | 3.294907 | 2.383815 | 1.845098 |
97 | 3.304491 | 2.330414 | 1.041393 |
valid_X3 = pd.concat((valid_X2_log10.iloc[:, 1], valid_X2.iloc[:,3:5]), axis = 1)
valid_X3.head()
| | num_reviews | is_metal_Yes | is_rock_Yes |
|---|---|---|---|
291 | 2.240549 | 0 | 1 |
412 | 1.986772 | 0 | 1 |
356 | 1.973128 | 0 | 0 |
258 | 2.383815 | 0 | 1 |
97 | 2.330414 | 0 | 0 |
train_y3 = np.log10(train_y2)
train_y3
453    0.583199
108    0.604226
257    0.600973
82     0.609594
116    0.611723
         ...
318    0.583199
414    0.582063
429    0.572872
386    0.580925
236    0.594393
Name: rating, Length: 350, dtype: float64
valid_y3 = np.log10(valid_y2)
valid_y3
291    0.592177
412    0.586587
356    0.607455
258    0.598791
97     0.603144
         ...
41     0.617000
67     0.615950
154    0.612784
315    0.587711
196    0.592177
Name: rating, Length: 150, dtype: float64
import sklearn
from sklearn.linear_model import LinearRegression
model3 = LinearRegression()
model3.fit(train_X3, train_y3)
LinearRegression()
train_y3_pred = model3.predict(train_X3)
train_y3_pred[0:10]
array([0.59214885, 0.59881372, 0.59683686, 0.59899096, 0.60023175, 0.59923895, 0.59632589, 0.60073982, 0.59886721, 0.59969708])
train_y3_pred_df = pd.DataFrame(train_y3_pred, columns = ["Training_Prediction"])
train_y3_pred_df
| | Training_Prediction |
|---|---|
0 | 0.592149 |
1 | 0.598814 |
2 | 0.596837 |
3 | 0.598991 |
4 | 0.600232 |
... | ... |
345 | 0.597310 |
346 | 0.598373 |
347 | 0.596227 |
348 | 0.598689 |
349 | 0.599287 |
350 rows × 1 columns
print("model intercept: ", model3.intercept_)
print("model coefficients: ", model3.coef_)
print("Model score: ", model3.score(train_X3, train_y3))
model intercept:  0.5874488636044404
model coefficients:  [ 0.00487871 -0.00714618 -0.00070612]
Model score:  0.030262065335401545
Coefficients, easier to read.
print(pd.DataFrame({"Predictor": train_X3.columns, "Coefficient": model3.coef_}))
      Predictor  Coefficient
0   num_reviews     0.004879
1  is_metal_Yes    -0.007146
2   is_rock_Yes    -0.000706
Get the RMSE for the training set.
mse_train_3 = sklearn.metrics.mean_squared_error(train_y3, train_y3_pred)
mse_train_3
0.00019315048657599174
import math
rmse_train_3 = math.sqrt(mse_train_3)
rmse_train_3
0.013897859064474346
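Note that this RMSE is on the log10 scale, so it is not directly comparable to the earlier RMSEs, which are on the raw rating scale.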
train_y3.describe()
count    350.000000
mean       0.597904
std        0.014133
min        0.547775
25%        0.588832
50%        0.596597
75%        0.606381
max        0.638489
Name: rating, dtype: float64
# As before
# import dmba
# from dmba import regressionSummary
regressionSummary(train_y3, train_y3_pred)
Regression statistics

                      Mean Error (ME) : 0.0000
       Root Mean Squared Error (RMSE) : 0.0139
            Mean Absolute Error (MAE) : 0.0108
          Mean Percentage Error (MPE) : -0.0541
Mean Absolute Percentage Error (MAPE) : 1.8138
Normality
import numpy as np
from scipy.stats import shapiro
shapiro(train_y3)
ShapiroResult(statistic=0.9914869070053101, pvalue=0.04164901003241539)
train_residuals_3 = train_y3 - train_y3_pred
train_residuals_3
453   -0.008950
108    0.005412
257    0.004136
82     0.010603
116    0.011492
         ...
318   -0.014111
414   -0.016309
429   -0.023356
386   -0.017764
236   -0.004895
Name: rating, Length: 350, dtype: float64
shapiro(train_residuals_3)
ShapiroResult(statistic=0.9939107894897461, pvalue=0.17373701930046082)
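With p ≈ 0.17 > 0.05, we no longer reject normality for these residuals, an improvement over the untransformed models.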
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_df_3 = pd.DataFrame()
vif_df_3["features"] = train_X3.columns
vif_df_3["VIF"] = [variance_inflation_factor(train_X3.values, i) for i in range(train_X3.shape[1])]
print(vif_df_3)
       features       VIF
0   num_reviews  1.944113
1  is_metal_Yes  1.209597
2   is_rock_Yes  1.734516
train_X3.corr()
| | num_reviews | is_metal_Yes | is_rock_Yes |
|---|---|---|---|
num_reviews | 1.000000 | 0.082810 | 0.273074 |
is_metal_Yes | 0.082810 | 1.000000 | -0.247685 |
is_rock_Yes | 0.273074 | -0.247685 | 1.000000 |
valid_X3
| | num_reviews | is_metal_Yes | is_rock_Yes |
|---|---|---|---|
291 | 2.240549 | 0 | 1 |
412 | 1.986772 | 0 | 1 |
356 | 1.973128 | 0 | 0 |
258 | 2.383815 | 0 | 1 |
97 | 2.330414 | 0 | 0 |
... | ... | ... | ... |
41 | 2.575188 | 0 | 0 |
67 | 2.644439 | 0 | 0 |
154 | 2.214844 | 0 | 0 |
315 | 2.117271 | 0 | 0 |
196 | 2.320146 | 1 | 0 |
150 rows × 3 columns
valid_y3_pred = model3.predict(valid_X3)
valid_y3_pred
array([0.59767373, 0.59643562, 0.59707518, 0.59837268, 0.59881828, 0.59686194, 0.60171557, 0.59993791, 0.59858843, 0.59807946, 0.59779453, 0.59898939, 0.59752232, 0.59683686, 0.59891504, 0.59541808, 0.59858891, 0.59956589, 0.59835536, 0.59814848, 0.59871177, 0.59485725, 0.60007902, 0.59990852, 0.59961712, 0.59865363, 0.59388106, 0.5897766 , 0.59315062, 0.59035623, 0.59878836, 0.59988036, 0.59966413, 0.59881372, 0.60082599, 0.59058315, 0.59903625, 0.59656279, 0.59868549, 0.59979297, 0.59874781, 0.59714175, 0.60235208, 0.59366827, 0.59964411, 0.59899749, 0.59884777, 0.60101952, 0.598343 , 0.59945487, 0.5957796 , 0.6003742 , 0.59736935, 0.60128647, 0.59443207, 0.59997264, 0.60064123, 0.59833058, 0.59932642, 0.59818884, 0.5987414 , 0.59947675, 0.59768587, 0.5983551 , 0.59878836, 0.59941789, 0.59054979, 0.59934194, 0.59801076, 0.59736935, 0.59759259, 0.60091984, 0.59767908, 0.59822845, 0.59861249, 0.5970976 , 0.59779453, 0.59037458, 0.5977573 , 0.5908764 , 0.59985441, 0.60025198, 0.59716718, 0.59622732, 0.59761017, 0.59965081, 0.59982374, 0.59373662, 0.59918968, 0.59986743, 0.59759259, 0.59620509, 0.59831049, 0.59936501, 0.59042868, 0.59707518, 0.599484 , 0.59058315, 0.59855506, 0.59873403, 0.59911354, 0.59875802, 0.60206584, 0.59757486, 0.59833058, 0.59090503, 0.60011987, 0.59812203, 0.59673349, 0.597258 , 0.6000733 , 0.60072941, 0.59853252, 0.59734509, 0.59541808, 0.59368753, 0.59892454, 0.5986203 , 0.59724313, 0.60084129, 0.59882815, 0.59569484, 0.59908754, 0.59797889, 0.5996239 , 0.59819944, 0.59860466, 0.59731645, 0.59854382, 0.59683686, 0.5979475 , 0.60124551, 0.60097896, 0.59667984, 0.59911354, 0.59986743, 0.59724824, 0.59886985, 0.59831949, 0.6014903 , 0.59840407, 0.59754832, 0.59918135, 0.59785778, 0.5916522 , 0.60001246, 0.60035031, 0.59825444, 0.59777842, 0.59162201])
valid_y3_pred_df = pd.DataFrame(valid_y3_pred, columns = ["Validation_Prediction"])
valid_y3_pred_df
| | Validation_Prediction |
|---|---|
0 | 0.597674 |
1 | 0.596436 |
2 | 0.597075 |
3 | 0.598373 |
4 | 0.598818 |
... | ... |
145 | 0.600012 |
146 | 0.600350 |
147 | 0.598254 |
148 | 0.597778 |
149 | 0.591622 |
150 rows × 1 columns
Get the RMSE for the validation set.
mse_valid_3 = sklearn.metrics.mean_squared_error(valid_y3, valid_y3_pred)
mse_valid_3
0.00021223070865340727
# As before
# import math
rmse_valid_3 = math.sqrt(mse_valid_3)
rmse_valid_3
0.014568140191987695
valid_y3.describe()
count 150.000000 mean 0.597971 std 0.014589 min 0.545307 25% 0.589950 50% 0.597695 75% 0.607455 max 0.634477 Name: rating, dtype: float64
# As before:
# If using the dmba package:
# pip install dmba
# Done earlier. Just for illustration
# import dmba
# from dmba import regressionSummary
regressionSummary(valid_y3, valid_y3_pred)
Regression statistics

                      Mean Error (ME) : 0.0001
       Root Mean Squared Error (RMSE) : 0.0146
            Mean Absolute Error (MAE) : 0.0110
          Mean Percentage Error (MPE) : -0.0403
Mean Absolute Percentage Error (MAPE) : 1.8481
new_album_df
| | year | num_reviews | force | is_metal_Yes | is_rock_Yes |
|---|---|---|---|---|---|
0 | 2023 | 6 | 100 | 1 | 0 |
new_album_df_log10 = np.log10(new_album_df.iloc[:,1])
new_album_df_log10.head()
0    0.778151
Name: num_reviews, dtype: float64
new_album_df_3 = pd.concat((new_album_df_log10, new_album_df.iloc[:,3:5]), axis = 1)
new_album_df_3
| | num_reviews | is_metal_Yes | is_rock_Yes |
|---|---|---|---|
0 | 0.778151 | 1 | 0 |
new_album_pred_3 = model3.predict(new_album_df_3)
new_album_pred_3
array([0.58409906])
# As before
# import pandas as pd
new_album_pred_df_3 = pd.DataFrame(new_album_pred_3, columns = ["Prediction"])
new_album_pred_df_3
# to export
# new_album_pred_df_3.to_csv("whatever_name.csv")
| | Prediction |
|---|---|
0 | 0.584099 |
alpha = 0.05
ci_3 = np.quantile(train_residuals_3, 1 - alpha)
ci_3
0.024360004758683662
# as before
def generate_results_confint_3(preds, ci_3):
    df = pd.DataFrame()
    df["Prediction"] = preds
    if ci_3 >= 0:
        df["upper"] = preds + ci_3
        df["lower"] = preds - ci_3
    else:
        df["upper"] = preds - ci_3
        df["lower"] = preds + ci_3
    return df
new_album_pred_3_confint_df = generate_results_confint_3(new_album_pred_3, ci_3)
new_album_pred_3_confint_df
| | Prediction | upper | lower |
|---|---|---|---|
0 | 0.584099 | 0.608459 | 0.559739 |
def exp10(x):
    return 10**x
# apply the back-transformation to the prediction and the interval bounds
new_album_pred_df_3_exp = new_album_pred_3_confint_df.apply(exp10)
new_album_pred_df_3_exp
| | Prediction | upper | lower |
|---|---|---|---|
0 | 3.837948 | 4.059374 | 3.6286 |
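One caveat: if the log-scale errors are roughly symmetric, 10**prediction estimates a conditional median rather than a mean on the original scale (retransformation bias), so the back-transformed point estimate can run slightly low.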