Directions

Logistic regression to predict whether to sing a song on American Idol.

Data for demo

Back to the spellbook

1. Load Data

songs <- read.csv("american_idol_songs_v8.csv", header = TRUE)
head(songs, 10)
##    No                Song_Title                     Artiste Song_Avg_Rtg Year
## 1   1     Stuff Like That There                Bette Midler           95 1991
## 2   2                In A Dream                    Badlands           94 1991
## 3   3     Build Me Up Buttercup             The Foundations           93 1969
## 4   4  Hemorrhage (In My Hands)                        Fuel           92 2000
## 5   5                 Solitaire                  Carpenters           92 1974
## 6   6 Will You Love Me Tomorrow               The Shirelles           92 1960
## 7   7                Chandelier                         Sia           91 2014
## 8   8   Don't Rain On My Parade            Barbra Streisand           91 1964
## 9   9         A Whole New World Peabo Bryson & Regina Belle           90 1992
## 10 10      I Don't Hurt Anymore            Dinah Washington           90 1943
##    Avg_Song_Age Advance Bottom Elimination Expectation Artiste_Rating
## 1          11.0       1      0           0        20.5           55.5
## 2          14.0       1      0           0        24.4           94.0
## 3          34.0       1      0           0        26.2           93.0
## 4           6.0       1      0           0        29.4           92.0
## 5          29.0       1      0           0        25.2           68.5
## 6          51.0       1      0           0        24.9           92.0
## 7           3.7       1      0           0        24.5           66.6
## 8          40.0       1      0           0        15.7           62.9
## 9          11.0       1      0           0        28.7           90.0
## 10         63.0       1      0           0        29.3           90.0

Check data.

head(songs)
##   No                Song_Title         Artiste Song_Avg_Rtg Year Avg_Song_Age
## 1  1     Stuff Like That There    Bette Midler           95 1991           11
## 2  2                In A Dream        Badlands           94 1991           14
## 3  3     Build Me Up Buttercup The Foundations           93 1969           34
## 4  4  Hemorrhage (In My Hands)            Fuel           92 2000            6
## 5  5                 Solitaire      Carpenters           92 1974           29
## 6  6 Will You Love Me Tomorrow   The Shirelles           92 1960           51
##   Advance Bottom Elimination Expectation Artiste_Rating
## 1       1      0           0        20.5           55.5
## 2       1      0           0        24.4           94.0
## 3       1      0           0        26.2           93.0
## 4       1      0           0        29.4           92.0
## 5       1      0           0        25.2           68.5
## 6       1      0           0        24.9           92.0
str(songs)
## 'data.frame':    1626 obs. of  11 variables:
##  $ No            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Song_Title    : chr  "Stuff Like That There" "In A Dream" "Build Me Up Buttercup" "Hemorrhage (In My Hands)" ...
##  $ Artiste       : chr  "Bette Midler" "Badlands" "The Foundations" "Fuel" ...
##  $ Song_Avg_Rtg  : num  95 94 93 92 92 92 91 91 90 90 ...
##  $ Year          : int  1991 1991 1969 2000 1974 1960 2014 1964 1992 1943 ...
##  $ Avg_Song_Age  : num  11 14 34 6 29 51 3.7 40 11 63 ...
##  $ Advance       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Bottom        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Elimination   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Expectation   : num  20.5 24.4 26.2 29.4 25.2 24.9 24.5 15.7 28.7 29.3 ...
##  $ Artiste_Rating: num  55.5 94 93 92 68.5 92 66.6 62.9 90 90 ...
table(songs$Advance)
## 
##    0    1 
##  287 1339

2 PreProcessing

2.1 Filter Variables

Factorise.

songs$Advance <- as.factor(songs$Advance)
str(songs)
## 'data.frame':    1626 obs. of  11 variables:
##  $ No            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Song_Title    : chr  "Stuff Like That There" "In A Dream" "Build Me Up Buttercup" "Hemorrhage (In My Hands)" ...
##  $ Artiste       : chr  "Bette Midler" "Badlands" "The Foundations" "Fuel" ...
##  $ Song_Avg_Rtg  : num  95 94 93 92 92 92 91 91 90 90 ...
##  $ Year          : int  1991 1991 1969 2000 1974 1960 2014 1964 1992 1943 ...
##  $ Avg_Song_Age  : num  11 14 34 6 29 51 3.7 40 11 63 ...
##  $ Advance       : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Bottom        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Elimination   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Expectation   : num  20.5 24.4 26.2 29.4 25.2 24.9 24.5 15.7 28.7 29.3 ...
##  $ Artiste_Rating: num  55.5 94 93 92 68.5 92 66.6 62.9 90 90 ...
t(t(names(songs)))
##       [,1]            
##  [1,] "No"            
##  [2,] "Song_Title"    
##  [3,] "Artiste"       
##  [4,] "Song_Avg_Rtg"  
##  [5,] "Year"          
##  [6,] "Avg_Song_Age"  
##  [7,] "Advance"       
##  [8,] "Bottom"        
##  [9,] "Elimination"   
## [10,] "Expectation"   
## [11,] "Artiste_Rating"
songs <- songs[, c(4, 6, 10, 11, 7)]

head(songs)
##   Song_Avg_Rtg Avg_Song_Age Expectation Artiste_Rating Advance
## 1           95           11        20.5           55.5       1
## 2           94           14        24.4           94.0       1
## 3           93           34        26.2           93.0       1
## 4           92            6        29.4           92.0       1
## 5           92           29        25.2           68.5       1
## 6           92           51        24.9           92.0       1
str(songs)
## 'data.frame':    1626 obs. of  5 variables:
##  $ Song_Avg_Rtg  : num  95 94 93 92 92 92 91 91 90 90 ...
##  $ Avg_Song_Age  : num  11 14 34 6 29 51 3.7 40 11 63 ...
##  $ Expectation   : num  20.5 24.4 26.2 29.4 25.2 24.9 24.5 15.7 28.7 29.3 ...
##  $ Artiste_Rating: num  55.5 94 93 92 68.5 92 66.6 62.9 90 90 ...
##  $ Advance       : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...

2.2 Training-Validation Split

Split the data into training and validation sets.

Set the seed using our favourite number :-)

set.seed(666)

Create the indices for the split This samples the row indices to split the data into training and validation.

train_index <- sample(1:nrow(songs), 0.7 * nrow(songs))
valid_index <- setdiff(1:nrow(songs), train_index)

Using the indices, create the training and validation sets. This is similar in principle to splitting a data frame by row.

train_df <- songs[train_index, ]
valid_df <- songs[valid_index, ]

It is a good habit to check after splitting.

nrow(train_df)
## [1] 1138
nrow(valid_df)
## [1] 488

3 Logistic Regression

library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
logistic_reg <- train(Advance ~ Song_Avg_Rtg + Avg_Song_Age + Expectation +
                      Artiste_Rating,
                    data = train_df, method = "glm")
summary(logistic_reg)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6306   0.3292   0.5033   0.6629   1.3611  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -0.687320   0.339452  -2.025 0.042888 *  
## Song_Avg_Rtg    0.044150   0.007084   6.233 4.59e-10 ***
## Avg_Song_Age    0.016850   0.005073   3.321 0.000895 ***
## Expectation    -0.005037   0.006823  -0.738 0.460320    
## Artiste_Rating -0.003355   0.006546  -0.512 0.608313    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1079.41  on 1137  degrees of freedom
## Residual deviance:  983.95  on 1133  degrees of freedom
## AIC: 993.95
## 
## Number of Fisher Scoring iterations: 5
varImp(logistic_reg)
## glm variable importance
## 
##                Overall
## Song_Avg_Rtg   100.000
## Avg_Song_Age    49.108
## Expectation      3.948
## Artiste_Rating   0.000

Predict the training set.

logistic_reg_pred_train <- predict(logistic_reg, 
                               newdata = train_df, type = "raw")

head(logistic_reg_pred_train)
## [1] 1 1 1 1 1 1
## Levels: 0 1
logistic_reg_pred_train_prob <- predict(logistic_reg, 
                                   newdata = train_df, type = "prob")

head(logistic_reg_pred_train_prob)
##               0         1
## 1598 0.19365330 0.8063467
## 638  0.19275433 0.8072457
## 608  0.09530381 0.9046962
## 907  0.25007114 0.7499289
## 1147 0.12211830 0.8778817
## 1564 0.16397203 0.8360280

Predict the validation set.

logistic_reg_pred_valid <- predict(logistic_reg, 
                                   newdata = valid_df, type = "raw")

head(logistic_reg_pred_valid)
## [1] 1 1 1 1 1 1
## Levels: 0 1
logistic_reg_pred_valid_prob <- predict(logistic_reg, 
                                        newdata = valid_df, type = "prob")

head(logistic_reg_pred_valid_prob)
##             0         1
## 2  0.03695570 0.9630443
## 6  0.02188756 0.9781124
## 10 0.01987635 0.9801236
## 11 0.02503871 0.9749613
## 12 0.02704118 0.9729588
## 14 0.05819678 0.9418032

4. Model Evaluation

Multicollinearity

library("Hmisc")
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
rcorr(as.matrix(train_df[, -c(5)]))
##                Song_Avg_Rtg Avg_Song_Age Expectation Artiste_Rating
## Song_Avg_Rtg           1.00         0.10        0.71           0.61
## Avg_Song_Age           0.10         1.00        0.06           0.18
## Expectation            0.71         0.06        1.00           0.41
## Artiste_Rating         0.61         0.18        0.41           1.00
## 
## n= 1138 
## 
## 
## P
##                Song_Avg_Rtg Avg_Song_Age Expectation Artiste_Rating
## Song_Avg_Rtg                0.0011       0.0000      0.0000        
## Avg_Song_Age   0.0011                    0.0495      0.0000        
## Expectation    0.0000       0.0495                   0.0000        
## Artiste_Rating 0.0000       0.0000       0.0000
# Alternatively
# cor(train_df[, -c(5)])

Confusion matrix. Training set.

confusionMatrix(as.factor(logistic_reg_pred_train), 
                train_df$Advance, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  14   9
##          1 193 922
##                                          
##                Accuracy : 0.8225         
##                  95% CI : (0.799, 0.8443)
##     No Information Rate : 0.8181         
##     P-Value [Acc > NIR] : 0.3675         
##                                          
##                   Kappa : 0.0886         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.99033        
##             Specificity : 0.06763        
##          Pos Pred Value : 0.82691        
##          Neg Pred Value : 0.60870        
##              Prevalence : 0.81810        
##          Detection Rate : 0.81019        
##    Detection Prevalence : 0.97979        
##       Balanced Accuracy : 0.52898        
##                                          
##        'Positive' Class : 1              
## 

F1 score. Training set

con_mat_train <- confusionMatrix(as.factor(logistic_reg_pred_train), 
                train_df$Advance, positive = "1")

sensitivity_train <- con_mat_train$byClass[1]

precision_train <- con_mat_train$byClass[3]

f1_train <- 2/((1/sensitivity_train) + (1/precision_train))

# Use this to avoid awkward naming. It's just the way it works.
# f1_train <- unname(f1_train)

paste("The F1 score for traination is", f1_train)
## [1] "The F1 score for traination is 0.901270772238514"

Confusion matrix. Validation set.

confusionMatrix(as.factor(logistic_reg_pred_valid), 
                valid_df$Advance, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0   9   1
##          1  71 407
##                                           
##                Accuracy : 0.8525          
##                  95% CI : (0.8178, 0.8827)
##     No Information Rate : 0.8361          
##     P-Value [Acc > NIR] : 0.1801          
##                                           
##                   Kappa : 0.1698          
##                                           
##  Mcnemar's Test P-Value : 4.232e-16       
##                                           
##             Sensitivity : 0.9975          
##             Specificity : 0.1125          
##          Pos Pred Value : 0.8515          
##          Neg Pred Value : 0.9000          
##              Prevalence : 0.8361          
##          Detection Rate : 0.8340          
##    Detection Prevalence : 0.9795          
##       Balanced Accuracy : 0.5550          
##                                           
##        'Positive' Class : 1               
## 

F1 score. Validation set

con_mat_valid <- confusionMatrix(as.factor(logistic_reg_pred_valid), 
                valid_df$Advance, positive = "1")

sensitivity_valid <- con_mat_valid$byClass[1]

precision_valid <- con_mat_valid$byClass[3]

f1_valid <- 2/((1/sensitivity_valid) + (1/precision_valid))

# Use this to avoid awkward naming. It's just the way it works.
# f1_valid <- unname(f1_valid)

paste("The F1 score for validation is", f1_valid)
## [1] "The F1 score for validation is 0.918735891647856"

ROC.

ROSE::roc.curve(valid_df$Advance, logistic_reg_pred_valid)

## Area under the curve (AUC): 0.555

5. Predict new songs

Import new songs.

New songs

new_songs <- read.csv("new_songs.csv", header = TRUE)
new_songs
##     No           Song_Title     Artiste Song_Avg_Rtg Year Avg_Song_Age Advance
## 1 6661 Walk With Me In Hell Lamb of God           96 2004           19      NA
## 2 6662          The Watcher  Arch Enemy           90 2022            1      NA
## 3 6663              Frantic   Metallica           28 2003           20      NA
##   Bottom Elimination Expectation Artiste_Rating
## 1     NA          NA          42            100
## 2     NA          NA          36            100
## 3     NA          NA          46            120
##                               Comments
## 1   Classic song from a legendary band
## 2 Fantastic song from a legendary band
## 3       Zzz song from a legendary band

Filter the variables.

names(new_songs)
##  [1] "No"             "Song_Title"     "Artiste"        "Song_Avg_Rtg"  
##  [5] "Year"           "Avg_Song_Age"   "Advance"        "Bottom"        
##  [9] "Elimination"    "Expectation"    "Artiste_Rating" "Comments"
new_songs_filter <- new_songs[, c(4, 6, 10, 11)]
new_songs_filter
##   Song_Avg_Rtg Avg_Song_Age Expectation Artiste_Rating
## 1           96           19          42            100
## 2           90            1          36            100
## 3           28           20          46            120

Predict.

logistic_reg_pred_new_songs <- predict(logistic_reg, 
                                   newdata = new_songs_filter, type = "raw")

head(logistic_reg_pred_new_songs)
## [1] 1 1 1
## Levels: 0 1
logistic_reg_pred_new_songs_prob <- predict(logistic_reg, 
                                   newdata = new_songs_filter, type = "prob")

head(logistic_reg_pred_new_songs_prob)
##            0         1
## 1 0.03475215 0.9652478
## 2 0.05807576 0.9419242
## 3 0.43743522 0.5625648

If I’m being honest… :-)

logistic_reg_pred_new_songs_df <- as.data.frame(logistic_reg_pred_new_songs)
names(logistic_reg_pred_new_songs_df)[1] <- "Prediction"
names(logistic_reg_pred_new_songs_prob)[2] <- "Probability"
new_songs_prediction_df <- cbind(new_songs[c(2:3)], logistic_reg_pred_new_songs_df,
                                 logistic_reg_pred_new_songs_prob[2]) 
new_songs_prediction_df
##             Song_Title     Artiste Prediction Probability
## 1 Walk With Me In Hell Lamb of God          1   0.9652478
## 2          The Watcher  Arch Enemy          1   0.9419242
## 3              Frantic   Metallica          1   0.5625648

Or for a more confident prediction

new_songs_prediction_df$Prediction <- ifelse(new_songs_prediction_df$Probability > 0.8,
                                             "1", "0")

new_songs_prediction_df
##             Song_Title     Artiste Prediction Probability
## 1 Walk With Me In Hell Lamb of God          1   0.9652478
## 2          The Watcher  Arch Enemy          1   0.9419242
## 3              Frantic   Metallica          0   0.5625648