Directions

Regression models to predict the wages of football players.

Data for demo

Back to the spellbook

1. Load data

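Load the data and take a first look at it. The file is read with `header = FALSE`, so the column names initially appear as the first data row and are promoted to proper column names in the steps below.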

# Read the raw CSV with header = FALSE: the file's header line is kept as data
# row 1 and the columns receive the placeholder names V1, V2, ...
football <- read.csv(file = "football_2.csv", header = FALSE)
# Preview the first ten rows, including the header line held as data.
head(football, n = 10L)
##        V1           V2  V3                                             V4
## 1      ID         Name Age                                          Photo
## 2  207439   L. Paredes  24 https://cdn.sofifa.org/players/4/19/207439.png
## 3  156713 A. Granqvist  33 https://cdn.sofifa.org/players/4/19/156713.png
## 4  229909     A. Lunev  26 https://cdn.sofifa.org/players/4/19/229909.png
## 5  187347 I. Smolnikov  29 https://cdn.sofifa.org/players/4/19/187347.png
## 6  153260       Hilton  40 https://cdn.sofifa.org/players/4/19/153260.png
## 7  187607    A. Dzyuba  29 https://cdn.sofifa.org/players/4/19/187607.png
## 8  204341   LuÌ_s Neto  30 https://cdn.sofifa.org/players/4/19/204341.png
## 9  223058   D. Kuzyaev  25 https://cdn.sofifa.org/players/4/19/223058.png
## 10 183389       G. Sio  29 https://cdn.sofifa.org/players/4/19/183389.png
##             V5                                   V6      V7        V8
## 1  Nationality                                 Flag Overall Potential
## 2    Argentina  https://cdn.sofifa.org/flags/52.png      80        85
## 3       Sweden  https://cdn.sofifa.org/flags/46.png      80        80
## 4       Russia  https://cdn.sofifa.org/flags/40.png      79        81
## 5       Russia  https://cdn.sofifa.org/flags/40.png      79        79
## 6       Brazil  https://cdn.sofifa.org/flags/54.png      78        78
## 7       Russia  https://cdn.sofifa.org/flags/40.png      78        78
## 8     Portugal  https://cdn.sofifa.org/flags/38.png      77        77
## 9       Russia  https://cdn.sofifa.org/flags/40.png      77        80
## 10 Ivory Coast https://cdn.sofifa.org/flags/108.png      77        77
##                 V9                                         V10   V11   V12
## 1             Club                                   Club Logo Value  Wage
## 2                          https://cdn.sofifa.org/flags/52.png  5684  1602
## 3                          https://cdn.sofifa.org/flags/46.png  6370  3591
## 4                          https://cdn.sofifa.org/flags/40.png  5675  3672
## 5                          https://cdn.sofifa.org/flags/40.png  6030  1448
## 6  Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png  6405 19799
## 7                          https://cdn.sofifa.org/flags/40.png  5764  1105
## 8                          https://cdn.sofifa.org/flags/38.png  6075  2836
## 9                          https://cdn.sofifa.org/flags/40.png  5565  2653
## 10                        https://cdn.sofifa.org/flags/108.png  5275  2138
##        V13            V14                      V15       V16         V17
## 1  Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2     2122          Right                        2         4           4
## 3     1797          Right                        2         4           2
## 4     1217          Right                        1         3           1
## 5     2038          Right                        2         3           3
## 6     1807          Right                        2         3           3
## 7     1810          Right                        2         3           3
## 8     1749          Right                        1         3           2
## 9     2041          Right                        1         3           3
## 10    1933           Left                        2         3           3
##               V18       V19       V20      V21           V22      V23
## 1       Work Rate Body Type Real Face Position Jersey Number   Joined
## 2  Medium/ Medium    Normal        No       CM             5         
## 3    High/ Medium    Normal        No      LCB             4         
## 4  Medium/ Medium    Normal        No       GK            12         
## 5      High/ High      Lean        No       RB             2         
## 6  Medium/ Medium    Normal       Yes       CB             4 1-Aug-11
## 7    High/ Medium    Stocky        No       ST            22         
## 8  Medium/ Medium      Lean        No       CB             4         
## 9    Medium/ High      Lean        No       RM             7         
## 10      High/ Low    Normal        No       ST            21         
##            V24                  V25    V26    V27  V28  V29  V30  V31  V32  V33
## 1  Loaned From Contract Valid Until Height Weight   LS   ST   RS   LW   LF   CF
## 2                                     5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3                                      6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4                                      6'2 176lbs                              
## 5                                     5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6                              2019   5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
## 7                                      6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2
## 8                                      6'2 157lbs 52+2 52+2 52+2 51+2 51+2 51+2
## 9                                      6'0 163lbs 70+2 70+2 70+2 74+2 74+2 74+2
## 10                                    5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2
##     V34  V35  V36  V37  V38  V39  V40  V41  V42  V43  V44  V45  V46  V47  V48
## 1    RF   RW  LAM  CAM  RAM   LM  LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB
## 2  75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3  58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4                                                                            
## 5  72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6  59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
## 7  74+2 71+2 71+2 71+2 71+2 71+2 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2
## 8  51+2 51+2 54+2 54+2 54+2 54+2 61+2 61+2 61+2 54+2 67+2 72+2 72+2 72+2 67+2
## 9  74+2 74+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2
## 10 75+2 75+2 74+2 74+2 74+2 74+2 67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2
##     V49  V50  V51  V52  V53      V54       V55             V56          V57
## 1    LB  LCB   CB  RCB   RB Crossing Finishing HeadingAccuracy ShortPassing
## 2  74+2 72+2 72+2 72+2 74+2       76        55              60           84
## 3  70+2 79+2 79+2 79+2 70+2       49        51              81           73
## 4                                 16        14              17           25
## 5  78+2 73+2 73+2 73+2 78+2       73        61              69           79
## 6  68+2 76+2 76+2 76+2 68+2       60        45              79           73
## 7  48+2 48+2 48+2 48+2 48+2       61        79              86           71
## 8  69+2 75+2 75+2 75+2 69+2       42        33              80           72
## 9  74+2 70+2 70+2 70+2 74+2       67        64              51           82
## 10 50+2 46+2 46+2 46+2 50+2       68        77              71           73
##        V58       V59   V60        V61         V62         V63          V64
## 1  Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2       73        78    79         78          82          82           75
## 3       37        49    36         40          67          63           46
## 4       13        15    18         17          32          17           58
## 5       57        72    49         46          75          72           84
## 6       51        63    42         48          72          73           33
## 7       74        71    64         60          55          77           66
## 8       40        49    52         43          77          48           57
## 9       57        78    60         61          75          79           78
## 10      73        76    73         69          67          76           78
##            V65     V66       V67     V68       V69     V70     V71      V72
## 1  SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2           69      77        74      77        82      61      79       69
## 3           49      55        76      36        74      64      67       83
## 4           54      36        76      50        24      60      27       70
## 5           90      80        75      76        67      85      93       68
## 6           38      51        70      60        55      79      54       76
## 7           65      50        75      32        78      63      77       93
## 8           59      69        78      61        42      79      72       72
## 9           81      80        73      76        76      60      79       59
## 10          85      79        71      73        77      70      78       74
##          V73        V74           V75         V76    V77       V78       V79
## 1  LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2         80         79            72          74     82        57        74
## 3         59         81            82          54     49        79        78
## 4         13         26            20          11     63        15        69
## 5         57         65            71          77     72        41        73
## 6         58         76            79          50     67        64        70
## 7         68         75            30          78     73        77        70
## 8         37         76            78          44     46        47        72
## 9         74         70            74          71     70        63        64
## 10        74         77            18          76     73        72        72
##        V80            V81           V82      V83        V84       V85
## 1  Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2       73             75            72        9         14         6
## 3       82             83            79        7          9        12
## 4       18             20            12       80         73        65
## 5       76             76            80        7         12        10
## 6       83             77            76       12          7        11
## 7       21             15            19       15         12        11
## 8       80             77            78       10         15        13
## 9       71             77            76       15         16        13
## 10      40             18            12       15          9        10
##              V86        V87            V88
## 1  GKPositioning GKReflexes Release Clause
## 2              9         10               
## 3             10         15               
## 4             77         85               
## 5              8         15               
## 6             12         13               
## 7             11          8               
## 8             15          8               
## 9              7          8               
## 10            15         16
# Promote the first data row (the CSV header line that header = FALSE kept as
# data) to column names. Each cell is converted to a string individually, so
# the names are the cell values themselves even when a column was read as a
# factor (e.g. stringsAsFactors = TRUE), instead of relying on the implicit
# coercion of a one-row data frame.
names(football) <- unname(vapply(football[1, ], as.character, character(1)))
# Row 1 still duplicates the header at this point; it is visible in the preview.
head(football)
##       ID         Name Age                                          Photo
## 1     ID         Name Age                                          Photo
## 2 207439   L. Paredes  24 https://cdn.sofifa.org/players/4/19/207439.png
## 3 156713 A. Granqvist  33 https://cdn.sofifa.org/players/4/19/156713.png
## 4 229909     A. Lunev  26 https://cdn.sofifa.org/players/4/19/229909.png
## 5 187347 I. Smolnikov  29 https://cdn.sofifa.org/players/4/19/187347.png
## 6 153260       Hilton  40 https://cdn.sofifa.org/players/4/19/153260.png
##   Nationality                                Flag Overall Potential
## 1 Nationality                                Flag Overall Potential
## 2   Argentina https://cdn.sofifa.org/flags/52.png      80        85
## 3      Sweden https://cdn.sofifa.org/flags/46.png      80        80
## 4      Russia https://cdn.sofifa.org/flags/40.png      79        81
## 5      Russia https://cdn.sofifa.org/flags/40.png      79        79
## 6      Brazil https://cdn.sofifa.org/flags/54.png      78        78
##              Club                                   Club Logo Value  Wage
## 1            Club                                   Club Logo Value  Wage
## 2                         https://cdn.sofifa.org/flags/52.png  5684  1602
## 3                         https://cdn.sofifa.org/flags/46.png  6370  3591
## 4                         https://cdn.sofifa.org/flags/40.png  5675  3672
## 5                         https://cdn.sofifa.org/flags/40.png  6030  1448
## 6 Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png  6405 19799
##   Special Preferred Foot International Reputation Weak Foot Skill Moves
## 1 Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2    2122          Right                        2         4           4
## 3    1797          Right                        2         4           2
## 4    1217          Right                        1         3           1
## 5    2038          Right                        2         3           3
## 6    1807          Right                        2         3           3
##        Work Rate Body Type Real Face Position Jersey Number   Joined
## 1      Work Rate Body Type Real Face Position Jersey Number   Joined
## 2 Medium/ Medium    Normal        No       CM             5         
## 3   High/ Medium    Normal        No      LCB             4         
## 4 Medium/ Medium    Normal        No       GK            12         
## 5     High/ High      Lean        No       RB             2         
## 6 Medium/ Medium    Normal       Yes       CB             4 1-Aug-11
##   Loaned From Contract Valid Until Height Weight   LS   ST   RS   LW   LF   CF
## 1 Loaned From Contract Valid Until Height Weight   LS   ST   RS   LW   LF   CF
## 2                                    5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3                                     6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4                                     6'2 176lbs                              
## 5                                    5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6                             2019   5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
##     RF   RW  LAM  CAM  RAM   LM  LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB
## 1   RF   RW  LAM  CAM  RAM   LM  LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB
## 2 75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3 58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4                                                                           
## 5 72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6 59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
##     LB  LCB   CB  RCB   RB Crossing Finishing HeadingAccuracy ShortPassing
## 1   LB  LCB   CB  RCB   RB Crossing Finishing HeadingAccuracy ShortPassing
## 2 74+2 72+2 72+2 72+2 74+2       76        55              60           84
## 3 70+2 79+2 79+2 79+2 70+2       49        51              81           73
## 4                                16        14              17           25
## 5 78+2 73+2 73+2 73+2 78+2       73        61              69           79
## 6 68+2 76+2 76+2 76+2 68+2       60        45              79           73
##   Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 1 Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2      73        78    79         78          82          82           75
## 3      37        49    36         40          67          63           46
## 4      13        15    18         17          32          17           58
## 5      57        72    49         46          75          72           84
## 6      51        63    42         48          72          73           33
##   SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 1 SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2          69      77        74      77        82      61      79       69
## 3          49      55        76      36        74      64      67       83
## 4          54      36        76      50        24      60      27       70
## 5          90      80        75      76        67      85      93       68
## 6          38      51        70      60        55      79      54       76
##   LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 1 LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2        80         79            72          74     82        57        74
## 3        59         81            82          54     49        79        78
## 4        13         26            20          11     63        15        69
## 5        57         65            71          77     72        41        73
## 6        58         76            79          50     67        64        70
##   Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 1 Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2      73             75            72        9         14         6
## 3      82             83            79        7          9        12
## 4      18             20            12       80         73        65
## 5      76             76            80        7         12        10
## 6      83             77            76       12          7        11
##   GKPositioning GKReflexes Release Clause
## 1 GKPositioning GKReflexes Release Clause
## 2             9         10               
## 3            10         15               
## 4            77         85               
## 5             8         15               
## 6            12         13
# Discard row 1, which only repeats the column names. The remaining rows keep
# their original row labels (2, 3, ...).
football <- football[-1L, ]
head(football)
##       ID         Name Age                                          Photo
## 2 207439   L. Paredes  24 https://cdn.sofifa.org/players/4/19/207439.png
## 3 156713 A. Granqvist  33 https://cdn.sofifa.org/players/4/19/156713.png
## 4 229909     A. Lunev  26 https://cdn.sofifa.org/players/4/19/229909.png
## 5 187347 I. Smolnikov  29 https://cdn.sofifa.org/players/4/19/187347.png
## 6 153260       Hilton  40 https://cdn.sofifa.org/players/4/19/153260.png
## 7 187607    A. Dzyuba  29 https://cdn.sofifa.org/players/4/19/187607.png
##   Nationality                                Flag Overall Potential
## 2   Argentina https://cdn.sofifa.org/flags/52.png      80        85
## 3      Sweden https://cdn.sofifa.org/flags/46.png      80        80
## 4      Russia https://cdn.sofifa.org/flags/40.png      79        81
## 5      Russia https://cdn.sofifa.org/flags/40.png      79        79
## 6      Brazil https://cdn.sofifa.org/flags/54.png      78        78
## 7      Russia https://cdn.sofifa.org/flags/40.png      78        78
##              Club                                   Club Logo Value  Wage
## 2                         https://cdn.sofifa.org/flags/52.png  5684  1602
## 3                         https://cdn.sofifa.org/flags/46.png  6370  3591
## 4                         https://cdn.sofifa.org/flags/40.png  5675  3672
## 5                         https://cdn.sofifa.org/flags/40.png  6030  1448
## 6 Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png  6405 19799
## 7                         https://cdn.sofifa.org/flags/40.png  5764  1105
##   Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2    2122          Right                        2         4           4
## 3    1797          Right                        2         4           2
## 4    1217          Right                        1         3           1
## 5    2038          Right                        2         3           3
## 6    1807          Right                        2         3           3
## 7    1810          Right                        2         3           3
##        Work Rate Body Type Real Face Position Jersey Number   Joined
## 2 Medium/ Medium    Normal        No       CM             5         
## 3   High/ Medium    Normal        No      LCB             4         
## 4 Medium/ Medium    Normal        No       GK            12         
## 5     High/ High      Lean        No       RB             2         
## 6 Medium/ Medium    Normal       Yes       CB             4 1-Aug-11
## 7   High/ Medium    Stocky        No       ST            22         
##   Loaned From Contract Valid Until Height Weight   LS   ST   RS   LW   LF   CF
## 2                                    5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3                                     6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4                                     6'2 176lbs                              
## 5                                    5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6                             2019   5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
## 7                                     6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2
##     RF   RW  LAM  CAM  RAM   LM  LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB
## 2 75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3 58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4                                                                           
## 5 72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6 59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
## 7 74+2 71+2 71+2 71+2 71+2 71+2 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2
##     LB  LCB   CB  RCB   RB Crossing Finishing HeadingAccuracy ShortPassing
## 2 74+2 72+2 72+2 72+2 74+2       76        55              60           84
## 3 70+2 79+2 79+2 79+2 70+2       49        51              81           73
## 4                                16        14              17           25
## 5 78+2 73+2 73+2 73+2 78+2       73        61              69           79
## 6 68+2 76+2 76+2 76+2 68+2       60        45              79           73
## 7 48+2 48+2 48+2 48+2 48+2       61        79              86           71
##   Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2      73        78    79         78          82          82           75
## 3      37        49    36         40          67          63           46
## 4      13        15    18         17          32          17           58
## 5      57        72    49         46          75          72           84
## 6      51        63    42         48          72          73           33
## 7      74        71    64         60          55          77           66
##   SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2          69      77        74      77        82      61      79       69
## 3          49      55        76      36        74      64      67       83
## 4          54      36        76      50        24      60      27       70
## 5          90      80        75      76        67      85      93       68
## 6          38      51        70      60        55      79      54       76
## 7          65      50        75      32        78      63      77       93
##   LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2        80         79            72          74     82        57        74
## 3        59         81            82          54     49        79        78
## 4        13         26            20          11     63        15        69
## 5        57         65            71          77     72        41        73
## 6        58         76            79          50     67        64        70
## 7        68         75            30          78     73        77        70
##   Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2      73             75            72        9         14         6
## 3      82             83            79        7          9        12
## 4      18             20            12       80         73        65
## 5      76             76            80        7         12        10
## 6      83             77            76       12          7        11
## 7      21             15            19       15         12        11
##   GKPositioning GKReflexes Release Clause
## 2             9         10               
## 3            10         15               
## 4            77         85               
## 5             8         15               
## 6            12         13               
## 7            11          8
# Number of player rows remaining after the header row was dropped.
nrow(football)
## [1] 18207
# Player counts per Position value; the unlabelled first cell counts rows
# whose Position is blank.
table(football$Position)
## 
##       CAM   CB  CDM   CF   CM   GK  LAM   LB  LCB  LCM  LDM   LF   LM   LS   LW 
##   60  958 1778  948   74 1394 2025   21 1322  648  395  243   15 1095  207  381 
##  LWB  RAM   RB  RCB  RCM  RDM   RF   RM   RS   RW  RWB   ST 
##   78   21 1291  662  391  248   16 1124  203  370   87 2152

2. Scatter Plot

2.1 Filter for strikers

Strikers are defined in the dataset as Position = “ST”.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the strikers, i.e. the rows whose Position equals "ST".
football_st <- dplyr::filter(football, Position == "ST")
head(football_st)
##       ID            Name Age                                          Photo
## 1 187607       A. Dzyuba  29 https://cdn.sofifa.org/players/4/19/187607.png
## 2 183389          G. Sio  29 https://cdn.sofifa.org/players/4/19/183389.png
## 3 245683       K. Fofana  26 https://cdn.sofifa.org/players/4/19/245683.png
## 4 190461 B. Sigur̡arson  27 https://cdn.sofifa.org/players/4/19/190461.png
## 5 225900    J. Sambenito  26 https://cdn.sofifa.org/players/4/19/225900.png
## 6 246405       B. Angulo  22 https://cdn.sofifa.org/players/4/19/246405.png
##   Nationality                                 Flag Overall Potential Club
## 1      Russia  https://cdn.sofifa.org/flags/40.png      78        78     
## 2 Ivory Coast https://cdn.sofifa.org/flags/108.png      77        77     
## 3 Ivory Coast https://cdn.sofifa.org/flags/108.png      75        75     
## 4     Iceland  https://cdn.sofifa.org/flags/24.png      73        74     
## 5    Paraguay  https://cdn.sofifa.org/flags/58.png      71        74     
## 6     Ecuador  https://cdn.sofifa.org/flags/57.png      71        77     
##                              Club Logo Value Wage Special Preferred Foot
## 1  https://cdn.sofifa.org/flags/40.png  5764 1105    1810          Right
## 2 https://cdn.sofifa.org/flags/108.png  5275 2138    1933           Left
## 3 https://cdn.sofifa.org/flags/108.png  5589 3875    1877          Right
## 4  https://cdn.sofifa.org/flags/24.png  5629 3661    1893          Right
## 5  https://cdn.sofifa.org/flags/58.png  6113 2445    1651          Right
## 6  https://cdn.sofifa.org/flags/57.png  5057 2216    1628          Right
##   International Reputation Weak Foot Skill Moves      Work Rate Body Type
## 1                        2         3           3   High/ Medium    Stocky
## 2                        2         3           3      High/ Low    Normal
## 3                        1         3           3 Medium/ Medium    Normal
## 4                        1         4           3     High/ High    Normal
## 5                        1         3           2   High/ Medium      Lean
## 6                        1         4           3      High/ Low    Normal
##   Real Face Position Jersey Number Joined Loaned From Contract Valid Until
## 1        No       ST            22                                        
## 2        No       ST            21                                        
## 3        No       ST            22                                        
## 4        No       ST             9                                        
## 5        No       ST             9                                        
## 6        No       ST            19                                        
##   Height Weight   LS   ST   RS   LW   LF   CF   RF   RW  LAM  CAM  RAM   LM
## 1    6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2 74+2 71+2 71+2 71+2 71+2 71+2
## 2   5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 74+2 74+2 74+2 74+2
## 3    6'2 179lbs 73+2 73+2 73+2 71+2 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2
## 4    6'1 190lbs 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2 70+2 70+2 70+2 71+2
## 5    6'0 190lbs 70+2 70+2 70+2 64+2 67+2 67+2 67+2 64+2 63+2 63+2 63+2 62+2
## 6    6'0 154lbs 70+2 70+2 70+2 67+2 68+2 68+2 68+2 67+2 63+2 63+2 63+2 65+2
##    LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB   LB  LCB   CB  RCB   RB
## 1 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2 48+2 48+2 48+2 48+2 48+2
## 2 67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2 50+2 46+2 46+2 46+2 50+2
## 3 67+2 67+2 67+2 71+2 59+2 57+2 57+2 57+2 59+2 57+2 52+2 52+2 52+2 57+2
## 4 64+2 64+2 64+2 71+2 59+2 55+2 55+2 55+2 59+2 56+2 53+2 53+2 53+2 56+2
## 5 55+2 55+2 55+2 62+2 43+2 41+2 41+2 41+2 43+2 41+2 38+2 38+2 38+2 41+2
## 6 54+2 54+2 54+2 65+2 47+2 39+2 39+2 39+2 47+2 44+2 36+2 36+2 36+2 44+2
##   Crossing Finishing HeadingAccuracy ShortPassing Volleys Dribbling Curve
## 1       61        79              86           71      74        71    64
## 2       68        77              71           73      73        76    73
## 3       66        75              72           74      74        72    63
## 4       66        71              68           68      65        73    63
## 5       40        74              72           57      72        60    64
## 6       50        78              69           56      46        76    58
##   FKAccuracy LongPassing BallControl Acceleration SprintSpeed Agility Reactions
## 1         60          55          77           66          65      50        75
## 2         69          67          76           78          85      79        71
## 3         59          58          75           59          77      63        72
## 4         48          44          73           78          79      83        74
## 5         42          42          63           79          72      61        69
## 6         58          33          71           82          79      78        73
##   Balance ShotPower Jumping Stamina Strength LongShots Aggression Interceptions
## 1      32        78      63      77       93        68         75            30
## 2      73        77      70      78       74        74         77            18
## 3      60        78      69      83       77        73         67            40
## 4      76        68      78      90       85        66         73            42
## 5      64        73      69      67       72        67         49            14
## 6      64        72      69      77       69        54         28            16
##   Positioning Vision Penalties Composure Marking StandingTackle SlidingTackle
## 1          78     73        77        70      21             15            19
## 2          76     73        72        72      40             18            12
## 3          72     69        74        83      23             37            46
## 4          73     64        69        76      31             39            24
## 5          75     60        67        74      15             16            16
## 6          62     45        82        51      11             18            12
##   GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause
## 1       15         12        11            11          8               
## 2       15          9        10            15         16               
## 3        7         11         7            11         14               
## 4        9         12        10            15         16               
## 5       15         16        15             7          7               
## 6       11          8        10             7          6
# Count the striker rows retained by the filter.
nrow(football_st)
## [1] 2152

2.2 Scatter Plot

Convert `Wage` and `Value` from character to numeric so they can be plotted and modelled.

# Check how the two modelling columns are stored; both print as character.
str(football_st[["Wage"]])
##  chr [1:2152] "1105" "2138" "3875" "3661" "2445" "2216" "4457" "3370" ...
str(football_st[["Value"]])
##  chr [1:2152] "5764" "5275" "5589" "5629" "6113" "5057" "6561" "6146" ...
# Coerce both columns to numeric in a single assignment.
football_st[c("Wage", "Value")] <- lapply(
  football_st[c("Wage", "Value")],
  as.numeric
)
library(ggplot2)
library(ggpubr)

# Scatter plot of Value against Wage, overlaid with a fitted linear
# regression line, the regression equation and the Pearson correlation.
ggplot(football_st, aes(x = Wage, y = Value)) +
  geom_point(shape = 2, colour = "black") +
  geom_smooth(method = lm) +
  stat_regline_equation(label.x = 150000, label.y = 1700) +
  stat_cor(method = "pearson", label.x = 300000, label.y = 1600) +
  labs(x = "Wage", y = "Value", title = "Wage and Value")
## `geom_smooth()` using formula 'y ~ x'

3. Simple Linear Regression

# Simple linear regression with Value as the response and Wage as the single
# predictor.
# NOTE(review): the formula reaches into the data frame with `football_st$...`
# rather than using `lm(Value ~ Wage, data = football_st)`. This is why the
# coefficient is labelled `football_st$Wage` in the printed summary, and it
# means predict() cannot be given new data for this model. Consider switching
# to the `data =` form when the document is next re-knitted.
value_simple <- lm(football_st$Value ~ football_st$Wage)
summary(value_simple)
## 
## Call:
## lm(formula = football_st$Value ~ football_st$Wage)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -17073527   -633009   -209153    198333  38355242 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -4.175e+05  7.060e+04  -5.913 3.91e-09 ***
## football_st$Wage  2.179e+02  2.721e+00  80.068  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2838000 on 2150 degrees of freedom
## Multiple R-squared:  0.7489, Adjusted R-squared:  0.7487 
## F-statistic:  6411 on 1 and 2150 DF,  p-value: < 2.2e-16
confint(value_simple, level = 0.95)
##                         2.5 %       97.5 %
## (Intercept)      -555911.3195 -278995.9221
## football_st$Wage     212.5681     223.2422

4. Residuals

value_simple_stdresiduals <- rstandard(value_simple)
head(value_simple_stdresiduals)
##           1           2           3           4           5           6 
##  0.06430004 -0.01520939 -0.14850129 -0.13205208 -0.03849210 -0.02127676

Append the standardised residuals to the data.

football_st_comb <- cbind(football_st, value_simple_stdresiduals)
head(football_st_comb)
##       ID            Name Age                                          Photo
## 1 187607       A. Dzyuba  29 https://cdn.sofifa.org/players/4/19/187607.png
## 2 183389          G. Sio  29 https://cdn.sofifa.org/players/4/19/183389.png
## 3 245683       K. Fofana  26 https://cdn.sofifa.org/players/4/19/245683.png
## 4 190461 B. Sigur̡arson  27 https://cdn.sofifa.org/players/4/19/190461.png
## 5 225900    J. Sambenito  26 https://cdn.sofifa.org/players/4/19/225900.png
## 6 246405       B. Angulo  22 https://cdn.sofifa.org/players/4/19/246405.png
##   Nationality                                 Flag Overall Potential Club
## 1      Russia  https://cdn.sofifa.org/flags/40.png      78        78     
## 2 Ivory Coast https://cdn.sofifa.org/flags/108.png      77        77     
## 3 Ivory Coast https://cdn.sofifa.org/flags/108.png      75        75     
## 4     Iceland  https://cdn.sofifa.org/flags/24.png      73        74     
## 5    Paraguay  https://cdn.sofifa.org/flags/58.png      71        74     
## 6     Ecuador  https://cdn.sofifa.org/flags/57.png      71        77     
##                              Club Logo Value Wage Special Preferred Foot
## 1  https://cdn.sofifa.org/flags/40.png  5764 1105    1810          Right
## 2 https://cdn.sofifa.org/flags/108.png  5275 2138    1933           Left
## 3 https://cdn.sofifa.org/flags/108.png  5589 3875    1877          Right
## 4  https://cdn.sofifa.org/flags/24.png  5629 3661    1893          Right
## 5  https://cdn.sofifa.org/flags/58.png  6113 2445    1651          Right
## 6  https://cdn.sofifa.org/flags/57.png  5057 2216    1628          Right
##   International Reputation Weak Foot Skill Moves      Work Rate Body Type
## 1                        2         3           3   High/ Medium    Stocky
## 2                        2         3           3      High/ Low    Normal
## 3                        1         3           3 Medium/ Medium    Normal
## 4                        1         4           3     High/ High    Normal
## 5                        1         3           2   High/ Medium      Lean
## 6                        1         4           3      High/ Low    Normal
##   Real Face Position Jersey Number Joined Loaned From Contract Valid Until
## 1        No       ST            22                                        
## 2        No       ST            21                                        
## 3        No       ST            22                                        
## 4        No       ST             9                                        
## 5        No       ST             9                                        
## 6        No       ST            19                                        
##   Height Weight   LS   ST   RS   LW   LF   CF   RF   RW  LAM  CAM  RAM   LM
## 1    6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2 74+2 71+2 71+2 71+2 71+2 71+2
## 2   5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 74+2 74+2 74+2 74+2
## 3    6'2 179lbs 73+2 73+2 73+2 71+2 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2
## 4    6'1 190lbs 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2 70+2 70+2 70+2 71+2
## 5    6'0 190lbs 70+2 70+2 70+2 64+2 67+2 67+2 67+2 64+2 63+2 63+2 63+2 62+2
## 6    6'0 154lbs 70+2 70+2 70+2 67+2 68+2 68+2 68+2 67+2 63+2 63+2 63+2 65+2
##    LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB   LB  LCB   CB  RCB   RB
## 1 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2 48+2 48+2 48+2 48+2 48+2
## 2 67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2 50+2 46+2 46+2 46+2 50+2
## 3 67+2 67+2 67+2 71+2 59+2 57+2 57+2 57+2 59+2 57+2 52+2 52+2 52+2 57+2
## 4 64+2 64+2 64+2 71+2 59+2 55+2 55+2 55+2 59+2 56+2 53+2 53+2 53+2 56+2
## 5 55+2 55+2 55+2 62+2 43+2 41+2 41+2 41+2 43+2 41+2 38+2 38+2 38+2 41+2
## 6 54+2 54+2 54+2 65+2 47+2 39+2 39+2 39+2 47+2 44+2 36+2 36+2 36+2 44+2
##   Crossing Finishing HeadingAccuracy ShortPassing Volleys Dribbling Curve
## 1       61        79              86           71      74        71    64
## 2       68        77              71           73      73        76    73
## 3       66        75              72           74      74        72    63
## 4       66        71              68           68      65        73    63
## 5       40        74              72           57      72        60    64
## 6       50        78              69           56      46        76    58
##   FKAccuracy LongPassing BallControl Acceleration SprintSpeed Agility Reactions
## 1         60          55          77           66          65      50        75
## 2         69          67          76           78          85      79        71
## 3         59          58          75           59          77      63        72
## 4         48          44          73           78          79      83        74
## 5         42          42          63           79          72      61        69
## 6         58          33          71           82          79      78        73
##   Balance ShotPower Jumping Stamina Strength LongShots Aggression Interceptions
## 1      32        78      63      77       93        68         75            30
## 2      73        77      70      78       74        74         77            18
## 3      60        78      69      83       77        73         67            40
## 4      76        68      78      90       85        66         73            42
## 5      64        73      69      67       72        67         49            14
## 6      64        72      69      77       69        54         28            16
##   Positioning Vision Penalties Composure Marking StandingTackle SlidingTackle
## 1          78     73        77        70      21             15            19
## 2          76     73        72        72      40             18            12
## 3          72     69        74        83      23             37            46
## 4          73     64        69        76      31             39            24
## 5          75     60        67        74      15             16            16
## 6          62     45        82        51      11             18            12
##   GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause
## 1       15         12        11            11          8               
## 2       15          9        10            15         16               
## 3        7         11         7            11         14               
## 4        9         12        10            15         16               
## 5       15         16        15             7          7               
## 6       11          8        10             7          6               
##   value_simple_stdresiduals
## 1                0.06430004
## 2               -0.01520939
## 3               -0.14850129
## 4               -0.13205208
## 5               -0.03849210
## 6               -0.02127676

Plot residuals.

# Plot the standardised residuals of the simple model against Value.
# Columns are referenced by bare name inside aes(): ggplot2 resolves them in
# the data frame given to ggplot(), whereas `df$col` inside aes() triggers the
# "Use of `df$col` is discouraged" warnings.
ggplot(football_st_comb) +
  aes(x = Value, y = value_simple_stdresiduals) +
  geom_point() +
  xlab("Value") + ylab("Standard Residuals") +
  ggtitle("Wage and Value Prediction, Residuals")

4.1 Normality

# Histogram of Value (default binning) to inspect the shape of its
# distribution before the formal normality test.
ggplot(football_st, aes(x = Value)) +
  geom_histogram() +
  labs(y = "Count", title = "Distribution of Value")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using the Shapiro-Wilk test.

H-0: normal distribution.

H-1: distribution is different from a normal distribution.

shapiro.test(football_st$Value)
## 
##  Shapiro-Wilk normality test
## 
## data:  football_st$Value
## W = 0.37447, p-value < 2.2e-16

4.2 Autocorrelation

This may not be very applicable here, but it is included for illustration.

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
durbinWatsonTest(value_simple) 
##  lag Autocorrelation D-W Statistic p-value
##    1       0.2167301      1.566536       0
##  Alternative hypothesis: rho != 0

5. Multiple Linear Regression

Subset data for simplicity.

football_st_2 <- football_st[, c("Age", "Balance", "ShotPower", "Aggression",
                                 "Positioning", "Composure", "Wage")]
head(football_st_2)
##   Age Balance ShotPower Aggression Positioning Composure Wage
## 1  29      32        78         75          78        70 1105
## 2  29      73        77         77          76        72 2138
## 3  26      60        78         67          72        83 3875
## 4  27      76        68         73          73        76 3661
## 5  26      64        73         49          75        74 2445
## 6  22      64        72         28          62        51 2216

Convert to numeric.

library(dplyr)
# Convert every character column to numeric. across() with where() is the
# current dplyr idiom; the scoped verb mutate_if() is superseded.
football_st_2 <- football_st_2 %>%
  mutate(across(where(is.character), as.numeric))
str(football_st_2)
## 'data.frame':    2152 obs. of  7 variables:
##  $ Age        : num  29 29 26 27 26 22 22 28 31 28 ...
##  $ Balance    : num  32 73 60 76 64 64 65 75 69 56 ...
##  $ ShotPower  : num  78 77 78 68 73 72 66 75 69 71 ...
##  $ Aggression : num  75 77 67 73 49 28 30 36 68 59 ...
##  $ Positioning: num  78 76 72 73 75 62 76 68 69 72 ...
##  $ Composure  : num  70 72 83 76 74 51 62 56 80 56 ...
##  $ Wage       : num  1105 2138 3875 3661 2445 ...

A multiple regression model showing unstandardised estimates.

The predictors included in the model are: Age, Balance, ShotPower, Aggression, Positioning, and Composure.

names(football_st_2)
## [1] "Age"         "Balance"     "ShotPower"   "Aggression"  "Positioning"
## [6] "Composure"   "Wage"
wage_model_st <- lm(Wage ~ Age + Balance + ShotPower +
                      Aggression + Positioning + Composure,
                    data = football_st_2)
summary(wage_model_st)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression + 
##     Positioning + Composure, data = football_st_2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31822  -8232  -2313   4754 350592 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -77073.40    4064.61 -18.962  < 2e-16 ***
## Age          -1014.25     110.94  -9.143  < 2e-16 ***
## Balance        120.41      35.90   3.354  0.00081 ***
## ShotPower      498.07      74.43   6.692 2.81e-11 ***
## Aggression      15.96      32.29   0.494  0.62129    
## Positioning    741.71      82.42   8.999  < 2e-16 ***
## Composure      424.72      71.66   5.927 3.58e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18840 on 2145 degrees of freedom
## Multiple R-squared:  0.2997, Adjusted R-squared:  0.2978 
## F-statistic:   153 on 6 and 2145 DF,  p-value: < 2.2e-16
coef(wage_model_st)
##  (Intercept)          Age      Balance    ShotPower   Aggression  Positioning 
## -77073.39877  -1014.24567    120.40620    498.06517     15.95657    741.70804 
##    Composure 
##    424.72405
confint(wage_model_st, level = 0.95)
##                    2.5 %       97.5 %
## (Intercept) -85044.38590 -69102.41165
## Age          -1231.79758   -796.69375
## Balance         50.00615    190.80626
## ShotPower      352.09956    644.03079
## Aggression     -47.37581     79.28895
## Positioning    580.07796    903.33813
## Composure      284.19780    565.25031

5.1 Standardised estimates

A multiple regression model showing standardised estimates.

The predictors included in the model are: Age, Balance, ShotPower, Aggression, Positioning, and Composure.

library(lm.beta)

wage_model_st_std <- lm.beta::lm.beta(wage_model_st)


coef(wage_model_st_std)
## (Intercept)         Age     Balance   ShotPower  Aggression Positioning 
##  0.00000000 -0.21358305  0.06178231  0.20182976  0.01126852  0.30316025 
##   Composure 
##  0.19146721
# NOTE(review): the intervals printed by this call do not look like valid
# confidence intervals for the standardised estimates. Each interval is centred
# on the standardised coefficient but has the same width as the corresponding
# unstandardised interval from confint(wage_model_st) (e.g. Age: half-width of
# about 217.55 in both outputs), which suggests the unstandardised standard
# errors are being paired with the standardised coefficients.
# Treat these numbers as unreliable -- TODO confirm and replace.
confint(wage_model_st_std)
##                   2.5 %     97.5 %
## (Intercept) -7970.98713 7970.98713
## Age          -217.76550  217.33833
## Balance       -70.33827   70.46184
## ShotPower    -145.76378  146.16744
## Aggression    -63.32111   63.34365
## Positioning  -161.32692  161.93324
## Composure    -140.33479  140.71772

5.2 Residuals

wage_model_st_residuals <- rstandard(wage_model_st)
head(wage_model_st_residuals)
##          1          2          3          4          5          6 
## -1.2711799 -1.4183035 -1.5151160 -1.1956035 -1.3820667 -0.5348701
football_st_comb_2 <- cbind(football_st_2, wage_model_st_residuals)
head(football_st_comb_2)
##   Age Balance ShotPower Aggression Positioning Composure Wage
## 1  29      32        78         75          78        70 1105
## 2  29      73        77         77          76        72 2138
## 3  26      60        78         67          72        83 3875
## 4  27      76        68         73          73        76 3661
## 5  26      64        73         49          75        74 2445
## 6  22      64        72         28          62        51 2216
##   wage_model_st_residuals
## 1              -1.2711799
## 2              -1.4183035
## 3              -1.5151160
## 4              -1.1956035
## 5              -1.3820667
## 6              -0.5348701
# Standardised residuals of the multiple regression plotted against the
# response (Wage) and against five of the predictors, one plot per variable
# (Balance is not plotted).
ggplot(football_st_comb_2) + aes(x = Wage, y = wage_model_st_residuals) +
  geom_point() + xlab("Wage") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Wage Prediction")

ggplot(football_st_comb_2) + aes(x = Age, y = wage_model_st_residuals) +
  geom_point() + xlab("Age") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Age")

ggplot(football_st_comb_2) + aes(x = ShotPower, y = wage_model_st_residuals) +
  geom_point() + xlab("Shot Power") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Shot Power")

ggplot(football_st_comb_2) + aes(x = Aggression, y = wage_model_st_residuals) +
  geom_point() + xlab("Aggression") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Aggression")

ggplot(football_st_comb_2) + aes(x = Positioning, y = wage_model_st_residuals) +
  geom_point() + xlab("Positioning") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Positioning")

ggplot(football_st_comb_2) + aes(x = Composure, y = wage_model_st_residuals) +
  geom_point() + xlab("Composure") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Composure")

5.3 Model evaluation

5.3.1 Normality

library(ggplot2)

# Histogram of striker wages on the raw scale (default binning).
ggplot(football_st_2, aes(x = Wage)) +
  geom_histogram() +
  labs(y = "Count", title = "Distribution of wage (strikers)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram with a log10-scaled x axis, to see how the distribution of
# wages looks on the log scale.
ggplot(football_st_2, aes(x = Wage)) +
  geom_histogram() +
  scale_x_log10() +
  labs(y = "Count", title = "Distribution of log(wage) (strikers)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using the Shapiro-Wilk test.

H-0: normal distribution

H-1: distribution is different from a normal distribution.

shapiro.test(football_st_2$Wage)
## 
##  Shapiro-Wilk normality test
## 
## data:  football_st_2$Wage
## W = 0.39056, p-value < 2.2e-16

5.3.2 Multicollinearity

The variance inflation factor (VIF) measures how much the variance of an estimated regression coefficient increases if your predictors are correlated.

In other words, no two predictors should be strongly correlated with each other.

If no factors are correlated, the VIFs will all be 1.

Rule of thumb: If VIF > 10, multicollinearity is high.

library(car)
vif(wage_model_st)
##         Age     Balance   ShotPower  Aggression Positioning   Composure 
##    1.671663    1.039327    2.786601    1.593244    3.476150    3.196433

5.3.3 Autocorrelation

0 <= D-W <= 4.

Rule of thumb:

D-W = 2.0 means that there is no autocorrelation.

D-W < 2 means there is positive autocorrelation.

D-W > 2 means negative autocorrelation.

This applies to time-series data, so it is not really applicable here.

durbinWatsonTest(wage_model_st)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.5038085     0.9915208       0
##  Alternative hypothesis: rho != 0

5.3.4 Automatic evaluation

We can also automatically evaluate the model.

library(gvlma)

gvlma(wage_model_st)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression + 
##     Positioning + Composure, data = football_st_2)
## 
## Coefficients:
## (Intercept)          Age      Balance    ShotPower   Aggression  Positioning  
##   -77073.40     -1014.25       120.41       498.07        15.96       741.71  
##   Composure  
##      424.72  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = wage_model_st) 
## 
##                        Value p-value                   Decision
## Global Stat        1307104.5       0 Assumptions NOT satisfied!
## Skewness             26054.7       0 Assumptions NOT satisfied!
## Kurtosis           1280082.5       0 Assumptions NOT satisfied!
## Link Function          791.9       0 Assumptions NOT satisfied!
## Heteroscedasticity     175.5       0 Assumptions NOT satisfied!

5.3.5 Heteroskedasticity

Perform a Breusch-Pagan Test to test for heteroskedasticity/homoskedasticity.

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bptest(wage_model_st)
## 
##  studentized Breusch-Pagan test
## 
## data:  wage_model_st
## BP = 91.188, df = 6, p-value < 2.2e-16
plot(wage_model_st, 1)

library(olsrr) 
## Warning: package 'olsrr' was built under R version 4.0.5
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
ols_test_breusch_pagan(wage_model_st)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##               Data               
##  --------------------------------
##  Response : Wage 
##  Variables: fitted values of Wage 
## 
##         Test Summary         
##  ----------------------------
##  DF            =    1 
##  Chi2          =    5352.5071 
##  Prob > Chi2   =    0.0000

Multiple test for each variable.

ols_test_breusch_pagan(wage_model_st, rhs = TRUE,
                       multiple = TRUE)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                               Data                                
##  -----------------------------------------------------------------
##  Response : Wage 
##  Variables: Age Balance ShotPower Aggression Positioning Composure 
## 
##           Test Summary (Unadjusted p values)         
##  --------------------------------------------------
##   Variable           chi2       df          p       
##  --------------------------------------------------
##   Age               487.2866     1    5.549066e-108 
##   Balance           147.9854     1     4.778931e-34 
##   ShotPower        3632.0162     1     0.000000e+00 
##   Aggression        637.7948     1    1.008165e-140 
##   Positioning      4068.5226     1     0.000000e+00 
##   Composure        4081.2646     1     0.000000e+00 
##  --------------------------------------------------
##   simultaneous     5538.8585     6     0.000000e+00 
##  --------------------------------------------------

6. Stepwise regression

Stepwise regression is a modification of ordinary regression in which predictors are added or removed one at a time; here the choice at each step is based on AIC.

library(stats)
wage_model_st_step <- step(wage_model_st,
                           direction = "both")
## Start:  AIC=42374.94
## Wage ~ Age + Balance + ShotPower + Aggression + Positioning + 
##     Composure
## 
##               Df  Sum of Sq        RSS   AIC
## - Aggression   1 8.6672e+07 7.6162e+11 42373
## <none>                      7.6154e+11 42375
## - Balance      1 3.9939e+09 7.6553e+11 42384
## - Composure    1 1.2472e+10 7.7401e+11 42408
## - ShotPower    1 1.5897e+10 7.7743e+11 42417
## - Positioning  1 2.8752e+10 7.9029e+11 42453
## - Age          1 2.9676e+10 7.9121e+11 42455
## 
## Step:  AIC=42373.18
## Wage ~ Age + Balance + ShotPower + Positioning + Composure
## 
##               Df  Sum of Sq        RSS   AIC
## <none>                      7.6162e+11 42373
## + Aggression   1 8.6672e+07 7.6154e+11 42375
## - Balance      1 3.9197e+09 7.6554e+11 42382
## - Composure    1 1.2939e+10 7.7456e+11 42407
## - ShotPower    1 1.7279e+10 7.7890e+11 42419
## - Positioning  1 2.8770e+10 7.9039e+11 42451
## - Age          1 3.0373e+10 7.9200e+11 42455
summary(wage_model_st_step)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Positioning + 
##     Composure, data = football_st_2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31793  -8228  -2326   4830 350282 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -77250.10    4048.13 -19.083  < 2e-16 ***
## Age          -1002.58     108.38  -9.251  < 2e-16 ***
## Balance        118.78      35.74   3.323 0.000904 ***
## ShotPower      506.25      72.55   6.978 3.98e-12 ***
## Positioning    741.93      82.40   9.004  < 2e-16 ***
## Composure      429.17      71.08   6.038 1.83e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18840 on 2146 degrees of freedom
## Multiple R-squared:  0.2997, Adjusted R-squared:  0.298 
## F-statistic: 183.6 on 5 and 2146 DF,  p-value: < 2.2e-16
gvlma(wage_model_st_step)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Positioning + 
##     Composure, data = football_st_2)
## 
## Coefficients:
## (Intercept)          Age      Balance    ShotPower  Positioning    Composure  
##    -77250.1      -1002.6        118.8        506.2        741.9        429.2  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = wage_model_st_step) 
## 
##                        Value p-value                   Decision
## Global Stat        1300530.2       0 Assumptions NOT satisfied!
## Skewness             25983.6       0 Assumptions NOT satisfied!
## Kurtosis           1273577.0       0 Assumptions NOT satisfied!
## Link Function          794.0       0 Assumptions NOT satisfied!
## Heteroscedasticity     175.5       0 Assumptions NOT satisfied!

7. Data mining approach

Now, we will use the data mining approach.

7.1 Training validation split

Split the data into training and validation sets.

Set the seed using our favourite number :-)

set.seed(666)

Create the indices for the split. This samples the row indices to split the data into training and validation.

# Draw 60% of the row indices (without replacement) for training; the
# remaining indices form the validation set.
# seq_len() is safe for a zero-row data frame (1:0 would yield c(1, 0)), and
# floor() makes the truncation of the non-integer sample size explicit.
train_index <- sample(seq_len(nrow(football_st_2)),
                      floor(0.6 * nrow(football_st_2)))
valid_index <- setdiff(seq_len(nrow(football_st_2)), train_index)

Using the indices, create the training and validation sets. This is similar in principle to splitting a data frame by row.

train_df_st <- football_st_2[train_index, ]
valid_df_st <- football_st_2[valid_index, ]

It is a good habit to check after splitting.

nrow(train_df_st)
## [1] 1291
nrow(valid_df_st)
## [1] 861

7.2 Training

Training the model on the training set.

wage_model_st_2 <- lm(Wage ~ Age + Balance + ShotPower + 
                        Aggression + Positioning + Composure,
                      data = train_df_st)
summary(wage_model_st_2)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression + 
##     Positioning + Composure, data = train_df_st)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -32654  -8533  -2462   5056 346913 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -82121.16    5937.75 -13.830  < 2e-16 ***
## Age           -948.45     166.15  -5.708 1.42e-08 ***
## Balance        114.04      53.15   2.146   0.0321 *  
## ShotPower      489.51     110.63   4.425 1.05e-05 ***
## Aggression     -23.76      47.29  -0.502   0.6154    
## Positioning    730.41     121.49   6.012 2.39e-09 ***
## Composure      544.62     104.74   5.200 2.32e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21280 on 1284 degrees of freedom
## Multiple R-squared:  0.2704, Adjusted R-squared:  0.267 
## F-statistic:  79.3 on 6 and 1284 DF,  p-value: < 2.2e-16

7.3 Predicting

Predict the outcome (i.e. wage) of the validation set using the model from the training set.

library(forecast)
## Warning: package 'forecast' was built under R version 4.0.5
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## 
## Attaching package: 'forecast'
## The following object is masked from 'package:ggpubr':
## 
##     gghistogram
# Predictions of the trained model on the data it was fitted on (training set).
wage_model_st_2_pred_train <- predict(wage_model_st_2, newdata = train_df_st)

# Predictions of the same model on the held-out validation set.
wage_model_st_2_pred <- predict(wage_model_st_2, newdata = valid_df_st)

7.4 Model evaluation

Compare the errors between the training and validation sets.

accuracy(wage_model_st_2_pred_train, train_df_st$Wage)
##                    ME     RMSE      MAE       MPE     MAPE
## Test set 9.854906e-11 21227.13 9907.101 -32.18323 133.6435
accuracy(wage_model_st_2_pred, valid_df_st$Wage)
##                ME     RMSE      MAE       MPE     MAPE
## Test set 18.64833 14551.57 9388.476 -17.37455 128.1185
max(football_st_2$Wage) - min(football_st_2$Wage)
## [1] 406504
sd(football_st_2$Wage)
## [1] 22484.99

8. Categorical independent variables

Subset to include categorical variable: preferred foot

football_st_3 <- football_st[, c("Preferred Foot", "Positioning", "Composure", "Wage")]
head(football_st_3)
##   Preferred Foot Positioning Composure Wage
## 1          Right          78        70 1105
## 2           Left          76        72 2138
## 3          Right          72        83 3875
## 4          Right          73        76 3661
## 5          Right          75        74 2445
## 6          Right          62        51 2216
# Rename the first column ("Preferred Foot") to a syntactic name so it can be
# used directly in a model formula.
colnames(football_st_3)[1] <- "Preferred_Foot"

# Make sure both rating columns are numeric before modelling.
football_st_3[["Positioning"]] <- as.numeric(football_st_3[["Positioning"]])
football_st_3[["Composure"]] <- as.numeric(football_st_3[["Composure"]])

# Regress Wage on preferred foot (as a factor) plus the two ratings.
wage_model_st_cat <- lm(
  Wage ~ factor(Preferred_Foot) + Positioning + Composure,
  data = football_st_3
)
summary(wage_model_st_cat)
## 
## Call:
## lm(formula = Wage ~ factor(Preferred_Foot) + Positioning + Composure, 
##     data = football_st_3)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31805  -8181  -2270   4528 354971 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -65474.98    3209.87 -20.398  < 2e-16 ***
## factor(Preferred_Foot)Right  -1087.48    1221.25  -0.890    0.373    
## Positioning                    816.64      75.34  10.840  < 2e-16 ***
## Composure                      438.10      68.36   6.409  1.8e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19400 on 2148 degrees of freedom
## Multiple R-squared:  0.2564, Adjusted R-squared:  0.2554 
## F-statistic: 246.9 on 3 and 2148 DF,  p-value: < 2.2e-16
confint(wage_model_st_cat, level = 0.95)
##                                   2.5 %      97.5 %
## (Intercept)                 -71769.7525 -59180.1997
## factor(Preferred_Foot)Right  -3482.4352   1307.4754
## Positioning                    668.9041    964.3812
## Composure                      304.0428    572.1551

8.1 Residuals

wage_model_st_cat_stdresiduals <- rstandard(wage_model_st_cat)
head(wage_model_st_cat_stdresiduals)
##          1          2          3          4          5          6 
## -1.3769868 -1.3424571 -1.2770311 -1.1703986 -1.2718790 -0.2163923
football_st_3_cat <- cbind(football_st_3, wage_model_st_cat_stdresiduals)
head(football_st_3_cat)
##   Preferred_Foot Positioning Composure Wage wage_model_st_cat_stdresiduals
## 1          Right          78        70 1105                     -1.3769868
## 2           Left          76        72 2138                     -1.3424571
## 3          Right          72        83 3875                     -1.2770311
## 4          Right          73        76 3661                     -1.1703986
## 5          Right          75        74 2445                     -1.2718790
## 6          Right          62        51 2216                     -0.2163923
# Standardised residuals of the categorical-predictor model against Wage.
ggplot(football_st_3_cat) + aes(x = Wage, y = wage_model_st_cat_stdresiduals) +
  geom_point() + xlab("Wage") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Wage")

Positioning

# Standardised residuals of the categorical-predictor model against Positioning.
ggplot(football_st_3_cat) + aes(x = Positioning, y = wage_model_st_cat_stdresiduals) +
  geom_point() + xlab("Positioning") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Positioning")

Composure

# Standardised residuals of the categorical-predictor model against Composure.
ggplot(football_st_3_cat) + aes(x = Composure, y = wage_model_st_cat_stdresiduals) +
  geom_point() + xlab("Composure") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Composure")

# Standardised residuals grouped by the categorical predictor, Preferred Foot.
ggplot(football_st_3_cat) + aes(x = Preferred_Foot, y = wage_model_st_cat_stdresiduals) +
  geom_point() + xlab("Preferred Foot") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Preferred Foot")

8.2 Model Evaluation

# Histogram of Wage (default binning) for the data used in the
# categorical-predictor model.
ggplot(football_st_3_cat, aes(x = Wage)) +
  geom_histogram() +
  labs(y = "Count", title = "Distribution of Wage")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using the Shapiro-Wilk test.

H-0: normal distribution.

H-alt: distribution is different from a normal distribution.

shapiro.test(football_st_3_cat$Wage)
## 
##  Shapiro-Wilk normality test
## 
## data:  football_st_3_cat$Wage
## W = 0.39056, p-value < 2.2e-16

Multicollinearity

vif(wage_model_st_cat)
## factor(Preferred_Foot)            Positioning              Composure 
##               1.002720               2.738872               2.743181

Homoscedasticity.

ols_test_breusch_pagan(wage_model_st_cat)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##               Data               
##  --------------------------------
##  Response : Wage 
##  Variables: fitted values of Wage 
## 
##         Test Summary         
##  ----------------------------
##  DF            =    1 
##  Chi2          =    4754.5635 
##  Prob > Chi2   =    0.0000
# Global validation of the linear model assumptions: global statistic,
# skewness, kurtosis, link function and heteroscedasticity.
gvlma(wage_model_st_cat)
## 
## Call:
## lm(formula = Wage ~ factor(Preferred_Foot) + Positioning + Composure, 
##     data = football_st_3)
## 
## Coefficients:
##                 (Intercept)  factor(Preferred_Foot)Right  
##                    -65475.0                      -1087.5  
##                 Positioning                    Composure  
##                       816.6                        438.1  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = wage_model_st_cat) 
## 
##                        Value p-value                   Decision
## Global Stat        1208344.4       0 Assumptions NOT satisfied!
## Skewness             25297.7       0 Assumptions NOT satisfied!
## Kurtosis           1182302.6       0 Assumptions NOT satisfied!
## Link Function          600.3       0 Assumptions NOT satisfied!
## Heteroscedasticity     143.8       0 Assumptions NOT satisfied!

9. Non-Linear regression

Sometimes, a relationship may not be linear. In this case, we can specify a non-linear relationship in the model.

9.1 Traditional statistics

We start with the traditional statistics approach and evaluate.

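The non-linear relationship is expressed in the model specification. Here it takes the form of an interaction between Positioning and Composure (`Positioning * Composure`), which lets the effect of one predictor on Wage depend on the level of the other.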

# List the columns of football_st_2 that are available to the model.
names(football_st_2)
## [1] "Age"         "Balance"     "ShotPower"   "Aggression"  "Positioning"
## [6] "Composure"   "Wage"
# Regress Wage on the six predictors in football_st_2. Positioning and
# Composure enter as main effects plus their interaction.
wage_model_st_nl <- lm(
  formula = Wage ~ Age + Balance + ShotPower + Aggression +
    Positioning * Composure,
  data = football_st_2
)
summary(wage_model_st_nl)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression + 
##     Positioning * Composure, data = football_st_2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -58380  -5245     80   4644 267683 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           297675.783  13442.584  22.144   <2e-16 ***
## Age                     -789.963     94.502  -8.359   <2e-16 ***
## Balance                   57.694     30.555   1.888   0.0591 .  
## ShotPower                642.408     63.389  10.134   <2e-16 ***
## Aggression                19.805     27.418   0.722   0.4702    
## Positioning            -5016.022    211.523 -23.714   <2e-16 ***
## Composure              -6150.054    235.919 -26.069   <2e-16 ***
## Positioning:Composure     96.301      3.339  28.844   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16000 on 2144 degrees of freedom
## Multiple R-squared:  0.4955, Adjusted R-squared:  0.4939 
## F-statistic: 300.8 on 7 and 2144 DF,  p-value: < 2.2e-16
# Variance inflation factor for each term of the interaction model.
# NOTE(review): Positioning, Composure and their product appear to be entered
# on their raw (uncentred) scale, which by itself tends to inflate these three
# values - confirm before reading them as problematic collinearity.
vif(wage_model_st_nl)
##                   Age               Balance             ShotPower 
##              1.683057              1.044616              2.804077 
##            Aggression           Positioning             Composure 
##              1.593281             31.765761             48.069231 
## Positioning:Composure 
##            127.119996
# Durbin-Watson test for lag-1 autocorrelation in the model residuals.
durbinWatsonTest(wage_model_st_nl)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.2531554      1.491911       0
##  Alternative hypothesis: rho != 0

9.2 Traditional statistics stepwise

Perform a stepwise regression on the model with the non-linear relationship, then evaluate the result.

# Stepwise selection by AIC starting from wage_model_st_nl; terms may be
# dropped and re-added (direction = "both").
wage_model_st_nl_step <- step(
  object = wage_model_st_nl,
  direction = "both"
)
## Start:  AIC=41671.29
## Wage ~ Age + Balance + ShotPower + Aggression + Positioning * 
##     Composure
## 
##                         Df  Sum of Sq        RSS   AIC
## - Aggression             1 1.3352e+08 5.4877e+11 41670
## <none>                                5.4863e+11 41671
## - Balance                1 9.1234e+08 5.4955e+11 41673
## - Age                    1 1.7881e+10 5.6652e+11 41738
## - ShotPower              1 2.6282e+10 5.7492e+11 41770
## - Positioning:Composure  1 2.1290e+11 7.6154e+11 42375
## 
## Step:  AIC=41669.81
## Wage ~ Age + Balance + ShotPower + Positioning + Composure + 
##     Positioning:Composure
## 
##                         Df  Sum of Sq        RSS   AIC
## <none>                                5.4877e+11 41670
## - Balance                1 8.5698e+08 5.4963e+11 41671
## + Aggression             1 1.3352e+08 5.4863e+11 41671
## - Age                    1 1.8041e+10 5.6681e+11 41737
## - ShotPower              1 2.8516e+10 5.7728e+11 41777
## - Positioning:Composure  1 2.1286e+11 7.6162e+11 42373
# Coefficient table and fit statistics for the model kept by step().
summary(wage_model_st_nl_step)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Positioning + 
##     Composure + Positioning:Composure, data = football_st_2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -58507  -5205     67   4579 267488 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           297410.796  13436.079  22.135   <2e-16 ***
## Age                     -775.517     92.352  -8.397   <2e-16 ***
## Balance                   55.684     30.424   1.830   0.0674 .  
## ShotPower                652.547     61.808  10.558   <2e-16 ***
## Positioning            -5015.048    211.495 -23.712   <2e-16 ***
## Composure              -6143.738    235.730 -26.063   <2e-16 ***
## Positioning:Composure     96.289      3.338  28.844   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15990 on 2145 degrees of freedom
## Multiple R-squared:  0.4954, Adjusted R-squared:  0.494 
## F-statistic:   351 on 6 and 2145 DF,  p-value: < 2.2e-16
# Variance inflation factor for each term of the stepwise-selected model.
vif(wage_model_st_nl_step)
##                   Age               Balance             ShotPower 
##              1.607679              1.035950              2.666586 
##           Positioning             Composure Positioning:Composure 
##             31.764471             48.003192            127.116986
# Durbin-Watson test for lag-1 autocorrelation in the residuals of the
# stepwise-selected model.
durbinWatsonTest(wage_model_st_nl_step)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.2522843      1.493672       0
##  Alternative hypothesis: rho != 0

9.3 Data mining approach

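A data mining approach with the non-linear relationship: the model is fitted on the training set (`train_df_st`) and its predictions are then evaluated on the validation set (`valid_df_st`).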

# Same specification as the interaction model, fitted on train_df_st only.
wage_model_st_nl_2 <- lm(
  formula = Wage ~ Age + Balance + ShotPower + Aggression +
    Positioning * Composure,
  data = train_df_st
)
summary(wage_model_st_nl_2)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression + 
##     Positioning * Composure, data = train_df_st)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69712  -5516    431   5121 257569 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           373590.851  19296.451  19.361  < 2e-16 ***
## Age                     -702.769    137.707  -5.103 3.84e-07 ***
## Balance                   39.904     44.039   0.906    0.365    
## ShotPower                691.250     91.817   7.529 9.63e-14 ***
## Aggression               -24.766     39.088  -0.634    0.526    
## Positioning            -6254.863    303.169 -20.632  < 2e-16 ***
## Composure              -7433.528    337.988 -21.993  < 2e-16 ***
## Positioning:Composure    116.410      4.767  24.419  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17590 on 1283 degrees of freedom
## Multiple R-squared:  0.5019, Adjusted R-squared:  0.4992 
## F-statistic: 184.7 on 7 and 1283 DF,  p-value: < 2.2e-16
# Predict Wage for the rows of valid_df_st, then score the predictions
# against the observed Wage values.
wage_model_st_nl_2_pred <- predict(wage_model_st_nl_2, newdata = valid_df_st)
accuracy(wage_model_st_nl_2_pred, valid_df_st[["Wage"]])
##                 ME     RMSE      MAE      MPE     MAPE
## Test set -547.7242 13726.06 8807.617 -31.1167 112.4235
# Standard deviation of Wage in football_st_2, for comparison with the error
# measures printed above.
# NOTE(review): the RMSE is computed on valid_df_st; consider whether
# sd(valid_df_st$Wage) would be the fairer yardstick.
sd(football_st_2$Wage)
## [1] 22484.99
# Global validation of the linear model assumptions for the interaction
# model fitted on train_df_st.
gvlma(wage_model_st_nl_2)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression + 
##     Positioning * Composure, data = train_df_st)
## 
## Coefficients:
##           (Intercept)                    Age                Balance  
##             373590.85                -702.77                  39.90  
##             ShotPower             Aggression            Positioning  
##                691.25                 -24.77               -6254.86  
##             Composure  Positioning:Composure  
##              -7433.53                 116.41  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = wage_model_st_nl_2) 
## 
##                        Value   p-value                   Decision
## Global Stat        337857.89 0.000e+00 Assumptions NOT satisfied!
## Skewness             7895.68 0.000e+00 Assumptions NOT satisfied!
## Kurtosis           329396.72 0.000e+00 Assumptions NOT satisfied!
## Link Function         507.03 0.000e+00 Assumptions NOT satisfied!
## Heteroscedasticity     58.47 2.065e-14 Assumptions NOT satisfied!

9.4 Data mining approach using stepwise

A data mining approach using a stepwise regression and non-linear relationship.

# Stepwise selection by AIC starting from wage_model_st_nl_2; terms may be
# dropped and re-added (direction = "both").
wage_model_st_nl_2_step <- step(
  object = wage_model_st_nl_2,
  direction = "both"
)
## Start:  AIC=25247.78
## Wage ~ Age + Balance + ShotPower + Aggression + Positioning * 
##     Composure
## 
##                         Df  Sum of Sq        RSS   AIC
## - Aggression             1 1.2426e+08 3.9726e+11 25246
## - Balance                1 2.5413e+08 3.9739e+11 25247
## <none>                                3.9713e+11 25248
## - Age                    1 8.0617e+09 4.0520e+11 25272
## - ShotPower              1 1.7544e+10 4.1468e+11 25302
## - Positioning:Composure  1 1.8458e+11 5.8171e+11 25739
## 
## Step:  AIC=25246.18
## Wage ~ Age + Balance + ShotPower + Positioning + Composure + 
##     Positioning:Composure
## 
##                         Df  Sum of Sq        RSS   AIC
## - Balance                1 2.9864e+08 3.9756e+11 25245
## <none>                                3.9726e+11 25246
## + Aggression             1 1.2426e+08 3.9713e+11 25248
## - Age                    1 8.9937e+09 4.0625e+11 25273
## - ShotPower              1 1.7797e+10 4.1506e+11 25301
## - Positioning:Composure  1 1.8457e+11 5.8183e+11 25737
## 
## Step:  AIC=25245.15
## Wage ~ Age + ShotPower + Positioning + Composure + Positioning:Composure
## 
##                         Df  Sum of Sq        RSS   AIC
## <none>                                3.9756e+11 25245
## + Balance                1 2.9864e+08 3.9726e+11 25246
## + Aggression             1 1.6876e+08 3.9739e+11 25247
## - Age                    1 9.3077e+09 4.0686e+11 25273
## - ShotPower              1 1.7503e+10 4.1506e+11 25299
## - Positioning:Composure  1 1.8649e+11 5.8405e+11 25740
# Coefficient table and fit statistics for the model kept by step().
summary(wage_model_st_nl_2_step)
## 
## Call:
## lm(formula = Wage ~ Age + ShotPower + Positioning + Composure + 
##     Positioning:Composure, data = train_df_st)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69304  -5565    386   5182 257753 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           378121.671  18793.092  20.120  < 2e-16 ***
## Age                     -732.952    133.630  -5.485 4.98e-08 ***
## ShotPower                668.100     88.824   7.522 1.01e-13 ***
## Positioning            -6270.832    302.640 -20.720  < 2e-16 ***
## Composure              -7453.830    337.457 -22.088  < 2e-16 ***
## Positioning:Composure    116.731      4.754  24.552  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17590 on 1285 degrees of freedom
## Multiple R-squared:  0.5013, Adjusted R-squared:  0.4994 
## F-statistic: 258.4 on 5 and 1285 DF,  p-value: < 2.2e-16
# Predict Wage for the rows of valid_df_st with the stepwise-selected model,
# then score the predictions against the observed Wage values.
wage_model_st_nl_2_step_pred <- predict(
  wage_model_st_nl_2_step,
  newdata = valid_df_st
)

accuracy(wage_model_st_nl_2_step_pred, valid_df_st[["Wage"]])
##                 ME     RMSE      MAE       MPE     MAPE
## Test set -584.0779 13718.68 8800.113 -31.92455 112.5375
# Global validation of the linear model assumptions for the
# stepwise-selected interaction model.
gvlma(wage_model_st_nl_2_step)
## 
## Call:
## lm(formula = Wage ~ Age + ShotPower + Positioning + Composure + 
##     Positioning:Composure, data = train_df_st)
## 
## Coefficients:
##           (Intercept)                    Age              ShotPower  
##              378121.7                 -733.0                  668.1  
##           Positioning              Composure  Positioning:Composure  
##               -6270.8                -7453.8                  116.7  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = wage_model_st_nl_2_step) 
## 
##                        Value   p-value                   Decision
## Global Stat        339857.02 0.000e+00 Assumptions NOT satisfied!
## Skewness             7928.00 0.000e+00 Assumptions NOT satisfied!
## Kurtosis           331364.96 0.000e+00 Assumptions NOT satisfied!
## Link Function         505.62 0.000e+00 Assumptions NOT satisfied!
## Heteroscedasticity     58.44 2.098e-14 Assumptions NOT satisfied!

10. Log variables

Sometimes, the data need to be transformed. A common transformation is the log transformation.

10.1 Traditional statistics

A traditional statistics approach using a log transformation.

Here, only the predictors are log-transformed; the response (Wage) is kept on its original scale.

# Regress Wage (untransformed) on the natural log of each of the six
# predictors in football_st_2.
wage_model_st_log <- lm(
  formula = Wage ~ log(Age) + log(Balance) + log(ShotPower) +
    log(Aggression) + log(Positioning) + log(Composure),
  data = football_st_2
)
summary(wage_model_st_log)
## 
## Call:
## lm(formula = Wage ~ log(Age) + log(Balance) + log(ShotPower) + 
##     log(Aggression) + log(Positioning) + log(Composure), data = football_st_2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -27838  -8379  -2853   4132 361712 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -345274      16356 -21.109  < 2e-16 ***
## log(Age)           -22193       2887  -7.688 2.26e-14 ***
## log(Balance)         6921       2150   3.220   0.0013 ** 
## log(ShotPower)      29539       4823   6.125 1.08e-09 ***
## log(Aggression)      1259       1621   0.777   0.4374    
## log(Positioning)    42091       5239   8.034 1.54e-15 ***
## log(Composure)      23706       4207   5.634 1.99e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19460 on 2145 degrees of freedom
## Multiple R-squared:  0.2529, Adjusted R-squared:  0.2508 
## F-statistic:   121 on 6 and 2145 DF,  p-value: < 2.2e-16
# Global validation of the linear model assumptions for the log-predictor
# model.
gvlma(wage_model_st_log)
## 
## Call:
## lm(formula = Wage ~ log(Age) + log(Balance) + log(ShotPower) + 
##     log(Aggression) + log(Positioning) + log(Composure), data = football_st_2)
## 
## Coefficients:
##      (Intercept)          log(Age)      log(Balance)    log(ShotPower)  
##          -345274            -22193              6921             29539  
##  log(Aggression)  log(Positioning)    log(Composure)  
##             1259             42091             23706  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = wage_model_st_log) 
## 
##                        Value p-value                   Decision
## Global Stat        1307658.4       0 Assumptions NOT satisfied!
## Skewness             26779.0       0 Assumptions NOT satisfied!
## Kurtosis           1280038.4       0 Assumptions NOT satisfied!
## Link Function          670.1       0 Assumptions NOT satisfied!
## Heteroscedasticity     170.9       0 Assumptions NOT satisfied!

10.2 Data mining with log

We can also use a data mining approach with the log transformation.

# Same log-predictor specification, fitted on train_df_st only.
wage_model_st_log_2 <- lm(
  formula = Wage ~ log(Age) + log(Balance) + log(ShotPower) +
    log(Aggression) + log(Positioning) + log(Composure),
  data = train_df_st
)
summary(wage_model_st_log_2)
## 
## Call:
## lm(formula = Wage ~ log(Age) + log(Balance) + log(ShotPower) + 
##     log(Aggression) + log(Positioning) + log(Composure), data = train_df_st)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -28621  -8672  -3007   4368 359491 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -362624.54   23959.55 -15.135  < 2e-16 ***
## log(Age)          -19955.08    4304.36  -4.636 3.91e-06 ***
## log(Balance)        7001.43    3201.89   2.187   0.0289 *  
## log(ShotPower)     28016.65    7149.05   3.919 9.36e-05 ***
## log(Aggression)       64.03    2383.50   0.027   0.9786    
## log(Positioning)   42553.11    7708.04   5.521 4.08e-08 ***
## log(Composure)     28357.91    6083.46   4.661 3.47e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21950 on 1284 degrees of freedom
## Multiple R-squared:  0.2241, Adjusted R-squared:  0.2204 
## F-statistic: 61.79 on 6 and 1284 DF,  p-value: < 2.2e-16
# Predict Wage for the rows of valid_df_st with the log-predictor model, then
# score the predictions against the observed Wage values.
wage_model_st_log_2_pred <- predict(
  wage_model_st_log_2,
  newdata = valid_df_st
)
accuracy(wage_model_st_log_2_pred, valid_df_st[["Wage"]])
##                ME     RMSE      MAE       MPE    MAPE
## Test set 20.75738 15065.11 9419.259 -25.61638 127.768
# Standard deviation of Wage in football_st_2, for comparison with the error
# measures printed above.
sd(football_st_2$Wage)
## [1] 22484.99
# Smallest and largest Wage in football_st_2.
range(football_st_2$Wage)
## [1]   1105 407609
# Global validation of the linear model assumptions for the log-predictor
# model fitted on train_df_st.
gvlma(wage_model_st_log_2)
## 
## Call:
## lm(formula = Wage ~ log(Age) + log(Balance) + log(ShotPower) + 
##     log(Aggression) + log(Positioning) + log(Composure), data = train_df_st)
## 
## Coefficients:
##      (Intercept)          log(Age)      log(Balance)    log(ShotPower)  
##       -362624.54         -19955.08           7001.43          28016.65  
##  log(Aggression)  log(Positioning)    log(Composure)  
##            64.03          42553.11          28357.91  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = wage_model_st_log_2) 
## 
##                        Value p-value                   Decision
## Global Stat        748249.63       0 Assumptions NOT satisfied!
## Skewness            17669.59       0 Assumptions NOT satisfied!
## Kurtosis           730101.21       0 Assumptions NOT satisfied!
## Link Function         402.67       0 Assumptions NOT satisfied!
## Heteroscedasticity     76.17       0 Assumptions NOT satisfied!

10.3 Stepwise data mining with log

A stepwise regression using data mining and log transformations.

# Stepwise selection by AIC starting from wage_model_st_log_2; terms may be
# dropped and re-added (direction = "both").
wage_model_st_log_2_step <- step(
  object = wage_model_st_log_2,
  direction = "both"
)
## Start:  AIC=25818.01
## Wage ~ log(Age) + log(Balance) + log(ShotPower) + log(Aggression) + 
##     log(Positioning) + log(Composure)
## 
##                    Df  Sum of Sq        RSS   AIC
## - log(Aggression)   1 3.4775e+05 6.1864e+11 25816
## <none>                           6.1864e+11 25818
## - log(Balance)      1 2.3037e+09 6.2094e+11 25821
## - log(ShotPower)    1 7.3996e+09 6.2604e+11 25831
## - log(Age)          1 1.0355e+10 6.2899e+11 25837
## - log(Composure)    1 1.0469e+10 6.2911e+11 25838
## - log(Positioning)  1 1.4684e+10 6.3332e+11 25846
## 
## Step:  AIC=25816.01
## Wage ~ log(Age) + log(Balance) + log(ShotPower) + log(Positioning) + 
##     log(Composure)
## 
##                    Df  Sum of Sq        RSS   AIC
## <none>                           6.1864e+11 25816
## + log(Aggression)   1 3.4775e+05 6.1864e+11 25818
## - log(Balance)      1 2.3247e+09 6.2096e+11 25819
## - log(ShotPower)    1 7.7312e+09 6.2637e+11 25830
## - log(Composure)    1 1.0688e+10 6.2933e+11 25836
## - log(Age)          1 1.0936e+10 6.2957e+11 25837
## - log(Positioning)  1 1.4687e+10 6.3333e+11 25844
# Coefficient table and fit statistics for the model kept by step().
summary(wage_model_st_log_2_step)
## 
## Call:
## lm(formula = Wage ~ log(Age) + log(Balance) + log(ShotPower) + 
##     log(Positioning) + log(Composure), data = train_df_st)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -28606  -8666  -2990   4367 359474 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -362686      23840 -15.214  < 2e-16 ***
## log(Age)           -19928       4181  -4.766 2.09e-06 ***
## log(Balance)         6992       3182   2.197   0.0282 *  
## log(ShotPower)      28055       7001   4.007 6.49e-05 ***
## log(Positioning)    42555       7705   5.523 4.02e-08 ***
## log(Composure)      28380       6023   4.712 2.72e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21940 on 1285 degrees of freedom
## Multiple R-squared:  0.2241, Adjusted R-squared:  0.221 
## F-statistic: 74.21 on 5 and 1285 DF,  p-value: < 2.2e-16
# Global validation of the linear model assumptions for the
# stepwise-selected log-predictor model.
gvlma(wage_model_st_log_2_step)
## 
## Call:
## lm(formula = Wage ~ log(Age) + log(Balance) + log(ShotPower) + 
##     log(Positioning) + log(Composure), data = train_df_st)
## 
## Coefficients:
##      (Intercept)          log(Age)      log(Balance)    log(ShotPower)  
##          -362686            -19928              6992             28055  
## log(Positioning)    log(Composure)  
##            42555             28380  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = wage_model_st_log_2_step) 
## 
##                        Value p-value                   Decision
## Global Stat        748063.91       0 Assumptions NOT satisfied!
## Skewness            17667.20       0 Assumptions NOT satisfied!
## Kurtosis           729919.19       0 Assumptions NOT satisfied!
## Link Function         401.37       0 Assumptions NOT satisfied!
## Heteroscedasticity     76.15       0 Assumptions NOT satisfied!