Directions

Regression models to predict the wages of football players.

Data for demo

Back to the spellbook

1. Load data

Load the data and explore them.

# Read the raw export without treating the first line as a header; the header
# row is promoted to column names in a later step. stringsAsFactors = FALSE is
# spelled out so every column is read as character on any R version (it only
# became the default in R 4.0.0); with factors, the later header promotion and
# as.numeric() conversions would operate on integer level codes.
football <- read.csv(
  "football_2.csv",
  header = FALSE,
  stringsAsFactors = FALSE
)
# Preview the first 10 rows; row 1 still holds the column labels.
head(football, 10)
##        V1           V2  V3                                             V4
## 1      ID         Name Age                                          Photo
## 2  207439   L. Paredes  24 https://cdn.sofifa.org/players/4/19/207439.png
## 3  156713 A. Granqvist  33 https://cdn.sofifa.org/players/4/19/156713.png
## 4  229909     A. Lunev  26 https://cdn.sofifa.org/players/4/19/229909.png
## 5  187347 I. Smolnikov  29 https://cdn.sofifa.org/players/4/19/187347.png
## 6  153260       Hilton  40 https://cdn.sofifa.org/players/4/19/153260.png
## 7  187607    A. Dzyuba  29 https://cdn.sofifa.org/players/4/19/187607.png
## 8  204341   LuÌ_s Neto  30 https://cdn.sofifa.org/players/4/19/204341.png
## 9  223058   D. Kuzyaev  25 https://cdn.sofifa.org/players/4/19/223058.png
## 10 183389       G. Sio  29 https://cdn.sofifa.org/players/4/19/183389.png
##             V5                                   V6      V7        V8
## 1  Nationality                                 Flag Overall Potential
## 2    Argentina  https://cdn.sofifa.org/flags/52.png      80        85
## 3       Sweden  https://cdn.sofifa.org/flags/46.png      80        80
## 4       Russia  https://cdn.sofifa.org/flags/40.png      79        81
## 5       Russia  https://cdn.sofifa.org/flags/40.png      79        79
## 6       Brazil  https://cdn.sofifa.org/flags/54.png      78        78
## 7       Russia  https://cdn.sofifa.org/flags/40.png      78        78
## 8     Portugal  https://cdn.sofifa.org/flags/38.png      77        77
## 9       Russia  https://cdn.sofifa.org/flags/40.png      77        80
## 10 Ivory Coast https://cdn.sofifa.org/flags/108.png      77        77
##                 V9                                         V10   V11   V12
## 1             Club                                   Club Logo Value  Wage
## 2                          https://cdn.sofifa.org/flags/52.png  5684  1602
## 3                          https://cdn.sofifa.org/flags/46.png  6370  3591
## 4                          https://cdn.sofifa.org/flags/40.png  5675  3672
## 5                          https://cdn.sofifa.org/flags/40.png  6030  1448
## 6  Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png  6405 19799
## 7                          https://cdn.sofifa.org/flags/40.png  5764  1105
## 8                          https://cdn.sofifa.org/flags/38.png  6075  2836
## 9                          https://cdn.sofifa.org/flags/40.png  5565  2653
## 10                        https://cdn.sofifa.org/flags/108.png  5275  2138
##        V13            V14                      V15       V16         V17
## 1  Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2     2122          Right                        2         4           4
## 3     1797          Right                        2         4           2
## 4     1217          Right                        1         3           1
## 5     2038          Right                        2         3           3
## 6     1807          Right                        2         3           3
## 7     1810          Right                        2         3           3
## 8     1749          Right                        1         3           2
## 9     2041          Right                        1         3           3
## 10    1933           Left                        2         3           3
##               V18       V19       V20      V21           V22      V23
## 1       Work Rate Body Type Real Face Position Jersey Number   Joined
## 2  Medium/ Medium    Normal        No       CM             5         
## 3    High/ Medium    Normal        No      LCB             4         
## 4  Medium/ Medium    Normal        No       GK            12         
## 5      High/ High      Lean        No       RB             2         
## 6  Medium/ Medium    Normal       Yes       CB             4 1-Aug-11
## 7    High/ Medium    Stocky        No       ST            22         
## 8  Medium/ Medium      Lean        No       CB             4         
## 9    Medium/ High      Lean        No       RM             7         
## 10      High/ Low    Normal        No       ST            21         
##            V24                  V25    V26    V27  V28  V29  V30  V31  V32  V33
## 1  Loaned From Contract Valid Until Height Weight   LS   ST   RS   LW   LF   CF
## 2                                     5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3                                      6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4                                      6'2 176lbs                              
## 5                                     5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6                              2019   5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
## 7                                      6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2
## 8                                      6'2 157lbs 52+2 52+2 52+2 51+2 51+2 51+2
## 9                                      6'0 163lbs 70+2 70+2 70+2 74+2 74+2 74+2
## 10                                    5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2
##     V34  V35  V36  V37  V38  V39  V40  V41  V42  V43  V44  V45  V46  V47  V48
## 1    RF   RW  LAM  CAM  RAM   LM  LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB
## 2  75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3  58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4                                                                            
## 5  72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6  59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
## 7  74+2 71+2 71+2 71+2 71+2 71+2 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2
## 8  51+2 51+2 54+2 54+2 54+2 54+2 61+2 61+2 61+2 54+2 67+2 72+2 72+2 72+2 67+2
## 9  74+2 74+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2
## 10 75+2 75+2 74+2 74+2 74+2 74+2 67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2
##     V49  V50  V51  V52  V53      V54       V55             V56          V57
## 1    LB  LCB   CB  RCB   RB Crossing Finishing HeadingAccuracy ShortPassing
## 2  74+2 72+2 72+2 72+2 74+2       76        55              60           84
## 3  70+2 79+2 79+2 79+2 70+2       49        51              81           73
## 4                                 16        14              17           25
## 5  78+2 73+2 73+2 73+2 78+2       73        61              69           79
## 6  68+2 76+2 76+2 76+2 68+2       60        45              79           73
## 7  48+2 48+2 48+2 48+2 48+2       61        79              86           71
## 8  69+2 75+2 75+2 75+2 69+2       42        33              80           72
## 9  74+2 70+2 70+2 70+2 74+2       67        64              51           82
## 10 50+2 46+2 46+2 46+2 50+2       68        77              71           73
##        V58       V59   V60        V61         V62         V63          V64
## 1  Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2       73        78    79         78          82          82           75
## 3       37        49    36         40          67          63           46
## 4       13        15    18         17          32          17           58
## 5       57        72    49         46          75          72           84
## 6       51        63    42         48          72          73           33
## 7       74        71    64         60          55          77           66
## 8       40        49    52         43          77          48           57
## 9       57        78    60         61          75          79           78
## 10      73        76    73         69          67          76           78
##            V65     V66       V67     V68       V69     V70     V71      V72
## 1  SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2           69      77        74      77        82      61      79       69
## 3           49      55        76      36        74      64      67       83
## 4           54      36        76      50        24      60      27       70
## 5           90      80        75      76        67      85      93       68
## 6           38      51        70      60        55      79      54       76
## 7           65      50        75      32        78      63      77       93
## 8           59      69        78      61        42      79      72       72
## 9           81      80        73      76        76      60      79       59
## 10          85      79        71      73        77      70      78       74
##          V73        V74           V75         V76    V77       V78       V79
## 1  LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2         80         79            72          74     82        57        74
## 3         59         81            82          54     49        79        78
## 4         13         26            20          11     63        15        69
## 5         57         65            71          77     72        41        73
## 6         58         76            79          50     67        64        70
## 7         68         75            30          78     73        77        70
## 8         37         76            78          44     46        47        72
## 9         74         70            74          71     70        63        64
## 10        74         77            18          76     73        72        72
##        V80            V81           V82      V83        V84       V85
## 1  Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2       73             75            72        9         14         6
## 3       82             83            79        7          9        12
## 4       18             20            12       80         73        65
## 5       76             76            80        7         12        10
## 6       83             77            76       12          7        11
## 7       21             15            19       15         12        11
## 8       80             77            78       10         15        13
## 9       71             77            76       15         16        13
## 10      40             18            12       15          9        10
##              V86        V87            V88
## 1  GKPositioning GKReflexes Release Clause
## 2              9         10               
## 3             10         15               
## 4             77         85               
## 5              8         15               
## 6             12         13               
## 7             11          8               
## 8             15          8               
## 9              7          8               
## 10            15         16
# Promote the first row (the CSV header that was read as data) to column names.
# Each cell is converted with as.character() explicitly instead of assigning the
# one-row data frame itself, so the labels stay correct even if a column is not
# stored as character (e.g. a factor).
names(football) <- vapply(football[1, ], as.character, character(1),
                          USE.NAMES = FALSE)
# Row 1 still duplicates the header at this point.
head(football)
##       ID         Name Age                                          Photo
## 1     ID         Name Age                                          Photo
## 2 207439   L. Paredes  24 https://cdn.sofifa.org/players/4/19/207439.png
## 3 156713 A. Granqvist  33 https://cdn.sofifa.org/players/4/19/156713.png
## 4 229909     A. Lunev  26 https://cdn.sofifa.org/players/4/19/229909.png
## 5 187347 I. Smolnikov  29 https://cdn.sofifa.org/players/4/19/187347.png
## 6 153260       Hilton  40 https://cdn.sofifa.org/players/4/19/153260.png
##   Nationality                                Flag Overall Potential
## 1 Nationality                                Flag Overall Potential
## 2   Argentina https://cdn.sofifa.org/flags/52.png      80        85
## 3      Sweden https://cdn.sofifa.org/flags/46.png      80        80
## 4      Russia https://cdn.sofifa.org/flags/40.png      79        81
## 5      Russia https://cdn.sofifa.org/flags/40.png      79        79
## 6      Brazil https://cdn.sofifa.org/flags/54.png      78        78
##              Club                                   Club Logo Value  Wage
## 1            Club                                   Club Logo Value  Wage
## 2                         https://cdn.sofifa.org/flags/52.png  5684  1602
## 3                         https://cdn.sofifa.org/flags/46.png  6370  3591
## 4                         https://cdn.sofifa.org/flags/40.png  5675  3672
## 5                         https://cdn.sofifa.org/flags/40.png  6030  1448
## 6 Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png  6405 19799
##   Special Preferred Foot International Reputation Weak Foot Skill Moves
## 1 Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2    2122          Right                        2         4           4
## 3    1797          Right                        2         4           2
## 4    1217          Right                        1         3           1
## 5    2038          Right                        2         3           3
## 6    1807          Right                        2         3           3
##        Work Rate Body Type Real Face Position Jersey Number   Joined
## 1      Work Rate Body Type Real Face Position Jersey Number   Joined
## 2 Medium/ Medium    Normal        No       CM             5         
## 3   High/ Medium    Normal        No      LCB             4         
## 4 Medium/ Medium    Normal        No       GK            12         
## 5     High/ High      Lean        No       RB             2         
## 6 Medium/ Medium    Normal       Yes       CB             4 1-Aug-11
##   Loaned From Contract Valid Until Height Weight   LS   ST   RS   LW   LF   CF
## 1 Loaned From Contract Valid Until Height Weight   LS   ST   RS   LW   LF   CF
## 2                                    5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3                                     6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4                                     6'2 176lbs                              
## 5                                    5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6                             2019   5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
##     RF   RW  LAM  CAM  RAM   LM  LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB
## 1   RF   RW  LAM  CAM  RAM   LM  LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB
## 2 75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3 58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4                                                                           
## 5 72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6 59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
##     LB  LCB   CB  RCB   RB Crossing Finishing HeadingAccuracy ShortPassing
## 1   LB  LCB   CB  RCB   RB Crossing Finishing HeadingAccuracy ShortPassing
## 2 74+2 72+2 72+2 72+2 74+2       76        55              60           84
## 3 70+2 79+2 79+2 79+2 70+2       49        51              81           73
## 4                                16        14              17           25
## 5 78+2 73+2 73+2 73+2 78+2       73        61              69           79
## 6 68+2 76+2 76+2 76+2 68+2       60        45              79           73
##   Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 1 Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2      73        78    79         78          82          82           75
## 3      37        49    36         40          67          63           46
## 4      13        15    18         17          32          17           58
## 5      57        72    49         46          75          72           84
## 6      51        63    42         48          72          73           33
##   SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 1 SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2          69      77        74      77        82      61      79       69
## 3          49      55        76      36        74      64      67       83
## 4          54      36        76      50        24      60      27       70
## 5          90      80        75      76        67      85      93       68
## 6          38      51        70      60        55      79      54       76
##   LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 1 LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2        80         79            72          74     82        57        74
## 3        59         81            82          54     49        79        78
## 4        13         26            20          11     63        15        69
## 5        57         65            71          77     72        41        73
## 6        58         76            79          50     67        64        70
##   Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 1 Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2      73             75            72        9         14         6
## 3      82             83            79        7          9        12
## 4      18             20            12       80         73        65
## 5      76             76            80        7         12        10
## 6      83             77            76       12          7        11
##   GKPositioning GKReflexes Release Clause
## 1 GKPositioning GKReflexes Release Clause
## 2             9         10               
## 3            10         15               
## 4            77         85               
## 5             8         15               
## 6            12         13
# Remove the first row, which only repeats the column labels. The remaining
# rows keep their original row names, so the numbering now starts at 2.
football <- football[-1, ]
head(football, n = 6)
##       ID         Name Age                                          Photo
## 2 207439   L. Paredes  24 https://cdn.sofifa.org/players/4/19/207439.png
## 3 156713 A. Granqvist  33 https://cdn.sofifa.org/players/4/19/156713.png
## 4 229909     A. Lunev  26 https://cdn.sofifa.org/players/4/19/229909.png
## 5 187347 I. Smolnikov  29 https://cdn.sofifa.org/players/4/19/187347.png
## 6 153260       Hilton  40 https://cdn.sofifa.org/players/4/19/153260.png
## 7 187607    A. Dzyuba  29 https://cdn.sofifa.org/players/4/19/187607.png
##   Nationality                                Flag Overall Potential
## 2   Argentina https://cdn.sofifa.org/flags/52.png      80        85
## 3      Sweden https://cdn.sofifa.org/flags/46.png      80        80
## 4      Russia https://cdn.sofifa.org/flags/40.png      79        81
## 5      Russia https://cdn.sofifa.org/flags/40.png      79        79
## 6      Brazil https://cdn.sofifa.org/flags/54.png      78        78
## 7      Russia https://cdn.sofifa.org/flags/40.png      78        78
##              Club                                   Club Logo Value  Wage
## 2                         https://cdn.sofifa.org/flags/52.png  5684  1602
## 3                         https://cdn.sofifa.org/flags/46.png  6370  3591
## 4                         https://cdn.sofifa.org/flags/40.png  5675  3672
## 5                         https://cdn.sofifa.org/flags/40.png  6030  1448
## 6 Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png  6405 19799
## 7                         https://cdn.sofifa.org/flags/40.png  5764  1105
##   Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2    2122          Right                        2         4           4
## 3    1797          Right                        2         4           2
## 4    1217          Right                        1         3           1
## 5    2038          Right                        2         3           3
## 6    1807          Right                        2         3           3
## 7    1810          Right                        2         3           3
##        Work Rate Body Type Real Face Position Jersey Number   Joined
## 2 Medium/ Medium    Normal        No       CM             5         
## 3   High/ Medium    Normal        No      LCB             4         
## 4 Medium/ Medium    Normal        No       GK            12         
## 5     High/ High      Lean        No       RB             2         
## 6 Medium/ Medium    Normal       Yes       CB             4 1-Aug-11
## 7   High/ Medium    Stocky        No       ST            22         
##   Loaned From Contract Valid Until Height Weight   LS   ST   RS   LW   LF   CF
## 2                                    5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3                                     6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4                                     6'2 176lbs                              
## 5                                    5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6                             2019   5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
## 7                                     6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2
##     RF   RW  LAM  CAM  RAM   LM  LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB
## 2 75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3 58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4                                                                           
## 5 72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6 59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
## 7 74+2 71+2 71+2 71+2 71+2 71+2 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2
##     LB  LCB   CB  RCB   RB Crossing Finishing HeadingAccuracy ShortPassing
## 2 74+2 72+2 72+2 72+2 74+2       76        55              60           84
## 3 70+2 79+2 79+2 79+2 70+2       49        51              81           73
## 4                                16        14              17           25
## 5 78+2 73+2 73+2 73+2 78+2       73        61              69           79
## 6 68+2 76+2 76+2 76+2 68+2       60        45              79           73
## 7 48+2 48+2 48+2 48+2 48+2       61        79              86           71
##   Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2      73        78    79         78          82          82           75
## 3      37        49    36         40          67          63           46
## 4      13        15    18         17          32          17           58
## 5      57        72    49         46          75          72           84
## 6      51        63    42         48          72          73           33
## 7      74        71    64         60          55          77           66
##   SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2          69      77        74      77        82      61      79       69
## 3          49      55        76      36        74      64      67       83
## 4          54      36        76      50        24      60      27       70
## 5          90      80        75      76        67      85      93       68
## 6          38      51        70      60        55      79      54       76
## 7          65      50        75      32        78      63      77       93
##   LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2        80         79            72          74     82        57        74
## 3        59         81            82          54     49        79        78
## 4        13         26            20          11     63        15        69
## 5        57         65            71          77     72        41        73
## 6        58         76            79          50     67        64        70
## 7        68         75            30          78     73        77        70
##   Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2      73             75            72        9         14         6
## 3      82             83            79        7          9        12
## 4      18             20            12       80         73        65
## 5      76             76            80        7         12        10
## 6      83             77            76       12          7        11
## 7      21             15            19       15         12        11
##   GKPositioning GKReflexes Release Clause
## 2             9         10               
## 3            10         15               
## 4            77         85               
## 5             8         15               
## 6            12         13               
## 7            11          8
# Number of player records left after removing the header row.
dim(football)[1L]
## [1] 18207
# Count the players recorded under each Position label.
table(football[["Position"]])
## 
##       CAM   CB  CDM   CF   CM   GK  LAM   LB  LCB  LCM  LDM   LF   LM   LS   LW 
##   60  958 1778  948   74 1394 2025   21 1322  648  395  243   15 1095  207  381 
##  LWB  RAM   RB  RCB  RCM  RDM   RF   RM   RS   RW  RWB   ST 
##   78   21 1291  662  391  248   16 1124  203  370   87 2152

2. Scatter Plot

2.1 Filter for strikers

Strikers are defined in the dataset as Position = “ST”.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the rows whose Position is "ST" (strikers). The call is
# namespace-qualified so it cannot be confused with stats::filter().
football_st <- dplyr::filter(football, Position == "ST")
head(football_st, n = 6)
##       ID            Name Age                                          Photo
## 1 187607       A. Dzyuba  29 https://cdn.sofifa.org/players/4/19/187607.png
## 2 183389          G. Sio  29 https://cdn.sofifa.org/players/4/19/183389.png
## 3 245683       K. Fofana  26 https://cdn.sofifa.org/players/4/19/245683.png
## 4 190461 B. Sigur̡arson  27 https://cdn.sofifa.org/players/4/19/190461.png
## 5 225900    J. Sambenito  26 https://cdn.sofifa.org/players/4/19/225900.png
## 6 246405       B. Angulo  22 https://cdn.sofifa.org/players/4/19/246405.png
##   Nationality                                 Flag Overall Potential Club
## 1      Russia  https://cdn.sofifa.org/flags/40.png      78        78     
## 2 Ivory Coast https://cdn.sofifa.org/flags/108.png      77        77     
## 3 Ivory Coast https://cdn.sofifa.org/flags/108.png      75        75     
## 4     Iceland  https://cdn.sofifa.org/flags/24.png      73        74     
## 5    Paraguay  https://cdn.sofifa.org/flags/58.png      71        74     
## 6     Ecuador  https://cdn.sofifa.org/flags/57.png      71        77     
##                              Club Logo Value Wage Special Preferred Foot
## 1  https://cdn.sofifa.org/flags/40.png  5764 1105    1810          Right
## 2 https://cdn.sofifa.org/flags/108.png  5275 2138    1933           Left
## 3 https://cdn.sofifa.org/flags/108.png  5589 3875    1877          Right
## 4  https://cdn.sofifa.org/flags/24.png  5629 3661    1893          Right
## 5  https://cdn.sofifa.org/flags/58.png  6113 2445    1651          Right
## 6  https://cdn.sofifa.org/flags/57.png  5057 2216    1628          Right
##   International Reputation Weak Foot Skill Moves      Work Rate Body Type
## 1                        2         3           3   High/ Medium    Stocky
## 2                        2         3           3      High/ Low    Normal
## 3                        1         3           3 Medium/ Medium    Normal
## 4                        1         4           3     High/ High    Normal
## 5                        1         3           2   High/ Medium      Lean
## 6                        1         4           3      High/ Low    Normal
##   Real Face Position Jersey Number Joined Loaned From Contract Valid Until
## 1        No       ST            22                                        
## 2        No       ST            21                                        
## 3        No       ST            22                                        
## 4        No       ST             9                                        
## 5        No       ST             9                                        
## 6        No       ST            19                                        
##   Height Weight   LS   ST   RS   LW   LF   CF   RF   RW  LAM  CAM  RAM   LM
## 1    6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2 74+2 71+2 71+2 71+2 71+2 71+2
## 2   5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 74+2 74+2 74+2 74+2
## 3    6'2 179lbs 73+2 73+2 73+2 71+2 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2
## 4    6'1 190lbs 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2 70+2 70+2 70+2 71+2
## 5    6'0 190lbs 70+2 70+2 70+2 64+2 67+2 67+2 67+2 64+2 63+2 63+2 63+2 62+2
## 6    6'0 154lbs 70+2 70+2 70+2 67+2 68+2 68+2 68+2 67+2 63+2 63+2 63+2 65+2
##    LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB   LB  LCB   CB  RCB   RB
## 1 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2 48+2 48+2 48+2 48+2 48+2
## 2 67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2 50+2 46+2 46+2 46+2 50+2
## 3 67+2 67+2 67+2 71+2 59+2 57+2 57+2 57+2 59+2 57+2 52+2 52+2 52+2 57+2
## 4 64+2 64+2 64+2 71+2 59+2 55+2 55+2 55+2 59+2 56+2 53+2 53+2 53+2 56+2
## 5 55+2 55+2 55+2 62+2 43+2 41+2 41+2 41+2 43+2 41+2 38+2 38+2 38+2 41+2
## 6 54+2 54+2 54+2 65+2 47+2 39+2 39+2 39+2 47+2 44+2 36+2 36+2 36+2 44+2
##   Crossing Finishing HeadingAccuracy ShortPassing Volleys Dribbling Curve
## 1       61        79              86           71      74        71    64
## 2       68        77              71           73      73        76    73
## 3       66        75              72           74      74        72    63
## 4       66        71              68           68      65        73    63
## 5       40        74              72           57      72        60    64
## 6       50        78              69           56      46        76    58
##   FKAccuracy LongPassing BallControl Acceleration SprintSpeed Agility Reactions
## 1         60          55          77           66          65      50        75
## 2         69          67          76           78          85      79        71
## 3         59          58          75           59          77      63        72
## 4         48          44          73           78          79      83        74
## 5         42          42          63           79          72      61        69
## 6         58          33          71           82          79      78        73
##   Balance ShotPower Jumping Stamina Strength LongShots Aggression Interceptions
## 1      32        78      63      77       93        68         75            30
## 2      73        77      70      78       74        74         77            18
## 3      60        78      69      83       77        73         67            40
## 4      76        68      78      90       85        66         73            42
## 5      64        73      69      67       72        67         49            14
## 6      64        72      69      77       69        54         28            16
##   Positioning Vision Penalties Composure Marking StandingTackle SlidingTackle
## 1          78     73        77        70      21             15            19
## 2          76     73        72        72      40             18            12
## 3          72     69        74        83      23             37            46
## 4          73     64        69        76      31             39            24
## 5          75     60        67        74      15             16            16
## 6          62     45        82        51      11             18            12
##   GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause
## 1       15         12        11            11          8               
## 2       15          9        10            15         16               
## 3        7         11         7            11         14               
## 4        9         12        10            15         16               
## 5       15         16        15             7          7               
## 6       11          8        10             7          6
# Number of strikers kept by the filter.
dim(football_st)[1L]
## [1] 2152

2.2 Scatter Plot

Wage and Value are stored as character strings, so convert them to numeric before plotting.

# Inspect how Wage is currently stored.
str(football_st[["Wage"]])
##  chr [1:2152] "1105" "2138" "3875" "3661" "2445" "2216" "4457" "3370" ...
# Inspect how Value is currently stored.
str(football_st[["Value"]])
##  chr [1:2152] "5764" "5275" "5589" "5629" "6113" "5057" "6561" "6146" ...
# Wage and Value are stored as character strings; coerce both to numeric so
# they can be plotted and modelled.
football_st[c("Wage", "Value")] <- lapply(football_st[c("Wage", "Value")],
                                          as.numeric)
library(ggplot2)
library(ggpubr)

# Scatter plot of Value against Wage with a fitted straight line, the
# regression equation and the Pearson correlation annotated on the panel.
ggplot(football_st, aes(x = Wage, y = Value)) +
  geom_point(shape = 2, colour = "black") +
  geom_smooth(method = "lm") +
  stat_regline_equation(label.x = 150000, label.y = 1700) +
  stat_cor(method = "pearson", label.x = 300000, label.y = 1600) +
  labs(x = "Wage", y = "Value", title = "Wage and Value")
## `geom_smooth()` using formula 'y ~ x'

3. Simple Linear Regression

# Simple linear regression of Value on Wage. The data frame is supplied via
# `data =` rather than writing `football_st$...` inside the formula, so the
# coefficient is simply called `Wage` and predict() can be given new data.
value_simple <- lm(Value ~ Wage, data = football_st)
summary(value_simple)
## 
## Call:
## lm(formula = Value ~ Wage, data = football_st)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -17073527   -633009   -209153    198333  38355242 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.175e+05  7.060e+04  -5.913 3.91e-09 ***
## Wage         2.179e+02  2.721e+00  80.068  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2838000 on 2150 degrees of freedom
## Multiple R-squared:  0.7489, Adjusted R-squared:  0.7487 
## F-statistic:  6411 on 1 and 2150 DF,  p-value: < 2.2e-16
# 95% confidence intervals for the intercept and the Wage slope.
confint(value_simple, level = 0.95)
##                    2.5 %       97.5 %
## (Intercept) -555911.3195 -278995.9221
## Wage            212.5681     223.2422

4. Residuals

Standardised residuals.

# Standardised residuals of the simple regression model; preview the first
# few values.
value_simple_stdresiduals <- rstandard(model = value_simple)
head(x = value_simple_stdresiduals)
##           1           2           3           4           5           6 
##  0.06430004 -0.01520939 -0.14850129 -0.13205208 -0.03849210 -0.02127676
# Attach the standardised residuals to the player data as an extra column;
# check.names = FALSE keeps column names that contain spaces unchanged.
football_st_comb <- data.frame(football_st, value_simple_stdresiduals,
                               check.names = FALSE)
head(football_st_comb)
##       ID            Name Age                                          Photo
## 1 187607       A. Dzyuba  29 https://cdn.sofifa.org/players/4/19/187607.png
## 2 183389          G. Sio  29 https://cdn.sofifa.org/players/4/19/183389.png
## 3 245683       K. Fofana  26 https://cdn.sofifa.org/players/4/19/245683.png
## 4 190461 B. Sigur̡arson  27 https://cdn.sofifa.org/players/4/19/190461.png
## 5 225900    J. Sambenito  26 https://cdn.sofifa.org/players/4/19/225900.png
## 6 246405       B. Angulo  22 https://cdn.sofifa.org/players/4/19/246405.png
##   Nationality                                 Flag Overall Potential Club
## 1      Russia  https://cdn.sofifa.org/flags/40.png      78        78     
## 2 Ivory Coast https://cdn.sofifa.org/flags/108.png      77        77     
## 3 Ivory Coast https://cdn.sofifa.org/flags/108.png      75        75     
## 4     Iceland  https://cdn.sofifa.org/flags/24.png      73        74     
## 5    Paraguay  https://cdn.sofifa.org/flags/58.png      71        74     
## 6     Ecuador  https://cdn.sofifa.org/flags/57.png      71        77     
##                              Club Logo Value Wage Special Preferred Foot
## 1  https://cdn.sofifa.org/flags/40.png  5764 1105    1810          Right
## 2 https://cdn.sofifa.org/flags/108.png  5275 2138    1933           Left
## 3 https://cdn.sofifa.org/flags/108.png  5589 3875    1877          Right
## 4  https://cdn.sofifa.org/flags/24.png  5629 3661    1893          Right
## 5  https://cdn.sofifa.org/flags/58.png  6113 2445    1651          Right
## 6  https://cdn.sofifa.org/flags/57.png  5057 2216    1628          Right
##   International Reputation Weak Foot Skill Moves      Work Rate Body Type
## 1                        2         3           3   High/ Medium    Stocky
## 2                        2         3           3      High/ Low    Normal
## 3                        1         3           3 Medium/ Medium    Normal
## 4                        1         4           3     High/ High    Normal
## 5                        1         3           2   High/ Medium      Lean
## 6                        1         4           3      High/ Low    Normal
##   Real Face Position Jersey Number Joined Loaned From Contract Valid Until
## 1        No       ST            22                                        
## 2        No       ST            21                                        
## 3        No       ST            22                                        
## 4        No       ST             9                                        
## 5        No       ST             9                                        
## 6        No       ST            19                                        
##   Height Weight   LS   ST   RS   LW   LF   CF   RF   RW  LAM  CAM  RAM   LM
## 1    6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2 74+2 71+2 71+2 71+2 71+2 71+2
## 2   5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 74+2 74+2 74+2 74+2
## 3    6'2 179lbs 73+2 73+2 73+2 71+2 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2
## 4    6'1 190lbs 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2 70+2 70+2 70+2 71+2
## 5    6'0 190lbs 70+2 70+2 70+2 64+2 67+2 67+2 67+2 64+2 63+2 63+2 63+2 62+2
## 6    6'0 154lbs 70+2 70+2 70+2 67+2 68+2 68+2 68+2 67+2 63+2 63+2 63+2 65+2
##    LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB   LB  LCB   CB  RCB   RB
## 1 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2 48+2 48+2 48+2 48+2 48+2
## 2 67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2 50+2 46+2 46+2 46+2 50+2
## 3 67+2 67+2 67+2 71+2 59+2 57+2 57+2 57+2 59+2 57+2 52+2 52+2 52+2 57+2
## 4 64+2 64+2 64+2 71+2 59+2 55+2 55+2 55+2 59+2 56+2 53+2 53+2 53+2 56+2
## 5 55+2 55+2 55+2 62+2 43+2 41+2 41+2 41+2 43+2 41+2 38+2 38+2 38+2 41+2
## 6 54+2 54+2 54+2 65+2 47+2 39+2 39+2 39+2 47+2 44+2 36+2 36+2 36+2 44+2
##   Crossing Finishing HeadingAccuracy ShortPassing Volleys Dribbling Curve
## 1       61        79              86           71      74        71    64
## 2       68        77              71           73      73        76    73
## 3       66        75              72           74      74        72    63
## 4       66        71              68           68      65        73    63
## 5       40        74              72           57      72        60    64
## 6       50        78              69           56      46        76    58
##   FKAccuracy LongPassing BallControl Acceleration SprintSpeed Agility Reactions
## 1         60          55          77           66          65      50        75
## 2         69          67          76           78          85      79        71
## 3         59          58          75           59          77      63        72
## 4         48          44          73           78          79      83        74
## 5         42          42          63           79          72      61        69
## 6         58          33          71           82          79      78        73
##   Balance ShotPower Jumping Stamina Strength LongShots Aggression Interceptions
## 1      32        78      63      77       93        68         75            30
## 2      73        77      70      78       74        74         77            18
## 3      60        78      69      83       77        73         67            40
## 4      76        68      78      90       85        66         73            42
## 5      64        73      69      67       72        67         49            14
## 6      64        72      69      77       69        54         28            16
##   Positioning Vision Penalties Composure Marking StandingTackle SlidingTackle
## 1          78     73        77        70      21             15            19
## 2          76     73        72        72      40             18            12
## 3          72     69        74        83      23             37            46
## 4          73     64        69        76      31             39            24
## 5          75     60        67        74      15             16            16
## 6          62     45        82        51      11             18            12
##   GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause
## 1       15         12        11            11          8               
## 2       15          9        10            15         16               
## 3        7         11         7            11         14               
## 4        9         12        10            15         16               
## 5       15         16        15             7          7               
## 6       11          8        10             7          6               
##   value_simple_stdresiduals
## 1                0.06430004
## 2               -0.01520939
## 3               -0.14850129
## 4               -0.13205208
## 5               -0.03849210
## 6               -0.02127676

4.1 Plot residuals

# Standardised residuals of the simple regression plotted against Value.
# Columns are referenced by bare name inside aes(); writing `data$column`
# there is discouraged by ggplot2 and raises a warning.
ggplot(football_st_comb) + aes(x = Value, y = value_simple_stdresiduals) +
  geom_point() +
  xlab("Value") + ylab("Standard Residuals") +
  ggtitle("Wage and Value Prediction, Residuals")

4.2 Normality

# Histogram of Value as a visual check of its distribution.
ggplot(football_st, aes(x = Value)) +
  geom_histogram() +
  labs(y = "Count", title = "Distribution of Value")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using the Shapiro-Wilk test.

H-0: normal distribution.

H-1: distribution is different from a normal distribution.

shapiro.test(football_st$Value)
## 
##  Shapiro-Wilk normality test
## 
## data:  football_st$Value
## W = 0.37447, p-value < 2.2e-16

5. Multiple Linear Regression

Subset data for simplicity.

# Keep only the six candidate predictors plus the outcome (Wage).
football_st_2 <- football_st[c("Age", "Balance", "ShotPower", "Aggression",
                               "Positioning", "Composure", "Wage")]
head(football_st_2)
##   Age Balance ShotPower Aggression Positioning Composure Wage
## 1  29      32        78         75          78        70 1105
## 2  29      73        77         77          76        72 2138
## 3  26      60        78         67          72        83 3875
## 4  27      76        68         73          73        76 3661
## 5  26      64        73         49          75        74 2445
## 6  22      64        72         28          62        51 2216

Convert to numeric.

library(dplyr)
# Coerce every character column to numeric. across(where(...)) is the current
# dplyr idiom; the scoped mutate_if() variant is superseded.
football_st_2 <- football_st_2 %>%
  mutate(across(where(is.character), as.numeric))
str(football_st_2)
## 'data.frame':    2152 obs. of  7 variables:
##  $ Age        : num  29 29 26 27 26 22 22 28 31 28 ...
##  $ Balance    : num  32 73 60 76 64 64 65 75 69 56 ...
##  $ ShotPower  : num  78 77 78 68 73 72 66 75 69 71 ...
##  $ Aggression : num  75 77 67 73 49 28 30 36 68 59 ...
##  $ Positioning: num  78 76 72 73 75 62 76 68 69 72 ...
##  $ Composure  : num  70 72 83 76 74 51 62 56 80 56 ...
##  $ Wage       : num  1105 2138 3875 3661 2445 ...

A multiple regression model showing unstandardised estimates.

The predictors included in the model are: Age, Balance, ShotPower, Aggression, Positioning, and Composure.

names(football_st_2)
## [1] "Age"         "Balance"     "ShotPower"   "Aggression"  "Positioning"
## [6] "Composure"   "Wage"
# Multiple regression of Wage on all six predictors, fitted on every row.
wage_model_st <- lm(
  formula = Wage ~ Age + Balance + ShotPower + Aggression + Positioning +
    Composure,
  data = football_st_2
)
summary(wage_model_st)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression + 
##     Positioning + Composure, data = football_st_2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31822  -8232  -2313   4754 350592 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -77073.40    4064.61 -18.962  < 2e-16 ***
## Age          -1014.25     110.94  -9.143  < 2e-16 ***
## Balance        120.41      35.90   3.354  0.00081 ***
## ShotPower      498.07      74.43   6.692 2.81e-11 ***
## Aggression      15.96      32.29   0.494  0.62129    
## Positioning    741.71      82.42   8.999  < 2e-16 ***
## Composure      424.72      71.66   5.927 3.58e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18840 on 2145 degrees of freedom
## Multiple R-squared:  0.2997, Adjusted R-squared:  0.2978 
## F-statistic:   153 on 6 and 2145 DF,  p-value: < 2.2e-16
coef(wage_model_st)
##  (Intercept)          Age      Balance    ShotPower   Aggression  Positioning 
## -77073.39877  -1014.24567    120.40620    498.06517     15.95657    741.70804 
##    Composure 
##    424.72405
confint(wage_model_st, level = 0.95)
##                    2.5 %       97.5 %
## (Intercept) -85044.38590 -69102.41165
## Age          -1231.79758   -796.69375
## Balance         50.00615    190.80626
## ShotPower      352.09956    644.03079
## Aggression     -47.37581     79.28895
## Positioning    580.07796    903.33813
## Composure      284.19780    565.25031

5.1 Residuals

# Standardised residuals of the multiple regression model; preview the first
# few values.
wage_model_st_residuals <- rstandard(model = wage_model_st)
head(x = wage_model_st_residuals)
##          1          2          3          4          5          6 
## -1.2711799 -1.4183035 -1.5151160 -1.1956035 -1.3820667 -0.5348701
# Attach the standardised residuals to the modelling data as an extra column.
football_st_comb_2 <- data.frame(football_st_2, wage_model_st_residuals,
                                 check.names = FALSE)
head(football_st_comb_2)
##   Age Balance ShotPower Aggression Positioning Composure Wage
## 1  29      32        78         75          78        70 1105
## 2  29      73        77         77          76        72 2138
## 3  26      60        78         67          72        83 3875
## 4  27      76        68         73          73        76 3661
## 5  26      64        73         49          75        74 2445
## 6  22      64        72         28          62        51 2216
##   wage_model_st_residuals
## 1              -1.2711799
## 2              -1.4183035
## 3              -1.5151160
## 4              -1.1956035
## 5              -1.3820667
## 6              -0.5348701
# Standardised residuals of the multiple regression plotted against the
# outcome (Wage) and against individual predictors, one plot per variable.
# Axis labels and titles use the corrected spellings "Standardised" and
# "Positioning".
ggplot(football_st_comb_2) + aes(x = Wage, y = wage_model_st_residuals) +
  geom_point() + xlab("Wage") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Wage Prediction")

ggplot(football_st_comb_2) + aes(x = Age, y = wage_model_st_residuals) +
  geom_point() + xlab("Age") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Age")

ggplot(football_st_comb_2) + aes(x = ShotPower, y = wage_model_st_residuals) +
  geom_point() + xlab("Shot Power") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Shot Power")

ggplot(football_st_comb_2) + aes(x = Aggression, y = wage_model_st_residuals) +
  geom_point() + xlab("Aggression") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Aggression")

ggplot(football_st_comb_2) + aes(x = Positioning, y = wage_model_st_residuals) +
  geom_point() + xlab("Positioning") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Positioning")

ggplot(football_st_comb_2) + aes(x = Composure, y = wage_model_st_residuals) +
  geom_point() + xlab("Composure") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Composure")

5.2 Model evaluation

5.2.1 Normality

library(ggplot2)

# Histogram of Wage on its original scale.
ggplot(football_st_2, aes(x = Wage)) +
  geom_histogram() +
  labs(y = "Count", title = "Distribution of wage (strikers)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram of Wage, drawn with a log10-scaled x axis.
ggplot(football_st_2, aes(x = Wage)) +
  geom_histogram() +
  scale_x_log10() +
  labs(y = "Count", title = "Distribution of log(wage) (strikers)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using the Shapiro-Wilk test.

H-0: normal distribution

H-1: distribution is different from a normal distribution.

shapiro.test(football_st_2$Wage)
## 
##  Shapiro-Wilk normality test
## 
## data:  football_st_2$Wage
## W = 0.39056, p-value < 2.2e-16

5.2.2 Multicollinearity

The variance inflation factor (VIF) measures how much the variance of an estimated regression coefficient increases when the predictors are correlated.

In other words, no pair of predictors should be strongly correlated with each other.

If no factors are correlated, the VIFs will all be 1.

Rule of thumb: If VIF > 10, multicollinearity is high.

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
vif(wage_model_st)
##         Age     Balance   ShotPower  Aggression Positioning   Composure 
##    1.671663    1.039327    2.786601    1.593244    3.476150    3.196433

5.2.3 Heteroskedasticity

Perform a Breusch-Pagan Test to test for heteroskedasticity/homoskedasticity.

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bptest(wage_model_st)
## 
##  studentized Breusch-Pagan test
## 
## data:  wage_model_st
## BP = 91.188, df = 6, p-value < 2.2e-16
plot(wage_model_st, 1)

library(olsrr) 
## Warning: package 'olsrr' was built under R version 4.0.5
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
ols_test_breusch_pagan(wage_model_st)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##               Data               
##  --------------------------------
##  Response : Wage 
##  Variables: fitted values of Wage 
## 
##         Test Summary         
##  ----------------------------
##  DF            =    1 
##  Chi2          =    5352.5071 
##  Prob > Chi2   =    0.0000

Multiple test for each variable.

# Breusch-Pagan test on the right-hand-side variables, reported per predictor
# and simultaneously.
ols_test_breusch_pagan(wage_model_st, rhs = TRUE, multiple = TRUE)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                               Data                                
##  -----------------------------------------------------------------
##  Response : Wage 
##  Variables: Age Balance ShotPower Aggression Positioning Composure 
## 
##           Test Summary (Unadjusted p values)         
##  --------------------------------------------------
##   Variable           chi2       df          p       
##  --------------------------------------------------
##   Age               487.2866     1    5.549066e-108 
##   Balance           147.9854     1     4.778931e-34 
##   ShotPower        3632.0162     1     0.000000e+00 
##   Aggression        637.7948     1    1.008165e-140 
##   Positioning      4068.5226     1     0.000000e+00 
##   Composure        4081.2646     1     0.000000e+00 
##  --------------------------------------------------
##   simultaneous     5538.8585     6     0.000000e+00 
##  --------------------------------------------------

6. Stepwise regression

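Stepwise regression is a modification of ordinary regression in which predictors are removed from or added to the model one step at a time; here each step is chosen by AIC.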

library(stats)
# Stepwise selection starting from the full model; at each step a term may
# be dropped or re-added.
wage_model_st_step <- step(object = wage_model_st, direction = "both")
## Start:  AIC=42374.94
## Wage ~ Age + Balance + ShotPower + Aggression + Positioning + 
##     Composure
## 
##               Df  Sum of Sq        RSS   AIC
## - Aggression   1 8.6672e+07 7.6162e+11 42373
## <none>                      7.6154e+11 42375
## - Balance      1 3.9939e+09 7.6553e+11 42384
## - Composure    1 1.2472e+10 7.7401e+11 42408
## - ShotPower    1 1.5897e+10 7.7743e+11 42417
## - Positioning  1 2.8752e+10 7.9029e+11 42453
## - Age          1 2.9676e+10 7.9121e+11 42455
## 
## Step:  AIC=42373.18
## Wage ~ Age + Balance + ShotPower + Positioning + Composure
## 
##               Df  Sum of Sq        RSS   AIC
## <none>                      7.6162e+11 42373
## + Aggression   1 8.6672e+07 7.6154e+11 42375
## - Balance      1 3.9197e+09 7.6554e+11 42382
## - Composure    1 1.2939e+10 7.7456e+11 42407
## - ShotPower    1 1.7279e+10 7.7890e+11 42419
## - Positioning  1 2.8770e+10 7.9039e+11 42451
## - Age          1 3.0373e+10 7.9200e+11 42455
summary(wage_model_st_step)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Positioning + 
##     Composure, data = football_st_2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31793  -8228  -2326   4830 350282 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -77250.10    4048.13 -19.083  < 2e-16 ***
## Age          -1002.58     108.38  -9.251  < 2e-16 ***
## Balance        118.78      35.74   3.323 0.000904 ***
## ShotPower      506.25      72.55   6.978 3.98e-12 ***
## Positioning    741.93      82.40   9.004  < 2e-16 ***
## Composure      429.17      71.08   6.038 1.83e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18840 on 2146 degrees of freedom
## Multiple R-squared:  0.2997, Adjusted R-squared:  0.298 
## F-statistic: 183.6 on 5 and 2146 DF,  p-value: < 2.2e-16

7. Data mining approach

Now, we will use the data mining approach.

7.1 Training validation split

Split the data into training and validation sets.

Set the seed using our favourite number :-)

set.seed(666)

Create the indices for the split. This samples the row indices to split the data into training and validation sets.

# Draw 60% of the row indices at random for training; the remaining rows form
# the validation set. seq_len() stays correct for a zero-row data frame
# (1:0 would yield c(1, 0)), and floor() makes the whole-number sample size
# explicit instead of relying on implicit truncation of 0.6 * nrow.
train_index <- sample(seq_len(nrow(football_st_2)),
                      floor(0.6 * nrow(football_st_2)))
valid_index <- setdiff(seq_len(nrow(football_st_2)), train_index)

Using the indices, create the training and validation sets. This is similar in principle to splitting a data frame by row.

# Row-subset the modelling data with the sampled indices.
train_df_st <- football_st_2[train_index, , drop = FALSE]
valid_df_st <- football_st_2[valid_index, , drop = FALSE]

It is a good habit to check after splitting.

nrow(train_df_st)
## [1] 1291
nrow(valid_df_st)
## [1] 861

7.2 Training

Training the model on the training set.

# Same six-predictor specification, fitted on the training rows only.
wage_model_st_2 <- lm(
  formula = Wage ~ Age + Balance + ShotPower + Aggression + Positioning +
    Composure,
  data = train_df_st
)
summary(wage_model_st_2)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression + 
##     Positioning + Composure, data = train_df_st)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -32654  -8533  -2462   5056 346913 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -82121.16    5937.75 -13.830  < 2e-16 ***
## Age           -948.45     166.15  -5.708 1.42e-08 ***
## Balance        114.04      53.15   2.146   0.0321 *  
## ShotPower      489.51     110.63   4.425 1.05e-05 ***
## Aggression     -23.76      47.29  -0.502   0.6154    
## Positioning    730.41     121.49   6.012 2.39e-09 ***
## Composure      544.62     104.74   5.200 2.32e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21280 on 1284 degrees of freedom
## Multiple R-squared:  0.2704, Adjusted R-squared:  0.267 
## F-statistic:  79.3 on 6 and 1284 DF,  p-value: < 2.2e-16

7.3 Predicting

Predict the outcome (i.e. wage) of the validation set using the model from the training set.

library(forecast)
## Warning: package 'forecast' was built under R version 4.0.5
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## 
## Attaching package: 'forecast'
## The following object is masked from 'package:ggpubr':
## 
##     gghistogram
# Predicted wages for the rows the model was trained on ...
wage_model_st_2_pred_train <- predict(wage_model_st_2, newdata = train_df_st)

# ... and for the held-out validation rows.
wage_model_st_2_pred <- predict(wage_model_st_2, newdata = valid_df_st)

7.4 Model evaluation

Compare the errors between the training and validation sets.

accuracy(wage_model_st_2_pred_train, train_df_st$Wage)
##                    ME     RMSE      MAE       MPE     MAPE
## Test set 9.854906e-11 21227.13 9907.101 -32.18323 133.6435
accuracy(wage_model_st_2_pred, valid_df_st$Wage)
##                ME     RMSE      MAE       MPE     MAPE
## Test set 18.64833 14551.57 9388.476 -17.37455 128.1185
max(football_st_2$Wage) - min(football_st_2$Wage)
## [1] 406504
sd(football_st_2$Wage)
## [1] 22484.99

8. Categorical independent variables

Subset to include categorical variable: preferred foot

# Keep the categorical predictor (preferred foot), two numeric predictors and
# the outcome (Wage).
football_st_3 <- football_st[c("Preferred Foot", "Positioning", "Composure",
                               "Wage")]
head(football_st_3)
##   Preferred Foot Positioning Composure Wage
## 1          Right          78        70 1105
## 2           Left          76        72 2138
## 3          Right          72        83 3875
## 4          Right          73        76 3661
## 5          Right          75        74 2445
## 6          Right          62        51 2216
# Replace the space in the first column name so it can be written directly in
# a formula.
colnames(football_st_3)[1] <- "Preferred_Foot"

# Coerce both numeric predictors before fitting.
football_st_3[c("Positioning", "Composure")] <-
  lapply(football_st_3[c("Positioning", "Composure")], as.numeric)
# Preferred_Foot enters the model as a factor.
wage_model_st_cat <- lm(
  formula = Wage ~ factor(Preferred_Foot) + Positioning + Composure,
  data = football_st_3
)
summary(wage_model_st_cat)
## 
## Call:
## lm(formula = Wage ~ factor(Preferred_Foot) + Positioning + Composure, 
##     data = football_st_3)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31805  -8181  -2270   4528 354971 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -65474.98    3209.87 -20.398  < 2e-16 ***
## factor(Preferred_Foot)Right  -1087.48    1221.25  -0.890    0.373    
## Positioning                    816.64      75.34  10.840  < 2e-16 ***
## Composure                      438.10      68.36   6.409  1.8e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19400 on 2148 degrees of freedom
## Multiple R-squared:  0.2564, Adjusted R-squared:  0.2554 
## F-statistic: 246.9 on 3 and 2148 DF,  p-value: < 2.2e-16
confint(wage_model_st_cat, level = 0.95)
##                                   2.5 %      97.5 %
## (Intercept)                 -71769.7525 -59180.1997
## factor(Preferred_Foot)Right  -3482.4352   1307.4754
## Positioning                    668.9041    964.3812
## Composure                      304.0428    572.1551

8.1 Residuals

# Standardised residuals of the model with the categorical predictor; preview
# the first few values.
wage_model_st_cat_stdresiduals <- rstandard(model = wage_model_st_cat)
head(x = wage_model_st_cat_stdresiduals)
##          1          2          3          4          5          6 
## -1.3769868 -1.3424571 -1.2770311 -1.1703986 -1.2718790 -0.2163923
# Attach the standardised residuals to the modelling data as an extra column.
football_st_3_cat <- data.frame(football_st_3, wage_model_st_cat_stdresiduals,
                                check.names = FALSE)
head(football_st_3_cat)
##   Preferred_Foot Positioning Composure Wage wage_model_st_cat_stdresiduals
## 1          Right          78        70 1105                     -1.3769868
## 2           Left          76        72 2138                     -1.3424571
## 3          Right          72        83 3875                     -1.2770311
## 4          Right          73        76 3661                     -1.1703986
## 5          Right          75        74 2445                     -1.2718790
## 6          Right          62        51 2216                     -0.2163923
# Standardised residuals against the outcome (Wage); label spelling corrected
# to "Standardised".
ggplot(football_st_3_cat) + aes(x = Wage, y = wage_model_st_cat_stdresiduals) +
  geom_point() + xlab("Wage") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Wage")

Positioning

# Standardised residuals against Positioning; label spelling corrected to
# "Standardised".
ggplot(football_st_3_cat) + aes(x = Positioning, y = wage_model_st_cat_stdresiduals) +
  geom_point() + xlab("Positioning") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Positioning")

Composure

# Standardised residuals against Composure; label spelling corrected to
# "Standardised".
ggplot(football_st_3_cat) + aes(x = Composure, y = wage_model_st_cat_stdresiduals) +
  geom_point() + xlab("Composure") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Composure")

# Standardised residuals for each level of the categorical predictor.
ggplot(football_st_3_cat) + aes(x = Preferred_Foot, y = wage_model_st_cat_stdresiduals) +
  geom_point() + xlab("Preferred Foot") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Preferred Foot")

8.2 Model Evaluation

# Histogram of Wage as a visual check of its distribution.
ggplot(football_st_3_cat, aes(x = Wage)) +
  geom_histogram() +
  labs(y = "Count", title = "Distribution of Wage")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using the Shapiro-Wilk test.

H-0: normal distribution.

H-alt: distribution is different from a normal distribution.

shapiro.test(football_st_3_cat$Wage)
## 
##  Shapiro-Wilk normality test
## 
## data:  football_st_3_cat$Wage
## W = 0.39056, p-value < 2.2e-16

Multicollinearity

vif(wage_model_st_cat)
## factor(Preferred_Foot)            Positioning              Composure 
##               1.002720               2.738872               2.743181

Homoscedasticity.

ols_test_breusch_pagan(wage_model_st_cat)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##               Data               
##  --------------------------------
##  Response : Wage 
##  Variables: fitted values of Wage 
## 
##         Test Summary         
##  ----------------------------
##  DF            =    1 
##  Chi2          =    4754.5635 
##  Prob > Chi2   =    0.0000

9. Non-Linear regression

Sometimes, a relationship may not be linear. In this case, we can specify a non-linear relationship in the model.

9.1 Traditional statistics

We start with the traditional statistics approach and perform the diagnostics.

The non-linear relationship is expressed in the model specification as an interaction between Positioning and Composure (`Positioning * Composure`).

names(football_st_2)
## [1] "Age"         "Balance"     "ShotPower"   "Aggression"  "Positioning"
## [6] "Composure"   "Wage"
# `Positioning * Composure` expands to both main effects plus their
# interaction term (Positioning:Composure).
wage_model_st_nl <- lm(
  formula = Wage ~ Age + Balance + ShotPower + Aggression +
    Positioning * Composure,
  data = football_st_2
)
summary(wage_model_st_nl)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression + 
##     Positioning * Composure, data = football_st_2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -58380  -5245     80   4644 267683 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           297675.783  13442.584  22.144   <2e-16 ***
## Age                     -789.963     94.502  -8.359   <2e-16 ***
## Balance                   57.694     30.555   1.888   0.0591 .  
## ShotPower                642.408     63.389  10.134   <2e-16 ***
## Aggression                19.805     27.418   0.722   0.4702    
## Positioning            -5016.022    211.523 -23.714   <2e-16 ***
## Composure              -6150.054    235.919 -26.069   <2e-16 ***
## Positioning:Composure     96.301      3.339  28.844   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16000 on 2144 degrees of freedom
## Multiple R-squared:  0.4955, Adjusted R-squared:  0.4939 
## F-statistic: 300.8 on 7 and 2144 DF,  p-value: < 2.2e-16
vif(wage_model_st_nl)
##                   Age               Balance             ShotPower 
##              1.683057              1.044616              2.804077 
##            Aggression           Positioning             Composure 
##              1.593281             31.765761             48.069231 
## Positioning:Composure 
##            127.119996
durbinWatsonTest(wage_model_st_nl)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.2531554      1.491911       0
##  Alternative hypothesis: rho != 0

9.2 Traditional statistics stepwise

Perform a stepwise regression with a non-linear relationship and the diagnostics.

# Stepwise selection starting from the interaction model.
wage_model_st_nl_step <- step(object = wage_model_st_nl, direction = "both")
## Start:  AIC=41671.29
## Wage ~ Age + Balance + ShotPower + Aggression + Positioning * 
##     Composure
## 
##                         Df  Sum of Sq        RSS   AIC
## - Aggression             1 1.3352e+08 5.4877e+11 41670
## <none>                                5.4863e+11 41671
## - Balance                1 9.1234e+08 5.4955e+11 41673
## - Age                    1 1.7881e+10 5.6652e+11 41738
## - ShotPower              1 2.6282e+10 5.7492e+11 41770
## - Positioning:Composure  1 2.1290e+11 7.6154e+11 42375
## 
## Step:  AIC=41669.81
## Wage ~ Age + Balance + ShotPower + Positioning + Composure + 
##     Positioning:Composure
## 
##                         Df  Sum of Sq        RSS   AIC
## <none>                                5.4877e+11 41670
## - Balance                1 8.5698e+08 5.4963e+11 41671
## + Aggression             1 1.3352e+08 5.4863e+11 41671
## - Age                    1 1.8041e+10 5.6681e+11 41737
## - ShotPower              1 2.8516e+10 5.7728e+11 41777
## - Positioning:Composure  1 2.1286e+11 7.6162e+11 42373
# Coefficient table and fit statistics for the model kept by step().
summary(wage_model_st_nl_step)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Positioning + 
##     Composure + Positioning:Composure, data = football_st_2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -58507  -5205     67   4579 267488 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           297410.796  13436.079  22.135   <2e-16 ***
## Age                     -775.517     92.352  -8.397   <2e-16 ***
## Balance                   55.684     30.424   1.830   0.0674 .  
## ShotPower                652.547     61.808  10.558   <2e-16 ***
## Positioning            -5015.048    211.495 -23.712   <2e-16 ***
## Composure              -6143.738    235.730 -26.063   <2e-16 ***
## Positioning:Composure     96.289      3.338  28.844   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15990 on 2145 degrees of freedom
## Multiple R-squared:  0.4954, Adjusted R-squared:  0.494 
## F-statistic:   351 on 6 and 2145 DF,  p-value: < 2.2e-16
# Variance inflation factor for each term of the stepwise-selected model
# (multicollinearity check).
vif(wage_model_st_nl_step)
##                   Age               Balance             ShotPower 
##              1.607679              1.035950              2.666586 
##           Positioning             Composure Positioning:Composure 
##             31.764471             48.003192            127.116986
# Durbin-Watson test for lag-1 autocorrelation in the residuals of the
# stepwise-selected model.
durbinWatsonTest(wage_model_st_nl_step)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.2522843      1.493672       0
##  Alternative hypothesis: rho != 0

9.3 Data mining approach

A data mining approach with the non-linear relationship.

As always, perform diagnostics on the model. This includes checking the various assumptions of a regression.

# Same interaction formula as the full-data model, but fitted on
# train_df_st only (presumably the training partition) so the model can be
# scored on valid_df_st afterwards.
wage_model_st_nl_2 <- lm(
  formula = Wage ~ Age + Balance + ShotPower + Aggression +
    Positioning * Composure,
  data = train_df_st
)
summary(wage_model_st_nl_2)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression + 
##     Positioning * Composure, data = train_df_st)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69712  -5516    431   5121 257569 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           373590.851  19296.451  19.361  < 2e-16 ***
## Age                     -702.769    137.707  -5.103 3.84e-07 ***
## Balance                   39.904     44.039   0.906    0.365    
## ShotPower                691.250     91.817   7.529 9.63e-14 ***
## Aggression               -24.766     39.088  -0.634    0.526    
## Positioning            -6254.863    303.169 -20.632  < 2e-16 ***
## Composure              -7433.528    337.988 -21.993  < 2e-16 ***
## Positioning:Composure    116.410      4.767  24.419  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17590 on 1283 degrees of freedom
## Multiple R-squared:  0.5019, Adjusted R-squared:  0.4992 
## F-statistic: 184.7 on 7 and 1283 DF,  p-value: < 2.2e-16
# Predict Wage for the rows of valid_df_st, then compare the predictions
# with the observed wages (ME, RMSE, MAE, MPE, MAPE).
wage_model_st_nl_2_pred <- predict(wage_model_st_nl_2, newdata = valid_df_st)
accuracy(wage_model_st_nl_2_pred, valid_df_st$Wage)
##                 ME     RMSE      MAE      MPE     MAPE
## Test set -547.7242 13726.06 8807.617 -31.1167 112.4235
# Standard deviation of Wage — presumably a yardstick for reading the RMSE
# reported above.
# NOTE(review): this reads football_st, while the full-data models are fit
# on football_st_2; confirm which data frame is intended here.
sd(football_st$Wage)
## [1] 22484.99

9.4 Data mining approach using stepwise

A data mining approach using a stepwise regression and non-linear relationship.

# Stepwise term selection on the training-data interaction model.
# With direction = "both", every step may drop a term or re-add one that
# was dropped earlier; candidates are ranked by AIC.
wage_model_st_nl_2_step <- step(
  object = wage_model_st_nl_2,
  direction = "both"
)
## Start:  AIC=25247.78
## Wage ~ Age + Balance + ShotPower + Aggression + Positioning * 
##     Composure
## 
##                         Df  Sum of Sq        RSS   AIC
## - Aggression             1 1.2426e+08 3.9726e+11 25246
## - Balance                1 2.5413e+08 3.9739e+11 25247
## <none>                                3.9713e+11 25248
## - Age                    1 8.0617e+09 4.0520e+11 25272
## - ShotPower              1 1.7544e+10 4.1468e+11 25302
## - Positioning:Composure  1 1.8458e+11 5.8171e+11 25739
## 
## Step:  AIC=25246.18
## Wage ~ Age + Balance + ShotPower + Positioning + Composure + 
##     Positioning:Composure
## 
##                         Df  Sum of Sq        RSS   AIC
## - Balance                1 2.9864e+08 3.9756e+11 25245
## <none>                                3.9726e+11 25246
## + Aggression             1 1.2426e+08 3.9713e+11 25248
## - Age                    1 8.9937e+09 4.0625e+11 25273
## - ShotPower              1 1.7797e+10 4.1506e+11 25301
## - Positioning:Composure  1 1.8457e+11 5.8183e+11 25737
## 
## Step:  AIC=25245.15
## Wage ~ Age + ShotPower + Positioning + Composure + Positioning:Composure
## 
##                         Df  Sum of Sq        RSS   AIC
## <none>                                3.9756e+11 25245
## + Balance                1 2.9864e+08 3.9726e+11 25246
## + Aggression             1 1.6876e+08 3.9739e+11 25247
## - Age                    1 9.3077e+09 4.0686e+11 25273
## - ShotPower              1 1.7503e+10 4.1506e+11 25299
## - Positioning:Composure  1 1.8649e+11 5.8405e+11 25740
# Coefficient table and fit statistics for the model kept by step().
summary(wage_model_st_nl_2_step)
## 
## Call:
## lm(formula = Wage ~ Age + ShotPower + Positioning + Composure + 
##     Positioning:Composure, data = train_df_st)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69304  -5565    386   5182 257753 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           378121.671  18793.092  20.120  < 2e-16 ***
## Age                     -732.952    133.630  -5.485 4.98e-08 ***
## ShotPower                668.100     88.824   7.522 1.01e-13 ***
## Positioning            -6270.832    302.640 -20.720  < 2e-16 ***
## Composure              -7453.830    337.457 -22.088  < 2e-16 ***
## Positioning:Composure    116.731      4.754  24.552  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17590 on 1285 degrees of freedom
## Multiple R-squared:  0.5013, Adjusted R-squared:  0.4994 
## F-statistic: 258.4 on 5 and 1285 DF,  p-value: < 2.2e-16
# Score the stepwise-selected model on valid_df_st: predict Wage, then
# compare the predictions with the observed wages.
wage_model_st_nl_2_step_pred <- predict(
  wage_model_st_nl_2_step,
  newdata = valid_df_st
)
accuracy(wage_model_st_nl_2_step_pred, valid_df_st$Wage)
##                 ME     RMSE      MAE       MPE     MAPE
## Test set -584.0779 13718.68 8800.113 -31.92455 112.5375

10. Log variables

Sometimes, the data need to be transformed. A common transformation is the log transformation.

10.1 Traditional statistics

A traditional statistics approach using a log transformation.

Here, the predictors are transformed using a log function.

For reference, the predictors of the previous model were: Age + Balance + ShotPower + Aggression + Positioning * Composure

# Full-data model in which every predictor enters through log(). Wage stays
# on its original scale, and this formula has no Positioning:Composure
# interaction term.
wage_model_st_log <- lm(
  formula = Wage ~ log(Age) + log(Balance) + log(ShotPower) +
    log(Aggression) + log(Positioning) + log(Composure),
  data = football_st_2
)
summary(wage_model_st_log)
## 
## Call:
## lm(formula = Wage ~ log(Age) + log(Balance) + log(ShotPower) + 
##     log(Aggression) + log(Positioning) + log(Composure), data = football_st_2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -27838  -8379  -2853   4132 361712 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -345274      16356 -21.109  < 2e-16 ***
## log(Age)           -22193       2887  -7.688 2.26e-14 ***
## log(Balance)         6921       2150   3.220   0.0013 ** 
## log(ShotPower)      29539       4823   6.125 1.08e-09 ***
## log(Aggression)      1259       1621   0.777   0.4374    
## log(Positioning)    42091       5239   8.034 1.54e-15 ***
## log(Composure)      23706       4207   5.634 1.99e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19460 on 2145 degrees of freedom
## Multiple R-squared:  0.2529, Adjusted R-squared:  0.2508 
## F-statistic:   121 on 6 and 2145 DF,  p-value: < 2.2e-16

10.2 Data mining with log

We can also use a data mining approach with the log transformation.

# Log-predictor model fitted on train_df_st only (presumably the training
# partition); Wage stays on its original scale.
wage_model_st_log_2 <- lm(
  formula = Wage ~ log(Age) + log(Balance) + log(ShotPower) +
    log(Aggression) + log(Positioning) + log(Composure),
  data = train_df_st
)
summary(wage_model_st_log_2)
## 
## Call:
## lm(formula = Wage ~ log(Age) + log(Balance) + log(ShotPower) + 
##     log(Aggression) + log(Positioning) + log(Composure), data = train_df_st)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -28621  -8672  -3007   4368 359491 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -362624.54   23959.55 -15.135  < 2e-16 ***
## log(Age)          -19955.08    4304.36  -4.636 3.91e-06 ***
## log(Balance)        7001.43    3201.89   2.187   0.0289 *  
## log(ShotPower)     28016.65    7149.05   3.919 9.36e-05 ***
## log(Aggression)       64.03    2383.50   0.027   0.9786    
## log(Positioning)   42553.11    7708.04   5.521 4.08e-08 ***
## log(Composure)     28357.91    6083.46   4.661 3.47e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21950 on 1284 degrees of freedom
## Multiple R-squared:  0.2241, Adjusted R-squared:  0.2204 
## F-statistic: 61.79 on 6 and 1284 DF,  p-value: < 2.2e-16
# Predict Wage for the rows of valid_df_st with the log-predictor model,
# then compare the predictions with the observed wages.
wage_model_st_log_2_pred <- predict(
  wage_model_st_log_2,
  newdata = valid_df_st
)
accuracy(wage_model_st_log_2_pred, valid_df_st$Wage)
##                ME     RMSE      MAE       MPE    MAPE
## Test set 20.75738 15065.11 9419.259 -25.61638 127.768
# Standard deviation of Wage in football_st_2 — presumably a baseline
# against which to read the RMSE reported above.
sd(football_st_2$Wage)
## [1] 22484.99
# Smallest and largest Wage in football_st_2.
range(football_st_2$Wage)
## [1]   1105 407609

10.3 Stepwise data mining with log

A stepwise regression using data mining and log transformations.

# Stepwise term selection on the training-data log-predictor model.
# With direction = "both", every step may drop a term or re-add one that
# was dropped earlier; candidates are ranked by AIC.
wage_model_st_log_2_step <- step(
  object = wage_model_st_log_2,
  direction = "both"
)
## Start:  AIC=25818.01
## Wage ~ log(Age) + log(Balance) + log(ShotPower) + log(Aggression) + 
##     log(Positioning) + log(Composure)
## 
##                    Df  Sum of Sq        RSS   AIC
## - log(Aggression)   1 3.4775e+05 6.1864e+11 25816
## <none>                           6.1864e+11 25818
## - log(Balance)      1 2.3037e+09 6.2094e+11 25821
## - log(ShotPower)    1 7.3996e+09 6.2604e+11 25831
## - log(Age)          1 1.0355e+10 6.2899e+11 25837
## - log(Composure)    1 1.0469e+10 6.2911e+11 25838
## - log(Positioning)  1 1.4684e+10 6.3332e+11 25846
## 
## Step:  AIC=25816.01
## Wage ~ log(Age) + log(Balance) + log(ShotPower) + log(Positioning) + 
##     log(Composure)
## 
##                    Df  Sum of Sq        RSS   AIC
## <none>                           6.1864e+11 25816
## + log(Aggression)   1 3.4775e+05 6.1864e+11 25818
## - log(Balance)      1 2.3247e+09 6.2096e+11 25819
## - log(ShotPower)    1 7.7312e+09 6.2637e+11 25830
## - log(Composure)    1 1.0688e+10 6.2933e+11 25836
## - log(Age)          1 1.0936e+10 6.2957e+11 25837
## - log(Positioning)  1 1.4687e+10 6.3333e+11 25844
# Coefficient table and fit statistics for the model kept by step().
summary(wage_model_st_log_2_step)
## 
## Call:
## lm(formula = Wage ~ log(Age) + log(Balance) + log(ShotPower) + 
##     log(Positioning) + log(Composure), data = train_df_st)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -28606  -8666  -2990   4367 359474 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -362686      23840 -15.214  < 2e-16 ***
## log(Age)           -19928       4181  -4.766 2.09e-06 ***
## log(Balance)         6992       3182   2.197   0.0282 *  
## log(ShotPower)      28055       7001   4.007 6.49e-05 ***
## log(Positioning)    42555       7705   5.523 4.02e-08 ***
## log(Composure)      28380       6023   4.712 2.72e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21940 on 1285 degrees of freedom
## Multiple R-squared:  0.2241, Adjusted R-squared:  0.221 
## F-statistic: 74.21 on 5 and 1285 DF,  p-value: < 2.2e-16