Directions

Regression models to predict the wages of football players.

Data for demo

Back to the spellbook

1. Load Data

Load the data and explore it.

# header = FALSE keeps the CSV's header line as data row 1 and leaves the
# columns with the default V1, V2, ... names (see the preview below), so
# every column is read in as character.
football <- read.csv(file = "football_2.csv", header = FALSE)
head(football, n = 10)
##        V1            V2  V3                                             V4
## 1      ID          Name Age                                          Photo
## 2  207439    L. Paredes  24 https://cdn.sofifa.org/players/4/19/207439.png
## 3  156713  A. Granqvist  33 https://cdn.sofifa.org/players/4/19/156713.png
## 4  229909      A. Lunev  26 https://cdn.sofifa.org/players/4/19/229909.png
## 5  187347  I. Smolnikov  29 https://cdn.sofifa.org/players/4/19/187347.png
## 6  153260        Hilton  40 https://cdn.sofifa.org/players/4/19/153260.png
## 7  187607     A. Dzyuba  29 https://cdn.sofifa.org/players/4/19/187607.png
## 8  204341 Lu\xcc_s Neto  30 https://cdn.sofifa.org/players/4/19/204341.png
## 9  223058    D. Kuzyaev  25 https://cdn.sofifa.org/players/4/19/223058.png
## 10 183389        G. Sio  29 https://cdn.sofifa.org/players/4/19/183389.png
##             V5                                   V6      V7        V8
## 1  Nationality                                 Flag Overall Potential
## 2    Argentina  https://cdn.sofifa.org/flags/52.png      80        85
## 3       Sweden  https://cdn.sofifa.org/flags/46.png      80        80
## 4       Russia  https://cdn.sofifa.org/flags/40.png      79        81
## 5       Russia  https://cdn.sofifa.org/flags/40.png      79        79
## 6       Brazil  https://cdn.sofifa.org/flags/54.png      78        78
## 7       Russia  https://cdn.sofifa.org/flags/40.png      78        78
## 8     Portugal  https://cdn.sofifa.org/flags/38.png      77        77
## 9       Russia  https://cdn.sofifa.org/flags/40.png      77        80
## 10 Ivory Coast https://cdn.sofifa.org/flags/108.png      77        77
##                 V9                                         V10   V11   V12
## 1             Club                                   Club Logo Value  Wage
## 2                          https://cdn.sofifa.org/flags/52.png  5684  1602
## 3                          https://cdn.sofifa.org/flags/46.png  6370  3591
## 4                          https://cdn.sofifa.org/flags/40.png  5675  3672
## 5                          https://cdn.sofifa.org/flags/40.png  6030  1448
## 6  Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png  6405 19799
## 7                          https://cdn.sofifa.org/flags/40.png  5764  1105
## 8                          https://cdn.sofifa.org/flags/38.png  6075  2836
## 9                          https://cdn.sofifa.org/flags/40.png  5565  2653
## 10                        https://cdn.sofifa.org/flags/108.png  5275  2138
##        V13            V14                      V15       V16         V17
## 1  Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2     2122          Right                        2         4           4
## 3     1797          Right                        2         4           2
## 4     1217          Right                        1         3           1
## 5     2038          Right                        2         3           3
## 6     1807          Right                        2         3           3
## 7     1810          Right                        2         3           3
## 8     1749          Right                        1         3           2
## 9     2041          Right                        1         3           3
## 10    1933           Left                        2         3           3
##               V18       V19       V20      V21           V22      V23
## 1       Work Rate Body Type Real Face Position Jersey Number   Joined
## 2  Medium/ Medium    Normal        No       CM             5         
## 3    High/ Medium    Normal        No      LCB             4         
## 4  Medium/ Medium    Normal        No       GK            12         
## 5      High/ High      Lean        No       RB             2         
## 6  Medium/ Medium    Normal       Yes       CB             4 1-Aug-11
## 7    High/ Medium    Stocky        No       ST            22         
## 8  Medium/ Medium      Lean        No       CB             4         
## 9    Medium/ High      Lean        No       RM             7         
## 10      High/ Low    Normal        No       ST            21         
##            V24                  V25    V26    V27  V28  V29  V30  V31  V32  V33
## 1  Loaned From Contract Valid Until Height Weight   LS   ST   RS   LW   LF   CF
## 2                                     5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3                                      6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4                                      6'2 176lbs                              
## 5                                     5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6                              2019   5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
## 7                                      6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2
## 8                                      6'2 157lbs 52+2 52+2 52+2 51+2 51+2 51+2
## 9                                      6'0 163lbs 70+2 70+2 70+2 74+2 74+2 74+2
## 10                                    5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2
##     V34  V35  V36  V37  V38  V39  V40  V41  V42  V43  V44  V45  V46  V47  V48
## 1    RF   RW  LAM  CAM  RAM   LM  LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB
## 2  75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3  58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4                                                                            
## 5  72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6  59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
## 7  74+2 71+2 71+2 71+2 71+2 71+2 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2
## 8  51+2 51+2 54+2 54+2 54+2 54+2 61+2 61+2 61+2 54+2 67+2 72+2 72+2 72+2 67+2
## 9  74+2 74+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2
## 10 75+2 75+2 74+2 74+2 74+2 74+2 67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2
##     V49  V50  V51  V52  V53      V54       V55             V56          V57
## 1    LB  LCB   CB  RCB   RB Crossing Finishing HeadingAccuracy ShortPassing
## 2  74+2 72+2 72+2 72+2 74+2       76        55              60           84
## 3  70+2 79+2 79+2 79+2 70+2       49        51              81           73
## 4                                 16        14              17           25
## 5  78+2 73+2 73+2 73+2 78+2       73        61              69           79
## 6  68+2 76+2 76+2 76+2 68+2       60        45              79           73
## 7  48+2 48+2 48+2 48+2 48+2       61        79              86           71
## 8  69+2 75+2 75+2 75+2 69+2       42        33              80           72
## 9  74+2 70+2 70+2 70+2 74+2       67        64              51           82
## 10 50+2 46+2 46+2 46+2 50+2       68        77              71           73
##        V58       V59   V60        V61         V62         V63          V64
## 1  Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2       73        78    79         78          82          82           75
## 3       37        49    36         40          67          63           46
## 4       13        15    18         17          32          17           58
## 5       57        72    49         46          75          72           84
## 6       51        63    42         48          72          73           33
## 7       74        71    64         60          55          77           66
## 8       40        49    52         43          77          48           57
## 9       57        78    60         61          75          79           78
## 10      73        76    73         69          67          76           78
##            V65     V66       V67     V68       V69     V70     V71      V72
## 1  SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2           69      77        74      77        82      61      79       69
## 3           49      55        76      36        74      64      67       83
## 4           54      36        76      50        24      60      27       70
## 5           90      80        75      76        67      85      93       68
## 6           38      51        70      60        55      79      54       76
## 7           65      50        75      32        78      63      77       93
## 8           59      69        78      61        42      79      72       72
## 9           81      80        73      76        76      60      79       59
## 10          85      79        71      73        77      70      78       74
##          V73        V74           V75         V76    V77       V78       V79
## 1  LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2         80         79            72          74     82        57        74
## 3         59         81            82          54     49        79        78
## 4         13         26            20          11     63        15        69
## 5         57         65            71          77     72        41        73
## 6         58         76            79          50     67        64        70
## 7         68         75            30          78     73        77        70
## 8         37         76            78          44     46        47        72
## 9         74         70            74          71     70        63        64
## 10        74         77            18          76     73        72        72
##        V80            V81           V82      V83        V84       V85
## 1  Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2       73             75            72        9         14         6
## 3       82             83            79        7          9        12
## 4       18             20            12       80         73        65
## 5       76             76            80        7         12        10
## 6       83             77            76       12          7        11
## 7       21             15            19       15         12        11
## 8       80             77            78       10         15        13
## 9       71             77            76       15         16        13
## 10      40             18            12       15          9        10
##              V86        V87            V88
## 1  GKPositioning GKReflexes Release Clause
## 2              9         10               
## 3             10         15               
## 4             77         85               
## 5              8         15               
## 6             12         13               
## 7             11          8               
## 8             15          8               
## 9              7          8               
## 10            15         16
# Promote the first data row (the CSV's header line) to column names.
# vapply() builds a plain character vector from the row, so the labels are
# used even if a column was read as a factor; assigning the one-row data
# frame directly relies on list-to-character coercion, which would produce
# integer codes for factor columns.
names(football) <- vapply(football[1, ], as.character, character(1),
                          USE.NAMES = FALSE)
head(football)
##       ID         Name Age                                          Photo
## 1     ID         Name Age                                          Photo
## 2 207439   L. Paredes  24 https://cdn.sofifa.org/players/4/19/207439.png
## 3 156713 A. Granqvist  33 https://cdn.sofifa.org/players/4/19/156713.png
## 4 229909     A. Lunev  26 https://cdn.sofifa.org/players/4/19/229909.png
## 5 187347 I. Smolnikov  29 https://cdn.sofifa.org/players/4/19/187347.png
## 6 153260       Hilton  40 https://cdn.sofifa.org/players/4/19/153260.png
##   Nationality                                Flag Overall Potential
## 1 Nationality                                Flag Overall Potential
## 2   Argentina https://cdn.sofifa.org/flags/52.png      80        85
## 3      Sweden https://cdn.sofifa.org/flags/46.png      80        80
## 4      Russia https://cdn.sofifa.org/flags/40.png      79        81
## 5      Russia https://cdn.sofifa.org/flags/40.png      79        79
## 6      Brazil https://cdn.sofifa.org/flags/54.png      78        78
##              Club                                   Club Logo Value  Wage
## 1            Club                                   Club Logo Value  Wage
## 2                         https://cdn.sofifa.org/flags/52.png  5684  1602
## 3                         https://cdn.sofifa.org/flags/46.png  6370  3591
## 4                         https://cdn.sofifa.org/flags/40.png  5675  3672
## 5                         https://cdn.sofifa.org/flags/40.png  6030  1448
## 6 Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png  6405 19799
##   Special Preferred Foot International Reputation Weak Foot Skill Moves
## 1 Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2    2122          Right                        2         4           4
## 3    1797          Right                        2         4           2
## 4    1217          Right                        1         3           1
## 5    2038          Right                        2         3           3
## 6    1807          Right                        2         3           3
##        Work Rate Body Type Real Face Position Jersey Number   Joined
## 1      Work Rate Body Type Real Face Position Jersey Number   Joined
## 2 Medium/ Medium    Normal        No       CM             5         
## 3   High/ Medium    Normal        No      LCB             4         
## 4 Medium/ Medium    Normal        No       GK            12         
## 5     High/ High      Lean        No       RB             2         
## 6 Medium/ Medium    Normal       Yes       CB             4 1-Aug-11
##   Loaned From Contract Valid Until Height Weight   LS   ST   RS   LW   LF   CF
## 1 Loaned From Contract Valid Until Height Weight   LS   ST   RS   LW   LF   CF
## 2                                    5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3                                     6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4                                     6'2 176lbs                              
## 5                                    5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6                             2019   5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
##     RF   RW  LAM  CAM  RAM   LM  LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB
## 1   RF   RW  LAM  CAM  RAM   LM  LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB
## 2 75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3 58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4                                                                           
## 5 72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6 59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
##     LB  LCB   CB  RCB   RB Crossing Finishing HeadingAccuracy ShortPassing
## 1   LB  LCB   CB  RCB   RB Crossing Finishing HeadingAccuracy ShortPassing
## 2 74+2 72+2 72+2 72+2 74+2       76        55              60           84
## 3 70+2 79+2 79+2 79+2 70+2       49        51              81           73
## 4                                16        14              17           25
## 5 78+2 73+2 73+2 73+2 78+2       73        61              69           79
## 6 68+2 76+2 76+2 76+2 68+2       60        45              79           73
##   Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 1 Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2      73        78    79         78          82          82           75
## 3      37        49    36         40          67          63           46
## 4      13        15    18         17          32          17           58
## 5      57        72    49         46          75          72           84
## 6      51        63    42         48          72          73           33
##   SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 1 SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2          69      77        74      77        82      61      79       69
## 3          49      55        76      36        74      64      67       83
## 4          54      36        76      50        24      60      27       70
## 5          90      80        75      76        67      85      93       68
## 6          38      51        70      60        55      79      54       76
##   LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 1 LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2        80         79            72          74     82        57        74
## 3        59         81            82          54     49        79        78
## 4        13         26            20          11     63        15        69
## 5        57         65            71          77     72        41        73
## 6        58         76            79          50     67        64        70
##   Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 1 Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2      73             75            72        9         14         6
## 3      82             83            79        7          9        12
## 4      18             20            12       80         73        65
## 5      76             76            80        7         12        10
## 6      83             77            76       12          7        11
##   GKPositioning GKReflexes Release Clause
## 1 GKPositioning GKReflexes Release Clause
## 2             9         10               
## 3            10         15               
## 4            77         85               
## 5             8         15               
## 6            12         13
# The header text is now held in names(football); drop that first row so
# only player records remain.
football <- football[-1, ]
head(football)
##       ID         Name Age                                          Photo
## 2 207439   L. Paredes  24 https://cdn.sofifa.org/players/4/19/207439.png
## 3 156713 A. Granqvist  33 https://cdn.sofifa.org/players/4/19/156713.png
## 4 229909     A. Lunev  26 https://cdn.sofifa.org/players/4/19/229909.png
## 5 187347 I. Smolnikov  29 https://cdn.sofifa.org/players/4/19/187347.png
## 6 153260       Hilton  40 https://cdn.sofifa.org/players/4/19/153260.png
## 7 187607    A. Dzyuba  29 https://cdn.sofifa.org/players/4/19/187607.png
##   Nationality                                Flag Overall Potential
## 2   Argentina https://cdn.sofifa.org/flags/52.png      80        85
## 3      Sweden https://cdn.sofifa.org/flags/46.png      80        80
## 4      Russia https://cdn.sofifa.org/flags/40.png      79        81
## 5      Russia https://cdn.sofifa.org/flags/40.png      79        79
## 6      Brazil https://cdn.sofifa.org/flags/54.png      78        78
## 7      Russia https://cdn.sofifa.org/flags/40.png      78        78
##              Club                                   Club Logo Value  Wage
## 2                         https://cdn.sofifa.org/flags/52.png  5684  1602
## 3                         https://cdn.sofifa.org/flags/46.png  6370  3591
## 4                         https://cdn.sofifa.org/flags/40.png  5675  3672
## 5                         https://cdn.sofifa.org/flags/40.png  6030  1448
## 6 Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png  6405 19799
## 7                         https://cdn.sofifa.org/flags/40.png  5764  1105
##   Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2    2122          Right                        2         4           4
## 3    1797          Right                        2         4           2
## 4    1217          Right                        1         3           1
## 5    2038          Right                        2         3           3
## 6    1807          Right                        2         3           3
## 7    1810          Right                        2         3           3
##        Work Rate Body Type Real Face Position Jersey Number   Joined
## 2 Medium/ Medium    Normal        No       CM             5         
## 3   High/ Medium    Normal        No      LCB             4         
## 4 Medium/ Medium    Normal        No       GK            12         
## 5     High/ High      Lean        No       RB             2         
## 6 Medium/ Medium    Normal       Yes       CB             4 1-Aug-11
## 7   High/ Medium    Stocky        No       ST            22         
##   Loaned From Contract Valid Until Height Weight   LS   ST   RS   LW   LF   CF
## 2                                    5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3                                     6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4                                     6'2 176lbs                              
## 5                                    5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6                             2019   5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
## 7                                     6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2
##     RF   RW  LAM  CAM  RAM   LM  LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB
## 2 75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3 58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4                                                                           
## 5 72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6 59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
## 7 74+2 71+2 71+2 71+2 71+2 71+2 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2
##     LB  LCB   CB  RCB   RB Crossing Finishing HeadingAccuracy ShortPassing
## 2 74+2 72+2 72+2 72+2 74+2       76        55              60           84
## 3 70+2 79+2 79+2 79+2 70+2       49        51              81           73
## 4                                16        14              17           25
## 5 78+2 73+2 73+2 73+2 78+2       73        61              69           79
## 6 68+2 76+2 76+2 76+2 68+2       60        45              79           73
## 7 48+2 48+2 48+2 48+2 48+2       61        79              86           71
##   Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2      73        78    79         78          82          82           75
## 3      37        49    36         40          67          63           46
## 4      13        15    18         17          32          17           58
## 5      57        72    49         46          75          72           84
## 6      51        63    42         48          72          73           33
## 7      74        71    64         60          55          77           66
##   SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2          69      77        74      77        82      61      79       69
## 3          49      55        76      36        74      64      67       83
## 4          54      36        76      50        24      60      27       70
## 5          90      80        75      76        67      85      93       68
## 6          38      51        70      60        55      79      54       76
## 7          65      50        75      32        78      63      77       93
##   LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2        80         79            72          74     82        57        74
## 3        59         81            82          54     49        79        78
## 4        13         26            20          11     63        15        69
## 5        57         65            71          77     72        41        73
## 6        58         76            79          50     67        64        70
## 7        68         75            30          78     73        77        70
##   Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2      73             75            72        9         14         6
## 3      82             83            79        7          9        12
## 4      18             20            12       80         73        65
## 5      76             76            80        7         12        10
## 6      83             77            76       12          7        11
## 7      21             15            19       15         12        11
##   GKPositioning GKReflexes Release Clause
## 2             9         10               
## 3            10         15               
## 4            77         85               
## 5             8         15               
## 6            12         13               
## 7            11          8
# Number of rows left after the header row was removed.
nrow(football)
## [1] 18207
# Rows per Position value; the unlabelled first count in the output is for
# rows whose Position is blank.
table(football$Position)
## 
##       CAM   CB  CDM   CF   CM   GK  LAM   LB  LCB  LCM  LDM   LF   LM   LS   LW 
##   60  958 1778  948   74 1394 2025   21 1322  648  395  243   15 1095  207  381 
##  LWB  RAM   RB  RCB  RCM  RDM   RF   RM   RS   RW  RWB   ST 
##   78   21 1291  662  391  248   16 1124  203  370   87 2152

2. Filter

2.1 Filter for strikers

Strikers are defined in the dataset as Position = “ST”.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the strikers, i.e. rows whose Position is exactly "ST".
football_st <- filter(football, Position == "ST")
head(football_st)
##       ID          Name Age                                          Photo
## 1 187607     A. Dzyuba  29 https://cdn.sofifa.org/players/4/19/187607.png
## 2 183389        G. Sio  29 https://cdn.sofifa.org/players/4/19/183389.png
## 3 245683     K. Fofana  26 https://cdn.sofifa.org/players/4/19/245683.png
## 4 190461 B. Sigur̡arson  27 https://cdn.sofifa.org/players/4/19/190461.png
## 5 225900  J. Sambenito  26 https://cdn.sofifa.org/players/4/19/225900.png
## 6 246405     B. Angulo  22 https://cdn.sofifa.org/players/4/19/246405.png
##   Nationality                                 Flag Overall Potential Club
## 1      Russia  https://cdn.sofifa.org/flags/40.png      78        78     
## 2 Ivory Coast https://cdn.sofifa.org/flags/108.png      77        77     
## 3 Ivory Coast https://cdn.sofifa.org/flags/108.png      75        75     
## 4     Iceland  https://cdn.sofifa.org/flags/24.png      73        74     
## 5    Paraguay  https://cdn.sofifa.org/flags/58.png      71        74     
## 6     Ecuador  https://cdn.sofifa.org/flags/57.png      71        77     
##                              Club Logo Value Wage Special Preferred Foot
## 1  https://cdn.sofifa.org/flags/40.png  5764 1105    1810          Right
## 2 https://cdn.sofifa.org/flags/108.png  5275 2138    1933           Left
## 3 https://cdn.sofifa.org/flags/108.png  5589 3875    1877          Right
## 4  https://cdn.sofifa.org/flags/24.png  5629 3661    1893          Right
## 5  https://cdn.sofifa.org/flags/58.png  6113 2445    1651          Right
## 6  https://cdn.sofifa.org/flags/57.png  5057 2216    1628          Right
##   International Reputation Weak Foot Skill Moves      Work Rate Body Type
## 1                        2         3           3   High/ Medium    Stocky
## 2                        2         3           3      High/ Low    Normal
## 3                        1         3           3 Medium/ Medium    Normal
## 4                        1         4           3     High/ High    Normal
## 5                        1         3           2   High/ Medium      Lean
## 6                        1         4           3      High/ Low    Normal
##   Real Face Position Jersey Number Joined Loaned From Contract Valid Until
## 1        No       ST            22                                        
## 2        No       ST            21                                        
## 3        No       ST            22                                        
## 4        No       ST             9                                        
## 5        No       ST             9                                        
## 6        No       ST            19                                        
##   Height Weight   LS   ST   RS   LW   LF   CF   RF   RW  LAM  CAM  RAM   LM
## 1    6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2 74+2 71+2 71+2 71+2 71+2 71+2
## 2   5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 74+2 74+2 74+2 74+2
## 3    6'2 179lbs 73+2 73+2 73+2 71+2 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2
## 4    6'1 190lbs 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2 70+2 70+2 70+2 71+2
## 5    6'0 190lbs 70+2 70+2 70+2 64+2 67+2 67+2 67+2 64+2 63+2 63+2 63+2 62+2
## 6    6'0 154lbs 70+2 70+2 70+2 67+2 68+2 68+2 68+2 67+2 63+2 63+2 63+2 65+2
##    LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB   LB  LCB   CB  RCB   RB
## 1 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2 48+2 48+2 48+2 48+2 48+2
## 2 67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2 50+2 46+2 46+2 46+2 50+2
## 3 67+2 67+2 67+2 71+2 59+2 57+2 57+2 57+2 59+2 57+2 52+2 52+2 52+2 57+2
## 4 64+2 64+2 64+2 71+2 59+2 55+2 55+2 55+2 59+2 56+2 53+2 53+2 53+2 56+2
## 5 55+2 55+2 55+2 62+2 43+2 41+2 41+2 41+2 43+2 41+2 38+2 38+2 38+2 41+2
## 6 54+2 54+2 54+2 65+2 47+2 39+2 39+2 39+2 47+2 44+2 36+2 36+2 36+2 44+2
##   Crossing Finishing HeadingAccuracy ShortPassing Volleys Dribbling Curve
## 1       61        79              86           71      74        71    64
## 2       68        77              71           73      73        76    73
## 3       66        75              72           74      74        72    63
## 4       66        71              68           68      65        73    63
## 5       40        74              72           57      72        60    64
## 6       50        78              69           56      46        76    58
##   FKAccuracy LongPassing BallControl Acceleration SprintSpeed Agility Reactions
## 1         60          55          77           66          65      50        75
## 2         69          67          76           78          85      79        71
## 3         59          58          75           59          77      63        72
## 4         48          44          73           78          79      83        74
## 5         42          42          63           79          72      61        69
## 6         58          33          71           82          79      78        73
##   Balance ShotPower Jumping Stamina Strength LongShots Aggression Interceptions
## 1      32        78      63      77       93        68         75            30
## 2      73        77      70      78       74        74         77            18
## 3      60        78      69      83       77        73         67            40
## 4      76        68      78      90       85        66         73            42
## 5      64        73      69      67       72        67         49            14
## 6      64        72      69      77       69        54         28            16
##   Positioning Vision Penalties Composure Marking StandingTackle SlidingTackle
## 1          78     73        77        70      21             15            19
## 2          76     73        72        72      40             18            12
## 3          72     69        74        83      23             37            46
## 4          73     64        69        76      31             39            24
## 5          75     60        67        74      15             16            16
## 6          62     45        82        51      11             18            12
##   GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause
## 1       15         12        11            11          8               
## 2       15          9        10            15         16               
## 3        7         11         7            11         14               
## 4        9         12        10            15         16               
## 5       15         16        15             7          7               
## 6       11          8        10             7          6
# Row count of the striker subset; it should equal the "ST" count reported
# by table(football$Position).
nrow(football_st)
## [1] 2152

2.2 Scatter Plot

It’s a good idea to explore the data.

Wage and Value were read in as character strings, so convert them to numeric first.

str(football_st$Wage)
##  chr [1:2152] "1105" "2138" "3875" "3661" "2445" "2216" "4457" "3370" ...
str(football_st$Value)
##  chr [1:2152] "5764" "5275" "5589" "5629" "6113" "5057" "6561" "6146" ...
# Wage and Value are stored as character strings; coerce both columns to
# numeric so they can be plotted and modelled.
football_st[c("Wage", "Value")] <- lapply(
  football_st[c("Wage", "Value")], as.numeric
)
library(ggplot2)
library(ggpubr)

# Scatter plot of Value against Wage with a fitted straight line and the
# Pearson correlation printed on the panel.
ggplot(football_st, aes(x = Wage, y = Value)) +
  geom_point(shape = 2, colour = "black") +
  geom_smooth(method = "lm") +
  stat_cor(method = "pearson", label.x = 300000, label.y = 1600) +
  labs(x = "Wage", y = "Value", title = "Wage and Value")
## `geom_smooth()` using formula = 'y ~ x'

Localised with 30% of data.

# Same scatter plot, but with a local (loess) smoother instead of a straight
# line. The method is given as the string "loess" because geom_smooth() only
# forwards `span` to the fit when the method is named that way; passing the
# bare `loess` function can leave the smoother at its default span, so the
# requested 30% neighbourhood would not be applied.
ggplot(football_st) + aes(x = Wage, y = Value) +
  geom_point(shape = 2, colour = "black") +
  xlab("Wage") + ylab("Value") +
  ggtitle("Wage and Value") +
  geom_smooth(method = "loess", span = 0.3) +
  stat_cor(method = "pearson", label.x = 300000, label.y = 1600)
## `geom_smooth()` using formula = 'y ~ x'

3. Machine learning

3.1 Training validation split

# Keep only the six candidate predictors and the response (Wage) for the
# regression models that follow.
football_st_2 <- football_st[, c("Age", "Balance", "ShotPower", "Aggression",
                                 "Positioning", "Composure", "Wage")]
# Preview the first rows of the reduced data frame.
head(football_st_2)
##   Age Balance ShotPower Aggression Positioning Composure Wage
## 1  29      32        78         75          78        70 1105
## 2  29      73        77         77          76        72 2138
## 3  26      60        78         67          72        83 3875
## 4  27      76        68         73          73        76 3661
## 5  26      64        73         49          75        74 2445
## 6  22      64        72         28          62        51 2216

Change to numeric.

# Coerce every column of the modelling data frame to numeric in one call.
football_st_2 <- mutate(football_st_2, across(everything(), as.numeric))

Split the data into training and validation sets.

Set the seed using our favourite number :-)

set.seed(666)

Create the indices for the split. This samples the row indices to split the data into training and validation.

# Draw 70% of the row indices at random for training; the remaining rows
# form the validation set. seq_len() is used instead of 1:nrow() so an empty
# data frame yields an empty index set rather than c(1, 0), and floor() makes
# the (otherwise implicitly truncated) sample size explicit.
n_rows <- nrow(football_st_2)
train_index <- sample(seq_len(n_rows), floor(0.7 * n_rows))
valid_index <- setdiff(seq_len(n_rows), train_index)

Using the indices, create the training and validation sets. This is similar in principle to splitting a data frame by row.

# Rows selected by train_index become the training set ...
train_df_st <- football_st_2[train_index, ]
# ... and the rows listed in valid_index become the validation set.
valid_df_st <- football_st_2[valid_index, ]

It is a good habit to check after splitting.

nrow(train_df_st)
## [1] 1506
nrow(valid_df_st)
## [1] 646

3.2 Training

Training the model on the training set.

# Fit a linear model of Wage on the six predictors, using the training rows
# only; the validation rows are held back for evaluation.
wage_model_st_2 <- lm(Wage ~ Age + Balance + ShotPower + 
                        Aggression + Positioning + Composure,
                      data = train_df_st)
# Coefficient table, residual summary and R-squared for the fitted model.
summary(wage_model_st_2)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression + 
##     Positioning + Composure, data = train_df_st)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -32861  -8569  -2336   5182 347609 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -81327.50    5253.63 -15.480  < 2e-16 ***
## Age          -1032.61     146.38  -7.054 2.64e-12 ***
## Balance        131.37      46.63   2.817  0.00491 ** 
## ShotPower      514.89      98.00   5.254 1.70e-07 ***
## Aggression      13.64      41.73   0.327  0.74380    
## Positioning    692.34     107.84   6.420 1.82e-10 ***
## Composure      533.27      93.18   5.723 1.26e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20410 on 1499 degrees of freedom
## Multiple R-squared:  0.2877, Adjusted R-squared:  0.2848 
## F-statistic: 100.9 on 6 and 1499 DF,  p-value: < 2.2e-16

3.3 Model evaluation

Predict the outcome (i.e. wage) of the training and validation sets using the model from the training set. Compare the errors between the training and validation sets. Check normality, residuals, multicollinearity, heteroskedasticity/homoskedasticity.

library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## 
## Attaching package: 'forecast'
## The following object is masked from 'package:ggpubr':
## 
##     gghistogram
# In-sample predictions: apply the fitted model back to the training rows.
wage_model_st_2_pred_train <- predict(wage_model_st_2,
                                train_df_st)

# Error metrics comparing the predictions (first argument) with the observed
# training wages (second argument).
accuracy(wage_model_st_2_pred_train, train_df_st$Wage)
##                    ME     RMSE      MAE       MPE     MAPE
## Test set -3.02254e-10 20363.72 9804.857 -30.84404 131.6738
# Out-of-sample predictions: apply the model to the held-out validation rows.
wage_model_st_2_pred_valid <- predict(wage_model_st_2,
                                valid_df_st)

# Same error metrics on the validation set, for comparison with training.
accuracy(wage_model_st_2_pred_valid, valid_df_st$Wage)
##                 ME     RMSE      MAE      MPE     MAPE
## Test set -910.8093 14653.34 9444.861 -32.5431 130.3103
max(train_df_st$Wage) - min(train_df_st$Wage)
## [1] 406504
sd(train_df_st$Wage)
## [1] 24135.81
max(valid_df_st$Wage) - min(valid_df_st$Wage)
## [1] 205030
sd(valid_df_st$Wage)
## [1] 18074.14

Multicollinearity.

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
vif(wage_model_st_2)
##         Age     Balance   ShotPower  Aggression Positioning   Composure 
##    1.690727    1.037080    2.900017    1.602035    3.573232    3.185606
cor_matrix_2 <- cor(train_df_st, use = "complete.obs")
cor_matrix_2
##                     Age      Balance   ShotPower Aggression Positioning
## Age          1.00000000 -0.091216618  0.54097985  0.4873814  0.57297740
## Balance     -0.09121662  1.000000000 -0.09505236 -0.1260503 -0.02009516
## ShotPower    0.54097985 -0.095052364  1.00000000  0.5602278  0.76578164
## Aggression   0.48738140 -0.126050298  0.56022778  1.0000000  0.49579804
## Positioning  0.57297740 -0.020095159  0.76578164  0.4957980  1.00000000
## Composure    0.57128687 -0.001725346  0.72347355  0.5123792  0.79931083
## Wage         0.18304028  0.055407602  0.44965603  0.2582019  0.48053332
##                Composure      Wage
## Age          0.571286868 0.1830403
## Balance     -0.001725346 0.0554076
## ShotPower    0.723473546 0.4496560
## Aggression   0.512379238 0.2582019
## Positioning  0.799310831 0.4805333
## Composure    1.000000000 0.4655139
## Wage         0.465513874 1.0000000

Perform a Breusch-Pagan Test to test for heteroskedasticity/homoskedasticity.

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bptest(wage_model_st_2)
## 
##  studentized Breusch-Pagan test
## 
## data:  wage_model_st_2
## BP = 72.484, df = 6, p-value = 1.264e-13

Normality.

# Histogram of training-set wages in bins 1000 wide, to inspect the shape of
# the response distribution.
ggplot(train_df_st, aes(x = Wage)) +
  geom_histogram(binwidth = 1000) +
  labs(y = "Count", title = "Distribution of wage (strikers)")

wage_model_st_residuals <- rstandard(wage_model_st_2)
head(wage_model_st_residuals)
##        1598         638         907         873         652        1697 
## -0.24905289 -0.14246223  0.23893391  0.36088574  0.09706996 -0.14061088
# Attach the standardised residuals to the training rows. cbind() names the
# new column after the variable passed in, and the residual plot refers to
# the column by that name.
train_df_st_comb_2 <- cbind(train_df_st, wage_model_st_residuals)
head(train_df_st_comb_2)
##      Age Balance ShotPower Aggression Positioning Composure  Wage
## 1598  33      63        75         62          73        73 16730
## 638   20      67        70         36          52        52  4190
## 907   18      68        53         57          53        51  5843
## 873   23      55        59         44          58        47  5704
## 652   29      72        62         56          61        55  4414
## 1697  26      68        62         53          76        64 17288
##      wage_model_st_residuals
## 1598             -0.24905289
## 638              -0.14246223
## 907               0.23893391
## 873               0.36088574
## 652               0.09706996
## 1697             -0.14061088
# Standardised residuals of the baseline model plotted against the observed
# training wages. Axis label and title use the correct spelling
# "Standardised".
ggplot(train_df_st_comb_2) + aes(x = Wage, y = wage_model_st_residuals) +
  geom_point() + xlab("Wage") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Wage Prediction")

Using the Shapiro-Wilk test.

H0: the data follow a normal distribution.

H1: the distribution differs from a normal distribution.

# Shapiro-Wilk test applied to the raw training wages (the response).
# NOTE(review): the normality assumption in linear regression concerns the
# model residuals, so consider also testing residuals(wage_model_st_2).
shapiro.test(train_df_st$Wage)
## 
##  Shapiro-Wilk normality test
## 
## data:  train_df_st$Wage
## W = 0.36328, p-value < 2.2e-16

3.4 Predicting

Predict new players

Data for new players

# Read the new players to score; header = TRUE takes the column names from
# the first row of the file.
new <- read.csv("new.csv", header = TRUE)

# interval = "confidence" returns, for each new player, the fitted wage plus
# the lower/upper bounds of the confidence interval for the mean response
# (interval = "prediction" would instead bound an individual player's wage).
wage_model_st_2_pred_new <- predict(wage_model_st_2,
                                newdata = new, interval = "confidence")
wage_model_st_2_pred_new
##        fit      lwr      upr
## 1 21523.43 18689.82 24357.04
## 2 23759.40 20030.25 27488.55
## 3 21465.21 19657.65 23272.77

4. Categorical Predictors

Subset to include categorical variable: preferred foot

football_st_3 <- football_st[, c("Preferred Foot", "Positioning", "Composure", "Wage")]
head(football_st_3)
##   Preferred Foot Positioning Composure Wage
## 1          Right          78        70 1105
## 2           Left          76        72 2138
## 3          Right          72        83 3875
## 4          Right          73        76 3661
## 5          Right          75        74 2445
## 6          Right          62        51 2216
names(football_st_3)[1] <- "Preferred_Foot"

football_st_3$Positioning <- as.numeric(football_st_3$Positioning)
football_st_3$Composure <- as.numeric(football_st_3$Composure)
# Reuse the same seed as the first split so this subset is partitioned
# reproducibly.
set.seed(666)
n_rows_3 <- nrow(football_st_3)
train_index_3 <- sample(seq_len(n_rows_3), floor(0.7 * n_rows_3))
# The validation rows must be derived from this split's own training indices
# (train_index_3), not from the earlier split's train_index; the two only
# coincide while the seed and the row count happen to match.
valid_index_3 <- setdiff(seq_len(n_rows_3), train_index_3)

train_df_st_3 <- football_st_3[train_index_3, ]
valid_df_st_3 <- football_st_3[valid_index_3, ]
wage_model_st_cat_2 <- lm(Wage ~ factor(Preferred_Foot) + Positioning +
                            Composure, data = train_df_st_3)

summary(wage_model_st_cat_2)
## 
## Call:
## lm(formula = Wage ~ factor(Preferred_Foot) + Positioning + Composure, 
##     data = train_df_st_3)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33599  -8588  -2271   5035 352343 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -68270.03    4142.64 -16.480  < 2e-16 ***
## factor(Preferred_Foot)Right  -2040.14    1572.46  -1.297    0.195    
## Positioning                    787.78      97.32   8.095 1.17e-15 ***
## Composure                      534.07      89.17   5.990 2.63e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20920 on 1502 degrees of freedom
## Multiple R-squared:  0.2501, Adjusted R-squared:  0.2486 
## F-statistic:   167 on 3 and 1502 DF,  p-value: < 2.2e-16

4.1 Model evaluation

wage_model_st_cat_2_pred_train <- predict(wage_model_st_cat_2,
                                train_df_st_3)

accuracy(wage_model_st_cat_2_pred_train, train_df_st_3$Wage)
##                     ME     RMSE      MAE       MPE     MAPE
## Test set -1.589124e-10 20893.75 9883.551 -37.84999 129.0229
wage_model_st_cat_2_pred_valid <- predict(wage_model_st_cat_2,
                                valid_df_st_3)
accuracy(wage_model_st_cat_2_pred_valid, valid_df_st_3$Wage)
##                 ME     RMSE      MAE       MPE     MAPE
## Test set -844.4194 15387.14 9700.628 -41.79669 130.5894
sd(train_df_st_3$Wage)
## [1] 24135.81
sd(valid_df_st_3$Wage)
## [1] 18074.14

Normality.

shapiro.test(train_df_st_3$Wage)
## 
##  Shapiro-Wilk normality test
## 
## data:  train_df_st_3$Wage
## W = 0.36328, p-value < 2.2e-16

Residuals.

wage_model_st_cat_2_residuals <- rstandard(wage_model_st_cat_2)
head(wage_model_st_cat_2_residuals)
##       1598        638        907        873        652       1697 
## -0.4524008  0.2758081  0.3427025  0.2498056 -0.1293316 -0.3088318
train_df_st_3_comb_2 <- cbind(train_df_st_3, wage_model_st_cat_2_residuals)
head(train_df_st_3_comb_2)
##      Preferred_Foot Positioning Composure  Wage wage_model_st_cat_2_residuals
## 1598          Right          73        73 16730                    -0.4524008
## 638           Right          52        52  4190                     0.2758081
## 907           Right          53        51  5843                     0.3427025
## 873           Right          58        47  5704                     0.2498056
## 652           Right          61        55  4414                    -0.1293316
## 1697          Right          76        64 17288                    -0.3088318

Check the residuals against each predictor too.

# Standardised residuals of the categorical-predictor model plotted against
# the observed training wages. Axis label and title use the correct spelling
# "Standardised".
ggplot(train_df_st_3_comb_2) + aes(x = Wage, y = wage_model_st_cat_2_residuals) +
  geom_point() + xlab("Wage") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Wage Prediction")

Multicollinearity.

vif(wage_model_st_cat_2)
## factor(Preferred_Foot)            Positioning              Composure 
##               1.004517               2.769896               2.776543

Heteroskedasticity/Homoskedasticity.

bptest(wage_model_st_cat_2)
## 
##  studentized Breusch-Pagan test
## 
## data:  wage_model_st_cat_2
## BP = 64.695, df = 3, p-value = 5.829e-14

4.2 Prediction

Data for more new players

new2 <- read.csv("new2.csv")
new2
##   Preferred.Foot Positioning Composure
## 1          Right          64        56
## 2          Right          65        47
new2$Preferred.Foot <- as.factor(new2$Preferred.Foot)

names(new2)
## [1] "Preferred.Foot" "Positioning"    "Composure"
names(new2)[1] <- "Preferred_Foot"

names(new2)
## [1] "Preferred_Foot" "Positioning"    "Composure"
wage_model_st_cat_2_pred_new <- predict(wage_model_st_cat_2,
                                newdata = new2, interval = "confidence")

wage_model_st_cat_2_pred_new
##         fit      lwr      upr
## 1 10016.014 8776.276 11255.75
## 2  5997.149 3512.208  8482.09

5. Non-Linear Regression

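Sometimes, a relationship may not be linear. In this case, we can specify a non-linear relationship in the model. Here, an interaction term (Positioning * Composure) lets the effect of one predictor depend on the level of the other.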

# `Positioning * Composure` expands to both main effects plus their
# interaction (Positioning:Composure), as the coefficient table shows.
wage_model_st_nl_2 <- lm(Wage ~ Age + Balance + ShotPower + Aggression +
                           Positioning * Composure,
                       data = train_df_st)
summary(wage_model_st_nl_2)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression + 
##     Positioning * Composure, data = train_df_st)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -66085  -5447    302   4870 260260 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           342265.457  17203.875  19.895  < 2e-16 ***
## Age                     -758.277    122.802  -6.175 8.52e-10 ***
## Balance                   79.001     39.022   2.025   0.0431 *  
## ShotPower                699.312     82.219   8.505  < 2e-16 ***
## Aggression                13.470     34.875   0.386   0.6994    
## Positioning            -5818.796    271.105 -21.463  < 2e-16 ***
## Composure              -6961.508    304.442 -22.866  < 2e-16 ***
## Positioning:Composure    109.127      4.285  25.465  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17060 on 1498 degrees of freedom
## Multiple R-squared:  0.5029, Adjusted R-squared:  0.5006 
## F-statistic: 216.5 on 7 and 1498 DF,  p-value: < 2.2e-16

5.1 Model evaluation

Predict the training and validation sets using the non-linear model. Check the accuracy.

wage_model_st_nl_2_pred_train <- predict(wage_model_st_nl_2,
                                   train_df_st)
accuracy(wage_model_st_nl_2_pred_train, train_df_st$Wage)
##                    ME    RMSE      MAE       MPE     MAPE
## Test set 1.966265e-09 17011.8 8516.679 -27.10041 105.2229
wage_model_st_nl_2_pred_valid <- predict(wage_model_st_nl_2,
                                   valid_df_st)
accuracy(wage_model_st_nl_2_pred_valid, valid_df_st$Wage)
##                 ME     RMSE      MAE       MPE     MAPE
## Test set -855.2409 13504.82 8758.766 -32.40985 113.7297

Normality.

shapiro.test(train_df_st$Wage)
## 
##  Shapiro-Wilk normality test
## 
## data:  train_df_st$Wage
## W = 0.36328, p-value < 2.2e-16

Residuals.

wage_model_st_nl_2_residuals <- rstandard(wage_model_st_nl_2)
head(wage_model_st_nl_2_residuals)
##       1598        638        907        873        652       1697 
## -0.4316213 -0.4791668  0.1460100  0.1104547  0.3550138  0.1330478
train_df_st_comb_3 <- cbind(train_df_st, wage_model_st_nl_2_residuals)
head(train_df_st_comb_3)
##      Age Balance ShotPower Aggression Positioning Composure  Wage
## 1598  33      63        75         62          73        73 16730
## 638   20      67        70         36          52        52  4190
## 907   18      68        53         57          53        51  5843
## 873   23      55        59         44          58        47  5704
## 652   29      72        62         56          61        55  4414
## 1697  26      68        62         53          76        64 17288
##      wage_model_st_nl_2_residuals
## 1598                   -0.4316213
## 638                    -0.4791668
## 907                     0.1460100
## 873                     0.1104547
## 652                     0.3550138
## 1697                    0.1330478

Check residuals for predictors too.

# Standardised residuals of the interaction model plotted against the
# observed training wages. Axis label and title use the correct spelling
# "Standardised".
ggplot(train_df_st_comb_3) + aes(x = Wage, y = wage_model_st_nl_2_residuals) +
  geom_point() + xlab("Wage") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Wage Prediction")

Multicollinearity. Expected, due to interaction term.

vif(wage_model_st_nl_2)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
##                   Age               Balance             ShotPower 
##              1.703838              1.039969              2.922692 
##            Aggression           Positioning             Composure 
##              1.602035             32.336863             48.694956 
## Positioning:Composure 
##            128.868532

Heteroskedasticity/Homoskedasticity.

bptest(wage_model_st_nl_2)
## 
##  studentized Breusch-Pagan test
## 
## data:  wage_model_st_nl_2
## BP = 286.28, df = 7, p-value < 2.2e-16

5.2 Prediction

Predict the wages of new players using the non-linear model.

wage_model_st_nl_2_pred_new <- predict(wage_model_st_nl_2,
                                        newdata = new, interval = "confidence")

wage_model_st_nl_2_pred_new
##        fit       lwr      upr
## 1 14285.88 11853.154 16718.61
## 2 12719.88  9489.583 15950.18
## 3 17129.30 15582.278 18676.32

5.3 Variation

# Wrapping the product in I() enters Positioning * Composure as a single
# arithmetic term, so the model has no separate main effects for the two
# (see the single I(Positioning * Composure) row in the coefficient table).
wage_model_st_nl_3 <- lm(Wage ~ Age + Balance + ShotPower + Aggression +
                           I(Positioning * Composure),
                       data = train_df_st)
summary(wage_model_st_nl_3)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression + 
##     I(Positioning * Composure), data = train_df_st)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -36259  -8218  -1295   5576 333818 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -3.640e+04  5.645e+03  -6.448 1.53e-10 ***
## Age                        -1.132e+03  1.416e+02  -7.994 2.58e-15 ***
## Balance                     1.068e+02  4.538e+01   2.354   0.0187 *  
## ShotPower                   3.801e+02  9.288e+01   4.093 4.49e-05 ***
## Aggression                 -3.070e+00  4.051e+01  -0.076   0.9396    
## I(Positioning * Composure)  1.154e+01  7.433e-01  15.525  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19850 on 1500 degrees of freedom
## Multiple R-squared:  0.3259, Adjusted R-squared:  0.3237 
## F-statistic: 145.1 on 5 and 1500 DF,  p-value: < 2.2e-16
vif(wage_model_st_nl_3)
##                        Age                    Balance 
##                   1.672602                   1.038758 
##                  ShotPower                 Aggression 
##                   2.754568                   1.596489 
## I(Positioning * Composure) 
##                   2.862857

6. Stepwise

Stepwise selection can be applied to other linear regression models too.

# Stepwise selection starting from the interaction model; direction = "both"
# lets step() both drop and re-add terms, comparing candidates by AIC.
wage_model_st_nl_2_step <- step(wage_model_st_nl_2,
                              direction = "both")
## Start:  AIC=29357.89
## Wage ~ Age + Balance + ShotPower + Aggression + Positioning * 
##     Composure
## 
##                         Df  Sum of Sq        RSS   AIC
## - Aggression             1 4.3404e+07 4.3588e+11 29356
## <none>                                4.3584e+11 29358
## - Balance                1 1.1925e+09 4.3703e+11 29360
## - Age                    1 1.1093e+10 4.4693e+11 29394
## - ShotPower              1 2.1048e+10 4.5689e+11 29427
## - Positioning:Composure  1 1.8867e+11 6.2451e+11 29898
## 
## Step:  AIC=29356.04
## Wage ~ Age + Balance + ShotPower + Positioning + Composure + 
##     Positioning:Composure
## 
##                         Df  Sum of Sq        RSS   AIC
## <none>                                4.3588e+11 29356
## + Aggression             1 4.3404e+07 4.3584e+11 29358
## - Balance                1 1.1606e+09 4.3704e+11 29358
## - Age                    1 1.1307e+10 4.4719e+11 29393
## - ShotPower              1 2.2847e+10 4.5873e+11 29431
## - Positioning:Composure  1 1.8867e+11 6.2455e+11 29896
summary(wage_model_st_nl_2_step)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Positioning + 
##     Composure + Positioning:Composure, data = train_df_st)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -66274  -5469    260   4921 260121 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           342109.131  17194.232  19.897  < 2e-16 ***
## Age                     -748.254    119.996  -6.236 5.84e-10 ***
## Balance                   77.598     38.841   1.998   0.0459 *  
## ShotPower                706.988     79.759   8.864  < 2e-16 ***
## Positioning            -5818.926    271.028 -21.470  < 2e-16 ***
## Composure              -6958.216    304.237 -22.871  < 2e-16 ***
## Positioning:Composure    109.128      4.284  25.472  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17050 on 1499 degrees of freedom
## Multiple R-squared:  0.5028, Adjusted R-squared:  0.5008 
## F-statistic: 252.7 on 6 and 1499 DF,  p-value: < 2.2e-16

6.1 Model evaluation

Predict the training and validation sets using the stepwise, non-linear model. Check the accuracy.

wage_model_st_nl_2_step_pred_train <- predict(wage_model_st_nl_2_step,
                                        train_df_st)

accuracy(wage_model_st_nl_2_step_pred_train, train_df_st$Wage)
##                    ME     RMSE      MAE       MPE     MAPE
## Test set 1.982657e-09 17012.65 8519.261 -27.09032 105.1927
wage_model_st_nl_2_step_pred_valid <- predict(wage_model_st_nl_2_step,
                                        valid_df_st)

accuracy(wage_model_st_nl_2_step_pred_valid, valid_df_st$Wage)
##                 ME     RMSE      MAE       MPE     MAPE
## Test set -852.6068 13509.38 8763.627 -32.34321 113.7652

Check normality, residuals, multicollinearity, heteroskedasticity/homoskedasticity.

Normality.

shapiro.test(train_df_st$Wage)
## 
##  Shapiro-Wilk normality test
## 
## data:  train_df_st$Wage
## W = 0.36328, p-value < 2.2e-16

Residuals.

wage_model_st_nl_2_step_residuals <- rstandard(wage_model_st_nl_2_step)
train_df_st_comb_4 <- cbind(train_df_st, wage_model_st_nl_2_step_residuals)
head(train_df_st_comb_4)
##      Age Balance ShotPower Aggression Positioning Composure  Wage
## 1598  33      63        75         62          73        73 16730
## 638   20      67        70         36          52        52  4190
## 907   18      68        53         57          53        51  5843
## 873   23      55        59         44          58        47  5704
## 652   29      72        62         56          61        55  4414
## 1697  26      68        62         53          76        64 17288
##      wage_model_st_nl_2_step_residuals
## 1598                        -0.4351998
## 638                         -0.4890199
## 907                          0.1618064
## 873                          0.1101748
## 652                          0.3592566
## 1697                         0.1346568

Check residuals for predictors too.

# Standardised residuals of the stepwise interaction model plotted against
# the observed training wages. Axis label and title use the correct spelling
# "Standardised".
ggplot(train_df_st_comb_4) + aes(x = Wage, y = wage_model_st_nl_2_step_residuals) +
  geom_point() + xlab("Wage") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Wage Prediction")

Multicollinearity.

Expected, due to interaction term.

vif(wage_model_st_nl_2_step)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
##                   Age               Balance             ShotPower 
##              1.627768              1.030962              2.751958 
##           Positioning             Composure Positioning:Composure 
##             32.336813             48.656781            128.868527

Heteroskedasticity/Homoskedasticity.

bptest(wage_model_st_nl_2_step)
## 
##  studentized Breusch-Pagan test
## 
## data:  wage_model_st_nl_2_step
## BP = 284.9, df = 6, p-value < 2.2e-16

6.2 Prediction

Predict the wages of new players using the stepwise non-linear model.

wage_model_st_nl_2_step_pred_new <- predict(wage_model_st_nl_2_step,
                                        newdata = new, interval = "confidence")

wage_model_st_nl_2_step_pred_new
##        fit       lwr      upr
## 1 14529.97 12437.349 16622.59
## 2 12274.73  9968.014 14581.45
## 3 17001.40 15597.748 18405.04

6.3 Variation

wage_model_st_nl_3_step <- step(wage_model_st_nl_3,
                              direction = "both")
## Start:  AIC=29812.43
## Wage ~ Age + Balance + ShotPower + Aggression + I(Positioning * 
##     Composure)
## 
##                              Df  Sum of Sq        RSS   AIC
## - Aggression                  1 2.2629e+06 5.9096e+11 29810
## <none>                                     5.9096e+11 29812
## - Balance                     1 2.1828e+09 5.9314e+11 29816
## - ShotPower                   1 6.5994e+09 5.9756e+11 29827
## - Age                         1 2.5177e+10 6.1614e+11 29873
## - I(Positioning * Composure)  1 9.4958e+10 6.8592e+11 30035
## 
## Step:  AIC=29810.43
## Wage ~ Age + Balance + ShotPower + I(Positioning * Composure)
## 
##                              Df  Sum of Sq        RSS   AIC
## <none>                                     5.9096e+11 29810
## + Aggression                  1 2.2629e+06 5.9096e+11 29812
## - Balance                     1 2.2151e+09 5.9318e+11 29814
## - ShotPower                   1 6.9513e+09 5.9791e+11 29826
## - Age                         1 2.6501e+10 6.1746e+11 29875
## - I(Positioning * Composure)  1 9.6036e+10 6.8700e+11 30035
summary(wage_model_st_nl_3_step)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + I(Positioning * 
##     Composure), data = train_df_st)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -36310  -8221  -1293   5571 333882 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -3.638e+04  5.639e+03  -6.451 1.49e-10 ***
## Age                        -1.134e+03  1.382e+02  -8.204 4.94e-16 ***
## Balance                     1.071e+02  4.517e+01   2.372   0.0178 *  
## ShotPower                   3.784e+02  9.006e+01   4.202 2.80e-05 ***
## I(Positioning * Composure)  1.153e+01  7.384e-01  15.618  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19840 on 1501 degrees of freedom
## Multiple R-squared:  0.3259, Adjusted R-squared:  0.3241 
## F-statistic: 181.5 on 4 and 1501 DF,  p-value: < 2.2e-16
vif(wage_model_st_nl_3_step)
##                        Age                    Balance 
##                   1.595555                   1.029752 
##                  ShotPower I(Positioning * Composure) 
##                   2.591612                   2.827630

7. Log Variables

Sometimes, the data need to be transformed. A common transformation is the log transformation.

# Histogram of wages drawn on a log10 x-axis. The scale transformation is
# applied before binning, so binwidth = 0.01 is measured in log10 units.
ggplot(football_st_2, aes(x = Wage)) +
  geom_histogram(binwidth = 0.01) +
  scale_x_log10() +
  labs(y = "Count", title = "Distribution of log(wage) (strikers)")

# Log-log specification: the response and every predictor enter the model on
# the natural-log scale, fitted on the training rows.
wage_model_st_log_2 <- lm(log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) +
                          log(Aggression) + log(Positioning) + log(Composure),
                    data = train_df_st)
summary(wage_model_st_log_2)
## 
## Call:
## lm(formula = log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) + 
##     log(Aggression) + log(Positioning) + log(Composure), data = train_df_st)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.55406 -0.38093 -0.03289  0.35960  2.32610 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -9.67383    0.59662 -16.214  < 2e-16 ***
## log(Age)         -0.69453    0.10718  -6.480 1.24e-10 ***
## log(Balance)      0.28514    0.07870   3.623 0.000301 ***
## log(ShotPower)    1.49775    0.17843   8.394  < 2e-16 ***
## log(Aggression)   0.07195    0.05938   1.212 0.225846    
## log(Positioning)  1.87362    0.19243   9.737  < 2e-16 ***
## log(Composure)    1.31511    0.15334   8.577  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5945 on 1499 degrees of freedom
## Multiple R-squared:  0.4985, Adjusted R-squared:  0.4965 
## F-statistic: 248.3 on 6 and 1499 DF,  p-value: < 2.2e-16

7.1 Model evaluation

Predict the training and validation sets. Check the accuracy.

# Predictions from the log model are on the log(Wage) scale.
wage_model_st_log_2_pred_train <- predict(wage_model_st_log_2,
                                   train_df_st)

# Store log(Wage) alongside the training data so errors are measured on the
# same scale as the predictions. Note that this adds a column to train_df_st.
train_df_st$logWage <- log(train_df_st$Wage)


# Error metrics on the log scale (predictions vs. observed log wages).
accuracy(wage_model_st_log_2_pred_train, train_df_st$logWage)
##                    ME      RMSE       MAE        MPE     MAPE
## Test set 4.312485e-14 0.5931541 0.4587753 -0.4038503 5.089574
wage_model_st_log_2_pred_valid <- predict(wage_model_st_log_2,
                                   valid_df_st)
valid_df_st$logWage <- log(valid_df_st$Wage)

accuracy(wage_model_st_log_2_pred_valid, valid_df_st$logWage)
##                    ME      RMSE       MAE        MPE     MAPE
## Test set 0.0002915404 0.5808893 0.4554191 -0.3779982 5.050841

Normality.

# Shapiro-Wilk test on the raw (untransformed) training wages.
# NOTE(review): this section's model is fitted to log(Wage); testing the
# model residuals or train_df_st$logWage would match the model being checked.
shapiro.test(train_df_st$Wage)
## 
##  Shapiro-Wilk normality test
## 
## data:  train_df_st$Wage
## W = 0.36328, p-value < 2.2e-16

Residuals.

wage_model_st_log_2_residuals <- rstandard(wage_model_st_log_2)
head(wage_model_st_log_2_residuals)
##       1598        638        907        873        652       1697 
##  0.3389765 -0.5477587  0.5128142  0.5174893 -0.4341683  0.7426560
train_df_st_comb_5 <- cbind(train_df_st, wage_model_st_log_2_residuals)
head(train_df_st_comb_5)
##      Age Balance ShotPower Aggression Positioning Composure  Wage  logWage
## 1598  33      63        75         62          73        73 16730 9.724959
## 638   20      67        70         36          52        52  4190 8.340456
## 907   18      68        53         57          53        51  5843 8.673000
## 873   23      55        59         44          58        47  5704 8.648923
## 652   29      72        62         56          61        55  4414 8.392537
## 1697  26      68        62         53          76        64 17288 9.757768
##      wage_model_st_log_2_residuals
## 1598                     0.3389765
## 638                     -0.5477587
## 907                      0.5128142
## 873                      0.5174893
## 652                     -0.4341683
## 1697                     0.7426560

Check residuals for predictors too.

# Standardised residuals of the log model plotted against the raw
# (untransformed) training wages. Axis label and title use the correct
# spelling "Standardised".
ggplot(train_df_st_comb_5) + aes(x = Wage, y = wage_model_st_log_2_residuals) +
  geom_point() + xlab("Wage") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Wage Prediction")

Multicollinearity.

vif(wage_model_st_log_2)
##         log(Age)     log(Balance)   log(ShotPower)  log(Aggression) 
##         1.763983         1.035522         2.828829         1.603185 
## log(Positioning)   log(Composure) 
##         3.343780         2.936552

Heteroskedasticity/Homoskedasticity.

bptest(wage_model_st_log_2)
## 
##  studentized Breusch-Pagan test
## 
## data:  wage_model_st_log_2
## BP = 122.58, df = 6, p-value < 2.2e-16

7.2 Prediction

Predict new records

More and more new data

# Load the new players to be scored; the first row of the file holds the
# column names. The X column shown in the printout looks like a saved row index.
new3 <- read.csv(file = "new3.csv", header = TRUE)
print(new3)
##   X Age Balance ShotPower Aggression Positioning Composure
## 1 1  25      66        69         55          72        71
## 2 2  26      58        76         75          66        66
## 3 3  19      80        67         33          43        52
# Score the new records with the fitted model, returning the fitted value plus
# lower/upper interval bounds for each row (values are on the model's response
# scale and are back-transformed further below).
# NOTE(review): interval = "confidence" bounds the mean response; for the wage
# of an individual new player, interval = "prediction" is usually the relevant
# (wider) interval -- confirm which is intended.
wage_model_st_log_2_pred_new3 <- predict(
  object = wage_model_st_log_2,
  newdata = new3,
  interval = "confidence"
)

print(wage_model_st_log_2_pred_new3)
##        fit      lwr      upr
## 1 9.533908 9.484214 9.583602
## 2 9.377799 9.315354 9.440244
## 3 8.323197 8.170023 8.476371

Results as a data frame (if desired), then back-transformed from the log scale to wage units.

# Convert the predict() result (columns fit, lwr, upr) into a data frame.
wage_model_st_log_2_pred_new3_df <- as.data.frame(wage_model_st_log_2_pred_new3)

# Back-transform every column from the log scale to wage units. exp() is
# applied directly instead of the roundabout exp(1)^x; the values are the same.
# NOTE(review): exponentiating a log-scale prediction estimates the median
# wage rather than the mean -- confirm that is the intended quantity.
wage_model_st_log_2_pred_new3_df_value <- exp(wage_model_st_log_2_pred_new3_df)

wage_model_st_log_2_pred_new3_df_value
##         fit       lwr       upr
## 1 13820.495 13150.482 14524.645
## 2 11822.968 11107.261 12584.791
## 3  4118.303  3533.423  4799.997

8. Combined

We can combine different tweaks: a log-transformed response and predictor together with a product term of two predictors.

# Combined model: log-transformed response and log-transformed Age, plus one
# regressor built from the arithmetic product Positioning * Composure. I()
# makes `*` plain multiplication instead of formula crossing, so no separate
# main effects for Positioning or Composure are added.
wage_model_st_log_3 <- lm(log(Wage) ~ log(Age) + I(Positioning * Composure),
                    data = train_df_st)
# Coefficient table and overall fit statistics.
summary(wage_model_st_log_3)
## 
## Call:
## lm(formula = log(Wage) ~ log(Age) + I(Positioning * Composure), 
##     data = train_df_st)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.73661 -0.35786 -0.00011  0.35532  2.05304 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 8.666e+00  2.793e-01   31.03  < 2e-16 ***
## log(Age)                   -6.139e-01  9.807e-02   -6.26 5.02e-10 ***
## I(Positioning * Composure)  5.760e-04  1.599e-05   36.02  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.577 on 1503 degrees of freedom
## Multiple R-squared:  0.5264, Adjusted R-squared:  0.5258 
## F-statistic: 835.2 on 2 and 1503 DF,  p-value: < 2.2e-16

8.1 Model evaluation

Predict on the training and validation sets, then check the accuracy on the log scale.

# In-sample predictions for the training rows (on the log-wage scale, since the
# model's response is log(Wage)).
wage_model_st_log_3_pred_train <- predict(
  wage_model_st_log_3,
  newdata = train_df_st
)

# Observed log wages, so predictions and actuals are compared on one scale.
train_df_st[["logWage"]] <- log(train_df_st[["Wage"]])

# Error metrics: predictions first, observed values second.
accuracy(wage_model_st_log_3_pred_train, train_df_st[["logWage"]])
##                    ME      RMSE       MAE       MPE     MAPE
## Test set 3.847045e-14 0.5764289 0.4423697 -0.388299 4.908134
# Out-of-sample predictions for the validation rows (log-wage scale).
wage_model_st_log_3_pred_valid <- predict(
  wage_model_st_log_3,
  newdata = valid_df_st
)

# Observed log wages for the validation rows.
valid_df_st[["logWage"]] <- log(valid_df_st[["Wage"]])

# Error metrics: predictions first, observed values second.
accuracy(wage_model_st_log_3_pred_valid, valid_df_st[["logWage"]])
##                   ME      RMSE       MAE        MPE    MAPE
## Test set 0.002845158 0.5841898 0.4527094 -0.3440159 5.01038

Normality.

# Shapiro-Wilk normality test applied to the log-transformed wage column.
# NOTE(review): the regression normality assumption concerns the model's
# residuals rather than the response -- consider testing
# rstandard(wage_model_st_log_3) instead; confirm intent.
shapiro.test(train_df_st$logWage)
## 
##  Shapiro-Wilk normality test
## 
## data:  train_df_st$logWage
## W = 0.929, p-value < 2.2e-16

Residuals.

# Standardised residuals of the combined model, one per training row.
wage_model_st_log_3_residuals <- rstandard(model = wage_model_st_log_3)
head(x = wage_model_st_log_3_residuals, n = 6L)
##        1598         638         907         873         652        1697 
##  0.23518684 -0.07693547  0.38879657  0.58491840 -0.24147460  0.50256918
# Attach the residuals to the training data as an extra column so they can be
# plotted against the other variables. The binding is positional (row order).
train_df_st_comb_6 <- cbind(
  train_df_st,
  wage_model_st_log_3_residuals = wage_model_st_log_3_residuals
)
head(x = train_df_st_comb_6, n = 6L)
##      Age Balance ShotPower Aggression Positioning Composure  Wage  logWage
## 1598  33      63        75         62          73        73 16730 9.724959
## 638   20      67        70         36          52        52  4190 8.340456
## 907   18      68        53         57          53        51  5843 8.673000
## 873   23      55        59         44          58        47  5704 8.648923
## 652   29      72        62         56          61        55  4414 8.392537
## 1697  26      68        62         53          76        64 17288 9.757768
##      wage_model_st_log_3_residuals
## 1598                    0.23518684
## 638                    -0.07693547
## 907                     0.38879657
## 873                     0.58491840
## 652                    -0.24147460
## 1697                    0.50256918

Check residuals for predictors too.

# Scatter plot of the standardised residuals of wage_model_st_log_3 against the
# observed wage of each training row. Axis label and title spelling corrected
# ("Standardised").
# NOTE(review): residuals are correlated with the observed response by
# construction; plotting against fitted values or individual predictors is the
# more usual diagnostic -- confirm which view is intended.
ggplot(
  train_df_st_comb_6,
  aes(x = Wage, y = wage_model_st_log_3_residuals)
) +
  geom_point() +
  labs(
    x = "Wage",
    y = "Standardised Residuals",
    title = "Standardised Residual Plot, Wage Prediction"
  )

Multicollinearity.

# Variance inflation factor for each term of the combined model; with only two
# terms both factors are equal (about 1.57 in the printout below).
vif(wage_model_st_log_3)
##                   log(Age) I(Positioning * Composure) 
##                   1.568088                   1.568088

Heteroskedasticity/Homoskedasticity.

# Studentized Breusch-Pagan test on the combined model. The p-value printed
# below is < 2.2e-16, so constant residual variance is rejected.
# NOTE(review): with heteroskedastic residuals the usual lm standard errors and
# intervals may be unreliable -- consider robust standard errors.
bptest(wage_model_st_log_3)
## 
##  studentized Breusch-Pagan test
## 
## data:  wage_model_st_log_3
## BP = 148.44, df = 2, p-value < 2.2e-16

8.2 Prediction

Predict new records

More and more new data

# Reload the new players to be scored (the same file read in section 7.2), so
# this section can be run on its own. The first row holds the column names.
new3 <- read.csv(file = "new3.csv", header = TRUE)
print(new3)
##   X Age Balance ShotPower Aggression Positioning Composure
## 1 1  25      66        69         55          72        71
## 2 2  26      58        76         75          66        66
## 3 3  19      80        67         33          43        52
# Score the new records with the combined model, returning the fitted value
# plus lower/upper interval bounds for each row (log-wage scale, since the
# model's response is log(Wage)).
# NOTE(review): interval = "confidence" bounds the mean response; for the wage
# of an individual new player, interval = "prediction" is usually the relevant
# (wider) interval -- confirm which is intended.
wage_model_st_log_3_pred_new3 <- predict(
  object = wage_model_st_log_3,
  newdata = new3,
  interval = "confidence"
)

print(wage_model_st_log_3_pred_new3)
##        fit      lwr      upr
## 1 9.634869 9.590987 9.678752
## 2 9.175323 9.143823 9.206824
## 3 8.146727 8.092877 8.200578

Results as a data frame (if desired), then back-transformed from the log scale to wage units.

# Convert the predict() result (columns fit, lwr, upr) into a data frame.
wage_model_st_log_3_pred_new3_df <- as.data.frame(wage_model_st_log_3_pred_new3)

# Back-transform every column from the log scale to wage units. exp() is
# applied directly instead of the roundabout exp(1)^x; the values are the same.
# NOTE(review): exponentiating a log-scale prediction estimates the median
# wage rather than the mean -- confirm that is the intended quantity.
wage_model_st_log_3_pred_new3_df_value <- exp(wage_model_st_log_3_pred_new3_df)

wage_model_st_log_3_pred_new3_df_value
##         fit       lwr       upr
## 1 15288.703 14632.298 15974.554
## 2  9655.891  9356.468  9964.895
## 3  3452.064  3271.086  3643.054