Directions

Regression models to predict the wages of football players.

Data for demo

Back to the spellbook

1. Load Data

Load the data and take a first look at it.

# Read the raw file without a header row, so the CSV's own header line
# arrives as the first data row (V1, V2, ... are used as column names).
football <- read.csv(file = "football_2.csv", header = FALSE)
head(football, n = 10)
##        V1            V2  V3                                             V4
## 1      ID          Name Age                                          Photo
## 2  207439    L. Paredes  24 https://cdn.sofifa.org/players/4/19/207439.png
## 3  156713  A. Granqvist  33 https://cdn.sofifa.org/players/4/19/156713.png
## 4  229909      A. Lunev  26 https://cdn.sofifa.org/players/4/19/229909.png
## 5  187347  I. Smolnikov  29 https://cdn.sofifa.org/players/4/19/187347.png
## 6  153260        Hilton  40 https://cdn.sofifa.org/players/4/19/153260.png
## 7  187607     A. Dzyuba  29 https://cdn.sofifa.org/players/4/19/187607.png
## 8  204341 Lu\xcc_s Neto  30 https://cdn.sofifa.org/players/4/19/204341.png
## 9  223058    D. Kuzyaev  25 https://cdn.sofifa.org/players/4/19/223058.png
## 10 183389        G. Sio  29 https://cdn.sofifa.org/players/4/19/183389.png
##             V5                                   V6      V7        V8
## 1  Nationality                                 Flag Overall Potential
## 2    Argentina  https://cdn.sofifa.org/flags/52.png      80        85
## 3       Sweden  https://cdn.sofifa.org/flags/46.png      80        80
## 4       Russia  https://cdn.sofifa.org/flags/40.png      79        81
## 5       Russia  https://cdn.sofifa.org/flags/40.png      79        79
## 6       Brazil  https://cdn.sofifa.org/flags/54.png      78        78
## 7       Russia  https://cdn.sofifa.org/flags/40.png      78        78
## 8     Portugal  https://cdn.sofifa.org/flags/38.png      77        77
## 9       Russia  https://cdn.sofifa.org/flags/40.png      77        80
## 10 Ivory Coast https://cdn.sofifa.org/flags/108.png      77        77
##                 V9                                         V10   V11   V12
## 1             Club                                   Club Logo Value  Wage
## 2                          https://cdn.sofifa.org/flags/52.png  5684  1602
## 3                          https://cdn.sofifa.org/flags/46.png  6370  3591
## 4                          https://cdn.sofifa.org/flags/40.png  5675  3672
## 5                          https://cdn.sofifa.org/flags/40.png  6030  1448
## 6  Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png  6405 19799
## 7                          https://cdn.sofifa.org/flags/40.png  5764  1105
## 8                          https://cdn.sofifa.org/flags/38.png  6075  2836
## 9                          https://cdn.sofifa.org/flags/40.png  5565  2653
## 10                        https://cdn.sofifa.org/flags/108.png  5275  2138
##        V13            V14                      V15       V16         V17
## 1  Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2     2122          Right                        2         4           4
## 3     1797          Right                        2         4           2
## 4     1217          Right                        1         3           1
## 5     2038          Right                        2         3           3
## 6     1807          Right                        2         3           3
## 7     1810          Right                        2         3           3
## 8     1749          Right                        1         3           2
## 9     2041          Right                        1         3           3
## 10    1933           Left                        2         3           3
##               V18       V19       V20      V21           V22      V23
## 1       Work Rate Body Type Real Face Position Jersey Number   Joined
## 2  Medium/ Medium    Normal        No       CM             5         
## 3    High/ Medium    Normal        No      LCB             4         
## 4  Medium/ Medium    Normal        No       GK            12         
## 5      High/ High      Lean        No       RB             2         
## 6  Medium/ Medium    Normal       Yes       CB             4 1-Aug-11
## 7    High/ Medium    Stocky        No       ST            22         
## 8  Medium/ Medium      Lean        No       CB             4         
## 9    Medium/ High      Lean        No       RM             7         
## 10      High/ Low    Normal        No       ST            21         
##            V24                  V25    V26    V27  V28  V29  V30  V31  V32  V33
## 1  Loaned From Contract Valid Until Height Weight   LS   ST   RS   LW   LF   CF
## 2                                     5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3                                      6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4                                      6'2 176lbs                              
## 5                                     5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6                              2019   5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
## 7                                      6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2
## 8                                      6'2 157lbs 52+2 52+2 52+2 51+2 51+2 51+2
## 9                                      6'0 163lbs 70+2 70+2 70+2 74+2 74+2 74+2
## 10                                    5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2
##     V34  V35  V36  V37  V38  V39  V40  V41  V42  V43  V44  V45  V46  V47  V48
## 1    RF   RW  LAM  CAM  RAM   LM  LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB
## 2  75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3  58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4                                                                            
## 5  72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6  59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
## 7  74+2 71+2 71+2 71+2 71+2 71+2 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2
## 8  51+2 51+2 54+2 54+2 54+2 54+2 61+2 61+2 61+2 54+2 67+2 72+2 72+2 72+2 67+2
## 9  74+2 74+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2
## 10 75+2 75+2 74+2 74+2 74+2 74+2 67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2
##     V49  V50  V51  V52  V53      V54       V55             V56          V57
## 1    LB  LCB   CB  RCB   RB Crossing Finishing HeadingAccuracy ShortPassing
## 2  74+2 72+2 72+2 72+2 74+2       76        55              60           84
## 3  70+2 79+2 79+2 79+2 70+2       49        51              81           73
## 4                                 16        14              17           25
## 5  78+2 73+2 73+2 73+2 78+2       73        61              69           79
## 6  68+2 76+2 76+2 76+2 68+2       60        45              79           73
## 7  48+2 48+2 48+2 48+2 48+2       61        79              86           71
## 8  69+2 75+2 75+2 75+2 69+2       42        33              80           72
## 9  74+2 70+2 70+2 70+2 74+2       67        64              51           82
## 10 50+2 46+2 46+2 46+2 50+2       68        77              71           73
##        V58       V59   V60        V61         V62         V63          V64
## 1  Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2       73        78    79         78          82          82           75
## 3       37        49    36         40          67          63           46
## 4       13        15    18         17          32          17           58
## 5       57        72    49         46          75          72           84
## 6       51        63    42         48          72          73           33
## 7       74        71    64         60          55          77           66
## 8       40        49    52         43          77          48           57
## 9       57        78    60         61          75          79           78
## 10      73        76    73         69          67          76           78
##            V65     V66       V67     V68       V69     V70     V71      V72
## 1  SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2           69      77        74      77        82      61      79       69
## 3           49      55        76      36        74      64      67       83
## 4           54      36        76      50        24      60      27       70
## 5           90      80        75      76        67      85      93       68
## 6           38      51        70      60        55      79      54       76
## 7           65      50        75      32        78      63      77       93
## 8           59      69        78      61        42      79      72       72
## 9           81      80        73      76        76      60      79       59
## 10          85      79        71      73        77      70      78       74
##          V73        V74           V75         V76    V77       V78       V79
## 1  LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2         80         79            72          74     82        57        74
## 3         59         81            82          54     49        79        78
## 4         13         26            20          11     63        15        69
## 5         57         65            71          77     72        41        73
## 6         58         76            79          50     67        64        70
## 7         68         75            30          78     73        77        70
## 8         37         76            78          44     46        47        72
## 9         74         70            74          71     70        63        64
## 10        74         77            18          76     73        72        72
##        V80            V81           V82      V83        V84       V85
## 1  Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2       73             75            72        9         14         6
## 3       82             83            79        7          9        12
## 4       18             20            12       80         73        65
## 5       76             76            80        7         12        10
## 6       83             77            76       12          7        11
## 7       21             15            19       15         12        11
## 8       80             77            78       10         15        13
## 9       71             77            76       15         16        13
## 10      40             18            12       15          9        10
##              V86        V87            V88
## 1  GKPositioning GKReflexes Release Clause
## 2              9         10               
## 3             10         15               
## 4             77         85               
## 5              8         15               
## 6             12         13               
## 7             11          8               
## 8             15          8               
## 9              7          8               
## 10            15         16
# Promote the first row (the CSV's header line, read in as data) to column
# names. vapply() extracts it as a plain character vector, so the result
# does not depend on implicit coercion of a one-row data frame and stays
# correct even if a column was read as a factor.
names(football) <- vapply(
  football[1, ], as.character, character(1), USE.NAMES = FALSE
)
head(football)
##       ID         Name Age                                          Photo
## 1     ID         Name Age                                          Photo
## 2 207439   L. Paredes  24 https://cdn.sofifa.org/players/4/19/207439.png
## 3 156713 A. Granqvist  33 https://cdn.sofifa.org/players/4/19/156713.png
## 4 229909     A. Lunev  26 https://cdn.sofifa.org/players/4/19/229909.png
## 5 187347 I. Smolnikov  29 https://cdn.sofifa.org/players/4/19/187347.png
## 6 153260       Hilton  40 https://cdn.sofifa.org/players/4/19/153260.png
##   Nationality                                Flag Overall Potential
## 1 Nationality                                Flag Overall Potential
## 2   Argentina https://cdn.sofifa.org/flags/52.png      80        85
## 3      Sweden https://cdn.sofifa.org/flags/46.png      80        80
## 4      Russia https://cdn.sofifa.org/flags/40.png      79        81
## 5      Russia https://cdn.sofifa.org/flags/40.png      79        79
## 6      Brazil https://cdn.sofifa.org/flags/54.png      78        78
##              Club                                   Club Logo Value  Wage
## 1            Club                                   Club Logo Value  Wage
## 2                         https://cdn.sofifa.org/flags/52.png  5684  1602
## 3                         https://cdn.sofifa.org/flags/46.png  6370  3591
## 4                         https://cdn.sofifa.org/flags/40.png  5675  3672
## 5                         https://cdn.sofifa.org/flags/40.png  6030  1448
## 6 Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png  6405 19799
##   Special Preferred Foot International Reputation Weak Foot Skill Moves
## 1 Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2    2122          Right                        2         4           4
## 3    1797          Right                        2         4           2
## 4    1217          Right                        1         3           1
## 5    2038          Right                        2         3           3
## 6    1807          Right                        2         3           3
##        Work Rate Body Type Real Face Position Jersey Number   Joined
## 1      Work Rate Body Type Real Face Position Jersey Number   Joined
## 2 Medium/ Medium    Normal        No       CM             5         
## 3   High/ Medium    Normal        No      LCB             4         
## 4 Medium/ Medium    Normal        No       GK            12         
## 5     High/ High      Lean        No       RB             2         
## 6 Medium/ Medium    Normal       Yes       CB             4 1-Aug-11
##   Loaned From Contract Valid Until Height Weight   LS   ST   RS   LW   LF   CF
## 1 Loaned From Contract Valid Until Height Weight   LS   ST   RS   LW   LF   CF
## 2                                    5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3                                     6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4                                     6'2 176lbs                              
## 5                                    5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6                             2019   5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
##     RF   RW  LAM  CAM  RAM   LM  LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB
## 1   RF   RW  LAM  CAM  RAM   LM  LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB
## 2 75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3 58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4                                                                           
## 5 72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6 59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
##     LB  LCB   CB  RCB   RB Crossing Finishing HeadingAccuracy ShortPassing
## 1   LB  LCB   CB  RCB   RB Crossing Finishing HeadingAccuracy ShortPassing
## 2 74+2 72+2 72+2 72+2 74+2       76        55              60           84
## 3 70+2 79+2 79+2 79+2 70+2       49        51              81           73
## 4                                16        14              17           25
## 5 78+2 73+2 73+2 73+2 78+2       73        61              69           79
## 6 68+2 76+2 76+2 76+2 68+2       60        45              79           73
##   Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 1 Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2      73        78    79         78          82          82           75
## 3      37        49    36         40          67          63           46
## 4      13        15    18         17          32          17           58
## 5      57        72    49         46          75          72           84
## 6      51        63    42         48          72          73           33
##   SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 1 SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2          69      77        74      77        82      61      79       69
## 3          49      55        76      36        74      64      67       83
## 4          54      36        76      50        24      60      27       70
## 5          90      80        75      76        67      85      93       68
## 6          38      51        70      60        55      79      54       76
##   LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 1 LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2        80         79            72          74     82        57        74
## 3        59         81            82          54     49        79        78
## 4        13         26            20          11     63        15        69
## 5        57         65            71          77     72        41        73
## 6        58         76            79          50     67        64        70
##   Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 1 Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2      73             75            72        9         14         6
## 3      82             83            79        7          9        12
## 4      18             20            12       80         73        65
## 5      76             76            80        7         12        10
## 6      83             77            76       12          7        11
##   GKPositioning GKReflexes Release Clause
## 1 GKPositioning GKReflexes Release Clause
## 2             9         10               
## 3            10         15               
## 4            77         85               
## 5             8         15               
## 6            12         13
# Remove row 1, which only repeats the column names.
football <- football[-1, ]
head(football)
##       ID         Name Age                                          Photo
## 2 207439   L. Paredes  24 https://cdn.sofifa.org/players/4/19/207439.png
## 3 156713 A. Granqvist  33 https://cdn.sofifa.org/players/4/19/156713.png
## 4 229909     A. Lunev  26 https://cdn.sofifa.org/players/4/19/229909.png
## 5 187347 I. Smolnikov  29 https://cdn.sofifa.org/players/4/19/187347.png
## 6 153260       Hilton  40 https://cdn.sofifa.org/players/4/19/153260.png
## 7 187607    A. Dzyuba  29 https://cdn.sofifa.org/players/4/19/187607.png
##   Nationality                                Flag Overall Potential
## 2   Argentina https://cdn.sofifa.org/flags/52.png      80        85
## 3      Sweden https://cdn.sofifa.org/flags/46.png      80        80
## 4      Russia https://cdn.sofifa.org/flags/40.png      79        81
## 5      Russia https://cdn.sofifa.org/flags/40.png      79        79
## 6      Brazil https://cdn.sofifa.org/flags/54.png      78        78
## 7      Russia https://cdn.sofifa.org/flags/40.png      78        78
##              Club                                   Club Logo Value  Wage
## 2                         https://cdn.sofifa.org/flags/52.png  5684  1602
## 3                         https://cdn.sofifa.org/flags/46.png  6370  3591
## 4                         https://cdn.sofifa.org/flags/40.png  5675  3672
## 5                         https://cdn.sofifa.org/flags/40.png  6030  1448
## 6 Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png  6405 19799
## 7                         https://cdn.sofifa.org/flags/40.png  5764  1105
##   Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2    2122          Right                        2         4           4
## 3    1797          Right                        2         4           2
## 4    1217          Right                        1         3           1
## 5    2038          Right                        2         3           3
## 6    1807          Right                        2         3           3
## 7    1810          Right                        2         3           3
##        Work Rate Body Type Real Face Position Jersey Number   Joined
## 2 Medium/ Medium    Normal        No       CM             5         
## 3   High/ Medium    Normal        No      LCB             4         
## 4 Medium/ Medium    Normal        No       GK            12         
## 5     High/ High      Lean        No       RB             2         
## 6 Medium/ Medium    Normal       Yes       CB             4 1-Aug-11
## 7   High/ Medium    Stocky        No       ST            22         
##   Loaned From Contract Valid Until Height Weight   LS   ST   RS   LW   LF   CF
## 2                                    5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3                                     6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4                                     6'2 176lbs                              
## 5                                    5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6                             2019   5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
## 7                                     6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2
##     RF   RW  LAM  CAM  RAM   LM  LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB
## 2 75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3 58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4                                                                           
## 5 72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6 59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
## 7 74+2 71+2 71+2 71+2 71+2 71+2 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2
##     LB  LCB   CB  RCB   RB Crossing Finishing HeadingAccuracy ShortPassing
## 2 74+2 72+2 72+2 72+2 74+2       76        55              60           84
## 3 70+2 79+2 79+2 79+2 70+2       49        51              81           73
## 4                                16        14              17           25
## 5 78+2 73+2 73+2 73+2 78+2       73        61              69           79
## 6 68+2 76+2 76+2 76+2 68+2       60        45              79           73
## 7 48+2 48+2 48+2 48+2 48+2       61        79              86           71
##   Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2      73        78    79         78          82          82           75
## 3      37        49    36         40          67          63           46
## 4      13        15    18         17          32          17           58
## 5      57        72    49         46          75          72           84
## 6      51        63    42         48          72          73           33
## 7      74        71    64         60          55          77           66
##   SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2          69      77        74      77        82      61      79       69
## 3          49      55        76      36        74      64      67       83
## 4          54      36        76      50        24      60      27       70
## 5          90      80        75      76        67      85      93       68
## 6          38      51        70      60        55      79      54       76
## 7          65      50        75      32        78      63      77       93
##   LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2        80         79            72          74     82        57        74
## 3        59         81            82          54     49        79        78
## 4        13         26            20          11     63        15        69
## 5        57         65            71          77     72        41        73
## 6        58         76            79          50     67        64        70
## 7        68         75            30          78     73        77        70
##   Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2      73             75            72        9         14         6
## 3      82             83            79        7          9        12
## 4      18             20            12       80         73        65
## 5      76             76            80        7         12        10
## 6      83             77            76       12          7        11
## 7      21             15            19       15         12        11
##   GKPositioning GKReflexes Release Clause
## 2             9         10               
## 3            10         15               
## 4            77         85               
## 5             8         15               
## 6            12         13               
## 7            11          8
# Number of player rows left after removing the header row.
NROW(football)
## [1] 18207
# Frequency of each playing position.
table(football[["Position"]])
## 
##       CAM   CB  CDM   CF   CM   GK  LAM   LB  LCB  LCM  LDM   LF   LM   LS   LW 
##   60  958 1778  948   74 1394 2025   21 1322  648  395  243   15 1095  207  381 
##  LWB  RAM   RB  RCB  RCM  RDM   RF   RM   RS   RW  RWB   ST 
##   78   21 1291  662  391  248   16 1124  203  370   87 2152

2. Scatter Plot

2.1 Filter for strikers

Strikers are defined in the dataset as Position = “ST”.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the rows whose Position is "ST".
football_st <- filter(football, Position == "ST")
head(football_st)
##       ID          Name Age                                          Photo
## 1 187607     A. Dzyuba  29 https://cdn.sofifa.org/players/4/19/187607.png
## 2 183389        G. Sio  29 https://cdn.sofifa.org/players/4/19/183389.png
## 3 245683     K. Fofana  26 https://cdn.sofifa.org/players/4/19/245683.png
## 4 190461 B. Sigur̡arson  27 https://cdn.sofifa.org/players/4/19/190461.png
## 5 225900  J. Sambenito  26 https://cdn.sofifa.org/players/4/19/225900.png
## 6 246405     B. Angulo  22 https://cdn.sofifa.org/players/4/19/246405.png
##   Nationality                                 Flag Overall Potential Club
## 1      Russia  https://cdn.sofifa.org/flags/40.png      78        78     
## 2 Ivory Coast https://cdn.sofifa.org/flags/108.png      77        77     
## 3 Ivory Coast https://cdn.sofifa.org/flags/108.png      75        75     
## 4     Iceland  https://cdn.sofifa.org/flags/24.png      73        74     
## 5    Paraguay  https://cdn.sofifa.org/flags/58.png      71        74     
## 6     Ecuador  https://cdn.sofifa.org/flags/57.png      71        77     
##                              Club Logo Value Wage Special Preferred Foot
## 1  https://cdn.sofifa.org/flags/40.png  5764 1105    1810          Right
## 2 https://cdn.sofifa.org/flags/108.png  5275 2138    1933           Left
## 3 https://cdn.sofifa.org/flags/108.png  5589 3875    1877          Right
## 4  https://cdn.sofifa.org/flags/24.png  5629 3661    1893          Right
## 5  https://cdn.sofifa.org/flags/58.png  6113 2445    1651          Right
## 6  https://cdn.sofifa.org/flags/57.png  5057 2216    1628          Right
##   International Reputation Weak Foot Skill Moves      Work Rate Body Type
## 1                        2         3           3   High/ Medium    Stocky
## 2                        2         3           3      High/ Low    Normal
## 3                        1         3           3 Medium/ Medium    Normal
## 4                        1         4           3     High/ High    Normal
## 5                        1         3           2   High/ Medium      Lean
## 6                        1         4           3      High/ Low    Normal
##   Real Face Position Jersey Number Joined Loaned From Contract Valid Until
## 1        No       ST            22                                        
## 2        No       ST            21                                        
## 3        No       ST            22                                        
## 4        No       ST             9                                        
## 5        No       ST             9                                        
## 6        No       ST            19                                        
##   Height Weight   LS   ST   RS   LW   LF   CF   RF   RW  LAM  CAM  RAM   LM
## 1    6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2 74+2 71+2 71+2 71+2 71+2 71+2
## 2   5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 74+2 74+2 74+2 74+2
## 3    6'2 179lbs 73+2 73+2 73+2 71+2 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2
## 4    6'1 190lbs 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2 70+2 70+2 70+2 71+2
## 5    6'0 190lbs 70+2 70+2 70+2 64+2 67+2 67+2 67+2 64+2 63+2 63+2 63+2 62+2
## 6    6'0 154lbs 70+2 70+2 70+2 67+2 68+2 68+2 68+2 67+2 63+2 63+2 63+2 65+2
##    LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB   LB  LCB   CB  RCB   RB
## 1 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2 48+2 48+2 48+2 48+2 48+2
## 2 67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2 50+2 46+2 46+2 46+2 50+2
## 3 67+2 67+2 67+2 71+2 59+2 57+2 57+2 57+2 59+2 57+2 52+2 52+2 52+2 57+2
## 4 64+2 64+2 64+2 71+2 59+2 55+2 55+2 55+2 59+2 56+2 53+2 53+2 53+2 56+2
## 5 55+2 55+2 55+2 62+2 43+2 41+2 41+2 41+2 43+2 41+2 38+2 38+2 38+2 41+2
## 6 54+2 54+2 54+2 65+2 47+2 39+2 39+2 39+2 47+2 44+2 36+2 36+2 36+2 44+2
##   Crossing Finishing HeadingAccuracy ShortPassing Volleys Dribbling Curve
## 1       61        79              86           71      74        71    64
## 2       68        77              71           73      73        76    73
## 3       66        75              72           74      74        72    63
## 4       66        71              68           68      65        73    63
## 5       40        74              72           57      72        60    64
## 6       50        78              69           56      46        76    58
##   FKAccuracy LongPassing BallControl Acceleration SprintSpeed Agility Reactions
## 1         60          55          77           66          65      50        75
## 2         69          67          76           78          85      79        71
## 3         59          58          75           59          77      63        72
## 4         48          44          73           78          79      83        74
## 5         42          42          63           79          72      61        69
## 6         58          33          71           82          79      78        73
##   Balance ShotPower Jumping Stamina Strength LongShots Aggression Interceptions
## 1      32        78      63      77       93        68         75            30
## 2      73        77      70      78       74        74         77            18
## 3      60        78      69      83       77        73         67            40
## 4      76        68      78      90       85        66         73            42
## 5      64        73      69      67       72        67         49            14
## 6      64        72      69      77       69        54         28            16
##   Positioning Vision Penalties Composure Marking StandingTackle SlidingTackle
## 1          78     73        77        70      21             15            19
## 2          76     73        72        72      40             18            12
## 3          72     69        74        83      23             37            46
## 4          73     64        69        76      31             39            24
## 5          75     60        67        74      15             16            16
## 6          62     45        82        51      11             18            12
##   GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause
## 1       15         12        11            11          8               
## 2       15          9        10            15         16               
## 3        7         11         7            11         14               
## 4        9         12        10            15         16               
## 5       15         16        15             7          7               
## 6       11          8        10             7          6
# Number of rows in the striker subset.
NROW(football_st)
## [1] 2152

2.2 Scatter Plot

Wage and Value were read as character strings, so convert them to numeric before plotting.

str(football_st$Wage)
##  chr [1:2152] "1105" "2138" "3875" "3661" "2445" "2216" "4457" "3370" ...
str(football_st$Value)
##  chr [1:2152] "5764" "5275" "5589" "5629" "6113" "5057" "6561" "6146" ...
# Wage and Value are stored as character vectors (see the str() output),
# so coerce both columns to numeric before plotting or modelling.
football_st[c("Wage", "Value")] <-
  lapply(football_st[c("Wage", "Value")], as.numeric)
library(ggplot2)
library(ggpubr)

# Scatter plot of Value against Wage with a fitted straight line, the
# regression equation, and the Pearson correlation annotated on the panel.
ggplot(football_st, aes(x = Wage, y = Value)) +
  geom_point(shape = 2, colour = "black") +
  geom_smooth(method = lm) +
  stat_regline_equation(label.x = 150000, label.y = 1700) +
  stat_cor(method = "pearson", label.x = 300000, label.y = 1600) +
  labs(x = "Wage", y = "Value", title = "Wage and Value")
## `geom_smooth()` using formula = 'y ~ x'

3. Simple Linear Regression

# Simple linear regression of Value on Wage. The data frame is supplied via
# `data =` instead of writing `football_st$...` inside the formula, so the
# slope is labelled plainly as `Wage` and predict() can accept `newdata`.
value_simple <- lm(Value ~ Wage, data = football_st)
summary(value_simple)
## 
## Call:
## lm(formula = football_st$Value ~ football_st$Wage)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -17073527   -633009   -209153    198333  38355242 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -4.175e+05  7.060e+04  -5.913 3.91e-09 ***
## football_st$Wage  2.179e+02  2.721e+00  80.068  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2838000 on 2150 degrees of freedom
## Multiple R-squared:  0.7489, Adjusted R-squared:  0.7487 
## F-statistic:  6411 on 1 and 2150 DF,  p-value: < 2.2e-16
confint(value_simple, level = 0.95)
##                         2.5 %       97.5 %
## (Intercept)      -555911.3195 -278995.9221
## football_st$Wage     212.5681     223.2422

4. Residuals

value_simple_stdresiduals <- rstandard(value_simple)
head(value_simple_stdresiduals)
##           1           2           3           4           5           6 
##  0.06430004 -0.01520939 -0.14850129 -0.13205208 -0.03849210 -0.02127676

Standardised residuals.

football_st_comb <- cbind(football_st, value_simple_stdresiduals)
head(football_st_comb)
##       ID          Name Age                                          Photo
## 1 187607     A. Dzyuba  29 https://cdn.sofifa.org/players/4/19/187607.png
## 2 183389        G. Sio  29 https://cdn.sofifa.org/players/4/19/183389.png
## 3 245683     K. Fofana  26 https://cdn.sofifa.org/players/4/19/245683.png
## 4 190461 B. Sigur̡arson  27 https://cdn.sofifa.org/players/4/19/190461.png
## 5 225900  J. Sambenito  26 https://cdn.sofifa.org/players/4/19/225900.png
## 6 246405     B. Angulo  22 https://cdn.sofifa.org/players/4/19/246405.png
##   Nationality                                 Flag Overall Potential Club
## 1      Russia  https://cdn.sofifa.org/flags/40.png      78        78     
## 2 Ivory Coast https://cdn.sofifa.org/flags/108.png      77        77     
## 3 Ivory Coast https://cdn.sofifa.org/flags/108.png      75        75     
## 4     Iceland  https://cdn.sofifa.org/flags/24.png      73        74     
## 5    Paraguay  https://cdn.sofifa.org/flags/58.png      71        74     
## 6     Ecuador  https://cdn.sofifa.org/flags/57.png      71        77     
##                              Club Logo Value Wage Special Preferred Foot
## 1  https://cdn.sofifa.org/flags/40.png  5764 1105    1810          Right
## 2 https://cdn.sofifa.org/flags/108.png  5275 2138    1933           Left
## 3 https://cdn.sofifa.org/flags/108.png  5589 3875    1877          Right
## 4  https://cdn.sofifa.org/flags/24.png  5629 3661    1893          Right
## 5  https://cdn.sofifa.org/flags/58.png  6113 2445    1651          Right
## 6  https://cdn.sofifa.org/flags/57.png  5057 2216    1628          Right
##   International Reputation Weak Foot Skill Moves      Work Rate Body Type
## 1                        2         3           3   High/ Medium    Stocky
## 2                        2         3           3      High/ Low    Normal
## 3                        1         3           3 Medium/ Medium    Normal
## 4                        1         4           3     High/ High    Normal
## 5                        1         3           2   High/ Medium      Lean
## 6                        1         4           3      High/ Low    Normal
##   Real Face Position Jersey Number Joined Loaned From Contract Valid Until
## 1        No       ST            22                                        
## 2        No       ST            21                                        
## 3        No       ST            22                                        
## 4        No       ST             9                                        
## 5        No       ST             9                                        
## 6        No       ST            19                                        
##   Height Weight   LS   ST   RS   LW   LF   CF   RF   RW  LAM  CAM  RAM   LM
## 1    6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2 74+2 71+2 71+2 71+2 71+2 71+2
## 2   5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 74+2 74+2 74+2 74+2
## 3    6'2 179lbs 73+2 73+2 73+2 71+2 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2
## 4    6'1 190lbs 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2 70+2 70+2 70+2 71+2
## 5    6'0 190lbs 70+2 70+2 70+2 64+2 67+2 67+2 67+2 64+2 63+2 63+2 63+2 62+2
## 6    6'0 154lbs 70+2 70+2 70+2 67+2 68+2 68+2 68+2 67+2 63+2 63+2 63+2 65+2
##    LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB   LB  LCB   CB  RCB   RB
## 1 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2 48+2 48+2 48+2 48+2 48+2
## 2 67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2 50+2 46+2 46+2 46+2 50+2
## 3 67+2 67+2 67+2 71+2 59+2 57+2 57+2 57+2 59+2 57+2 52+2 52+2 52+2 57+2
## 4 64+2 64+2 64+2 71+2 59+2 55+2 55+2 55+2 59+2 56+2 53+2 53+2 53+2 56+2
## 5 55+2 55+2 55+2 62+2 43+2 41+2 41+2 41+2 43+2 41+2 38+2 38+2 38+2 41+2
## 6 54+2 54+2 54+2 65+2 47+2 39+2 39+2 39+2 47+2 44+2 36+2 36+2 36+2 44+2
##   Crossing Finishing HeadingAccuracy ShortPassing Volleys Dribbling Curve
## 1       61        79              86           71      74        71    64
## 2       68        77              71           73      73        76    73
## 3       66        75              72           74      74        72    63
## 4       66        71              68           68      65        73    63
## 5       40        74              72           57      72        60    64
## 6       50        78              69           56      46        76    58
##   FKAccuracy LongPassing BallControl Acceleration SprintSpeed Agility Reactions
## 1         60          55          77           66          65      50        75
## 2         69          67          76           78          85      79        71
## 3         59          58          75           59          77      63        72
## 4         48          44          73           78          79      83        74
## 5         42          42          63           79          72      61        69
## 6         58          33          71           82          79      78        73
##   Balance ShotPower Jumping Stamina Strength LongShots Aggression Interceptions
## 1      32        78      63      77       93        68         75            30
## 2      73        77      70      78       74        74         77            18
## 3      60        78      69      83       77        73         67            40
## 4      76        68      78      90       85        66         73            42
## 5      64        73      69      67       72        67         49            14
## 6      64        72      69      77       69        54         28            16
##   Positioning Vision Penalties Composure Marking StandingTackle SlidingTackle
## 1          78     73        77        70      21             15            19
## 2          76     73        72        72      40             18            12
## 3          72     69        74        83      23             37            46
## 4          73     64        69        76      31             39            24
## 5          75     60        67        74      15             16            16
## 6          62     45        82        51      11             18            12
##   GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause
## 1       15         12        11            11          8               
## 2       15          9        10            15         16               
## 3        7         11         7            11         14               
## 4        9         12        10            15         16               
## 5       15         16        15             7          7               
## 6       11          8        10             7          6               
##   value_simple_stdresiduals
## 1                0.06430004
## 2               -0.01520939
## 3               -0.14850129
## 4               -0.13205208
## 5               -0.03849210
## 6               -0.02127676

Plot residuals.

# Standardised residuals of the simple model plotted against Value.
# Columns are named bare inside aes(): the mapping is evaluated within the
# data passed to ggplot(), and `data$column` there raises a ggplot2 warning.
ggplot(football_st_comb) +
  aes(x = Value, y = value_simple_stdresiduals) +
  geom_point() +
  xlab("Value") + ylab("Standard Residuals") +
  ggtitle("Wage and Value Prediction, Residuals")
## Warning: Use of `football_st_comb$Value` is discouraged.
## ℹ Use `Value` instead.
## Warning: Use of `football_st_comb$value_simple_stdresiduals` is discouraged.
## ℹ Use `value_simple_stdresiduals` instead.

4.1 Normality

# Histogram of Value using ggplot2's default binning.
ggplot(football_st, aes(x = Value)) +
  geom_histogram() +
  labs(y = "Count", title = "Distribution of Value")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using the Shapiro-Wilk test.

H-0: normal distribution.

H-1: distribution is different from a normal distribution.

shapiro.test(football_st$Value)
## 
##  Shapiro-Wilk normality test
## 
## data:  football_st$Value
## W = 0.37447, p-value < 2.2e-16

4.2 Autocorrelation

This test is mainly relevant to time-series data, so it may not be very applicable here; it is included for illustration.

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
durbinWatsonTest(value_simple) 
##  lag Autocorrelation D-W Statistic p-value
##    1       0.2167301      1.566536       0
##  Alternative hypothesis: rho != 0

5. Multiple Linear Regression

Subset data for simplicity.

# Keep only the six candidate predictors plus the outcome (Wage).
football_st_2 <- football_st[c("Age", "Balance", "ShotPower", "Aggression",
                               "Positioning", "Composure", "Wage")]
head(football_st_2)
##   Age Balance ShotPower Aggression Positioning Composure Wage
## 1  29      32        78         75          78        70 1105
## 2  29      73        77         77          76        72 2138
## 3  26      60        78         67          72        83 3875
## 4  27      76        68         73          73        76 3661
## 5  26      64        73         49          75        74 2445
## 6  22      64        72         28          62        51 2216

Convert to numeric.

library(dplyr)
# Coerce every character column to numeric. mutate(across(where(...))) is the
# current dplyr idiom; the scoped verb mutate_if() is superseded.
football_st_2 <- football_st_2 %>%
  mutate(across(where(is.character), as.numeric))
str(football_st_2)
## 'data.frame':    2152 obs. of  7 variables:
##  $ Age        : num  29 29 26 27 26 22 22 28 31 28 ...
##  $ Balance    : num  32 73 60 76 64 64 65 75 69 56 ...
##  $ ShotPower  : num  78 77 78 68 73 72 66 75 69 71 ...
##  $ Aggression : num  75 77 67 73 49 28 30 36 68 59 ...
##  $ Positioning: num  78 76 72 73 75 62 76 68 69 72 ...
##  $ Composure  : num  70 72 83 76 74 51 62 56 80 56 ...
##  $ Wage       : num  1105 2138 3875 3661 2445 ...

A multiple regression model showing unstandardised estimates.

The predictors included in the model are: Age, Balance, ShotPower, Aggression, Positioning, and Composure.

names(football_st_2)
## [1] "Age"         "Balance"     "ShotPower"   "Aggression"  "Positioning"
## [6] "Composure"   "Wage"
# Fit a linear model of Wage on the six predictors held in football_st_2
# (Age, Balance, ShotPower, Aggression, Positioning, Composure) and print
# the coefficient table with unstandardised estimates.
wage_model_st <- lm(Wage ~ Age + Balance + ShotPower +
                      Aggression + Positioning + Composure,
                    data = football_st_2)
summary(wage_model_st)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression + 
##     Positioning + Composure, data = football_st_2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31822  -8232  -2313   4754 350592 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -77073.40    4064.61 -18.962  < 2e-16 ***
## Age          -1014.25     110.94  -9.143  < 2e-16 ***
## Balance        120.41      35.90   3.354  0.00081 ***
## ShotPower      498.07      74.43   6.692 2.81e-11 ***
## Aggression      15.96      32.29   0.494  0.62129    
## Positioning    741.71      82.42   8.999  < 2e-16 ***
## Composure      424.72      71.66   5.927 3.58e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18840 on 2145 degrees of freedom
## Multiple R-squared:  0.2997, Adjusted R-squared:  0.2978 
## F-statistic:   153 on 6 and 2145 DF,  p-value: < 2.2e-16
coef(wage_model_st)
##  (Intercept)          Age      Balance    ShotPower   Aggression  Positioning 
## -77073.39877  -1014.24567    120.40620    498.06517     15.95657    741.70804 
##    Composure 
##    424.72405
confint(wage_model_st, level = 0.95)
##                    2.5 %       97.5 %
## (Intercept) -85044.38590 -69102.41165
## Age          -1231.79758   -796.69375
## Balance         50.00615    190.80626
## ShotPower      352.09956    644.03079
## Aggression     -47.37581     79.28895
## Positioning    580.07796    903.33813
## Composure      284.19780    565.25031

5.1 Standardised estimates

A multiple regression model showing standardised estimates.

The predictors included in the model are: Age, Balance, ShotPower, Aggression, Positioning, and Composure.

library(lm.beta)
## Warning: package 'lm.beta' was built under R version 4.4.2
wage_model_st_std <- lm.beta::lm.beta(wage_model_st)


coef(wage_model_st_std)
## (Intercept)         Age     Balance   ShotPower  Aggression Positioning 
##          NA -0.21358305  0.06178231  0.20182976  0.01126852  0.30316025 
##   Composure 
##  0.19146721
# confint() on the lm.beta object centres each interval on the standardised
# coefficient but keeps the unstandardised margin of error, so its bounds are
# not usable. Rescale the unstandardised intervals by
# sd(predictor) / sd(Wage) instead, the factor that maps each raw slope onto
# its standardised coefficient.
wage_model_st_frame <- model.frame(wage_model_st)
wage_model_st_sd_ratio <-
  vapply(wage_model_st_frame[-1], sd, numeric(1)) /
  sd(wage_model_st_frame[[1]])
wage_model_st_std_ci <- confint(wage_model_st)
wage_model_st_std_ci[-1, ] <-
  wage_model_st_std_ci[-1, ] * wage_model_st_sd_ratio
# The standardised intercept is NA, so its interval is left undefined too.
wage_model_st_std_ci[1, ] <- NA
wage_model_st_std_ci
##                  2.5 %    97.5 %
## (Intercept)         NA        NA
## Age         -217.76550 217.33833
## Balance      -70.33827  70.46184
## ShotPower   -145.76378 146.16744
## Aggression   -63.32111  63.34365
## Positioning -161.32692 161.93324
## Composure   -140.33479 140.71772

5.2 Residuals

wage_model_st_residuals <- rstandard(wage_model_st)
head(wage_model_st_residuals)
##          1          2          3          4          5          6 
## -1.2711799 -1.4183035 -1.5151160 -1.1956035 -1.3820667 -0.5348701
football_st_comb_2 <- cbind(football_st_2, wage_model_st_residuals)
head(football_st_comb_2)
##   Age Balance ShotPower Aggression Positioning Composure Wage
## 1  29      32        78         75          78        70 1105
## 2  29      73        77         77          76        72 2138
## 3  26      60        78         67          72        83 3875
## 4  27      76        68         73          73        76 3661
## 5  26      64        73         49          75        74 2445
## 6  22      64        72         28          62        51 2216
##   wage_model_st_residuals
## 1              -1.2711799
## 2              -1.4183035
## 3              -1.5151160
## 4              -1.1956035
## 5              -1.3820667
## 6              -0.5348701
# Standardised residuals of wage_model_st plotted against the outcome (Wage)
# and then against each predictor in turn. Axis labels and titles use the
# corrected spellings "Standardised" and "Positioning".
ggplot(football_st_comb_2) + aes(x = Wage, y = wage_model_st_residuals) +
  geom_point() + xlab("Wage") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Wage Prediction")

ggplot(football_st_comb_2) + aes(x = Age, y = wage_model_st_residuals) +
  geom_point() + xlab("Age") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Age")

ggplot(football_st_comb_2) + aes(x = ShotPower, y = wage_model_st_residuals) +
  geom_point() + xlab("Shot Power") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Shot Power")

ggplot(football_st_comb_2) + aes(x = Aggression, y = wage_model_st_residuals) +
  geom_point() + xlab("Aggression") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Aggression")

ggplot(football_st_comb_2) + aes(x = Positioning, y = wage_model_st_residuals) +
  geom_point() + xlab("Positioning") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Positioning")

ggplot(football_st_comb_2) + aes(x = Composure, y = wage_model_st_residuals) +
  geom_point() + xlab("Composure") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Composure")

5.3 Model evaluation

5.3.1 Normality

library(ggplot2)

# Raw distribution of striker wages, using the default histogram binning.
ggplot(football_st_2, aes(x = Wage)) +
  geom_histogram() +
  labs(y = "Count", title = "Distribution of wage (strikers)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram with the x axis on a log10 scale.
ggplot(football_st_2, aes(x = Wage)) +
  geom_histogram() +
  scale_x_log10() +
  labs(y = "Count", title = "Distribution of log(wage) (strikers)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using the Shapiro-Wilk test.

H-0: normal distribution

H-1: distribution is different from a normal distribution.

shapiro.test(football_st_2$Wage)
## 
##  Shapiro-Wilk normality test
## 
## data:  football_st_2$Wage
## W = 0.39056, p-value < 2.2e-16

5.3.2 Multicollinearity

The variance inflation factor (VIF) measures how much the variance of an estimated regression coefficient increases if your predictors are correlated.

In other words, no two predictors should be strongly correlated with each other.

If no factors are correlated, the VIFs will all be 1.

Rule of thumb: If VIF > 10, multicollinearity is high.

library(car)
vif(wage_model_st)
##         Age     Balance   ShotPower  Aggression Positioning   Composure 
##    1.671663    1.039327    2.786601    1.593244    3.476150    3.196433

5.3.3 Autocorrelation

0 <= D-W <= 4.

Rule of thumb:

D-W = 2.0 means that there is no autocorrelation.

D-W < 2 means there is positive autocorrelation.

D-W > 2 means negative autocorrelation.

This applies to time-series data, so it is not very applicable here.

durbinWatsonTest(wage_model_st)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.5038085     0.9915208       0
##  Alternative hypothesis: rho != 0

5.3.4 Heteroskedasticity

Perform a Breusch-Pagan Test to test for heteroskedasticity/homoskedasticity.

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bptest(wage_model_st)
## 
##  studentized Breusch-Pagan test
## 
## data:  wage_model_st
## BP = 91.188, df = 6, p-value < 2.2e-16

5.3.5 Automatic evaluation

We can also automatically evaluate the model.

library(gvlma)

gvlma(wage_model_st)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression + 
##     Positioning + Composure, data = football_st_2)
## 
## Coefficients:
## (Intercept)          Age      Balance    ShotPower   Aggression  Positioning  
##   -77073.40     -1014.25       120.41       498.07        15.96       741.71  
##   Composure  
##      424.72  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = wage_model_st) 
## 
##                        Value p-value                   Decision
## Global Stat        1307104.5       0 Assumptions NOT satisfied!
## Skewness             26054.7       0 Assumptions NOT satisfied!
## Kurtosis           1280082.5       0 Assumptions NOT satisfied!
## Link Function          791.9       0 Assumptions NOT satisfied!
## Heteroscedasticity     175.5       0 Assumptions NOT satisfied!

6. Stepwise Regression

Stepwise regression is a modification of the ordinary regression approach.

library(stats)
# Stepwise selection starting from the full wage model; with
# direction = "both", terms may be dropped or added back at each step.
wage_model_st_step <- step(wage_model_st, direction = "both")
## Start:  AIC=42374.94
## Wage ~ Age + Balance + ShotPower + Aggression + Positioning + 
##     Composure
## 
##               Df  Sum of Sq        RSS   AIC
## - Aggression   1 8.6672e+07 7.6162e+11 42373
## <none>                      7.6154e+11 42375
## - Balance      1 3.9939e+09 7.6553e+11 42384
## - Composure    1 1.2472e+10 7.7401e+11 42408
## - ShotPower    1 1.5897e+10 7.7743e+11 42417
## - Positioning  1 2.8752e+10 7.9029e+11 42453
## - Age          1 2.9676e+10 7.9121e+11 42455
## 
## Step:  AIC=42373.18
## Wage ~ Age + Balance + ShotPower + Positioning + Composure
## 
##               Df  Sum of Sq        RSS   AIC
## <none>                      7.6162e+11 42373
## + Aggression   1 8.6672e+07 7.6154e+11 42375
## - Balance      1 3.9197e+09 7.6554e+11 42382
## - Composure    1 1.2939e+10 7.7456e+11 42407
## - ShotPower    1 1.7279e+10 7.7890e+11 42419
## - Positioning  1 2.8770e+10 7.9039e+11 42451
## - Age          1 3.0373e+10 7.9200e+11 42455
summary(wage_model_st_step)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Positioning + 
##     Composure, data = football_st_2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31793  -8228  -2326   4830 350282 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -77250.10    4048.13 -19.083  < 2e-16 ***
## Age          -1002.58     108.38  -9.251  < 2e-16 ***
## Balance        118.78      35.74   3.323 0.000904 ***
## ShotPower      506.25      72.55   6.978 3.98e-12 ***
## Positioning    741.93      82.40   9.004  < 2e-16 ***
## Composure      429.17      71.08   6.038 1.83e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18840 on 2146 degrees of freedom
## Multiple R-squared:  0.2997, Adjusted R-squared:  0.298 
## F-statistic: 183.6 on 5 and 2146 DF,  p-value: < 2.2e-16
gvlma(wage_model_st_step)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Positioning + 
##     Composure, data = football_st_2)
## 
## Coefficients:
## (Intercept)          Age      Balance    ShotPower  Positioning    Composure  
##    -77250.1      -1002.6        118.8        506.2        741.9        429.2  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = wage_model_st_step) 
## 
##                        Value p-value                   Decision
## Global Stat        1300530.2       0 Assumptions NOT satisfied!
## Skewness             25983.6       0 Assumptions NOT satisfied!
## Kurtosis           1273577.0       0 Assumptions NOT satisfied!
## Link Function          794.0       0 Assumptions NOT satisfied!
## Heteroscedasticity     175.5       0 Assumptions NOT satisfied!

7. Data mining approach

Now, we will use the data mining approach.

7.1 Training validation split

Split the data into training and validation sets.

Set the seed using our favourite number :-)

set.seed(666)

Create the indices for the split. This samples the row indices to split the data into training and validation.

# Draw a random 70% of the row indices for training; the remaining rows form
# the validation set. seq_len() stays correct for a zero-row data frame
# (where 1:nrow() would give c(1, 0)), and floor() makes the truncation of
# the fractional sample size explicit.
n_obs <- nrow(football_st_2)
train_index <- sample(seq_len(n_obs), floor(0.7 * n_obs))
valid_index <- setdiff(seq_len(n_obs), train_index)

Using the indices, create the training and validation sets. This is similar in principle to splitting a data frame by row.

train_df_st <- football_st_2[train_index, ]
valid_df_st <- football_st_2[valid_index, ]

It is a good habit to check after splitting.

nrow(train_df_st)
## [1] 1506
nrow(valid_df_st)
## [1] 646

7.2 Training

Training the model on the training set.

# Refit the six-predictor wage model using only the training rows, then
# print its coefficient table.
wage_model_st_2 <- lm(Wage ~ Age + Balance + ShotPower + 
                        Aggression + Positioning + Composure,
                      data = train_df_st)
summary(wage_model_st_2)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression + 
##     Positioning + Composure, data = train_df_st)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -32861  -8569  -2336   5182 347609 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -81327.50    5253.63 -15.480  < 2e-16 ***
## Age          -1032.61     146.38  -7.054 2.64e-12 ***
## Balance        131.37      46.63   2.817  0.00491 ** 
## ShotPower      514.89      98.00   5.254 1.70e-07 ***
## Aggression      13.64      41.73   0.327  0.74380    
## Positioning    692.34     107.84   6.420 1.82e-10 ***
## Composure      533.27      93.18   5.723 1.26e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20410 on 1499 degrees of freedom
## Multiple R-squared:  0.2877, Adjusted R-squared:  0.2848 
## F-statistic: 100.9 on 6 and 1499 DF,  p-value: < 2.2e-16

7.3 Model evaluation

Predict the outcome (i.e. wage) of the training and validation sets using the model from the training set. Compare the errors between the training and validation sets.

library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## 
## Attaching package: 'forecast'
## The following object is masked from 'package:ggpubr':
## 
##     gghistogram
# Predicted wages for the training rows, followed by the error summary
# (ME, RMSE, MAE, MPE, MAPE) against the observed training wages.
wage_model_st_2_pred_train <- predict(wage_model_st_2, newdata = train_df_st)

accuracy(wage_model_st_2_pred_train, train_df_st$Wage)
##                    ME     RMSE      MAE       MPE     MAPE
## Test set -3.02254e-10 20363.72 9804.857 -30.84404 131.6738
# Predicted wages for the held-out validation rows, followed by the same
# error summary against the observed validation wages.
wage_model_st_2_pred_valid <- predict(wage_model_st_2, newdata = valid_df_st)

accuracy(wage_model_st_2_pred_valid, valid_df_st$Wage)
##                 ME     RMSE      MAE      MPE     MAPE
## Test set -910.8093 14653.34 9444.861 -32.5431 130.3103
# Spread (max minus min) of wages in the training set, for scale against RMSE.
diff(range(train_df_st$Wage))
## [1] 406504
sd(train_df_st$Wage)
## [1] 24135.81
# Spread (max minus min) of wages in the validation set.
diff(range(valid_df_st$Wage))
## [1] 205030
sd(valid_df_st$Wage)
## [1] 18074.14
library(car)
vif(wage_model_st_2)
##         Age     Balance   ShotPower  Aggression Positioning   Composure 
##    1.690727    1.037080    2.900017    1.602035    3.573232    3.185606
library(lmtest)
bptest(wage_model_st_2)
## 
##  studentized Breusch-Pagan test
## 
## data:  wage_model_st_2
## BP = 72.484, df = 6, p-value = 1.264e-13

7.4 Predicting

Predict new players

Data for new players

# Read the new players (the file has a header row) and predict their wages
# with the training-set model, returning the fit plus lower and upper
# confidence bounds.
new <- read.csv("new.csv")

wage_model_st_2_pred_new <- predict(
  wage_model_st_2,
  newdata = new,
  interval = "confidence"
)
wage_model_st_2_pred_new
##        fit      lwr      upr
## 1 21523.43 18689.82 24357.04
## 2 23759.40 20030.25 27488.55
## 3 21465.21 19657.65 23272.77

8. Categorical Predictors

Subset to include categorical variable: preferred foot

football_st_3 <- football_st[, c("Preferred Foot", "Positioning", "Composure", "Wage")]
head(football_st_3)
##   Preferred Foot Positioning Composure Wage
## 1          Right          78        70 1105
## 2           Left          76        72 2138
## 3          Right          72        83 3875
## 4          Right          73        76 3661
## 5          Right          75        74 2445
## 6          Right          62        51 2216
# Rename the first column so it no longer contains a space.
colnames(football_st_3)[1] <- "Preferred_Foot"

# Coerce the two numeric predictors in one step.
football_st_3[c("Positioning", "Composure")] <-
  lapply(football_st_3[c("Positioning", "Composure")], as.numeric)

8.1 Traditional Statistics

# Model Wage on Preferred_Foot (treated as a factor inside the formula),
# Positioning and Composure, then report 95% confidence intervals for the
# coefficients.
wage_model_st_cat <- lm(Wage ~ factor(Preferred_Foot) + Positioning + Composure, data = football_st_3)
confint(wage_model_st_cat, level = 0.95)
##                                   2.5 %      97.5 %
## (Intercept)                 -71769.7525 -59180.1997
## factor(Preferred_Foot)Right  -3482.4352   1307.4754
## Positioning                    668.9041    964.3812
## Composure                      304.0428    572.1551

Residuals.

wage_model_st_cat_stdresiduals <- rstandard(wage_model_st_cat)
head(wage_model_st_cat_stdresiduals)
##          1          2          3          4          5          6 
## -1.3769868 -1.3424571 -1.2770311 -1.1703986 -1.2718790 -0.2163923
football_st_3_cat <- cbind(football_st_3, wage_model_st_cat_stdresiduals)
head(football_st_3_cat)
##   Preferred_Foot Positioning Composure Wage wage_model_st_cat_stdresiduals
## 1          Right          78        70 1105                     -1.3769868
## 2           Left          76        72 2138                     -1.3424571
## 3          Right          72        83 3875                     -1.2770311
## 4          Right          73        76 3661                     -1.1703986
## 5          Right          75        74 2445                     -1.2718790
## 6          Right          62        51 2216                     -0.2163923
# Standardised residuals of the categorical-predictor model against Wage
# (label spelling corrected to "Standardised").
ggplot(football_st_3_cat) + aes(x = Wage, y = wage_model_st_cat_stdresiduals) +
  geom_point() + xlab("Wage") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Wage")

Positioning

# Standardised residuals against Positioning (label spelling corrected to
# "Standardised").
ggplot(football_st_3_cat) + aes(x = Positioning, y = wage_model_st_cat_stdresiduals) +
  geom_point() + xlab("Positioning") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Positioning")

Composure

# Standardised residuals against Composure, then against the categorical
# predictor Preferred_Foot (label spelling corrected to "Standardised").
ggplot(football_st_3_cat) + aes(x = Composure, y = wage_model_st_cat_stdresiduals) +
  geom_point() + xlab("Composure") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Composure")

ggplot(football_st_3_cat) + aes(x = Preferred_Foot, y = wage_model_st_cat_stdresiduals) +
  geom_point() + xlab("Preferred Foot") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Preferred Foot")

8.2 Model Evaluation

# Histogram of Wage using ggplot2's default binning.
ggplot(football_st_3_cat, aes(x = Wage)) +
  geom_histogram() +
  labs(y = "Count", title = "Distribution of Wage")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using the Shapiro-Wilk test.

H-0: normal distribution.

H-alt: distribution is different from a normal distribution.

shapiro.test(football_st_3_cat$Wage)
## 
##  Shapiro-Wilk normality test
## 
## data:  football_st_3_cat$Wage
## W = 0.39056, p-value < 2.2e-16

Multicollinearity

vif(wage_model_st_cat)
## factor(Preferred_Foot)            Positioning              Composure 
##               1.002720               2.738872               2.743181

Homoscedasticity.

bptest(wage_model_st_cat)
## 
##  studentized Breusch-Pagan test
## 
## data:  wage_model_st_cat
## BP = 82.465, df = 3, p-value < 2.2e-16
gvlma(wage_model_st_cat)
## 
## Call:
## lm(formula = Wage ~ factor(Preferred_Foot) + Positioning + Composure, 
##     data = football_st_3)
## 
## Coefficients:
##                 (Intercept)  factor(Preferred_Foot)Right  
##                    -65475.0                      -1087.5  
##                 Positioning                    Composure  
##                       816.6                        438.1  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = wage_model_st_cat) 
## 
##                        Value p-value                   Decision
## Global Stat        1208344.4       0 Assumptions NOT satisfied!
## Skewness             25297.7       0 Assumptions NOT satisfied!
## Kurtosis           1182302.6       0 Assumptions NOT satisfied!
## Link Function          600.3       0 Assumptions NOT satisfied!
## Heteroscedasticity     143.8       0 Assumptions NOT satisfied!

8.3 Data mining approach

set.seed(666)
# Sample 70% of the rows of football_st_3 for training.
n_obs_3 <- nrow(football_st_3)
train_index_3 <- sample(seq_len(n_obs_3), floor(0.7 * n_obs_3))
# The validation rows are the complement of this split's own training
# indices. Use train_index_3 here, not `train_index` from section 7.1: the
# two coincide only while both splits share a seed and a row count.
valid_index_3 <- setdiff(seq_len(n_obs_3), train_index_3)

train_df_st_3 <- football_st_3[train_index_3, ]
valid_df_st_3 <- football_st_3[valid_index_3, ]
# Refit the categorical-predictor wage model on the training rows only,
# then print its coefficient table.
wage_model_st_cat_2 <- lm(Wage ~ factor(Preferred_Foot) + Positioning +
                            Composure, data = train_df_st_3)

summary(wage_model_st_cat_2)
## 
## Call:
## lm(formula = Wage ~ factor(Preferred_Foot) + Positioning + Composure, 
##     data = train_df_st_3)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33599  -8588  -2271   5035 352343 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -68270.03    4142.64 -16.480  < 2e-16 ***
## factor(Preferred_Foot)Right  -2040.14    1572.46  -1.297    0.195    
## Positioning                    787.78      97.32   8.095 1.17e-15 ***
## Composure                      534.07      89.17   5.990 2.63e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20920 on 1502 degrees of freedom
## Multiple R-squared:  0.2501, Adjusted R-squared:  0.2486 
## F-statistic:   167 on 3 and 1502 DF,  p-value: < 2.2e-16
# In-sample check: predicted wages for the training rows versus the
# observed training wages.
wage_model_st_cat_2_pred_train <- predict(
  object = wage_model_st_cat_2,
  newdata = train_df_st_3
)
accuracy(wage_model_st_cat_2_pred_train, train_df_st_3[["Wage"]])
##                     ME     RMSE      MAE       MPE     MAPE
## Test set -1.589124e-10 20893.75 9883.551 -37.84999 129.0229
# Hold-out check: predicted wages for the validation rows versus the
# observed validation wages.
wage_model_st_cat_2_pred_valid <- predict(
  object = wage_model_st_cat_2,
  newdata = valid_df_st_3
)
accuracy(wage_model_st_cat_2_pred_valid, valid_df_st_3[["Wage"]])
##                 ME     RMSE      MAE       MPE     MAPE
## Test set -844.4194 15387.14 9700.628 -41.79669 130.5894
# Spread of the observed training wages (presumably a yardstick for the
# training RMSE reported above).
sd(train_df_st_3[["Wage"]])
## [1] 24135.81
# Spread of the observed validation wages (presumably a yardstick for the
# validation RMSE reported above).
sd(valid_df_st_3[["Wage"]])
## [1] 18074.14
# Multicollinearity check: variance inflation factor per model term.
vif(wage_model_st_cat_2)
## factor(Preferred_Foot)            Positioning              Composure 
##               1.004517               2.769896               2.776543
# Heteroscedasticity check: studentized Breusch-Pagan test for the model.
bptest(wage_model_st_cat_2)
## 
##  studentized Breusch-Pagan test
## 
## data:  wage_model_st_cat_2
## BP = 64.695, df = 3, p-value = 5.829e-14

Data for more new players

# New players to score with the categorical model; display them as read.
new2 <- read.csv(file = "new2.csv")
new2
##   Preferred.Foot Positioning Composure
## 1          Right          64        56
## 2          Right          65        47
# Store the foot column as a factor, then list the column names as read
# from the CSV.
new2[["Preferred.Foot"]] <- as.factor(new2[["Preferred.Foot"]])

colnames(new2)
## [1] "Preferred.Foot" "Positioning"    "Composure"
# Rename by matching the old column name instead of by position, so the
# rename still targets the right column if the CSV column order changes.
# The model formula refers to Preferred_Foot, so newdata must use that name.
names(new2)[names(new2) == "Preferred.Foot"] <- "Preferred_Foot"

names(new2)
## [1] "Preferred_Foot" "Positioning"    "Composure"
# Score the new players. interval = "confidence" returns bounds for the
# expected (mean) wage at each row, alongside the point prediction.
wage_model_st_cat_2_pred_new <- predict(
  object = wage_model_st_cat_2,
  newdata = new2,
  interval = "confidence"
)

wage_model_st_cat_2_pred_new
##         fit      lwr      upr
## 1 10016.014 8776.276 11255.75
## 2  5997.149 3512.208  8482.09

9. Non-Linear Regression

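Sometimes, a relationship may not be linear. In this case, we can specify a non-linear relationship in the model. Here this is done with an interaction term (Positioning * Composure), so the effect of each of the two predictors depends on the value of the other.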

9.1 Traditional statistics

We start with the traditional statistics approach and evaluate.

The non-linear relationship is expressed in the model specification.

# Columns available for the models in this section.
colnames(football_st_2)
## [1] "Age"         "Balance"     "ShotPower"   "Aggression"  "Positioning"
## [6] "Composure"   "Wage"
# Full-sample fit. `Positioning * Composure` expands to both main effects
# plus their product term (Positioning:Composure).
wage_model_st_nl <- lm(
  formula = Wage ~ Age + Balance + ShotPower + Aggression +
    Positioning * Composure,
  data = football_st_2
)
summary(wage_model_st_nl)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression + 
##     Positioning * Composure, data = football_st_2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -58380  -5245     80   4644 267683 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           297675.783  13442.584  22.144   <2e-16 ***
## Age                     -789.963     94.502  -8.359   <2e-16 ***
## Balance                   57.694     30.555   1.888   0.0591 .  
## ShotPower                642.408     63.389  10.134   <2e-16 ***
## Aggression                19.805     27.418   0.722   0.4702    
## Positioning            -5016.022    211.523 -23.714   <2e-16 ***
## Composure              -6150.054    235.919 -26.069   <2e-16 ***
## Positioning:Composure     96.301      3.339  28.844   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16000 on 2144 degrees of freedom
## Multiple R-squared:  0.4955, Adjusted R-squared:  0.4939 
## F-statistic: 300.8 on 7 and 2144 DF,  p-value: < 2.2e-16
# Variance inflation factors for the interaction model.
# NOTE(review): the large values for Positioning, Composure and their
# product are likely driven by the uncentered interaction term; the
# function's own message suggests type = 'predictor' for such models.
vif(wage_model_st_nl)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
##                   Age               Balance             ShotPower 
##              1.683057              1.044616              2.804077 
##            Aggression           Positioning             Composure 
##              1.593281             31.765761             48.069231 
## Positioning:Composure 
##            127.119996
# Durbin-Watson test for lag-1 autocorrelation in the residuals.
durbinWatsonTest(wage_model_st_nl)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.2531554      1.491911       0
##  Alternative hypothesis: rho != 0

9.2 Traditional statistics stepwise

Perform a stepwise regression with a non-linear relationship and evaluate it.

# AIC-guided stepwise search starting from the full interaction model;
# with direction "both", terms can be dropped and added back.
wage_model_st_nl_step <- step(
  object = wage_model_st_nl,
  direction = "both"
)
## Start:  AIC=41671.29
## Wage ~ Age + Balance + ShotPower + Aggression + Positioning * 
##     Composure
## 
##                         Df  Sum of Sq        RSS   AIC
## - Aggression             1 1.3352e+08 5.4877e+11 41670
## <none>                                5.4863e+11 41671
## - Balance                1 9.1234e+08 5.4955e+11 41673
## - Age                    1 1.7881e+10 5.6652e+11 41738
## - ShotPower              1 2.6282e+10 5.7492e+11 41770
## - Positioning:Composure  1 2.1290e+11 7.6154e+11 42375
## 
## Step:  AIC=41669.81
## Wage ~ Age + Balance + ShotPower + Positioning + Composure + 
##     Positioning:Composure
## 
##                         Df  Sum of Sq        RSS   AIC
## <none>                                5.4877e+11 41670
## - Balance                1 8.5698e+08 5.4963e+11 41671
## + Aggression             1 1.3352e+08 5.4863e+11 41671
## - Age                    1 1.8041e+10 5.6681e+11 41737
## - ShotPower              1 2.8516e+10 5.7728e+11 41777
## - Positioning:Composure  1 2.1286e+11 7.6162e+11 42373
# Coefficient table and fit statistics for the model selected by step().
summary(wage_model_st_nl_step)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Positioning + 
##     Composure + Positioning:Composure, data = football_st_2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -58507  -5205     67   4579 267488 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           297410.796  13436.079  22.135   <2e-16 ***
## Age                     -775.517     92.352  -8.397   <2e-16 ***
## Balance                   55.684     30.424   1.830   0.0674 .  
## ShotPower                652.547     61.808  10.558   <2e-16 ***
## Positioning            -5015.048    211.495 -23.712   <2e-16 ***
## Composure              -6143.738    235.730 -26.063   <2e-16 ***
## Positioning:Composure     96.289      3.338  28.844   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15990 on 2145 degrees of freedom
## Multiple R-squared:  0.4954, Adjusted R-squared:  0.494 
## F-statistic:   351 on 6 and 2145 DF,  p-value: < 2.2e-16
# Variance inflation factors for the stepwise-selected interaction model.
vif(wage_model_st_nl_step)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
##                   Age               Balance             ShotPower 
##              1.607679              1.035950              2.666586 
##           Positioning             Composure Positioning:Composure 
##             31.764471             48.003192            127.116986
# Durbin-Watson test for lag-1 residual autocorrelation in the stepwise model.
durbinWatsonTest(wage_model_st_nl_step)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.2522843      1.493672       0
##  Alternative hypothesis: rho != 0

9.3 Data mining approach

A data mining approach with the non-linear relationship.

# Same interaction specification as the full-sample model, but estimated
# on the training partition only so it can be validated out of sample.
wage_model_st_nl_2 <- lm(
  formula = Wage ~ Age + Balance + ShotPower + Aggression +
    Positioning * Composure,
  data = train_df_st
)
summary(wage_model_st_nl_2)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression + 
##     Positioning * Composure, data = train_df_st)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -66085  -5447    302   4870 260260 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           342265.457  17203.875  19.895  < 2e-16 ***
## Age                     -758.277    122.802  -6.175 8.52e-10 ***
## Balance                   79.001     39.022   2.025   0.0431 *  
## ShotPower                699.312     82.219   8.505  < 2e-16 ***
## Aggression                13.470     34.875   0.386   0.6994    
## Positioning            -5818.796    271.105 -21.463  < 2e-16 ***
## Composure              -6961.508    304.442 -22.866  < 2e-16 ***
## Positioning:Composure    109.127      4.285  25.465  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17060 on 1498 degrees of freedom
## Multiple R-squared:  0.5029, Adjusted R-squared:  0.5006 
## F-statistic: 216.5 on 7 and 1498 DF,  p-value: < 2.2e-16

Predict the training and validation sets using the non-linear model. Check the accuracy.

# In-sample accuracy of the interaction model on the training rows.
wage_model_st_nl_2_pred_train <- predict(
  object = wage_model_st_nl_2,
  newdata = train_df_st
)
accuracy(wage_model_st_nl_2_pred_train, train_df_st[["Wage"]])
##                    ME    RMSE      MAE       MPE     MAPE
## Test set 1.966265e-09 17011.8 8516.679 -27.10041 105.2229
# Hold-out accuracy of the interaction model on the validation rows.
wage_model_st_nl_2_pred_valid <- predict(
  object = wage_model_st_nl_2,
  newdata = valid_df_st
)
accuracy(wage_model_st_nl_2_pred_valid, valid_df_st[["Wage"]])
##                 ME     RMSE      MAE       MPE     MAPE
## Test set -855.2409 13504.82 8758.766 -32.40985 113.7297

Predict the wages of new players using the non-linear model.

# Point predictions for the new players plus confidence bounds for the
# expected wage at each row.
wage_model_st_nl_2_pred_new <- predict(
  object = wage_model_st_nl_2,
  newdata = new,
  interval = "confidence"
)

wage_model_st_nl_2_pred_new
##        fit       lwr      upr
## 1 14285.88 11853.154 16718.61
## 2 12719.88  9489.583 15950.18
## 3 17129.30 15582.278 18676.32
# Global validation of the linear-model assumptions for the training fit.
gvlma(wage_model_st_nl_2)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression + 
##     Positioning * Composure, data = train_df_st)
## 
## Coefficients:
##           (Intercept)                    Age                Balance  
##             342265.46                -758.28                  79.00  
##             ShotPower             Aggression            Positioning  
##                699.31                  13.47               -5818.80  
##             Composure  Positioning:Composure  
##              -6961.51                 109.13  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = wage_model_st_nl_2) 
## 
##                        Value   p-value                   Decision
## Global Stat        435462.84 0.0000000 Assumptions NOT satisfied!
## Skewness             9663.54 0.0000000 Assumptions NOT satisfied!
## Kurtosis           425239.16 0.0000000 Assumptions NOT satisfied!
## Link Function         545.56 0.0000000 Assumptions NOT satisfied!
## Heteroscedasticity     14.58 0.0001342 Assumptions NOT satisfied!

9.4 Data mining approach using stepwise

A data mining approach using a stepwise regression and non-linear relationship.

# AIC-guided stepwise search starting from the training-set interaction
# model; with direction "both", terms can be dropped and added back.
wage_model_st_nl_2_step <- step(
  object = wage_model_st_nl_2,
  direction = "both"
)
## Start:  AIC=29357.89
## Wage ~ Age + Balance + ShotPower + Aggression + Positioning * 
##     Composure
## 
##                         Df  Sum of Sq        RSS   AIC
## - Aggression             1 4.3404e+07 4.3588e+11 29356
## <none>                                4.3584e+11 29358
## - Balance                1 1.1925e+09 4.3703e+11 29360
## - Age                    1 1.1093e+10 4.4693e+11 29394
## - ShotPower              1 2.1048e+10 4.5689e+11 29427
## - Positioning:Composure  1 1.8867e+11 6.2451e+11 29898
## 
## Step:  AIC=29356.04
## Wage ~ Age + Balance + ShotPower + Positioning + Composure + 
##     Positioning:Composure
## 
##                         Df  Sum of Sq        RSS   AIC
## <none>                                4.3588e+11 29356
## + Aggression             1 4.3404e+07 4.3584e+11 29358
## - Balance                1 1.1606e+09 4.3704e+11 29358
## - Age                    1 1.1307e+10 4.4719e+11 29393
## - ShotPower              1 2.2847e+10 4.5873e+11 29431
## - Positioning:Composure  1 1.8867e+11 6.2455e+11 29896
# Coefficient table and fit statistics for the model selected by step().
summary(wage_model_st_nl_2_step)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Positioning + 
##     Composure + Positioning:Composure, data = train_df_st)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -66274  -5469    260   4921 260121 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           342109.131  17194.232  19.897  < 2e-16 ***
## Age                     -748.254    119.996  -6.236 5.84e-10 ***
## Balance                   77.598     38.841   1.998   0.0459 *  
## ShotPower                706.988     79.759   8.864  < 2e-16 ***
## Positioning            -5818.926    271.028 -21.470  < 2e-16 ***
## Composure              -6958.216    304.237 -22.871  < 2e-16 ***
## Positioning:Composure    109.128      4.284  25.472  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17050 on 1499 degrees of freedom
## Multiple R-squared:  0.5028, Adjusted R-squared:  0.5008 
## F-statistic: 252.7 on 6 and 1499 DF,  p-value: < 2.2e-16

Predict the training and validation sets using the stepwise, non-linear model. Check the accuracy.

# In-sample accuracy of the stepwise interaction model.
wage_model_st_nl_2_step_pred_train <- predict(
  object = wage_model_st_nl_2_step,
  newdata = train_df_st
)
accuracy(wage_model_st_nl_2_step_pred_train, train_df_st[["Wage"]])
##                    ME     RMSE      MAE       MPE     MAPE
## Test set 1.982657e-09 17012.65 8519.261 -27.09032 105.1927
# Hold-out accuracy of the stepwise interaction model.
wage_model_st_nl_2_step_pred_valid <- predict(
  object = wage_model_st_nl_2_step,
  newdata = valid_df_st
)
accuracy(wage_model_st_nl_2_step_pred_valid, valid_df_st[["Wage"]])
##                 ME     RMSE      MAE       MPE     MAPE
## Test set -852.6068 13509.38 8763.627 -32.34321 113.7652

Predict the wages of new players using the stepwise non-linear model.

# Point predictions for the new players from the stepwise model, with
# confidence bounds for the expected wage at each row.
wage_model_st_nl_2_step_pred_new <- predict(
  object = wage_model_st_nl_2_step,
  newdata = new,
  interval = "confidence"
)

wage_model_st_nl_2_step_pred_new
##        fit       lwr      upr
## 1 14529.97 12437.349 16622.59
## 2 12274.73  9968.014 14581.45
## 3 17001.40 15597.748 18405.04
# Global validation of the linear-model assumptions for the stepwise fit.
gvlma(wage_model_st_nl_2_step)
## 
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Positioning + 
##     Composure + Positioning:Composure, data = train_df_st)
## 
## Coefficients:
##           (Intercept)                    Age                Balance  
##              342109.1                 -748.3                   77.6  
##             ShotPower            Positioning              Composure  
##                 707.0                -5818.9                -6958.2  
## Positioning:Composure  
##                 109.1  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = wage_model_st_nl_2_step) 
## 
##                        Value   p-value                   Decision
## Global Stat        433134.34 0.0000000 Assumptions NOT satisfied!
## Skewness             9632.18 0.0000000 Assumptions NOT satisfied!
## Kurtosis           422942.94 0.0000000 Assumptions NOT satisfied!
## Link Function         544.53 0.0000000 Assumptions NOT satisfied!
## Heteroscedasticity     14.69 0.0001267 Assumptions NOT satisfied!

10. Log Variables

Sometimes, the data need to be transformed. A common transformation is the log transformation.

10.1 Traditional statistics

A traditional statistics approach using a log transformation.

Here, both the response (Wage) and the predictors are transformed using a log function.

# Log-log specification on the full sample: the response and every
# predictor are log-transformed inside the formula.
wage_model_st_log <- lm(
  formula = log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) +
    log(Aggression) + log(Positioning) + log(Composure),
  data = football_st_2
)
summary(wage_model_st_log)
## 
## Call:
## lm(formula = log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) + 
##     log(Aggression) + log(Positioning) + log(Composure), data = football_st_2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.58521 -0.38112 -0.03033  0.36168  2.32404 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -9.49560    0.49568 -19.157  < 2e-16 ***
## log(Age)         -0.79291    0.08748  -9.064  < 2e-16 ***
## log(Balance)      0.23309    0.06514   3.578 0.000354 ***
## log(ShotPower)    1.66883    0.14615  11.418  < 2e-16 ***
## log(Aggression)   0.06081    0.04912   1.238 0.215839    
## log(Positioning)  1.94852    0.15876  12.273  < 2e-16 ***
## log(Composure)    1.16000    0.12750   9.098  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5898 on 2145 degrees of freedom
## Multiple R-squared:  0.4989, Adjusted R-squared:  0.4975 
## F-statistic: 355.9 on 6 and 2145 DF,  p-value: < 2.2e-16
# Global validation of the linear-model assumptions for the log-log fit.
gvlma(wage_model_st_log)
## 
## Call:
## lm(formula = log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) + 
##     log(Aggression) + log(Positioning) + log(Composure), data = football_st_2)
## 
## Coefficients:
##      (Intercept)          log(Age)      log(Balance)    log(ShotPower)  
##         -9.49560          -0.79291           0.23309           1.66883  
##  log(Aggression)  log(Positioning)    log(Composure)  
##          0.06081           1.94852           1.16000  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = wage_model_st_log) 
## 
##                      Value   p-value                   Decision
## Global Stat        561.540 0.000e+00 Assumptions NOT satisfied!
## Skewness             2.261 1.326e-01    Assumptions acceptable.
## Kurtosis            66.615 3.331e-16 Assumptions NOT satisfied!
## Link Function      481.799 0.000e+00 Assumptions NOT satisfied!
## Heteroscedasticity  10.865 9.802e-04 Assumptions NOT satisfied!

10.2 Data mining with log

We can also use a data mining approach with the log transformation.

# Log-log specification estimated on the training partition only.
wage_model_st_log_2 <- lm(
  formula = log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) +
    log(Aggression) + log(Positioning) + log(Composure),
  data = train_df_st
)
summary(wage_model_st_log_2)
## 
## Call:
## lm(formula = log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) + 
##     log(Aggression) + log(Positioning) + log(Composure), data = train_df_st)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.55406 -0.38093 -0.03289  0.35960  2.32610 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -9.67383    0.59662 -16.214  < 2e-16 ***
## log(Age)         -0.69453    0.10718  -6.480 1.24e-10 ***
## log(Balance)      0.28514    0.07870   3.623 0.000301 ***
## log(ShotPower)    1.49775    0.17843   8.394  < 2e-16 ***
## log(Aggression)   0.07195    0.05938   1.212 0.225846    
## log(Positioning)  1.87362    0.19243   9.737  < 2e-16 ***
## log(Composure)    1.31511    0.15334   8.577  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5945 on 1499 degrees of freedom
## Multiple R-squared:  0.4985, Adjusted R-squared:  0.4965 
## F-statistic: 248.3 on 6 and 1499 DF,  p-value: < 2.2e-16

Predict the training and validation sets. Check the accuracy.

# Predictions from the log-log model are on the log(Wage) scale, so the
# observed training wages are logged before the two are compared.
wage_model_st_log_2_pred_train <- predict(
  object = wage_model_st_log_2,
  newdata = train_df_st
)
train_df_st[["logWage"]] <- log(train_df_st[["Wage"]])

accuracy(wage_model_st_log_2_pred_train, train_df_st[["logWage"]])
##                    ME      RMSE       MAE        MPE     MAPE
## Test set 4.312485e-14 0.5931541 0.4587753 -0.4038503 5.089574
# Hold-out accuracy on the log(Wage) scale: log the observed validation
# wages so they match the scale of the predictions.
wage_model_st_log_2_pred_valid <- predict(
  object = wage_model_st_log_2,
  newdata = valid_df_st
)
valid_df_st[["logWage"]] <- log(valid_df_st[["Wage"]])

accuracy(wage_model_st_log_2_pred_valid, valid_df_st[["logWage"]])
##                    ME      RMSE       MAE        MPE     MAPE
## Test set 0.0002915404 0.5808893 0.4554191 -0.3779982 5.050841
# Global validation of the linear-model assumptions for the training
# log-log fit.
gvlma(wage_model_st_log_2)
## 
## Call:
## lm(formula = log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) + 
##     log(Aggression) + log(Positioning) + log(Composure), data = train_df_st)
## 
## Coefficients:
##      (Intercept)          log(Age)      log(Balance)    log(ShotPower)  
##         -9.67383          -0.69453           0.28514           1.49775  
##  log(Aggression)  log(Positioning)    log(Composure)  
##          0.07195           1.87362           1.31511  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = wage_model_st_log_2) 
## 
##                      Value   p-value                   Decision
## Global Stat        394.158 0.000e+00 Assumptions NOT satisfied!
## Skewness             1.842 1.747e-01    Assumptions acceptable.
## Kurtosis            54.044 1.960e-13 Assumptions NOT satisfied!
## Link Function      336.747 0.000e+00 Assumptions NOT satisfied!
## Heteroscedasticity   1.524 2.170e-01    Assumptions acceptable.

Predict new records

More and more new data

# New players to score with the log-log models; display them as read.
new3 <- read.csv(file = "new3.csv", header = TRUE)
new3
##   X Age Balance ShotPower Aggression Positioning Composure
## 1 1  25      66        69         55          72        71
## 2 2  26      58        76         75          66        66
## 3 3  19      80        67         33          43        52
# Predictions and confidence bounds for the new players. All three
# columns are on the log(Wage) scale because the response was logged.
wage_model_st_log_2_pred_new3 <- predict(
  object = wage_model_st_log_2,
  newdata = new3,
  interval = "confidence"
)

wage_model_st_log_2_pred_new3
##        fit      lwr      upr
## 1 9.533908 9.484214 9.583602
## 2 9.377799 9.315354 9.440244
## 3 8.323197 8.170023 8.476371

Results as a data frame (if desired), back-transformed from the log scale to the original wage scale.

# Convert the prediction matrix to a data frame, then back-transform the
# fit and interval bounds from log(Wage) to the wage scale.
# exp() is applied directly to the all-numeric data frame; this replaces
# the roundabout exp(1)^x form, which computes the same quantity through
# a generic power operation.
# NOTE(review): exponentiating a log-scale prediction estimates the median
# wage rather than the mean - confirm that is the intended quantity.
wage_model_st_log_2_pred_new3_df <-
  as.data.frame(wage_model_st_log_2_pred_new3)

wage_model_st_log_2_pred_new3_df_value <- exp(wage_model_st_log_2_pred_new3_df)

wage_model_st_log_2_pred_new3_df_value
##         fit       lwr       upr
## 1 13820.495 13150.482 14524.645
## 2 11822.968 11107.261 12584.791
## 3  4118.303  3533.423  4799.997

10.3 Stepwise data mining with log

A stepwise regression using data mining and log transformations.

# AIC-guided stepwise search starting from the training-set log-log
# model; with direction "both", terms can be dropped and added back.
wage_model_st_log_2_step <- step(
  object = wage_model_st_log_2,
  direction = "both"
)
## Start:  AIC=-1559.17
## log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) + log(Aggression) + 
##     log(Positioning) + log(Composure)
## 
##                    Df Sum of Sq    RSS     AIC
## - log(Aggression)   1     0.519 530.38 -1559.7
## <none>                          529.86 -1559.2
## - log(Balance)      1     4.640 534.50 -1548.0
## - log(Age)          1    14.843 544.70 -1519.6
## - log(ShotPower)    1    24.907 554.77 -1492.0
## - log(Composure)    1    26.002 555.86 -1489.0
## - log(Positioning)  1    33.510 563.37 -1468.8
## 
## Step:  AIC=-1559.7
## log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) + log(Positioning) + 
##     log(Composure)
## 
##                    Df Sum of Sq    RSS     AIC
## <none>                          530.38 -1559.7
## + log(Aggression)   1     0.519 529.86 -1559.2
## - log(Balance)      1     4.393 534.77 -1549.3
## - log(Age)          1    14.336 544.71 -1521.5
## - log(Composure)    1    27.350 557.73 -1486.0
## - log(ShotPower)    1    27.771 558.15 -1484.8
## - log(Positioning)  1    33.605 563.98 -1469.2
# Coefficient table and fit statistics for the model selected by step().
summary(wage_model_st_log_2_step)
## 
## Call:
## lm(formula = log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) + 
##     log(Positioning) + log(Composure), data = train_df_st)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5474 -0.3754 -0.0318  0.3602  2.3170 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -9.74502    0.59382 -16.411  < 2e-16 ***
## log(Age)         -0.66674    0.10471  -6.367 2.55e-10 ***
## log(Balance)      0.27624    0.07837   3.525 0.000436 ***
## log(ShotPower)    1.54434    0.17426   8.862  < 2e-16 ***
## log(Positioning)  1.87617    0.19245   9.749  < 2e-16 ***
## log(Composure)    1.33827    0.15216   8.795  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5946 on 1500 degrees of freedom
## Multiple R-squared:  0.498,  Adjusted R-squared:  0.4963 
## F-statistic: 297.6 on 5 and 1500 DF,  p-value: < 2.2e-16
# Global validation of the linear-model assumptions for the stepwise
# log-log fit.
gvlma(wage_model_st_log_2_step)
## 
## Call:
## lm(formula = log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) + 
##     log(Positioning) + log(Composure), data = train_df_st)
## 
## Coefficients:
##      (Intercept)          log(Age)      log(Balance)    log(ShotPower)  
##          -9.7450           -0.6667            0.2762            1.5443  
## log(Positioning)    log(Composure)  
##           1.8762            1.3383  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = wage_model_st_log_2_step) 
## 
##                      Value   p-value                   Decision
## Global Stat        393.169 0.000e+00 Assumptions NOT satisfied!
## Skewness             2.178 1.400e-01    Assumptions acceptable.
## Kurtosis            52.586 4.117e-13 Assumptions NOT satisfied!
## Link Function      336.860 0.000e+00 Assumptions NOT satisfied!
## Heteroscedasticity   1.545 2.139e-01    Assumptions acceptable.

Predict the training and validation sets. Check the accuracy.

# In-sample accuracy of the stepwise log-log model, on the log(Wage) scale.
wage_model_st_log_2_step_pred_train <- predict(
  object = wage_model_st_log_2_step,
  newdata = train_df_st
)
accuracy(wage_model_st_log_2_step_pred_train, train_df_st[["logWage"]])
##                  ME      RMSE       MAE        MPE     MAPE
## Test set 4.4382e-14 0.5934445 0.4588034 -0.4041652 5.089808
# Hold-out accuracy of the stepwise log-log model, on the log(Wage) scale.
wage_model_st_log_2_step_pred_valid <- predict(
  object = wage_model_st_log_2_step,
  newdata = valid_df_st
)
accuracy(wage_model_st_log_2_step_pred_valid, valid_df_st[["logWage"]])
##                    ME     RMSE       MAE        MPE     MAPE
## Test set 0.0007094787 0.580847 0.4553575 -0.3734178 5.049277
# NOTE(review): same call as the gvlma() check run right after the stepwise
# summary earlier in this section; the model is not refitted in between,
# so the output repeats.
gvlma(wage_model_st_log_2_step)
## 
## Call:
## lm(formula = log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) + 
##     log(Positioning) + log(Composure), data = train_df_st)
## 
## Coefficients:
##      (Intercept)          log(Age)      log(Balance)    log(ShotPower)  
##          -9.7450           -0.6667            0.2762            1.5443  
## log(Positioning)    log(Composure)  
##           1.8762            1.3383  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = wage_model_st_log_2_step) 
## 
##                      Value   p-value                   Decision
## Global Stat        393.169 0.000e+00 Assumptions NOT satisfied!
## Skewness             2.178 1.400e-01    Assumptions acceptable.
## Kurtosis            52.586 4.117e-13 Assumptions NOT satisfied!
## Link Function      336.860 0.000e+00 Assumptions NOT satisfied!
## Heteroscedasticity   1.545 2.139e-01    Assumptions acceptable.

Predict new records using the stepwise log model

# Predictions and confidence bounds for the new players from the stepwise
# log-log model; all three columns are on the log(Wage) scale.
wage_model_st_log_2_step_pred_new3 <- predict(
  object = wage_model_st_log_2_step,
  newdata = new3,
  interval = "confidence"
)

wage_model_st_log_2_step_pred_new3
##        fit      lwr      upr
## 1 9.533442 9.483746 9.583138
## 2 9.359849 9.304570 9.415129
## 3 8.340250 8.189562 8.490939
# Convert the prediction matrix to a data frame, then back-transform the
# fit and interval bounds from log(Wage) to the wage scale.
# exp() is applied directly to the all-numeric data frame; this replaces
# the roundabout exp(1)^x form, which computes the same quantity through
# a generic power operation.
# NOTE(review): exponentiating a log-scale prediction estimates the median
# wage rather than the mean - confirm that is the intended quantity.
wage_model_st_log_2_step_pred_new3_df <-
  as.data.frame(wage_model_st_log_2_step_pred_new3)

wage_model_st_log_2_step_pred_new3_df_value <-
  exp(wage_model_st_log_2_step_pred_new3_df)

wage_model_st_log_2_step_pred_new3_df_value
##         fit       lwr       upr
## 1 13814.060 13144.332 14517.911
## 2 11612.636 10988.116 12272.652
## 3  4189.138  3603.142  4870.437

11. Combined

We can combine different settings.

# Combined specification: logged response, logged Age, and the literal
# product Positioning * Composure as a single predictor. I() makes `*`
# arithmetic here, so no separate main effects are added.
wage_model_st_log_3 <- lm(
  formula = log(Wage) ~ log(Age) + I(Positioning * Composure),
  data = train_df_st
)
summary(wage_model_st_log_3)
## 
## Call:
## lm(formula = log(Wage) ~ log(Age) + I(Positioning * Composure), 
##     data = train_df_st)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.73661 -0.35786 -0.00011  0.35532  2.05304 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 8.666e+00  2.793e-01   31.03  < 2e-16 ***
## log(Age)                   -6.139e-01  9.807e-02   -6.26 5.02e-10 ***
## I(Positioning * Composure)  5.760e-04  1.599e-05   36.02  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.577 on 1503 degrees of freedom
## Multiple R-squared:  0.5264, Adjusted R-squared:  0.5258 
## F-statistic: 835.2 on 2 and 1503 DF,  p-value: < 2.2e-16

Predict the training and validation sets. Check the accuracy.

# In-sample accuracy of the combined model. Predictions are on the
# log(Wage) scale, so they are compared with the logged observed wages.
wage_model_st_log_3_pred_train <- predict(
  object = wage_model_st_log_3,
  newdata = train_df_st
)
train_df_st[["logWage"]] <- log(train_df_st[["Wage"]])

accuracy(wage_model_st_log_3_pred_train, train_df_st[["logWage"]])
##                    ME      RMSE       MAE       MPE     MAPE
## Test set 3.847045e-14 0.5764289 0.4423697 -0.388299 4.908134
# Hold-out accuracy of the combined model on the log(Wage) scale.
wage_model_st_log_3_pred_valid <- predict(
  object = wage_model_st_log_3,
  newdata = valid_df_st
)
valid_df_st[["logWage"]] <- log(valid_df_st[["Wage"]])

accuracy(wage_model_st_log_3_pred_valid, valid_df_st[["logWage"]])
##                   ME      RMSE       MAE        MPE    MAPE
## Test set 0.002845158 0.5841898 0.4527094 -0.3440159 5.01038
# Multicollinearity check: variance inflation factor per model term.
vif(wage_model_st_log_3)
##                   log(Age) I(Positioning * Composure) 
##                   1.568088                   1.568088

Predict new records

More and more new data

# Re-read the same new-player file that was loaded for the log models
# earlier, then display it.
new3 <- read.csv(file = "new3.csv", header = TRUE)
new3
##   X Age Balance ShotPower Aggression Positioning Composure
## 1 1  25      66        69         55          72        71
## 2 2  26      58        76         75          66        66
## 3 3  19      80        67         33          43        52
# Predictions and confidence bounds for the new players from the combined
# model; all three columns are on the log(Wage) scale.
wage_model_st_log_3_pred_new3 <- predict(
  object = wage_model_st_log_3,
  newdata = new3,
  interval = "confidence"
)

wage_model_st_log_3_pred_new3
##        fit      lwr      upr
## 1 9.634869 9.590987 9.678752
## 2 9.175323 9.143823 9.206824
## 3 8.146727 8.092877 8.200578

Results as a data frame (if desired), back-transformed from the log scale to the original wage scale.

# Convert the prediction matrix to a data frame, then back-transform the
# fit and interval bounds from log(Wage) to the wage scale.
# exp() is applied directly to the all-numeric data frame; this replaces
# the roundabout exp(1)^x form, which computes the same quantity through
# a generic power operation.
# NOTE(review): exponentiating a log-scale prediction estimates the median
# wage rather than the mean - confirm that is the intended quantity.
wage_model_st_log_3_pred_new3_df <-
  as.data.frame(wage_model_st_log_3_pred_new3)

wage_model_st_log_3_pred_new3_df_value <- exp(wage_model_st_log_3_pred_new3_df)

wage_model_st_log_3_pred_new3_df_value
##         fit       lwr       upr
## 1 15288.703 14632.298 15974.554
## 2  9655.891  9356.468  9964.895
## 3  3452.064  3271.086  3643.054