Regression models to predict the wages of football players.
Load the data and explore them.
# Read the raw CSV without treating its first line as a header, so the
# column titles arrive as the first data row and columns are named V1, V2, ...
football <- read.csv(file = "football_2.csv", header = FALSE)
# Preview the first ten rows.
head(x = football, n = 10)
## V1 V2 V3 V4
## 1 ID Name Age Photo
## 2 207439 L. Paredes 24 https://cdn.sofifa.org/players/4/19/207439.png
## 3 156713 A. Granqvist 33 https://cdn.sofifa.org/players/4/19/156713.png
## 4 229909 A. Lunev 26 https://cdn.sofifa.org/players/4/19/229909.png
## 5 187347 I. Smolnikov 29 https://cdn.sofifa.org/players/4/19/187347.png
## 6 153260 Hilton 40 https://cdn.sofifa.org/players/4/19/153260.png
## 7 187607 A. Dzyuba 29 https://cdn.sofifa.org/players/4/19/187607.png
## 8 204341 Lu\xcc_s Neto 30 https://cdn.sofifa.org/players/4/19/204341.png
## 9 223058 D. Kuzyaev 25 https://cdn.sofifa.org/players/4/19/223058.png
## 10 183389 G. Sio 29 https://cdn.sofifa.org/players/4/19/183389.png
## V5 V6 V7 V8
## 1 Nationality Flag Overall Potential
## 2 Argentina https://cdn.sofifa.org/flags/52.png 80 85
## 3 Sweden https://cdn.sofifa.org/flags/46.png 80 80
## 4 Russia https://cdn.sofifa.org/flags/40.png 79 81
## 5 Russia https://cdn.sofifa.org/flags/40.png 79 79
## 6 Brazil https://cdn.sofifa.org/flags/54.png 78 78
## 7 Russia https://cdn.sofifa.org/flags/40.png 78 78
## 8 Portugal https://cdn.sofifa.org/flags/38.png 77 77
## 9 Russia https://cdn.sofifa.org/flags/40.png 77 80
## 10 Ivory Coast https://cdn.sofifa.org/flags/108.png 77 77
## V9 V10 V11 V12
## 1 Club Club Logo Value Wage
## 2 https://cdn.sofifa.org/flags/52.png 5684 1602
## 3 https://cdn.sofifa.org/flags/46.png 6370 3591
## 4 https://cdn.sofifa.org/flags/40.png 5675 3672
## 5 https://cdn.sofifa.org/flags/40.png 6030 1448
## 6 Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png 6405 19799
## 7 https://cdn.sofifa.org/flags/40.png 5764 1105
## 8 https://cdn.sofifa.org/flags/38.png 6075 2836
## 9 https://cdn.sofifa.org/flags/40.png 5565 2653
## 10 https://cdn.sofifa.org/flags/108.png 5275 2138
## V13 V14 V15 V16 V17
## 1 Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2 2122 Right 2 4 4
## 3 1797 Right 2 4 2
## 4 1217 Right 1 3 1
## 5 2038 Right 2 3 3
## 6 1807 Right 2 3 3
## 7 1810 Right 2 3 3
## 8 1749 Right 1 3 2
## 9 2041 Right 1 3 3
## 10 1933 Left 2 3 3
## V18 V19 V20 V21 V22 V23
## 1 Work Rate Body Type Real Face Position Jersey Number Joined
## 2 Medium/ Medium Normal No CM 5
## 3 High/ Medium Normal No LCB 4
## 4 Medium/ Medium Normal No GK 12
## 5 High/ High Lean No RB 2
## 6 Medium/ Medium Normal Yes CB 4 1-Aug-11
## 7 High/ Medium Stocky No ST 22
## 8 Medium/ Medium Lean No CB 4
## 9 Medium/ High Lean No RM 7
## 10 High/ Low Normal No ST 21
## V24 V25 V26 V27 V28 V29 V30 V31 V32 V33
## 1 Loaned From Contract Valid Until Height Weight LS ST RS LW LF CF
## 2 5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3 6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4 6'2 176lbs
## 5 5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6 2019 5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
## 7 6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2
## 8 6'2 157lbs 52+2 52+2 52+2 51+2 51+2 51+2
## 9 6'0 163lbs 70+2 70+2 70+2 74+2 74+2 74+2
## 10 5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2
## V34 V35 V36 V37 V38 V39 V40 V41 V42 V43 V44 V45 V46 V47 V48
## 1 RF RW LAM CAM RAM LM LCM CM RCM RM LWB LDM CDM RDM RWB
## 2 75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3 58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4
## 5 72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6 59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
## 7 74+2 71+2 71+2 71+2 71+2 71+2 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2
## 8 51+2 51+2 54+2 54+2 54+2 54+2 61+2 61+2 61+2 54+2 67+2 72+2 72+2 72+2 67+2
## 9 74+2 74+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2
## 10 75+2 75+2 74+2 74+2 74+2 74+2 67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2
## V49 V50 V51 V52 V53 V54 V55 V56 V57
## 1 LB LCB CB RCB RB Crossing Finishing HeadingAccuracy ShortPassing
## 2 74+2 72+2 72+2 72+2 74+2 76 55 60 84
## 3 70+2 79+2 79+2 79+2 70+2 49 51 81 73
## 4 16 14 17 25
## 5 78+2 73+2 73+2 73+2 78+2 73 61 69 79
## 6 68+2 76+2 76+2 76+2 68+2 60 45 79 73
## 7 48+2 48+2 48+2 48+2 48+2 61 79 86 71
## 8 69+2 75+2 75+2 75+2 69+2 42 33 80 72
## 9 74+2 70+2 70+2 70+2 74+2 67 64 51 82
## 10 50+2 46+2 46+2 46+2 50+2 68 77 71 73
## V58 V59 V60 V61 V62 V63 V64
## 1 Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2 73 78 79 78 82 82 75
## 3 37 49 36 40 67 63 46
## 4 13 15 18 17 32 17 58
## 5 57 72 49 46 75 72 84
## 6 51 63 42 48 72 73 33
## 7 74 71 64 60 55 77 66
## 8 40 49 52 43 77 48 57
## 9 57 78 60 61 75 79 78
## 10 73 76 73 69 67 76 78
## V65 V66 V67 V68 V69 V70 V71 V72
## 1 SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2 69 77 74 77 82 61 79 69
## 3 49 55 76 36 74 64 67 83
## 4 54 36 76 50 24 60 27 70
## 5 90 80 75 76 67 85 93 68
## 6 38 51 70 60 55 79 54 76
## 7 65 50 75 32 78 63 77 93
## 8 59 69 78 61 42 79 72 72
## 9 81 80 73 76 76 60 79 59
## 10 85 79 71 73 77 70 78 74
## V73 V74 V75 V76 V77 V78 V79
## 1 LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2 80 79 72 74 82 57 74
## 3 59 81 82 54 49 79 78
## 4 13 26 20 11 63 15 69
## 5 57 65 71 77 72 41 73
## 6 58 76 79 50 67 64 70
## 7 68 75 30 78 73 77 70
## 8 37 76 78 44 46 47 72
## 9 74 70 74 71 70 63 64
## 10 74 77 18 76 73 72 72
## V80 V81 V82 V83 V84 V85
## 1 Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2 73 75 72 9 14 6
## 3 82 83 79 7 9 12
## 4 18 20 12 80 73 65
## 5 76 76 80 7 12 10
## 6 83 77 76 12 7 11
## 7 21 15 19 15 12 11
## 8 80 77 78 10 15 13
## 9 71 77 76 15 16 13
## 10 40 18 12 15 9 10
## V86 V87 V88
## 1 GKPositioning GKReflexes Release Clause
## 2 9 10
## 3 10 15
## 4 77 85
## 5 8 15
## 6 12 13
## 7 11 8
## 8 15 8
## 9 7 8
## 10 15 16
# Promote the first row (which holds the column titles because the file was
# read with header = FALSE) to the column names. The row is flattened to a
# plain character vector first, so the assignment does not rely on the
# implicit coercion of a one-row data frame.
names(football) <- as.character(unlist(football[1, ], use.names = FALSE))
head(football)
## ID Name Age Photo
## 1 ID Name Age Photo
## 2 207439 L. Paredes 24 https://cdn.sofifa.org/players/4/19/207439.png
## 3 156713 A. Granqvist 33 https://cdn.sofifa.org/players/4/19/156713.png
## 4 229909 A. Lunev 26 https://cdn.sofifa.org/players/4/19/229909.png
## 5 187347 I. Smolnikov 29 https://cdn.sofifa.org/players/4/19/187347.png
## 6 153260 Hilton 40 https://cdn.sofifa.org/players/4/19/153260.png
## Nationality Flag Overall Potential
## 1 Nationality Flag Overall Potential
## 2 Argentina https://cdn.sofifa.org/flags/52.png 80 85
## 3 Sweden https://cdn.sofifa.org/flags/46.png 80 80
## 4 Russia https://cdn.sofifa.org/flags/40.png 79 81
## 5 Russia https://cdn.sofifa.org/flags/40.png 79 79
## 6 Brazil https://cdn.sofifa.org/flags/54.png 78 78
## Club Club Logo Value Wage
## 1 Club Club Logo Value Wage
## 2 https://cdn.sofifa.org/flags/52.png 5684 1602
## 3 https://cdn.sofifa.org/flags/46.png 6370 3591
## 4 https://cdn.sofifa.org/flags/40.png 5675 3672
## 5 https://cdn.sofifa.org/flags/40.png 6030 1448
## 6 Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png 6405 19799
## Special Preferred Foot International Reputation Weak Foot Skill Moves
## 1 Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2 2122 Right 2 4 4
## 3 1797 Right 2 4 2
## 4 1217 Right 1 3 1
## 5 2038 Right 2 3 3
## 6 1807 Right 2 3 3
## Work Rate Body Type Real Face Position Jersey Number Joined
## 1 Work Rate Body Type Real Face Position Jersey Number Joined
## 2 Medium/ Medium Normal No CM 5
## 3 High/ Medium Normal No LCB 4
## 4 Medium/ Medium Normal No GK 12
## 5 High/ High Lean No RB 2
## 6 Medium/ Medium Normal Yes CB 4 1-Aug-11
## Loaned From Contract Valid Until Height Weight LS ST RS LW LF CF
## 1 Loaned From Contract Valid Until Height Weight LS ST RS LW LF CF
## 2 5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3 6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4 6'2 176lbs
## 5 5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6 2019 5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
## RF RW LAM CAM RAM LM LCM CM RCM RM LWB LDM CDM RDM RWB
## 1 RF RW LAM CAM RAM LM LCM CM RCM RM LWB LDM CDM RDM RWB
## 2 75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3 58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4
## 5 72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6 59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
## LB LCB CB RCB RB Crossing Finishing HeadingAccuracy ShortPassing
## 1 LB LCB CB RCB RB Crossing Finishing HeadingAccuracy ShortPassing
## 2 74+2 72+2 72+2 72+2 74+2 76 55 60 84
## 3 70+2 79+2 79+2 79+2 70+2 49 51 81 73
## 4 16 14 17 25
## 5 78+2 73+2 73+2 73+2 78+2 73 61 69 79
## 6 68+2 76+2 76+2 76+2 68+2 60 45 79 73
## Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 1 Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2 73 78 79 78 82 82 75
## 3 37 49 36 40 67 63 46
## 4 13 15 18 17 32 17 58
## 5 57 72 49 46 75 72 84
## 6 51 63 42 48 72 73 33
## SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 1 SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2 69 77 74 77 82 61 79 69
## 3 49 55 76 36 74 64 67 83
## 4 54 36 76 50 24 60 27 70
## 5 90 80 75 76 67 85 93 68
## 6 38 51 70 60 55 79 54 76
## LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 1 LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2 80 79 72 74 82 57 74
## 3 59 81 82 54 49 79 78
## 4 13 26 20 11 63 15 69
## 5 57 65 71 77 72 41 73
## 6 58 76 79 50 67 64 70
## Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 1 Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2 73 75 72 9 14 6
## 3 82 83 79 7 9 12
## 4 18 20 12 80 73 65
## 5 76 76 80 7 12 10
## 6 83 77 76 12 7 11
## GKPositioning GKReflexes Release Clause
## 1 GKPositioning GKReflexes Release Clause
## 2 9 10
## 3 10 15
## 4 77 85
## 5 8 15
## 6 12 13
# Remove the first row, which only repeats the column titles.
football <- football[-1, ]
head(football)
## ID Name Age Photo
## 2 207439 L. Paredes 24 https://cdn.sofifa.org/players/4/19/207439.png
## 3 156713 A. Granqvist 33 https://cdn.sofifa.org/players/4/19/156713.png
## 4 229909 A. Lunev 26 https://cdn.sofifa.org/players/4/19/229909.png
## 5 187347 I. Smolnikov 29 https://cdn.sofifa.org/players/4/19/187347.png
## 6 153260 Hilton 40 https://cdn.sofifa.org/players/4/19/153260.png
## 7 187607 A. Dzyuba 29 https://cdn.sofifa.org/players/4/19/187607.png
## Nationality Flag Overall Potential
## 2 Argentina https://cdn.sofifa.org/flags/52.png 80 85
## 3 Sweden https://cdn.sofifa.org/flags/46.png 80 80
## 4 Russia https://cdn.sofifa.org/flags/40.png 79 81
## 5 Russia https://cdn.sofifa.org/flags/40.png 79 79
## 6 Brazil https://cdn.sofifa.org/flags/54.png 78 78
## 7 Russia https://cdn.sofifa.org/flags/40.png 78 78
## Club Club Logo Value Wage
## 2 https://cdn.sofifa.org/flags/52.png 5684 1602
## 3 https://cdn.sofifa.org/flags/46.png 6370 3591
## 4 https://cdn.sofifa.org/flags/40.png 5675 3672
## 5 https://cdn.sofifa.org/flags/40.png 6030 1448
## 6 Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png 6405 19799
## 7 https://cdn.sofifa.org/flags/40.png 5764 1105
## Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2 2122 Right 2 4 4
## 3 1797 Right 2 4 2
## 4 1217 Right 1 3 1
## 5 2038 Right 2 3 3
## 6 1807 Right 2 3 3
## 7 1810 Right 2 3 3
## Work Rate Body Type Real Face Position Jersey Number Joined
## 2 Medium/ Medium Normal No CM 5
## 3 High/ Medium Normal No LCB 4
## 4 Medium/ Medium Normal No GK 12
## 5 High/ High Lean No RB 2
## 6 Medium/ Medium Normal Yes CB 4 1-Aug-11
## 7 High/ Medium Stocky No ST 22
## Loaned From Contract Valid Until Height Weight LS ST RS LW LF CF
## 2 5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3 6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4 6'2 176lbs
## 5 5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6 2019 5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
## 7 6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2
## RF RW LAM CAM RAM LM LCM CM RCM RM LWB LDM CDM RDM RWB
## 2 75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3 58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4
## 5 72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6 59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
## 7 74+2 71+2 71+2 71+2 71+2 71+2 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2
## LB LCB CB RCB RB Crossing Finishing HeadingAccuracy ShortPassing
## 2 74+2 72+2 72+2 72+2 74+2 76 55 60 84
## 3 70+2 79+2 79+2 79+2 70+2 49 51 81 73
## 4 16 14 17 25
## 5 78+2 73+2 73+2 73+2 78+2 73 61 69 79
## 6 68+2 76+2 76+2 76+2 68+2 60 45 79 73
## 7 48+2 48+2 48+2 48+2 48+2 61 79 86 71
## Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2 73 78 79 78 82 82 75
## 3 37 49 36 40 67 63 46
## 4 13 15 18 17 32 17 58
## 5 57 72 49 46 75 72 84
## 6 51 63 42 48 72 73 33
## 7 74 71 64 60 55 77 66
## SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2 69 77 74 77 82 61 79 69
## 3 49 55 76 36 74 64 67 83
## 4 54 36 76 50 24 60 27 70
## 5 90 80 75 76 67 85 93 68
## 6 38 51 70 60 55 79 54 76
## 7 65 50 75 32 78 63 77 93
## LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2 80 79 72 74 82 57 74
## 3 59 81 82 54 49 79 78
## 4 13 26 20 11 63 15 69
## 5 57 65 71 77 72 41 73
## 6 58 76 79 50 67 64 70
## 7 68 75 30 78 73 77 70
## Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2 73 75 72 9 14 6
## 3 82 83 79 7 9 12
## 4 18 20 12 80 73 65
## 5 76 76 80 7 12 10
## 6 83 77 76 12 7 11
## 7 21 15 19 15 12 11
## GKPositioning GKReflexes Release Clause
## 2 9 10
## 3 10 15
## 4 77 85
## 5 8 15
## 6 12 13
## 7 11 8
nrow(football)
## [1] 18207
# Count the players in each playing position.
# NOTE(review): the first, unlabelled count in the output looks like it
# belongs to an empty Position value -- confirm.
table(football$Position)
##
## CAM CB CDM CF CM GK LAM LB LCB LCM LDM LF LM LS LW
## 60 958 1778 948 74 1394 2025 21 1322 648 395 243 15 1095 207 381
## LWB RAM RB RCB RCM RDM RF RM RS RW RWB ST
## 78 21 1291 662 391 248 16 1124 203 370 87 2152
Strikers are defined in the dataset as Position = “ST”.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the strikers, i.e. the rows whose Position equals "ST".
football_st <- filter(football, Position == "ST")
head(football_st)
## ID Name Age Photo
## 1 187607 A. Dzyuba 29 https://cdn.sofifa.org/players/4/19/187607.png
## 2 183389 G. Sio 29 https://cdn.sofifa.org/players/4/19/183389.png
## 3 245683 K. Fofana 26 https://cdn.sofifa.org/players/4/19/245683.png
## 4 190461 B. Sigur̡arson 27 https://cdn.sofifa.org/players/4/19/190461.png
## 5 225900 J. Sambenito 26 https://cdn.sofifa.org/players/4/19/225900.png
## 6 246405 B. Angulo 22 https://cdn.sofifa.org/players/4/19/246405.png
## Nationality Flag Overall Potential Club
## 1 Russia https://cdn.sofifa.org/flags/40.png 78 78
## 2 Ivory Coast https://cdn.sofifa.org/flags/108.png 77 77
## 3 Ivory Coast https://cdn.sofifa.org/flags/108.png 75 75
## 4 Iceland https://cdn.sofifa.org/flags/24.png 73 74
## 5 Paraguay https://cdn.sofifa.org/flags/58.png 71 74
## 6 Ecuador https://cdn.sofifa.org/flags/57.png 71 77
## Club Logo Value Wage Special Preferred Foot
## 1 https://cdn.sofifa.org/flags/40.png 5764 1105 1810 Right
## 2 https://cdn.sofifa.org/flags/108.png 5275 2138 1933 Left
## 3 https://cdn.sofifa.org/flags/108.png 5589 3875 1877 Right
## 4 https://cdn.sofifa.org/flags/24.png 5629 3661 1893 Right
## 5 https://cdn.sofifa.org/flags/58.png 6113 2445 1651 Right
## 6 https://cdn.sofifa.org/flags/57.png 5057 2216 1628 Right
## International Reputation Weak Foot Skill Moves Work Rate Body Type
## 1 2 3 3 High/ Medium Stocky
## 2 2 3 3 High/ Low Normal
## 3 1 3 3 Medium/ Medium Normal
## 4 1 4 3 High/ High Normal
## 5 1 3 2 High/ Medium Lean
## 6 1 4 3 High/ Low Normal
## Real Face Position Jersey Number Joined Loaned From Contract Valid Until
## 1 No ST 22
## 2 No ST 21
## 3 No ST 22
## 4 No ST 9
## 5 No ST 9
## 6 No ST 19
## Height Weight LS ST RS LW LF CF RF RW LAM CAM RAM LM
## 1 6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2 74+2 71+2 71+2 71+2 71+2 71+2
## 2 5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 74+2 74+2 74+2 74+2
## 3 6'2 179lbs 73+2 73+2 73+2 71+2 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2
## 4 6'1 190lbs 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2 70+2 70+2 70+2 71+2
## 5 6'0 190lbs 70+2 70+2 70+2 64+2 67+2 67+2 67+2 64+2 63+2 63+2 63+2 62+2
## 6 6'0 154lbs 70+2 70+2 70+2 67+2 68+2 68+2 68+2 67+2 63+2 63+2 63+2 65+2
## LCM CM RCM RM LWB LDM CDM RDM RWB LB LCB CB RCB RB
## 1 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2 48+2 48+2 48+2 48+2 48+2
## 2 67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2 50+2 46+2 46+2 46+2 50+2
## 3 67+2 67+2 67+2 71+2 59+2 57+2 57+2 57+2 59+2 57+2 52+2 52+2 52+2 57+2
## 4 64+2 64+2 64+2 71+2 59+2 55+2 55+2 55+2 59+2 56+2 53+2 53+2 53+2 56+2
## 5 55+2 55+2 55+2 62+2 43+2 41+2 41+2 41+2 43+2 41+2 38+2 38+2 38+2 41+2
## 6 54+2 54+2 54+2 65+2 47+2 39+2 39+2 39+2 47+2 44+2 36+2 36+2 36+2 44+2
## Crossing Finishing HeadingAccuracy ShortPassing Volleys Dribbling Curve
## 1 61 79 86 71 74 71 64
## 2 68 77 71 73 73 76 73
## 3 66 75 72 74 74 72 63
## 4 66 71 68 68 65 73 63
## 5 40 74 72 57 72 60 64
## 6 50 78 69 56 46 76 58
## FKAccuracy LongPassing BallControl Acceleration SprintSpeed Agility Reactions
## 1 60 55 77 66 65 50 75
## 2 69 67 76 78 85 79 71
## 3 59 58 75 59 77 63 72
## 4 48 44 73 78 79 83 74
## 5 42 42 63 79 72 61 69
## 6 58 33 71 82 79 78 73
## Balance ShotPower Jumping Stamina Strength LongShots Aggression Interceptions
## 1 32 78 63 77 93 68 75 30
## 2 73 77 70 78 74 74 77 18
## 3 60 78 69 83 77 73 67 40
## 4 76 68 78 90 85 66 73 42
## 5 64 73 69 67 72 67 49 14
## 6 64 72 69 77 69 54 28 16
## Positioning Vision Penalties Composure Marking StandingTackle SlidingTackle
## 1 78 73 77 70 21 15 19
## 2 76 73 72 72 40 18 12
## 3 72 69 74 83 23 37 46
## 4 73 64 69 76 31 39 24
## 5 75 60 67 74 15 16 16
## 6 62 45 82 51 11 18 12
## GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause
## 1 15 12 11 11 8
## 2 15 9 10 15 16
## 3 7 11 7 11 14
## 4 9 12 10 15 16
## 5 15 16 15 7 7
## 6 11 8 10 7 6
nrow(football_st)
## [1] 2152
It’s a good idea to explore the data.
Convert Wage and Value to numeric.
str(football_st$Wage)
## chr [1:2152] "1105" "2138" "3875" "3661" "2445" "2216" "4457" "3370" ...
str(football_st$Value)
## chr [1:2152] "5764" "5275" "5589" "5629" "6113" "5057" "6561" "6146" ...
# Wage and Value were read as character strings; convert both to numeric.
football_st[c("Wage", "Value")] <-
  lapply(football_st[c("Wage", "Value")], as.numeric)
library(ggplot2)
library(ggpubr)
# Scatter plot of Value against Wage for the strikers, overlaid with a fitted
# linear trend line and the Pearson correlation printed on the panel.
ggplot(football_st, aes(x = Wage, y = Value)) +
  geom_point(shape = 2, colour = "black") +
  geom_smooth(method = lm) +
  stat_cor(method = "pearson", label.x = 300000, label.y = 1600) +
  labs(x = "Wage", y = "Value", title = "Wage and Value")
## `geom_smooth()` using formula = 'y ~ x'
Local (loess) smoothing, where each local fit uses 30% of the data (span = 0.3).
# Same scatter plot, but with a loess smoother (span = 0.3) in place of the
# straight line; the Pearson correlation label is kept.
ggplot(football_st, aes(x = Wage, y = Value)) +
  geom_point(shape = 2, colour = "black") +
  geom_smooth(method = loess, span = 0.3) +
  stat_cor(method = "pearson", label.x = 300000, label.y = 1600) +
  labs(x = "Wage", y = "Value", title = "Wage and Value")
## `geom_smooth()` using formula = 'y ~ x'
# Keep the six candidate predictors together with the response (Wage).
football_st_2 <- football_st[
  c("Age", "Balance", "ShotPower", "Aggression",
    "Positioning", "Composure", "Wage")
]
head(football_st_2)
## Age Balance ShotPower Aggression Positioning Composure Wage
## 1 29 32 78 75 78 70 1105
## 2 29 73 77 77 76 72 2138
## 3 26 60 78 67 72 83 3875
## 4 27 76 68 73 73 76 3661
## 5 26 64 73 49 75 74 2445
## 6 22 64 72 28 62 51 2216
Change to numeric.
# Coerce every column of the modelling subset to numeric, keeping the
# data frame structure intact.
football_st_2[] <- lapply(football_st_2, as.numeric)
Split the data into training and validation sets.
Set the seed using our favourite number :-)
set.seed(666)
Create the indices for the split. This samples the row indices to split the data into training and validation sets.
# Draw 70% of the row indices (rounded down) at random for training.
# seq_len() is used instead of 1:nrow() so an empty data frame yields an
# empty index set, and floor() makes the truncation of the fractional
# sample size explicit.
train_index <- sample(seq_len(nrow(football_st_2)),
                      size = floor(0.7 * nrow(football_st_2)))
# Every row that was not sampled for training goes to validation.
valid_index <- setdiff(seq_len(nrow(football_st_2)), train_index)
Using the indices, create the training and validation sets. This is similar in principle to splitting a data frame by row.
# Materialise the two subsets by row index; the original row names are kept.
train_df_st <- football_st_2[train_index, , drop = FALSE]
valid_df_st <- football_st_2[valid_index, , drop = FALSE]
It is a good habit to check after splitting.
nrow(train_df_st)
## [1] 1506
nrow(valid_df_st)
## [1] 646
Training the model on the training set.
# Fit a linear model of Wage on the six predictors using only the training
# rows, then print the coefficient table and the overall fit statistics.
wage_model_st_2 <- lm(Wage ~ Age + Balance + ShotPower +
Aggression + Positioning + Composure,
data = train_df_st)
summary(wage_model_st_2)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression +
## Positioning + Composure, data = train_df_st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32861 -8569 -2336 5182 347609
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -81327.50 5253.63 -15.480 < 2e-16 ***
## Age -1032.61 146.38 -7.054 2.64e-12 ***
## Balance 131.37 46.63 2.817 0.00491 **
## ShotPower 514.89 98.00 5.254 1.70e-07 ***
## Aggression 13.64 41.73 0.327 0.74380
## Positioning 692.34 107.84 6.420 1.82e-10 ***
## Composure 533.27 93.18 5.723 1.26e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20410 on 1499 degrees of freedom
## Multiple R-squared: 0.2877, Adjusted R-squared: 0.2848
## F-statistic: 100.9 on 6 and 1499 DF, p-value: < 2.2e-16
Predict the outcome (i.e. wage) of the training and validation sets using the model from the training set. Compare the errors between the training and validation sets. Check normality, residuals, multicollinearity, heteroskedasticity/homoskedasticity.
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'forecast'
## The following object is masked from 'package:ggpubr':
##
## gghistogram
# In-sample check: predict the training wages and compare with the actuals.
wage_model_st_2_pred_train <- predict(wage_model_st_2, newdata = train_df_st)
accuracy(wage_model_st_2_pred_train, train_df_st$Wage)
## ME RMSE MAE MPE MAPE
## Test set -3.02254e-10 20363.72 9804.857 -30.84404 131.6738
# Out-of-sample check: predict the held-out wages and compare with the actuals.
wage_model_st_2_pred_valid <- predict(wage_model_st_2, newdata = valid_df_st)
accuracy(wage_model_st_2_pred_valid, valid_df_st$Wage)
## ME RMSE MAE MPE MAPE
## Test set -910.8093 14653.34 9444.861 -32.5431 130.3103
# Range and standard deviation of the observed wages in each subset, to put
# the error measures above into context.
diff(range(train_df_st$Wage))
## [1] 406504
sd(train_df_st$Wage)
## [1] 24135.81
diff(range(valid_df_st$Wage))
## [1] 205030
sd(valid_df_st$Wage)
## [1] 18074.14
Multicollinearity.
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
vif(wage_model_st_2)
## Age Balance ShotPower Aggression Positioning Composure
## 1.690727 1.037080 2.900017 1.602035 3.573232 3.185606
# Pairwise correlations between all training columns (predictors and Wage),
# computed on the rows that have no missing values.
cor_matrix_2 <- cor(train_df_st, use = "complete.obs")
cor_matrix_2
## Age Balance ShotPower Aggression Positioning
## Age 1.00000000 -0.091216618 0.54097985 0.4873814 0.57297740
## Balance -0.09121662 1.000000000 -0.09505236 -0.1260503 -0.02009516
## ShotPower 0.54097985 -0.095052364 1.00000000 0.5602278 0.76578164
## Aggression 0.48738140 -0.126050298 0.56022778 1.0000000 0.49579804
## Positioning 0.57297740 -0.020095159 0.76578164 0.4957980 1.00000000
## Composure 0.57128687 -0.001725346 0.72347355 0.5123792 0.79931083
## Wage 0.18304028 0.055407602 0.44965603 0.2582019 0.48053332
## Composure Wage
## Age 0.571286868 0.1830403
## Balance -0.001725346 0.0554076
## ShotPower 0.723473546 0.4496560
## Aggression 0.512379238 0.2582019
## Positioning 0.799310831 0.4805333
## Composure 1.000000000 0.4655139
## Wage 0.465513874 1.0000000
Perform a Breusch-Pagan Test to test for heteroskedasticity/homoskedasticity.
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(wage_model_st_2)
##
## studentized Breusch-Pagan test
##
## data: wage_model_st_2
## BP = 72.484, df = 6, p-value = 1.264e-13
Normality.
# Histogram of the training wages in bins of width 1000.
ggplot(train_df_st, aes(x = Wage)) +
  geom_histogram(binwidth = 1000) +
  labs(y = "Count", title = "Distribution of wage (strikers)")
# Standardised residuals of the fitted model, one per training row.
wage_model_st_residuals <- rstandard(wage_model_st_2)
head(wage_model_st_residuals)
## 1598 638 907 873 652 1697
## -0.24905289 -0.14246223 0.23893391 0.36088574 0.09706996 -0.14061088
# Attach the residuals to the training data as an extra column so both can be
# plotted together; the new column takes the name of the residual vector.
train_df_st_comb_2 <- cbind(train_df_st, wage_model_st_residuals)
head(train_df_st_comb_2)
## Age Balance ShotPower Aggression Positioning Composure Wage
## 1598 33 63 75 62 73 73 16730
## 638 20 67 70 36 52 52 4190
## 907 18 68 53 57 53 51 5843
## 873 23 55 59 44 58 47 5704
## 652 29 72 62 56 61 55 4414
## 1697 26 68 62 53 76 64 17288
## wage_model_st_residuals
## 1598 -0.24905289
## 638 -0.14246223
## 907 0.23893391
## 873 0.36088574
## 652 0.09706996
## 1697 -0.14061088
# Standardised residuals plotted against the observed wage for the training
# rows. The axis label and title spelling is corrected ("Standardised").
ggplot(train_df_st_comb_2) + aes(x = Wage, y = wage_model_st_residuals) +
  geom_point() + xlab("Wage") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Wage Prediction")
Using the Shapiro-Wilk test.
H-0: normal distribution.
H-1: distribution is different from a normal distribution.
# Shapiro-Wilk normality test applied to the raw training wages.
# NOTE(review): the normality assumption of a linear model concerns the
# residuals rather than the response; consider also testing
# rstandard(wage_model_st_2) -- confirm intent.
shapiro.test(train_df_st$Wage)
##
## Shapiro-Wilk normality test
##
## data: train_df_st$Wage
## W = 0.36328, p-value < 2.2e-16
Predict the wages of new players.
# Load the unseen players (the file has a header row, which is the
# read.csv default) and predict their wages with confidence bounds.
new <- read.csv("new.csv")
wage_model_st_2_pred_new <- predict(
  wage_model_st_2,
  newdata = new,
  interval = "confidence"
)
wage_model_st_2_pred_new
## fit lwr upr
## 1 21523.43 18689.82 24357.04
## 2 23759.40 20030.25 27488.55
## 3 21465.21 19657.65 23272.77
Subset to include categorical variable: preferred foot
# Keep the categorical predictor, two numeric predictors and the response.
football_st_3 <- football_st[
  c("Preferred Foot", "Positioning", "Composure", "Wage")
]
head(football_st_3)
## Preferred Foot Positioning Composure Wage
## 1 Right 78 70 1105
## 2 Left 76 72 2138
## 3 Right 72 83 3875
## 4 Right 73 76 3661
## 5 Right 75 74 2445
## 6 Right 62 51 2216
# Give the first column a name without a space so it can be used directly in
# a model formula, then convert the two rating columns to numeric.
colnames(football_st_3)[1] <- "Preferred_Foot"
football_st_3[c("Positioning", "Composure")] <-
  lapply(football_st_3[c("Positioning", "Composure")], as.numeric)
# Reproducible 70/30 split of the categorical-model data.
set.seed(666)
# seq_len() replaces 1:nrow(), and floor() makes the truncation of the
# fractional sample size explicit.
train_index_3 <- sample(seq_len(nrow(football_st_3)),
                        size = floor(0.7 * nrow(football_st_3)))
# Bug fix: the validation rows are derived from train_index_3, this split's
# own training indices, rather than from train_index of the earlier split.
valid_index_3 <- setdiff(seq_len(nrow(football_st_3)), train_index_3)
train_df_st_3 <- football_st_3[train_index_3, ]
valid_df_st_3 <- football_st_3[valid_index_3, ]
# Fit Wage on the preferred foot (treated as a factor) plus Positioning and
# Composure using the training rows, then print the model summary.
wage_model_st_cat_2 <- lm(Wage ~ factor(Preferred_Foot) + Positioning +
Composure, data = train_df_st_3)
summary(wage_model_st_cat_2)
##
## Call:
## lm(formula = Wage ~ factor(Preferred_Foot) + Positioning + Composure,
## data = train_df_st_3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33599 -8588 -2271 5035 352343
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -68270.03 4142.64 -16.480 < 2e-16 ***
## factor(Preferred_Foot)Right -2040.14 1572.46 -1.297 0.195
## Positioning 787.78 97.32 8.095 1.17e-15 ***
## Composure 534.07 89.17 5.990 2.63e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20920 on 1502 degrees of freedom
## Multiple R-squared: 0.2501, Adjusted R-squared: 0.2486
## F-statistic: 167 on 3 and 1502 DF, p-value: < 2.2e-16
# In-sample check for the categorical model.
wage_model_st_cat_2_pred_train <- predict(
  wage_model_st_cat_2,
  newdata = train_df_st_3
)
accuracy(wage_model_st_cat_2_pred_train, train_df_st_3$Wage)
## ME RMSE MAE MPE MAPE
## Test set -1.589124e-10 20893.75 9883.551 -37.84999 129.0229
# Out-of-sample check for the categorical model.
wage_model_st_cat_2_pred_valid <- predict(
  wage_model_st_cat_2,
  newdata = valid_df_st_3
)
accuracy(wage_model_st_cat_2_pred_valid, valid_df_st_3$Wage)
## ME RMSE MAE MPE MAPE
## Test set -844.4194 15387.14 9700.628 -41.79669 130.5894
sd(train_df_st_3$Wage)
## [1] 24135.81
sd(valid_df_st_3$Wage)
## [1] 18074.14
Normality.
shapiro.test(train_df_st_3$Wage)
##
## Shapiro-Wilk normality test
##
## data: train_df_st_3$Wage
## W = 0.36328, p-value < 2.2e-16
Residuals.
# Standardised residuals of the categorical model, one per training row.
wage_model_st_cat_2_residuals <- rstandard(wage_model_st_cat_2)
head(wage_model_st_cat_2_residuals)
## 1598 638 907 873 652 1697
## -0.4524008 0.2758081 0.3427025 0.2498056 -0.1293316 -0.3088318
# Attach the residuals to the training data as an extra column for plotting;
# the new column takes the name of the residual vector.
train_df_st_3_comb_2 <- cbind(train_df_st_3, wage_model_st_cat_2_residuals)
head(train_df_st_3_comb_2)
## Preferred_Foot Positioning Composure Wage wage_model_st_cat_2_residuals
## 1598 Right 73 73 16730 -0.4524008
## 638 Right 52 52 4190 0.2758081
## 907 Right 53 51 5843 0.3427025
## 873 Right 58 47 5704 0.2498056
## 652 Right 61 55 4414 -0.1293316
## 1697 Right 76 64 17288 -0.3088318
Check for all predictors too.
# Standardised residuals of the categorical model plotted against the
# observed wage. The axis label and title spelling is corrected
# ("Standardised").
ggplot(train_df_st_3_comb_2) + aes(x = Wage, y = wage_model_st_cat_2_residuals) +
  geom_point() + xlab("Wage") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Wage Prediction")
Multicollinearity.
vif(wage_model_st_cat_2)
## factor(Preferred_Foot) Positioning Composure
## 1.004517 2.769896 2.776543
Heteroskedasticity/Homoskedasticity.
bptest(wage_model_st_cat_2)
##
## studentized Breusch-Pagan test
##
## data: wage_model_st_cat_2
## BP = 64.695, df = 3, p-value = 5.829e-14
new2 <- read.csv("new2.csv")
new2
## Preferred.Foot Positioning Composure
## 1 Right 64 56
## 2 Right 65 47
# Treat the preferred foot of the new players as a factor.
new2$Preferred.Foot <- as.factor(new2$Preferred.Foot)
names(new2)
## [1] "Preferred.Foot" "Positioning" "Composure"
# Rename the first column to match the predictor name used when the model
# was fitted, so predict() can find it.
names(new2)[1] <- "Preferred_Foot"
names(new2)
## [1] "Preferred_Foot" "Positioning" "Composure"
# Predict the wages of the new players with the categorical model, together
# with confidence bounds.
wage_model_st_cat_2_pred_new <- predict(
  wage_model_st_cat_2,
  newdata = new2,
  interval = "confidence"
)
wage_model_st_cat_2_pred_new
## fit lwr upr
## 1 10016.014 8776.276 11255.75
## 2 5997.149 3512.208 8482.09
Sometimes, a relationship may not be linear and additive. In this case, we can specify a non-linear relationship in the model, here through an interaction between Positioning and Composure.
# Refit the wage model with Positioning * Composure, which (as the coefficient
# table shows) expands to both main effects plus their interaction term.
wage_model_st_nl_2 <- lm(Wage ~ Age + Balance + ShotPower + Aggression +
Positioning * Composure,
data = train_df_st)
summary(wage_model_st_nl_2)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression +
## Positioning * Composure, data = train_df_st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66085 -5447 302 4870 260260
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 342265.457 17203.875 19.895 < 2e-16 ***
## Age -758.277 122.802 -6.175 8.52e-10 ***
## Balance 79.001 39.022 2.025 0.0431 *
## ShotPower 699.312 82.219 8.505 < 2e-16 ***
## Aggression 13.470 34.875 0.386 0.6994
## Positioning -5818.796 271.105 -21.463 < 2e-16 ***
## Composure -6961.508 304.442 -22.866 < 2e-16 ***
## Positioning:Composure 109.127 4.285 25.465 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17060 on 1498 degrees of freedom
## Multiple R-squared: 0.5029, Adjusted R-squared: 0.5006
## F-statistic: 216.5 on 7 and 1498 DF, p-value: < 2.2e-16
Predict the training and validation sets using the non-linear model. Check the accuracy.
# In-sample check for the interaction model.
wage_model_st_nl_2_pred_train <- predict(
  wage_model_st_nl_2,
  newdata = train_df_st
)
accuracy(wage_model_st_nl_2_pred_train, train_df_st$Wage)
## ME RMSE MAE MPE MAPE
## Test set 1.966265e-09 17011.8 8516.679 -27.10041 105.2229
# Out-of-sample check for the interaction model.
wage_model_st_nl_2_pred_valid <- predict(
  wage_model_st_nl_2,
  newdata = valid_df_st
)
accuracy(wage_model_st_nl_2_pred_valid, valid_df_st$Wage)
## ME RMSE MAE MPE MAPE
## Test set -855.2409 13504.82 8758.766 -32.40985 113.7297
Normality.
shapiro.test(train_df_st$Wage)
##
## Shapiro-Wilk normality test
##
## data: train_df_st$Wage
## W = 0.36328, p-value < 2.2e-16
Residuals.
# Standardised residuals of the interaction model, one per training row.
wage_model_st_nl_2_residuals <- rstandard(wage_model_st_nl_2)
head(wage_model_st_nl_2_residuals)
## 1598 638 907 873 652 1697
## -0.4316213 -0.4791668 0.1460100 0.1104547 0.3550138 0.1330478
# Attach the residuals to the training data as an extra column for plotting;
# the new column takes the name of the residual vector.
train_df_st_comb_3 <- cbind(train_df_st, wage_model_st_nl_2_residuals)
head(train_df_st_comb_3)
## Age Balance ShotPower Aggression Positioning Composure Wage
## 1598 33 63 75 62 73 73 16730
## 638 20 67 70 36 52 52 4190
## 907 18 68 53 57 53 51 5843
## 873 23 55 59 44 58 47 5704
## 652 29 72 62 56 61 55 4414
## 1697 26 68 62 53 76 64 17288
## wage_model_st_nl_2_residuals
## 1598 -0.4316213
## 638 -0.4791668
## 907 0.1460100
## 873 0.1104547
## 652 0.3550138
## 1697 0.1330478
Check residuals for predictors too.
# Standardised residuals of the interaction model plotted against the
# observed wage. The axis label and title spelling is corrected
# ("Standardised").
ggplot(train_df_st_comb_3) + aes(x = Wage, y = wage_model_st_nl_2_residuals) +
  geom_point() + xlab("Wage") + ylab("Standardised Residuals") +
  ggtitle("Standardised Residual Plot, Wage Prediction")
Multicollinearity. Expected, due to interaction term.
# Variance inflation factors per model term; vif() flags the interaction term
# in the message printed below.
vif(wage_model_st_nl_2)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
## Age Balance ShotPower
## 1.703838 1.039969 2.922692
## Aggression Positioning Composure
## 1.602035 32.336863 48.694956
## Positioning:Composure
## 128.868532
Heteroskedasticity/Homoskedasticity.
# Studentized Breusch-Pagan test of the fitted model for non-constant
# residual variance.
bptest(wage_model_st_nl_2)
##
## studentized Breusch-Pagan test
##
## data: wage_model_st_nl_2
## BP = 286.28, df = 7, p-value < 2.2e-16
Predict the wages of new players using the non-linear model.
# Fitted wages for the rows of `new`, with interval = "confidence" bounds
# (the lwr/upr columns printed below).
# NOTE(review): a confidence interval describes the mean response; if bounds
# for an individual player's wage are wanted, interval = "prediction" is the
# usual choice. Confirm intent.
wage_model_st_nl_2_pred_new <- predict(wage_model_st_nl_2,
newdata = new, interval = "confidence")
wage_model_st_nl_2_pred_new
## fit lwr upr
## 1 14285.88 11853.154 16718.61
## 2 12719.88 9489.583 15950.18
## 3 17129.30 15582.278 18676.32
# Variant that enters Positioning and Composure only through their product
# (I() makes `*` plain multiplication), without separate main-effect terms.
wage_model_st_nl_3 <- lm(
  formula = Wage ~ Age + Balance + ShotPower + Aggression +
    I(Positioning * Composure),
  data = train_df_st
)
summary(wage_model_st_nl_3)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression +
## I(Positioning * Composure), data = train_df_st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36259 -8218 -1295 5576 333818
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.640e+04 5.645e+03 -6.448 1.53e-10 ***
## Age -1.132e+03 1.416e+02 -7.994 2.58e-15 ***
## Balance 1.068e+02 4.538e+01 2.354 0.0187 *
## ShotPower 3.801e+02 9.288e+01 4.093 4.49e-05 ***
## Aggression -3.070e+00 4.051e+01 -0.076 0.9396
## I(Positioning * Composure) 1.154e+01 7.433e-01 15.525 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19850 on 1500 degrees of freedom
## Multiple R-squared: 0.3259, Adjusted R-squared: 0.3237
## F-statistic: 145.1 on 5 and 1500 DF, p-value: < 2.2e-16
# Variance inflation factors for the product-term model.
vif(wage_model_st_nl_3)
## Age Balance
## 1.672602 1.038758
## ShotPower Aggression
## 2.754568 1.596489
## I(Positioning * Composure)
## 2.862857
Stepwise selection can be applied to other linear regression models too.
# Stepwise selection by AIC starting from the interaction model; terms may be
# dropped and re-added (direction = "both").
wage_model_st_nl_2_step <- step(
  object = wage_model_st_nl_2,
  direction = "both"
)
## Start: AIC=29357.89
## Wage ~ Age + Balance + ShotPower + Aggression + Positioning *
## Composure
##
## Df Sum of Sq RSS AIC
## - Aggression 1 4.3404e+07 4.3588e+11 29356
## <none> 4.3584e+11 29358
## - Balance 1 1.1925e+09 4.3703e+11 29360
## - Age 1 1.1093e+10 4.4693e+11 29394
## - ShotPower 1 2.1048e+10 4.5689e+11 29427
## - Positioning:Composure 1 1.8867e+11 6.2451e+11 29898
##
## Step: AIC=29356.04
## Wage ~ Age + Balance + ShotPower + Positioning + Composure +
## Positioning:Composure
##
## Df Sum of Sq RSS AIC
## <none> 4.3588e+11 29356
## + Aggression 1 4.3404e+07 4.3584e+11 29358
## - Balance 1 1.1606e+09 4.3704e+11 29358
## - Age 1 1.1307e+10 4.4719e+11 29393
## - ShotPower 1 2.2847e+10 4.5873e+11 29431
## - Positioning:Composure 1 1.8867e+11 6.2455e+11 29896
# Coefficient table and fit statistics for the model returned by step().
summary(wage_model_st_nl_2_step)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Positioning +
## Composure + Positioning:Composure, data = train_df_st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66274 -5469 260 4921 260121
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 342109.131 17194.232 19.897 < 2e-16 ***
## Age -748.254 119.996 -6.236 5.84e-10 ***
## Balance 77.598 38.841 1.998 0.0459 *
## ShotPower 706.988 79.759 8.864 < 2e-16 ***
## Positioning -5818.926 271.028 -21.470 < 2e-16 ***
## Composure -6958.216 304.237 -22.871 < 2e-16 ***
## Positioning:Composure 109.128 4.284 25.472 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17050 on 1499 degrees of freedom
## Multiple R-squared: 0.5028, Adjusted R-squared: 0.5008
## F-statistic: 252.7 on 6 and 1499 DF, p-value: < 2.2e-16
Predict the training and validation sets using the stepwise, non-linear model. Check the accuracy.
# In-sample check of the stepwise model against the observed training wage.
wage_model_st_nl_2_step_pred_train <- predict(
  wage_model_st_nl_2_step,
  newdata = train_df_st
)
accuracy(wage_model_st_nl_2_step_pred_train, train_df_st$Wage)
## ME RMSE MAE MPE MAPE
## Test set 1.982657e-09 17012.65 8519.261 -27.09032 105.1927
# Out-of-sample check of the stepwise model on the validation rows.
wage_model_st_nl_2_step_pred_valid <- predict(
  wage_model_st_nl_2_step,
  newdata = valid_df_st
)
accuracy(wage_model_st_nl_2_step_pred_valid, valid_df_st$Wage)
## ME RMSE MAE MPE MAPE
## Test set -852.6068 13509.38 8763.627 -32.34321 113.7652
Check normality, residuals, multicollinearity, heteroskedasticity/homoskedasticity.
Normality.
# Shapiro-Wilk test on the observed training wages (the response itself, not
# the model residuals).
shapiro.test(train_df_st$Wage)
##
## Shapiro-Wilk normality test
##
## data: train_df_st$Wage
## W = 0.36328, p-value < 2.2e-16
Residuals.
# Standardised residuals of the stepwise model, attached to the training data
# as an extra column for plotting.
wage_model_st_nl_2_step_residuals <- rstandard(model = wage_model_st_nl_2_step)
train_df_st_comb_4 <- cbind(
  train_df_st,
  wage_model_st_nl_2_step_residuals = wage_model_st_nl_2_step_residuals
)
head(train_df_st_comb_4, n = 6L)
## Age Balance ShotPower Aggression Positioning Composure Wage
## 1598 33 63 75 62 73 73 16730
## 638 20 67 70 36 52 52 4190
## 907 18 68 53 57 53 51 5843
## 873 23 55 59 44 58 47 5704
## 652 29 72 62 56 61 55 4414
## 1697 26 68 62 53 76 64 17288
## wage_model_st_nl_2_step_residuals
## 1598 -0.4351998
## 638 -0.4890199
## 907 0.1618064
## 873 0.1101748
## 652 0.3592566
## 1697 0.1346568
Check residuals for predictors too.
# Scatter of the standardised residuals of wage_model_st_nl_2_step against the
# observed wage. Axis label and title spelling corrected ("Standardised").
ggplot(
  train_df_st_comb_4,
  aes(x = Wage, y = wage_model_st_nl_2_step_residuals)
) +
  geom_point() +
  labs(
    x = "Wage",
    y = "Standardised Residuals",
    title = "Standardised Residual Plot, Wage Prediction"
  )
Multicollinearity.
Expected, due to interaction term.
# Variance inflation factors per term of the stepwise model; vif() flags the
# interaction term in the message printed below.
vif(wage_model_st_nl_2_step)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
## Age Balance ShotPower
## 1.627768 1.030962 2.751958
## Positioning Composure Positioning:Composure
## 32.336813 48.656781 128.868527
Heteroskedasticity/Homoskedasticity.
# Studentized Breusch-Pagan test of the stepwise model for non-constant
# residual variance.
bptest(wage_model_st_nl_2_step)
##
## studentized Breusch-Pagan test
##
## data: wage_model_st_nl_2_step
## BP = 284.9, df = 6, p-value < 2.2e-16
Predict the wages of new players using the stepwise non-linear model.
# Fitted wages for the rows of `new` from the stepwise model, with
# interval = "confidence" bounds (the lwr/upr columns printed below).
# NOTE(review): for bounds on an individual player's wage rather than the mean
# response, interval = "prediction" is the usual choice. Confirm intent.
wage_model_st_nl_2_step_pred_new <- predict(wage_model_st_nl_2_step,
newdata = new, interval = "confidence")
wage_model_st_nl_2_step_pred_new
## fit lwr upr
## 1 14529.97 12437.349 16622.59
## 2 12274.73 9968.014 14581.45
## 3 17001.40 15597.748 18405.04
# Stepwise selection by AIC starting from the product-term model; terms may be
# dropped and re-added (direction = "both").
wage_model_st_nl_3_step <- step(
  object = wage_model_st_nl_3,
  direction = "both"
)
## Start: AIC=29812.43
## Wage ~ Age + Balance + ShotPower + Aggression + I(Positioning *
## Composure)
##
## Df Sum of Sq RSS AIC
## - Aggression 1 2.2629e+06 5.9096e+11 29810
## <none> 5.9096e+11 29812
## - Balance 1 2.1828e+09 5.9314e+11 29816
## - ShotPower 1 6.5994e+09 5.9756e+11 29827
## - Age 1 2.5177e+10 6.1614e+11 29873
## - I(Positioning * Composure) 1 9.4958e+10 6.8592e+11 30035
##
## Step: AIC=29810.43
## Wage ~ Age + Balance + ShotPower + I(Positioning * Composure)
##
## Df Sum of Sq RSS AIC
## <none> 5.9096e+11 29810
## + Aggression 1 2.2629e+06 5.9096e+11 29812
## - Balance 1 2.2151e+09 5.9318e+11 29814
## - ShotPower 1 6.9513e+09 5.9791e+11 29826
## - Age 1 2.6501e+10 6.1746e+11 29875
## - I(Positioning * Composure) 1 9.6036e+10 6.8700e+11 30035
# Coefficient table and fit statistics for the model returned by step().
summary(wage_model_st_nl_3_step)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + I(Positioning *
## Composure), data = train_df_st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36310 -8221 -1293 5571 333882
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.638e+04 5.639e+03 -6.451 1.49e-10 ***
## Age -1.134e+03 1.382e+02 -8.204 4.94e-16 ***
## Balance 1.071e+02 4.517e+01 2.372 0.0178 *
## ShotPower 3.784e+02 9.006e+01 4.202 2.80e-05 ***
## I(Positioning * Composure) 1.153e+01 7.384e-01 15.618 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19840 on 1501 degrees of freedom
## Multiple R-squared: 0.3259, Adjusted R-squared: 0.3241
## F-statistic: 181.5 on 4 and 1501 DF, p-value: < 2.2e-16
# Variance inflation factors for the stepwise product-term model.
vif(wage_model_st_nl_3_step)
## Age Balance
## 1.595555 1.029752
## ShotPower I(Positioning * Composure)
## 2.591612 2.827630
Sometimes, the data need to be transformed. A common transformation is the log transformation.
# Histogram of Wage in football_st_2, drawn on a log10 x-axis.
ggplot(football_st_2, aes(x = Wage)) +
  geom_histogram(binwidth = 0.01) +
  scale_x_log10() +
  labs(y = "Count", title = "Distribution of log(wage) (strikers)")
# Log-log specification: the response and every predictor enter as natural
# logs.
wage_model_st_log_2 <- lm(
  formula = log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) +
    log(Aggression) + log(Positioning) + log(Composure),
  data = train_df_st
)
summary(wage_model_st_log_2)
##
## Call:
## lm(formula = log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) +
## log(Aggression) + log(Positioning) + log(Composure), data = train_df_st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.55406 -0.38093 -0.03289 0.35960 2.32610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.67383 0.59662 -16.214 < 2e-16 ***
## log(Age) -0.69453 0.10718 -6.480 1.24e-10 ***
## log(Balance) 0.28514 0.07870 3.623 0.000301 ***
## log(ShotPower) 1.49775 0.17843 8.394 < 2e-16 ***
## log(Aggression) 0.07195 0.05938 1.212 0.225846
## log(Positioning) 1.87362 0.19243 9.737 < 2e-16 ***
## log(Composure) 1.31511 0.15334 8.577 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5945 on 1499 degrees of freedom
## Multiple R-squared: 0.4985, Adjusted R-squared: 0.4965
## F-statistic: 248.3 on 6 and 1499 DF, p-value: < 2.2e-16
Predict the training and validation sets. Check the accuracy.
# The model predicts log(Wage), so the observed wage is logged before the
# accuracy comparison. Training rows first.
wage_model_st_log_2_pred_train <- predict(
  wage_model_st_log_2,
  newdata = train_df_st
)
train_df_st[["logWage"]] <- log(train_df_st[["Wage"]])
accuracy(wage_model_st_log_2_pred_train, train_df_st[["logWage"]])
## ME RMSE MAE MPE MAPE
## Test set 4.312485e-14 0.5931541 0.4587753 -0.4038503 5.089574
# Same comparison on the validation rows.
wage_model_st_log_2_pred_valid <- predict(
  wage_model_st_log_2,
  newdata = valid_df_st
)
valid_df_st[["logWage"]] <- log(valid_df_st[["Wage"]])
accuracy(wage_model_st_log_2_pred_valid, valid_df_st[["logWage"]])
## ME RMSE MAE MPE MAPE
## Test set 0.0002915404 0.5808893 0.4554191 -0.3779982 5.050841
Normality.
# Shapiro-Wilk test on the log-transformed training wages: this model is
# fitted on log(Wage), so the logged response is the relevant scale (this
# still tests the response, not the model residuals).
shapiro.test(train_df_st$logWage)
##
## Shapiro-Wilk normality test
##
## data: train_df_st$logWage
## W = 0.929, p-value < 2.2e-16
Residuals.
# Standardised residuals of the log-log model.
wage_model_st_log_2_residuals <- rstandard(model = wage_model_st_log_2)
head(wage_model_st_log_2_residuals, n = 6L)
## 1598 638 907 873 652 1697
## 0.3389765 -0.5477587 0.5128142 0.5174893 -0.4341683 0.7426560
# Attach the residuals to the training data as an extra column for plotting.
train_df_st_comb_5 <- cbind(
  train_df_st,
  wage_model_st_log_2_residuals = wage_model_st_log_2_residuals
)
head(train_df_st_comb_5, n = 6L)
## Age Balance ShotPower Aggression Positioning Composure Wage logWage
## 1598 33 63 75 62 73 73 16730 9.724959
## 638 20 67 70 36 52 52 4190 8.340456
## 907 18 68 53 57 53 51 5843 8.673000
## 873 23 55 59 44 58 47 5704 8.648923
## 652 29 72 62 56 61 55 4414 8.392537
## 1697 26 68 62 53 76 64 17288 9.757768
## wage_model_st_log_2_residuals
## 1598 0.3389765
## 638 -0.5477587
## 907 0.5128142
## 873 0.5174893
## 652 -0.4341683
## 1697 0.7426560
Check residuals for predictors too.
# Scatter of the standardised residuals of wage_model_st_log_2 against the
# observed wage. Axis label and title spelling corrected ("Standardised").
ggplot(train_df_st_comb_5, aes(x = Wage, y = wage_model_st_log_2_residuals)) +
  geom_point() +
  labs(
    x = "Wage",
    y = "Standardised Residuals",
    title = "Standardised Residual Plot, Wage Prediction"
  )
Multicollinearity.
# Variance inflation factors for the logged predictors.
vif(wage_model_st_log_2)
## log(Age) log(Balance) log(ShotPower) log(Aggression)
## 1.763983 1.035522 2.828829 1.603185
## log(Positioning) log(Composure)
## 3.343780 2.936552
Heteroskedasticity/Homoskedasticity.
# Studentized Breusch-Pagan test of the log-log model for non-constant
# residual variance.
bptest(wage_model_st_log_2)
##
## studentized Breusch-Pagan test
##
## data: wage_model_st_log_2
## BP = 122.58, df = 6, p-value < 2.2e-16
Predict new records.
# Load the new player records to score; header = TRUE takes the column names
# from the first row of the file.
new3 <- read.csv("new3.csv", header = TRUE)
new3
## X Age Balance ShotPower Aggression Positioning Composure
## 1 1 25 66 69 55 72 71
## 2 2 26 58 76 75 66 66
## 3 3 19 80 67 33 43 52
# Predictions for the rows of new3 with interval = "confidence" bounds. The
# model's response is log(Wage), so fit/lwr/upr below are on the log scale.
wage_model_st_log_2_pred_new3 <- predict(wage_model_st_log_2,
newdata = new3, interval = "confidence")
wage_model_st_log_2_pred_new3
## fit lwr upr
## 1 9.533908 9.484214 9.583602
## 2 9.377799 9.315354 9.440244
## 3 8.323197 8.170023 8.476371
Results as a data frame, back-transformed from the log scale to wages (if desired).
# Collect fit/lwr/upr in a data frame and back-transform them from the log
# scale to wage units. exp(x) is the direct form of exp(1)^x and applies
# element-wise to every column of the data frame.
wage_model_st_log_2_pred_new3_df <- as.data.frame(wage_model_st_log_2_pred_new3)
wage_model_st_log_2_pred_new3_df_value <- exp(wage_model_st_log_2_pred_new3_df)
wage_model_st_log_2_pred_new3_df_value
## fit lwr upr
## 1 13820.495 13150.482 14524.645
## 2 11822.968 11107.261 12584.791
## 3 4118.303 3533.423 4799.997
We can combine different tweaks, e.g. a log-transformed wage with the Positioning-Composure product term.
# Logged wage regressed on logged age and the raw Positioning x Composure
# product (I() makes `*` plain multiplication).
wage_model_st_log_3 <- lm(
  formula = log(Wage) ~ log(Age) + I(Positioning * Composure),
  data = train_df_st
)
summary(wage_model_st_log_3)
##
## Call:
## lm(formula = log(Wage) ~ log(Age) + I(Positioning * Composure),
## data = train_df_st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.73661 -0.35786 -0.00011 0.35532 2.05304
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.666e+00 2.793e-01 31.03 < 2e-16 ***
## log(Age) -6.139e-01 9.807e-02 -6.26 5.02e-10 ***
## I(Positioning * Composure) 5.760e-04 1.599e-05 36.02 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.577 on 1503 degrees of freedom
## Multiple R-squared: 0.5264, Adjusted R-squared: 0.5258
## F-statistic: 835.2 on 2 and 1503 DF, p-value: < 2.2e-16
Predict the training and validation sets. Check the accuracy.
# The model predicts log(Wage), so the observed wage is logged before the
# accuracy comparison. Training rows first.
wage_model_st_log_3_pred_train <- predict(
  wage_model_st_log_3,
  newdata = train_df_st
)
train_df_st[["logWage"]] <- log(train_df_st[["Wage"]])
accuracy(wage_model_st_log_3_pred_train, train_df_st[["logWage"]])
## ME RMSE MAE MPE MAPE
## Test set 3.847045e-14 0.5764289 0.4423697 -0.388299 4.908134
# Same comparison on the validation rows.
wage_model_st_log_3_pred_valid <- predict(
  wage_model_st_log_3,
  newdata = valid_df_st
)
valid_df_st[["logWage"]] <- log(valid_df_st[["Wage"]])
accuracy(wage_model_st_log_3_pred_valid, valid_df_st[["logWage"]])
## ME RMSE MAE MPE MAPE
## Test set 0.002845158 0.5841898 0.4527094 -0.3440159 5.01038
Normality.
# Shapiro-Wilk test on the log-transformed training wages (the response on the
# scale this model is fitted on, not its residuals).
shapiro.test(train_df_st$logWage)
##
## Shapiro-Wilk normality test
##
## data: train_df_st$logWage
## W = 0.929, p-value < 2.2e-16
Residuals.
# Standardised residuals of the combined log / product-term model.
wage_model_st_log_3_residuals <- rstandard(model = wage_model_st_log_3)
head(wage_model_st_log_3_residuals, n = 6L)
## 1598 638 907 873 652 1697
## 0.23518684 -0.07693547 0.38879657 0.58491840 -0.24147460 0.50256918
# Attach the residuals to the training data as an extra column for plotting.
train_df_st_comb_6 <- cbind(
  train_df_st,
  wage_model_st_log_3_residuals = wage_model_st_log_3_residuals
)
head(train_df_st_comb_6, n = 6L)
## Age Balance ShotPower Aggression Positioning Composure Wage logWage
## 1598 33 63 75 62 73 73 16730 9.724959
## 638 20 67 70 36 52 52 4190 8.340456
## 907 18 68 53 57 53 51 5843 8.673000
## 873 23 55 59 44 58 47 5704 8.648923
## 652 29 72 62 56 61 55 4414 8.392537
## 1697 26 68 62 53 76 64 17288 9.757768
## wage_model_st_log_3_residuals
## 1598 0.23518684
## 638 -0.07693547
## 907 0.38879657
## 873 0.58491840
## 652 -0.24147460
## 1697 0.50256918
Check residuals for predictors too.
# Scatter of the standardised residuals of wage_model_st_log_3 against the
# observed wage. Axis label and title spelling corrected ("Standardised").
ggplot(train_df_st_comb_6, aes(x = Wage, y = wage_model_st_log_3_residuals)) +
  geom_point() +
  labs(
    x = "Wage",
    y = "Standardised Residuals",
    title = "Standardised Residual Plot, Wage Prediction"
  )
Multicollinearity.
# Variance inflation factors for the two terms of the combined model.
vif(wage_model_st_log_3)
## log(Age) I(Positioning * Composure)
## 1.568088 1.568088
Heteroskedasticity/Homoskedasticity.
# Studentized Breusch-Pagan test of the combined model for non-constant
# residual variance.
bptest(wage_model_st_log_3)
##
## studentized Breusch-Pagan test
##
## data: wage_model_st_log_3
## BP = 148.44, df = 2, p-value < 2.2e-16
Predict new records.
# Load the new player records to score (the same new3.csv read for the
# previous model); header = TRUE takes the column names from the first row.
new3 <- read.csv("new3.csv", header = TRUE)
new3
## X Age Balance ShotPower Aggression Positioning Composure
## 1 1 25 66 69 55 72 71
## 2 2 26 58 76 75 66 66
## 3 3 19 80 67 33 43 52
# Predictions for the rows of new3 with interval = "confidence" bounds. The
# model's response is log(Wage), so fit/lwr/upr below are on the log scale.
wage_model_st_log_3_pred_new3 <- predict(wage_model_st_log_3,
newdata = new3, interval = "confidence")
wage_model_st_log_3_pred_new3
## fit lwr upr
## 1 9.634869 9.590987 9.678752
## 2 9.175323 9.143823 9.206824
## 3 8.146727 8.092877 8.200578
Results as a data frame, back-transformed from the log scale to wages (if desired).
# Collect fit/lwr/upr in a data frame and back-transform them from the log
# scale to wage units. exp(x) is the direct form of exp(1)^x and applies
# element-wise to every column of the data frame.
wage_model_st_log_3_pred_new3_df <- as.data.frame(wage_model_st_log_3_pred_new3)
wage_model_st_log_3_pred_new3_df_value <- exp(wage_model_st_log_3_pred_new3_df)
wage_model_st_log_3_pred_new3_df_value
## fit lwr upr
## 1 15288.703 14632.298 15974.554
## 2 9655.891 9356.468 9964.895
## 3 3452.064 3271.086 3643.054