Regression models to predict the wages of football players.
Load the data and explore them.
# Read the raw CSV without treating its first line as a header: the column
# titles therefore arrive as row 1 and the columns are named V1, V2, ...
football <- utils::read.csv(file = "football_2.csv", header = FALSE)
# Preview the first ten rows (the title row included).
head(x = football, n = 10)
## V1 V2 V3 V4
## 1 ID Name Age Photo
## 2 207439 L. Paredes 24 https://cdn.sofifa.org/players/4/19/207439.png
## 3 156713 A. Granqvist 33 https://cdn.sofifa.org/players/4/19/156713.png
## 4 229909 A. Lunev 26 https://cdn.sofifa.org/players/4/19/229909.png
## 5 187347 I. Smolnikov 29 https://cdn.sofifa.org/players/4/19/187347.png
## 6 153260 Hilton 40 https://cdn.sofifa.org/players/4/19/153260.png
## 7 187607 A. Dzyuba 29 https://cdn.sofifa.org/players/4/19/187607.png
## 8 204341 LuÌ_s Neto 30 https://cdn.sofifa.org/players/4/19/204341.png
## 9 223058 D. Kuzyaev 25 https://cdn.sofifa.org/players/4/19/223058.png
## 10 183389 G. Sio 29 https://cdn.sofifa.org/players/4/19/183389.png
## V5 V6 V7 V8
## 1 Nationality Flag Overall Potential
## 2 Argentina https://cdn.sofifa.org/flags/52.png 80 85
## 3 Sweden https://cdn.sofifa.org/flags/46.png 80 80
## 4 Russia https://cdn.sofifa.org/flags/40.png 79 81
## 5 Russia https://cdn.sofifa.org/flags/40.png 79 79
## 6 Brazil https://cdn.sofifa.org/flags/54.png 78 78
## 7 Russia https://cdn.sofifa.org/flags/40.png 78 78
## 8 Portugal https://cdn.sofifa.org/flags/38.png 77 77
## 9 Russia https://cdn.sofifa.org/flags/40.png 77 80
## 10 Ivory Coast https://cdn.sofifa.org/flags/108.png 77 77
## V9 V10 V11 V12
## 1 Club Club Logo Value Wage
## 2 https://cdn.sofifa.org/flags/52.png 5684 1602
## 3 https://cdn.sofifa.org/flags/46.png 6370 3591
## 4 https://cdn.sofifa.org/flags/40.png 5675 3672
## 5 https://cdn.sofifa.org/flags/40.png 6030 1448
## 6 Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png 6405 19799
## 7 https://cdn.sofifa.org/flags/40.png 5764 1105
## 8 https://cdn.sofifa.org/flags/38.png 6075 2836
## 9 https://cdn.sofifa.org/flags/40.png 5565 2653
## 10 https://cdn.sofifa.org/flags/108.png 5275 2138
## V13 V14 V15 V16 V17
## 1 Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2 2122 Right 2 4 4
## 3 1797 Right 2 4 2
## 4 1217 Right 1 3 1
## 5 2038 Right 2 3 3
## 6 1807 Right 2 3 3
## 7 1810 Right 2 3 3
## 8 1749 Right 1 3 2
## 9 2041 Right 1 3 3
## 10 1933 Left 2 3 3
## V18 V19 V20 V21 V22 V23
## 1 Work Rate Body Type Real Face Position Jersey Number Joined
## 2 Medium/ Medium Normal No CM 5
## 3 High/ Medium Normal No LCB 4
## 4 Medium/ Medium Normal No GK 12
## 5 High/ High Lean No RB 2
## 6 Medium/ Medium Normal Yes CB 4 1-Aug-11
## 7 High/ Medium Stocky No ST 22
## 8 Medium/ Medium Lean No CB 4
## 9 Medium/ High Lean No RM 7
## 10 High/ Low Normal No ST 21
## V24 V25 V26 V27 V28 V29 V30 V31 V32 V33
## 1 Loaned From Contract Valid Until Height Weight LS ST RS LW LF CF
## 2 5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3 6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4 6'2 176lbs
## 5 5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6 2019 5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
## 7 6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2
## 8 6'2 157lbs 52+2 52+2 52+2 51+2 51+2 51+2
## 9 6'0 163lbs 70+2 70+2 70+2 74+2 74+2 74+2
## 10 5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2
## V34 V35 V36 V37 V38 V39 V40 V41 V42 V43 V44 V45 V46 V47 V48
## 1 RF RW LAM CAM RAM LM LCM CM RCM RM LWB LDM CDM RDM RWB
## 2 75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3 58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4
## 5 72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6 59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
## 7 74+2 71+2 71+2 71+2 71+2 71+2 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2
## 8 51+2 51+2 54+2 54+2 54+2 54+2 61+2 61+2 61+2 54+2 67+2 72+2 72+2 72+2 67+2
## 9 74+2 74+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2
## 10 75+2 75+2 74+2 74+2 74+2 74+2 67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2
## V49 V50 V51 V52 V53 V54 V55 V56 V57
## 1 LB LCB CB RCB RB Crossing Finishing HeadingAccuracy ShortPassing
## 2 74+2 72+2 72+2 72+2 74+2 76 55 60 84
## 3 70+2 79+2 79+2 79+2 70+2 49 51 81 73
## 4 16 14 17 25
## 5 78+2 73+2 73+2 73+2 78+2 73 61 69 79
## 6 68+2 76+2 76+2 76+2 68+2 60 45 79 73
## 7 48+2 48+2 48+2 48+2 48+2 61 79 86 71
## 8 69+2 75+2 75+2 75+2 69+2 42 33 80 72
## 9 74+2 70+2 70+2 70+2 74+2 67 64 51 82
## 10 50+2 46+2 46+2 46+2 50+2 68 77 71 73
## V58 V59 V60 V61 V62 V63 V64
## 1 Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2 73 78 79 78 82 82 75
## 3 37 49 36 40 67 63 46
## 4 13 15 18 17 32 17 58
## 5 57 72 49 46 75 72 84
## 6 51 63 42 48 72 73 33
## 7 74 71 64 60 55 77 66
## 8 40 49 52 43 77 48 57
## 9 57 78 60 61 75 79 78
## 10 73 76 73 69 67 76 78
## V65 V66 V67 V68 V69 V70 V71 V72
## 1 SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2 69 77 74 77 82 61 79 69
## 3 49 55 76 36 74 64 67 83
## 4 54 36 76 50 24 60 27 70
## 5 90 80 75 76 67 85 93 68
## 6 38 51 70 60 55 79 54 76
## 7 65 50 75 32 78 63 77 93
## 8 59 69 78 61 42 79 72 72
## 9 81 80 73 76 76 60 79 59
## 10 85 79 71 73 77 70 78 74
## V73 V74 V75 V76 V77 V78 V79
## 1 LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2 80 79 72 74 82 57 74
## 3 59 81 82 54 49 79 78
## 4 13 26 20 11 63 15 69
## 5 57 65 71 77 72 41 73
## 6 58 76 79 50 67 64 70
## 7 68 75 30 78 73 77 70
## 8 37 76 78 44 46 47 72
## 9 74 70 74 71 70 63 64
## 10 74 77 18 76 73 72 72
## V80 V81 V82 V83 V84 V85
## 1 Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2 73 75 72 9 14 6
## 3 82 83 79 7 9 12
## 4 18 20 12 80 73 65
## 5 76 76 80 7 12 10
## 6 83 77 76 12 7 11
## 7 21 15 19 15 12 11
## 8 80 77 78 10 15 13
## 9 71 77 76 15 16 13
## 10 40 18 12 15 9 10
## V86 V87 V88
## 1 GKPositioning GKReflexes Release Clause
## 2 9 10
## 3 10 15
## 4 77 85
## 5 8 15
## 6 12 13
## 7 11 8
## 8 15 8
## 9 7 8
## 10 15 16
# Promote the first row (the column titles that were read in as data) to be
# the column names of the data frame.
colnames(football) <- football[1, ]
head(football, n = 6)
## ID Name Age Photo
## 1 ID Name Age Photo
## 2 207439 L. Paredes 24 https://cdn.sofifa.org/players/4/19/207439.png
## 3 156713 A. Granqvist 33 https://cdn.sofifa.org/players/4/19/156713.png
## 4 229909 A. Lunev 26 https://cdn.sofifa.org/players/4/19/229909.png
## 5 187347 I. Smolnikov 29 https://cdn.sofifa.org/players/4/19/187347.png
## 6 153260 Hilton 40 https://cdn.sofifa.org/players/4/19/153260.png
## Nationality Flag Overall Potential
## 1 Nationality Flag Overall Potential
## 2 Argentina https://cdn.sofifa.org/flags/52.png 80 85
## 3 Sweden https://cdn.sofifa.org/flags/46.png 80 80
## 4 Russia https://cdn.sofifa.org/flags/40.png 79 81
## 5 Russia https://cdn.sofifa.org/flags/40.png 79 79
## 6 Brazil https://cdn.sofifa.org/flags/54.png 78 78
## Club Club Logo Value Wage
## 1 Club Club Logo Value Wage
## 2 https://cdn.sofifa.org/flags/52.png 5684 1602
## 3 https://cdn.sofifa.org/flags/46.png 6370 3591
## 4 https://cdn.sofifa.org/flags/40.png 5675 3672
## 5 https://cdn.sofifa.org/flags/40.png 6030 1448
## 6 Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png 6405 19799
## Special Preferred Foot International Reputation Weak Foot Skill Moves
## 1 Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2 2122 Right 2 4 4
## 3 1797 Right 2 4 2
## 4 1217 Right 1 3 1
## 5 2038 Right 2 3 3
## 6 1807 Right 2 3 3
## Work Rate Body Type Real Face Position Jersey Number Joined
## 1 Work Rate Body Type Real Face Position Jersey Number Joined
## 2 Medium/ Medium Normal No CM 5
## 3 High/ Medium Normal No LCB 4
## 4 Medium/ Medium Normal No GK 12
## 5 High/ High Lean No RB 2
## 6 Medium/ Medium Normal Yes CB 4 1-Aug-11
## Loaned From Contract Valid Until Height Weight LS ST RS LW LF CF
## 1 Loaned From Contract Valid Until Height Weight LS ST RS LW LF CF
## 2 5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3 6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4 6'2 176lbs
## 5 5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6 2019 5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
## RF RW LAM CAM RAM LM LCM CM RCM RM LWB LDM CDM RDM RWB
## 1 RF RW LAM CAM RAM LM LCM CM RCM RM LWB LDM CDM RDM RWB
## 2 75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3 58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4
## 5 72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6 59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
## LB LCB CB RCB RB Crossing Finishing HeadingAccuracy ShortPassing
## 1 LB LCB CB RCB RB Crossing Finishing HeadingAccuracy ShortPassing
## 2 74+2 72+2 72+2 72+2 74+2 76 55 60 84
## 3 70+2 79+2 79+2 79+2 70+2 49 51 81 73
## 4 16 14 17 25
## 5 78+2 73+2 73+2 73+2 78+2 73 61 69 79
## 6 68+2 76+2 76+2 76+2 68+2 60 45 79 73
## Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 1 Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2 73 78 79 78 82 82 75
## 3 37 49 36 40 67 63 46
## 4 13 15 18 17 32 17 58
## 5 57 72 49 46 75 72 84
## 6 51 63 42 48 72 73 33
## SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 1 SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2 69 77 74 77 82 61 79 69
## 3 49 55 76 36 74 64 67 83
## 4 54 36 76 50 24 60 27 70
## 5 90 80 75 76 67 85 93 68
## 6 38 51 70 60 55 79 54 76
## LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 1 LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2 80 79 72 74 82 57 74
## 3 59 81 82 54 49 79 78
## 4 13 26 20 11 63 15 69
## 5 57 65 71 77 72 41 73
## 6 58 76 79 50 67 64 70
## Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 1 Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2 73 75 72 9 14 6
## 3 82 83 79 7 9 12
## 4 18 20 12 80 73 65
## 5 76 76 80 7 12 10
## 6 83 77 76 12 7 11
## GKPositioning GKReflexes Release Clause
## 1 GKPositioning GKReflexes Release Clause
## 2 9 10
## 3 10 15
## 4 77 85
## 5 8 15
## 6 12 13
# Discard the title row now that it has been copied into the column names.
football <- football[-1, , drop = FALSE]
head(football, n = 6)
## ID Name Age Photo
## 2 207439 L. Paredes 24 https://cdn.sofifa.org/players/4/19/207439.png
## 3 156713 A. Granqvist 33 https://cdn.sofifa.org/players/4/19/156713.png
## 4 229909 A. Lunev 26 https://cdn.sofifa.org/players/4/19/229909.png
## 5 187347 I. Smolnikov 29 https://cdn.sofifa.org/players/4/19/187347.png
## 6 153260 Hilton 40 https://cdn.sofifa.org/players/4/19/153260.png
## 7 187607 A. Dzyuba 29 https://cdn.sofifa.org/players/4/19/187607.png
## Nationality Flag Overall Potential
## 2 Argentina https://cdn.sofifa.org/flags/52.png 80 85
## 3 Sweden https://cdn.sofifa.org/flags/46.png 80 80
## 4 Russia https://cdn.sofifa.org/flags/40.png 79 81
## 5 Russia https://cdn.sofifa.org/flags/40.png 79 79
## 6 Brazil https://cdn.sofifa.org/flags/54.png 78 78
## 7 Russia https://cdn.sofifa.org/flags/40.png 78 78
## Club Club Logo Value Wage
## 2 https://cdn.sofifa.org/flags/52.png 5684 1602
## 3 https://cdn.sofifa.org/flags/46.png 6370 3591
## 4 https://cdn.sofifa.org/flags/40.png 5675 3672
## 5 https://cdn.sofifa.org/flags/40.png 6030 1448
## 6 Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png 6405 19799
## 7 https://cdn.sofifa.org/flags/40.png 5764 1105
## Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2 2122 Right 2 4 4
## 3 1797 Right 2 4 2
## 4 1217 Right 1 3 1
## 5 2038 Right 2 3 3
## 6 1807 Right 2 3 3
## 7 1810 Right 2 3 3
## Work Rate Body Type Real Face Position Jersey Number Joined
## 2 Medium/ Medium Normal No CM 5
## 3 High/ Medium Normal No LCB 4
## 4 Medium/ Medium Normal No GK 12
## 5 High/ High Lean No RB 2
## 6 Medium/ Medium Normal Yes CB 4 1-Aug-11
## 7 High/ Medium Stocky No ST 22
## Loaned From Contract Valid Until Height Weight LS ST RS LW LF CF
## 2 5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3 6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4 6'2 176lbs
## 5 5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6 2019 5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
## 7 6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2
## RF RW LAM CAM RAM LM LCM CM RCM RM LWB LDM CDM RDM RWB
## 2 75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3 58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4
## 5 72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6 59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
## 7 74+2 71+2 71+2 71+2 71+2 71+2 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2
## LB LCB CB RCB RB Crossing Finishing HeadingAccuracy ShortPassing
## 2 74+2 72+2 72+2 72+2 74+2 76 55 60 84
## 3 70+2 79+2 79+2 79+2 70+2 49 51 81 73
## 4 16 14 17 25
## 5 78+2 73+2 73+2 73+2 78+2 73 61 69 79
## 6 68+2 76+2 76+2 76+2 68+2 60 45 79 73
## 7 48+2 48+2 48+2 48+2 48+2 61 79 86 71
## Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2 73 78 79 78 82 82 75
## 3 37 49 36 40 67 63 46
## 4 13 15 18 17 32 17 58
## 5 57 72 49 46 75 72 84
## 6 51 63 42 48 72 73 33
## 7 74 71 64 60 55 77 66
## SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2 69 77 74 77 82 61 79 69
## 3 49 55 76 36 74 64 67 83
## 4 54 36 76 50 24 60 27 70
## 5 90 80 75 76 67 85 93 68
## 6 38 51 70 60 55 79 54 76
## 7 65 50 75 32 78 63 77 93
## LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2 80 79 72 74 82 57 74
## 3 59 81 82 54 49 79 78
## 4 13 26 20 11 63 15 69
## 5 57 65 71 77 72 41 73
## 6 58 76 79 50 67 64 70
## 7 68 75 30 78 73 77 70
## Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2 73 75 72 9 14 6
## 3 82 83 79 7 9 12
## 4 18 20 12 80 73 65
## 5 76 76 80 7 12 10
## 6 83 77 76 12 7 11
## 7 21 15 19 15 12 11
## GKPositioning GKReflexes Release Clause
## 2 9 10
## 3 10 15
## 4 77 85
## 5 8 15
## 6 12 13
## 7 11 8
# Number of player rows left once the title row has been removed.
nrow(football)
## [1] 18207
# Tally players per Position code. The output has one more count than it has
# labels, so the first count presumably belongs to rows with a blank Position.
table(football$Position)
##
## CAM CB CDM CF CM GK LAM LB LCB LCM LDM LF LM LS LW
## 60 958 1778 948 74 1394 2025 21 1322 648 395 243 15 1095 207 381
## LWB RAM RB RCB RCM RDM RF RM RS RW RWB ST
## 78 21 1291 662 391 248 16 1124 203 370 87 2152
Strikers are defined in the dataset as Position = “ST”.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the strikers, i.e. the rows whose Position code is "ST".
football_st <- dplyr::filter(football, Position == "ST")
head(football_st, n = 6)
## ID Name Age Photo
## 1 187607 A. Dzyuba 29 https://cdn.sofifa.org/players/4/19/187607.png
## 2 183389 G. Sio 29 https://cdn.sofifa.org/players/4/19/183389.png
## 3 245683 K. Fofana 26 https://cdn.sofifa.org/players/4/19/245683.png
## 4 190461 B. Sigur̡arson 27 https://cdn.sofifa.org/players/4/19/190461.png
## 5 225900 J. Sambenito 26 https://cdn.sofifa.org/players/4/19/225900.png
## 6 246405 B. Angulo 22 https://cdn.sofifa.org/players/4/19/246405.png
## Nationality Flag Overall Potential Club
## 1 Russia https://cdn.sofifa.org/flags/40.png 78 78
## 2 Ivory Coast https://cdn.sofifa.org/flags/108.png 77 77
## 3 Ivory Coast https://cdn.sofifa.org/flags/108.png 75 75
## 4 Iceland https://cdn.sofifa.org/flags/24.png 73 74
## 5 Paraguay https://cdn.sofifa.org/flags/58.png 71 74
## 6 Ecuador https://cdn.sofifa.org/flags/57.png 71 77
## Club Logo Value Wage Special Preferred Foot
## 1 https://cdn.sofifa.org/flags/40.png 5764 1105 1810 Right
## 2 https://cdn.sofifa.org/flags/108.png 5275 2138 1933 Left
## 3 https://cdn.sofifa.org/flags/108.png 5589 3875 1877 Right
## 4 https://cdn.sofifa.org/flags/24.png 5629 3661 1893 Right
## 5 https://cdn.sofifa.org/flags/58.png 6113 2445 1651 Right
## 6 https://cdn.sofifa.org/flags/57.png 5057 2216 1628 Right
## International Reputation Weak Foot Skill Moves Work Rate Body Type
## 1 2 3 3 High/ Medium Stocky
## 2 2 3 3 High/ Low Normal
## 3 1 3 3 Medium/ Medium Normal
## 4 1 4 3 High/ High Normal
## 5 1 3 2 High/ Medium Lean
## 6 1 4 3 High/ Low Normal
## Real Face Position Jersey Number Joined Loaned From Contract Valid Until
## 1 No ST 22
## 2 No ST 21
## 3 No ST 22
## 4 No ST 9
## 5 No ST 9
## 6 No ST 19
## Height Weight LS ST RS LW LF CF RF RW LAM CAM RAM LM
## 1 6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2 74+2 71+2 71+2 71+2 71+2 71+2
## 2 5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 74+2 74+2 74+2 74+2
## 3 6'2 179lbs 73+2 73+2 73+2 71+2 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2
## 4 6'1 190lbs 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2 70+2 70+2 70+2 71+2
## 5 6'0 190lbs 70+2 70+2 70+2 64+2 67+2 67+2 67+2 64+2 63+2 63+2 63+2 62+2
## 6 6'0 154lbs 70+2 70+2 70+2 67+2 68+2 68+2 68+2 67+2 63+2 63+2 63+2 65+2
## LCM CM RCM RM LWB LDM CDM RDM RWB LB LCB CB RCB RB
## 1 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2 48+2 48+2 48+2 48+2 48+2
## 2 67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2 50+2 46+2 46+2 46+2 50+2
## 3 67+2 67+2 67+2 71+2 59+2 57+2 57+2 57+2 59+2 57+2 52+2 52+2 52+2 57+2
## 4 64+2 64+2 64+2 71+2 59+2 55+2 55+2 55+2 59+2 56+2 53+2 53+2 53+2 56+2
## 5 55+2 55+2 55+2 62+2 43+2 41+2 41+2 41+2 43+2 41+2 38+2 38+2 38+2 41+2
## 6 54+2 54+2 54+2 65+2 47+2 39+2 39+2 39+2 47+2 44+2 36+2 36+2 36+2 44+2
## Crossing Finishing HeadingAccuracy ShortPassing Volleys Dribbling Curve
## 1 61 79 86 71 74 71 64
## 2 68 77 71 73 73 76 73
## 3 66 75 72 74 74 72 63
## 4 66 71 68 68 65 73 63
## 5 40 74 72 57 72 60 64
## 6 50 78 69 56 46 76 58
## FKAccuracy LongPassing BallControl Acceleration SprintSpeed Agility Reactions
## 1 60 55 77 66 65 50 75
## 2 69 67 76 78 85 79 71
## 3 59 58 75 59 77 63 72
## 4 48 44 73 78 79 83 74
## 5 42 42 63 79 72 61 69
## 6 58 33 71 82 79 78 73
## Balance ShotPower Jumping Stamina Strength LongShots Aggression Interceptions
## 1 32 78 63 77 93 68 75 30
## 2 73 77 70 78 74 74 77 18
## 3 60 78 69 83 77 73 67 40
## 4 76 68 78 90 85 66 73 42
## 5 64 73 69 67 72 67 49 14
## 6 64 72 69 77 69 54 28 16
## Positioning Vision Penalties Composure Marking StandingTackle SlidingTackle
## 1 78 73 77 70 21 15 19
## 2 76 73 72 72 40 18 12
## 3 72 69 74 83 23 37 46
## 4 73 64 69 76 31 39 24
## 5 75 60 67 74 15 16 16
## 6 62 45 82 51 11 18 12
## GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause
## 1 15 12 11 11 8
## 2 15 9 10 15 16
## 3 7 11 7 11 14
## 4 9 12 10 15 16
## 5 15 16 15 7 7
## 6 11 8 10 7 6
# Striker count; it should equal the "ST" tally in the Position table.
nrow(football_st)
## [1] 2152
Convert Wage and Value from character to numeric.
# Both money columns are still character vectors at this point.
str(football_st[["Wage"]])
## chr [1:2152] "1105" "2138" "3875" "3661" "2445" "2216" "4457" "3370" ...
str(football_st[["Value"]])
## chr [1:2152] "5764" "5275" "5589" "5629" "6113" "5057" "6561" "6146" ...
# Coerce them to numeric so that they can be plotted and modelled.
football_st[c("Wage", "Value")] <- lapply(
  football_st[c("Wage", "Value")],
  as.numeric
)
library(ggplot2)
library(ggpubr)
# Scatter plot of Value against Wage for strikers, with a fitted straight line
# plus the regression equation and Pearson correlation printed on the panel.
ggplot(football_st, aes(x = Wage, y = Value)) +
  geom_point(shape = 2, colour = "black") +
  geom_smooth(method = "lm") +
  stat_regline_equation(label.x = 150000, label.y = 1700) +
  stat_cor(method = "pearson", label.x = 300000, label.y = 1600) +
  labs(x = "Wage", y = "Value", title = "Wage and Value")
## `geom_smooth()` using formula 'y ~ x'
# Simple linear regression of Value on Wage for strikers. The formula uses
# bare column names together with `data =` (instead of `football_st$...`
# terms) so that coefficients are labelled by column name and predict() can
# be given `newdata`. The fitted numbers are unchanged.
value_simple <- lm(Value ~ Wage, data = football_st)
summary(value_simple)
##
## Call:
## lm(formula = Value ~ Wage, data = football_st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17073527 -633009 -209153 198333 38355242
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.175e+05 7.060e+04 -5.913 3.91e-09 ***
## Wage 2.179e+02 2.721e+00 80.068 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2838000 on 2150 degrees of freedom
## Multiple R-squared: 0.7489, Adjusted R-squared: 0.7487
## F-statistic: 6411 on 1 and 2150 DF, p-value: < 2.2e-16
# 95% confidence intervals for the intercept and the Wage slope.
confint(value_simple, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -555911.3195 -278995.9221
## Wage 212.5681 223.2422
# Standardised residuals of the simple Value-on-Wage fit, one per striker row.
value_simple_stdresiduals <- rstandard(value_simple)
head(value_simple_stdresiduals)
## 1 2 3 4 5 6
## 0.06430004 -0.01520939 -0.14850129 -0.13205208 -0.03849210 -0.02127676
Append the standardised residuals to the striker data.
# Attach the standardised residuals to the striker table as one extra column
# that carries the name of the residual vector.
football_st_comb <- data.frame(
  football_st,
  value_simple_stdresiduals,
  check.names = FALSE
)
head(football_st_comb, n = 6)
## ID Name Age Photo
## 1 187607 A. Dzyuba 29 https://cdn.sofifa.org/players/4/19/187607.png
## 2 183389 G. Sio 29 https://cdn.sofifa.org/players/4/19/183389.png
## 3 245683 K. Fofana 26 https://cdn.sofifa.org/players/4/19/245683.png
## 4 190461 B. Sigur̡arson 27 https://cdn.sofifa.org/players/4/19/190461.png
## 5 225900 J. Sambenito 26 https://cdn.sofifa.org/players/4/19/225900.png
## 6 246405 B. Angulo 22 https://cdn.sofifa.org/players/4/19/246405.png
## Nationality Flag Overall Potential Club
## 1 Russia https://cdn.sofifa.org/flags/40.png 78 78
## 2 Ivory Coast https://cdn.sofifa.org/flags/108.png 77 77
## 3 Ivory Coast https://cdn.sofifa.org/flags/108.png 75 75
## 4 Iceland https://cdn.sofifa.org/flags/24.png 73 74
## 5 Paraguay https://cdn.sofifa.org/flags/58.png 71 74
## 6 Ecuador https://cdn.sofifa.org/flags/57.png 71 77
## Club Logo Value Wage Special Preferred Foot
## 1 https://cdn.sofifa.org/flags/40.png 5764 1105 1810 Right
## 2 https://cdn.sofifa.org/flags/108.png 5275 2138 1933 Left
## 3 https://cdn.sofifa.org/flags/108.png 5589 3875 1877 Right
## 4 https://cdn.sofifa.org/flags/24.png 5629 3661 1893 Right
## 5 https://cdn.sofifa.org/flags/58.png 6113 2445 1651 Right
## 6 https://cdn.sofifa.org/flags/57.png 5057 2216 1628 Right
## International Reputation Weak Foot Skill Moves Work Rate Body Type
## 1 2 3 3 High/ Medium Stocky
## 2 2 3 3 High/ Low Normal
## 3 1 3 3 Medium/ Medium Normal
## 4 1 4 3 High/ High Normal
## 5 1 3 2 High/ Medium Lean
## 6 1 4 3 High/ Low Normal
## Real Face Position Jersey Number Joined Loaned From Contract Valid Until
## 1 No ST 22
## 2 No ST 21
## 3 No ST 22
## 4 No ST 9
## 5 No ST 9
## 6 No ST 19
## Height Weight LS ST RS LW LF CF RF RW LAM CAM RAM LM
## 1 6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2 74+2 71+2 71+2 71+2 71+2 71+2
## 2 5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 74+2 74+2 74+2 74+2
## 3 6'2 179lbs 73+2 73+2 73+2 71+2 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2
## 4 6'1 190lbs 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2 70+2 70+2 70+2 71+2
## 5 6'0 190lbs 70+2 70+2 70+2 64+2 67+2 67+2 67+2 64+2 63+2 63+2 63+2 62+2
## 6 6'0 154lbs 70+2 70+2 70+2 67+2 68+2 68+2 68+2 67+2 63+2 63+2 63+2 65+2
## LCM CM RCM RM LWB LDM CDM RDM RWB LB LCB CB RCB RB
## 1 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2 48+2 48+2 48+2 48+2 48+2
## 2 67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2 50+2 46+2 46+2 46+2 50+2
## 3 67+2 67+2 67+2 71+2 59+2 57+2 57+2 57+2 59+2 57+2 52+2 52+2 52+2 57+2
## 4 64+2 64+2 64+2 71+2 59+2 55+2 55+2 55+2 59+2 56+2 53+2 53+2 53+2 56+2
## 5 55+2 55+2 55+2 62+2 43+2 41+2 41+2 41+2 43+2 41+2 38+2 38+2 38+2 41+2
## 6 54+2 54+2 54+2 65+2 47+2 39+2 39+2 39+2 47+2 44+2 36+2 36+2 36+2 44+2
## Crossing Finishing HeadingAccuracy ShortPassing Volleys Dribbling Curve
## 1 61 79 86 71 74 71 64
## 2 68 77 71 73 73 76 73
## 3 66 75 72 74 74 72 63
## 4 66 71 68 68 65 73 63
## 5 40 74 72 57 72 60 64
## 6 50 78 69 56 46 76 58
## FKAccuracy LongPassing BallControl Acceleration SprintSpeed Agility Reactions
## 1 60 55 77 66 65 50 75
## 2 69 67 76 78 85 79 71
## 3 59 58 75 59 77 63 72
## 4 48 44 73 78 79 83 74
## 5 42 42 63 79 72 61 69
## 6 58 33 71 82 79 78 73
## Balance ShotPower Jumping Stamina Strength LongShots Aggression Interceptions
## 1 32 78 63 77 93 68 75 30
## 2 73 77 70 78 74 74 77 18
## 3 60 78 69 83 77 73 67 40
## 4 76 68 78 90 85 66 73 42
## 5 64 73 69 67 72 67 49 14
## 6 64 72 69 77 69 54 28 16
## Positioning Vision Penalties Composure Marking StandingTackle SlidingTackle
## 1 78 73 77 70 21 15 19
## 2 76 73 72 72 40 18 12
## 3 72 69 74 83 23 37 46
## 4 73 64 69 76 31 39 24
## 5 75 60 67 74 15 16 16
## 6 62 45 82 51 11 18 12
## GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause
## 1 15 12 11 11 8
## 2 15 9 10 15 16
## 3 7 11 7 11 14
## 4 9 12 10 15 16
## 5 15 16 15 7 7
## 6 11 8 10 7 6
## value_simple_stdresiduals
## 1 0.06430004
## 2 -0.01520939
## 3 -0.14850129
## 4 -0.13205208
## 5 -0.03849210
## 6 -0.02127676
Plot residuals.
# Standardised residuals of the simple model plotted against Value.
# The aesthetics name the columns directly: ggplot() already receives
# `football_st_comb`, and writing `data$column` inside aes() makes ggplot2
# emit a "Use of `...` is discouraged" warning for every such mapping.
ggplot(football_st_comb) +
  aes(x = Value, y = value_simple_stdresiduals) +
  geom_point() +
  xlab("Value") + ylab("Standard Residuals") +
  ggtitle("Wage and Value Prediction, Residuals")
# Histogram of striker Value using ggplot2's default binning.
ggplot(football_st, aes(x = Value)) +
  geom_histogram() +
  labs(y = "Count", title = "Distribution of Value")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Use the Shapiro-Wilk test to check whether Value is normally distributed.
H0: the data follow a normal distribution.
H1: the data do not follow a normal distribution.
# Formal normality check on the striker Value column.
shapiro.test(football_st$Value)
##
## Shapiro-Wilk normality test
##
## data: football_st$Value
## W = 0.37447, p-value < 2.2e-16
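The Durbin-Watson test checks the residuals for autocorrelation. It is meant for ordered (e.g. time-series) data, so it may not be very applicable here; it is shown just for illustration.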
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Durbin-Watson test for lag-1 autocorrelation in the residuals of the simple
# model. NOTE(review): the result depends on row order, which here is simply
# the order of the rows in the input file.
durbinWatsonTest(value_simple)
## lag Autocorrelation D-W Statistic p-value
## 1 0.2167301 1.566536 0
## Alternative hypothesis: rho != 0
Subset data for simplicity.
# Narrow the striker table to the six candidate predictors and the response.
football_st_2 <- football_st[
  c("Age", "Balance", "ShotPower", "Aggression", "Positioning", "Composure",
    "Wage")
]
head(football_st_2, n = 6)
## Age Balance ShotPower Aggression Positioning Composure Wage
## 1 29 32 78 75 78 70 1105
## 2 29 73 77 77 76 72 2138
## 3 26 60 78 67 72 83 3875
## 4 27 76 68 73 73 76 3661
## 5 26 64 73 49 75 74 2445
## 6 22 64 72 28 62 51 2216
Convert to numeric.
library(dplyr)
# Coerce every column that is still character to numeric. `across()` with
# `where()` is dplyr's current replacement for the superseded `mutate_if()`.
football_st_2 <- football_st_2 %>%
  mutate(across(where(is.character), as.numeric))
str(football_st_2)
## 'data.frame': 2152 obs. of 7 variables:
## $ Age : num 29 29 26 27 26 22 22 28 31 28 ...
## $ Balance : num 32 73 60 76 64 64 65 75 69 56 ...
## $ ShotPower : num 78 77 78 68 73 72 66 75 69 71 ...
## $ Aggression : num 75 77 67 73 49 28 30 36 68 59 ...
## $ Positioning: num 78 76 72 73 75 62 76 68 69 72 ...
## $ Composure : num 70 72 83 76 74 51 62 56 80 56 ...
## $ Wage : num 1105 2138 3875 3661 2445 ...
A multiple regression model showing unstandardised estimates.
The predictors included in the model are: Age, Balance, ShotPower, Aggression, Positioning, and Composure.
names(football_st_2)
## [1] "Age" "Balance" "ShotPower" "Aggression" "Positioning"
## [6] "Composure" "Wage"
# Multiple linear regression of striker Wage on the six selected attributes;
# summary() prints the unstandardised coefficient table and the fit statistics.
wage_model_st <- lm(Wage ~ Age + Balance + ShotPower +
Aggression + Positioning + Composure,
data = football_st_2)
summary(wage_model_st)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression +
## Positioning + Composure, data = football_st_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31822 -8232 -2313 4754 350592
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -77073.40 4064.61 -18.962 < 2e-16 ***
## Age -1014.25 110.94 -9.143 < 2e-16 ***
## Balance 120.41 35.90 3.354 0.00081 ***
## ShotPower 498.07 74.43 6.692 2.81e-11 ***
## Aggression 15.96 32.29 0.494 0.62129
## Positioning 741.71 82.42 8.999 < 2e-16 ***
## Composure 424.72 71.66 5.927 3.58e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18840 on 2145 degrees of freedom
## Multiple R-squared: 0.2997, Adjusted R-squared: 0.2978
## F-statistic: 153 on 6 and 2145 DF, p-value: < 2.2e-16
# Point estimates of the unstandardised coefficients.
coef(wage_model_st)
## (Intercept) Age Balance ShotPower Aggression Positioning
## -77073.39877 -1014.24567 120.40620 498.06517 15.95657 741.70804
## Composure
## 424.72405
# 95% confidence intervals for the same unstandardised coefficients.
confint(wage_model_st, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -85044.38590 -69102.41165
## Age -1231.79758 -796.69375
## Balance 50.00615 190.80626
## ShotPower 352.09956 644.03079
## Aggression -47.37581 79.28895
## Positioning 580.07796 903.33813
## Composure 284.19780 565.25031
A multiple regression model showing standardised estimates.
The predictors included in the model are: Age, Balance, ShotPower, Aggression, Positioning, and Composure.
library(lm.beta)
# Wrap the fitted model with lm.beta so that coef() reports standardised
# (beta) coefficients; the intercept is reported as 0.
wage_model_st_std <- lm.beta::lm.beta(wage_model_st)
coef(wage_model_st_std)
## (Intercept) Age Balance ShotPower Aggression Positioning
## 0.00000000 -0.21358305 0.06178231 0.20182976 0.01126852 0.30316025
## Composure
## 0.19146721
# Confidence intervals on the standardised scale.
# Calling confint() directly on the lm.beta object centres each interval on
# the standardised coefficient but keeps the width of the unstandardised fit
# (about +/- 217.6 for Age, for example), so that result is not a valid
# interval. Instead, rescale the unstandardised limits by
# sd(predictor) / sd(response), the same factor that turns a raw slope into a
# standardised one. The intercept is left out because it has no meaningful
# standardised counterpart.
local({
  model_frame <- model.frame(wage_model_st)
  predictors <- names(coef(wage_model_st))[-1]
  sd_ratio <- vapply(model_frame[predictors], sd, numeric(1)) /
    sd(model.response(model_frame))
  # One row per predictor; each row is multiplied by that predictor's ratio.
  confint(wage_model_st, parm = predictors, level = 0.95) * sd_ratio
})
# Standardised residuals of the multiple regression (despite the generic
# variable name, rstandard() returns standardised, not raw, residuals).
wage_model_st_residuals <- rstandard(wage_model_st)
head(wage_model_st_residuals)
## 1 2 3 4 5 6
## -1.2711799 -1.4183035 -1.5151160 -1.1956035 -1.3820667 -0.5348701
# Append them to the modelling table so they can be plotted per variable.
football_st_comb_2 <- cbind(football_st_2, wage_model_st_residuals)
head(football_st_comb_2)
## Age Balance ShotPower Aggression Positioning Composure Wage
## 1 29 32 78 75 78 70 1105
## 2 29 73 77 77 76 72 2138
## 3 26 60 78 67 72 83 3875
## 4 27 76 68 73 73 76 3661
## 5 26 64 73 49 75 74 2445
## 6 22 64 72 28 62 51 2216
## wage_model_st_residuals
## 1 -1.2711799
## 2 -1.4183035
## 3 -1.5151160
## 4 -1.1956035
## 5 -1.3820667
## 6 -0.5348701
# Scatter plot of standardised residuals against one column of `data`.
#   data      data frame holding both the x column and the residual column
#   x_col     name of the column to put on the x axis (character scalar)
#   x_label   x-axis label; defaults to the column name
#   subject   text placed after "Standardised Residual Plot, " in the title;
#             defaults to the x-axis label
#   resid_col name of the column holding the standardised residuals
# Returns the ggplot object, which prints when the call is made at top level.
plot_std_residuals <- function(data, x_col, x_label = x_col,
                               subject = x_label,
                               resid_col = "wage_model_st_residuals") {
  ggplot(data) +
    aes(x = .data[[x_col]], y = .data[[resid_col]]) +
    geom_point() +
    xlab(x_label) +
    ylab("Standardised Residuals") +
    ggtitle(paste0("Standardised Residual Plot, ", subject))
}

# One plot against the response and one per plotted predictor. This replaces
# six copy-pasted ggplot calls and corrects the misspelt "Standarised" and
# "Positionng" labels.
plot_std_residuals(football_st_comb_2, "Wage", subject = "Wage Prediction")
plot_std_residuals(football_st_comb_2, "Age")
plot_std_residuals(football_st_comb_2, "ShotPower", x_label = "Shot Power")
plot_std_residuals(football_st_comb_2, "Aggression")
plot_std_residuals(football_st_comb_2, "Positioning")
plot_std_residuals(football_st_comb_2, "Composure")
library(ggplot2)
# Histogram of striker wages on the raw scale (default binning).
ggplot(football_st_2, aes(x = Wage)) +
  geom_histogram() +
  labs(y = "Count", title = "Distribution of wage (strikers)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# The same histogram drawn with a log10-scaled x axis.
ggplot(football_st_2, aes(x = Wage)) +
  geom_histogram() +
  scale_x_log10() +
  labs(y = "Count", title = "Distribution of log(wage) (strikers)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Use the Shapiro-Wilk test to check whether Wage is normally distributed.
H0: the data follow a normal distribution.
H1: the data do not follow a normal distribution.
# Formal normality check on the striker Wage column.
shapiro.test(football_st_2$Wage)
##
## Shapiro-Wilk normality test
##
## data: football_st_2$Wage
## W = 0.39056, p-value < 2.2e-16
The variance inflation factor (VIF) measures how much the variance of an estimated regression coefficient increases because the predictors are correlated.
In other words, no pair of predictors should be strongly correlated with each other.
If no predictors are correlated, the VIFs will all be 1.
Rule of thumb: if VIF > 10, multicollinearity is high.
library(car)
# Variance inflation factor for each predictor of the wage model.
vif(wage_model_st)
## Age Balance ShotPower Aggression Positioning Composure
## 1.671663 1.039327 2.786601 1.593244 3.476150 3.196433
The Durbin-Watson statistic lies in the range 0 <= D-W <= 4.
Rule of thumb:
D-W = 2 means that there is no autocorrelation.
D-W < 2 means there is positive autocorrelation.
D-W > 2 means there is negative autocorrelation.
The test is meant for time-series (ordered) data, so it is not very applicable here.
# Durbin-Watson test for lag-1 autocorrelation in the wage model residuals.
durbinWatsonTest(wage_model_st)
## lag Autocorrelation D-W Statistic p-value
## 1 0.5038085 0.9915208 0
## Alternative hypothesis: rho != 0
We can also automatically evaluate the model.
library(gvlma)
# Global validation of the linear model assumptions: reports a global statistic
# plus skewness, kurtosis, link-function and heteroscedasticity checks.
gvlma(wage_model_st)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression +
## Positioning + Composure, data = football_st_2)
##
## Coefficients:
## (Intercept) Age Balance ShotPower Aggression Positioning
## -77073.40 -1014.25 120.41 498.07 15.96 741.71
## Composure
## 424.72
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = wage_model_st)
##
## Value p-value Decision
## Global Stat 1307104.5 0 Assumptions NOT satisfied!
## Skewness 26054.7 0 Assumptions NOT satisfied!
## Kurtosis 1280082.5 0 Assumptions NOT satisfied!
## Link Function 791.9 0 Assumptions NOT satisfied!
## Heteroscedasticity 175.5 0 Assumptions NOT satisfied!
Perform a Breusch-Pagan Test to test for heteroskedasticity/homoskedasticity.
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(wage_model_st)
##
## studentized Breusch-Pagan test
##
## data: wage_model_st
## BP = 91.188, df = 6, p-value < 2.2e-16
plot(wage_model_st, 1)
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.0.5
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
ols_test_breusch_pagan(wage_model_st)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## --------------------------------
## Response : Wage
## Variables: fitted values of Wage
##
## Test Summary
## ----------------------------
## DF = 1
## Chi2 = 5352.5071
## Prob > Chi2 = 0.0000
Run the test separately for each predictor, and simultaneously for all of them.
ols_test_breusch_pagan(wage_model_st, rhs = TRUE,
multiple = TRUE)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## -----------------------------------------------------------------
## Response : Wage
## Variables: Age Balance ShotPower Aggression Positioning Composure
##
## Test Summary (Unadjusted p values)
## --------------------------------------------------
## Variable chi2 df p
## --------------------------------------------------
## Age 487.2866 1 5.549066e-108
## Balance 147.9854 1 4.778931e-34
## ShotPower 3632.0162 1 0.000000e+00
## Aggression 637.7948 1 1.008165e-140
## Positioning 4068.5226 1 0.000000e+00
## Composure 4081.2646 1 0.000000e+00
## --------------------------------------------------
## simultaneous 5538.8585 6 0.000000e+00
## --------------------------------------------------
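Stepwise regression is a modification of ordinary regression in which predictors are added or removed one at a time; here, step() searches in both directions and compares the candidate models by AIC.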
library(stats)
wage_model_st_step <- step(wage_model_st,
direction = "both")
## Start: AIC=42374.94
## Wage ~ Age + Balance + ShotPower + Aggression + Positioning +
## Composure
##
## Df Sum of Sq RSS AIC
## - Aggression 1 8.6672e+07 7.6162e+11 42373
## <none> 7.6154e+11 42375
## - Balance 1 3.9939e+09 7.6553e+11 42384
## - Composure 1 1.2472e+10 7.7401e+11 42408
## - ShotPower 1 1.5897e+10 7.7743e+11 42417
## - Positioning 1 2.8752e+10 7.9029e+11 42453
## - Age 1 2.9676e+10 7.9121e+11 42455
##
## Step: AIC=42373.18
## Wage ~ Age + Balance + ShotPower + Positioning + Composure
##
## Df Sum of Sq RSS AIC
## <none> 7.6162e+11 42373
## + Aggression 1 8.6672e+07 7.6154e+11 42375
## - Balance 1 3.9197e+09 7.6554e+11 42382
## - Composure 1 1.2939e+10 7.7456e+11 42407
## - ShotPower 1 1.7279e+10 7.7890e+11 42419
## - Positioning 1 2.8770e+10 7.9039e+11 42451
## - Age 1 3.0373e+10 7.9200e+11 42455
summary(wage_model_st_step)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Positioning +
## Composure, data = football_st_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31793 -8228 -2326 4830 350282
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -77250.10 4048.13 -19.083 < 2e-16 ***
## Age -1002.58 108.38 -9.251 < 2e-16 ***
## Balance 118.78 35.74 3.323 0.000904 ***
## ShotPower 506.25 72.55 6.978 3.98e-12 ***
## Positioning 741.93 82.40 9.004 < 2e-16 ***
## Composure 429.17 71.08 6.038 1.83e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18840 on 2146 degrees of freedom
## Multiple R-squared: 0.2997, Adjusted R-squared: 0.298
## F-statistic: 183.6 on 5 and 2146 DF, p-value: < 2.2e-16
gvlma(wage_model_st_step)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Positioning +
## Composure, data = football_st_2)
##
## Coefficients:
## (Intercept) Age Balance ShotPower Positioning Composure
## -77250.1 -1002.6 118.8 506.2 741.9 429.2
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = wage_model_st_step)
##
## Value p-value Decision
## Global Stat 1300530.2 0 Assumptions NOT satisfied!
## Skewness 25983.6 0 Assumptions NOT satisfied!
## Kurtosis 1273577.0 0 Assumptions NOT satisfied!
## Link Function 794.0 0 Assumptions NOT satisfied!
## Heteroscedasticity 175.5 0 Assumptions NOT satisfied!
Now, we will use the data mining approach.
Split the data into training and validation sets.
Set the seed using our favourite number :-)
set.seed(666)
Create the indices for the split. This samples the row indices to split the data into training and validation sets.
# Draw 60% of the row indices at random for the training set; the remaining
# indices form the validation set.
# seq_len() replaces 1:nrow() so that an empty data frame gives an empty
# index vector instead of c(1, 0), and floor() makes the truncation of the
# fractional sample size explicit (the rows drawn are unchanged).
train_index <- sample(seq_len(nrow(football_st_2)),
                      floor(0.6 * nrow(football_st_2)))
valid_index <- setdiff(seq_len(nrow(football_st_2)), train_index)
Using the indices, create the training and validation sets. This is similar in principle to splitting a data frame by row.
train_df_st <- football_st_2[train_index, ]
valid_df_st <- football_st_2[valid_index, ]
It is a good habit to check after splitting.
nrow(train_df_st)
## [1] 1291
nrow(valid_df_st)
## [1] 861
Training the model on the training set.
# Fit the six-predictor wage model on the training rows only; the validation
# rows are excluded from estimation.
wage_model_st_2 <- lm(Wage ~ Age + Balance + ShotPower +
Aggression + Positioning + Composure,
data = train_df_st)
# Print the coefficient table and the overall fit statistics.
summary(wage_model_st_2)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression +
## Positioning + Composure, data = train_df_st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32654 -8533 -2462 5056 346913
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -82121.16 5937.75 -13.830 < 2e-16 ***
## Age -948.45 166.15 -5.708 1.42e-08 ***
## Balance 114.04 53.15 2.146 0.0321 *
## ShotPower 489.51 110.63 4.425 1.05e-05 ***
## Aggression -23.76 47.29 -0.502 0.6154
## Positioning 730.41 121.49 6.012 2.39e-09 ***
## Composure 544.62 104.74 5.200 2.32e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21280 on 1284 degrees of freedom
## Multiple R-squared: 0.2704, Adjusted R-squared: 0.267
## F-statistic: 79.3 on 6 and 1284 DF, p-value: < 2.2e-16
Predict the outcome (i.e. wage) of the validation set using the model from the training set.
library(forecast)
## Warning: package 'forecast' was built under R version 4.0.5
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'forecast'
## The following object is masked from 'package:ggpubr':
##
## gghistogram
# Score the training-set model on both partitions: in-sample predictions for
# the training rows, out-of-sample predictions for the validation rows.
wage_model_st_2_pred_train <- predict(wage_model_st_2, newdata = train_df_st)
wage_model_st_2_pred <- predict(wage_model_st_2, newdata = valid_df_st)
Compare the errors between the training and validation sets.
accuracy(wage_model_st_2_pred_train, train_df_st$Wage)
## ME RMSE MAE MPE MAPE
## Test set 9.854906e-11 21227.13 9907.101 -32.18323 133.6435
accuracy(wage_model_st_2_pred, valid_df_st$Wage)
## ME RMSE MAE MPE MAPE
## Test set 18.64833 14551.57 9388.476 -17.37455 128.1185
# Width of the observed wage range (largest minus smallest wage).
diff(range(football_st_2$Wage))
## [1] 406504
sd(football_st_2$Wage)
## [1] 22484.99
Subset to include categorical variable: preferred foot
football_st_3 <- football_st[, c("Preferred Foot", "Positioning", "Composure", "Wage")]
head(football_st_3)
## Preferred Foot Positioning Composure Wage
## 1 Right 78 70 1105
## 2 Left 76 72 2138
## 3 Right 72 83 3875
## 4 Right 73 76 3661
## 5 Right 75 74 2445
## 6 Right 62 51 2216
# Rename the first column ("Preferred Foot") to remove the space --
# presumably so it can be used in the model formula without backticks.
names(football_st_3)[1] <- "Preferred_Foot"
# Coerce both rating columns to numeric before modelling.
football_st_3$Positioning <- as.numeric(football_st_3$Positioning)
football_st_3$Composure <- as.numeric(football_st_3$Composure)
# Regress Wage on the two ratings plus preferred foot; factor() makes lm()
# treat the foot as categorical (one dummy coefficient against a baseline).
wage_model_st_cat <- lm(Wage ~ factor(Preferred_Foot) + Positioning + Composure, data = football_st_3)
# Print the coefficient table and the overall fit statistics.
summary(wage_model_st_cat)
##
## Call:
## lm(formula = Wage ~ factor(Preferred_Foot) + Positioning + Composure,
## data = football_st_3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31805 -8181 -2270 4528 354971
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -65474.98 3209.87 -20.398 < 2e-16 ***
## factor(Preferred_Foot)Right -1087.48 1221.25 -0.890 0.373
## Positioning 816.64 75.34 10.840 < 2e-16 ***
## Composure 438.10 68.36 6.409 1.8e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19400 on 2148 degrees of freedom
## Multiple R-squared: 0.2564, Adjusted R-squared: 0.2554
## F-statistic: 246.9 on 3 and 2148 DF, p-value: < 2.2e-16
confint(wage_model_st_cat, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -71769.7525 -59180.1997
## factor(Preferred_Foot)Right -3482.4352 1307.4754
## Positioning 668.9041 964.3812
## Composure 304.0428 572.1551
wage_model_st_cat_stdresiduals <- rstandard(wage_model_st_cat)
head(wage_model_st_cat_stdresiduals)
## 1 2 3 4 5 6
## -1.3769868 -1.3424571 -1.2770311 -1.1703986 -1.2718790 -0.2163923
football_st_3_cat <- cbind(football_st_3, wage_model_st_cat_stdresiduals)
head(football_st_3_cat)
## Preferred_Foot Positioning Composure Wage wage_model_st_cat_stdresiduals
## 1 Right 78 70 1105 -1.3769868
## 2 Left 76 72 2138 -1.3424571
## 3 Right 72 83 3875 -1.2770311
## 4 Right 73 76 3661 -1.1703986
## 5 Right 75 74 2445 -1.2718790
## 6 Right 62 51 2216 -0.2163923
# Standardised residuals of the categorical model against the observed wage.
ggplot(football_st_3_cat,
       aes(x = Wage, y = wage_model_st_cat_stdresiduals)) +
  geom_point() +
  labs(x = "Wage",
       y = "Standarised Residuals",
       title = "Standarised Residual Plot, Wage")
Positioning
# Standardised residuals of the categorical model against Positioning.
ggplot(football_st_3_cat,
       aes(x = Positioning, y = wage_model_st_cat_stdresiduals)) +
  geom_point() +
  labs(x = "Positioning",
       y = "Standarised Residuals",
       title = "Standarised Residual Plot, Positioning")
Composure
# Standardised residuals of the categorical model against Composure.
ggplot(football_st_3_cat,
       aes(x = Composure, y = wage_model_st_cat_stdresiduals)) +
  geom_point() +
  labs(x = "Composure",
       y = "Standarised Residuals",
       title = "Standarised Residual Plot, Composure")
# Standardised residuals of the categorical model, grouped by preferred foot.
ggplot(football_st_3_cat,
       aes(x = Preferred_Foot, y = wage_model_st_cat_stdresiduals)) +
  geom_point() +
  labs(x = "Preferred Foot",
       y = "Standarised Residuals",
       title = "Standarised Residual Plot, Preferred Foot")
# Histogram of the wage column in the categorical-model data frame.
ggplot(football_st_3_cat, aes(x = Wage)) +
  geom_histogram() +
  labs(y = "Count", title = "Distribution of Wage")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Using the Shapiro-Wilk test.
H-0: normal distribution.
H-alt: distribution is different from a normal distribution.
shapiro.test(football_st_3_cat$Wage)
##
## Shapiro-Wilk normality test
##
## data: football_st_3_cat$Wage
## W = 0.39056, p-value < 2.2e-16
Multicollinearity
vif(wage_model_st_cat)
## factor(Preferred_Foot) Positioning Composure
## 1.002720 2.738872 2.743181
Homoscedasticity.
ols_test_breusch_pagan(wage_model_st_cat)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## --------------------------------
## Response : Wage
## Variables: fitted values of Wage
##
## Test Summary
## ----------------------------
## DF = 1
## Chi2 = 4754.5635
## Prob > Chi2 = 0.0000
gvlma(wage_model_st_cat)
##
## Call:
## lm(formula = Wage ~ factor(Preferred_Foot) + Positioning + Composure,
## data = football_st_3)
##
## Coefficients:
## (Intercept) factor(Preferred_Foot)Right
## -65475.0 -1087.5
## Positioning Composure
## 816.6 438.1
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = wage_model_st_cat)
##
## Value p-value Decision
## Global Stat 1208344.4 0 Assumptions NOT satisfied!
## Skewness 25297.7 0 Assumptions NOT satisfied!
## Kurtosis 1182302.6 0 Assumptions NOT satisfied!
## Link Function 600.3 0 Assumptions NOT satisfied!
## Heteroscedasticity 143.8 0 Assumptions NOT satisfied!
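Sometimes, a relationship may not be linear. In this case, we can specify a non-linear relationship in the model.
We start with the traditional statistics approach and evaluate.
The non-linear relationship is expressed in the model specification: here it is an interaction, where Positioning * Composure adds the product term Positioning:Composure alongside the two main effects.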
names(football_st_2)
## [1] "Age" "Balance" "ShotPower" "Aggression" "Positioning"
## [6] "Composure" "Wage"
# Fit the wage model with an interaction between Positioning and Composure.
# In an R formula, `Positioning * Composure` expands to both main effects
# plus their product term (reported as Positioning:Composure).
wage_model_st_nl <- lm(Wage ~ Age + Balance + ShotPower +
Aggression + Positioning * Composure,
data = football_st_2)
# Print the coefficient table and the overall fit statistics.
summary(wage_model_st_nl)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression +
## Positioning * Composure, data = football_st_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58380 -5245 80 4644 267683
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 297675.783 13442.584 22.144 <2e-16 ***
## Age -789.963 94.502 -8.359 <2e-16 ***
## Balance 57.694 30.555 1.888 0.0591 .
## ShotPower 642.408 63.389 10.134 <2e-16 ***
## Aggression 19.805 27.418 0.722 0.4702
## Positioning -5016.022 211.523 -23.714 <2e-16 ***
## Composure -6150.054 235.919 -26.069 <2e-16 ***
## Positioning:Composure 96.301 3.339 28.844 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16000 on 2144 degrees of freedom
## Multiple R-squared: 0.4955, Adjusted R-squared: 0.4939
## F-statistic: 300.8 on 7 and 2144 DF, p-value: < 2.2e-16
vif(wage_model_st_nl)
## Age Balance ShotPower
## 1.683057 1.044616 2.804077
## Aggression Positioning Composure
## 1.593281 31.765761 48.069231
## Positioning:Composure
## 127.119996
durbinWatsonTest(wage_model_st_nl)
## lag Autocorrelation D-W Statistic p-value
## 1 0.2531554 1.491911 0
## Alternative hypothesis: rho != 0
Perform a stepwise regression with a non-linear relationship and evaluate
wage_model_st_nl_step <- step(wage_model_st_nl,
direction = "both")
## Start: AIC=41671.29
## Wage ~ Age + Balance + ShotPower + Aggression + Positioning *
## Composure
##
## Df Sum of Sq RSS AIC
## - Aggression 1 1.3352e+08 5.4877e+11 41670
## <none> 5.4863e+11 41671
## - Balance 1 9.1234e+08 5.4955e+11 41673
## - Age 1 1.7881e+10 5.6652e+11 41738
## - ShotPower 1 2.6282e+10 5.7492e+11 41770
## - Positioning:Composure 1 2.1290e+11 7.6154e+11 42375
##
## Step: AIC=41669.81
## Wage ~ Age + Balance + ShotPower + Positioning + Composure +
## Positioning:Composure
##
## Df Sum of Sq RSS AIC
## <none> 5.4877e+11 41670
## - Balance 1 8.5698e+08 5.4963e+11 41671
## + Aggression 1 1.3352e+08 5.4863e+11 41671
## - Age 1 1.8041e+10 5.6681e+11 41737
## - ShotPower 1 2.8516e+10 5.7728e+11 41777
## - Positioning:Composure 1 2.1286e+11 7.6162e+11 42373
summary(wage_model_st_nl_step)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Positioning +
## Composure + Positioning:Composure, data = football_st_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58507 -5205 67 4579 267488
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 297410.796 13436.079 22.135 <2e-16 ***
## Age -775.517 92.352 -8.397 <2e-16 ***
## Balance 55.684 30.424 1.830 0.0674 .
## ShotPower 652.547 61.808 10.558 <2e-16 ***
## Positioning -5015.048 211.495 -23.712 <2e-16 ***
## Composure -6143.738 235.730 -26.063 <2e-16 ***
## Positioning:Composure 96.289 3.338 28.844 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15990 on 2145 degrees of freedom
## Multiple R-squared: 0.4954, Adjusted R-squared: 0.494
## F-statistic: 351 on 6 and 2145 DF, p-value: < 2.2e-16
vif(wage_model_st_nl_step)
## Age Balance ShotPower
## 1.607679 1.035950 2.666586
## Positioning Composure Positioning:Composure
## 31.764471 48.003192 127.116986
durbinWatsonTest(wage_model_st_nl_step)
## lag Autocorrelation D-W Statistic p-value
## 1 0.2522843 1.493672 0
## Alternative hypothesis: rho != 0
A data mining approach with the non-linear relationship.
wage_model_st_nl_2 <- lm(Wage ~ Age + Balance + ShotPower + Aggression +
Positioning * Composure,
data = train_df_st)
summary(wage_model_st_nl_2)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression +
## Positioning * Composure, data = train_df_st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69712 -5516 431 5121 257569
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 373590.851 19296.451 19.361 < 2e-16 ***
## Age -702.769 137.707 -5.103 3.84e-07 ***
## Balance 39.904 44.039 0.906 0.365
## ShotPower 691.250 91.817 7.529 9.63e-14 ***
## Aggression -24.766 39.088 -0.634 0.526
## Positioning -6254.863 303.169 -20.632 < 2e-16 ***
## Composure -7433.528 337.988 -21.993 < 2e-16 ***
## Positioning:Composure 116.410 4.767 24.419 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17590 on 1283 degrees of freedom
## Multiple R-squared: 0.5019, Adjusted R-squared: 0.4992
## F-statistic: 184.7 on 7 and 1283 DF, p-value: < 2.2e-16
wage_model_st_nl_2_pred <- predict(wage_model_st_nl_2,
valid_df_st)
accuracy(wage_model_st_nl_2_pred, valid_df_st$Wage)
## ME RMSE MAE MPE MAPE
## Test set -547.7242 13726.06 8807.617 -31.1167 112.4235
# Spread of striker wages, used as a yardstick for the validation RMSE.
# Computed on football_st_2, the data frame the models are fitted on, to
# match the other sd() benchmarks in this analysis.
sd(football_st_2$Wage)
## [1] 22484.99
gvlma(wage_model_st_nl_2)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression +
## Positioning * Composure, data = train_df_st)
##
## Coefficients:
## (Intercept) Age Balance
## 373590.85 -702.77 39.90
## ShotPower Aggression Positioning
## 691.25 -24.77 -6254.86
## Composure Positioning:Composure
## -7433.53 116.41
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = wage_model_st_nl_2)
##
## Value p-value Decision
## Global Stat 337857.89 0.000e+00 Assumptions NOT satisfied!
## Skewness 7895.68 0.000e+00 Assumptions NOT satisfied!
## Kurtosis 329396.72 0.000e+00 Assumptions NOT satisfied!
## Link Function 507.03 0.000e+00 Assumptions NOT satisfied!
## Heteroscedasticity 58.47 2.065e-14 Assumptions NOT satisfied!
A data mining approach using a stepwise regression and non-linear relationship.
wage_model_st_nl_2_step <- step(wage_model_st_nl_2,
direction = "both")
## Start: AIC=25247.78
## Wage ~ Age + Balance + ShotPower + Aggression + Positioning *
## Composure
##
## Df Sum of Sq RSS AIC
## - Aggression 1 1.2426e+08 3.9726e+11 25246
## - Balance 1 2.5413e+08 3.9739e+11 25247
## <none> 3.9713e+11 25248
## - Age 1 8.0617e+09 4.0520e+11 25272
## - ShotPower 1 1.7544e+10 4.1468e+11 25302
## - Positioning:Composure 1 1.8458e+11 5.8171e+11 25739
##
## Step: AIC=25246.18
## Wage ~ Age + Balance + ShotPower + Positioning + Composure +
## Positioning:Composure
##
## Df Sum of Sq RSS AIC
## - Balance 1 2.9864e+08 3.9756e+11 25245
## <none> 3.9726e+11 25246
## + Aggression 1 1.2426e+08 3.9713e+11 25248
## - Age 1 8.9937e+09 4.0625e+11 25273
## - ShotPower 1 1.7797e+10 4.1506e+11 25301
## - Positioning:Composure 1 1.8457e+11 5.8183e+11 25737
##
## Step: AIC=25245.15
## Wage ~ Age + ShotPower + Positioning + Composure + Positioning:Composure
##
## Df Sum of Sq RSS AIC
## <none> 3.9756e+11 25245
## + Balance 1 2.9864e+08 3.9726e+11 25246
## + Aggression 1 1.6876e+08 3.9739e+11 25247
## - Age 1 9.3077e+09 4.0686e+11 25273
## - ShotPower 1 1.7503e+10 4.1506e+11 25299
## - Positioning:Composure 1 1.8649e+11 5.8405e+11 25740
summary(wage_model_st_nl_2_step)
##
## Call:
## lm(formula = Wage ~ Age + ShotPower + Positioning + Composure +
## Positioning:Composure, data = train_df_st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69304 -5565 386 5182 257753
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 378121.671 18793.092 20.120 < 2e-16 ***
## Age -732.952 133.630 -5.485 4.98e-08 ***
## ShotPower 668.100 88.824 7.522 1.01e-13 ***
## Positioning -6270.832 302.640 -20.720 < 2e-16 ***
## Composure -7453.830 337.457 -22.088 < 2e-16 ***
## Positioning:Composure 116.731 4.754 24.552 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17590 on 1285 degrees of freedom
## Multiple R-squared: 0.5013, Adjusted R-squared: 0.4994
## F-statistic: 258.4 on 5 and 1285 DF, p-value: < 2.2e-16
wage_model_st_nl_2_step_pred <- predict(wage_model_st_nl_2_step,
valid_df_st)
accuracy(wage_model_st_nl_2_step_pred, valid_df_st$Wage)
## ME RMSE MAE MPE MAPE
## Test set -584.0779 13718.68 8800.113 -31.92455 112.5375
gvlma(wage_model_st_nl_2_step)
##
## Call:
## lm(formula = Wage ~ Age + ShotPower + Positioning + Composure +
## Positioning:Composure, data = train_df_st)
##
## Coefficients:
## (Intercept) Age ShotPower
## 378121.7 -733.0 668.1
## Positioning Composure Positioning:Composure
## -6270.8 -7453.8 116.7
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = wage_model_st_nl_2_step)
##
## Value p-value Decision
## Global Stat 339857.02 0.000e+00 Assumptions NOT satisfied!
## Skewness 7928.00 0.000e+00 Assumptions NOT satisfied!
## Kurtosis 331364.96 0.000e+00 Assumptions NOT satisfied!
## Link Function 505.62 0.000e+00 Assumptions NOT satisfied!
## Heteroscedasticity 58.44 2.098e-14 Assumptions NOT satisfied!
Sometimes, the data need to be transformed. A common transformation is the log transformation.
A traditional statistics approach using a log transformation.
Here, the predictors are transformed using a log function.
# Linear-log specification: every predictor enters on the natural-log scale
# while the response (Wage) stays in its original units.
wage_model_st_log <- lm(Wage ~ log(Age) + log(Balance) + log(ShotPower) +
log(Aggression) + log(Positioning) + log(Composure),
data = football_st_2)
# Print the coefficient table and the overall fit statistics.
summary(wage_model_st_log)
##
## Call:
## lm(formula = Wage ~ log(Age) + log(Balance) + log(ShotPower) +
## log(Aggression) + log(Positioning) + log(Composure), data = football_st_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27838 -8379 -2853 4132 361712
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -345274 16356 -21.109 < 2e-16 ***
## log(Age) -22193 2887 -7.688 2.26e-14 ***
## log(Balance) 6921 2150 3.220 0.0013 **
## log(ShotPower) 29539 4823 6.125 1.08e-09 ***
## log(Aggression) 1259 1621 0.777 0.4374
## log(Positioning) 42091 5239 8.034 1.54e-15 ***
## log(Composure) 23706 4207 5.634 1.99e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19460 on 2145 degrees of freedom
## Multiple R-squared: 0.2529, Adjusted R-squared: 0.2508
## F-statistic: 121 on 6 and 2145 DF, p-value: < 2.2e-16
gvlma(wage_model_st_log)
##
## Call:
## lm(formula = Wage ~ log(Age) + log(Balance) + log(ShotPower) +
## log(Aggression) + log(Positioning) + log(Composure), data = football_st_2)
##
## Coefficients:
## (Intercept) log(Age) log(Balance) log(ShotPower)
## -345274 -22193 6921 29539
## log(Aggression) log(Positioning) log(Composure)
## 1259 42091 23706
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = wage_model_st_log)
##
## Value p-value Decision
## Global Stat 1307658.4 0 Assumptions NOT satisfied!
## Skewness 26779.0 0 Assumptions NOT satisfied!
## Kurtosis 1280038.4 0 Assumptions NOT satisfied!
## Link Function 670.1 0 Assumptions NOT satisfied!
## Heteroscedasticity 170.9 0 Assumptions NOT satisfied!
We can also use a data mining approach with the log transformation.
wage_model_st_log_2 <- lm(Wage ~ log(Age) + log(Balance) + log(ShotPower) +
log(Aggression) + log(Positioning) + log(Composure),
data = train_df_st)
summary(wage_model_st_log_2)
##
## Call:
## lm(formula = Wage ~ log(Age) + log(Balance) + log(ShotPower) +
## log(Aggression) + log(Positioning) + log(Composure), data = train_df_st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28621 -8672 -3007 4368 359491
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -362624.54 23959.55 -15.135 < 2e-16 ***
## log(Age) -19955.08 4304.36 -4.636 3.91e-06 ***
## log(Balance) 7001.43 3201.89 2.187 0.0289 *
## log(ShotPower) 28016.65 7149.05 3.919 9.36e-05 ***
## log(Aggression) 64.03 2383.50 0.027 0.9786
## log(Positioning) 42553.11 7708.04 5.521 4.08e-08 ***
## log(Composure) 28357.91 6083.46 4.661 3.47e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21950 on 1284 degrees of freedom
## Multiple R-squared: 0.2241, Adjusted R-squared: 0.2204
## F-statistic: 61.79 on 6 and 1284 DF, p-value: < 2.2e-16
wage_model_st_log_2_pred <- predict(wage_model_st_log_2,
valid_df_st)
accuracy(wage_model_st_log_2_pred, valid_df_st$Wage)
## ME RMSE MAE MPE MAPE
## Test set 20.75738 15065.11 9419.259 -25.61638 127.768
sd(football_st_2$Wage)
## [1] 22484.99
range(football_st_2$Wage)
## [1] 1105 407609
gvlma(wage_model_st_log_2)
##
## Call:
## lm(formula = Wage ~ log(Age) + log(Balance) + log(ShotPower) +
## log(Aggression) + log(Positioning) + log(Composure), data = train_df_st)
##
## Coefficients:
## (Intercept) log(Age) log(Balance) log(ShotPower)
## -362624.54 -19955.08 7001.43 28016.65
## log(Aggression) log(Positioning) log(Composure)
## 64.03 42553.11 28357.91
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = wage_model_st_log_2)
##
## Value p-value Decision
## Global Stat 748249.63 0 Assumptions NOT satisfied!
## Skewness 17669.59 0 Assumptions NOT satisfied!
## Kurtosis 730101.21 0 Assumptions NOT satisfied!
## Link Function 402.67 0 Assumptions NOT satisfied!
## Heteroscedasticity 76.17 0 Assumptions NOT satisfied!
A stepwise regression using data mining and log transformations.
wage_model_st_log_2_step <- step(wage_model_st_log_2,
direction = "both")
## Start: AIC=25818.01
## Wage ~ log(Age) + log(Balance) + log(ShotPower) + log(Aggression) +
## log(Positioning) + log(Composure)
##
## Df Sum of Sq RSS AIC
## - log(Aggression) 1 3.4775e+05 6.1864e+11 25816
## <none> 6.1864e+11 25818
## - log(Balance) 1 2.3037e+09 6.2094e+11 25821
## - log(ShotPower) 1 7.3996e+09 6.2604e+11 25831
## - log(Age) 1 1.0355e+10 6.2899e+11 25837
## - log(Composure) 1 1.0469e+10 6.2911e+11 25838
## - log(Positioning) 1 1.4684e+10 6.3332e+11 25846
##
## Step: AIC=25816.01
## Wage ~ log(Age) + log(Balance) + log(ShotPower) + log(Positioning) +
## log(Composure)
##
## Df Sum of Sq RSS AIC
## <none> 6.1864e+11 25816
## + log(Aggression) 1 3.4775e+05 6.1864e+11 25818
## - log(Balance) 1 2.3247e+09 6.2096e+11 25819
## - log(ShotPower) 1 7.7312e+09 6.2637e+11 25830
## - log(Composure) 1 1.0688e+10 6.2933e+11 25836
## - log(Age) 1 1.0936e+10 6.2957e+11 25837
## - log(Positioning) 1 1.4687e+10 6.3333e+11 25844
summary(wage_model_st_log_2_step)
##
## Call:
## lm(formula = Wage ~ log(Age) + log(Balance) + log(ShotPower) +
## log(Positioning) + log(Composure), data = train_df_st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28606 -8666 -2990 4367 359474
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -362686 23840 -15.214 < 2e-16 ***
## log(Age) -19928 4181 -4.766 2.09e-06 ***
## log(Balance) 6992 3182 2.197 0.0282 *
## log(ShotPower) 28055 7001 4.007 6.49e-05 ***
## log(Positioning) 42555 7705 5.523 4.02e-08 ***
## log(Composure) 28380 6023 4.712 2.72e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21940 on 1285 degrees of freedom
## Multiple R-squared: 0.2241, Adjusted R-squared: 0.221
## F-statistic: 74.21 on 5 and 1285 DF, p-value: < 2.2e-16
gvlma(wage_model_st_log_2_step)
##
## Call:
## lm(formula = Wage ~ log(Age) + log(Balance) + log(ShotPower) +
## log(Positioning) + log(Composure), data = train_df_st)
##
## Coefficients:
## (Intercept) log(Age) log(Balance) log(ShotPower)
## -362686 -19928 6992 28055
## log(Positioning) log(Composure)
## 42555 28380
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = wage_model_st_log_2_step)
##
## Value p-value Decision
## Global Stat 748063.91 0 Assumptions NOT satisfied!
## Skewness 17667.20 0 Assumptions NOT satisfied!
## Kurtosis 729919.19 0 Assumptions NOT satisfied!
## Link Function 401.37 0 Assumptions NOT satisfied!
## Heteroscedasticity 76.15 0 Assumptions NOT satisfied!