Regression models to predict the wages of football players.
Load the data and explore them.
# Read the raw CSV without treating the first line as a header, so the column
# titles arrive as the first data row and the columns get default names V1..V88.
football <- read.csv(file = "football_2.csv", header = FALSE)
# Show the first ten rows of the freshly loaded data.
head(football, n = 10)
## V1 V2 V3 V4
## 1 ID Name Age Photo
## 2 207439 L. Paredes 24 https://cdn.sofifa.org/players/4/19/207439.png
## 3 156713 A. Granqvist 33 https://cdn.sofifa.org/players/4/19/156713.png
## 4 229909 A. Lunev 26 https://cdn.sofifa.org/players/4/19/229909.png
## 5 187347 I. Smolnikov 29 https://cdn.sofifa.org/players/4/19/187347.png
## 6 153260 Hilton 40 https://cdn.sofifa.org/players/4/19/153260.png
## 7 187607 A. Dzyuba 29 https://cdn.sofifa.org/players/4/19/187607.png
## 8 204341 Lu\xcc_s Neto 30 https://cdn.sofifa.org/players/4/19/204341.png
## 9 223058 D. Kuzyaev 25 https://cdn.sofifa.org/players/4/19/223058.png
## 10 183389 G. Sio 29 https://cdn.sofifa.org/players/4/19/183389.png
## V5 V6 V7 V8
## 1 Nationality Flag Overall Potential
## 2 Argentina https://cdn.sofifa.org/flags/52.png 80 85
## 3 Sweden https://cdn.sofifa.org/flags/46.png 80 80
## 4 Russia https://cdn.sofifa.org/flags/40.png 79 81
## 5 Russia https://cdn.sofifa.org/flags/40.png 79 79
## 6 Brazil https://cdn.sofifa.org/flags/54.png 78 78
## 7 Russia https://cdn.sofifa.org/flags/40.png 78 78
## 8 Portugal https://cdn.sofifa.org/flags/38.png 77 77
## 9 Russia https://cdn.sofifa.org/flags/40.png 77 80
## 10 Ivory Coast https://cdn.sofifa.org/flags/108.png 77 77
## V9 V10 V11 V12
## 1 Club Club Logo Value Wage
## 2 https://cdn.sofifa.org/flags/52.png 5684 1602
## 3 https://cdn.sofifa.org/flags/46.png 6370 3591
## 4 https://cdn.sofifa.org/flags/40.png 5675 3672
## 5 https://cdn.sofifa.org/flags/40.png 6030 1448
## 6 Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png 6405 19799
## 7 https://cdn.sofifa.org/flags/40.png 5764 1105
## 8 https://cdn.sofifa.org/flags/38.png 6075 2836
## 9 https://cdn.sofifa.org/flags/40.png 5565 2653
## 10 https://cdn.sofifa.org/flags/108.png 5275 2138
## V13 V14 V15 V16 V17
## 1 Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2 2122 Right 2 4 4
## 3 1797 Right 2 4 2
## 4 1217 Right 1 3 1
## 5 2038 Right 2 3 3
## 6 1807 Right 2 3 3
## 7 1810 Right 2 3 3
## 8 1749 Right 1 3 2
## 9 2041 Right 1 3 3
## 10 1933 Left 2 3 3
## V18 V19 V20 V21 V22 V23
## 1 Work Rate Body Type Real Face Position Jersey Number Joined
## 2 Medium/ Medium Normal No CM 5
## 3 High/ Medium Normal No LCB 4
## 4 Medium/ Medium Normal No GK 12
## 5 High/ High Lean No RB 2
## 6 Medium/ Medium Normal Yes CB 4 1-Aug-11
## 7 High/ Medium Stocky No ST 22
## 8 Medium/ Medium Lean No CB 4
## 9 Medium/ High Lean No RM 7
## 10 High/ Low Normal No ST 21
## V24 V25 V26 V27 V28 V29 V30 V31 V32 V33
## 1 Loaned From Contract Valid Until Height Weight LS ST RS LW LF CF
## 2 5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3 6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4 6'2 176lbs
## 5 5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6 2019 5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
## 7 6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2
## 8 6'2 157lbs 52+2 52+2 52+2 51+2 51+2 51+2
## 9 6'0 163lbs 70+2 70+2 70+2 74+2 74+2 74+2
## 10 5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2
## V34 V35 V36 V37 V38 V39 V40 V41 V42 V43 V44 V45 V46 V47 V48
## 1 RF RW LAM CAM RAM LM LCM CM RCM RM LWB LDM CDM RDM RWB
## 2 75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3 58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4
## 5 72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6 59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
## 7 74+2 71+2 71+2 71+2 71+2 71+2 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2
## 8 51+2 51+2 54+2 54+2 54+2 54+2 61+2 61+2 61+2 54+2 67+2 72+2 72+2 72+2 67+2
## 9 74+2 74+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2
## 10 75+2 75+2 74+2 74+2 74+2 74+2 67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2
## V49 V50 V51 V52 V53 V54 V55 V56 V57
## 1 LB LCB CB RCB RB Crossing Finishing HeadingAccuracy ShortPassing
## 2 74+2 72+2 72+2 72+2 74+2 76 55 60 84
## 3 70+2 79+2 79+2 79+2 70+2 49 51 81 73
## 4 16 14 17 25
## 5 78+2 73+2 73+2 73+2 78+2 73 61 69 79
## 6 68+2 76+2 76+2 76+2 68+2 60 45 79 73
## 7 48+2 48+2 48+2 48+2 48+2 61 79 86 71
## 8 69+2 75+2 75+2 75+2 69+2 42 33 80 72
## 9 74+2 70+2 70+2 70+2 74+2 67 64 51 82
## 10 50+2 46+2 46+2 46+2 50+2 68 77 71 73
## V58 V59 V60 V61 V62 V63 V64
## 1 Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2 73 78 79 78 82 82 75
## 3 37 49 36 40 67 63 46
## 4 13 15 18 17 32 17 58
## 5 57 72 49 46 75 72 84
## 6 51 63 42 48 72 73 33
## 7 74 71 64 60 55 77 66
## 8 40 49 52 43 77 48 57
## 9 57 78 60 61 75 79 78
## 10 73 76 73 69 67 76 78
## V65 V66 V67 V68 V69 V70 V71 V72
## 1 SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2 69 77 74 77 82 61 79 69
## 3 49 55 76 36 74 64 67 83
## 4 54 36 76 50 24 60 27 70
## 5 90 80 75 76 67 85 93 68
## 6 38 51 70 60 55 79 54 76
## 7 65 50 75 32 78 63 77 93
## 8 59 69 78 61 42 79 72 72
## 9 81 80 73 76 76 60 79 59
## 10 85 79 71 73 77 70 78 74
## V73 V74 V75 V76 V77 V78 V79
## 1 LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2 80 79 72 74 82 57 74
## 3 59 81 82 54 49 79 78
## 4 13 26 20 11 63 15 69
## 5 57 65 71 77 72 41 73
## 6 58 76 79 50 67 64 70
## 7 68 75 30 78 73 77 70
## 8 37 76 78 44 46 47 72
## 9 74 70 74 71 70 63 64
## 10 74 77 18 76 73 72 72
## V80 V81 V82 V83 V84 V85
## 1 Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2 73 75 72 9 14 6
## 3 82 83 79 7 9 12
## 4 18 20 12 80 73 65
## 5 76 76 80 7 12 10
## 6 83 77 76 12 7 11
## 7 21 15 19 15 12 11
## 8 80 77 78 10 15 13
## 9 71 77 76 15 16 13
## 10 40 18 12 15 9 10
## V86 V87 V88
## 1 GKPositioning GKReflexes Release Clause
## 2 9 10
## 3 10 15
## 4 77 85
## 5 8 15
## 6 12 13
## 7 11 8
## 8 15 8
## 9 7 8
## 10 15 16
# Promote the first row (which holds the column titles, because the file was
# read with header = FALSE) to the column names. The row is flattened to a
# plain character vector first, instead of handing `names<-` a one-row data
# frame and relying on implicit list-to-character coercion.
names(football) <- as.character(unlist(football[1, ], use.names = FALSE))
head(football)
## ID Name Age Photo
## 1 ID Name Age Photo
## 2 207439 L. Paredes 24 https://cdn.sofifa.org/players/4/19/207439.png
## 3 156713 A. Granqvist 33 https://cdn.sofifa.org/players/4/19/156713.png
## 4 229909 A. Lunev 26 https://cdn.sofifa.org/players/4/19/229909.png
## 5 187347 I. Smolnikov 29 https://cdn.sofifa.org/players/4/19/187347.png
## 6 153260 Hilton 40 https://cdn.sofifa.org/players/4/19/153260.png
## Nationality Flag Overall Potential
## 1 Nationality Flag Overall Potential
## 2 Argentina https://cdn.sofifa.org/flags/52.png 80 85
## 3 Sweden https://cdn.sofifa.org/flags/46.png 80 80
## 4 Russia https://cdn.sofifa.org/flags/40.png 79 81
## 5 Russia https://cdn.sofifa.org/flags/40.png 79 79
## 6 Brazil https://cdn.sofifa.org/flags/54.png 78 78
## Club Club Logo Value Wage
## 1 Club Club Logo Value Wage
## 2 https://cdn.sofifa.org/flags/52.png 5684 1602
## 3 https://cdn.sofifa.org/flags/46.png 6370 3591
## 4 https://cdn.sofifa.org/flags/40.png 5675 3672
## 5 https://cdn.sofifa.org/flags/40.png 6030 1448
## 6 Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png 6405 19799
## Special Preferred Foot International Reputation Weak Foot Skill Moves
## 1 Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2 2122 Right 2 4 4
## 3 1797 Right 2 4 2
## 4 1217 Right 1 3 1
## 5 2038 Right 2 3 3
## 6 1807 Right 2 3 3
## Work Rate Body Type Real Face Position Jersey Number Joined
## 1 Work Rate Body Type Real Face Position Jersey Number Joined
## 2 Medium/ Medium Normal No CM 5
## 3 High/ Medium Normal No LCB 4
## 4 Medium/ Medium Normal No GK 12
## 5 High/ High Lean No RB 2
## 6 Medium/ Medium Normal Yes CB 4 1-Aug-11
## Loaned From Contract Valid Until Height Weight LS ST RS LW LF CF
## 1 Loaned From Contract Valid Until Height Weight LS ST RS LW LF CF
## 2 5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3 6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4 6'2 176lbs
## 5 5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6 2019 5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
## RF RW LAM CAM RAM LM LCM CM RCM RM LWB LDM CDM RDM RWB
## 1 RF RW LAM CAM RAM LM LCM CM RCM RM LWB LDM CDM RDM RWB
## 2 75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3 58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4
## 5 72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6 59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
## LB LCB CB RCB RB Crossing Finishing HeadingAccuracy ShortPassing
## 1 LB LCB CB RCB RB Crossing Finishing HeadingAccuracy ShortPassing
## 2 74+2 72+2 72+2 72+2 74+2 76 55 60 84
## 3 70+2 79+2 79+2 79+2 70+2 49 51 81 73
## 4 16 14 17 25
## 5 78+2 73+2 73+2 73+2 78+2 73 61 69 79
## 6 68+2 76+2 76+2 76+2 68+2 60 45 79 73
## Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 1 Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2 73 78 79 78 82 82 75
## 3 37 49 36 40 67 63 46
## 4 13 15 18 17 32 17 58
## 5 57 72 49 46 75 72 84
## 6 51 63 42 48 72 73 33
## SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 1 SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2 69 77 74 77 82 61 79 69
## 3 49 55 76 36 74 64 67 83
## 4 54 36 76 50 24 60 27 70
## 5 90 80 75 76 67 85 93 68
## 6 38 51 70 60 55 79 54 76
## LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 1 LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2 80 79 72 74 82 57 74
## 3 59 81 82 54 49 79 78
## 4 13 26 20 11 63 15 69
## 5 57 65 71 77 72 41 73
## 6 58 76 79 50 67 64 70
## Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 1 Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2 73 75 72 9 14 6
## 3 82 83 79 7 9 12
## 4 18 20 12 80 73 65
## 5 76 76 80 7 12 10
## 6 83 77 76 12 7 11
## GKPositioning GKReflexes Release Clause
## 1 GKPositioning GKReflexes Release Clause
## 2 9 10
## 3 10 15
## 4 77 85
## 5 8 15
## 6 12 13
# Discard the first row, which now only duplicates the column names.
football <- football[-1L, , drop = FALSE]
# Preview the cleaned data.
head(football, n = 6L)
## ID Name Age Photo
## 2 207439 L. Paredes 24 https://cdn.sofifa.org/players/4/19/207439.png
## 3 156713 A. Granqvist 33 https://cdn.sofifa.org/players/4/19/156713.png
## 4 229909 A. Lunev 26 https://cdn.sofifa.org/players/4/19/229909.png
## 5 187347 I. Smolnikov 29 https://cdn.sofifa.org/players/4/19/187347.png
## 6 153260 Hilton 40 https://cdn.sofifa.org/players/4/19/153260.png
## 7 187607 A. Dzyuba 29 https://cdn.sofifa.org/players/4/19/187607.png
## Nationality Flag Overall Potential
## 2 Argentina https://cdn.sofifa.org/flags/52.png 80 85
## 3 Sweden https://cdn.sofifa.org/flags/46.png 80 80
## 4 Russia https://cdn.sofifa.org/flags/40.png 79 81
## 5 Russia https://cdn.sofifa.org/flags/40.png 79 79
## 6 Brazil https://cdn.sofifa.org/flags/54.png 78 78
## 7 Russia https://cdn.sofifa.org/flags/40.png 78 78
## Club Club Logo Value Wage
## 2 https://cdn.sofifa.org/flags/52.png 5684 1602
## 3 https://cdn.sofifa.org/flags/46.png 6370 3591
## 4 https://cdn.sofifa.org/flags/40.png 5675 3672
## 5 https://cdn.sofifa.org/flags/40.png 6030 1448
## 6 Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png 6405 19799
## 7 https://cdn.sofifa.org/flags/40.png 5764 1105
## Special Preferred Foot International Reputation Weak Foot Skill Moves
## 2 2122 Right 2 4 4
## 3 1797 Right 2 4 2
## 4 1217 Right 1 3 1
## 5 2038 Right 2 3 3
## 6 1807 Right 2 3 3
## 7 1810 Right 2 3 3
## Work Rate Body Type Real Face Position Jersey Number Joined
## 2 Medium/ Medium Normal No CM 5
## 3 High/ Medium Normal No LCB 4
## 4 Medium/ Medium Normal No GK 12
## 5 High/ High Lean No RB 2
## 6 Medium/ Medium Normal Yes CB 4 1-Aug-11
## 7 High/ Medium Stocky No ST 22
## Loaned From Contract Valid Until Height Weight LS ST RS LW LF CF
## 2 5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 3 6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 4 6'2 176lbs
## 5 5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 6 2019 5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
## 7 6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2
## RF RW LAM CAM RAM LM LCM CM RCM RM LWB LDM CDM RDM RWB
## 2 75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 3 58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 4
## 5 72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 6 59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
## 7 74+2 71+2 71+2 71+2 71+2 71+2 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2
## LB LCB CB RCB RB Crossing Finishing HeadingAccuracy ShortPassing
## 2 74+2 72+2 72+2 72+2 74+2 76 55 60 84
## 3 70+2 79+2 79+2 79+2 70+2 49 51 81 73
## 4 16 14 17 25
## 5 78+2 73+2 73+2 73+2 78+2 73 61 69 79
## 6 68+2 76+2 76+2 76+2 68+2 60 45 79 73
## 7 48+2 48+2 48+2 48+2 48+2 61 79 86 71
## Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 2 73 78 79 78 82 82 75
## 3 37 49 36 40 67 63 46
## 4 13 15 18 17 32 17 58
## 5 57 72 49 46 75 72 84
## 6 51 63 42 48 72 73 33
## 7 74 71 64 60 55 77 66
## SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 2 69 77 74 77 82 61 79 69
## 3 49 55 76 36 74 64 67 83
## 4 54 36 76 50 24 60 27 70
## 5 90 80 75 76 67 85 93 68
## 6 38 51 70 60 55 79 54 76
## 7 65 50 75 32 78 63 77 93
## LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 2 80 79 72 74 82 57 74
## 3 59 81 82 54 49 79 78
## 4 13 26 20 11 63 15 69
## 5 57 65 71 77 72 41 73
## 6 58 76 79 50 67 64 70
## 7 68 75 30 78 73 77 70
## Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 2 73 75 72 9 14 6
## 3 82 83 79 7 9 12
## 4 18 20 12 80 73 65
## 5 76 76 80 7 12 10
## 6 83 77 76 12 7 11
## 7 21 15 19 15 12 11
## GKPositioning GKReflexes Release Clause
## 2 9 10
## 3 10 15
## 4 77 85
## 5 8 15
## 6 12 13
## 7 11 8
# Number of rows left after removing the header row.
dim(football)[1L]
## [1] 18207
# Frequency of each value in the Position column.
table(football[["Position"]])
##
## CAM CB CDM CF CM GK LAM LB LCB LCM LDM LF LM LS LW
## 60 958 1778 948 74 1394 2025 21 1322 648 395 243 15 1095 207 381
## LWB RAM RB RCB RCM RDM RF RM RS RW RWB ST
## 78 21 1291 662 391 248 16 1124 203 370 87 2152
Strikers are defined in the dataset as Position = “ST”.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the rows whose Position is "ST".
football_st <- dplyr::filter(football, Position == "ST")
head(football_st, n = 6L)
## ID Name Age Photo
## 1 187607 A. Dzyuba 29 https://cdn.sofifa.org/players/4/19/187607.png
## 2 183389 G. Sio 29 https://cdn.sofifa.org/players/4/19/183389.png
## 3 245683 K. Fofana 26 https://cdn.sofifa.org/players/4/19/245683.png
## 4 190461 B. Sigur̡arson 27 https://cdn.sofifa.org/players/4/19/190461.png
## 5 225900 J. Sambenito 26 https://cdn.sofifa.org/players/4/19/225900.png
## 6 246405 B. Angulo 22 https://cdn.sofifa.org/players/4/19/246405.png
## Nationality Flag Overall Potential Club
## 1 Russia https://cdn.sofifa.org/flags/40.png 78 78
## 2 Ivory Coast https://cdn.sofifa.org/flags/108.png 77 77
## 3 Ivory Coast https://cdn.sofifa.org/flags/108.png 75 75
## 4 Iceland https://cdn.sofifa.org/flags/24.png 73 74
## 5 Paraguay https://cdn.sofifa.org/flags/58.png 71 74
## 6 Ecuador https://cdn.sofifa.org/flags/57.png 71 77
## Club Logo Value Wage Special Preferred Foot
## 1 https://cdn.sofifa.org/flags/40.png 5764 1105 1810 Right
## 2 https://cdn.sofifa.org/flags/108.png 5275 2138 1933 Left
## 3 https://cdn.sofifa.org/flags/108.png 5589 3875 1877 Right
## 4 https://cdn.sofifa.org/flags/24.png 5629 3661 1893 Right
## 5 https://cdn.sofifa.org/flags/58.png 6113 2445 1651 Right
## 6 https://cdn.sofifa.org/flags/57.png 5057 2216 1628 Right
## International Reputation Weak Foot Skill Moves Work Rate Body Type
## 1 2 3 3 High/ Medium Stocky
## 2 2 3 3 High/ Low Normal
## 3 1 3 3 Medium/ Medium Normal
## 4 1 4 3 High/ High Normal
## 5 1 3 2 High/ Medium Lean
## 6 1 4 3 High/ Low Normal
## Real Face Position Jersey Number Joined Loaned From Contract Valid Until
## 1 No ST 22
## 2 No ST 21
## 3 No ST 22
## 4 No ST 9
## 5 No ST 9
## 6 No ST 19
## Height Weight LS ST RS LW LF CF RF RW LAM CAM RAM LM
## 1 6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2 74+2 71+2 71+2 71+2 71+2 71+2
## 2 5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 74+2 74+2 74+2 74+2
## 3 6'2 179lbs 73+2 73+2 73+2 71+2 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2
## 4 6'1 190lbs 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2 70+2 70+2 70+2 71+2
## 5 6'0 190lbs 70+2 70+2 70+2 64+2 67+2 67+2 67+2 64+2 63+2 63+2 63+2 62+2
## 6 6'0 154lbs 70+2 70+2 70+2 67+2 68+2 68+2 68+2 67+2 63+2 63+2 63+2 65+2
## LCM CM RCM RM LWB LDM CDM RDM RWB LB LCB CB RCB RB
## 1 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2 48+2 48+2 48+2 48+2 48+2
## 2 67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2 50+2 46+2 46+2 46+2 50+2
## 3 67+2 67+2 67+2 71+2 59+2 57+2 57+2 57+2 59+2 57+2 52+2 52+2 52+2 57+2
## 4 64+2 64+2 64+2 71+2 59+2 55+2 55+2 55+2 59+2 56+2 53+2 53+2 53+2 56+2
## 5 55+2 55+2 55+2 62+2 43+2 41+2 41+2 41+2 43+2 41+2 38+2 38+2 38+2 41+2
## 6 54+2 54+2 54+2 65+2 47+2 39+2 39+2 39+2 47+2 44+2 36+2 36+2 36+2 44+2
## Crossing Finishing HeadingAccuracy ShortPassing Volleys Dribbling Curve
## 1 61 79 86 71 74 71 64
## 2 68 77 71 73 73 76 73
## 3 66 75 72 74 74 72 63
## 4 66 71 68 68 65 73 63
## 5 40 74 72 57 72 60 64
## 6 50 78 69 56 46 76 58
## FKAccuracy LongPassing BallControl Acceleration SprintSpeed Agility Reactions
## 1 60 55 77 66 65 50 75
## 2 69 67 76 78 85 79 71
## 3 59 58 75 59 77 63 72
## 4 48 44 73 78 79 83 74
## 5 42 42 63 79 72 61 69
## 6 58 33 71 82 79 78 73
## Balance ShotPower Jumping Stamina Strength LongShots Aggression Interceptions
## 1 32 78 63 77 93 68 75 30
## 2 73 77 70 78 74 74 77 18
## 3 60 78 69 83 77 73 67 40
## 4 76 68 78 90 85 66 73 42
## 5 64 73 69 67 72 67 49 14
## 6 64 72 69 77 69 54 28 16
## Positioning Vision Penalties Composure Marking StandingTackle SlidingTackle
## 1 78 73 77 70 21 15 19
## 2 76 73 72 72 40 18 12
## 3 72 69 74 83 23 37 46
## 4 73 64 69 76 31 39 24
## 5 75 60 67 74 15 16 16
## 6 62 45 82 51 11 18 12
## GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause
## 1 15 12 11 11 8
## 2 15 9 10 15 16
## 3 7 11 7 11 14
## 4 9 12 10 15 16
## 5 15 16 15 7 7
## 6 11 8 10 7 6
# Number of rows in the "ST" subset.
dim(football_st)[1L]
## [1] 2152
Convert Wage and Value to numeric.
# Wage and Value are still stored as text, as the structure listings show.
str(football_st$Wage)
## chr [1:2152] "1105" "2138" "3875" "3661" "2445" "2216" "4457" "3370" ...
str(football_st$Value)
## chr [1:2152] "5764" "5275" "5589" "5629" "6113" "5057" "6561" "6146" ...
# Convert both columns to numeric in a single assignment.
football_st[c("Wage", "Value")] <- lapply(
  football_st[c("Wage", "Value")],
  as.numeric
)
library(ggplot2)
library(ggpubr)
# Scatter plot of Value against Wage with a fitted straight line, the
# regression equation and the Pearson correlation annotated on the panel.
ggplot(football_st, aes(x = Wage, y = Value)) +
  geom_point(shape = 2, colour = "black") +
  geom_smooth(method = lm) +
  stat_regline_equation(label.x = 150000, label.y = 1700) +
  stat_cor(method = "pearson", label.x = 300000, label.y = 1600) +
  labs(x = "Wage", y = "Value", title = "Wage and Value")
## `geom_smooth()` using formula = 'y ~ x'
# Simple linear regression of Value on Wage. The formula uses bare column
# names together with `data =` instead of `football_st$...` terms, so the
# slope is labelled `Wage` and the fitted model can be used with predict()
# on new data. The estimates themselves are unchanged.
value_simple <- lm(Value ~ Wage, data = football_st)
summary(value_simple)
##
## Call:
## lm(formula = Value ~ Wage, data = football_st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17073527 -633009 -209153 198333 38355242
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.175e+05 7.060e+04 -5.913 3.91e-09 ***
## Wage 2.179e+02 2.721e+00 80.068 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2838000 on 2150 degrees of freedom
## Multiple R-squared: 0.7489, Adjusted R-squared: 0.7487
## F-statistic: 6411 on 1 and 2150 DF, p-value: < 2.2e-16
# 95% confidence intervals for the intercept and the Wage slope.
confint(value_simple, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -555911.3195 -278995.9221
## Wage 212.5681 223.2422
# Standardised residuals of the fitted model `value_simple`.
value_simple_stdresiduals <- rstandard(model = value_simple)
head(value_simple_stdresiduals, n = 6L)
## 1 2 3 4 5 6
## 0.06430004 -0.01520939 -0.14850129 -0.13205208 -0.03849210 -0.02127676
Standardised residuals.
# Attach the standardised residuals to the striker data as an extra column.
football_st_comb <- cbind(
  football_st,
  value_simple_stdresiduals = value_simple_stdresiduals
)
head(football_st_comb, n = 6L)
## ID Name Age Photo
## 1 187607 A. Dzyuba 29 https://cdn.sofifa.org/players/4/19/187607.png
## 2 183389 G. Sio 29 https://cdn.sofifa.org/players/4/19/183389.png
## 3 245683 K. Fofana 26 https://cdn.sofifa.org/players/4/19/245683.png
## 4 190461 B. Sigur̡arson 27 https://cdn.sofifa.org/players/4/19/190461.png
## 5 225900 J. Sambenito 26 https://cdn.sofifa.org/players/4/19/225900.png
## 6 246405 B. Angulo 22 https://cdn.sofifa.org/players/4/19/246405.png
## Nationality Flag Overall Potential Club
## 1 Russia https://cdn.sofifa.org/flags/40.png 78 78
## 2 Ivory Coast https://cdn.sofifa.org/flags/108.png 77 77
## 3 Ivory Coast https://cdn.sofifa.org/flags/108.png 75 75
## 4 Iceland https://cdn.sofifa.org/flags/24.png 73 74
## 5 Paraguay https://cdn.sofifa.org/flags/58.png 71 74
## 6 Ecuador https://cdn.sofifa.org/flags/57.png 71 77
## Club Logo Value Wage Special Preferred Foot
## 1 https://cdn.sofifa.org/flags/40.png 5764 1105 1810 Right
## 2 https://cdn.sofifa.org/flags/108.png 5275 2138 1933 Left
## 3 https://cdn.sofifa.org/flags/108.png 5589 3875 1877 Right
## 4 https://cdn.sofifa.org/flags/24.png 5629 3661 1893 Right
## 5 https://cdn.sofifa.org/flags/58.png 6113 2445 1651 Right
## 6 https://cdn.sofifa.org/flags/57.png 5057 2216 1628 Right
## International Reputation Weak Foot Skill Moves Work Rate Body Type
## 1 2 3 3 High/ Medium Stocky
## 2 2 3 3 High/ Low Normal
## 3 1 3 3 Medium/ Medium Normal
## 4 1 4 3 High/ High Normal
## 5 1 3 2 High/ Medium Lean
## 6 1 4 3 High/ Low Normal
## Real Face Position Jersey Number Joined Loaned From Contract Valid Until
## 1 No ST 22
## 2 No ST 21
## 3 No ST 22
## 4 No ST 9
## 5 No ST 9
## 6 No ST 19
## Height Weight LS ST RS LW LF CF RF RW LAM CAM RAM LM
## 1 6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2 74+2 71+2 71+2 71+2 71+2 71+2
## 2 5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 74+2 74+2 74+2 74+2
## 3 6'2 179lbs 73+2 73+2 73+2 71+2 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2
## 4 6'1 190lbs 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2 70+2 70+2 70+2 71+2
## 5 6'0 190lbs 70+2 70+2 70+2 64+2 67+2 67+2 67+2 64+2 63+2 63+2 63+2 62+2
## 6 6'0 154lbs 70+2 70+2 70+2 67+2 68+2 68+2 68+2 67+2 63+2 63+2 63+2 65+2
## LCM CM RCM RM LWB LDM CDM RDM RWB LB LCB CB RCB RB
## 1 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2 48+2 48+2 48+2 48+2 48+2
## 2 67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2 50+2 46+2 46+2 46+2 50+2
## 3 67+2 67+2 67+2 71+2 59+2 57+2 57+2 57+2 59+2 57+2 52+2 52+2 52+2 57+2
## 4 64+2 64+2 64+2 71+2 59+2 55+2 55+2 55+2 59+2 56+2 53+2 53+2 53+2 56+2
## 5 55+2 55+2 55+2 62+2 43+2 41+2 41+2 41+2 43+2 41+2 38+2 38+2 38+2 41+2
## 6 54+2 54+2 54+2 65+2 47+2 39+2 39+2 39+2 47+2 44+2 36+2 36+2 36+2 44+2
## Crossing Finishing HeadingAccuracy ShortPassing Volleys Dribbling Curve
## 1 61 79 86 71 74 71 64
## 2 68 77 71 73 73 76 73
## 3 66 75 72 74 74 72 63
## 4 66 71 68 68 65 73 63
## 5 40 74 72 57 72 60 64
## 6 50 78 69 56 46 76 58
## FKAccuracy LongPassing BallControl Acceleration SprintSpeed Agility Reactions
## 1 60 55 77 66 65 50 75
## 2 69 67 76 78 85 79 71
## 3 59 58 75 59 77 63 72
## 4 48 44 73 78 79 83 74
## 5 42 42 63 79 72 61 69
## 6 58 33 71 82 79 78 73
## Balance ShotPower Jumping Stamina Strength LongShots Aggression Interceptions
## 1 32 78 63 77 93 68 75 30
## 2 73 77 70 78 74 74 77 18
## 3 60 78 69 83 77 73 67 40
## 4 76 68 78 90 85 66 73 42
## 5 64 73 69 67 72 67 49 14
## 6 64 72 69 77 69 54 28 16
## Positioning Vision Penalties Composure Marking StandingTackle SlidingTackle
## 1 78 73 77 70 21 15 19
## 2 76 73 72 72 40 18 12
## 3 72 69 74 83 23 37 46
## 4 73 64 69 76 31 39 24
## 5 75 60 67 74 15 16 16
## 6 62 45 82 51 11 18 12
## GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause
## 1 15 12 11 11 8
## 2 15 9 10 15 16
## 3 7 11 7 11 14
## 4 9 12 10 15 16
## 5 15 16 15 7 7
## 6 11 8 10 7 6
## value_simple_stdresiduals
## 1 0.06430004
## 2 -0.01520939
## 3 -0.14850129
## 4 -0.13205208
## 5 -0.03849210
## 6 -0.02127676
Plot residuals.
# Standardised residuals plotted against Value. Columns are referenced by
# bare name inside aes(); the earlier `football_st_comb$...` form made ggplot2
# emit "Use of `...` is discouraged" warnings.
ggplot(football_st_comb, aes(x = Value, y = value_simple_stdresiduals)) +
  geom_point() +
  labs(
    x = "Value",
    y = "Standard Residuals",
    title = "Wage and Value Prediction, Residuals"
  )
# Histogram of Value for the striker subset, using the default binning.
ggplot(football_st, aes(x = Value)) +
  geom_histogram() +
  labs(y = "Count", title = "Distribution of Value")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Test the normality of Value using the Shapiro-Wilk test.
H0: the data follow a normal distribution.
H1: the distribution differs from a normal distribution.
# Shapiro-Wilk normality test on the Value column of the striker subset.
shapiro.test(football_st$Value)
##
## Shapiro-Wilk normality test
##
## data: football_st$Value
## W = 0.37447, p-value < 2.2e-16
The Durbin-Watson test for autocorrelation may not be very applicable here, but it is included for illustration.
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Durbin-Watson test for autocorrelation in the residuals of `value_simple`.
car::durbinWatsonTest(model = value_simple)
## lag Autocorrelation D-W Statistic p-value
## 1 0.2167301 1.566536 0
## Alternative hypothesis: rho != 0
Subset data for simplicity.
# Keep only Wage and the six columns used as predictors below.
football_st_2 <- football_st[c(
  "Age", "Balance", "ShotPower", "Aggression",
  "Positioning", "Composure", "Wage"
)]
head(football_st_2, n = 6L)
## Age Balance ShotPower Aggression Positioning Composure Wage
## 1 29 32 78 75 78 70 1105
## 2 29 73 77 77 76 72 2138
## 3 26 60 78 67 72 83 3875
## 4 27 76 68 73 73 76 3661
## 5 26 64 73 49 75 74 2445
## 6 22 64 72 28 62 51 2216
Convert to numeric.
# Convert every remaining character column to numeric. `mutate_if()` is
# superseded in dplyr; `mutate(across(where(...)))` is the current equivalent.
# The repeated `library(dplyr)` call was dropped because dplyr is already
# attached earlier in this script.
football_st_2 <- football_st_2 %>%
  mutate(across(where(is.character), as.numeric))
str(football_st_2)
## 'data.frame': 2152 obs. of 7 variables:
## $ Age : num 29 29 26 27 26 22 22 28 31 28 ...
## $ Balance : num 32 73 60 76 64 64 65 75 69 56 ...
## $ ShotPower : num 78 77 78 68 73 72 66 75 69 71 ...
## $ Aggression : num 75 77 67 73 49 28 30 36 68 59 ...
## $ Positioning: num 78 76 72 73 75 62 76 68 69 72 ...
## $ Composure : num 70 72 83 76 74 51 62 56 80 56 ...
## $ Wage : num 1105 2138 3875 3661 2445 ...
A multiple regression model showing unstandardised estimates.
The predictors included in the model are: Age, Balance, ShotPower, Aggression, Positioning, and Composure.
# List the available columns before writing the model formula.
colnames(football_st_2)
## [1] "Age" "Balance" "ShotPower" "Aggression" "Positioning"
## [6] "Composure" "Wage"
# Multiple linear regression of Wage on the six predictors.
wage_model_st <- lm(
  formula = Wage ~ Age + Balance + ShotPower + Aggression + Positioning +
    Composure,
  data = football_st_2
)
summary(wage_model_st)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression +
## Positioning + Composure, data = football_st_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31822 -8232 -2313 4754 350592
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -77073.40 4064.61 -18.962 < 2e-16 ***
## Age -1014.25 110.94 -9.143 < 2e-16 ***
## Balance 120.41 35.90 3.354 0.00081 ***
## ShotPower 498.07 74.43 6.692 2.81e-11 ***
## Aggression 15.96 32.29 0.494 0.62129
## Positioning 741.71 82.42 8.999 < 2e-16 ***
## Composure 424.72 71.66 5.927 3.58e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18840 on 2145 degrees of freedom
## Multiple R-squared: 0.2997, Adjusted R-squared: 0.2978
## F-statistic: 153 on 6 and 2145 DF, p-value: < 2.2e-16
# Unstandardised coefficient estimates.
coefficients(wage_model_st)
## (Intercept) Age Balance ShotPower Aggression Positioning
## -77073.39877 -1014.24567 120.40620 498.06517 15.95657 741.70804
## Composure
## 424.72405
# 95% confidence intervals (0.95 is the default level of confint()).
confint(wage_model_st)
## 2.5 % 97.5 %
## (Intercept) -85044.38590 -69102.41165
## Age -1231.79758 -796.69375
## Balance 50.00615 190.80626
## ShotPower 352.09956 644.03079
## Aggression -47.37581 79.28895
## Positioning 580.07796 903.33813
## Composure 284.19780 565.25031
A multiple regression model showing standardised estimates.
The predictors included in the model are: Age, Balance, ShotPower, Aggression, Positioning, and Composure.
library(lm.beta)
## Warning: package 'lm.beta' was built under R version 4.4.2
# Add standardised coefficients to the fitted wage model and display them.
wage_model_st_std <- lm.beta(wage_model_st)
coef(wage_model_st_std)
## (Intercept) Age Balance ShotPower Aggression Positioning
## NA -0.21358305 0.06178231 0.20182976 0.01126852 0.30316025
## Composure
## 0.19146721
# Confidence intervals for the standardised coefficients. Calling confint()
# directly on the lm.beta object pairs the standardised estimates with the
# unstandardised standard errors (the old output was centred on the betas but
# hundreds of wage units wide). Instead, rescale the unstandardised intervals
# by the per-term ratio standardised / unstandardised coefficient; this
# treats the sample standard deviations as fixed. The intercept has no
# standardised estimate, so its row stays NA.
std_scale <- coef(wage_model_st_std) / coef(wage_model_st)
round(confint(wage_model_st, level = 0.95) * std_scale, 3)
## 2.5 % 97.5 %
## (Intercept) NA NA
## Age -0.259 -0.168
## Balance 0.026 0.098
## ShotPower 0.143 0.261
## Aggression -0.033 0.056
## Positioning 0.237 0.369
## Composure 0.128 0.255
# Standardised residuals of the multiple regression model.
wage_model_st_residuals <- rstandard(model = wage_model_st)
head(wage_model_st_residuals, n = 6L)
## 1 2 3 4 5 6
## -1.2711799 -1.4183035 -1.5151160 -1.1956035 -1.3820667 -0.5348701
# Append them to the modelling data so they can be plotted per variable.
football_st_comb_2 <- cbind(
  football_st_2,
  wage_model_st_residuals = wage_model_st_residuals
)
head(football_st_comb_2, n = 6L)
## Age Balance ShotPower Aggression Positioning Composure Wage
## 1 29 32 78 75 78 70 1105
## 2 29 73 77 77 76 72 2138
## 3 26 60 78 67 72 83 3875
## 4 27 76 68 73 73 76 3661
## 5 26 64 73 49 75 74 2445
## 6 22 64 72 28 62 51 2216
## wage_model_st_residuals
## 1 -1.2711799
## 2 -1.4183035
## 3 -1.5151160
## 4 -1.1956035
## 5 -1.3820667
## 6 -0.5348701
# Scatter plot of the standardised residuals against one column of
# `football_st_comb_2`.
#   column: name of the column to put on the x axis (string).
#   label:  x-axis label; defaults to the column name.
#   suffix: text after "Standardised Residual Plot, " in the title; defaults
#           to `label`.
# Returns the ggplot object, which is printed when called at top level.
# Label spelling was corrected ("Standarised" -> "Standardised",
# "Positionng" -> "Positioning"); one helper replaces six copy-pasted plots.
plot_std_residuals <- function(column, label = column, suffix = label) {
  ggplot(
    football_st_comb_2,
    aes(x = .data[[column]], y = wage_model_st_residuals)
  ) +
    geom_point() +
    labs(
      x = label,
      y = "Standardised Residuals",
      title = paste0("Standardised Residual Plot, ", suffix)
    )
}

plot_std_residuals("Wage", suffix = "Wage Prediction")
plot_std_residuals("Age")
plot_std_residuals("ShotPower", label = "Shot Power")
plot_std_residuals("Aggression")
plot_std_residuals("Positioning")
plot_std_residuals("Composure")
library(ggplot2)
# Histogram of Wage on the raw scale, using the default binning.
ggplot(football_st_2, aes(x = Wage)) +
  geom_histogram() +
  labs(y = "Count", title = "Distribution of wage (strikers)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# The same histogram with a log10-scaled x axis.
ggplot(football_st_2, aes(x = Wage)) +
  geom_histogram() +
  scale_x_log10() +
  labs(y = "Count", title = "Distribution of log(wage) (strikers)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Test the normality of Wage using the Shapiro-Wilk test.
H0: the data follow a normal distribution.
H1: the distribution differs from a normal distribution.
# Shapiro-Wilk normality test on the Wage column of the modelling data.
shapiro.test(football_st_2$Wage)
##
## Shapiro-Wilk normality test
##
## data: football_st_2$Wage
## W = 0.39056, p-value < 2.2e-16
The variance inflation factor (VIF) measures how much the variance of an estimated regression coefficient increases when the predictors are correlated.
In other words, no pair of predictors should be strongly correlated with each other.
If no predictors are correlated, the VIFs will all be 1.
Rule of thumb: if VIF > 10, multicollinearity is high.
library(car)
# Variance inflation factor of each predictor in the wage model.
car::vif(wage_model_st)
## Age Balance ShotPower Aggression Positioning Composure
## 1.671663 1.039327 2.786601 1.593244 3.476150 3.196433
The Durbin-Watson statistic lies in the range 0 <= D-W <= 4.
Rule of thumb:
D-W = 2 means that there is no autocorrelation.
D-W < 2 means there is positive autocorrelation.
D-W > 2 means there is negative autocorrelation.
The test is designed for time series data, so it is not very applicable here.
# Durbin-Watson test for autocorrelation in the wage model residuals.
car::durbinWatsonTest(model = wage_model_st)
## lag Autocorrelation D-W Statistic p-value
## 1 0.5038085 0.9915208 0
## Alternative hypothesis: rho != 0
Perform a Breusch-Pagan Test to test for heteroskedasticity/homoskedasticity.
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(wage_model_st)
##
## studentized Breusch-Pagan test
##
## data: wage_model_st
## BP = 91.188, df = 6, p-value < 2.2e-16
We can also automatically evaluate the model.
library(gvlma)
gvlma(wage_model_st)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression +
## Positioning + Composure, data = football_st_2)
##
## Coefficients:
## (Intercept) Age Balance ShotPower Aggression Positioning
## -77073.40 -1014.25 120.41 498.07 15.96 741.71
## Composure
## 424.72
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = wage_model_st)
##
## Value p-value Decision
## Global Stat 1307104.5 0 Assumptions NOT satisfied!
## Skewness 26054.7 0 Assumptions NOT satisfied!
## Kurtosis 1280082.5 0 Assumptions NOT satisfied!
## Link Function 791.9 0 Assumptions NOT satisfied!
## Heteroscedasticity 175.5 0 Assumptions NOT satisfied!
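Stepwise regression is a modification of the ordinary regression approach in which predictors are added or removed one at a time; here, step() keeps the change that gives the lowest AIC.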
library(stats)
wage_model_st_step <- step(wage_model_st,
direction = "both")
## Start: AIC=42374.94
## Wage ~ Age + Balance + ShotPower + Aggression + Positioning +
## Composure
##
## Df Sum of Sq RSS AIC
## - Aggression 1 8.6672e+07 7.6162e+11 42373
## <none> 7.6154e+11 42375
## - Balance 1 3.9939e+09 7.6553e+11 42384
## - Composure 1 1.2472e+10 7.7401e+11 42408
## - ShotPower 1 1.5897e+10 7.7743e+11 42417
## - Positioning 1 2.8752e+10 7.9029e+11 42453
## - Age 1 2.9676e+10 7.9121e+11 42455
##
## Step: AIC=42373.18
## Wage ~ Age + Balance + ShotPower + Positioning + Composure
##
## Df Sum of Sq RSS AIC
## <none> 7.6162e+11 42373
## + Aggression 1 8.6672e+07 7.6154e+11 42375
## - Balance 1 3.9197e+09 7.6554e+11 42382
## - Composure 1 1.2939e+10 7.7456e+11 42407
## - ShotPower 1 1.7279e+10 7.7890e+11 42419
## - Positioning 1 2.8770e+10 7.9039e+11 42451
## - Age 1 3.0373e+10 7.9200e+11 42455
summary(wage_model_st_step)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Positioning +
## Composure, data = football_st_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31793 -8228 -2326 4830 350282
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -77250.10 4048.13 -19.083 < 2e-16 ***
## Age -1002.58 108.38 -9.251 < 2e-16 ***
## Balance 118.78 35.74 3.323 0.000904 ***
## ShotPower 506.25 72.55 6.978 3.98e-12 ***
## Positioning 741.93 82.40 9.004 < 2e-16 ***
## Composure 429.17 71.08 6.038 1.83e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18840 on 2146 degrees of freedom
## Multiple R-squared: 0.2997, Adjusted R-squared: 0.298
## F-statistic: 183.6 on 5 and 2146 DF, p-value: < 2.2e-16
gvlma(wage_model_st_step)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Positioning +
## Composure, data = football_st_2)
##
## Coefficients:
## (Intercept) Age Balance ShotPower Positioning Composure
## -77250.1 -1002.6 118.8 506.2 741.9 429.2
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = wage_model_st_step)
##
## Value p-value Decision
## Global Stat 1300530.2 0 Assumptions NOT satisfied!
## Skewness 25983.6 0 Assumptions NOT satisfied!
## Kurtosis 1273577.0 0 Assumptions NOT satisfied!
## Link Function 794.0 0 Assumptions NOT satisfied!
## Heteroscedasticity 175.5 0 Assumptions NOT satisfied!
Now, we will use the data mining approach.
Split the data into training and validation sets.
Set the seed using our favourite number :-)
set.seed(666)
Create the indices for the split. This samples the row indices to split the data into training and validation sets.
# Draw 70% of the row indices (without replacement) for the training set;
# every remaining row index goes to the validation set.
# seq_len() replaces 1:nrow() so that an empty data frame gives an empty
# index vector instead of c(1, 0). floor() makes the truncation of the
# fractional sample size explicit (sample() truncated it silently before),
# so the same indices are drawn for the same seed.
train_index <- sample(seq_len(nrow(football_st_2)),
                      floor(0.7 * nrow(football_st_2)))
valid_index <- setdiff(seq_len(nrow(football_st_2)), train_index)
Using the indices, create the training and validation sets. This is similar in principle to splitting a data frame by row.
train_df_st <- football_st_2[train_index, ]
valid_df_st <- football_st_2[valid_index, ]
It is a good habit to check after splitting.
nrow(train_df_st)
## [1] 1506
nrow(valid_df_st)
## [1] 646
Training the model on the training set.
wage_model_st_2 <- lm(Wage ~ Age + Balance + ShotPower +
Aggression + Positioning + Composure,
data = train_df_st)
summary(wage_model_st_2)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression +
## Positioning + Composure, data = train_df_st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32861 -8569 -2336 5182 347609
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -81327.50 5253.63 -15.480 < 2e-16 ***
## Age -1032.61 146.38 -7.054 2.64e-12 ***
## Balance 131.37 46.63 2.817 0.00491 **
## ShotPower 514.89 98.00 5.254 1.70e-07 ***
## Aggression 13.64 41.73 0.327 0.74380
## Positioning 692.34 107.84 6.420 1.82e-10 ***
## Composure 533.27 93.18 5.723 1.26e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20410 on 1499 degrees of freedom
## Multiple R-squared: 0.2877, Adjusted R-squared: 0.2848
## F-statistic: 100.9 on 6 and 1499 DF, p-value: < 2.2e-16
Predict the outcome (i.e. wage) of the training and validation sets using the model from the training set. Compare the errors between the training and validation sets.
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'forecast'
## The following object is masked from 'package:ggpubr':
##
## gghistogram
wage_model_st_2_pred_train <- predict(wage_model_st_2,
train_df_st)
accuracy(wage_model_st_2_pred_train, train_df_st$Wage)
## ME RMSE MAE MPE MAPE
## Test set -3.02254e-10 20363.72 9804.857 -30.84404 131.6738
wage_model_st_2_pred_valid <- predict(wage_model_st_2,
valid_df_st)
accuracy(wage_model_st_2_pred_valid, valid_df_st$Wage)
## ME RMSE MAE MPE MAPE
## Test set -910.8093 14653.34 9444.861 -32.5431 130.3103
max(train_df_st$Wage) - min(train_df_st$Wage)
## [1] 406504
sd(train_df_st$Wage)
## [1] 24135.81
max(valid_df_st$Wage) - min(valid_df_st$Wage)
## [1] 205030
sd(valid_df_st$Wage)
## [1] 18074.14
library(car)
vif(wage_model_st_2)
## Age Balance ShotPower Aggression Positioning Composure
## 1.690727 1.037080 2.900017 1.602035 3.573232 3.185606
library(lmtest)
bptest(wage_model_st_2)
##
## studentized Breusch-Pagan test
##
## data: wage_model_st_2
## BP = 72.484, df = 6, p-value = 1.264e-13
Predict new players
# Read the new players' predictor values from new.csv (first row is the header).
new <- read.csv("new.csv", header = TRUE)
# Predict wages for the new players with the model fitted on the training set.
# NOTE(review): interval = "confidence" returns bounds (lwr, upr) for the
# *mean* wage at these predictor values; for the plausible range of an
# individual player's wage, interval = "prediction" would be the usual choice.
wage_model_st_2_pred_new <- predict(wage_model_st_2,
newdata = new, interval = "confidence")
# Print the fit / lwr / upr matrix.
wage_model_st_2_pred_new
## fit lwr upr
## 1 21523.43 18689.82 24357.04
## 2 23759.40 20030.25 27488.55
## 3 21465.21 19657.65 23272.77
Subset to include categorical variable: preferred foot
football_st_3 <- football_st[, c("Preferred Foot", "Positioning", "Composure", "Wage")]
head(football_st_3)
## Preferred Foot Positioning Composure Wage
## 1 Right 78 70 1105
## 2 Left 76 72 2138
## 3 Right 72 83 3875
## 4 Right 73 76 3661
## 5 Right 75 74 2445
## 6 Right 62 51 2216
# Rename the first column so it can be referenced directly in a formula.
colnames(football_st_3)[1] <- "Preferred_Foot"

# Coerce both rating columns to numeric in one pass.
football_st_3[c("Positioning", "Composure")] <-
  lapply(football_st_3[c("Positioning", "Composure")], as.numeric)

# Fit wage on preferred foot (as a factor) plus the two ratings, then
# report 95% confidence intervals for the coefficients.
wage_model_st_cat <- lm(
  Wage ~ factor(Preferred_Foot) + Positioning + Composure,
  data = football_st_3
)
confint(wage_model_st_cat, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -71769.7525 -59180.1997
## factor(Preferred_Foot)Right -3482.4352 1307.4754
## Positioning 668.9041 964.3812
## Composure 304.0428 572.1551
Residuals.
wage_model_st_cat_stdresiduals <- rstandard(wage_model_st_cat)
head(wage_model_st_cat_stdresiduals)
## 1 2 3 4 5 6
## -1.3769868 -1.3424571 -1.2770311 -1.1703986 -1.2718790 -0.2163923
football_st_3_cat <- cbind(football_st_3, wage_model_st_cat_stdresiduals)
head(football_st_3_cat)
## Preferred_Foot Positioning Composure Wage wage_model_st_cat_stdresiduals
## 1 Right 78 70 1105 -1.3769868
## 2 Left 76 72 2138 -1.3424571
## 3 Right 72 83 3875 -1.2770311
## 4 Right 73 76 3661 -1.1703986
## 5 Right 75 74 2445 -1.2718790
## 6 Right 62 51 2216 -0.2163923
# Standardised residuals of the categorical model plotted against Wage.
ggplot(
  football_st_3_cat,
  aes(x = Wage, y = wage_model_st_cat_stdresiduals)
) +
  geom_point() +
  labs(
    x = "Wage",
    y = "Standarised Residuals",
    title = "Standarised Residual Plot, Wage"
  )
Positioning
# Standardised residuals of the categorical model plotted against Positioning.
ggplot(
  football_st_3_cat,
  aes(x = Positioning, y = wage_model_st_cat_stdresiduals)
) +
  geom_point() +
  labs(
    x = "Positioning",
    y = "Standarised Residuals",
    title = "Standarised Residual Plot, Positioning"
  )
Composure
# Standardised residuals of the categorical model plotted against Composure.
ggplot(
  football_st_3_cat,
  aes(x = Composure, y = wage_model_st_cat_stdresiduals)
) +
  geom_point() +
  labs(
    x = "Composure",
    y = "Standarised Residuals",
    title = "Standarised Residual Plot, Composure"
  )
# Standardised residuals of the categorical model, one column of points per
# preferred-foot level.
ggplot(
  football_st_3_cat,
  aes(x = Preferred_Foot, y = wage_model_st_cat_stdresiduals)
) +
  geom_point() +
  labs(
    x = "Preferred Foot",
    y = "Standarised Residuals",
    title = "Standarised Residual Plot, Preferred Foot"
  )
# Histogram of Wage for the categorical-model data set (default binning).
ggplot(football_st_3_cat, aes(x = Wage)) +
  geom_histogram() +
  labs(
    y = "Count",
    title = "Distribution of Wage"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Test for normality using the Shapiro-Wilk test.
H-0: the data follow a normal distribution.
H-alt: the distribution is different from a normal distribution.
shapiro.test(football_st_3_cat$Wage)
##
## Shapiro-Wilk normality test
##
## data: football_st_3_cat$Wage
## W = 0.39056, p-value < 2.2e-16
Multicollinearity
vif(wage_model_st_cat)
## factor(Preferred_Foot) Positioning Composure
## 1.002720 2.738872 2.743181
Homoscedasticity.
bptest(wage_model_st_cat)
##
## studentized Breusch-Pagan test
##
## data: wage_model_st_cat
## BP = 82.465, df = 3, p-value < 2.2e-16
gvlma(wage_model_st_cat)
##
## Call:
## lm(formula = Wage ~ factor(Preferred_Foot) + Positioning + Composure,
## data = football_st_3)
##
## Coefficients:
## (Intercept) factor(Preferred_Foot)Right
## -65475.0 -1087.5
## Positioning Composure
## 816.6 438.1
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = wage_model_st_cat)
##
## Value p-value Decision
## Global Stat 1208344.4 0 Assumptions NOT satisfied!
## Skewness 25297.7 0 Assumptions NOT satisfied!
## Kurtosis 1182302.6 0 Assumptions NOT satisfied!
## Link Function 600.3 0 Assumptions NOT satisfied!
## Heteroscedasticity 143.8 0 Assumptions NOT satisfied!
# Reproducible 70/30 split of football_st_3 into training and validation sets.
set.seed(666)

# Sample 70% of the row indices for training. seq_len() replaces 1:nrow(),
# and floor() makes the truncation of the fractional size explicit.
train_index_3 <- sample(seq_len(nrow(football_st_3)),
                        floor(0.7 * nrow(football_st_3)))

# The validation rows are those NOT drawn into train_index_3.
# Fix: this previously subtracted `train_index` (the split of football_st_2),
# which was only correct because the seed and row count happened to match.
valid_index_3 <- setdiff(seq_len(nrow(football_st_3)), train_index_3)

train_df_st_3 <- football_st_3[train_index_3, ]
valid_df_st_3 <- football_st_3[valid_index_3, ]
wage_model_st_cat_2 <- lm(Wage ~ factor(Preferred_Foot) + Positioning +
Composure, data = train_df_st_3)
summary(wage_model_st_cat_2)
##
## Call:
## lm(formula = Wage ~ factor(Preferred_Foot) + Positioning + Composure,
## data = train_df_st_3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33599 -8588 -2271 5035 352343
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -68270.03 4142.64 -16.480 < 2e-16 ***
## factor(Preferred_Foot)Right -2040.14 1572.46 -1.297 0.195
## Positioning 787.78 97.32 8.095 1.17e-15 ***
## Composure 534.07 89.17 5.990 2.63e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20920 on 1502 degrees of freedom
## Multiple R-squared: 0.2501, Adjusted R-squared: 0.2486
## F-statistic: 167 on 3 and 1502 DF, p-value: < 2.2e-16
wage_model_st_cat_2_pred_train <- predict(wage_model_st_cat_2,
train_df_st_3)
accuracy(wage_model_st_cat_2_pred_train, train_df_st_3$Wage)
## ME RMSE MAE MPE MAPE
## Test set -1.589124e-10 20893.75 9883.551 -37.84999 129.0229
wage_model_st_cat_2_pred_valid <- predict(wage_model_st_cat_2,
valid_df_st_3)
accuracy(wage_model_st_cat_2_pred_valid, valid_df_st_3$Wage)
## ME RMSE MAE MPE MAPE
## Test set -844.4194 15387.14 9700.628 -41.79669 130.5894
sd(train_df_st_3$Wage)
## [1] 24135.81
sd(valid_df_st_3$Wage)
## [1] 18074.14
vif(wage_model_st_cat_2)
## factor(Preferred_Foot) Positioning Composure
## 1.004517 2.769896 2.776543
bptest(wage_model_st_cat_2)
##
## studentized Breusch-Pagan test
##
## data: wage_model_st_cat_2
## BP = 64.695, df = 3, p-value = 5.829e-14
new2 <- read.csv("new2.csv")
new2
## Preferred.Foot Positioning Composure
## 1 Right 64 56
## 2 Right 65 47
new2$Preferred.Foot <- as.factor(new2$Preferred.Foot)
names(new2)
## [1] "Preferred.Foot" "Positioning" "Composure"
names(new2)[1] <- "Preferred_Foot"
names(new2)
## [1] "Preferred_Foot" "Positioning" "Composure"
wage_model_st_cat_2_pred_new <- predict(wage_model_st_cat_2,
newdata = new2, interval = "confidence")
wage_model_st_cat_2_pred_new
## fit lwr upr
## 1 10016.014 8776.276 11255.75
## 2 5997.149 3512.208 8482.09
Sometimes, a relationship may not be linear. In this case, we can specify a non-linear relationship in the model.
We start with the traditional statistics approach and evaluate.
The non-linear relationship is expressed in the model specification; here it is an interaction between Positioning and Composure (Positioning * Composure).
names(football_st_2)
## [1] "Age" "Balance" "ShotPower" "Aggression" "Positioning"
## [6] "Composure" "Wage"
# Fit the wage model on the full striker data with an interaction term.
# In a formula, `Positioning * Composure` expands to both main effects plus
# their interaction (Positioning + Composure + Positioning:Composure), which
# is why the summary below lists three coefficients for these two variables.
wage_model_st_nl <- lm(Wage ~ Age + Balance + ShotPower +
Aggression + Positioning * Composure,
data = football_st_2)
# Coefficient table, residual summary and fit statistics.
summary(wage_model_st_nl)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression +
## Positioning * Composure, data = football_st_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58380 -5245 80 4644 267683
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 297675.783 13442.584 22.144 <2e-16 ***
## Age -789.963 94.502 -8.359 <2e-16 ***
## Balance 57.694 30.555 1.888 0.0591 .
## ShotPower 642.408 63.389 10.134 <2e-16 ***
## Aggression 19.805 27.418 0.722 0.4702
## Positioning -5016.022 211.523 -23.714 <2e-16 ***
## Composure -6150.054 235.919 -26.069 <2e-16 ***
## Positioning:Composure 96.301 3.339 28.844 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16000 on 2144 degrees of freedom
## Multiple R-squared: 0.4955, Adjusted R-squared: 0.4939
## F-statistic: 300.8 on 7 and 2144 DF, p-value: < 2.2e-16
vif(wage_model_st_nl)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
## Age Balance ShotPower
## 1.683057 1.044616 2.804077
## Aggression Positioning Composure
## 1.593281 31.765761 48.069231
## Positioning:Composure
## 127.119996
durbinWatsonTest(wage_model_st_nl)
## lag Autocorrelation D-W Statistic p-value
## 1 0.2531554 1.491911 0
## Alternative hypothesis: rho != 0
Perform a stepwise regression with a non-linear relationship and evaluate
wage_model_st_nl_step <- step(wage_model_st_nl,
direction = "both")
## Start: AIC=41671.29
## Wage ~ Age + Balance + ShotPower + Aggression + Positioning *
## Composure
##
## Df Sum of Sq RSS AIC
## - Aggression 1 1.3352e+08 5.4877e+11 41670
## <none> 5.4863e+11 41671
## - Balance 1 9.1234e+08 5.4955e+11 41673
## - Age 1 1.7881e+10 5.6652e+11 41738
## - ShotPower 1 2.6282e+10 5.7492e+11 41770
## - Positioning:Composure 1 2.1290e+11 7.6154e+11 42375
##
## Step: AIC=41669.81
## Wage ~ Age + Balance + ShotPower + Positioning + Composure +
## Positioning:Composure
##
## Df Sum of Sq RSS AIC
## <none> 5.4877e+11 41670
## - Balance 1 8.5698e+08 5.4963e+11 41671
## + Aggression 1 1.3352e+08 5.4863e+11 41671
## - Age 1 1.8041e+10 5.6681e+11 41737
## - ShotPower 1 2.8516e+10 5.7728e+11 41777
## - Positioning:Composure 1 2.1286e+11 7.6162e+11 42373
summary(wage_model_st_nl_step)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Positioning +
## Composure + Positioning:Composure, data = football_st_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58507 -5205 67 4579 267488
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 297410.796 13436.079 22.135 <2e-16 ***
## Age -775.517 92.352 -8.397 <2e-16 ***
## Balance 55.684 30.424 1.830 0.0674 .
## ShotPower 652.547 61.808 10.558 <2e-16 ***
## Positioning -5015.048 211.495 -23.712 <2e-16 ***
## Composure -6143.738 235.730 -26.063 <2e-16 ***
## Positioning:Composure 96.289 3.338 28.844 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15990 on 2145 degrees of freedom
## Multiple R-squared: 0.4954, Adjusted R-squared: 0.494
## F-statistic: 351 on 6 and 2145 DF, p-value: < 2.2e-16
vif(wage_model_st_nl_step)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
## Age Balance ShotPower
## 1.607679 1.035950 2.666586
## Positioning Composure Positioning:Composure
## 31.764471 48.003192 127.116986
durbinWatsonTest(wage_model_st_nl_step)
## lag Autocorrelation D-W Statistic p-value
## 1 0.2522843 1.493672 0
## Alternative hypothesis: rho != 0
A data mining approach with the non-linear relationship.
wage_model_st_nl_2 <- lm(Wage ~ Age + Balance + ShotPower + Aggression +
Positioning * Composure,
data = train_df_st)
summary(wage_model_st_nl_2)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression +
## Positioning * Composure, data = train_df_st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66085 -5447 302 4870 260260
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 342265.457 17203.875 19.895 < 2e-16 ***
## Age -758.277 122.802 -6.175 8.52e-10 ***
## Balance 79.001 39.022 2.025 0.0431 *
## ShotPower 699.312 82.219 8.505 < 2e-16 ***
## Aggression 13.470 34.875 0.386 0.6994
## Positioning -5818.796 271.105 -21.463 < 2e-16 ***
## Composure -6961.508 304.442 -22.866 < 2e-16 ***
## Positioning:Composure 109.127 4.285 25.465 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17060 on 1498 degrees of freedom
## Multiple R-squared: 0.5029, Adjusted R-squared: 0.5006
## F-statistic: 216.5 on 7 and 1498 DF, p-value: < 2.2e-16
Predict the training and validation sets using the non-linear model. Check the accuracy.
wage_model_st_nl_2_pred_train <- predict(wage_model_st_nl_2,
train_df_st)
accuracy(wage_model_st_nl_2_pred_train, train_df_st$Wage)
## ME RMSE MAE MPE MAPE
## Test set 1.966265e-09 17011.8 8516.679 -27.10041 105.2229
wage_model_st_nl_2_pred_valid <- predict(wage_model_st_nl_2,
valid_df_st)
accuracy(wage_model_st_nl_2_pred_valid, valid_df_st$Wage)
## ME RMSE MAE MPE MAPE
## Test set -855.2409 13504.82 8758.766 -32.40985 113.7297
Predict the wages of new players using the non-linear model.
wage_model_st_nl_2_pred_new <- predict(wage_model_st_nl_2,
newdata = new, interval = "confidence")
wage_model_st_nl_2_pred_new
## fit lwr upr
## 1 14285.88 11853.154 16718.61
## 2 12719.88 9489.583 15950.18
## 3 17129.30 15582.278 18676.32
gvlma(wage_model_st_nl_2)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Aggression +
## Positioning * Composure, data = train_df_st)
##
## Coefficients:
## (Intercept) Age Balance
## 342265.46 -758.28 79.00
## ShotPower Aggression Positioning
## 699.31 13.47 -5818.80
## Composure Positioning:Composure
## -6961.51 109.13
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = wage_model_st_nl_2)
##
## Value p-value Decision
## Global Stat 435462.84 0.0000000 Assumptions NOT satisfied!
## Skewness 9663.54 0.0000000 Assumptions NOT satisfied!
## Kurtosis 425239.16 0.0000000 Assumptions NOT satisfied!
## Link Function 545.56 0.0000000 Assumptions NOT satisfied!
## Heteroscedasticity 14.58 0.0001342 Assumptions NOT satisfied!
A data mining approach using a stepwise regression and non-linear relationship.
wage_model_st_nl_2_step <- step(wage_model_st_nl_2,
direction = "both")
## Start: AIC=29357.89
## Wage ~ Age + Balance + ShotPower + Aggression + Positioning *
## Composure
##
## Df Sum of Sq RSS AIC
## - Aggression 1 4.3404e+07 4.3588e+11 29356
## <none> 4.3584e+11 29358
## - Balance 1 1.1925e+09 4.3703e+11 29360
## - Age 1 1.1093e+10 4.4693e+11 29394
## - ShotPower 1 2.1048e+10 4.5689e+11 29427
## - Positioning:Composure 1 1.8867e+11 6.2451e+11 29898
##
## Step: AIC=29356.04
## Wage ~ Age + Balance + ShotPower + Positioning + Composure +
## Positioning:Composure
##
## Df Sum of Sq RSS AIC
## <none> 4.3588e+11 29356
## + Aggression 1 4.3404e+07 4.3584e+11 29358
## - Balance 1 1.1606e+09 4.3704e+11 29358
## - Age 1 1.1307e+10 4.4719e+11 29393
## - ShotPower 1 2.2847e+10 4.5873e+11 29431
## - Positioning:Composure 1 1.8867e+11 6.2455e+11 29896
summary(wage_model_st_nl_2_step)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Positioning +
## Composure + Positioning:Composure, data = train_df_st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66274 -5469 260 4921 260121
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 342109.131 17194.232 19.897 < 2e-16 ***
## Age -748.254 119.996 -6.236 5.84e-10 ***
## Balance 77.598 38.841 1.998 0.0459 *
## ShotPower 706.988 79.759 8.864 < 2e-16 ***
## Positioning -5818.926 271.028 -21.470 < 2e-16 ***
## Composure -6958.216 304.237 -22.871 < 2e-16 ***
## Positioning:Composure 109.128 4.284 25.472 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17050 on 1499 degrees of freedom
## Multiple R-squared: 0.5028, Adjusted R-squared: 0.5008
## F-statistic: 252.7 on 6 and 1499 DF, p-value: < 2.2e-16
Predict the training and validation sets using the stepwise, non-linear model. Check the accuracy.
wage_model_st_nl_2_step_pred_train <- predict(wage_model_st_nl_2_step,
train_df_st)
accuracy(wage_model_st_nl_2_step_pred_train, train_df_st$Wage)
## ME RMSE MAE MPE MAPE
## Test set 1.982657e-09 17012.65 8519.261 -27.09032 105.1927
wage_model_st_nl_2_step_pred_valid <- predict(wage_model_st_nl_2_step,
valid_df_st)
accuracy(wage_model_st_nl_2_step_pred_valid, valid_df_st$Wage)
## ME RMSE MAE MPE MAPE
## Test set -852.6068 13509.38 8763.627 -32.34321 113.7652
Predict the wages of new players using the stepwise non-linear model.
wage_model_st_nl_2_step_pred_new <- predict(wage_model_st_nl_2_step,
newdata = new, interval = "confidence")
wage_model_st_nl_2_step_pred_new
## fit lwr upr
## 1 14529.97 12437.349 16622.59
## 2 12274.73 9968.014 14581.45
## 3 17001.40 15597.748 18405.04
gvlma(wage_model_st_nl_2_step)
##
## Call:
## lm(formula = Wage ~ Age + Balance + ShotPower + Positioning +
## Composure + Positioning:Composure, data = train_df_st)
##
## Coefficients:
## (Intercept) Age Balance
## 342109.1 -748.3 77.6
## ShotPower Positioning Composure
## 707.0 -5818.9 -6958.2
## Positioning:Composure
## 109.1
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = wage_model_st_nl_2_step)
##
## Value p-value Decision
## Global Stat 433134.34 0.0000000 Assumptions NOT satisfied!
## Skewness 9632.18 0.0000000 Assumptions NOT satisfied!
## Kurtosis 422942.94 0.0000000 Assumptions NOT satisfied!
## Link Function 544.53 0.0000000 Assumptions NOT satisfied!
## Heteroscedasticity 14.69 0.0001267 Assumptions NOT satisfied!
Sometimes, the data need to be transformed. A common transformation is the log transformation.
A traditional statistics approach using a log transformation.
Here, both the response (Wage) and the predictors are transformed using a log function.
wage_model_st_log <- lm(log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) +
log(Aggression) + log(Positioning) + log(Composure),
data = football_st_2)
summary(wage_model_st_log)
##
## Call:
## lm(formula = log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) +
## log(Aggression) + log(Positioning) + log(Composure), data = football_st_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.58521 -0.38112 -0.03033 0.36168 2.32404
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.49560 0.49568 -19.157 < 2e-16 ***
## log(Age) -0.79291 0.08748 -9.064 < 2e-16 ***
## log(Balance) 0.23309 0.06514 3.578 0.000354 ***
## log(ShotPower) 1.66883 0.14615 11.418 < 2e-16 ***
## log(Aggression) 0.06081 0.04912 1.238 0.215839
## log(Positioning) 1.94852 0.15876 12.273 < 2e-16 ***
## log(Composure) 1.16000 0.12750 9.098 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5898 on 2145 degrees of freedom
## Multiple R-squared: 0.4989, Adjusted R-squared: 0.4975
## F-statistic: 355.9 on 6 and 2145 DF, p-value: < 2.2e-16
gvlma(wage_model_st_log)
##
## Call:
## lm(formula = log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) +
## log(Aggression) + log(Positioning) + log(Composure), data = football_st_2)
##
## Coefficients:
## (Intercept) log(Age) log(Balance) log(ShotPower)
## -9.49560 -0.79291 0.23309 1.66883
## log(Aggression) log(Positioning) log(Composure)
## 0.06081 1.94852 1.16000
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = wage_model_st_log)
##
## Value p-value Decision
## Global Stat 561.540 0.000e+00 Assumptions NOT satisfied!
## Skewness 2.261 1.326e-01 Assumptions acceptable.
## Kurtosis 66.615 3.331e-16 Assumptions NOT satisfied!
## Link Function 481.799 0.000e+00 Assumptions NOT satisfied!
## Heteroscedasticity 10.865 9.802e-04 Assumptions NOT satisfied!
We can also use a data mining approach with the log transformation.
wage_model_st_log_2 <- lm(log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) +
log(Aggression) + log(Positioning) + log(Composure),
data = train_df_st)
summary(wage_model_st_log_2)
##
## Call:
## lm(formula = log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) +
## log(Aggression) + log(Positioning) + log(Composure), data = train_df_st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.55406 -0.38093 -0.03289 0.35960 2.32610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.67383 0.59662 -16.214 < 2e-16 ***
## log(Age) -0.69453 0.10718 -6.480 1.24e-10 ***
## log(Balance) 0.28514 0.07870 3.623 0.000301 ***
## log(ShotPower) 1.49775 0.17843 8.394 < 2e-16 ***
## log(Aggression) 0.07195 0.05938 1.212 0.225846
## log(Positioning) 1.87362 0.19243 9.737 < 2e-16 ***
## log(Composure) 1.31511 0.15334 8.577 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5945 on 1499 degrees of freedom
## Multiple R-squared: 0.4985, Adjusted R-squared: 0.4965
## F-statistic: 248.3 on 6 and 1499 DF, p-value: < 2.2e-16
Predict the training and validation sets. Check the accuracy.
# The model's response is log(Wage), so these predictions are on the log scale.
wage_model_st_log_2_pred_train <- predict(wage_model_st_log_2,
train_df_st)
# Store log(Wage) so that predictions and actual values are on the same scale.
train_df_st$logWage <- log(train_df_st$Wage)
# The error metrics are in log units, so they are not directly comparable
# with the RMSE/MAE of the models fitted on raw Wage.
accuracy(wage_model_st_log_2_pred_train, train_df_st$logWage)
## ME RMSE MAE MPE MAPE
## Test set 4.312485e-14 0.5931541 0.4587753 -0.4038503 5.089574
# Validation-set predictions from the log model (on the log(Wage) scale).
wage_model_st_log_2_pred_valid <- predict(wage_model_st_log_2,
valid_df_st)
# Store log(Wage) so that predictions and actual values are on the same scale.
valid_df_st$logWage <- log(valid_df_st$Wage)
# Error metrics in log units; compare with the training-set figures to check
# for overfitting.
accuracy(wage_model_st_log_2_pred_valid, valid_df_st$logWage)
## ME RMSE MAE MPE MAPE
## Test set 0.0002915404 0.5808893 0.4554191 -0.3779982 5.050841
gvlma(wage_model_st_log_2)
##
## Call:
## lm(formula = log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) +
## log(Aggression) + log(Positioning) + log(Composure), data = train_df_st)
##
## Coefficients:
## (Intercept) log(Age) log(Balance) log(ShotPower)
## -9.67383 -0.69453 0.28514 1.49775
## log(Aggression) log(Positioning) log(Composure)
## 0.07195 1.87362 1.31511
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = wage_model_st_log_2)
##
## Value p-value Decision
## Global Stat 394.158 0.000e+00 Assumptions NOT satisfied!
## Skewness 1.842 1.747e-01 Assumptions acceptable.
## Kurtosis 54.044 1.960e-13 Assumptions NOT satisfied!
## Link Function 336.747 0.000e+00 Assumptions NOT satisfied!
## Heteroscedasticity 1.524 2.170e-01 Assumptions acceptable.
Predict new records
new3 <- read.csv("new3.csv", header = TRUE)
new3
## X Age Balance ShotPower Aggression Positioning Composure
## 1 1 25 66 69 55 72 71
## 2 2 26 58 76 75 66 66
## 3 3 19 80 67 33 43 52
wage_model_st_log_2_pred_new3 <- predict(wage_model_st_log_2,
newdata = new3, interval = "confidence")
wage_model_st_log_2_pred_new3
## fit lwr upr
## 1 9.533908 9.484214 9.583602
## 2 9.377799 9.315354 9.440244
## 3 8.323197 8.170023 8.476371
Results as a data frame, back-transformed from the log scale to wages (if desired).
# Convert the prediction matrix (fit, lwr, upr) to a data frame.
wage_model_st_log_2_pred_new3_df <-
  as.data.frame(wage_model_st_log_2_pred_new3)

# Back-transform from log(Wage) to the wage scale. exp() is applied
# element-wise to every column and replaces the roundabout exp(1)^x.
wage_model_st_log_2_pred_new3_df_value <-
  exp(wage_model_st_log_2_pred_new3_df)
wage_model_st_log_2_pred_new3_df_value
## fit lwr upr
## 1 13820.495 13150.482 14524.645
## 2 11822.968 11107.261 12584.791
## 3 4118.303 3533.423 4799.997
A stepwise regression using data mining and log transformations.
wage_model_st_log_2_step <- step(wage_model_st_log_2,
direction = "both")
## Start: AIC=-1559.17
## log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) + log(Aggression) +
## log(Positioning) + log(Composure)
##
## Df Sum of Sq RSS AIC
## - log(Aggression) 1 0.519 530.38 -1559.7
## <none> 529.86 -1559.2
## - log(Balance) 1 4.640 534.50 -1548.0
## - log(Age) 1 14.843 544.70 -1519.6
## - log(ShotPower) 1 24.907 554.77 -1492.0
## - log(Composure) 1 26.002 555.86 -1489.0
## - log(Positioning) 1 33.510 563.37 -1468.8
##
## Step: AIC=-1559.7
## log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) + log(Positioning) +
## log(Composure)
##
## Df Sum of Sq RSS AIC
## <none> 530.38 -1559.7
## + log(Aggression) 1 0.519 529.86 -1559.2
## - log(Balance) 1 4.393 534.77 -1549.3
## - log(Age) 1 14.336 544.71 -1521.5
## - log(Composure) 1 27.350 557.73 -1486.0
## - log(ShotPower) 1 27.771 558.15 -1484.8
## - log(Positioning) 1 33.605 563.98 -1469.2
summary(wage_model_st_log_2_step)
##
## Call:
## lm(formula = log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) +
## log(Positioning) + log(Composure), data = train_df_st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5474 -0.3754 -0.0318 0.3602 2.3170
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.74502 0.59382 -16.411 < 2e-16 ***
## log(Age) -0.66674 0.10471 -6.367 2.55e-10 ***
## log(Balance) 0.27624 0.07837 3.525 0.000436 ***
## log(ShotPower) 1.54434 0.17426 8.862 < 2e-16 ***
## log(Positioning) 1.87617 0.19245 9.749 < 2e-16 ***
## log(Composure) 1.33827 0.15216 8.795 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5946 on 1500 degrees of freedom
## Multiple R-squared: 0.498, Adjusted R-squared: 0.4963
## F-statistic: 297.6 on 5 and 1500 DF, p-value: < 2.2e-16
gvlma(wage_model_st_log_2_step)
##
## Call:
## lm(formula = log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) +
## log(Positioning) + log(Composure), data = train_df_st)
##
## Coefficients:
## (Intercept) log(Age) log(Balance) log(ShotPower)
## -9.7450 -0.6667 0.2762 1.5443
## log(Positioning) log(Composure)
## 1.8762 1.3383
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = wage_model_st_log_2_step)
##
## Value p-value Decision
## Global Stat 393.169 0.000e+00 Assumptions NOT satisfied!
## Skewness 2.178 1.400e-01 Assumptions acceptable.
## Kurtosis 52.586 4.117e-13 Assumptions NOT satisfied!
## Link Function 336.860 0.000e+00 Assumptions NOT satisfied!
## Heteroscedasticity 1.545 2.139e-01 Assumptions acceptable.
Predict the training and validation sets. Check the accuracy.
wage_model_st_log_2_step_pred_train <- predict(wage_model_st_log_2_step, train_df_st)
accuracy(wage_model_st_log_2_step_pred_train, train_df_st$logWage)
## ME RMSE MAE MPE MAPE
## Test set 4.4382e-14 0.5934445 0.4588034 -0.4041652 5.089808
# Predictions of the stepwise log model on the validation set.
wage_model_st_log_2_step_pred_valid <- predict(
  wage_model_st_log_2_step,
  newdata = valid_df_st
)
# Error measures of those predictions against the logWage column.
accuracy(wage_model_st_log_2_step_pred_valid, valid_df_st$logWage)
## ME RMSE MAE MPE MAPE
## Test set 0.0007094787 0.580847 0.4553575 -0.3734178 5.049277
# Repeats the assumption check already run after the model summary; the model
# is unchanged, so the diagnostics printed below are the same.
gvlma(x = wage_model_st_log_2_step)
##
## Call:
## lm(formula = log(Wage) ~ log(Age) + log(Balance) + log(ShotPower) +
## log(Positioning) + log(Composure), data = train_df_st)
##
## Coefficients:
## (Intercept) log(Age) log(Balance) log(ShotPower)
## -9.7450 -0.6667 0.2762 1.5443
## log(Positioning) log(Composure)
## 1.8762 1.3383
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = wage_model_st_log_2_step)
##
## Value p-value Decision
## Global Stat 393.169 0.000e+00 Assumptions NOT satisfied!
## Skewness 2.178 1.400e-01 Assumptions acceptable.
## Kurtosis 52.586 4.117e-13 Assumptions NOT satisfied!
## Link Function 336.860 0.000e+00 Assumptions NOT satisfied!
## Heteroscedasticity 1.545 2.139e-01 Assumptions acceptable.
Predict new records using the stepwise log model, then back-transform the predictions from the log scale to wages.
# Log-scale predictions for the records in `new3`.
# interval = "confidence" bounds the mean response for each row, not an
# individual observation (that would be interval = "prediction").
wage_model_st_log_2_step_pred_new3 <- predict(
  wage_model_st_log_2_step,
  newdata = new3,
  interval = "confidence"
)
wage_model_st_log_2_step_pred_new3
## fit lwr upr
## 1 9.533442 9.483746 9.583138
## 2 9.359849 9.304570 9.415129
## 3 8.340250 8.189562 8.490939
# Back-transform the log-scale fit and interval bounds to the wage scale.
# exp() is the direct inverse of the log() applied to Wage in the model
# formula and works column-wise on an all-numeric data frame.
wage_model_st_log_2_step_pred_new3_df <- as.data.frame(
  wage_model_st_log_2_step_pred_new3
)
wage_model_st_log_2_step_pred_new3_df_value <- exp(
  wage_model_st_log_2_step_pred_new3_df
)
wage_model_st_log_2_step_pred_new3_df_value
## fit lwr upr
## 1 13814.060 13144.332 14517.911
## 2 11612.636 10988.116 12272.652
## 3 4189.138 3603.142 4870.437
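We can also combine different specifications in one model: here, a log-transformed predictor (Age) together with the product of two untransformed predictors (Positioning * Composure).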
# Mixed specification: log response, log(Age), and the raw product of
# Positioning and Composure. I() makes `*` plain multiplication instead of
# the formula crossing operator.
wage_model_st_log_3 <- lm(
  formula = log(Wage) ~ log(Age) + I(Positioning * Composure),
  data = train_df_st
)
summary(object = wage_model_st_log_3)
##
## Call:
## lm(formula = log(Wage) ~ log(Age) + I(Positioning * Composure),
## data = train_df_st)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.73661 -0.35786 -0.00011 0.35532 2.05304
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.666e+00 2.793e-01 31.03 < 2e-16 ***
## log(Age) -6.139e-01 9.807e-02 -6.26 5.02e-10 ***
## I(Positioning * Composure) 5.760e-04 1.599e-05 36.02 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.577 on 1503 degrees of freedom
## Multiple R-squared: 0.5264, Adjusted R-squared: 0.5258
## F-statistic: 835.2 on 2 and 1503 DF, p-value: < 2.2e-16
Predict the training and validation sets. Check the accuracy.
# Response on the log scale, used as the reference values for accuracy().
train_df_st$logWage <- log(train_df_st$Wage)
# Predictions on the fitting data; the model's response is log(Wage), so
# these are on the log scale as well.
wage_model_st_log_3_pred_train <- predict(
  wage_model_st_log_3,
  newdata = train_df_st
)
accuracy(wage_model_st_log_3_pred_train, train_df_st$logWage)
## ME RMSE MAE MPE MAPE
## Test set 3.847045e-14 0.5764289 0.4423697 -0.388299 4.908134
# Response on the log scale, used as the reference values for accuracy().
valid_df_st$logWage <- log(valid_df_st$Wage)
# Log-scale predictions of the combined model on the validation set.
wage_model_st_log_3_pred_valid <- predict(
  wage_model_st_log_3,
  newdata = valid_df_st
)
accuracy(wage_model_st_log_3_pred_valid, valid_df_st$logWage)
## ME RMSE MAE MPE MAPE
## Test set 0.002845158 0.5841898 0.4527094 -0.3440159 5.01038
# Variance inflation factors for the two model terms (collinearity check).
# NOTE(review): vif() is presumably car::vif -- confirm against the
# library() calls earlier in the file.
vif(wage_model_st_log_3)
## log(Age) I(Positioning * Composure)
## 1.568088 1.568088
Predict new records
# Load the new records to score; the first line of the file holds the
# column names.
# NOTE(review): `new3` is already used by the stepwise-model prediction
# above, so it is presumably also loaded earlier in the file -- confirm.
new3 <- read.csv(file = "new3.csv", header = TRUE)
new3
## X Age Balance ShotPower Aggression Positioning Composure
## 1 1 25 66 69 55 72 71
## 2 2 26 58 76 75 66 66
## 3 3 19 80 67 33 43 52
# Log-scale predictions for the records in `new3` from the combined model.
# interval = "confidence" bounds the mean response for each row, not an
# individual observation (that would be interval = "prediction").
wage_model_st_log_3_pred_new3 <- predict(
  wage_model_st_log_3,
  newdata = new3,
  interval = "confidence"
)
wage_model_st_log_3_pred_new3
## fit lwr upr
## 1 9.634869 9.590987 9.678752
## 2 9.175323 9.143823 9.206824
## 3 8.146727 8.092877 8.200578
If desired, convert the results to a data frame and back-transform them from the log scale to the original wage scale.
# Back-transform the log-scale fit and interval bounds to the wage scale.
# exp() is the direct inverse of the log() applied to Wage in the model
# formula and works column-wise on an all-numeric data frame.
wage_model_st_log_3_pred_new3_df <- as.data.frame(
  wage_model_st_log_3_pred_new3
)
wage_model_st_log_3_pred_new3_df_value <- exp(
  wage_model_st_log_3_pred_new3_df
)
wage_model_st_log_3_pred_new3_df_value
## fit lwr upr
## 1 15288.703 14632.298 15974.554
## 2 9655.891 9356.468 9964.895
## 3 3452.064 3271.086 3643.054