Directions

Fit a logistic regression to predict the decision made on a football player (the 0/1 `Decision` column of the data).

Data for demo

Back to the spellbook

1. Load Data

# Read the player data; the first line of the CSV supplies the column names.
football <- read.csv(file = "football_3.csv", header = TRUE)
# Preview the first ten rows.
head(football, n = 10)
##        ID         Name Age                                          Photo
## 1  207439   L. Paredes  24 https://cdn.sofifa.org/players/4/19/207439.png
## 2  156713 A. Granqvist  33 https://cdn.sofifa.org/players/4/19/156713.png
## 3  229909     A. Lunev  26 https://cdn.sofifa.org/players/4/19/229909.png
## 4  187347 I. Smolnikov  29 https://cdn.sofifa.org/players/4/19/187347.png
## 5  153260       Hilton  40 https://cdn.sofifa.org/players/4/19/153260.png
## 6  187607    A. Dzyuba  29 https://cdn.sofifa.org/players/4/19/187607.png
## 7  204341   Luí_s Neto  30 https://cdn.sofifa.org/players/4/19/204341.png
## 8  223058   D. Kuzyaev  25 https://cdn.sofifa.org/players/4/19/223058.png
## 9  183389       G. Sio  29 https://cdn.sofifa.org/players/4/19/183389.png
## 10 156092    J. Villar  41 https://cdn.sofifa.org/players/4/19/156092.png
##    Nationality                                 Flag Overall Potential
## 1    Argentina  https://cdn.sofifa.org/flags/52.png      80        85
## 2       Sweden  https://cdn.sofifa.org/flags/46.png      80        80
## 3       Russia  https://cdn.sofifa.org/flags/40.png      79        81
## 4       Russia  https://cdn.sofifa.org/flags/40.png      79        79
## 5       Brazil  https://cdn.sofifa.org/flags/54.png      78        78
## 6       Russia  https://cdn.sofifa.org/flags/40.png      78        78
## 7     Portugal  https://cdn.sofifa.org/flags/38.png      77        77
## 8       Russia  https://cdn.sofifa.org/flags/40.png      77        80
## 9  Ivory Coast https://cdn.sofifa.org/flags/108.png      77        77
## 10    Paraguay  https://cdn.sofifa.org/flags/58.png      77        77
##               Club                                   Club.Logo Value  Wage
## 1                          https://cdn.sofifa.org/flags/52.png  5684  1602
## 2                          https://cdn.sofifa.org/flags/46.png  6370  3591
## 3                          https://cdn.sofifa.org/flags/40.png  5675  3672
## 4                          https://cdn.sofifa.org/flags/40.png  6030  1448
## 5  Montpellier HSC https://cdn.sofifa.org/teams/2/light/70.png  6405 19799
## 6                          https://cdn.sofifa.org/flags/40.png  5764  1105
## 7                          https://cdn.sofifa.org/flags/38.png  6075  2836
## 8                          https://cdn.sofifa.org/flags/40.png  5565  2653
## 9                         https://cdn.sofifa.org/flags/108.png  5275  2138
## 10                         https://cdn.sofifa.org/flags/58.png  5698  2581
##    Special Preferred.Foot International.Reputation Weak.Foot Skill.Moves
## 1     2122          Right                        2         4           4
## 2     1797          Right                        2         4           2
## 3     1217          Right                        1         3           1
## 4     2038          Right                        2         3           3
## 5     1807          Right                        2         3           3
## 6     1810          Right                        2         3           3
## 7     1749          Right                        1         3           2
## 8     2041          Right                        1         3           3
## 9     1933           Left                        2         3           3
## 10    1168          Right                        2         3           1
##         Work.Rate Body.Type Real.Face Position Jersey.Number   Joined
## 1  Medium/ Medium    Normal        No       CM             5         
## 2    High/ Medium    Normal        No      LCB             4         
## 3  Medium/ Medium    Normal        No       GK            12         
## 4      High/ High      Lean        No       RB             2         
## 5  Medium/ Medium    Normal       Yes       CB             4 1-Aug-11
## 6    High/ Medium    Stocky        No       ST            22         
## 7  Medium/ Medium      Lean        No       CB             4         
## 8    Medium/ High      Lean        No       RM             7         
## 9       High/ Low    Normal        No       ST            21         
## 10 Medium/ Medium    Normal        No       GK             1         
##    Loaned.From Contract.Valid.Until Height Weight   LS   ST   RS   LW   LF   CF
## 1                                     5'11 165lbs 71+2 71+2 71+2 75+2 75+2 75+2
## 2                                      6'4 185lbs 62+2 62+2 62+2 56+2 58+2 58+2
## 3                                      6'2 176lbs                              
## 4                                     5'10 154lbs 70+2 70+2 70+2 73+2 72+2 72+2
## 5                              2019   5'11 172lbs 58+2 58+2 58+2 58+2 59+2 59+2
## 6                                      6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2
## 7                                      6'2 157lbs 52+2 52+2 52+2 51+2 51+2 51+2
## 8                                      6'0 163lbs 70+2 70+2 70+2 74+2 74+2 74+2
## 9                                     5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2
## 10                                    5'11 187lbs                              
##      RF   RW  LAM  CAM  RAM   LM  LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB
## 1  75+2 75+2 77+2 77+2 77+2 76+2 79+2 79+2 79+2 76+2 75+2 77+2 77+2 77+2 75+2
## 2  58+2 56+2 58+2 58+2 58+2 57+2 64+2 64+2 64+2 57+2 68+2 74+2 74+2 74+2 68+2
## 3                                                                            
## 4  72+2 73+2 73+2 73+2 73+2 75+2 74+2 74+2 74+2 75+2 78+2 75+2 75+2 75+2 78+2
## 5  59+2 58+2 62+2 62+2 62+2 60+2 67+2 67+2 67+2 60+2 67+2 73+2 73+2 73+2 67+2
## 6  74+2 71+2 71+2 71+2 71+2 71+2 66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2
## 7  51+2 51+2 54+2 54+2 54+2 54+2 61+2 61+2 61+2 54+2 67+2 72+2 72+2 72+2 67+2
## 8  74+2 74+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2
## 9  75+2 75+2 74+2 74+2 74+2 74+2 67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2
## 10                                                                           
##      LB  LCB   CB  RCB   RB Crossing Finishing HeadingAccuracy ShortPassing
## 1  74+2 72+2 72+2 72+2 74+2       76        55              60           84
## 2  70+2 79+2 79+2 79+2 70+2       49        51              81           73
## 3                                 16        14              17           25
## 4  78+2 73+2 73+2 73+2 78+2       73        61              69           79
## 5  68+2 76+2 76+2 76+2 68+2       60        45              79           73
## 6  48+2 48+2 48+2 48+2 48+2       61        79              86           71
## 7  69+2 75+2 75+2 75+2 69+2       42        33              80           72
## 8  74+2 70+2 70+2 70+2 74+2       67        64              51           82
## 9  50+2 46+2 46+2 46+2 50+2       68        77              71           73
## 10                                14        12              12           30
##    Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration
## 1       73        78    79         78          82          82           75
## 2       37        49    36         40          67          63           46
## 3       13        15    18         17          32          17           58
## 4       57        72    49         46          75          72           84
## 5       51        63    42         48          72          73           33
## 6       74        71    64         60          55          77           66
## 7       40        49    52         43          77          48           57
## 8       57        78    60         61          75          79           78
## 9       73        76    73         69          67          76           78
## 10       8        21    15         22          22          24           31
##    SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength
## 1           69      77        74      77        82      61      79       69
## 2           49      55        76      36        74      64      67       83
## 3           54      36        76      50        24      60      27       70
## 4           90      80        75      76        67      85      93       68
## 5           38      51        70      60        55      79      54       76
## 6           65      50        75      32        78      63      77       93
## 7           59      69        78      61        42      79      72       72
## 8           81      80        73      76        76      60      79       59
## 9           85      79        71      73        77      70      78       74
## 10          32      50        73      68        29      56      22       62
##    LongShots Aggression Interceptions Positioning Vision Penalties Composure
## 1         80         79            72          74     82        57        74
## 2         59         81            82          54     49        79        78
## 3         13         26            20          11     63        15        69
## 4         57         65            71          77     72        41        73
## 5         58         76            79          50     67        64        70
## 6         68         75            30          78     73        77        70
## 7         37         76            78          44     46        47        72
## 8         74         70            74          71     70        63        64
## 9         74         77            18          76     73        72        72
## 10        16         22            22          14     51        21        55
##    Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking
## 1       73             75            72        9         14         6
## 2       82             83            79        7          9        12
## 3       18             20            12       80         73        65
## 4       76             76            80        7         12        10
## 5       83             77            76       12          7        11
## 6       21             15            19       15         12        11
## 7       80             77            78       10         15        13
## 8       71             77            76       15         16        13
## 9       40             18            12       15          9        10
## 10      13             13            14       75         75        74
##    GKPositioning GKReflexes Decision Release.Clause
## 1              9         10        0               
## 2             10         15        1               
## 3             77         85        1               
## 4              8         15        1               
## 5             12         13        1               
## 6             11          8        0               
## 7             15          8        1               
## 8              7          8        0               
## 9             15         16        1               
## 10            78         77        1
# List every column name in the data frame.
colnames(football)
##  [1] "ID"                       "Name"                    
##  [3] "Age"                      "Photo"                   
##  [5] "Nationality"              "Flag"                    
##  [7] "Overall"                  "Potential"               
##  [9] "Club"                     "Club.Logo"               
## [11] "Value"                    "Wage"                    
## [13] "Special"                  "Preferred.Foot"          
## [15] "International.Reputation" "Weak.Foot"               
## [17] "Skill.Moves"              "Work.Rate"               
## [19] "Body.Type"                "Real.Face"               
## [21] "Position"                 "Jersey.Number"           
## [23] "Joined"                   "Loaned.From"             
## [25] "Contract.Valid.Until"     "Height"                  
## [27] "Weight"                   "LS"                      
## [29] "ST"                       "RS"                      
## [31] "LW"                       "LF"                      
## [33] "CF"                       "RF"                      
## [35] "RW"                       "LAM"                     
## [37] "CAM"                      "RAM"                     
## [39] "LM"                       "LCM"                     
## [41] "CM"                       "RCM"                     
## [43] "RM"                       "LWB"                     
## [45] "LDM"                      "CDM"                     
## [47] "RDM"                      "RWB"                     
## [49] "LB"                       "LCB"                     
## [51] "CB"                       "RCB"                     
## [53] "RB"                       "Crossing"                
## [55] "Finishing"                "HeadingAccuracy"         
## [57] "ShortPassing"             "Volleys"                 
## [59] "Dribbling"                "Curve"                   
## [61] "FKAccuracy"               "LongPassing"             
## [63] "BallControl"              "Acceleration"            
## [65] "SprintSpeed"              "Agility"                 
## [67] "Reactions"                "Balance"                 
## [69] "ShotPower"                "Jumping"                 
## [71] "Stamina"                  "Strength"                
## [73] "LongShots"                "Aggression"              
## [75] "Interceptions"            "Positioning"             
## [77] "Vision"                   "Penalties"               
## [79] "Composure"                "Marking"                 
## [81] "StandingTackle"           "SlidingTackle"           
## [83] "GKDiving"                 "GKHandling"              
## [85] "GKKicking"                "GKPositioning"           
## [87] "GKReflexes"               "Decision"                
## [89] "Release.Clause"
# Show each column's type together with its first few values.
str(object = football)
## 'data.frame':    18159 obs. of  89 variables:
##  $ ID                      : int  207439 156713 229909 187347 153260 187607 204341 223058 183389 156092 ...
##  $ Name                    : Factor w/ 17148 levels "A. \201_ivkoviÛ\210",..: 9735 576 839 6706 6450 450 10142 3598 5890 8193 ...
##  $ Age                     : int  24 33 26 29 40 29 30 25 29 41 ...
##  $ Photo                   : Factor w/ 18159 levels "https://cdn.sofifa.org/players/4/19/100803.png",..: 5871 525 11255 2516 450 2539 5302 9314 2003 495 ...
##  $ Nationality             : Factor w/ 164 levels "Afghanistan",..: 7 146 129 129 21 129 124 129 80 120 ...
##  $ Flag                    : Factor w/ 164 levels "https://cdn.sofifa.org/flags/1.png",..: 123 116 111 111 125 111 108 111 11 129 ...
##  $ Overall                 : int  80 80 79 79 78 78 77 77 77 77 ...
##  $ Potential               : int  85 80 81 79 78 78 77 80 77 77 ...
##  $ Club                    : Factor w/ 652 levels ""," SSV Jahn Regensburg",..: 1 1 1 1 396 1 1 1 1 1 ...
##  $ Club.Logo               : Factor w/ 679 levels "https://cdn.sofifa.org/flags/103.png",..: 20 18 16 16 628 16 14 16 2 23 ...
##  $ Value                   : int  5684 6370 5675 6030 6405 5764 6075 5565 5275 5698 ...
##  $ Wage                    : int  1602 3591 3672 1448 19799 1105 2836 2653 2138 2581 ...
##  $ Special                 : int  2122 1797 1217 2038 1807 1810 1749 2041 1933 1168 ...
##  $ Preferred.Foot          : Factor w/ 2 levels "Left","Right": 2 2 2 2 2 2 2 2 1 2 ...
##  $ International.Reputation: int  2 2 1 2 2 2 1 1 2 2 ...
##  $ Weak.Foot               : int  4 4 3 3 3 3 3 3 3 3 ...
##  $ Skill.Moves             : int  4 2 1 3 3 3 2 3 3 1 ...
##  $ Work.Rate               : Factor w/ 9 levels "High/ High","High/ Low",..: 9 3 9 1 9 3 9 7 2 9 ...
##  $ Body.Type               : Factor w/ 10 levels "Akinfenwa","C. Ronaldo",..: 7 7 7 4 7 10 4 4 7 7 ...
##  $ Real.Face               : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 1 ...
##  $ Position                : Factor w/ 28 levels "","CAM","CB",..: 6 10 7 19 3 28 3 24 28 7 ...
##  $ Jersey.Number           : int  5 4 12 2 4 22 4 7 21 1 ...
##  $ Joined                  : Factor w/ 1737 levels "","1-Apr-08",..: 1 1 1 1 17 1 1 1 1 1 ...
##  $ Loaned.From             : Factor w/ 342 levels "","1. FC Kí_ln",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Contract.Valid.Until    : Factor w/ 37 levels "","1-Dec-19",..: 1 1 1 1 13 1 1 1 1 1 ...
##  $ Height                  : Factor w/ 21 levels "5'1","5'10","5'11",..: 3 16 14 2 3 17 14 12 3 3 ...
##  $ Weight                  : Factor w/ 57 levels "110lbs","115lbs",..: 25 34 30 20 28 41 21 24 30 35 ...
##  $ LS                      : Factor w/ 94 levels "","31+2","32+2",..: 62 44 1 60 36 74 26 60 70 1 ...
##  $ ST                      : Factor w/ 94 levels "","31+2","32+2",..: 62 44 1 60 36 74 26 60 70 1 ...
##  $ RS                      : Factor w/ 94 levels "","31+2","32+2",..: 62 44 1 60 36 74 26 60 70 1 ...
##  $ LW                      : Factor w/ 106 levels "","25+2","27+2",..: 80 42 1 76 46 72 32 78 80 1 ...
##  $ LF                      : Factor w/ 103 levels "","27+2","29+2",..: 75 41 1 69 43 73 28 73 75 1 ...
##  $ CF                      : Factor w/ 103 levels "","27+2","29+2",..: 75 41 1 69 43 73 28 73 75 1 ...
##  $ RF                      : Factor w/ 103 levels "","27+2","29+2",..: 75 41 1 69 43 73 28 73 75 1 ...
##  $ RW                      : Factor w/ 106 levels "","25+2","27+2",..: 80 42 1 76 46 72 32 78 80 1 ...
##  $ LAM                     : Factor w/ 102 levels "","27+2","28+2",..: 80 42 1 72 50 68 34 76 74 1 ...
##  $ CAM                     : Factor w/ 102 levels "","27+2","28+2",..: 80 42 1 72 50 68 34 76 74 1 ...
##  $ RAM                     : Factor w/ 102 levels "","27+2","28+2",..: 80 42 1 72 50 68 34 76 74 1 ...
##  $ LM                      : Factor w/ 101 levels "","27+2","28+2",..: 77 39 1 75 45 67 33 75 73 1 ...
##  $ LCM                     : Factor w/ 93 levels "","30+2","31+2",..: 77 46 1 67 52 50 40 69 52 1 ...
##  $ CM                      : Factor w/ 93 levels "","30+2","31+2",..: 77 46 1 67 52 50 40 69 52 1 ...
##  $ RCM                     : Factor w/ 93 levels "","30+2","31+2",..: 77 46 1 67 52 50 40 69 52 1 ...
##  $ RM                      : Factor w/ 101 levels "","27+2","28+2",..: 77 39 1 75 45 67 33 75 73 1 ...
##  $ LWB                     : Factor w/ 96 levels "","30+2","31+2",..: 78 63 1 84 61 30 61 78 32 1 ...
##  $ LDM                     : Factor w/ 100 levels "","28+2","29+2",..: 83 77 1 79 75 31 73 79 31 1 ...
##  $ CDM                     : Factor w/ 100 levels "","28+2","29+2",..: 83 77 1 79 75 31 73 79 31 1 ...
##  $ RDM                     : Factor w/ 100 levels "","28+2","29+2",..: 83 77 1 79 75 31 73 79 31 1 ...
##  $ RWB                     : Factor w/ 96 levels "","30+2","31+2",..: 78 63 1 84 61 30 61 78 32 1 ...
##  $ LB                      : Factor w/ 99 levels "","29+2","30+2",..: 80 72 1 88 68 26 70 80 30 1 ...
##  $ LCB                     : Factor w/ 109 levels "","25+2","27+2",..: 82 96 1 84 90 32 88 78 28 1 ...
##  $ CB                      : Factor w/ 109 levels "","25+2","27+2",..: 82 96 1 84 90 32 88 78 28 1 ...
##  $ RCB                     : Factor w/ 109 levels "","25+2","27+2",..: 82 96 1 84 90 32 88 78 28 1 ...
##  $ RB                      : Factor w/ 99 levels "","29+2","30+2",..: 80 72 1 88 68 26 70 80 30 1 ...
##  $ Crossing                : int  76 49 16 73 60 61 42 67 68 14 ...
##  $ Finishing               : int  55 51 14 61 45 79 33 64 77 12 ...
##  $ HeadingAccuracy         : int  60 81 17 69 79 86 80 51 71 12 ...
##  $ ShortPassing            : int  84 73 25 79 73 71 72 82 73 30 ...
##  $ Volleys                 : int  73 37 13 57 51 74 40 57 73 8 ...
##  $ Dribbling               : int  78 49 15 72 63 71 49 78 76 21 ...
##  $ Curve                   : int  79 36 18 49 42 64 52 60 73 15 ...
##  $ FKAccuracy              : int  78 40 17 46 48 60 43 61 69 22 ...
##  $ LongPassing             : int  82 67 32 75 72 55 77 75 67 22 ...
##  $ BallControl             : int  82 63 17 72 73 77 48 79 76 24 ...
##  $ Acceleration            : int  75 46 58 84 33 66 57 78 78 31 ...
##  $ SprintSpeed             : int  69 49 54 90 38 65 59 81 85 32 ...
##  $ Agility                 : int  77 55 36 80 51 50 69 80 79 50 ...
##  $ Reactions               : int  74 76 76 75 70 75 78 73 71 73 ...
##  $ Balance                 : int  77 36 50 76 60 32 61 76 73 68 ...
##  $ ShotPower               : int  82 74 24 67 55 78 42 76 77 29 ...
##  $ Jumping                 : int  61 64 60 85 79 63 79 60 70 56 ...
##  $ Stamina                 : int  79 67 27 93 54 77 72 79 78 22 ...
##  $ Strength                : int  69 83 70 68 76 93 72 59 74 62 ...
##  $ LongShots               : int  80 59 13 57 58 68 37 74 74 16 ...
##  $ Aggression              : int  79 81 26 65 76 75 76 70 77 22 ...
##  $ Interceptions           : int  72 82 20 71 79 30 78 74 18 22 ...
##  $ Positioning             : int  74 54 11 77 50 78 44 71 76 14 ...
##  $ Vision                  : int  82 49 63 72 67 73 46 70 73 51 ...
##  $ Penalties               : int  57 79 15 41 64 77 47 63 72 21 ...
##  $ Composure               : int  74 78 69 73 70 70 72 64 72 55 ...
##  $ Marking                 : int  73 82 18 76 83 21 80 71 40 13 ...
##  $ StandingTackle          : int  75 83 20 76 77 15 77 77 18 13 ...
##  $ SlidingTackle           : int  72 79 12 80 76 19 78 76 12 14 ...
##  $ GKDiving                : int  9 7 80 7 12 15 10 15 15 75 ...
##  $ GKHandling              : int  14 9 73 12 7 12 15 16 9 75 ...
##  $ GKKicking               : int  6 12 65 10 11 11 13 13 10 74 ...
##  $ GKPositioning           : int  9 10 77 8 12 11 15 7 15 78 ...
##  $ GKReflexes              : int  10 15 85 15 13 8 8 8 16 77 ...
##  $ Decision                : int  0 1 1 1 1 0 1 0 1 1 ...
##  $ Release.Clause          : Factor w/ 1245 levels "","ä‰å1.1M","ä‰å1.2M",..: 1 1 1 1 1 1 1 1 1 1 ...
# Check how the Wage column was read in.
str(football[["Wage"]])
##  int [1:18159] 1602 3591 3672 1448 19799 1105 2836 2653 2138 2581 ...
# Check how the Value column was read in.
str(football[["Value"]])
##  int [1:18159] 5684 6370 5675 6030 6405 5764 6075 5565 5275 5698 ...
# Count the players recorded under each position code.
table(football[["Position"]])
## 
##       CAM   CB  CDM   CF   CM   GK  LAM   LB  LCB  LCM  LDM   LF   LM   LS   LW 
##   12  958 1778  948   74 1394 2025   21 1322  648  395  243   15 1095  207  381 
##  LWB  RAM   RB  RCB  RCM  RDM   RF   RM   RS   RW  RWB   ST 
##   78   21 1291  662  391  248   16 1124  203  370   87 2152
# Number of rows (players) in the full data set.
dim(football)[1L]
## [1] 18159

2 PreProcessing

2.1 Filter for Strikers

Strikers are defined in the dataset as Position = “ST”.

# Keep only the rows whose Position is "ST"; which() drops any rows where the
# comparison is NA, matching what subset() does.
is_striker <- football$Position == "ST"
football_st <- football[which(is_striker), ]

# Preview the first six strikers.
head(football_st, n = 6L)
##        ID            Name Age                                          Photo
## 6  187607       A. Dzyuba  29 https://cdn.sofifa.org/players/4/19/187607.png
## 9  183389          G. Sio  29 https://cdn.sofifa.org/players/4/19/183389.png
## 19 245683       K. Fofana  26 https://cdn.sofifa.org/players/4/19/245683.png
## 46 190461 B. SiguríÁarson  27 https://cdn.sofifa.org/players/4/19/190461.png
## 66 225900    J. Sambenito  26 https://cdn.sofifa.org/players/4/19/225900.png
## 68 246405       B. Angulo  22 https://cdn.sofifa.org/players/4/19/246405.png
##    Nationality                                 Flag Overall Potential Club
## 6       Russia  https://cdn.sofifa.org/flags/40.png      78        78     
## 9  Ivory Coast https://cdn.sofifa.org/flags/108.png      77        77     
## 19 Ivory Coast https://cdn.sofifa.org/flags/108.png      75        75     
## 46     Iceland  https://cdn.sofifa.org/flags/24.png      73        74     
## 66    Paraguay  https://cdn.sofifa.org/flags/58.png      71        74     
## 68     Ecuador  https://cdn.sofifa.org/flags/57.png      71        77     
##                               Club.Logo Value Wage Special Preferred.Foot
## 6   https://cdn.sofifa.org/flags/40.png  5764 1105    1810          Right
## 9  https://cdn.sofifa.org/flags/108.png  5275 2138    1933           Left
## 19 https://cdn.sofifa.org/flags/108.png  5589 3875    1877          Right
## 46  https://cdn.sofifa.org/flags/24.png  5629 3661    1893          Right
## 66  https://cdn.sofifa.org/flags/58.png  6113 2445    1651          Right
## 68  https://cdn.sofifa.org/flags/57.png  5057 2216    1628          Right
##    International.Reputation Weak.Foot Skill.Moves      Work.Rate Body.Type
## 6                         2         3           3   High/ Medium    Stocky
## 9                         2         3           3      High/ Low    Normal
## 19                        1         3           3 Medium/ Medium    Normal
## 46                        1         4           3     High/ High    Normal
## 66                        1         3           2   High/ Medium      Lean
## 68                        1         4           3      High/ Low    Normal
##    Real.Face Position Jersey.Number Joined Loaned.From Contract.Valid.Until
## 6         No       ST            22                                        
## 9         No       ST            21                                        
## 19        No       ST            22                                        
## 46        No       ST             9                                        
## 66        No       ST             9                                        
## 68        No       ST            19                                        
##    Height Weight   LS   ST   RS   LW   LF   CF   RF   RW  LAM  CAM  RAM   LM
## 6     6'5 201lbs 77+2 77+2 77+2 71+2 74+2 74+2 74+2 71+2 71+2 71+2 71+2 71+2
## 9    5'11 176lbs 75+2 75+2 75+2 75+2 75+2 75+2 75+2 75+2 74+2 74+2 74+2 74+2
## 19    6'2 179lbs 73+2 73+2 73+2 71+2 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2
## 46    6'1 190lbs 72+2 72+2 72+2 71+2 71+2 71+2 71+2 71+2 70+2 70+2 70+2 71+2
## 66    6'0 190lbs 70+2 70+2 70+2 64+2 67+2 67+2 67+2 64+2 63+2 63+2 63+2 62+2
## 68    6'0 154lbs 70+2 70+2 70+2 67+2 68+2 68+2 68+2 67+2 63+2 63+2 63+2 65+2
##     LCM   CM  RCM   RM  LWB  LDM  CDM  RDM  RWB   LB  LCB   CB  RCB   RB
## 6  66+2 66+2 66+2 71+2 52+2 52+2 52+2 52+2 52+2 48+2 48+2 48+2 48+2 48+2
## 9  67+2 67+2 67+2 74+2 53+2 52+2 52+2 52+2 53+2 50+2 46+2 46+2 46+2 50+2
## 19 67+2 67+2 67+2 71+2 59+2 57+2 57+2 57+2 59+2 57+2 52+2 52+2 52+2 57+2
## 46 64+2 64+2 64+2 71+2 59+2 55+2 55+2 55+2 59+2 56+2 53+2 53+2 53+2 56+2
## 66 55+2 55+2 55+2 62+2 43+2 41+2 41+2 41+2 43+2 41+2 38+2 38+2 38+2 41+2
## 68 54+2 54+2 54+2 65+2 47+2 39+2 39+2 39+2 47+2 44+2 36+2 36+2 36+2 44+2
##    Crossing Finishing HeadingAccuracy ShortPassing Volleys Dribbling Curve
## 6        61        79              86           71      74        71    64
## 9        68        77              71           73      73        76    73
## 19       66        75              72           74      74        72    63
## 46       66        71              68           68      65        73    63
## 66       40        74              72           57      72        60    64
## 68       50        78              69           56      46        76    58
##    FKAccuracy LongPassing BallControl Acceleration SprintSpeed Agility
## 6          60          55          77           66          65      50
## 9          69          67          76           78          85      79
## 19         59          58          75           59          77      63
## 46         48          44          73           78          79      83
## 66         42          42          63           79          72      61
## 68         58          33          71           82          79      78
##    Reactions Balance ShotPower Jumping Stamina Strength LongShots Aggression
## 6         75      32        78      63      77       93        68         75
## 9         71      73        77      70      78       74        74         77
## 19        72      60        78      69      83       77        73         67
## 46        74      76        68      78      90       85        66         73
## 66        69      64        73      69      67       72        67         49
## 68        73      64        72      69      77       69        54         28
##    Interceptions Positioning Vision Penalties Composure Marking StandingTackle
## 6             30          78     73        77        70      21             15
## 9             18          76     73        72        72      40             18
## 19            40          72     69        74        83      23             37
## 46            42          73     64        69        76      31             39
## 66            14          75     60        67        74      15             16
## 68            16          62     45        82        51      11             18
##    SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes
## 6             19       15         12        11            11          8
## 9             12       15          9        10            15         16
## 19            46        7         11         7            11         14
## 46            24        9         12        10            15         16
## 66            16       15         16        15             7          7
## 68            12       11          8        10             7          6
##    Decision Release.Clause
## 6         0               
## 9         1               
## 19        1               
## 46        0               
## 66        0               
## 68        1
# Number of strikers kept by the filter.
dim(football_st)[1L]
## [1] 2152

2.2 Training-Validation Split

Split the data into training and validation sets.

Set the seed using our favourite number :-)

# Fix the random number generator so the split below is reproducible.
set.seed(seed = 666)

Create the indices for the split. This samples the row indices that divide the data into training and validation sets.

# Row positions of the striker data. seq_len() gives an empty vector for a
# zero-row data frame, whereas 1:nrow() would give c(1, 0).
row_ids <- seq_len(nrow(football_st))
# Draw 60% of the rows, rounded down to a whole number, for training.
train_index <- sample(row_ids, floor(0.6 * length(row_ids)))
# Every row position not drawn for training goes to validation.
valid_index <- setdiff(row_ids, train_index)

Using the indices, create the training and validation sets by selecting the corresponding rows of the data frame.

# Rows listed in valid_index form the validation set ...
valid_df <- football_st[valid_index, ]
# ... and rows listed in train_index form the training set.
train_df <- football_st[train_index, ]

It is a good habit to check the size of each set after splitting.

# Number of rows in the training set.
dim(train_df)[1L]
## [1] 1291
# Number of rows in the validation set.
dim(valid_df)[1L]
## [1] 861

3 Logistic Regression Method 1

# List the columns available as candidate predictors.
colnames(train_df)
##  [1] "ID"                       "Name"                    
##  [3] "Age"                      "Photo"                   
##  [5] "Nationality"              "Flag"                    
##  [7] "Overall"                  "Potential"               
##  [9] "Club"                     "Club.Logo"               
## [11] "Value"                    "Wage"                    
## [13] "Special"                  "Preferred.Foot"          
## [15] "International.Reputation" "Weak.Foot"               
## [17] "Skill.Moves"              "Work.Rate"               
## [19] "Body.Type"                "Real.Face"               
## [21] "Position"                 "Jersey.Number"           
## [23] "Joined"                   "Loaned.From"             
## [25] "Contract.Valid.Until"     "Height"                  
## [27] "Weight"                   "LS"                      
## [29] "ST"                       "RS"                      
## [31] "LW"                       "LF"                      
## [33] "CF"                       "RF"                      
## [35] "RW"                       "LAM"                     
## [37] "CAM"                      "RAM"                     
## [39] "LM"                       "LCM"                     
## [41] "CM"                       "RCM"                     
## [43] "RM"                       "LWB"                     
## [45] "LDM"                      "CDM"                     
## [47] "RDM"                      "RWB"                     
## [49] "LB"                       "LCB"                     
## [51] "CB"                       "RCB"                     
## [53] "RB"                       "Crossing"                
## [55] "Finishing"                "HeadingAccuracy"         
## [57] "ShortPassing"             "Volleys"                 
## [59] "Dribbling"                "Curve"                   
## [61] "FKAccuracy"               "LongPassing"             
## [63] "BallControl"              "Acceleration"            
## [65] "SprintSpeed"              "Agility"                 
## [67] "Reactions"                "Balance"                 
## [69] "ShotPower"                "Jumping"                 
## [71] "Stamina"                  "Strength"                
## [73] "LongShots"                "Aggression"              
## [75] "Interceptions"            "Positioning"             
## [77] "Vision"                   "Penalties"               
## [79] "Composure"                "Marking"                 
## [81] "StandingTackle"           "SlidingTackle"           
## [83] "GKDiving"                 "GKHandling"              
## [85] "GKKicking"                "GKPositioning"           
## [87] "GKReflexes"               "Decision"                
## [89] "Release.Clause"
# Convert the outcome column to a factor in both data partitions.
train_df[["Decision"]] <- as.factor(train_df[["Decision"]])
valid_df[["Decision"]] <- as.factor(valid_df[["Decision"]])

# Method 1: fit the logistic regression directly with glm() on the
# training data, using Age plus the selected skill ratings as predictors.
logistic_reg_1 <- glm(
  Decision ~ Age +
    Crossing + Finishing + HeadingAccuracy + ShortPassing + Volleys +
    Dribbling + Curve + BallControl +
    Acceleration + SprintSpeed + Agility + Reactions + Balance +
    ShotPower + Jumping + Strength +
    Aggression + Positioning + Composure,
  family = "binomial",
  data = train_df
)
summary(logistic_reg_1)
## 
## Call:
## glm(formula = Decision ~ Age + Crossing + Finishing + HeadingAccuracy + 
##     ShortPassing + Volleys + Dribbling + Curve + BallControl + 
##     Acceleration + SprintSpeed + Agility + Reactions + Balance + 
##     ShotPower + Jumping + Strength + Aggression + Positioning + 
##     Composure, family = "binomial", data = train_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5494  -1.1679   0.8109   1.1559   1.6141  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)  
## (Intercept)      0.0700043  0.8775220   0.080   0.9364  
## Age             -0.0365861  0.0172659  -2.119   0.0341 *
## Crossing         0.0010418  0.0071265   0.146   0.8838  
## Finishing       -0.0153921  0.0174052  -0.884   0.3765  
## HeadingAccuracy -0.0083713  0.0109290  -0.766   0.4437  
## ShortPassing     0.0217390  0.0119988   1.812   0.0700 .
## Volleys         -0.0086527  0.0100163  -0.864   0.3877  
## Dribbling        0.0065157  0.0154621   0.421   0.6735  
## Curve           -0.0002802  0.0070235  -0.040   0.9682  
## BallControl      0.0032000  0.0178404   0.179   0.8576  
## Acceleration    -0.0125772  0.0122329  -1.028   0.3039  
## SprintSpeed     -0.0006122  0.0116696  -0.052   0.9582  
## Agility          0.0052299  0.0091886   0.569   0.5692  
## Reactions        0.0021523  0.0128892   0.167   0.8674  
## Balance          0.0018983  0.0079659   0.238   0.8116  
## ShotPower        0.0143233  0.0127724   1.121   0.2621  
## Jumping          0.0029168  0.0066383   0.439   0.6604  
## Strength        -0.0067525  0.0076664  -0.881   0.3784  
## Aggression       0.0015171  0.0048487   0.313   0.7544  
## Positioning      0.0111225  0.0144942   0.767   0.4429  
## Composure       -0.0031572  0.0108253  -0.292   0.7706  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1789.7  on 1290  degrees of freedom
## Residual deviance: 1767.8  on 1270  degrees of freedom
## AIC: 1809.8
## 
## Number of Fisher Scoring iterations: 4
# Predicted probabilities (response scale) for the validation rows.
logistic_reg_1_pred <- predict(
  logistic_reg_1,
  type = "response",
  newdata = valid_df
)

# Preview the first six predicted probabilities.
head(logistic_reg_1_pred, n = 6)
##         9        19        68       101       102       141 
## 0.5466647 0.5837867 0.5022499 0.4798854 0.5442764 0.4988865
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Classify as 1 when the predicted probability exceeds 0.5, otherwise 0,
# and compare against the observed Decision with "1" as the positive class.
confusionMatrix(
  data = as.factor(as.integer(logistic_reg_1_pred > 0.5)),
  reference = valid_df$Decision,
  positive = "1"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 220 210
##          1 230 201
##                                           
##                Accuracy : 0.489           
##                  95% CI : (0.4551, 0.5229)
##     No Information Rate : 0.5226          
##     P-Value [Acc > NIR] : 0.9779          
##                                           
##                   Kappa : -0.022          
##                                           
##  Mcnemar's Test P-Value : 0.3650          
##                                           
##             Sensitivity : 0.4891          
##             Specificity : 0.4889          
##          Pos Pred Value : 0.4664          
##          Neg Pred Value : 0.5116          
##              Prevalence : 0.4774          
##          Detection Rate : 0.2334          
##    Detection Prevalence : 0.5006          
##       Balanced Accuracy : 0.4890          
##                                           
##        'Positive' Class : 1               
## 

4. Logistic Regression Method 2

library(caret)


# Method 2: fit the same logistic regression through caret's train()
# interface (method "glm" with a binomial family).
logistic_reg_2 <- train(
  Decision ~ Age +
    Crossing + Finishing + HeadingAccuracy + ShortPassing + Volleys +
    Dribbling + Curve + BallControl +
    Acceleration + SprintSpeed + Agility + Reactions + Balance +
    ShotPower + Jumping + Strength +
    Aggression + Positioning + Composure,
  method = "glm",
  family = "binomial",
  data = train_df
)

# Coefficient table of the model fitted by caret.
summary(logistic_reg_2)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5494  -1.1679   0.8109   1.1559   1.6141  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)  
## (Intercept)      0.0700043  0.8775220   0.080   0.9364  
## Age             -0.0365861  0.0172659  -2.119   0.0341 *
## Crossing         0.0010418  0.0071265   0.146   0.8838  
## Finishing       -0.0153921  0.0174052  -0.884   0.3765  
## HeadingAccuracy -0.0083713  0.0109290  -0.766   0.4437  
## ShortPassing     0.0217390  0.0119988   1.812   0.0700 .
## Volleys         -0.0086527  0.0100163  -0.864   0.3877  
## Dribbling        0.0065157  0.0154621   0.421   0.6735  
## Curve           -0.0002802  0.0070235  -0.040   0.9682  
## BallControl      0.0032000  0.0178404   0.179   0.8576  
## Acceleration    -0.0125772  0.0122329  -1.028   0.3039  
## SprintSpeed     -0.0006122  0.0116696  -0.052   0.9582  
## Agility          0.0052299  0.0091886   0.569   0.5692  
## Reactions        0.0021523  0.0128892   0.167   0.8674  
## Balance          0.0018983  0.0079659   0.238   0.8116  
## ShotPower        0.0143233  0.0127724   1.121   0.2621  
## Jumping          0.0029168  0.0066383   0.439   0.6604  
## Strength        -0.0067525  0.0076664  -0.881   0.3784  
## Aggression       0.0015171  0.0048487   0.313   0.7544  
## Positioning      0.0111225  0.0144942   0.767   0.4429  
## Composure       -0.0031572  0.0108253  -0.292   0.7706  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1789.7  on 1290  degrees of freedom
## Residual deviance: 1767.8  on 1270  degrees of freedom
## AIC: 1809.8
## 
## Number of Fisher Scoring iterations: 4

Predictions.

# Predicted class labels for the validation rows (type = "raw").
logistic_reg_2_pred <- predict(
  logistic_reg_2,
  type = "raw",
  newdata = valid_df
)

# Preview the first six predicted classes.
head(logistic_reg_2_pred, n = 6)
## [1] 1 1 1 0 1 0
## Levels: 0 1

Probabilities.

# Per-class predicted probabilities for the validation rows; the result
# has one column per class level ("0" and "1").
logistic_reg_2_prob <- predict(
  logistic_reg_2,
  type = "prob",
  newdata = valid_df
)

# Preview the first six rows of class probabilities.
head(logistic_reg_2_prob, n = 6)
##             0         1
## 9   0.4533353 0.5466647
## 19  0.4162133 0.5837867
## 68  0.4977501 0.5022499
## 101 0.5201146 0.4798854
## 102 0.4557236 0.5442764
## 141 0.5011135 0.4988865

Confusion matrix.

# Compare the predicted classes from method 2 with the observed Decision.
# positive = "1" is set explicitly so that the class-specific statistics
# (sensitivity, specificity, predictive values, prevalence) describe class
# "1", matching the method 1 confusion matrix; without it the first factor
# level ("0") is treated as the positive class.
confusionMatrix(as.factor(logistic_reg_2_pred), 
                valid_df$Decision, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 220 210
##          1 230 201
##                                           
##                Accuracy : 0.489           
##                  95% CI : (0.4551, 0.5229)
##     No Information Rate : 0.5226          
##     P-Value [Acc > NIR] : 0.9779          
##                                           
##                   Kappa : -0.022          
##                                           
##  Mcnemar's Test P-Value : 0.3650          
##                                           
##             Sensitivity : 0.4891          
##             Specificity : 0.4889          
##          Pos Pred Value : 0.4664          
##          Neg Pred Value : 0.5116          
##              Prevalence : 0.4774          
##          Detection Rate : 0.2334          
##    Detection Prevalence : 0.5006          
##       Balanced Accuracy : 0.4890          
##                                           
##        'Positive' Class : 1               
## 

5. Gains and Lift Charts

This approach only works using method 2.

library(modelplotr)
## Package modelplotr loaded! Happy model plotting!
# Score the validation data with the caret model and assign each row to
# one of 100 ntiles. The dataset and the model are passed by name (as
# strings), together with the labels used in the plots.
scores_and_ntiles <- prepare_scores_and_ntiles(
  datasets = list("valid_df"),
  dataset_labels = list("Validation Data"),
  models = list("logistic_reg_2"),
  model_labels = list("Logistic regression"),
  target_column = "Decision",
  ntiles = 100
)
## ... scoring caret model "logistic_reg_2" on dataset "valid_df".
## Data preparation step 1 succeeded! Dataframe created.
# Preview the scored rows: true class, class probabilities and ntile ranks.
head(scores_and_ntiles)
##             model_label   dataset_label y_true    prob_0    prob_1 ntl_0 ntl_1
## 9   Logistic regression Validation Data      1 0.4533353 0.5466647    78    23
## 19  Logistic regression Validation Data      1 0.4162133 0.5837867    91    10
## 68  Logistic regression Validation Data      1 0.4977501 0.5022499    51    50
## 101 Logistic regression Validation Data      0 0.5201146 0.4798854    38    63
## 102 Logistic regression Validation Data      0 0.4557236 0.5442764    77    24
## 141 Logistic regression Validation Data      1 0.5011135 0.4988865    50    51
# Aggregate the scored rows per ntile for plotting. No scope is specified,
# so the default "no_comparison" scope is used (a single evaluation line).
plot_input <- plotting_scope(prepared_input = scores_and_ntiles)
## Data preparation step 2 succeeded! Dataframe created.
## "prepared_input" aggregated...
## Data preparation step 3 succeeded! Dataframe created.
## 
## No comparison specified, default values are used. 
## 
## Single evaluation line will be plotted: Target value "1" plotted for dataset "Validation Data" and model "Logistic regression.
## "
## -> To compare models, specify: scope = "compare_models"
## -> To compare datasets, specify: scope = "compare_datasets"
## -> To compare target classes, specify: scope = "compare_targetclasses"
## -> To plot one line, do not specify scope or specify scope = "no_comparison".
# Preview the per-ntile aggregates (counts, gains and lift columns).
head(plot_input)
##           scope         model_label   dataset_label target_class ntile neg pos
## 1 no_comparison Logistic regression Validation Data            1     0   0   0
## 2 no_comparison Logistic regression Validation Data            1     1   2   7
## 3 no_comparison Logistic regression Validation Data            1     2   2   7
## 4 no_comparison Logistic regression Validation Data            1     3   6   2
## 5 no_comparison Logistic regression Validation Data            1     4   5   4
## 6 no_comparison Logistic regression Validation Data            1     5   5   3
##   tot       pct negtot postot tottot    pcttot cumneg cumpos cumtot    cumpct
## 1   0        NA     NA     NA     NA        NA      0      0      0        NA
## 2   9 0.7777778    450    411    861 0.4773519      2      7      9 0.7777778
## 3   9 0.7777778    450    411    861 0.4773519      4     14     18 0.7777778
## 4   8 0.2500000    450    411    861 0.4773519     10     16     26 0.6153846
## 5   9 0.4444444    450    411    861 0.4773519     15     20     35 0.5714286
## 6   8 0.3750000    450    411    861 0.4773519     20     23     43 0.5348837
##         gain    cumgain gain_ref   gain_opt      lift  cumlift cumlift_ref
## 1 0.00000000 0.00000000     0.00 0.00000000        NA       NA           1
## 2 0.01703163 0.01703163     0.01 0.02189781 1.6293593 1.629359           1
## 3 0.01703163 0.03406326     0.02 0.04379562 1.6293593 1.629359           1
## 4 0.00486618 0.03892944     0.03 0.06326034 0.5237226 1.289163           1
## 5 0.00973236 0.04866180     0.04 0.08515815 0.9310624 1.197080           1
## 6 0.00729927 0.05596107     0.05 0.10462287 0.7855839 1.120523           1
##   legend
## 1      1
## 2      1
## 3      1
## 4      1
## 5      1
## 6      1

Cumulative gains for logistic regression.

# Draw the cumulative gains chart from the aggregated plot_input.
plot_cumgains(data = plot_input)

Cumulative lift for logistic regression.

# Draw the cumulative lift chart from the aggregated plot_input.
plot_cumlift(data = plot_input)

Response plot for logistic regression.

# Draw the per-ntile response chart from the aggregated plot_input.
plot_response(data = plot_input)

Cumulative response plot for logistic regression.

# Draw the cumulative response chart from the aggregated plot_input.
plot_cumresponse(data = plot_input)

Multiplot for logistic regression.

# Draw the combined multi-panel chart from the aggregated plot_input.
plot_multiplot(data = plot_input)

6. Pseudo R^2

This approach only works using method 1.

Check the McFadden statistic.

library(pscl)
## Warning: package 'pscl' was built under R version 3.6.3
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
# Re-print the glm() fit from method 1; the null and residual deviances
# shown here give context for the pseudo R^2 values computed next.
summary(logistic_reg_1)
## 
## Call:
## glm(formula = Decision ~ Age + Crossing + Finishing + HeadingAccuracy + 
##     ShortPassing + Volleys + Dribbling + Curve + BallControl + 
##     Acceleration + SprintSpeed + Agility + Reactions + Balance + 
##     ShotPower + Jumping + Strength + Aggression + Positioning + 
##     Composure, family = "binomial", data = train_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5494  -1.1679   0.8109   1.1559   1.6141  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)  
## (Intercept)      0.0700043  0.8775220   0.080   0.9364  
## Age             -0.0365861  0.0172659  -2.119   0.0341 *
## Crossing         0.0010418  0.0071265   0.146   0.8838  
## Finishing       -0.0153921  0.0174052  -0.884   0.3765  
## HeadingAccuracy -0.0083713  0.0109290  -0.766   0.4437  
## ShortPassing     0.0217390  0.0119988   1.812   0.0700 .
## Volleys         -0.0086527  0.0100163  -0.864   0.3877  
## Dribbling        0.0065157  0.0154621   0.421   0.6735  
## Curve           -0.0002802  0.0070235  -0.040   0.9682  
## BallControl      0.0032000  0.0178404   0.179   0.8576  
## Acceleration    -0.0125772  0.0122329  -1.028   0.3039  
## SprintSpeed     -0.0006122  0.0116696  -0.052   0.9582  
## Agility          0.0052299  0.0091886   0.569   0.5692  
## Reactions        0.0021523  0.0128892   0.167   0.8674  
## Balance          0.0018983  0.0079659   0.238   0.8116  
## ShotPower        0.0143233  0.0127724   1.121   0.2621  
## Jumping          0.0029168  0.0066383   0.439   0.6604  
## Strength        -0.0067525  0.0076664  -0.881   0.3784  
## Aggression       0.0015171  0.0048487   0.313   0.7544  
## Positioning      0.0111225  0.0144942   0.767   0.4429  
## Composure       -0.0031572  0.0108253  -0.292   0.7706  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1789.7  on 1290  degrees of freedom
## Residual deviance: 1767.8  on 1270  degrees of freedom
## AIC: 1809.8
## 
## Number of Fisher Scoring iterations: 4
# Compute pseudo R^2 measures for the glm() fit. The returned vector holds
# the model and null log-likelihoods (llh, llhNull), G2, and the McFadden,
# r2ML and r2CU statistics.
pR2(logistic_reg_1)
## fitting null model for pseudo-r2
##           llh       llhNull            G2      McFadden          r2ML 
## -883.90975662 -894.85262281   21.88573238    0.01222868    0.01680966 
##          r2CU 
##    0.02241288

7. Variable Importance

# Variable importance for the glm() fit. NOTE(review): the "Overall" values
# appear to equal the absolute z statistics from summary(logistic_reg_1)
# (e.g. Age: 2.119) -- confirm against the caret documentation.
varImp(logistic_reg_1)
##                    Overall
## Age             2.11898044
## Crossing        0.14619034
## Finishing       0.88433641
## HeadingAccuracy 0.76597453
## ShortPassing    1.81175625
## Volleys         0.86385724
## Dribbling       0.42139682
## Curve           0.03989895
## BallControl     0.17937045
## Acceleration    1.02814440
## SprintSpeed     0.05246461
## Agility         0.56916872
## Reactions       0.16698124
## Balance         0.23829986
## ShotPower       1.12142255
## Jumping         0.43938627
## Strength        0.88078835
## Aggression      0.31289032
## Positioning     0.76737174
## Composure       0.29164995

8. ROC

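The ROC curve shows the trade-off between the rate of correct positive predictions (true positive rate) and the rate of incorrect positive predictions (false positive rate) across cutoffs.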

The AUC metric ranges from 0 to 1.0; a value of 0.5 corresponds to random guessing and 1.0 to a perfect classifier.

Values >= 0.8 are good.

library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# Predicted probabilities from the glm() fit on the validation data.
prob <- predict(logistic_reg_1, type = "response", newdata = valid_df)

# Pair the scores with the observed labels, then trace the ROC curve
# (true positive rate on the y axis against false positive rate on the x axis).
pred <- prediction(predictions = prob, labels = valid_df$Decision)
perf <- performance(pred, x.measure = "fpr", measure = "tpr")
plot(perf)

# Area under the ROC curve, extracted as a single number.
auc <- slot(performance(pred, measure = "auc"), "y.values")[[1]]
auc
## [1] 0.5073425

8.1 Optimal Cutoff

Generally, the point on the ROC curve that is closest to TPR = 1 and FPR = 0 gives the optimal cut off. This point applies equal weight to both sensitivity and specificity.

# Find the cutoff whose ROC point lies closest to the ideal corner
# (x = 0, y = 1).
#
# Arguments:
#   perf  object with list slots `x.values` and `y.values`; the caller in
#         this file passes a performance object built with
#         measure = "tpr", x.measure = "fpr".
#   pred  object with a list slot `cutoffs`, aligned element-wise with the
#         values in `perf`.
#
# Returns a numeric matrix with rows "sensitivity", "specificity" and
# "cutoff", and one column per element of the slot lists.
opt.cut <- function(perf, pred) {
  mapply(
    FUN = function(x, y, p) {
      # Squared Euclidean distance of every ROC point to (0, 1).
      d <- (x - 0)^2 + (y - 1)^2
      # which.min() always yields a single index (the first minimum), so
      # tied distances no longer break the `[[` extraction below.
      ind <- which.min(d)
      c(sensitivity = y[[ind]], specificity = 1 - x[[ind]],
        cutoff = p[[ind]])
    },
    perf@x.values, perf@y.values, pred@cutoffs
  )
}
# Print the sensitivity, specificity and cutoff of the ROC point chosen by
# opt.cut() for the ROC objects built above.
print(opt.cut(perf, pred))
##                  [,1]
## sensitivity 0.4671533
## specificity 0.5244444
## cutoff      0.5054612

Use the optimal cut off to compute the confusion matrix.

# Classify with the optimal cutoff instead of 0.5. The constant 0.5054612
# is the cutoff printed by opt.cut(); column 2 of logistic_reg_2_prob holds
# the probability of class "1".
# positive = "1" is set explicitly so that Sensitivity and Specificity
# describe class "1" and therefore line up with the sensitivity and
# specificity reported by opt.cut(); without it the first factor level
# ("0") is treated as the positive class and the two values appear swapped.
confusionMatrix(as.factor(ifelse(logistic_reg_2_prob[,2] > 0.5054612, "1", "0")), 
                valid_df$Decision, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 236 219
##          1 214 192
##                                          
##                Accuracy : 0.4971         
##                  95% CI : (0.4632, 0.531)
##     No Information Rate : 0.5226         
##     P-Value [Acc > NIR] : 0.9376         
##                                          
##                   Kappa : -0.0084        
##                                          
##  Mcnemar's Test P-Value : 0.8476         
##                                          
##             Sensitivity : 0.4672         
##             Specificity : 0.5244         
##          Pos Pred Value : 0.4729         
##          Neg Pred Value : 0.5187         
##              Prevalence : 0.4774         
##          Detection Rate : 0.2230         
##    Detection Prevalence : 0.4715         
##       Balanced Accuracy : 0.4958         
##                                          
##        'Positive' Class : 1              
## 

The optimal cutoff can also be computed based on maximum accuracy. For an accuracy performance object, x.values holds the cutoffs and y.values holds the corresponding accuracies. The code below takes the index of the maximum accuracy and looks up the corresponding cutoff.

Note: This is just a hypothetical example.

# Build an accuracy performance object: with measure = "acc", x.values
# holds the cutoffs and y.values the accuracy at each cutoff. The `perf`
# object created earlier holds FPR/TPR instead, so reading it here would
# report the maximum TPR as "accuracy" and an FPR value as "cutoff".
acc_perf <- performance(pred, measure = "acc")

# Index of the highest accuracy (first one on ties), then look up the
# accuracy and the cutoff at that index.
ind <- which.max(slot(acc_perf, "y.values")[[1]])
acc <- slot(acc_perf, "y.values")[[1]][ind]
cutoff <- slot(acc_perf, "x.values")[[1]][ind]

# NOTE(review): the output printed below this chunk (accuracy 1.0, cutoff
# 0.9977778) was produced from the TPR/FPR object; re-knit to refresh it.
print(c(accuracy = acc, cutoff = cutoff))
##  accuracy    cutoff 
## 1.0000000 0.9977778

9. k-fold Cross Validation

Logistic regression using k-fold cross validation.

# Resampling setup: "repeatedcv" with 10 folds, keeping the hold-out
# predictions from resampling.
ctrl <- trainControl(
  method = "repeatedcv",
  number = 10,
  savePredictions = TRUE
)

# Same logistic regression as before, now fitted through caret with the
# cross-validation control defined above.
logistic_reg_3 <- train(
  Decision ~ Age +
    Crossing + Finishing + HeadingAccuracy + ShortPassing + Volleys +
    Dribbling + Curve + BallControl +
    Acceleration + SprintSpeed + Agility + Reactions + Balance +
    ShotPower + Jumping + Strength +
    Aggression + Positioning + Composure,
  method = "glm",
  family = "binomial",
  data = train_df,
  tuneLength = 5,
  trControl = ctrl
)

# Predicted classes for the validation data.
logistic_reg_3_pred <- predict(logistic_reg_3, newdata = valid_df)
# Compare the cross-validated model's predicted classes with the observed
# Decision. positive = "1" is set explicitly so the class-specific
# statistics describe class "1", consistent with the method 1 and bootstrap
# confusion matrices; without it the first factor level ("0") is used.
confusionMatrix(logistic_reg_3_pred, valid_df$Decision, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 220 210
##          1 230 201
##                                           
##                Accuracy : 0.489           
##                  95% CI : (0.4551, 0.5229)
##     No Information Rate : 0.5226          
##     P-Value [Acc > NIR] : 0.9779          
##                                           
##                   Kappa : -0.022          
##                                           
##  Mcnemar's Test P-Value : 0.3650          
##                                           
##             Sensitivity : 0.4891          
##             Specificity : 0.4889          
##          Pos Pred Value : 0.4664          
##          Neg Pred Value : 0.5116          
##              Prevalence : 0.4774          
##          Detection Rate : 0.2334          
##    Detection Prevalence : 0.5006          
##       Balanced Accuracy : 0.4890          
##                                           
##        'Positive' Class : 1               
## 

10. Bootstrap

Logistic regression using bootstrapping.

# Resampling setup: "boot632" bootstrap with 1000 resamples, keeping the
# hold-out predictions. This overwrites the earlier `ctrl` object.
ctrl <- trainControl(
  method = "boot632",
  number = 1000,
  savePredictions = TRUE
)

# Same logistic regression, fitted through caret with the bootstrap
# control defined above.
logistic_reg_4 <- train(
  Decision ~ Age +
    Crossing + Finishing + HeadingAccuracy + ShortPassing + Volleys +
    Dribbling + Curve + BallControl +
    Acceleration + SprintSpeed + Agility + Reactions + Balance +
    ShotPower + Jumping + Strength +
    Aggression + Positioning + Composure,
  method = "glm",
  family = "binomial",
  data = train_df,
  tuneLength = 5,
  trControl = ctrl
)

# Predicted classes for the validation data.
logistic_reg_4_pred <- predict(logistic_reg_4, newdata = valid_df)

# Confusion matrix against the observed Decision, with "1" as the
# positive class.
confusionMatrix(
  data = logistic_reg_4_pred,
  reference = valid_df$Decision,
  positive = "1"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 220 210
##          1 230 201
##                                           
##                Accuracy : 0.489           
##                  95% CI : (0.4551, 0.5229)
##     No Information Rate : 0.5226          
##     P-Value [Acc > NIR] : 0.9779          
##                                           
##                   Kappa : -0.022          
##                                           
##  Mcnemar's Test P-Value : 0.3650          
##                                           
##             Sensitivity : 0.4891          
##             Specificity : 0.4889          
##          Pos Pred Value : 0.4664          
##          Neg Pred Value : 0.5116          
##              Prevalence : 0.4774          
##          Detection Rate : 0.2334          
##    Detection Prevalence : 0.5006          
##       Balanced Accuracy : 0.4890          
##                                           
##        'Positive' Class : 1               
##