Directions

Control the force… 18 times

Data for demo

Back to the spellbook

1. Load data

1.1 Import

Import a CSV file. The source file is semicolon-delimited, so the separator is set explicitly.

# Read the semicolon-delimited file; its first line holds the column names.
# stringsAsFactors = FALSE keeps text columns as character on R versions
# before 4.0.0 too, so the column types do not depend on the R version.
fuel <- read.csv(
  "us-vehicle-fuel-economy-data-1984-2017.csv",
  header = TRUE,
  sep = ";",
  stringsAsFactors = FALSE
)
# Preview the first six rows
head(fuel)
##    make                 model year barrels08 barrelsA08 charge120 charge240
## 1 Volvo 240 DL/GL/Turbo Wagon 1984  17.34789          0         0         0
## 2  Ford          Explorer 4WD 1998  21.97400          0         0         0
## 3   BMW                  540i 2018  13.73375          0         0         0
## 4  Jeep          Wrangler 4WD 1998  23.54357          0         0         0
## 5  MINI   Cooper Clubman All4 2018  12.67731          0         0         0
## 6   BMW                 740il 1999  18.31167          0         0         0
##   city08 city08U cityA08 cityA08U cityCD cityE cityUF co2 co2A co2TailpipeAGpm
## 1     18       0       0        0      0     0      0  -1   -1               0
## 2     14       0       0        0      0     0      0  -1   -1               0
## 3     21      20       0        0      0     0      0 367   -1               0
## 4     13       0       0        0      0     0      0  -1   -1               0
## 5     23      23       0        0      0     0      0 341   -1               0
## 6     15       0       0        0      0     0      0  -1   -1               0
##   co2TailpipeGpm comb08 comb08U combA08 combA08U combE combinedCD combinedUF
## 1       467.7368     19       0       0        0     0          0          0
## 2       592.4667     15       0       0        0     0          0          0
## 3       367.0000     24      24       0        0     0          0          0
## 4       634.7857     14       0       0        0     0          0          0
## 5       341.0000     26      26       0        0     0          0          0
## 6       493.7222     18       0       0        0     0          0          0
##   cylinders displ                      drive engId eng_dscr feScore fuelCost08
## 1         4   2.3                            60060 CA model      -1       2100
## 2         6   4.0 4-Wheel or All-Wheel Drive     0     SOHC      -1       2650
## 3         6   3.0           Rear-Wheel Drive   540     SIDI       5       2100
## 4         6   4.0 4-Wheel or All-Wheel Drive     0               -1       2850
## 5         3   1.5            All-Wheel Drive    40     SIDI       5       1900
## 6         8   4.4           Rear-Wheel Drive     0               -1       2200
##   fuelCostA08 fuelType        fuelType1 ghgScore ghgScoreA highway08 highway08U
## 1           0  Regular Regular Gasoline       -1        -1        22          0
## 2           0  Regular Regular Gasoline       -1        -1        18          0
## 3           0  Premium Premium Gasoline        5        -1        30         30
## 4           0  Regular Regular Gasoline       -1        -1        16          0
## 5           0  Premium Premium Gasoline        5        -1        31         30
## 6           0  Regular Regular Gasoline       -1        -1        22          0
##   highwayA08 highwayA08U                      VClass highwayCD highwayE
## 1          0           0      Midsize Station Wagons         0        0
## 2          0           0 Special Purpose Vehicle 4WD         0        0
## 3          0           0                Midsize Cars         0        0
## 4          0           0 Special Purpose Vehicle 4WD         0        0
## 5          0           0                Midsize Cars         0        0
## 6          0           0                  Large Cars         0        0
##   highwayUF hlv hpv    id lv2 lv4 mpgData phevBlended pv2 pv4 range rangeCity
## 1         0   0   0 28216   0  41       N       FALSE   0  89     0         0
## 2         0   0   0 14722   0   0       Y       FALSE   0   0     0         0
## 3         0   0   0 39230   0  14       N       FALSE   0  99     0         0
## 4         0   0   0 14749   0   0       Y       FALSE   0   0     0         0
## 5         0  18  92 39235   0   0       N       FALSE   0   0     0         0
## 6         0   0   0 15174   0  13       N       FALSE   0 107     0         0
##   rangeCityA rangeHwy rangeHwyA           trany   UCity UCityA UHighway
## 1          0        0         0 Automatic 4-spd 22.0000      0  31.0000
## 2          0        0         0 Automatic 5-spd 16.8405      0  24.5965
## 3          0        0         0  Automatic (S8) 26.3067      0  43.0230
## 4          0        0         0 Automatic 3-spd 16.2486      0  22.2888
## 5          0        0         0  Automatic (S8) 29.5784      0  44.1889
## 6          0        0         0 Automatic 5-spd 18.9000      0  30.0000
##   UHighwayA youSaveSpend guzzler trans_dscr tCharger sCharger atvType fuelType2
## 1         0        -3000                          NA                           
## 2         0        -5750              CLKUP       NA                           
## 3         0        -3000                        TRUE                           
## 4         0        -6750                          NA                           
## 5         0        -2000                        TRUE                           
## 6         0        -3500                          NA                           
##   rangeA evMotor mfrCode c240Dscr charge240b c240bDscr  createdOn modifiedOn
## 1                                          0           2013-01-01 2013-01-01
## 2                                          0           2013-01-01 2013-01-01
## 3                    BMX                   0           2017-08-10 2018-02-26
## 4                                          0           2013-01-01 2013-01-01
## 5                    BMX                   0           2017-08-10 2018-04-04
## 6                                          0           2013-01-01 2013-01-01
##   startStop phevCity phevHwy phevComb
## 1                  0       0        0
## 2                  0       0        0
## 3         Y        0       0        0
## 4                  0       0        0
## 5         N        0       0        0
## 6                  0       0        0
# Check the class of the imported object
class(fuel)
## [1] "data.frame"

1.2 Export

Export the data frame to a CSV file.

# row.names = FALSE: by default write.csv() also writes the row names as an
# unnamed first column, which comes back as an extra "X" column when the
# file is read in again.
write.csv(fuel, "fuel.csv", row.names = FALSE)

2. Dimensions

# Number of rows in the data frame
nrow(fuel)
## [1] 41184
# Number of columns in the data frame
ncol(fuel)
## [1] 83

3. Column names

# List every column name in the data frame
names(fuel)
##  [1] "make"            "model"           "year"            "barrels08"      
##  [5] "barrelsA08"      "charge120"       "charge240"       "city08"         
##  [9] "city08U"         "cityA08"         "cityA08U"        "cityCD"         
## [13] "cityE"           "cityUF"          "co2"             "co2A"           
## [17] "co2TailpipeAGpm" "co2TailpipeGpm"  "comb08"          "comb08U"        
## [21] "combA08"         "combA08U"        "combE"           "combinedCD"     
## [25] "combinedUF"      "cylinders"       "displ"           "drive"          
## [29] "engId"           "eng_dscr"        "feScore"         "fuelCost08"     
## [33] "fuelCostA08"     "fuelType"        "fuelType1"       "ghgScore"       
## [37] "ghgScoreA"       "highway08"       "highway08U"      "highwayA08"     
## [41] "highwayA08U"     "VClass"          "highwayCD"       "highwayE"       
## [45] "highwayUF"       "hlv"             "hpv"             "id"             
## [49] "lv2"             "lv4"             "mpgData"         "phevBlended"    
## [53] "pv2"             "pv4"             "range"           "rangeCity"      
## [57] "rangeCityA"      "rangeHwy"        "rangeHwyA"       "trany"          
## [61] "UCity"           "UCityA"          "UHighway"        "UHighwayA"      
## [65] "youSaveSpend"    "guzzler"         "trans_dscr"      "tCharger"       
## [69] "sCharger"        "atvType"         "fuelType2"       "rangeA"         
## [73] "evMotor"         "mfrCode"         "c240Dscr"        "charge240b"     
## [77] "c240bDscr"       "createdOn"       "modifiedOn"      "startStop"      
## [81] "phevCity"        "phevHwy"         "phevComb"

3.1 Rename column

# Rename by matching the current column name instead of relying on its
# position, so the right column is renamed even if the column order changes.
names(fuel)[names(fuel) == "make"] <- "make_of_car"
names(fuel)
##  [1] "make_of_car"     "model"           "year"            "barrels08"      
##  [5] "barrelsA08"      "charge120"       "charge240"       "city08"         
##  [9] "city08U"         "cityA08"         "cityA08U"        "cityCD"         
## [13] "cityE"           "cityUF"          "co2"             "co2A"           
## [17] "co2TailpipeAGpm" "co2TailpipeGpm"  "comb08"          "comb08U"        
## [21] "combA08"         "combA08U"        "combE"           "combinedCD"     
## [25] "combinedUF"      "cylinders"       "displ"           "drive"          
## [29] "engId"           "eng_dscr"        "feScore"         "fuelCost08"     
## [33] "fuelCostA08"     "fuelType"        "fuelType1"       "ghgScore"       
## [37] "ghgScoreA"       "highway08"       "highway08U"      "highwayA08"     
## [41] "highwayA08U"     "VClass"          "highwayCD"       "highwayE"       
## [45] "highwayUF"       "hlv"             "hpv"             "id"             
## [49] "lv2"             "lv4"             "mpgData"         "phevBlended"    
## [53] "pv2"             "pv4"             "range"           "rangeCity"      
## [57] "rangeCityA"      "rangeHwy"        "rangeHwyA"       "trany"          
## [61] "UCity"           "UCityA"          "UHighway"        "UHighwayA"      
## [65] "youSaveSpend"    "guzzler"         "trans_dscr"      "tCharger"       
## [69] "sCharger"        "atvType"         "fuelType2"       "rangeA"         
## [73] "evMotor"         "mfrCode"         "c240Dscr"        "charge240b"     
## [77] "c240bDscr"       "createdOn"       "modifiedOn"      "startStop"      
## [81] "phevCity"        "phevHwy"         "phevComb"
# Restore the original name, again matching by name rather than by position.
names(fuel)[names(fuel) == "make_of_car"] <- "make"
names(fuel)
##  [1] "make"            "model"           "year"            "barrels08"      
##  [5] "barrelsA08"      "charge120"       "charge240"       "city08"         
##  [9] "city08U"         "cityA08"         "cityA08U"        "cityCD"         
## [13] "cityE"           "cityUF"          "co2"             "co2A"           
## [17] "co2TailpipeAGpm" "co2TailpipeGpm"  "comb08"          "comb08U"        
## [21] "combA08"         "combA08U"        "combE"           "combinedCD"     
## [25] "combinedUF"      "cylinders"       "displ"           "drive"          
## [29] "engId"           "eng_dscr"        "feScore"         "fuelCost08"     
## [33] "fuelCostA08"     "fuelType"        "fuelType1"       "ghgScore"       
## [37] "ghgScoreA"       "highway08"       "highway08U"      "highwayA08"     
## [41] "highwayA08U"     "VClass"          "highwayCD"       "highwayE"       
## [45] "highwayUF"       "hlv"             "hpv"             "id"             
## [49] "lv2"             "lv4"             "mpgData"         "phevBlended"    
## [53] "pv2"             "pv4"             "range"           "rangeCity"      
## [57] "rangeCityA"      "rangeHwy"        "rangeHwyA"       "trany"          
## [61] "UCity"           "UCityA"          "UHighway"        "UHighwayA"      
## [65] "youSaveSpend"    "guzzler"         "trans_dscr"      "tCharger"       
## [69] "sCharger"        "atvType"         "fuelType2"       "rangeA"         
## [73] "evMotor"         "mfrCode"         "c240Dscr"        "charge240b"     
## [77] "c240bDscr"       "createdOn"       "modifiedOn"      "startStop"      
## [81] "phevCity"        "phevHwy"         "phevComb"

4. Data type

# Work on a copy so that the type conversions below leave `fuel` unchanged
subset_4 <- fuel
# Show each column's type together with its first few values
str(subset_4)
## 'data.frame':    41184 obs. of  83 variables:
##  $ make           : chr  "Volvo" "Ford" "BMW" "Jeep" ...
##  $ model          : chr  "240 DL/GL/Turbo Wagon" "Explorer 4WD" "540i" "Wrangler 4WD" ...
##  $ year           : int  1984 1998 2018 1998 2018 1999 1999 2017 2018 1999 ...
##  $ barrels08      : num  17.3 22 13.7 23.5 12.7 ...
##  $ barrelsA08     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ charge120      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ charge240      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ city08         : int  18 14 21 13 23 15 15 40 17 16 ...
##  $ city08U        : int  0 0 20 0 23 0 0 40 17 0 ...
##  $ cityA08        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cityA08U       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cityCD         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cityE          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cityUF         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ co2            : int  -1 -1 367 -1 341 -1 -1 223 451 -1 ...
##  $ co2A           : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ co2TailpipeAGpm: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ co2TailpipeGpm : num  468 592 367 635 341 ...
##  $ comb08         : int  19 15 24 14 26 18 18 40 20 17 ...
##  $ comb08U        : int  0 0 24 0 26 0 0 39 19 0 ...
##  $ combA08        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ combA08U       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ combE          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ combinedCD     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ combinedUF     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cylinders      : int  4 6 6 6 3 8 8 4 6 6 ...
##  $ displ          : num  2.3 4 3 4 1.5 4.4 4.6 2.5 2.9 4 ...
##  $ drive          : chr  "" "4-Wheel or All-Wheel Drive" "Rear-Wheel Drive" "4-Wheel or All-Wheel Drive" ...
##  $ engId          : int  60060 0 540 0 40 0 0 78 401 0 ...
##  $ eng_dscr       : chr  "CA model" "SOHC" "SIDI" "" ...
##  $ feScore        : int  -1 -1 5 -1 5 -1 -1 9 4 -1 ...
##  $ fuelCost08     : int  2100 2650 2100 2850 1900 2200 2200 1000 2500 2350 ...
##  $ fuelCostA08    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ fuelType       : chr  "Regular" "Regular" "Premium" "Regular" ...
##  $ fuelType1      : chr  "Regular Gasoline" "Regular Gasoline" "Premium Gasoline" "Regular Gasoline" ...
##  $ ghgScore       : int  -1 -1 5 -1 5 -1 -1 9 4 -1 ...
##  $ ghgScoreA      : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ highway08      : int  22 18 30 16 31 22 22 39 24 20 ...
##  $ highway08U     : int  0 0 30 0 30 0 0 38 24 0 ...
##  $ highwayA08     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ highwayA08U    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ VClass         : chr  "Midsize Station Wagons" "Special Purpose Vehicle 4WD" "Midsize Cars" "Special Purpose Vehicle 4WD" ...
##  $ highwayCD      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ highwayE       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ highwayUF      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hlv            : int  0 0 0 0 18 0 0 0 0 0 ...
##  $ hpv            : int  0 0 0 0 92 0 0 0 0 0 ...
##  $ id             : int  28216 14722 39230 14749 39235 15174 15189 38295 39288 15299 ...
##  $ lv2            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lv4            : int  41 0 14 0 0 13 21 14 12 0 ...
##  $ mpgData        : chr  "N" "Y" "N" "Y" ...
##  $ phevBlended    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ pv2            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pv4            : int  89 0 99 0 0 107 111 104 100 0 ...
##  $ range          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ rangeCity      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ rangeCityA     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ rangeHwy       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ rangeHwyA      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ trany          : chr  "Automatic 4-spd" "Automatic 5-spd" "Automatic (S8)" "Automatic 3-spd" ...
##  $ UCity          : num  22 16.8 26.3 16.2 29.6 ...
##  $ UCityA         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ UHighway       : num  31 24.6 43 22.3 44.2 ...
##  $ UHighwayA      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ youSaveSpend   : int  -3000 -5750 -3000 -6750 -2000 -3500 -3500 2500 -5000 -4250 ...
##  $ guzzler        : chr  "" "" "" "" ...
##  $ trans_dscr     : chr  "" "CLKUP" "" "" ...
##  $ tCharger       : logi  NA NA TRUE NA TRUE NA ...
##  $ sCharger       : chr  "" "" "" "" ...
##  $ atvType        : chr  "" "" "" "" ...
##  $ fuelType2      : chr  "" "" "" "" ...
##  $ rangeA         : chr  "" "" "" "" ...
##  $ evMotor        : chr  "" "" "" "" ...
##  $ mfrCode        : chr  "" "" "BMX" "" ...
##  $ c240Dscr       : chr  "" "" "" "" ...
##  $ charge240b     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ c240bDscr      : chr  "" "" "" "" ...
##  $ createdOn      : chr  "2013-01-01" "2013-01-01" "2017-08-10" "2013-01-01" ...
##  $ modifiedOn     : chr  "2013-01-01" "2013-01-01" "2018-02-26" "2013-01-01" ...
##  $ startStop      : chr  "" "" "Y" "" ...
##  $ phevCity       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ phevHwy        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ phevComb       : int  0 0 0 0 0 0 0 0 0 0 ...

4.1 Change to factor

# Convert the rangeA column to a factor, then confirm its new class
subset_4[["rangeA"]] <- as.factor(subset_4[["rangeA"]])
class(subset_4[["rangeA"]])
## [1] "factor"

4.2 Change to character

# Convert the id column to character, then confirm its new class
subset_4[["id"]] <- as.character(subset_4[["id"]])
class(subset_4[["id"]])
## [1] "character"

5. Subset

5.1 Numerical variables

5.1.1 One Condition

# Keep the rows whose fuelCost08 is at least 2000. which() drops rows where
# the comparison is NA, which is how subset() treats them as well.
subset_5a <- fuel[which(fuel$fuelCost08 >= 2000), , drop = FALSE]
head(subset_5a)
##    make                 model year barrels08 barrelsA08 charge120 charge240
## 1 Volvo 240 DL/GL/Turbo Wagon 1984  17.34789          0         0         0
## 2  Ford          Explorer 4WD 1998  21.97400          0         0         0
## 3   BMW                  540i 2018  13.73375          0         0         0
## 4  Jeep          Wrangler 4WD 1998  23.54357          0         0         0
## 6   BMW                 740il 1999  18.31167          0         0         0
## 7  Ford        Crown Victoria 1999  18.31167          0         0         0
##   city08 city08U cityA08 cityA08U cityCD cityE cityUF co2 co2A co2TailpipeAGpm
## 1     18       0       0        0      0     0      0  -1   -1               0
## 2     14       0       0        0      0     0      0  -1   -1               0
## 3     21      20       0        0      0     0      0 367   -1               0
## 4     13       0       0        0      0     0      0  -1   -1               0
## 6     15       0       0        0      0     0      0  -1   -1               0
## 7     15       0       0        0      0     0      0  -1   -1               0
##   co2TailpipeGpm comb08 comb08U combA08 combA08U combE combinedCD combinedUF
## 1       467.7368     19       0       0        0     0          0          0
## 2       592.4667     15       0       0        0     0          0          0
## 3       367.0000     24      24       0        0     0          0          0
## 4       634.7857     14       0       0        0     0          0          0
## 6       493.7222     18       0       0        0     0          0          0
## 7       493.7222     18       0       0        0     0          0          0
##   cylinders displ                      drive engId eng_dscr feScore fuelCost08
## 1         4   2.3                            60060 CA model      -1       2100
## 2         6   4.0 4-Wheel or All-Wheel Drive     0     SOHC      -1       2650
## 3         6   3.0           Rear-Wheel Drive   540     SIDI       5       2100
## 4         6   4.0 4-Wheel or All-Wheel Drive     0               -1       2850
## 6         8   4.4           Rear-Wheel Drive     0               -1       2200
## 7         8   4.6           Rear-Wheel Drive     0               -1       2200
##   fuelCostA08 fuelType        fuelType1 ghgScore ghgScoreA highway08 highway08U
## 1           0  Regular Regular Gasoline       -1        -1        22          0
## 2           0  Regular Regular Gasoline       -1        -1        18          0
## 3           0  Premium Premium Gasoline        5        -1        30         30
## 4           0  Regular Regular Gasoline       -1        -1        16          0
## 6           0  Regular Regular Gasoline       -1        -1        22          0
## 7           0  Regular Regular Gasoline       -1        -1        22          0
##   highwayA08 highwayA08U                      VClass highwayCD highwayE
## 1          0           0      Midsize Station Wagons         0        0
## 2          0           0 Special Purpose Vehicle 4WD         0        0
## 3          0           0                Midsize Cars         0        0
## 4          0           0 Special Purpose Vehicle 4WD         0        0
## 6          0           0                  Large Cars         0        0
## 7          0           0                  Large Cars         0        0
##   highwayUF hlv hpv    id lv2 lv4 mpgData phevBlended pv2 pv4 range rangeCity
## 1         0   0   0 28216   0  41       N       FALSE   0  89     0         0
## 2         0   0   0 14722   0   0       Y       FALSE   0   0     0         0
## 3         0   0   0 39230   0  14       N       FALSE   0  99     0         0
## 4         0   0   0 14749   0   0       Y       FALSE   0   0     0         0
## 6         0   0   0 15174   0  13       N       FALSE   0 107     0         0
## 7         0   0   0 15189   0  21       Y       FALSE   0 111     0         0
##   rangeCityA rangeHwy rangeHwyA           trany   UCity UCityA UHighway
## 1          0        0         0 Automatic 4-spd 22.0000      0  31.0000
## 2          0        0         0 Automatic 5-spd 16.8405      0  24.5965
## 3          0        0         0  Automatic (S8) 26.3067      0  43.0230
## 4          0        0         0 Automatic 3-spd 16.2486      0  22.2888
## 6          0        0         0 Automatic 5-spd 18.9000      0  30.0000
## 7          0        0         0 Automatic 4-spd 18.7236      0  30.3385
##   UHighwayA youSaveSpend guzzler trans_dscr tCharger sCharger atvType fuelType2
## 1         0        -3000                          NA                           
## 2         0        -5750              CLKUP       NA                           
## 3         0        -3000                        TRUE                           
## 4         0        -6750                          NA                           
## 6         0        -3500                          NA                           
## 7         0        -3500              CLKUP       NA                           
##   rangeA evMotor mfrCode c240Dscr charge240b c240bDscr  createdOn modifiedOn
## 1                                          0           2013-01-01 2013-01-01
## 2                                          0           2013-01-01 2013-01-01
## 3                    BMX                   0           2017-08-10 2018-02-26
## 4                                          0           2013-01-01 2013-01-01
## 6                                          0           2013-01-01 2013-01-01
## 7                                          0           2013-01-01 2013-01-01
##   startStop phevCity phevHwy phevComb
## 1                  0       0        0
## 2                  0       0        0
## 3         Y        0       0        0
## 4                  0       0        0
## 6                  0       0        0
## 7                  0       0        0

5.1.2 Multiple Conditions

# Keep the rows that satisfy both conditions: fuelCost08 of at least 2000 and
# a year before 1998. Rows where the combined condition is NA are dropped.
rows_5b <- with(fuel, which(fuelCost08 >= 2000 & year < 1998))
subset_5b <- fuel[rows_5b, , drop = FALSE]
head(subset_5b)
##                     make                 model year barrels08 barrelsA08
## 1                  Volvo 240 DL/GL/Turbo Wagon 1984  17.34789          0
## 12                   GMC  Caballero Pickup 2WD 1985  20.60063          0
## 27                 Dodge  D250 Cab Chassis 2WD 1986  29.96455          0
## 28 Import Trade Services          ITS 190E 2.3 1991  20.60063          0
## 34                  Ford                Taurus 1993  16.48050          0
## 35             Chevrolet               Caprice 1993  21.97400          0
##    charge120 charge240 city08 city08U cityA08 cityA08U cityCD cityE cityUF co2
## 1          0         0     18       0       0        0      0     0      0  -1
## 12         0         0     15       0       0        0      0     0      0  -1
## 27         0         0     10       0       0        0      0     0      0  -1
## 28         0         0     15       0       0        0      0     0      0  -1
## 34         0         0     17       0       0        0      0     0      0  -1
## 35         0         0     13       0       0        0      0     0      0  -1
##    co2A co2TailpipeAGpm co2TailpipeGpm comb08 comb08U combA08 combA08U combE
## 1    -1               0       467.7368     19       0       0        0     0
## 12   -1               0       555.4375     16       0       0        0     0
## 27   -1               0       807.9091     11       0       0        0     0
## 28   -1               0       555.4375     16       0       0        0     0
## 34   -1               0       444.3500     20       0       0        0     0
## 35   -1               0       592.4667     15       0       0        0     0
##    combinedCD combinedUF cylinders displ             drive engId
## 1           0          0         4   2.3                   60060
## 12          0          0         8   5.0  Rear-Wheel Drive  4893
## 27          0          0         8   5.9  Rear-Wheel Drive  2864
## 28          0          0         4   2.3  Rear-Wheel Drive     0
## 34          0          0         6   3.0 Front-Wheel Drive     0
## 35          0          0         8   5.7  Rear-Wheel Drive     0
##                             eng_dscr feScore fuelCost08 fuelCostA08 fuelType
## 1                           CA model      -1       2100           0  Regular
## 12         (GM-CHEV)  (FFS) CA model      -1       2500           0  Regular
## 27                                        -1       3650           0  Regular
## 28                   (GUZZLER) (FFS)      -1       3100           0  Premium
## 34                             (FFS)      -1       2000           0  Regular
## 35 (350 V8) (GUZZLER) (POLICE) (FFS)      -1       2650           0  Regular
##           fuelType1 ghgScore ghgScoreA highway08 highway08U highwayA08
## 1  Regular Gasoline       -1        -1        22          0          0
## 12 Regular Gasoline       -1        -1        20          0          0
## 27 Regular Gasoline       -1        -1        12          0          0
## 28 Premium Gasoline       -1        -1        17          0          0
## 34 Regular Gasoline       -1        -1        26          0          0
## 35 Regular Gasoline       -1        -1        19          0          0
##    highwayA08U                      VClass highwayCD highwayE highwayUF hlv hpv
## 1            0      Midsize Station Wagons         0        0         0   0   0
## 12           0      Standard Pickup Trucks         0        0         0   0   0
## 27           0 Special Purpose Vehicle 2WD         0        0         0   0   0
## 28           0             Subcompact Cars         0        0         0   0   0
## 34           0                Midsize Cars         0        0         0   0   0
## 35           0                  Large Cars         0        0         0   0   0
##       id lv2 lv4 mpgData phevBlended pv2 pv4 range rangeCity rangeCityA
## 1  28216   0  41       N       FALSE   0  89     0         0          0
## 12  1536   0   0       N       FALSE   0   0     0         0          0
## 27  2883   0   0       N       FALSE   0   0     0         0          0
## 28 28894   0   0       N       FALSE   0   0     0         0          0
## 34 29006   0   0       Y       FALSE   0   0     0         0          0
## 35 29023   0   0       N       FALSE   0   0     0         0          0
##    rangeHwy rangeHwyA           trany   UCity UCityA UHighway UHighwayA
## 1         0         0 Automatic 4-spd 22.0000      0  31.0000         0
## 12        0         0 Automatic 4-spd 18.0000      0  27.0000         0
## 27        0         0 Automatic 3-spd 12.0000      0  16.0000         0
## 28        0         0 Automatic 4-spd 18.7234      0  23.3155         0
## 34        0         0 Automatic 4-spd 21.5000      0  36.1000         0
## 35        0         0 Automatic 4-spd 15.6000      0  25.8000         0
##    youSaveSpend guzzler  trans_dscr tCharger sCharger atvType fuelType2 rangeA
## 1         -3000                           NA                                  
## 12        -5000                           NA                                  
## 27       -10750                           NA                                  
## 28        -8000       T 2MODE DC/FW       NA                                  
## 34        -2500                           NA                                  
## 35        -5750       T       CLKUP       NA                                  
##    evMotor mfrCode c240Dscr charge240b c240bDscr  createdOn modifiedOn
## 1                                    0           2013-01-01 2013-01-01
## 12                                   0           2013-01-01 2013-01-01
## 27                                   0           2013-01-01 2013-01-01
## 28                                   0           2013-01-01 2013-01-01
## 34                                   0           2013-01-01 2013-01-01
## 35                                   0           2013-01-01 2013-01-01
##    startStop phevCity phevHwy phevComb
## 1                   0       0        0
## 12                  0       0        0
## 27                  0       0        0
## 28                  0       0        0
## 34                  0       0        0
## 35                  0       0        0

5.2 Categorical variables

5.2.1 One condition

# Keep the rows whose drive value is exactly "Front-Wheel Drive". %in% never
# yields NA, so rows with a missing drive value are left out.
subset_5c <- fuel[fuel$drive %in% "Front-Wheel Drive", , drop = FALSE]
head(subset_5c)
##          make         model year barrels08 barrelsA08 charge120 charge240
## 8      Toyota Avalon Hybrid 2017   8.24025          0         0         0
## 11        Kia Soul Electric 2018   0.18600          0         0         5
## 14    Lincoln       MKX FWD 2017  16.48050          0         0         0
## 16  Chevrolet      Cavalier 1998  13.18440          0         0         0
## 18 Mitsubishi       Eclipse 2010  17.34789          0         0         0
## 20       Ford    Taurus FWD 2017  15.69571          0         0         0
##    city08 city08U cityA08 cityA08U cityCD cityE cityUF co2 co2A co2TailpipeAGpm
## 8      40      40       0        0      0     0      0 223   -1               0
## 11    124     123       0        0      0    27      0   0   -1               0
## 14     18      17       0        0      0     0      0 444   -1               0
## 16     21       0       0        0      0     0      0  -1   -1               0
## 18     16       0       0        0      0     0      0  -1   -1               0
## 20     18      18       0        0      0     0      0 423   -1               0
##    co2TailpipeGpm comb08 comb08U combA08 combA08U combE combinedCD combinedUF
## 8        223.0000     40      39       0        0     0          0          0
## 11         0.0000    108     107       0        0    31          0          0
## 14       444.0000     20      20       0        0     0          0          0
## 16       355.4800     25       0       0        0     0          0          0
## 18       467.7368     19       0       0        0     0          0          0
## 20       423.0000     21      21       0        0     0          0          0
##    cylinders displ             drive engId eng_dscr feScore fuelCost08
## 8          4   2.5 Front-Wheel Drive    78                9       1000
## 11        NA    NA Front-Wheel Drive    27               10        600
## 14         6   2.7 Front-Wheel Drive   159     SIDI       4       2000
## 16         4   2.2 Front-Wheel Drive     0               -1       1600
## 18         6   3.8 Front-Wheel Drive   313       PR      -1       2650
## 20         6   3.5 Front-Wheel Drive   288                4       1900
##    fuelCostA08    fuelType        fuelType1 ghgScore ghgScoreA highway08
## 8            0     Regular Regular Gasoline        9        -1        39
## 11           0 Electricity      Electricity       10        -1        93
## 14           0     Regular Regular Gasoline        4        -1        25
## 16           0     Regular Regular Gasoline       -1        -1        31
## 18           0     Premium Premium Gasoline       -1        -1        25
## 20           0     Regular Regular Gasoline        4        -1        27
##    highway08U highwayA08 highwayA08U                          VClass highwayCD
## 8          38          0           0                    Midsize Cars         0
## 11         93          0           0            Small Station Wagons         0
## 14         25          0           0 Small Sport Utility Vehicle 2WD         0
## 16          0          0           0                 Subcompact Cars         0
## 18          0          0           0                 Subcompact Cars         0
## 20         27          0           0                      Large Cars         0
##    highwayE highwayUF hlv hpv    id lv2 lv4 mpgData phevBlended pv2 pv4 range
## 8         0         0   0   0 38295   0  14       Y       FALSE   0 104     0
## 11       36         0   0   0 39322   0  19       N       FALSE   0  97   111
## 14        0         0   0   0 38397   0   0       N       FALSE   0   0     0
## 16        0         0   0   0 14061  12  14       Y       FALSE  85  92     0
## 18        0         0   0   0 28559   0  16       N       FALSE   0  82     0
## 20        0         0   0   0 38482   0  20       N       FALSE   0 102     0
##    rangeCity rangeCityA rangeHwy rangeHwyA             trany    UCity UCityA
## 8          0          0        0         0 Automatic (AV-S6)  56.0848      0
## 11       125          0       94         0    Automatic (A1) 176.6000      0
## 14         0          0        0         0    Automatic (S6)  22.2000      0
## 16         0          0        0         0      Manual 5-spd  26.6000      0
## 18         0          0        0         0      Manual 6-spd  20.0000      0
## 20         0          0        0         0    Automatic (S6)  24.2000      0
##    UHighway UHighwayA youSaveSpend guzzler trans_dscr tCharger sCharger atvType
## 8   54.1578         0         2500                          NA           Hybrid
## 11 133.5000         0         4500                          NA               EV
## 14  38.7000         0        -2500                        TRUE                 
## 16  43.6000         0         -500                SIL       NA                 
## 18  34.6000         0        -5750                          NA                 
## 20  39.0000         0        -2000                          NA                 
##    fuelType2 rangeA       evMotor mfrCode c240Dscr charge240b c240bDscr
## 8                      245V Ni-MH     TYX                   0          
## 11                  81 kW AC PMSM     KMX                   0          
## 14                                    FMX                   0          
## 16                                                          0          
## 18                                                          0          
## 20                                    FMX                   0          
##     createdOn modifiedOn startStop phevCity phevHwy phevComb
## 8  2016-10-12 2017-04-05         Y        0       0        0
## 11 2017-08-16 2018-01-24         N        0       0        0
## 14 2016-11-07 2016-11-22         N        0       0        0
## 16 2013-01-01 2013-01-01                  0       0        0
## 18 2013-01-01 2013-01-01                  0       0        0
## 20 2016-12-02 2017-01-25         N        0       0        0

5.2.2 Multiple conditions

# Keep the rows that meet at least one of the two conditions (logical OR).
# which() drops rows where the condition evaluates to NA.
subset_5d <- fuel[which(
  fuel$drive == "Front-Wheel Drive" | fuel$fuelType1 == "Electricity"
), , drop = FALSE]
head(subset_5d, n = 6L)
##          make         model year barrels08 barrelsA08 charge120 charge240
## 8      Toyota Avalon Hybrid 2017   8.24025          0         0         0
## 11        Kia Soul Electric 2018   0.18600          0         0         5
## 14    Lincoln       MKX FWD 2017  16.48050          0         0         0
## 16  Chevrolet      Cavalier 1998  13.18440          0         0         0
## 18 Mitsubishi       Eclipse 2010  17.34789          0         0         0
## 20       Ford    Taurus FWD 2017  15.69571          0         0         0
##    city08 city08U cityA08 cityA08U cityCD cityE cityUF co2 co2A co2TailpipeAGpm
## 8      40      40       0        0      0     0      0 223   -1               0
## 11    124     123       0        0      0    27      0   0   -1               0
## 14     18      17       0        0      0     0      0 444   -1               0
## 16     21       0       0        0      0     0      0  -1   -1               0
## 18     16       0       0        0      0     0      0  -1   -1               0
## 20     18      18       0        0      0     0      0 423   -1               0
##    co2TailpipeGpm comb08 comb08U combA08 combA08U combE combinedCD combinedUF
## 8        223.0000     40      39       0        0     0          0          0
## 11         0.0000    108     107       0        0    31          0          0
## 14       444.0000     20      20       0        0     0          0          0
## 16       355.4800     25       0       0        0     0          0          0
## 18       467.7368     19       0       0        0     0          0          0
## 20       423.0000     21      21       0        0     0          0          0
##    cylinders displ             drive engId eng_dscr feScore fuelCost08
## 8          4   2.5 Front-Wheel Drive    78                9       1000
## 11        NA    NA Front-Wheel Drive    27               10        600
## 14         6   2.7 Front-Wheel Drive   159     SIDI       4       2000
## 16         4   2.2 Front-Wheel Drive     0               -1       1600
## 18         6   3.8 Front-Wheel Drive   313       PR      -1       2650
## 20         6   3.5 Front-Wheel Drive   288                4       1900
##    fuelCostA08    fuelType        fuelType1 ghgScore ghgScoreA highway08
## 8            0     Regular Regular Gasoline        9        -1        39
## 11           0 Electricity      Electricity       10        -1        93
## 14           0     Regular Regular Gasoline        4        -1        25
## 16           0     Regular Regular Gasoline       -1        -1        31
## 18           0     Premium Premium Gasoline       -1        -1        25
## 20           0     Regular Regular Gasoline        4        -1        27
##    highway08U highwayA08 highwayA08U                          VClass highwayCD
## 8          38          0           0                    Midsize Cars         0
## 11         93          0           0            Small Station Wagons         0
## 14         25          0           0 Small Sport Utility Vehicle 2WD         0
## 16          0          0           0                 Subcompact Cars         0
## 18          0          0           0                 Subcompact Cars         0
## 20         27          0           0                      Large Cars         0
##    highwayE highwayUF hlv hpv    id lv2 lv4 mpgData phevBlended pv2 pv4 range
## 8         0         0   0   0 38295   0  14       Y       FALSE   0 104     0
## 11       36         0   0   0 39322   0  19       N       FALSE   0  97   111
## 14        0         0   0   0 38397   0   0       N       FALSE   0   0     0
## 16        0         0   0   0 14061  12  14       Y       FALSE  85  92     0
## 18        0         0   0   0 28559   0  16       N       FALSE   0  82     0
## 20        0         0   0   0 38482   0  20       N       FALSE   0 102     0
##    rangeCity rangeCityA rangeHwy rangeHwyA             trany    UCity UCityA
## 8          0          0        0         0 Automatic (AV-S6)  56.0848      0
## 11       125          0       94         0    Automatic (A1) 176.6000      0
## 14         0          0        0         0    Automatic (S6)  22.2000      0
## 16         0          0        0         0      Manual 5-spd  26.6000      0
## 18         0          0        0         0      Manual 6-spd  20.0000      0
## 20         0          0        0         0    Automatic (S6)  24.2000      0
##    UHighway UHighwayA youSaveSpend guzzler trans_dscr tCharger sCharger atvType
## 8   54.1578         0         2500                          NA           Hybrid
## 11 133.5000         0         4500                          NA               EV
## 14  38.7000         0        -2500                        TRUE                 
## 16  43.6000         0         -500                SIL       NA                 
## 18  34.6000         0        -5750                          NA                 
## 20  39.0000         0        -2000                          NA                 
##    fuelType2 rangeA       evMotor mfrCode c240Dscr charge240b c240bDscr
## 8                      245V Ni-MH     TYX                   0          
## 11                  81 kW AC PMSM     KMX                   0          
## 14                                    FMX                   0          
## 16                                                          0          
## 18                                                          0          
## 20                                    FMX                   0          
##     createdOn modifiedOn startStop phevCity phevHwy phevComb
## 8  2016-10-12 2017-04-05         Y        0       0        0
## 11 2017-08-16 2018-01-24         N        0       0        0
## 14 2016-11-07 2016-11-22         N        0       0        0
## 16 2013-01-01 2013-01-01                  0       0        0
## 18 2013-01-01 2013-01-01                  0       0        0
## 20 2016-12-02 2017-01-25         N        0       0        0

5.3 Select columns

# Select the first five columns by position.
subset_5e <- fuel[, seq_len(5L), drop = FALSE]
head(subset_5e, n = 6L)
##    make                 model year barrels08 barrelsA08
## 1 Volvo 240 DL/GL/Turbo Wagon 1984  17.34789          0
## 2  Ford          Explorer 4WD 1998  21.97400          0
## 3   BMW                  540i 2018  13.73375          0
## 4  Jeep          Wrangler 4WD 1998  23.54357          0
## 5  MINI   Cooper Clubman All4 2018  12.67731          0
## 6   BMW                 740il 1999  18.31167          0
# List every column name with its position, to help pick columns by name.
colnames(fuel)
##  [1] "make"            "model"           "year"            "barrels08"      
##  [5] "barrelsA08"      "charge120"       "charge240"       "city08"         
##  [9] "city08U"         "cityA08"         "cityA08U"        "cityCD"         
## [13] "cityE"           "cityUF"          "co2"             "co2A"           
## [17] "co2TailpipeAGpm" "co2TailpipeGpm"  "comb08"          "comb08U"        
## [21] "combA08"         "combA08U"        "combE"           "combinedCD"     
## [25] "combinedUF"      "cylinders"       "displ"           "drive"          
## [29] "engId"           "eng_dscr"        "feScore"         "fuelCost08"     
## [33] "fuelCostA08"     "fuelType"        "fuelType1"       "ghgScore"       
## [37] "ghgScoreA"       "highway08"       "highway08U"      "highwayA08"     
## [41] "highwayA08U"     "VClass"          "highwayCD"       "highwayE"       
## [45] "highwayUF"       "hlv"             "hpv"             "id"             
## [49] "lv2"             "lv4"             "mpgData"         "phevBlended"    
## [53] "pv2"             "pv4"             "range"           "rangeCity"      
## [57] "rangeCityA"      "rangeHwy"        "rangeHwyA"       "trany"          
## [61] "UCity"           "UCityA"          "UHighway"        "UHighwayA"      
## [65] "youSaveSpend"    "guzzler"         "trans_dscr"      "tCharger"       
## [69] "sCharger"        "atvType"         "fuelType2"       "rangeA"         
## [73] "evMotor"         "mfrCode"         "c240Dscr"        "charge240b"     
## [77] "c240bDscr"       "createdOn"       "modifiedOn"      "startStop"      
## [81] "phevCity"        "phevHwy"         "phevComb"
# Select columns by name; the result is a data frame with these three columns.
subset_5f <- fuel[c("make", "mpgData", "UCity")]
head(subset_5f, n = 6L)
##    make mpgData   UCity
## 1 Volvo       N 22.0000
## 2  Ford       Y 16.8405
## 3   BMW       N 26.3067
## 4  Jeep       Y 16.2486
## 5  MINI       N 29.5784
## 6   BMW       N 18.9000

5.4 Winsorize

# Take the first five columns and look at the distributions before any
# values are modified.
subset_5g <- fuel[seq_len(5L)]
summary(object = subset_5g)
##      make              model                year        barrels08    
##  Length:41184       Length:41184       Min.   :1984   Min.   : 0.06  
##  Class :character   Class :character   1st Qu.:1991   1st Qu.:14.33  
##  Mode  :character   Mode  :character   Median :2002   Median :16.48  
##                                        Mean   :2002   Mean   :17.28  
##                                        3rd Qu.:2012   3rd Qu.:19.39  
##                                        Max.   :2020   Max.   :47.09  
##    barrelsA08     
##  Min.   : 0.0000  
##  1st Qu.: 0.0000  
##  Median : 0.0000  
##  Mean   : 0.1982  
##  3rd Qu.: 0.0000  
##  Max.   :18.0000
library(robustHD)
## Warning: package 'robustHD' was built under R version 4.4.2
## Loading required package: ggplot2
## Loading required package: perry
## Warning: package 'perry' was built under R version 4.4.2
## Loading required package: parallel
## Loading required package: robustbase
# Winsorize barrels08 at its 5th and 95th percentiles: values outside that
# range are replaced by the nearest bound, and no rows are dropped.
# robustHD::winsorize() has no `probs` argument (it shrinks values around the
# median using a multiple of the MAD), so the percentile bounds are computed
# explicitly with quantile() and applied with pmax()/pmin().
barrels08_bounds <- quantile(subset_5g$barrels08, probs = c(0.05, 0.95),
                             na.rm = TRUE, names = FALSE)
subset_5g$barrels08 <- pmin(pmax(subset_5g$barrels08, barrels08_bounds[1]),
                            barrels08_bounds[2])

head(subset_5g)
##    make                 model year barrels08 barrelsA08
## 1 Volvo 240 DL/GL/Turbo Wagon 1984  17.34789          0
## 2  Ford          Explorer 4WD 1998  21.97400          0
## 3   BMW                  540i 2018  13.73375          0
## 4  Jeep          Wrangler 4WD 1998  23.54357          0
## 5  MINI   Cooper Clubman All4 2018  12.67731          0
## 6   BMW                 740il 1999  18.31167          0

5.5 Remove extreme values

# Take the first five columns and compute the 5th and 95th percentiles of
# barrels08 (ignoring NAs) to use as cut-offs.
subset_5h <- fuel[seq_len(5L)]
lower_bound <- quantile(subset_5h[["barrels08"]], probs = 0.05, na.rm = TRUE)
upper_bound <- quantile(subset_5h[["barrels08"]], probs = 0.95, na.rm = TRUE)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the rows whose barrels08 lies within [lower_bound, upper_bound].
# Conditions passed to filter() as separate arguments are combined with AND.
subset_5h_filtered <- filter(
  subset_5h,
  barrels08 >= lower_bound,
  barrels08 <= upper_bound
)
nrow(subset_5h_filtered)
## [1] 38003

6. Missing data

6.1 Count

# Total number of missing values in the whole data frame.
sum(is.na(fuel))
## [1] 34697
# Missing values in a single column.
sum(is.na(fuel$displ))
## [1] 207
# Missing values per column. vapply() always returns an integer vector with
# one count per column, whereas sapply()'s return type depends on its input.
vapply(fuel, function(x) sum(is.na(x)), integer(1))
##            make           model            year       barrels08      barrelsA08 
##               0               0               0               0               0 
##       charge120       charge240          city08         city08U         cityA08 
##               0               0               0               0               0 
##        cityA08U          cityCD           cityE          cityUF             co2 
##               0               0               0               0               0 
##            co2A co2TailpipeAGpm  co2TailpipeGpm          comb08         comb08U 
##               0               0               0               0               0 
##         combA08        combA08U           combE      combinedCD      combinedUF 
##               0               0               0               0               0 
##       cylinders           displ           drive           engId        eng_dscr 
##             209             207               0               0               0 
##         feScore      fuelCost08     fuelCostA08        fuelType       fuelType1 
##               0               0               0               0               0 
##        ghgScore       ghgScoreA       highway08      highway08U      highwayA08 
##               0               0               0               0               0 
##     highwayA08U          VClass       highwayCD        highwayE       highwayUF 
##               0               0               0               0               0 
##             hlv             hpv              id             lv2             lv4 
##               0               0               0               0               0 
##         mpgData     phevBlended             pv2             pv4           range 
##               0               0               0               0               0 
##       rangeCity      rangeCityA        rangeHwy       rangeHwyA           trany 
##               0               0               0               0               0 
##           UCity          UCityA        UHighway       UHighwayA    youSaveSpend 
##               0               0               0               0               0 
##         guzzler      trans_dscr        tCharger        sCharger         atvType 
##               0               0           34281               0               0 
##       fuelType2          rangeA         evMotor         mfrCode        c240Dscr 
##               0               0               0               0               0 
##      charge240b       c240bDscr       createdOn      modifiedOn       startStop 
##               0               0               0               0               0 
##        phevCity         phevHwy        phevComb 
##               0               0               0

6.2 Complete cases

library(tidyr)
# Keep only the rows that have no NA in any column, then count them.
subset_6a <- fuel %>%
  drop_na()
nrow(subset_6a)
## [1] 6901

6.3 Remove records based on one variable’s NA

Check NAs.

# Number of rows where displ is NA.
sum(is.na(fuel[["displ"]]))
## [1] 207

Remove NAs.

# Drop the rows where displ is NA; every column is kept.
subset_6b <- subset(fuel, !is.na(displ))
head(subset_6b, n = 6L)
##    make                 model year barrels08 barrelsA08 charge120 charge240
## 1 Volvo 240 DL/GL/Turbo Wagon 1984  17.34789          0         0         0
## 2  Ford          Explorer 4WD 1998  21.97400          0         0         0
## 3   BMW                  540i 2018  13.73375          0         0         0
## 4  Jeep          Wrangler 4WD 1998  23.54357          0         0         0
## 5  MINI   Cooper Clubman All4 2018  12.67731          0         0         0
## 6   BMW                 740il 1999  18.31167          0         0         0
##   city08 city08U cityA08 cityA08U cityCD cityE cityUF co2 co2A co2TailpipeAGpm
## 1     18       0       0        0      0     0      0  -1   -1               0
## 2     14       0       0        0      0     0      0  -1   -1               0
## 3     21      20       0        0      0     0      0 367   -1               0
## 4     13       0       0        0      0     0      0  -1   -1               0
## 5     23      23       0        0      0     0      0 341   -1               0
## 6     15       0       0        0      0     0      0  -1   -1               0
##   co2TailpipeGpm comb08 comb08U combA08 combA08U combE combinedCD combinedUF
## 1       467.7368     19       0       0        0     0          0          0
## 2       592.4667     15       0       0        0     0          0          0
## 3       367.0000     24      24       0        0     0          0          0
## 4       634.7857     14       0       0        0     0          0          0
## 5       341.0000     26      26       0        0     0          0          0
## 6       493.7222     18       0       0        0     0          0          0
##   cylinders displ                      drive engId eng_dscr feScore fuelCost08
## 1         4   2.3                            60060 CA model      -1       2100
## 2         6   4.0 4-Wheel or All-Wheel Drive     0     SOHC      -1       2650
## 3         6   3.0           Rear-Wheel Drive   540     SIDI       5       2100
## 4         6   4.0 4-Wheel or All-Wheel Drive     0               -1       2850
## 5         3   1.5            All-Wheel Drive    40     SIDI       5       1900
## 6         8   4.4           Rear-Wheel Drive     0               -1       2200
##   fuelCostA08 fuelType        fuelType1 ghgScore ghgScoreA highway08 highway08U
## 1           0  Regular Regular Gasoline       -1        -1        22          0
## 2           0  Regular Regular Gasoline       -1        -1        18          0
## 3           0  Premium Premium Gasoline        5        -1        30         30
## 4           0  Regular Regular Gasoline       -1        -1        16          0
## 5           0  Premium Premium Gasoline        5        -1        31         30
## 6           0  Regular Regular Gasoline       -1        -1        22          0
##   highwayA08 highwayA08U                      VClass highwayCD highwayE
## 1          0           0      Midsize Station Wagons         0        0
## 2          0           0 Special Purpose Vehicle 4WD         0        0
## 3          0           0                Midsize Cars         0        0
## 4          0           0 Special Purpose Vehicle 4WD         0        0
## 5          0           0                Midsize Cars         0        0
## 6          0           0                  Large Cars         0        0
##   highwayUF hlv hpv    id lv2 lv4 mpgData phevBlended pv2 pv4 range rangeCity
## 1         0   0   0 28216   0  41       N       FALSE   0  89     0         0
## 2         0   0   0 14722   0   0       Y       FALSE   0   0     0         0
## 3         0   0   0 39230   0  14       N       FALSE   0  99     0         0
## 4         0   0   0 14749   0   0       Y       FALSE   0   0     0         0
## 5         0  18  92 39235   0   0       N       FALSE   0   0     0         0
## 6         0   0   0 15174   0  13       N       FALSE   0 107     0         0
##   rangeCityA rangeHwy rangeHwyA           trany   UCity UCityA UHighway
## 1          0        0         0 Automatic 4-spd 22.0000      0  31.0000
## 2          0        0         0 Automatic 5-spd 16.8405      0  24.5965
## 3          0        0         0  Automatic (S8) 26.3067      0  43.0230
## 4          0        0         0 Automatic 3-spd 16.2486      0  22.2888
## 5          0        0         0  Automatic (S8) 29.5784      0  44.1889
## 6          0        0         0 Automatic 5-spd 18.9000      0  30.0000
##   UHighwayA youSaveSpend guzzler trans_dscr tCharger sCharger atvType fuelType2
## 1         0        -3000                          NA                           
## 2         0        -5750              CLKUP       NA                           
## 3         0        -3000                        TRUE                           
## 4         0        -6750                          NA                           
## 5         0        -2000                        TRUE                           
## 6         0        -3500                          NA                           
##   rangeA evMotor mfrCode c240Dscr charge240b c240bDscr  createdOn modifiedOn
## 1                                          0           2013-01-01 2013-01-01
## 2                                          0           2013-01-01 2013-01-01
## 3                    BMX                   0           2017-08-10 2018-02-26
## 4                                          0           2013-01-01 2013-01-01
## 5                    BMX                   0           2017-08-10 2018-04-04
## 6                                          0           2013-01-01 2013-01-01
##   startStop phevCity phevHwy phevComb
## 1                  0       0        0
## 2                  0       0        0
## 3         Y        0       0        0
## 4                  0       0        0
## 5         N        0       0        0
## 6                  0       0        0
# Confirm that no NA remains in displ.
sum(is.na(subset_6b[["displ"]]))
## [1] 0

Another way, using complete.cases().

# complete.cases() is TRUE where displ is not NA, so only those rows are kept.
subset_6c <- fuel[complete.cases(fuel[["displ"]]), , drop = FALSE]
head(subset_6c, n = 6L)
##    make                 model year barrels08 barrelsA08 charge120 charge240
## 1 Volvo 240 DL/GL/Turbo Wagon 1984  17.34789          0         0         0
## 2  Ford          Explorer 4WD 1998  21.97400          0         0         0
## 3   BMW                  540i 2018  13.73375          0         0         0
## 4  Jeep          Wrangler 4WD 1998  23.54357          0         0         0
## 5  MINI   Cooper Clubman All4 2018  12.67731          0         0         0
## 6   BMW                 740il 1999  18.31167          0         0         0
##   city08 city08U cityA08 cityA08U cityCD cityE cityUF co2 co2A co2TailpipeAGpm
## 1     18       0       0        0      0     0      0  -1   -1               0
## 2     14       0       0        0      0     0      0  -1   -1               0
## 3     21      20       0        0      0     0      0 367   -1               0
## 4     13       0       0        0      0     0      0  -1   -1               0
## 5     23      23       0        0      0     0      0 341   -1               0
## 6     15       0       0        0      0     0      0  -1   -1               0
##   co2TailpipeGpm comb08 comb08U combA08 combA08U combE combinedCD combinedUF
## 1       467.7368     19       0       0        0     0          0          0
## 2       592.4667     15       0       0        0     0          0          0
## 3       367.0000     24      24       0        0     0          0          0
## 4       634.7857     14       0       0        0     0          0          0
## 5       341.0000     26      26       0        0     0          0          0
## 6       493.7222     18       0       0        0     0          0          0
##   cylinders displ                      drive engId eng_dscr feScore fuelCost08
## 1         4   2.3                            60060 CA model      -1       2100
## 2         6   4.0 4-Wheel or All-Wheel Drive     0     SOHC      -1       2650
## 3         6   3.0           Rear-Wheel Drive   540     SIDI       5       2100
## 4         6   4.0 4-Wheel or All-Wheel Drive     0               -1       2850
## 5         3   1.5            All-Wheel Drive    40     SIDI       5       1900
## 6         8   4.4           Rear-Wheel Drive     0               -1       2200
##   fuelCostA08 fuelType        fuelType1 ghgScore ghgScoreA highway08 highway08U
## 1           0  Regular Regular Gasoline       -1        -1        22          0
## 2           0  Regular Regular Gasoline       -1        -1        18          0
## 3           0  Premium Premium Gasoline        5        -1        30         30
## 4           0  Regular Regular Gasoline       -1        -1        16          0
## 5           0  Premium Premium Gasoline        5        -1        31         30
## 6           0  Regular Regular Gasoline       -1        -1        22          0
##   highwayA08 highwayA08U                      VClass highwayCD highwayE
## 1          0           0      Midsize Station Wagons         0        0
## 2          0           0 Special Purpose Vehicle 4WD         0        0
## 3          0           0                Midsize Cars         0        0
## 4          0           0 Special Purpose Vehicle 4WD         0        0
## 5          0           0                Midsize Cars         0        0
## 6          0           0                  Large Cars         0        0
##   highwayUF hlv hpv    id lv2 lv4 mpgData phevBlended pv2 pv4 range rangeCity
## 1         0   0   0 28216   0  41       N       FALSE   0  89     0         0
## 2         0   0   0 14722   0   0       Y       FALSE   0   0     0         0
## 3         0   0   0 39230   0  14       N       FALSE   0  99     0         0
## 4         0   0   0 14749   0   0       Y       FALSE   0   0     0         0
## 5         0  18  92 39235   0   0       N       FALSE   0   0     0         0
## 6         0   0   0 15174   0  13       N       FALSE   0 107     0         0
##   rangeCityA rangeHwy rangeHwyA           trany   UCity UCityA UHighway
## 1          0        0         0 Automatic 4-spd 22.0000      0  31.0000
## 2          0        0         0 Automatic 5-spd 16.8405      0  24.5965
## 3          0        0         0  Automatic (S8) 26.3067      0  43.0230
## 4          0        0         0 Automatic 3-spd 16.2486      0  22.2888
## 5          0        0         0  Automatic (S8) 29.5784      0  44.1889
## 6          0        0         0 Automatic 5-spd 18.9000      0  30.0000
##   UHighwayA youSaveSpend guzzler trans_dscr tCharger sCharger atvType fuelType2
## 1         0        -3000                          NA                           
## 2         0        -5750              CLKUP       NA                           
## 3         0        -3000                        TRUE                           
## 4         0        -6750                          NA                           
## 5         0        -2000                        TRUE                           
## 6         0        -3500                          NA                           
##   rangeA evMotor mfrCode c240Dscr charge240b c240bDscr  createdOn modifiedOn
## 1                                          0           2013-01-01 2013-01-01
## 2                                          0           2013-01-01 2013-01-01
## 3                    BMX                   0           2017-08-10 2018-02-26
## 4                                          0           2013-01-01 2013-01-01
## 5                    BMX                   0           2017-08-10 2018-04-04
## 6                                          0           2013-01-01 2013-01-01
##   startStop phevCity phevHwy phevComb
## 1                  0       0        0
## 2                  0       0        0
## 3         Y        0       0        0
## 4                  0       0        0
## 5         N        0       0        0
## 6                  0       0        0
# Confirm that no NA remains in displ.
sum(is.na(subset_6c[["displ"]]))
## [1] 0

7. Descriptives

# Per-column summary of the full data frame.
summary(object = fuel)
##      make              model                year        barrels08    
##  Length:41184       Length:41184       Min.   :1984   Min.   : 0.06  
##  Class :character   Class :character   1st Qu.:1991   1st Qu.:14.33  
##  Mode  :character   Mode  :character   Median :2002   Median :16.48  
##                                        Mean   :2002   Mean   :17.28  
##                                        3rd Qu.:2012   3rd Qu.:19.39  
##                                        Max.   :2020   Max.   :47.09  
##                                                                      
##    barrelsA08        charge120   charge240           city08      
##  Min.   : 0.0000   Min.   :0   Min.   : 0.0000   Min.   :  6.00  
##  1st Qu.: 0.0000   1st Qu.:0   1st Qu.: 0.0000   1st Qu.: 15.00  
##  Median : 0.0000   Median :0   Median : 0.0000   Median : 17.00  
##  Mean   : 0.1982   Mean   :0   Mean   : 0.0447   Mean   : 18.38  
##  3rd Qu.: 0.0000   3rd Qu.:0   3rd Qu.: 0.0000   3rd Qu.: 21.00  
##  Max.   :18.0000   Max.   :0   Max.   :13.0000   Max.   :150.00  
##                                                                  
##     city08U           cityA08            cityA08U            cityCD        
##  Min.   :  0.000   Min.   :  0.0000   Min.   :  0.0000   Min.   :0.000000  
##  1st Qu.:  0.000   1st Qu.:  0.0000   1st Qu.:  0.0000   1st Qu.:0.000000  
##  Median :  0.000   Median :  0.0000   Median :  0.0000   Median :0.000000  
##  Mean   :  5.865   Mean   :  0.6679   Mean   :  0.5095   Mean   :0.000243  
##  3rd Qu.: 14.000   3rd Qu.:  0.0000   3rd Qu.:  0.0000   3rd Qu.:0.000000  
##  Max.   :150.000   Max.   :145.0000   Max.   :145.0000   Max.   :5.000000  
##                                                                            
##      cityE              cityUF       co2              co2A        
##  Min.   :  0.0000   Min.   :0   Min.   : -1.00   Min.   : -1.000  
##  1st Qu.:  0.0000   1st Qu.:0   1st Qu.: -1.00   1st Qu.: -1.000  
##  Median :  0.0000   Median :0   Median : -1.00   Median : -1.000  
##  Mean   :  0.3248   Mean   :0   Mean   : 88.21   Mean   :  5.891  
##  3rd Qu.:  0.0000   3rd Qu.:0   3rd Qu.: -1.00   3rd Qu.: -1.000  
##  Max.   :122.0000   Max.   :0   Max.   :847.00   Max.   :713.000  
##                                                                   
##  co2TailpipeAGpm  co2TailpipeGpm       comb08          comb08U       
##  Min.   :  0.00   Min.   :   0.0   Min.   :  7.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.: 386.4   1st Qu.: 17.00   1st Qu.:  0.000  
##  Median :  0.00   Median : 444.4   Median : 20.00   Median :  0.000  
##  Mean   : 17.59   Mean   : 466.2   Mean   : 20.63   Mean   :  6.563  
##  3rd Qu.:  0.00   3rd Qu.: 522.8   3rd Qu.: 23.00   3rd Qu.: 16.000  
##  Max.   :713.00   Max.   :1269.6   Max.   :136.00   Max.   :136.000  
##                                                                      
##     combA08            combA08U           combE            combinedCD      
##  Min.   :  0.0000   Min.   :  0.000   Min.   :  0.0000   Min.   :0.000000  
##  1st Qu.:  0.0000   1st Qu.:  0.000   1st Qu.:  0.0000   1st Qu.:0.000000  
##  Median :  0.0000   Median :  0.000   Median :  0.0000   Median :0.000000  
##  Mean   :  0.7271   Mean   :  0.546   Mean   :  0.3319   Mean   :0.000194  
##  3rd Qu.:  0.0000   3rd Qu.:  0.000   3rd Qu.:  0.0000   3rd Qu.:0.000000  
##  Max.   :133.0000   Max.   :133.000   Max.   :121.0000   Max.   :4.000000  
##                                                                            
##    combinedUF   cylinders          displ          drive          
##  Min.   :0    Min.   : 2.000   Min.   :0.000   Length:41184      
##  1st Qu.:0    1st Qu.: 4.000   1st Qu.:2.200   Class :character  
##  Median :0    Median : 6.000   Median :3.000   Mode  :character  
##  Mean   :0    Mean   : 5.716   Mean   :3.293                     
##  3rd Qu.:0    3rd Qu.: 6.000   3rd Qu.:4.300                     
##  Max.   :0    Max.   :16.000   Max.   :8.400                     
##               NA's   :209      NA's   :207                       
##      engId         eng_dscr            feScore          fuelCost08  
##  Min.   :    0   Length:41184       Min.   :-1.0000   Min.   : 500  
##  1st Qu.:    0   Class :character   1st Qu.:-1.0000   1st Qu.:1800  
##  Median :  181   Mode  :character   Median :-1.0000   Median :2200  
##  Mean   : 8159                      Mean   : 0.3643   Mean   :2257  
##  3rd Qu.: 4190                      3rd Qu.:-1.0000   3rd Qu.:2650  
##  Max.   :69102                      Max.   :10.0000   Max.   :7150  
##                                                                     
##   fuelCostA08        fuelType          fuelType1            ghgScore      
##  Min.   :   0.00   Length:41184       Length:41184       Min.   :-1.0000  
##  1st Qu.:   0.00   Class :character   Class :character   1st Qu.:-1.0000  
##  Median :   0.00   Mode  :character   Mode  :character   Median :-1.0000  
##  Mean   :  98.75                                         Mean   : 0.3623  
##  3rd Qu.:   0.00                                         3rd Qu.:-1.0000  
##  Max.   :3950.00                                         Max.   :10.0000  
##                                                                           
##    ghgScoreA         highway08        highway08U       highwayA08      
##  Min.   :-1.0000   Min.   :  9.00   Min.   :  0.00   Min.   :  0.0000  
##  1st Qu.:-1.0000   1st Qu.: 20.00   1st Qu.:  0.00   1st Qu.:  0.0000  
##  Median :-1.0000   Median : 24.00   Median :  0.00   Median :  0.0000  
##  Mean   :-0.9207   Mean   : 24.51   Mean   :  7.77   Mean   :  0.8306  
##  3rd Qu.:-1.0000   3rd Qu.: 28.00   3rd Qu.: 20.00   3rd Qu.:  0.0000  
##  Max.   : 8.0000   Max.   :124.00   Max.   :124.00   Max.   :121.0000  
##                                                                        
##   highwayA08U          VClass            highwayCD           highwayE       
##  Min.   :  0.0000   Length:41184       Min.   :0.000000   Min.   :  0.0000  
##  1st Qu.:  0.0000   Class :character   1st Qu.:0.000000   1st Qu.:  0.0000  
##  Median :  0.0000   Mode  :character   Median :0.000000   Median :  0.0000  
##  Mean   :  0.6163                      Mean   :0.000194   Mean   :  0.3412  
##  3rd Qu.:  0.0000                      3rd Qu.:0.000000   3rd Qu.:  0.0000  
##  Max.   :121.0000                      Max.   :4.000000   Max.   :120.0000  
##                                                                             
##    highwayUF      hlv              hpv               id             lv2        
##  Min.   :0   Min.   : 0.000   Min.   :  0.00   Min.   :    1   Min.   : 0.000  
##  1st Qu.:0   1st Qu.: 0.000   1st Qu.:  0.00   1st Qu.:10297   1st Qu.: 0.000  
##  Median :0   Median : 0.000   Median :  0.00   Median :20594   Median : 0.000  
##  Mean   :0   Mean   : 2.005   Mean   : 10.26   Mean   :20710   Mean   : 1.805  
##  3rd Qu.:0   3rd Qu.: 0.000   3rd Qu.:  0.00   3rd Qu.:31159   3rd Qu.: 0.000  
##  Max.   :0   Max.   :49.000   Max.   :195.00   Max.   :41529   Max.   :41.000  
##                                                                                
##       lv4           mpgData          phevBlended          pv2        
##  Min.   : 0.000   Length:41184       Mode :logical   Min.   :  0.00  
##  1st Qu.: 0.000   Class :character   FALSE:41089     1st Qu.:  0.00  
##  Median : 0.000   Mode  :character   TRUE :95        Median :  0.00  
##  Mean   : 6.124                                      Mean   : 13.52  
##  3rd Qu.:13.000                                      3rd Qu.:  0.00  
##  Max.   :55.000                                      Max.   :194.00  
##                                                                      
##       pv4             range            rangeCity          rangeCityA       
##  Min.   :  0.00   Min.   :  0.0000   Min.   :  0.0000   Min.   :  0.00000  
##  1st Qu.:  0.00   1st Qu.:  0.0000   1st Qu.:  0.0000   1st Qu.:  0.00000  
##  Median :  0.00   Median :  0.0000   Median :  0.0000   Median :  0.00000  
##  Mean   : 33.83   Mean   :  0.8141   Mean   :  0.7832   Mean   :  0.08564  
##  3rd Qu.: 91.00   3rd Qu.:  0.0000   3rd Qu.:  0.0000   3rd Qu.:  0.00000  
##  Max.   :192.00   Max.   :370.0000   Max.   :381.0000   Max.   :135.00000  
##                                                                            
##     rangeHwy          rangeHwyA            trany               UCity       
##  Min.   :  0.0000   Min.   :  0.00000   Length:41184       Min.   :  0.00  
##  1st Qu.:  0.0000   1st Qu.:  0.00000   Class :character   1st Qu.: 18.25  
##  Median :  0.0000   Median :  0.00000   Mode  :character   Median : 21.40  
##  Mean   :  0.7515   Mean   :  0.07872                      Mean   : 23.22  
##  3rd Qu.:  0.0000   3rd Qu.:  0.00000                      3rd Qu.: 25.96  
##  Max.   :355.0000   Max.   :114.00000                      Max.   :224.80  
##                                                                            
##      UCityA            UHighway        UHighwayA        youSaveSpend   
##  Min.   :  0.0000   Min.   :  0.00   Min.   :  0.000   Min.   :-28250  
##  1st Qu.:  0.0000   1st Qu.: 27.90   1st Qu.:  0.000   1st Qu.: -5750  
##  Median :  0.0000   Median : 33.25   Median :  0.000   Median : -3500  
##  Mean   :  0.8472   Mean   : 34.36   Mean   :  1.129   Mean   : -3780  
##  3rd Qu.:  0.0000   3rd Qu.: 39.00   3rd Qu.:  0.000   3rd Qu.: -1500  
##  Max.   :207.0000   Max.   :182.70   Max.   :173.000   Max.   :  5000  
##                                                                        
##    guzzler           trans_dscr        tCharger         sCharger        
##  Length:41184       Length:41184       Mode:logical   Length:41184      
##  Class :character   Class :character   TRUE:6903      Class :character  
##  Mode  :character   Mode  :character   NA's:34281     Mode  :character  
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##    atvType           fuelType2            rangeA            evMotor         
##  Length:41184       Length:41184       Length:41184       Length:41184      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    mfrCode            c240Dscr           charge240b        c240bDscr        
##  Length:41184       Length:41184       Min.   :0.000000   Length:41184      
##  Class :character   Class :character   1st Qu.:0.000000   Class :character  
##  Mode  :character   Mode  :character   Median :0.000000   Mode  :character  
##                                        Mean   :0.009008                     
##                                        3rd Qu.:0.000000                     
##                                        Max.   :8.000000                     
##                                                                             
##   createdOn          modifiedOn         startStop            phevCity      
##  Length:41184       Length:41184       Length:41184       Min.   : 0.0000  
##  Class :character   Class :character   Class :character   1st Qu.: 0.0000  
##  Mode  :character   Mode  :character   Mode  :character   Median : 0.0000  
##                                                           Mean   : 0.1546  
##                                                           3rd Qu.: 0.0000  
##                                                           Max.   :97.0000  
##                                                                            
##     phevHwy           phevComb      
##  Min.   : 0.0000   Min.   : 0.0000  
##  1st Qu.: 0.0000   1st Qu.: 0.0000  
##  Median : 0.0000   Median : 0.0000  
##  Mean   : 0.1544   Mean   : 0.1538  
##  3rd Qu.: 0.0000   3rd Qu.: 0.0000  
##  Max.   :81.0000   Max.   :88.0000  
## 
# Minimum, quartiles, mean and maximum of a single numeric column.
summary(fuel[["fuelCost08"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     500    1800    2200    2257    2650    7150
# Frequency table: number of rows for each distinct value of the column.
table(fuel[["fuelCost08"]])
## 
##  500  550  600  650  700  750  800  850  900  950 1000 1050 1100 1150 1200 1250 
##    8   31   53   29   28   44   23   55   18   46  123   58   71   65  310  220 
## 1300 1350 1400 1450 1500 1550 1600 1650 1700 1750 1800 1850 1900 1950 2000 2100 
##  277  428  517  629  660 1010 1255 1469  157 1626 1895  300 2659    3 2561 2694 
## 2150 2200 2250 2300 2350 2400 2450 2500 2550 2650 2700 2800 2850 2900 2950 3050 
##  577 2467  833   35 1919 1309   26 2992   90 3324  173 1458 1568  102  593   32 
## 3100 3300 3350 3550 3650 3700 3850 4000 4150 4450 4550 5000 5550 6250 7150 
## 1630    6 1012  424  315    4  355   81  243   41  153   85    6    4    5
# Frequency table of cylinder counts. The rows where cylinders is NA
# (209 according to the summary of `fuel`) do not appear in this tally.
table(fuel[["cylinders"]])
## 
##     2     3     4     5     6     8    10    12    16 
##    59   279 15967   771 14290  8803   170   626    10
# Arithmetic mean, skipping missing values.
mean(fuel[["barrels08"]], na.rm = TRUE)
## [1] 17.27932
# Median, skipping missing values.
median(fuel[["city08"]], na.rm = TRUE)
## [1] 17
library(DescTools)

# Most frequent value (statistical mode); Mode() comes from DescTools and
# reports how often that value occurs in the "freq" attribute.
Mode(fuel[["make"]])
## [1] "Chevrolet"
## attr(,"freq")
## [1] 4012
# Largest value, skipping missing values.
max(fuel[["barrels08"]], na.rm = TRUE)
## [1] 47.08714
# Smallest value, skipping missing values.
min(fuel[["barrels08"]], na.rm = TRUE)
## [1] 0.06
# Number of elements in the column, i.e. the number of rows in `fuel`.
length(fuel[["model"]])
## [1] 41184
# Column total. na.rm = TRUE matches the other aggregates in this section,
# so a single missing value cannot turn the whole sum into NA.
sum(fuel$city08, na.rm = TRUE)
## [1] 756878
# Sample variance, skipping missing values.
var(fuel[["barrels08"]], na.rm = TRUE)
## [1] 21.39272
# Sample standard deviation, skipping missing values.
sd(fuel[["barrels08"]], na.rm = TRUE)
## [1] 4.625226

8. Sampling

8.1 Random sampling

Sample 10%.

library(dplyr)
# Fix the random seed so the same rows are drawn on every run.
set.seed(seed = 666)

# Draw a random 10% of the rows of `fuel`, then count the rows drawn.
subset_8a <- fuel %>% sample_frac(0.1)
nrow(subset_8a)
## [1] 4118

Sample 100 rows.

# Reset the seed, then draw exactly 100 random rows from `fuel`.
set.seed(seed = 666)
subset_8b <- fuel %>% sample_n(100)
nrow(subset_8b)
## [1] 100

8.2 Deleting rows

library(dplyr)

Delete even rows.

# Keep only the rows at odd positions (1, 3, 5, ...), i.e. drop the even ones.
# NOTE(review): this reuses the name subset_8b and therefore replaces the
# 100-row sample created in section 8.1 -- confirm that is intended.
subset_8b <- subset_8a %>%
  filter(row_number() %% 2 == 1)
nrow(subset_8b)
## [1] 2059
head(subset_8b, n = 6)
##        make             model year barrels08 barrelsA08 charge120 charge240
## 1       BMW     Z4 sDrive35is 2013  17.34789          0         0         0
## 2    Suzuki  Vitara 2Door 4WD 2001  14.33087          0         0         0
## 3    Nissan             300ZX 1989  18.31167          0         0         0
## 4      Ford Ranger Pickup 2WD 2005  18.31167          0         0         0
## 5 Chevrolet  K1500 Pickup 4WD 1989  27.46750          0         0         0
## 6      Ford   F150 Pickup 2WD 2005  21.97400          0         0         0
##   city08 city08U cityA08 cityA08U cityCD cityE cityUF co2 co2A co2TailpipeAGpm
## 1     17      16       0        0      0     0      0 458   -1               0
## 2     22       0       0        0      0     0      0  -1   -1               0
## 3     15       0       0        0      0     0      0  -1   -1               0
## 4     16       0       0        0      0     0      0  -1   -1               0
## 5     11       0       0        0      0     0      0  -1   -1               0
## 6     13       0       0        0      0     0      0  -1   -1               0
##   co2TailpipeGpm comb08 comb08U combA08 combA08U combE combinedCD combinedUF
## 1       458.0000     19      19       0        0     0          0          0
## 2       386.3913     23       0       0        0     0          0          0
## 3       493.7222     18       0       0        0     0          0          0
## 4       493.7222     18       0       0        0     0          0          0
## 5       740.5833     12       0       0        0     0          0          0
## 6       592.4667     15       0       0        0     0          0          0
##   cylinders displ                      drive engId eng_dscr feScore fuelCost08
## 1         6   3.0           Rear-Wheel Drive   438     SIDI       4       2650
## 2         4   1.6 4-Wheel or All-Wheel Drive     0               -1       1750
## 3         6   3.0           Rear-Wheel Drive 38051    (FFS)      -1       2200
## 4         6   4.0           Rear-Wheel Drive     0               -1       2200
## 5         8   5.0 4-Wheel or All-Wheel Drive  4934    (FFS)      -1       3350
## 6         8   5.4           Rear-Wheel Drive     0               -1       2650
##   fuelCostA08 fuelType        fuelType1 ghgScore ghgScoreA highway08 highway08U
## 1           0  Premium Premium Gasoline        4        -1        23         23
## 2           0  Regular Regular Gasoline       -1        -1        25          0
## 3           0  Regular Regular Gasoline       -1        -1        23          0
## 4           0  Regular Regular Gasoline       -1        -1        21          0
## 5           0  Regular Regular Gasoline       -1        -1        13          0
## 6           0  Regular Regular Gasoline       -1        -1        17          0
##   highwayA08 highwayA08U                      VClass highwayCD highwayE
## 1          0           0                 Two Seaters         0        0
## 2          0           0 Sport Utility Vehicle - 4WD         0        0
## 3          0           0                 Two Seaters         0        0
## 4          0           0  Standard Pickup Trucks 2WD         0        0
## 5          0           0      Standard Pickup Trucks         0        0
## 6          0           0  Standard Pickup Trucks 2WD         0        0
##   highwayUF hlv hpv    id lv2 lv4 mpgData phevBlended pv2 pv4 range rangeCity
## 1         0   0   0 32859   0   0       N       FALSE   0   0     0         0
## 2         0   0   0 17285   0   0       Y       FALSE   0   0     0         0
## 3         0  23  49  5254   0   0       Y       FALSE   0   0     0         0
## 4         0   0   0 21116   0   0       N       FALSE   0   0     0         0
## 5         0   0   0  6057   0   0       N       FALSE   0   0     0         0
## 6         0   0   0 21110   0   0       Y       FALSE   0   0     0         0
##   rangeCityA rangeHwy rangeHwyA             trany   UCity UCityA UHighway
## 1          0        0         0 Automatic (AM-S7) 21.1097      0  32.8224
## 2          0        0         0      Manual 5-spd 27.8499      0  34.7974
## 3          0        0         0      Manual 5-spd 19.0000      0  32.0000
## 4          0        0         0      Manual 5-spd 19.7000      0  29.3000
## 5          0        0         0   Automatic 3-spd 14.0000      0  18.0000
## 6          0        0         0   Automatic 4-spd 15.8603      0  23.9027
##   UHighwayA youSaveSpend guzzler trans_dscr tCharger sCharger atvType fuelType2
## 1         0        -5750                        TRUE                           
## 2         0        -1250                          NA                           
## 3         0        -3500                          NA                           
## 4         0        -3500                          NA                           
## 5         0        -9250                          NA                           
## 6         0        -5750              CLKUP       NA                           
##   rangeA evMotor mfrCode c240Dscr charge240b c240bDscr  createdOn modifiedOn
## 1                    BMX                   0           2013-01-01 2017-04-05
## 2                                          0           2013-01-01 2013-01-01
## 3                                          0           2013-01-01 2013-01-01
## 4                                          0           2013-01-01 2013-01-01
## 5                                          0           2013-01-01 2013-01-01
## 6                                          0           2013-01-01 2013-01-01
##   startStop phevCity phevHwy phevComb
## 1         N        0       0        0
## 2                  0       0        0
## 3                  0       0        0
## 4                  0       0        0
## 5                  0       0        0
## 6                  0       0        0

Delete odd rows.

# Keep only the rows at even positions (2, 4, 6, ...), i.e. drop the odd ones.
subset_8c <- subset_8a %>%
  filter(row_number() %% 2 == 0)
nrow(subset_8c)
## [1] 2059
head(subset_8c, n = 6)
##        make             model year barrels08 barrelsA08 charge120 charge240
## 1      Ford Ranger Pickup 2WD 1984  17.34789          0         0         0
## 2 Chevrolet         Celebrity 1984  15.92437          0         0         0
## 3      Saab               900 1991  16.48050          0         0         0
## 4      Audi         Cabriolet 1997  17.34789          0         0         0
## 5    Nissan         Truck 4WD 1989  21.97400          0         0         0
## 6 Chevrolet    K10 Pickup 4WD 1985  27.46750          0         0         0
##   city08 city08U cityA08 cityA08U cityCD cityE cityUF co2 co2A co2TailpipeAGpm
## 1     17       0       0        0      0     0      0  -1   -1               0
## 2     21       0       0        0      0     0      0  -1   -1               0
## 3     18       0       0        0      0     0      0  -1   -1               0
## 4     17       0       0        0      0     0      0  -1   -1               0
## 5     14       0       0        0      0     0      0  -1   -1               0
## 6     11       0       0        0      0     0      0  -1   -1               0
##   co2TailpipeGpm comb08 comb08U combA08 combA08U combE combinedCD combinedUF
## 1       467.7368     19       0       0        0     0          0          0
## 2       424.1667     24       0       0        0     0          0          0
## 3       444.3500     20       0       0        0     0          0          0
## 4       467.7368     19       0       0        0     0          0          0
## 5       592.4667     15       0       0        0     0          0          0
## 6       740.5833     12       0       0        0     0          0          0
##   cylinders displ                      drive engId          eng_dscr feScore
## 1         4   2.3              2-Wheel Drive  3896                        -1
## 2         6   4.3                             4322 (DIESEL) CA model      -1
## 3         4   2.1          Front-Wheel Drive 47010             (FFS)      -1
## 4         6   2.8          Front-Wheel Drive 64011             (FFS)      -1
## 5         6   3.0 4-Wheel or All-Wheel Drive 38092             (FFS)      -1
## 6         8   5.0 4-Wheel or All-Wheel Drive  4962         (GM-CHEV)      -1
##   fuelCost08 fuelCostA08 fuelType        fuelType1 ghgScore ghgScoreA highway08
## 1       2100           0  Regular Regular Gasoline       -1        -1        22
## 2       1900           0   Diesel           Diesel       -1        -1        29
## 3       2000           0  Regular Regular Gasoline       -1        -1        24
## 4       2650           0  Premium Premium Gasoline       -1        -1        22
## 5       2650           0  Regular Regular Gasoline       -1        -1        17
## 6       3350           0  Regular Regular Gasoline       -1        -1        15
##   highway08U highwayA08 highwayA08U                  VClass highwayCD highwayE
## 1          0          0           0 Small Pickup Trucks 2WD         0        0
## 2          0          0           0            Midsize Cars         0        0
## 3          0          0           0            Compact Cars         0        0
## 4          0          0           0        Minicompact Cars         0        0
## 5          0          0           0  Standard Pickup Trucks         0        0
## 6          0          0           0  Standard Pickup Trucks         0        0
##   highwayUF hlv hpv    id lv2 lv4 mpgData phevBlended pv2 pv4 range rangeCity
## 1         0   0   0 27238   0   0       N       FALSE   0   0     0         0
## 2         0   0   0 28019  16  16       N       FALSE  98  98     0         0
## 3         0  22  88  7872   0  14       N       FALSE   0  89     0         0
## 4         0   0   0 13281   7   0       N       FALSE  71   0     0         0
## 5         0   0   0  6082   0   0       N       FALSE   0   0     0         0
## 6         0   0   0   853   0   0       N       FALSE   0   0     0         0
##   rangeCityA rangeHwy rangeHwyA           trany   UCity UCityA UHighway
## 1          0        0         0    Manual 5-spd 21.0000      0  30.0000
## 2          0        0         0 Automatic 3-spd 26.0000      0  41.0000
## 3          0        0         0    Manual 5-spd 22.0000      0  33.3333
## 4          0        0         0 Automatic 4-spd 21.0000      0  31.0000
## 5          0        0         0    Manual 5-spd 17.0000      0  24.0000
## 6          0        0         0    Manual 4-spd 13.3333      0  21.0000
##   UHighwayA youSaveSpend guzzler trans_dscr tCharger sCharger atvType fuelType2
## 1         0        -3000                          NA                           
## 2         0        -2000                          NA           Diesel          
## 3         0        -2500                SIL       NA                           
## 4         0        -5750              CLKUP       NA                           
## 5         0        -5750                          NA                           
## 6         0        -9250                          NA                           
##   rangeA evMotor mfrCode c240Dscr charge240b c240bDscr  createdOn modifiedOn
## 1                                          0           2013-01-01 2013-01-01
## 2                                          0           2013-01-01 2013-01-01
## 3                                          0           2013-01-01 2013-01-01
## 4                                          0           2013-01-01 2013-01-01
## 5                                          0           2013-01-01 2013-01-01
## 6                                          0           2013-01-01 2013-01-01
##   startStop phevCity phevHwy phevComb
## 1                  0       0        0
## 2                  0       0        0
## 3                  0       0        0
## 4                  0       0        0
## 5                  0       0        0
## 6                  0       0        0

Delete every 3rd row, starting from row 1 (rows 1, 4, 7, ...).

# Drop rows 1, 4, 7, ... (positions whose remainder modulo 3 is 1) and keep
# everything else.
subset_8d <- subset_8a %>%
  filter(!(row_number() %% 3 == 1))
nrow(subset_8d)
## [1] 2745
head(subset_8d, n = 6)
##        make             model year barrels08 barrelsA08 charge120 charge240
## 1      Ford Ranger Pickup 2WD 1984  17.34789          0         0         0
## 2    Suzuki  Vitara 2Door 4WD 2001  14.33087          0         0         0
## 3    Nissan             300ZX 1989  18.31167          0         0         0
## 4      Saab               900 1991  16.48050          0         0         0
## 5      Audi         Cabriolet 1997  17.34789          0         0         0
## 6 Chevrolet  K1500 Pickup 4WD 1989  27.46750          0         0         0
##   city08 city08U cityA08 cityA08U cityCD cityE cityUF co2 co2A co2TailpipeAGpm
## 1     17       0       0        0      0     0      0  -1   -1               0
## 2     22       0       0        0      0     0      0  -1   -1               0
## 3     15       0       0        0      0     0      0  -1   -1               0
## 4     18       0       0        0      0     0      0  -1   -1               0
## 5     17       0       0        0      0     0      0  -1   -1               0
## 6     11       0       0        0      0     0      0  -1   -1               0
##   co2TailpipeGpm comb08 comb08U combA08 combA08U combE combinedCD combinedUF
## 1       467.7368     19       0       0        0     0          0          0
## 2       386.3913     23       0       0        0     0          0          0
## 3       493.7222     18       0       0        0     0          0          0
## 4       444.3500     20       0       0        0     0          0          0
## 5       467.7368     19       0       0        0     0          0          0
## 6       740.5833     12       0       0        0     0          0          0
##   cylinders displ                      drive engId eng_dscr feScore fuelCost08
## 1         4   2.3              2-Wheel Drive  3896               -1       2100
## 2         4   1.6 4-Wheel or All-Wheel Drive     0               -1       1750
## 3         6   3.0           Rear-Wheel Drive 38051    (FFS)      -1       2200
## 4         4   2.1          Front-Wheel Drive 47010    (FFS)      -1       2000
## 5         6   2.8          Front-Wheel Drive 64011    (FFS)      -1       2650
## 6         8   5.0 4-Wheel or All-Wheel Drive  4934    (FFS)      -1       3350
##   fuelCostA08 fuelType        fuelType1 ghgScore ghgScoreA highway08 highway08U
## 1           0  Regular Regular Gasoline       -1        -1        22          0
## 2           0  Regular Regular Gasoline       -1        -1        25          0
## 3           0  Regular Regular Gasoline       -1        -1        23          0
## 4           0  Regular Regular Gasoline       -1        -1        24          0
## 5           0  Premium Premium Gasoline       -1        -1        22          0
## 6           0  Regular Regular Gasoline       -1        -1        13          0
##   highwayA08 highwayA08U                      VClass highwayCD highwayE
## 1          0           0     Small Pickup Trucks 2WD         0        0
## 2          0           0 Sport Utility Vehicle - 4WD         0        0
## 3          0           0                 Two Seaters         0        0
## 4          0           0                Compact Cars         0        0
## 5          0           0            Minicompact Cars         0        0
## 6          0           0      Standard Pickup Trucks         0        0
##   highwayUF hlv hpv    id lv2 lv4 mpgData phevBlended pv2 pv4 range rangeCity
## 1         0   0   0 27238   0   0       N       FALSE   0   0     0         0
## 2         0   0   0 17285   0   0       Y       FALSE   0   0     0         0
## 3         0  23  49  5254   0   0       Y       FALSE   0   0     0         0
## 4         0  22  88  7872   0  14       N       FALSE   0  89     0         0
## 5         0   0   0 13281   7   0       N       FALSE  71   0     0         0
## 6         0   0   0  6057   0   0       N       FALSE   0   0     0         0
##   rangeCityA rangeHwy rangeHwyA           trany   UCity UCityA UHighway
## 1          0        0         0    Manual 5-spd 21.0000      0  30.0000
## 2          0        0         0    Manual 5-spd 27.8499      0  34.7974
## 3          0        0         0    Manual 5-spd 19.0000      0  32.0000
## 4          0        0         0    Manual 5-spd 22.0000      0  33.3333
## 5          0        0         0 Automatic 4-spd 21.0000      0  31.0000
## 6          0        0         0 Automatic 3-spd 14.0000      0  18.0000
##   UHighwayA youSaveSpend guzzler trans_dscr tCharger sCharger atvType fuelType2
## 1         0        -3000                          NA                           
## 2         0        -1250                          NA                           
## 3         0        -3500                          NA                           
## 4         0        -2500                SIL       NA                           
## 5         0        -5750              CLKUP       NA                           
## 6         0        -9250                          NA                           
##   rangeA evMotor mfrCode c240Dscr charge240b c240bDscr  createdOn modifiedOn
## 1                                          0           2013-01-01 2013-01-01
## 2                                          0           2013-01-01 2013-01-01
## 3                                          0           2013-01-01 2013-01-01
## 4                                          0           2013-01-01 2013-01-01
## 5                                          0           2013-01-01 2013-01-01
## 6                                          0           2013-01-01 2013-01-01
##   startStop phevCity phevHwy phevComb
## 1                  0       0        0
## 2                  0       0        0
## 3                  0       0        0
## 4                  0       0        0
## 5                  0       0        0
## 6                  0       0        0

9. Unique values

9.1 Individual variables

# List every distinct make together with the number of rows it occurs in.
table(fuel[["make"]])
## 
##                              Acura                         Alfa Romeo 
##                                347                                 62 
##                         AM General        American Motors Corporation 
##                                  6                                 27 
##                   ASC Incorporated                       Aston Martin 
##                                  1                                156 
##                               Audi                    Aurora Cars Ltd 
##                               1002                                  1 
##                  Autokraft Limited           Avanti Motor Corporation 
##                                  4                                  2 
##                     Azure Dynamics                            Bentley 
##                                  2                                127 
##                            Bertone      Bill Dovell Motor Car Company 
##                                  7                                  4 
##             Bitter Gmbh and Co. Kg                                BMW 
##                                  5                               1966 
##                         BMW Alpina                            Bugatti 
##                                  3                                 10 
##                              Buick                                BYD 
##                                671                                  7 
##                           Cadillac                    CCC Engineering 
##                                583                                  2 
##                          Chevrolet                           Chrysler 
##                               4012                                726 
##                    CODA Automotive           Consulier Industries Inc 
##                                  2                                  3 
##                      CX Automotive         Dabryan Coach Builders Inc 
##                                 17                                  9 
##                              Dacia                             Daewoo 
##                                  3                                 67 
##                           Daihatsu                              Dodge 
##                                 17                               2583 
##                 E. P. Dutton, Inc.                              Eagle 
##                                  1                                161 
##   Environmental Rsch and Devp Corp                  Evans Automobiles 
##                                  1                                  3 
##                    Excalibur Autos                      Federal Coach 
##                                  1                                 14 
##                            Ferrari                               Fiat 
##                                229                                 71 
##                             Fisker                               Ford 
##                                  1                               3373 
##                     General Motors                            Genesis 
##                                  1                                 43 
##                                Geo                                GMC 
##                                147                               2497 
##                           Goldacre          Grumman Allied Industries 
##                                  1                                  1 
##                      Grumman Olson                              Honda 
##                                  4                               1001 
##                             Hummer                            Hyundai 
##                                 19                                783 
##      Import Foreign Auto Sales Inc              Import Trade Services 
##                                  1                                 13 
##                           Infiniti                   Isis Imports Ltd 
##                                411                                  1 
##                              Isuzu                        J.K. Motors 
##                                434                                 36 
##                             Jaguar                JBA Motorcars, Inc. 
##                                469                                  1 
##                               Jeep                              Karma 
##                                929                                  1 
##      Kenyon Corporation Of America                                Kia 
##                                  4                                613 
##                         Koenigsegg             Laforza Automobile Inc 
##                                  1                                  2 
##             Lambda Control Systems                        Lamborghini 
##                                  1                                128 
##                         Land Rover                              Lexus 
##                                202                                501 
##                            Lincoln                London Coach Co Inc 
##                                346                                  1 
##                        London Taxi                              Lotus 
##                                  1                                 59 
##                           Mahindra                           Maserati 
##                                  1                                136 
##                            Maybach                              Mazda 
##                                 31                               1005 
##                      Mcevoy Motors                 McLaren Automotive 
##                                  6                                 14 
##                      Mercedes-Benz                            Mercury 
##                               1540                                609 
##                             Merkur                               MINI 
##                                 14                                405 
##                         Mitsubishi              Mobility Ventures LLC 
##                               1067                                  4 
##                             Morgan                             Nissan 
##                                  3                               1471 
##                         Oldsmobile                             Pagani 
##                                462                                  3 
##                              Panos             Panoz Auto-Development 
##                                  1                                  1 
##        Panther Car Company Limited                      PAS Inc - GMC 
##                                  4                                  2 
##                           PAS, Inc                            Peugeot 
##                                  2                                 98 
##                        Pininfarina                           Plymouth 
##                                  6                                526 
##                            Pontiac                            Porsche 
##                                893                               1053 
##               Quantum Technologies                              Qvale 
##                                  2                                  1 
##                                Ram                     Red Shift Ltd. 
##                                 81                                  2 
##                            Renault                        Rolls-Royce 
##                                 56                                186 
##                  Roush Performance                     RUF Automobile 
##                                 63                                  1 
##                Ruf Automobile Gmbh S and S Coach Company  E.p. Dutton 
##                                  3                                  1 
##                               Saab                             Saleen 
##                                432                                  5 
##                 Saleen Performance                             Saturn 
##                                  5                                278 
##                              Scion                             Shelby 
##                                 84                                  1 
##                              smart                             Spyker 
##                                 38                                 13 
##                                SRT                           Sterling 
##                                  2                                 12 
##                             Subaru   Superior Coaches Div E.p. Dutton 
##                                885                                  1 
##                             Suzuki                        Tecstar, LP 
##                                515                                  6 
##                              Tesla                Texas Coach Company 
##                                 77                                  4 
##                             Toyota                TVR Engineering Ltd 
##                               2071                                  4 
##                             Vector                Vixen Motor Company 
##                                  4                                  1 
##        Volga Associated Automobile                         Volkswagen 
##                                  1                               1180 
##                              Volvo                                VPG 
##                                826                                  5 
##              Wallace Environmental                               Yugo 
##                                 32                                  8
# Count the rows recorded for each value of `year`.
table(fuel[["year"]])
## 
## 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 
## 1964 1701 1210 1247 1130 1153 1078 1132 1121 1093  982  967  773  762  812  852 
## 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 
##  840  911  975 1044 1122 1166 1104 1126 1187 1184 1109 1130 1152 1184 1225 1283 
## 2016 2017 2018 2019 2020 
## 1262 1293 1344 1320  246
# Keep one row per distinct `make`; drop = FALSE keeps the result a
# one-column data frame instead of collapsing it to a vector.
subset_9a <- unique(fuel[, "make", drop = FALSE])
head(subset_9a, n = 6L)
##     make
## 1  Volvo
## 2   Ford
## 3    BMW
## 4   Jeep
## 5   MINI
## 8 Toyota
# Number of distinct makes.
dim(subset_9a)[1L]
## [1] 136
# Keep one row per distinct (make, year) combination.
subset_9b <- unique(fuel[, c("make", "year")])
head(subset_9b, n = 6L)
##    make year
## 1 Volvo 1984
## 2  Ford 1998
## 3   BMW 2018
## 4  Jeep 1998
## 5  MINI 2018
## 6   BMW 1999
# Number of distinct (make, year) combinations.
dim(subset_9b)[1L]
## [1] 1751

9.2 Remove duplicate rows

library(dplyr)
# distinct() drops rows that are exact duplicates across all columns.
# The result is only displayed here, not stored.
head(distinct(.data = fuel), n = 6L)
##    make                 model year barrels08 barrelsA08 charge120 charge240
## 1 Volvo 240 DL/GL/Turbo Wagon 1984  17.34789          0         0         0
## 2  Ford          Explorer 4WD 1998  21.97400          0         0         0
## 3   BMW                  540i 2018  13.73375          0         0         0
## 4  Jeep          Wrangler 4WD 1998  23.54357          0         0         0
## 5  MINI   Cooper Clubman All4 2018  12.67731          0         0         0
## 6   BMW                 740il 1999  18.31167          0         0         0
##   city08 city08U cityA08 cityA08U cityCD cityE cityUF co2 co2A co2TailpipeAGpm
## 1     18       0       0        0      0     0      0  -1   -1               0
## 2     14       0       0        0      0     0      0  -1   -1               0
## 3     21      20       0        0      0     0      0 367   -1               0
## 4     13       0       0        0      0     0      0  -1   -1               0
## 5     23      23       0        0      0     0      0 341   -1               0
## 6     15       0       0        0      0     0      0  -1   -1               0
##   co2TailpipeGpm comb08 comb08U combA08 combA08U combE combinedCD combinedUF
## 1       467.7368     19       0       0        0     0          0          0
## 2       592.4667     15       0       0        0     0          0          0
## 3       367.0000     24      24       0        0     0          0          0
## 4       634.7857     14       0       0        0     0          0          0
## 5       341.0000     26      26       0        0     0          0          0
## 6       493.7222     18       0       0        0     0          0          0
##   cylinders displ                      drive engId eng_dscr feScore fuelCost08
## 1         4   2.3                            60060 CA model      -1       2100
## 2         6   4.0 4-Wheel or All-Wheel Drive     0     SOHC      -1       2650
## 3         6   3.0           Rear-Wheel Drive   540     SIDI       5       2100
## 4         6   4.0 4-Wheel or All-Wheel Drive     0               -1       2850
## 5         3   1.5            All-Wheel Drive    40     SIDI       5       1900
## 6         8   4.4           Rear-Wheel Drive     0               -1       2200
##   fuelCostA08 fuelType        fuelType1 ghgScore ghgScoreA highway08 highway08U
## 1           0  Regular Regular Gasoline       -1        -1        22          0
## 2           0  Regular Regular Gasoline       -1        -1        18          0
## 3           0  Premium Premium Gasoline        5        -1        30         30
## 4           0  Regular Regular Gasoline       -1        -1        16          0
## 5           0  Premium Premium Gasoline        5        -1        31         30
## 6           0  Regular Regular Gasoline       -1        -1        22          0
##   highwayA08 highwayA08U                      VClass highwayCD highwayE
## 1          0           0      Midsize Station Wagons         0        0
## 2          0           0 Special Purpose Vehicle 4WD         0        0
## 3          0           0                Midsize Cars         0        0
## 4          0           0 Special Purpose Vehicle 4WD         0        0
## 5          0           0                Midsize Cars         0        0
## 6          0           0                  Large Cars         0        0
##   highwayUF hlv hpv    id lv2 lv4 mpgData phevBlended pv2 pv4 range rangeCity
## 1         0   0   0 28216   0  41       N       FALSE   0  89     0         0
## 2         0   0   0 14722   0   0       Y       FALSE   0   0     0         0
## 3         0   0   0 39230   0  14       N       FALSE   0  99     0         0
## 4         0   0   0 14749   0   0       Y       FALSE   0   0     0         0
## 5         0  18  92 39235   0   0       N       FALSE   0   0     0         0
## 6         0   0   0 15174   0  13       N       FALSE   0 107     0         0
##   rangeCityA rangeHwy rangeHwyA           trany   UCity UCityA UHighway
## 1          0        0         0 Automatic 4-spd 22.0000      0  31.0000
## 2          0        0         0 Automatic 5-spd 16.8405      0  24.5965
## 3          0        0         0  Automatic (S8) 26.3067      0  43.0230
## 4          0        0         0 Automatic 3-spd 16.2486      0  22.2888
## 5          0        0         0  Automatic (S8) 29.5784      0  44.1889
## 6          0        0         0 Automatic 5-spd 18.9000      0  30.0000
##   UHighwayA youSaveSpend guzzler trans_dscr tCharger sCharger atvType fuelType2
## 1         0        -3000                          NA                           
## 2         0        -5750              CLKUP       NA                           
## 3         0        -3000                        TRUE                           
## 4         0        -6750                          NA                           
## 5         0        -2000                        TRUE                           
## 6         0        -3500                          NA                           
##   rangeA evMotor mfrCode c240Dscr charge240b c240bDscr  createdOn modifiedOn
## 1                                          0           2013-01-01 2013-01-01
## 2                                          0           2013-01-01 2013-01-01
## 3                    BMX                   0           2017-08-10 2018-02-26
## 4                                          0           2013-01-01 2013-01-01
## 5                    BMX                   0           2017-08-10 2018-04-04
## 6                                          0           2013-01-01 2013-01-01
##   startStop phevCity phevHwy phevComb
## 1                  0       0        0
## 2                  0       0        0
## 3         Y        0       0        0
## 4                  0       0        0
## 5         N        0       0        0
## 6                  0       0        0

10. Recode data

10.1 One value

# Work on a copy so `fuel` keeps its numeric highway08 column.
subset_10a <- fuel

# Two-way recode: values below 25 become "blah", everything else "yay".
# Missing values stay NA because ifelse() propagates an NA test.
subset_10a[["highway08"]] <- ifelse(
  subset_10a[["highway08"]] < 25,
  "blah",
  "yay"
)
unique(subset_10a[, "highway08", drop = FALSE])
##   highway08
## 1      blah
## 3       yay
# Peek at the first 20 recoded values.
head(subset_10a[["highway08"]], n = 20)
##  [1] "blah" "blah" "yay"  "blah" "yay"  "blah" "blah" "yay"  "blah" "blah"
## [11] "yay"  "blah" "yay"  "yay"  "yay"  "yay"  "yay"  "yay"  "yay"  "yay"

10.2 Multiple values

# List every column name in `fuel`.
colnames(fuel)
##  [1] "make"            "model"           "year"            "barrels08"      
##  [5] "barrelsA08"      "charge120"       "charge240"       "city08"         
##  [9] "city08U"         "cityA08"         "cityA08U"        "cityCD"         
## [13] "cityE"           "cityUF"          "co2"             "co2A"           
## [17] "co2TailpipeAGpm" "co2TailpipeGpm"  "comb08"          "comb08U"        
## [21] "combA08"         "combA08U"        "combE"           "combinedCD"     
## [25] "combinedUF"      "cylinders"       "displ"           "drive"          
## [29] "engId"           "eng_dscr"        "feScore"         "fuelCost08"     
## [33] "fuelCostA08"     "fuelType"        "fuelType1"       "ghgScore"       
## [37] "ghgScoreA"       "highway08"       "highway08U"      "highwayA08"     
## [41] "highwayA08U"     "VClass"          "highwayCD"       "highwayE"       
## [45] "highwayUF"       "hlv"             "hpv"             "id"             
## [49] "lv2"             "lv4"             "mpgData"         "phevBlended"    
## [53] "pv2"             "pv4"             "range"           "rangeCity"      
## [57] "rangeCityA"      "rangeHwy"        "rangeHwyA"       "trany"          
## [61] "UCity"           "UCityA"          "UHighway"        "UHighwayA"      
## [65] "youSaveSpend"    "guzzler"         "trans_dscr"      "tCharger"       
## [69] "sCharger"        "atvType"         "fuelType2"       "rangeA"         
## [73] "evMotor"         "mfrCode"         "c240Dscr"        "charge240b"     
## [77] "c240bDscr"       "createdOn"       "modifiedOn"      "startStop"      
## [81] "phevCity"        "phevHwy"         "phevComb"
# Show the distinct values of fuelType1 (kept as a one-column data frame).
unique(fuel[, "fuelType1", drop = FALSE])
##             fuelType1
## 1    Regular Gasoline
## 3    Premium Gasoline
## 11        Electricity
## 42             Diesel
## 378 Midgrade Gasoline
## 777       Natural Gas

10.2.1 Recode into the same variable

# Copy the data, then overwrite fuelType1 in place with short codes.
subset_10b <- fuel
# Each old value is matched independently, so the order of the pairs
# below does not affect the result.
subset_10b$fuelType1 <- dplyr::recode(
  subset_10b$fuelType1,
  "Diesel" = "D",
  "Electricity" = "E",
  "Midgrade Gasoline" = "MG",
  "Natural Gas" = "NG",
  "Premium Gasoline" = "PG",
  "Regular Gasoline" = "RG"
)
head(subset_10b, n = 6L)
##    make                 model year barrels08 barrelsA08 charge120 charge240
## 1 Volvo 240 DL/GL/Turbo Wagon 1984  17.34789          0         0         0
## 2  Ford          Explorer 4WD 1998  21.97400          0         0         0
## 3   BMW                  540i 2018  13.73375          0         0         0
## 4  Jeep          Wrangler 4WD 1998  23.54357          0         0         0
## 5  MINI   Cooper Clubman All4 2018  12.67731          0         0         0
## 6   BMW                 740il 1999  18.31167          0         0         0
##   city08 city08U cityA08 cityA08U cityCD cityE cityUF co2 co2A co2TailpipeAGpm
## 1     18       0       0        0      0     0      0  -1   -1               0
## 2     14       0       0        0      0     0      0  -1   -1               0
## 3     21      20       0        0      0     0      0 367   -1               0
## 4     13       0       0        0      0     0      0  -1   -1               0
## 5     23      23       0        0      0     0      0 341   -1               0
## 6     15       0       0        0      0     0      0  -1   -1               0
##   co2TailpipeGpm comb08 comb08U combA08 combA08U combE combinedCD combinedUF
## 1       467.7368     19       0       0        0     0          0          0
## 2       592.4667     15       0       0        0     0          0          0
## 3       367.0000     24      24       0        0     0          0          0
## 4       634.7857     14       0       0        0     0          0          0
## 5       341.0000     26      26       0        0     0          0          0
## 6       493.7222     18       0       0        0     0          0          0
##   cylinders displ                      drive engId eng_dscr feScore fuelCost08
## 1         4   2.3                            60060 CA model      -1       2100
## 2         6   4.0 4-Wheel or All-Wheel Drive     0     SOHC      -1       2650
## 3         6   3.0           Rear-Wheel Drive   540     SIDI       5       2100
## 4         6   4.0 4-Wheel or All-Wheel Drive     0               -1       2850
## 5         3   1.5            All-Wheel Drive    40     SIDI       5       1900
## 6         8   4.4           Rear-Wheel Drive     0               -1       2200
##   fuelCostA08 fuelType fuelType1 ghgScore ghgScoreA highway08 highway08U
## 1           0  Regular        RG       -1        -1        22          0
## 2           0  Regular        RG       -1        -1        18          0
## 3           0  Premium        PG        5        -1        30         30
## 4           0  Regular        RG       -1        -1        16          0
## 5           0  Premium        PG        5        -1        31         30
## 6           0  Regular        RG       -1        -1        22          0
##   highwayA08 highwayA08U                      VClass highwayCD highwayE
## 1          0           0      Midsize Station Wagons         0        0
## 2          0           0 Special Purpose Vehicle 4WD         0        0
## 3          0           0                Midsize Cars         0        0
## 4          0           0 Special Purpose Vehicle 4WD         0        0
## 5          0           0                Midsize Cars         0        0
## 6          0           0                  Large Cars         0        0
##   highwayUF hlv hpv    id lv2 lv4 mpgData phevBlended pv2 pv4 range rangeCity
## 1         0   0   0 28216   0  41       N       FALSE   0  89     0         0
## 2         0   0   0 14722   0   0       Y       FALSE   0   0     0         0
## 3         0   0   0 39230   0  14       N       FALSE   0  99     0         0
## 4         0   0   0 14749   0   0       Y       FALSE   0   0     0         0
## 5         0  18  92 39235   0   0       N       FALSE   0   0     0         0
## 6         0   0   0 15174   0  13       N       FALSE   0 107     0         0
##   rangeCityA rangeHwy rangeHwyA           trany   UCity UCityA UHighway
## 1          0        0         0 Automatic 4-spd 22.0000      0  31.0000
## 2          0        0         0 Automatic 5-spd 16.8405      0  24.5965
## 3          0        0         0  Automatic (S8) 26.3067      0  43.0230
## 4          0        0         0 Automatic 3-spd 16.2486      0  22.2888
## 5          0        0         0  Automatic (S8) 29.5784      0  44.1889
## 6          0        0         0 Automatic 5-spd 18.9000      0  30.0000
##   UHighwayA youSaveSpend guzzler trans_dscr tCharger sCharger atvType fuelType2
## 1         0        -3000                          NA                           
## 2         0        -5750              CLKUP       NA                           
## 3         0        -3000                        TRUE                           
## 4         0        -6750                          NA                           
## 5         0        -2000                        TRUE                           
## 6         0        -3500                          NA                           
##   rangeA evMotor mfrCode c240Dscr charge240b c240bDscr  createdOn modifiedOn
## 1                                          0           2013-01-01 2013-01-01
## 2                                          0           2013-01-01 2013-01-01
## 3                    BMX                   0           2017-08-10 2018-02-26
## 4                                          0           2013-01-01 2013-01-01
## 5                    BMX                   0           2017-08-10 2018-04-04
## 6                                          0           2013-01-01 2013-01-01
##   startStop phevCity phevHwy phevComb
## 1                  0       0        0
## 2                  0       0        0
## 3         Y        0       0        0
## 4                  0       0        0
## 5         N        0       0        0
## 6                  0       0        0

10.2.2 Recode into a different variable

library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
subset_10c <- fuel

# Map the long fuelType1 labels to short codes and store them in fuelType2,
# leaving fuelType1 untouched.
# NOTE: `fuel` already has a column named fuelType2 (see names(fuel)), so
# this assignment overwrites that column rather than appending a new one.
subset_10c$fuelType2 <- plyr::revalue(
  x = subset_10c$fuelType1,
  replace = c(
    "Regular Gasoline" = "RG",
    "Premium Gasoline" = "PG",
    "Electricity" = "E",
    "Diesel" = "D",
    "Midgrade Gasoline" = "MG",
    "Natural Gas" = "NG"
  )
)
head(subset_10c, n = 6L)
##    make                 model year barrels08 barrelsA08 charge120 charge240
## 1 Volvo 240 DL/GL/Turbo Wagon 1984  17.34789          0         0         0
## 2  Ford          Explorer 4WD 1998  21.97400          0         0         0
## 3   BMW                  540i 2018  13.73375          0         0         0
## 4  Jeep          Wrangler 4WD 1998  23.54357          0         0         0
## 5  MINI   Cooper Clubman All4 2018  12.67731          0         0         0
## 6   BMW                 740il 1999  18.31167          0         0         0
##   city08 city08U cityA08 cityA08U cityCD cityE cityUF co2 co2A co2TailpipeAGpm
## 1     18       0       0        0      0     0      0  -1   -1               0
## 2     14       0       0        0      0     0      0  -1   -1               0
## 3     21      20       0        0      0     0      0 367   -1               0
## 4     13       0       0        0      0     0      0  -1   -1               0
## 5     23      23       0        0      0     0      0 341   -1               0
## 6     15       0       0        0      0     0      0  -1   -1               0
##   co2TailpipeGpm comb08 comb08U combA08 combA08U combE combinedCD combinedUF
## 1       467.7368     19       0       0        0     0          0          0
## 2       592.4667     15       0       0        0     0          0          0
## 3       367.0000     24      24       0        0     0          0          0
## 4       634.7857     14       0       0        0     0          0          0
## 5       341.0000     26      26       0        0     0          0          0
## 6       493.7222     18       0       0        0     0          0          0
##   cylinders displ                      drive engId eng_dscr feScore fuelCost08
## 1         4   2.3                            60060 CA model      -1       2100
## 2         6   4.0 4-Wheel or All-Wheel Drive     0     SOHC      -1       2650
## 3         6   3.0           Rear-Wheel Drive   540     SIDI       5       2100
## 4         6   4.0 4-Wheel or All-Wheel Drive     0               -1       2850
## 5         3   1.5            All-Wheel Drive    40     SIDI       5       1900
## 6         8   4.4           Rear-Wheel Drive     0               -1       2200
##   fuelCostA08 fuelType        fuelType1 ghgScore ghgScoreA highway08 highway08U
## 1           0  Regular Regular Gasoline       -1        -1        22          0
## 2           0  Regular Regular Gasoline       -1        -1        18          0
## 3           0  Premium Premium Gasoline        5        -1        30         30
## 4           0  Regular Regular Gasoline       -1        -1        16          0
## 5           0  Premium Premium Gasoline        5        -1        31         30
## 6           0  Regular Regular Gasoline       -1        -1        22          0
##   highwayA08 highwayA08U                      VClass highwayCD highwayE
## 1          0           0      Midsize Station Wagons         0        0
## 2          0           0 Special Purpose Vehicle 4WD         0        0
## 3          0           0                Midsize Cars         0        0
## 4          0           0 Special Purpose Vehicle 4WD         0        0
## 5          0           0                Midsize Cars         0        0
## 6          0           0                  Large Cars         0        0
##   highwayUF hlv hpv    id lv2 lv4 mpgData phevBlended pv2 pv4 range rangeCity
## 1         0   0   0 28216   0  41       N       FALSE   0  89     0         0
## 2         0   0   0 14722   0   0       Y       FALSE   0   0     0         0
## 3         0   0   0 39230   0  14       N       FALSE   0  99     0         0
## 4         0   0   0 14749   0   0       Y       FALSE   0   0     0         0
## 5         0  18  92 39235   0   0       N       FALSE   0   0     0         0
## 6         0   0   0 15174   0  13       N       FALSE   0 107     0         0
##   rangeCityA rangeHwy rangeHwyA           trany   UCity UCityA UHighway
## 1          0        0         0 Automatic 4-spd 22.0000      0  31.0000
## 2          0        0         0 Automatic 5-spd 16.8405      0  24.5965
## 3          0        0         0  Automatic (S8) 26.3067      0  43.0230
## 4          0        0         0 Automatic 3-spd 16.2486      0  22.2888
## 5          0        0         0  Automatic (S8) 29.5784      0  44.1889
## 6          0        0         0 Automatic 5-spd 18.9000      0  30.0000
##   UHighwayA youSaveSpend guzzler trans_dscr tCharger sCharger atvType fuelType2
## 1         0        -3000                          NA                         RG
## 2         0        -5750              CLKUP       NA                         RG
## 3         0        -3000                        TRUE                         PG
## 4         0        -6750                          NA                         RG
## 5         0        -2000                        TRUE                         PG
## 6         0        -3500                          NA                         RG
##   rangeA evMotor mfrCode c240Dscr charge240b c240bDscr  createdOn modifiedOn
## 1                                          0           2013-01-01 2013-01-01
## 2                                          0           2013-01-01 2013-01-01
## 3                    BMX                   0           2017-08-10 2018-02-26
## 4                                          0           2013-01-01 2013-01-01
## 5                    BMX                   0           2017-08-10 2018-04-04
## 6                                          0           2013-01-01 2013-01-01
##   startStop phevCity phevHwy phevComb
## 1                  0       0        0
## 2                  0       0        0
## 3         Y        0       0        0
## 4                  0       0        0
## 5         N        0       0        0
## 6                  0       0        0
# Frequency of each short fuel code.
table(subset_10c[["fuelType2"]])
## 
##     D     E    MG    NG    PG    RG 
##  1180   206   106    60 11778 27854

Convert the recoded column to a factor with an explicit level order.

# Convert the recoded column to a factor and fix the level order in one
# step. factor() accepts the character vector directly, so a separate
# as.factor() call beforehand is unnecessary.
subset_10c$fuelType2 <- factor(subset_10c$fuelType2,
                              levels = c("NG", "PG", "RG",
                                         "D", "E", "MG"))
class(subset_10c$fuelType2)
## [1] "factor"
# Counts now follow the explicit factor level order.
table(subset_10c[["fuelType2"]])
## 
##    NG    PG    RG     D     E    MG 
##    60 11778 27854  1180   206   106

10.2.3 Too many levels

# Start from the copy whose fuelType2 is already a factor.
subset_10d <- subset_10c

# Keep PG and RG as they are and lump every other level into "Other".
# The call is namespace-qualified so it cannot be masked by another package.
subset_10d$fuelType2 <- dplyr::recode(
  subset_10d$fuelType2,
  PG = "PG",
  RG = "RG",
  .default = "Other"
)
table(subset_10d[["fuelType2"]])
## 
## Other    PG    RG 
##  1552 11778 27854

10.3 Impute

Replace missing values in numeric columns with the column median.

subset_10e <- fuel

# Fill NAs in every numeric column with that column's median; other
# columns pass through unchanged. Assigning into `subset_10e[]` keeps the
# data frame structure while replacing all columns.
subset_10e[] <- lapply(subset_10e, function(col) {
  if (!is.numeric(col)) {
    return(col)
  }
  col[is.na(col)] <- median(col, na.rm = TRUE)
  col
})

Replace missing values in categorical columns with the mode (although this is potentially problematic, e.g. when several values tie for most frequent).

library(DescTools)

# Fill NAs in character/factor columns with the most frequent value.
# Mode() returns every tied value when several share the top frequency, so
# only the first one is used; otherwise a multi-value replacement would be
# recycled across the missing entries. Columns without NAs are returned
# untouched, which also avoids computing the mode when it is not needed.
subset_10e[] <- lapply(subset_10e, function(x) {
  if ((is.character(x) || is.factor(x)) && anyNA(x)) {
    replace(x, is.na(x), Mode(na.omit(x))[1])
  } else {
    x
  }
})

One logical column (tCharger) still contains NAs, because only numeric and categorical columns have been imputed so far.

# Count the remaining NAs per column; vapply() pins the result to one
# integer per column.
vapply(subset_10e, function(col) sum(is.na(col)), integer(1))
##            make           model            year       barrels08      barrelsA08 
##               0               0               0               0               0 
##       charge120       charge240          city08         city08U         cityA08 
##               0               0               0               0               0 
##        cityA08U          cityCD           cityE          cityUF             co2 
##               0               0               0               0               0 
##            co2A co2TailpipeAGpm  co2TailpipeGpm          comb08         comb08U 
##               0               0               0               0               0 
##         combA08        combA08U           combE      combinedCD      combinedUF 
##               0               0               0               0               0 
##       cylinders           displ           drive           engId        eng_dscr 
##               0               0               0               0               0 
##         feScore      fuelCost08     fuelCostA08        fuelType       fuelType1 
##               0               0               0               0               0 
##        ghgScore       ghgScoreA       highway08      highway08U      highwayA08 
##               0               0               0               0               0 
##     highwayA08U          VClass       highwayCD        highwayE       highwayUF 
##               0               0               0               0               0 
##             hlv             hpv              id             lv2             lv4 
##               0               0               0               0               0 
##         mpgData     phevBlended             pv2             pv4           range 
##               0               0               0               0               0 
##       rangeCity      rangeCityA        rangeHwy       rangeHwyA           trany 
##               0               0               0               0               0 
##           UCity          UCityA        UHighway       UHighwayA    youSaveSpend 
##               0               0               0               0               0 
##         guzzler      trans_dscr        tCharger        sCharger         atvType 
##               0               0           34281               0               0 
##       fuelType2          rangeA         evMotor         mfrCode        c240Dscr 
##               0               0               0               0               0 
##      charge240b       c240bDscr       createdOn      modifiedOn       startStop 
##               0               0               0               0               0 
##        phevCity         phevHwy        phevComb 
##               0               0               0

Fill the remaining NAs in the logical column.

# Fill missing values in logical columns with FALSE so the columns stay
# logical; a character filler would coerce the whole column to character.
# NOTE(review): assumes a missing flag means "absent" - confirm for tCharger.
subset_10e[] <- lapply(subset_10e, function(x) {
  if (is.logical(x)) {
    replace(x, is.na(x), FALSE)
  } else {
    x
  }
})

And there are no more NAs.

sum(is.na(subset_10e$tCharger))
## [1] 0

11. Training-validation split

# Fix the RNG seed so the split is reproducible.
set.seed(666)

# Draw 60% of the row numbers for training; the rest go to validation.
# seq_len() is safe for an empty data frame (1:0 would yield c(1, 0)), and
# floor() makes the truncation of the fractional sample size explicit.
all_rows <- seq_len(nrow(fuel))
train_index <- sample(all_rows, floor(0.6 * nrow(fuel)))
valid_index <- setdiff(all_rows, train_index)

train <- fuel[train_index, ]
nrow(train)
## [1] 24710
head(train)
##            make             model year barrels08 barrelsA08 charge120 charge240
## 12926       BMW     Z4 sDrive35is 2013  17.34789          0         0         0
## 13195      Ford Ranger Pickup 2WD 1984  17.34789          0         0         0
## 32645    Suzuki  Vitara 2Door 4WD 2001  14.33087          0         0         0
## 17036 Chevrolet         Celebrity 1984  15.92437          0         0         0
## 1074     Nissan             300ZX 1989  18.31167          0         0         0
## 9317       Saab               900 1991  16.48050          0         0         0
##       city08 city08U cityA08 cityA08U cityCD cityE cityUF co2 co2A
## 12926     17      16       0        0      0     0      0 458   -1
## 13195     17       0       0        0      0     0      0  -1   -1
## 32645     22       0       0        0      0     0      0  -1   -1
## 17036     21       0       0        0      0     0      0  -1   -1
## 1074      15       0       0        0      0     0      0  -1   -1
## 9317      18       0       0        0      0     0      0  -1   -1
##       co2TailpipeAGpm co2TailpipeGpm comb08 comb08U combA08 combA08U combE
## 12926               0       458.0000     19      19       0        0     0
## 13195               0       467.7368     19       0       0        0     0
## 32645               0       386.3913     23       0       0        0     0
## 17036               0       424.1667     24       0       0        0     0
## 1074                0       493.7222     18       0       0        0     0
## 9317                0       444.3500     20       0       0        0     0
##       combinedCD combinedUF cylinders displ                      drive engId
## 12926          0          0         6   3.0           Rear-Wheel Drive   438
## 13195          0          0         4   2.3              2-Wheel Drive  3896
## 32645          0          0         4   1.6 4-Wheel or All-Wheel Drive     0
## 17036          0          0         6   4.3                             4322
## 1074           0          0         6   3.0           Rear-Wheel Drive 38051
## 9317           0          0         4   2.1          Front-Wheel Drive 47010
##                eng_dscr feScore fuelCost08 fuelCostA08 fuelType
## 12926              SIDI       4       2650           0  Premium
## 13195                        -1       2100           0  Regular
## 32645                        -1       1750           0  Regular
## 17036 (DIESEL) CA model      -1       1900           0   Diesel
## 1074              (FFS)      -1       2200           0  Regular
## 9317              (FFS)      -1       2000           0  Regular
##              fuelType1 ghgScore ghgScoreA highway08 highway08U highwayA08
## 12926 Premium Gasoline        4        -1        23         23          0
## 13195 Regular Gasoline       -1        -1        22          0          0
## 32645 Regular Gasoline       -1        -1        25          0          0
## 17036           Diesel       -1        -1        29          0          0
## 1074  Regular Gasoline       -1        -1        23          0          0
## 9317  Regular Gasoline       -1        -1        24          0          0
##       highwayA08U                      VClass highwayCD highwayE highwayUF hlv
## 12926           0                 Two Seaters         0        0         0   0
## 13195           0     Small Pickup Trucks 2WD         0        0         0   0
## 32645           0 Sport Utility Vehicle - 4WD         0        0         0   0
## 17036           0                Midsize Cars         0        0         0   0
## 1074            0                 Two Seaters         0        0         0  23
## 9317            0                Compact Cars         0        0         0  22
##       hpv    id lv2 lv4 mpgData phevBlended pv2 pv4 range rangeCity rangeCityA
## 12926   0 32859   0   0       N       FALSE   0   0     0         0          0
## 13195   0 27238   0   0       N       FALSE   0   0     0         0          0
## 32645   0 17285   0   0       Y       FALSE   0   0     0         0          0
## 17036   0 28019  16  16       N       FALSE  98  98     0         0          0
## 1074   49  5254   0   0       Y       FALSE   0   0     0         0          0
## 9317   88  7872   0  14       N       FALSE   0  89     0         0          0
##       rangeHwy rangeHwyA             trany   UCity UCityA UHighway UHighwayA
## 12926        0         0 Automatic (AM-S7) 21.1097      0  32.8224         0
## 13195        0         0      Manual 5-spd 21.0000      0  30.0000         0
## 32645        0         0      Manual 5-spd 27.8499      0  34.7974         0
## 17036        0         0   Automatic 3-spd 26.0000      0  41.0000         0
## 1074         0         0      Manual 5-spd 19.0000      0  32.0000         0
## 9317         0         0      Manual 5-spd 22.0000      0  33.3333         0
##       youSaveSpend guzzler trans_dscr tCharger sCharger atvType fuelType2
## 12926        -5750                        TRUE                           
## 13195        -3000                          NA                           
## 32645        -1250                          NA                           
## 17036        -2000                          NA           Diesel          
## 1074         -3500                          NA                           
## 9317         -2500                SIL       NA                           
##       rangeA evMotor mfrCode c240Dscr charge240b c240bDscr  createdOn
## 12926                    BMX                   0           2013-01-01
## 13195                                          0           2013-01-01
## 32645                                          0           2013-01-01
## 17036                                          0           2013-01-01
## 1074                                           0           2013-01-01
## 9317                                           0           2013-01-01
##       modifiedOn startStop phevCity phevHwy phevComb
## 12926 2017-04-05         N        0       0        0
## 13195 2013-01-01                  0       0        0
## 32645 2013-01-01                  0       0        0
## 17036 2013-01-01                  0       0        0
## 1074  2013-01-01                  0       0        0
## 9317  2013-01-01                  0       0        0
valid <- fuel[valid_index, ]
nrow(valid)
## [1] 16474
head(valid)
##          make         model year barrels08 barrelsA08 charge120 charge240
## 3         BMW          540i 2018  13.73375          0         0         0
## 8      Toyota Avalon Hybrid 2017   8.24025          0         0         0
## 9  Alfa Romeo        Giulia 2018  16.48050          0         0         0
## 14    Lincoln       MKX FWD 2017  16.48050          0         0         0
## 17 Alfa Romeo            4C 2017  11.77179          0         0         0
## 22 Mitsubishi       3000 GT 1998  17.34789          0         0         0
##    city08 city08U cityA08 cityA08U cityCD cityE cityUF co2 co2A co2TailpipeAGpm
## 3      21      20       0        0      0     0      0 367   -1               0
## 8      40      40       0        0      0     0      0 223   -1               0
## 9      17      17       0        0      0     0      0 451   -1               0
## 14     18      17       0        0      0     0      0 444   -1               0
## 17     24      24       0        0      0     0      0 317   -1               0
## 22     17       0       0        0      0     0      0  -1   -1               0
##    co2TailpipeGpm comb08 comb08U combA08 combA08U combE combinedCD combinedUF
## 3        367.0000     24      24       0        0     0          0          0
## 8        223.0000     40      39       0        0     0          0          0
## 9        451.0000     20      19       0        0     0          0          0
## 14       444.0000     20      20       0        0     0          0          0
## 17       317.0000     28      27       0        0     0          0          0
## 22       467.7368     19       0       0        0     0          0          0
##    cylinders displ             drive engId eng_dscr feScore fuelCost08
## 3          6   3.0  Rear-Wheel Drive   540     SIDI       5       2100
## 8          4   2.5 Front-Wheel Drive    78                9       1000
## 9          6   2.9  Rear-Wheel Drive   401     SIDI       4       2500
## 14         6   2.7 Front-Wheel Drive   159     SIDI       4       2000
## 17         4   1.8  Rear-Wheel Drive    36     SIDI       6       1800
## 22         6   3.0 Front-Wheel Drive     0     SOHC      -1       2100
##    fuelCostA08 fuelType        fuelType1 ghgScore ghgScoreA highway08
## 3            0  Premium Premium Gasoline        5        -1        30
## 8            0  Regular Regular Gasoline        9        -1        39
## 9            0  Premium Premium Gasoline        4        -1        24
## 14           0  Regular Regular Gasoline        4        -1        25
## 17           0  Premium Premium Gasoline        6        -1        34
## 22           0  Regular Regular Gasoline       -1        -1        22
##    highway08U highwayA08 highwayA08U                          VClass highwayCD
## 3          30          0           0                    Midsize Cars         0
## 8          38          0           0                    Midsize Cars         0
## 9          24          0           0                    Midsize Cars         0
## 14         25          0           0 Small Sport Utility Vehicle 2WD         0
## 17         34          0           0                     Two Seaters         0
## 22          0          0           0                 Subcompact Cars         0
##    highwayE highwayUF hlv hpv    id lv2 lv4 mpgData phevBlended pv2 pv4 range
## 3         0         0   0   0 39230   0  14       N       FALSE   0  99     0
## 8         0         0   0   0 38295   0  14       Y       FALSE   0 104     0
## 9         0         0   0   0 39288   0  12       N       FALSE   0 100     0
## 14        0         0   0   0 38397   0   0       N       FALSE   0   0     0
## 17        0         0   0   0 38407   0   0       N       FALSE   0   0     0
## 22        0         0  11  82 14105   0   0       N       FALSE   0   0     0
##    rangeCity rangeCityA rangeHwy rangeHwyA             trany   UCity UCityA
## 3          0          0        0         0    Automatic (S8) 26.3067      0
## 8          0          0        0         0 Automatic (AV-S6) 56.0848      0
## 9          0          0        0         0   Automatic 8-spd 21.2313      0
## 14         0          0        0         0    Automatic (S6) 22.2000      0
## 17         0          0        0         0   Automatic (AM6) 28.7000      0
## 22         0          0        0         0      Manual 5-spd 20.9000      0
##    UHighway UHighwayA youSaveSpend guzzler trans_dscr tCharger sCharger atvType
## 3   43.0230         0        -3000                        TRUE                 
## 8   54.1578         0         2500                          NA           Hybrid
## 9   34.2507         0        -5000                        TRUE                 
## 14  38.7000         0        -2500                        TRUE                 
## 17  45.7000         0        -1500                        TRUE                 
## 22  31.1000         0        -3000                          NA                 
##    fuelType2 rangeA    evMotor mfrCode c240Dscr charge240b c240bDscr  createdOn
## 3                                  BMX                   0           2017-08-10
## 8                   245V Ni-MH     TYX                   0           2016-10-12
## 9                                  CRX                   0           2017-08-16
## 14                                 FMX                   0           2016-11-07
## 17                                 CRX                   0           2016-11-14
## 22                                                       0           2013-01-01
##    modifiedOn startStop phevCity phevHwy phevComb
## 3  2018-02-26         Y        0       0        0
## 8  2017-04-05         Y        0       0        0
## 9  2017-11-03         Y        0       0        0
## 14 2016-11-22         N        0       0        0
## 17 2017-04-05         N        0       0        0
## 22 2013-01-01                  0       0        0

12. Normalisation

str(fuel)
## 'data.frame':    41184 obs. of  83 variables:
##  $ make           : chr  "Volvo" "Ford" "BMW" "Jeep" ...
##  $ model          : chr  "240 DL/GL/Turbo Wagon" "Explorer 4WD" "540i" "Wrangler 4WD" ...
##  $ year           : int  1984 1998 2018 1998 2018 1999 1999 2017 2018 1999 ...
##  $ barrels08      : num  17.3 22 13.7 23.5 12.7 ...
##  $ barrelsA08     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ charge120      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ charge240      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ city08         : int  18 14 21 13 23 15 15 40 17 16 ...
##  $ city08U        : int  0 0 20 0 23 0 0 40 17 0 ...
##  $ cityA08        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cityA08U       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cityCD         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cityE          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cityUF         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ co2            : int  -1 -1 367 -1 341 -1 -1 223 451 -1 ...
##  $ co2A           : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ co2TailpipeAGpm: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ co2TailpipeGpm : num  468 592 367 635 341 ...
##  $ comb08         : int  19 15 24 14 26 18 18 40 20 17 ...
##  $ comb08U        : int  0 0 24 0 26 0 0 39 19 0 ...
##  $ combA08        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ combA08U       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ combE          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ combinedCD     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ combinedUF     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cylinders      : int  4 6 6 6 3 8 8 4 6 6 ...
##  $ displ          : num  2.3 4 3 4 1.5 4.4 4.6 2.5 2.9 4 ...
##  $ drive          : chr  "" "4-Wheel or All-Wheel Drive" "Rear-Wheel Drive" "4-Wheel or All-Wheel Drive" ...
##  $ engId          : int  60060 0 540 0 40 0 0 78 401 0 ...
##  $ eng_dscr       : chr  "CA model" "SOHC" "SIDI" "" ...
##  $ feScore        : int  -1 -1 5 -1 5 -1 -1 9 4 -1 ...
##  $ fuelCost08     : int  2100 2650 2100 2850 1900 2200 2200 1000 2500 2350 ...
##  $ fuelCostA08    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ fuelType       : chr  "Regular" "Regular" "Premium" "Regular" ...
##  $ fuelType1      : chr  "Regular Gasoline" "Regular Gasoline" "Premium Gasoline" "Regular Gasoline" ...
##  $ ghgScore       : int  -1 -1 5 -1 5 -1 -1 9 4 -1 ...
##  $ ghgScoreA      : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ highway08      : int  22 18 30 16 31 22 22 39 24 20 ...
##  $ highway08U     : int  0 0 30 0 30 0 0 38 24 0 ...
##  $ highwayA08     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ highwayA08U    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ VClass         : chr  "Midsize Station Wagons" "Special Purpose Vehicle 4WD" "Midsize Cars" "Special Purpose Vehicle 4WD" ...
##  $ highwayCD      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ highwayE       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ highwayUF      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hlv            : int  0 0 0 0 18 0 0 0 0 0 ...
##  $ hpv            : int  0 0 0 0 92 0 0 0 0 0 ...
##  $ id             : int  28216 14722 39230 14749 39235 15174 15189 38295 39288 15299 ...
##  $ lv2            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lv4            : int  41 0 14 0 0 13 21 14 12 0 ...
##  $ mpgData        : chr  "N" "Y" "N" "Y" ...
##  $ phevBlended    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ pv2            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pv4            : int  89 0 99 0 0 107 111 104 100 0 ...
##  $ range          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ rangeCity      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ rangeCityA     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ rangeHwy       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ rangeHwyA      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ trany          : chr  "Automatic 4-spd" "Automatic 5-spd" "Automatic (S8)" "Automatic 3-spd" ...
##  $ UCity          : num  22 16.8 26.3 16.2 29.6 ...
##  $ UCityA         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ UHighway       : num  31 24.6 43 22.3 44.2 ...
##  $ UHighwayA      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ youSaveSpend   : int  -3000 -5750 -3000 -6750 -2000 -3500 -3500 2500 -5000 -4250 ...
##  $ guzzler        : chr  "" "" "" "" ...
##  $ trans_dscr     : chr  "" "CLKUP" "" "" ...
##  $ tCharger       : logi  NA NA TRUE NA TRUE NA ...
##  $ sCharger       : chr  "" "" "" "" ...
##  $ atvType        : chr  "" "" "" "" ...
##  $ fuelType2      : chr  "" "" "" "" ...
##  $ rangeA         : chr  "" "" "" "" ...
##  $ evMotor        : chr  "" "" "" "" ...
##  $ mfrCode        : chr  "" "" "BMX" "" ...
##  $ c240Dscr       : chr  "" "" "" "" ...
##  $ charge240b     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ c240bDscr      : chr  "" "" "" "" ...
##  $ createdOn      : chr  "2013-01-01" "2013-01-01" "2017-08-10" "2013-01-01" ...
##  $ modifiedOn     : chr  "2013-01-01" "2013-01-01" "2018-02-26" "2013-01-01" ...
##  $ startStop      : chr  "" "" "Y" "" ...
##  $ phevCity       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ phevHwy        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ phevComb       : int  0 0 0 0 0 0 0 0 0 0 ...
subset_12 <- fuel[, c("barrels08", "co2TailpipeGpm",
                      "fuelCost08", "highway08")]

head(subset_12)
##   barrels08 co2TailpipeGpm fuelCost08 highway08
## 1  17.34789       467.7368       2100        22
## 2  21.97400       592.4667       2650        18
## 3  13.73375       367.0000       2100        30
## 4  23.54357       634.7857       2850        16
## 5  12.67731       341.0000       1900        31
## 6  18.31167       493.7222       2200        22
# Standardise each column with scale(); the result is a plain numeric matrix
# with one column per input column.
subset_12 <- vapply(
  subset_12,
  function(col) as.numeric(scale(col)),
  numeric(nrow(subset_12))
)
head(subset_12)
##        barrels08 co2TailpipeGpm  fuelCost08  highway08
## [1,]  0.01482686     0.01210683 -0.24915683 -0.3243303
## [2,]  1.01501694     1.02060586  0.62313758 -0.8404118
## [3,] -0.76657165    -0.80239771 -0.24915683  0.7078326
## [4,]  1.35436715     1.36277517  0.94033555 -1.0984525
## [5,] -0.99498044    -1.01261988 -0.56635480  0.8368529
## [6,]  0.22319979     0.22221079 -0.09055785 -0.3243303

13. Data format

Create unique records.

subset_13a <- unique(fuel[c("make", "model", "year", "barrels08")])

head(subset_13a)
##    make                 model year barrels08
## 1 Volvo 240 DL/GL/Turbo Wagon 1984  17.34789
## 2  Ford          Explorer 4WD 1998  21.97400
## 3   BMW                  540i 2018  13.73375
## 4  Jeep          Wrangler 4WD 1998  23.54357
## 5  MINI   Cooper Clubman All4 2018  12.67731
## 6   BMW                 740il 1999  18.31167
nrow(subset_13a)
## [1] 33217

A small data set for illustration.

# Keep only the first ten unique records for the reshaping examples.
subset_13a <- subset_13a[seq_len(10), ]
subset_13a
##          make                 model year barrels08
## 1       Volvo 240 DL/GL/Turbo Wagon 1984  17.34789
## 2        Ford          Explorer 4WD 1998  21.97400
## 3         BMW                  540i 2018  13.73375
## 4        Jeep          Wrangler 4WD 1998  23.54357
## 5        MINI   Cooper Clubman All4 2018  12.67731
## 6         BMW                 740il 1999  18.31167
## 7        Ford        Crown Victoria 1999  18.31167
## 8      Toyota         Avalon Hybrid 2017   8.24025
## 9  Alfa Romeo                Giulia 2018  16.48050
## 10       Ford     Ranger Pickup 2WD 1999  19.38882

13.1 Long to wide

Using count.

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Long to wide: one row per make/model, one column per year. Each cell holds
# the number of barrels08 entries for that combination.
subset_13b <- dcast(
  data = subset_13a,
  formula = make + model ~ year,
  fun.aggregate = length,
  value.var = "barrels08"
)
subset_13b
##          make                 model 1984 1998 1999 2017 2018
## 1  Alfa Romeo                Giulia    0    0    0    0    1
## 2         BMW                  540i    0    0    0    0    1
## 3         BMW                 740il    0    0    1    0    0
## 4        Ford        Crown Victoria    0    0    1    0    0
## 5        Ford          Explorer 4WD    0    1    0    0    0
## 6        Ford     Ranger Pickup 2WD    0    0    1    0    0
## 7        Jeep          Wrangler 4WD    0    1    0    0    0
## 8        MINI   Cooper Clubman All4    0    0    0    0    1
## 9      Toyota         Avalon Hybrid    0    0    0    1    0
## 10      Volvo 240 DL/GL/Turbo Wagon    1    0    0    0    0
nrow(subset_13b)
## [1] 10

Pivot using mean.

# Long to wide again, this time averaging barrels08 per make/model/year.
# na.rm = TRUE is forwarded to mean().
subset_13c <- dcast(
  data = subset_13a,
  formula = make + model ~ year,
  fun.aggregate = mean,
  na.rm = TRUE,
  value.var = "barrels08"
)
subset_13c
##          make                 model     1984     1998     1999    2017     2018
## 1  Alfa Romeo                Giulia      NaN      NaN      NaN     NaN 16.48050
## 2         BMW                  540i      NaN      NaN      NaN     NaN 13.73375
## 3         BMW                 740il      NaN      NaN 18.31167     NaN      NaN
## 4        Ford        Crown Victoria      NaN      NaN 18.31167     NaN      NaN
## 5        Ford          Explorer 4WD      NaN 21.97400      NaN     NaN      NaN
## 6        Ford     Ranger Pickup 2WD      NaN      NaN 19.38882     NaN      NaN
## 7        Jeep          Wrangler 4WD      NaN 23.54357      NaN     NaN      NaN
## 8        MINI   Cooper Clubman All4      NaN      NaN      NaN     NaN 12.67731
## 9      Toyota         Avalon Hybrid      NaN      NaN      NaN 8.24025      NaN
## 10      Volvo 240 DL/GL/Turbo Wagon 17.34789      NaN      NaN     NaN      NaN
nrow(subset_13c)
## [1] 10

13.2 Wide to long

# Wide to long: stack the per-year columns of the count table back into a
# `year` column, with the cell values stored under `barrels08`.
subset_13d <- melt(
  data = subset_13b,
  id.vars = c("make", "model"),
  value.name = "barrels08",
  variable.name = "year"
)
subset_13d
##          make                 model year barrels08
## 1  Alfa Romeo                Giulia 1984         0
## 2         BMW                  540i 1984         0
## 3         BMW                 740il 1984         0
## 4        Ford        Crown Victoria 1984         0
## 5        Ford          Explorer 4WD 1984         0
## 6        Ford     Ranger Pickup 2WD 1984         0
## 7        Jeep          Wrangler 4WD 1984         0
## 8        MINI   Cooper Clubman All4 1984         0
## 9      Toyota         Avalon Hybrid 1984         0
## 10      Volvo 240 DL/GL/Turbo Wagon 1984         1
## 11 Alfa Romeo                Giulia 1998         0
## 12        BMW                  540i 1998         0
## 13        BMW                 740il 1998         0
## 14       Ford        Crown Victoria 1998         0
## 15       Ford          Explorer 4WD 1998         1
## 16       Ford     Ranger Pickup 2WD 1998         0
## 17       Jeep          Wrangler 4WD 1998         1
## 18       MINI   Cooper Clubman All4 1998         0
## 19     Toyota         Avalon Hybrid 1998         0
## 20      Volvo 240 DL/GL/Turbo Wagon 1998         0
## 21 Alfa Romeo                Giulia 1999         0
## 22        BMW                  540i 1999         0
## 23        BMW                 740il 1999         1
## 24       Ford        Crown Victoria 1999         1
## 25       Ford          Explorer 4WD 1999         0
## 26       Ford     Ranger Pickup 2WD 1999         1
## 27       Jeep          Wrangler 4WD 1999         0
## 28       MINI   Cooper Clubman All4 1999         0
## 29     Toyota         Avalon Hybrid 1999         0
## 30      Volvo 240 DL/GL/Turbo Wagon 1999         0
## 31 Alfa Romeo                Giulia 2017         0
## 32        BMW                  540i 2017         0
## 33        BMW                 740il 2017         0
## 34       Ford        Crown Victoria 2017         0
## 35       Ford          Explorer 4WD 2017         0
## 36       Ford     Ranger Pickup 2WD 2017         0
## 37       Jeep          Wrangler 4WD 2017         0
## 38       MINI   Cooper Clubman All4 2017         0
## 39     Toyota         Avalon Hybrid 2017         1
## 40      Volvo 240 DL/GL/Turbo Wagon 2017         0
## 41 Alfa Romeo                Giulia 2018         1
## 42        BMW                  540i 2018         1
## 43        BMW                 740il 2018         0
## 44       Ford        Crown Victoria 2018         0
## 45       Ford          Explorer 4WD 2018         0
## 46       Ford     Ranger Pickup 2WD 2018         0
## 47       Jeep          Wrangler 4WD 2018         0
## 48       MINI   Cooper Clubman All4 2018         1
## 49     Toyota         Avalon Hybrid 2018         0
## 50      Volvo 240 DL/GL/Turbo Wagon 2018         0
nrow(subset_13d)
## [1] 50
table(subset_13d$barrels08)
## 
##  0  1 
## 40 10

Recover the original make/model/year rows (barrels08 now holds the count, not the original value).

# Keep the make/model/year combinations that actually occur. The count column
# is numeric, so it is compared numerically rather than with the string "1";
# `> 0` also keeps combinations that occur more than once.
subset_13e <- subset(subset_13d, barrels08 > 0)
subset_13e
##          make                 model year barrels08
## 10      Volvo 240 DL/GL/Turbo Wagon 1984         1
## 15       Ford          Explorer 4WD 1998         1
## 17       Jeep          Wrangler 4WD 1998         1
## 23        BMW                 740il 1999         1
## 24       Ford        Crown Victoria 1999         1
## 26       Ford     Ranger Pickup 2WD 1999         1
## 39     Toyota         Avalon Hybrid 2017         1
## 41 Alfa Romeo                Giulia 2018         1
## 42        BMW                  540i 2018         1
## 48       MINI   Cooper Clubman All4 2018         1

Compare columns.

library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
compare_df_cols(subset_13a, subset_13e)
##   column_name subset_13a subset_13e
## 1   barrels08    numeric    integer
## 2        make  character  character
## 3       model  character  character
## 4        year    integer     factor

14. Pivot tables

14.1 One categorical variable

# Mean barrelsA08 for each make.
subset_14a <- aggregate(
  barrelsA08 ~ make,
  FUN = mean,
  data = fuel
)
head(subset_14a)
##                          make barrelsA08
## 1                       Acura          0
## 2                  Alfa Romeo          0
## 3                  AM General          0
## 4 American Motors Corporation          0
## 5            ASC Incorporated          0
## 6                Aston Martin          0

14.2 Two categorical variables

# Mean barrelsA08 for each make/model pair.
subset_14b <- aggregate(
  barrelsA08 ~ make + model,
  FUN = mean,
  data = fuel
)
head(subset_14b)
##      make             model barrelsA08
## 1  Toyota   1-Ton Truck 2WD          0
## 2    Audi               100          0
## 3    Audi       100 quattro          0
## 4    Audi 100 quattro Wagon          0
## 5    Audi         100 Wagon          0
## 6 Pontiac              1000          0

14.3 Two numerical variables

# Mean of two measures at once: cbind() on the left-hand side sends both
# barrelsA08 and co2TailpipeGpm through FUN for each make/model pair.
subset_14c <- aggregate(
  cbind(barrelsA08, co2TailpipeGpm) ~ make + model,
  FUN = mean,
  data = fuel
)
head(subset_14c)
##      make             model barrelsA08 co2TailpipeGpm
## 1  Toyota   1-Ton Truck 2WD          0       453.0908
## 2    Audi               100          0       480.7295
## 3    Audi       100 quattro          0       496.9492
## 4    Audi 100 quattro Wagon          0       493.7222
## 5    Audi         100 Wagon          0       485.0604
## 6 Pontiac              1000          0       334.4084

14.4 Multiple functions

# Apply several summary functions in one pass: each(mean, median) yields a
# mean and a median for every measure per make/model pair.
# NOTE(review): each() is not defined in this section - presumably
# plyr::each; confirm the package is attached earlier in the file.
subset_14d <- aggregate(
  cbind(barrelsA08, co2TailpipeGpm) ~ make + model,
  FUN = each(mean, median),
  data = fuel
)
head(subset_14d)
##      make             model barrelsA08.mean barrelsA08.median
## 1  Toyota   1-Ton Truck 2WD               0                 0
## 2    Audi               100               0                 0
## 3    Audi       100 quattro               0                 0
## 4    Audi 100 quattro Wagon               0                 0
## 5    Audi         100 Wagon               0                 0
## 6 Pontiac              1000               0                 0
##   co2TailpipeGpm.mean co2TailpipeGpm.median
## 1            453.0908              467.7368
## 2            480.7295              480.7295
## 3            496.9492              493.7222
## 4            493.7222              493.7222
## 5            485.0604              493.7222
## 6            334.4084              339.3333

14.5 Count

By 1 variable.

library(dplyr)
# Number of records per make.
subset_14e <- dplyr::summarize(
  group_by(fuel, make),
  count_by_make = n()
)

subset_14e
## # A tibble: 136 × 2
##    make                        count_by_make
##    <chr>                               <int>
##  1 AM General                              6
##  2 ASC Incorporated                        1
##  3 Acura                                 347
##  4 Alfa Romeo                             62
##  5 American Motors Corporation            27
##  6 Aston Martin                          156
##  7 Audi                                 1002
##  8 Aurora Cars Ltd                         1
##  9 Autokraft Limited                       4
## 10 Avanti Motor Corporation                2
## # ℹ 126 more rows

By 2 variables.

# Number of records per make and year.
# `.groups = "drop_last"` states the default regrouping explicitly: the result
# stays grouped by `make`, and the informational message is no longer emitted.
subset_14f <- fuel %>%
  group_by(make, year) %>%
  dplyr::summarize(count_by_make_year = n(), .groups = "drop_last")
## `summarise()` has grouped output by 'make'. You can override using the
## `.groups` argument.
subset_14f
## # A tibble: 1,751 × 3
## # Groups:   make [136]
##    make              year count_by_make_year
##    <chr>            <int>              <int>
##  1 AM General        1984                  4
##  2 AM General        1985                  2
##  3 ASC Incorporated  1987                  1
##  4 Acura             1986                  4
##  5 Acura             1987                  6
##  6 Acura             1988                  4
##  7 Acura             1989                  4
##  8 Acura             1990                  4
##  9 Acura             1991                  6
## 10 Acura             1992                  9
## # ℹ 1,741 more rows

14.6 Summarise

# Record count and mean barrels08 per make. na.rm = TRUE matches the
# make/year summary that follows, so a missing value cannot turn a group's
# mean into NA.
subset_14g <- fuel %>%
  group_by(make) %>%
  dplyr::summarize(
    count_by_make = n(),
    mean_barrel = mean(barrels08, na.rm = TRUE)
  )

subset_14g
## # A tibble: 136 × 3
##    make                        count_by_make mean_barrel
##    <chr>                               <int>       <dbl>
##  1 AM General                              6        22.6
##  2 ASC Incorporated                        1        20.6
##  3 Acura                                 347        15.5
##  4 Alfa Romeo                             62        16.2
##  5 American Motors Corporation            27        18.5
##  6 Aston Martin                          156        25.2
##  7 Audi                                 1002        16.5
##  8 Aurora Cars Ltd                         1        22.0
##  9 Autokraft Limited                       4        20.0
## 10 Avanti Motor Corporation                2        20.0
## # ℹ 126 more rows
# Per make/year: number of rows plus mean, median and standard deviation of
# `barrels08`, ignoring missing values. `sd()` is NA for groups with a single
# row (e.g. ASC Incorporated, 1987).
# NOTE: this reassigns `subset_14g`, replacing the per-make summary above.
# `.groups = "drop_last"` makes dplyr's default explicit: the result is still
# grouped by `make`, but the "grouped output" message is not emitted.
subset_14g <- fuel %>%
  group_by(make, year) %>%
  dplyr::summarize(
    count_by_make_year = n(),
    mean_barrel = mean(barrels08, na.rm = TRUE),
    median_barrel = median(barrels08, na.rm = TRUE),
    sd_barrel = sd(barrels08, na.rm = TRUE),
    .groups = "drop_last"
  )
subset_14g
## # A tibble: 1,751 × 6
## # Groups:   make [136]
##    make              year count_by_make_year mean_barrel median_barrel sd_barrel
##    <chr>            <int>              <int>       <dbl>         <dbl>     <dbl>
##  1 AM General        1984                  4        22.4          22.4      3.44
##  2 AM General        1985                  2        23.0          23.0      3.36
##  3 ASC Incorporated  1987                  1        20.6          20.6     NA   
##  4 Acura             1986                  4        15.2          15.1      2.04
##  5 Acura             1987                  6        16.1          16.9      2.11
##  6 Acura             1988                  4        15.6          15.5      2.56
##  7 Acura             1989                  4        15.6          15.5      2.56
##  8 Acura             1990                  4        16.2          16.2      1.89
##  9 Acura             1991                  6        16.8          17.8      1.97
## 10 Acura             1992                  9        16.1          16.5      1.52
## # ℹ 1,741 more rows

15. Split and merge

15.1 By column

Split 1.

# First column slice: columns 1 to 3 of `fuel`, all rows.
subset_15a <- fuel[, 1:3]
head(subset_15a)
##    make                 model year
## 1 Volvo 240 DL/GL/Turbo Wagon 1984
## 2  Ford          Explorer 4WD 1998
## 3   BMW                  540i 2018
## 4  Jeep          Wrangler 4WD 1998
## 5  MINI   Cooper Clubman All4 2018
## 6   BMW                 740il 1999

Split 2.

# Second column slice: columns 4 to 6 of `fuel`, all rows.
subset_15b <- fuel[, 4:6]
head(subset_15b)
##   barrels08 barrelsA08 charge120
## 1  17.34789          0         0
## 2  21.97400          0         0
## 3  13.73375          0         0
## 4  23.54357          0         0
## 5  12.67731          0         0
## 6  18.31167          0         0

Merge.

# Put the two column slices side by side. Both were taken from `fuel` with all
# rows kept, so row i of one slice lines up with row i of the other.
subset_15c <- cbind(
  subset_15a,
  subset_15b
)
head(subset_15c)
##    make                 model year barrels08 barrelsA08 charge120
## 1 Volvo 240 DL/GL/Turbo Wagon 1984  17.34789          0         0
## 2  Ford          Explorer 4WD 1998  21.97400          0         0
## 3   BMW                  540i 2018  13.73375          0         0
## 4  Jeep          Wrangler 4WD 1998  23.54357          0         0
## 5  MINI   Cooper Clubman All4 2018  12.67731          0         0
## 6   BMW                 740il 1999  18.31167          0         0

15.2 By row

Split 1.

# First row slice: rows 1 to 30 of `fuel`, all columns.
subset_15d <- fuel[1:30, ]
head(subset_15d)
##    make                 model year barrels08 barrelsA08 charge120 charge240
## 1 Volvo 240 DL/GL/Turbo Wagon 1984  17.34789          0         0         0
## 2  Ford          Explorer 4WD 1998  21.97400          0         0         0
## 3   BMW                  540i 2018  13.73375          0         0         0
## 4  Jeep          Wrangler 4WD 1998  23.54357          0         0         0
## 5  MINI   Cooper Clubman All4 2018  12.67731          0         0         0
## 6   BMW                 740il 1999  18.31167          0         0         0
##   city08 city08U cityA08 cityA08U cityCD cityE cityUF co2 co2A co2TailpipeAGpm
## 1     18       0       0        0      0     0      0  -1   -1               0
## 2     14       0       0        0      0     0      0  -1   -1               0
## 3     21      20       0        0      0     0      0 367   -1               0
## 4     13       0       0        0      0     0      0  -1   -1               0
## 5     23      23       0        0      0     0      0 341   -1               0
## 6     15       0       0        0      0     0      0  -1   -1               0
##   co2TailpipeGpm comb08 comb08U combA08 combA08U combE combinedCD combinedUF
## 1       467.7368     19       0       0        0     0          0          0
## 2       592.4667     15       0       0        0     0          0          0
## 3       367.0000     24      24       0        0     0          0          0
## 4       634.7857     14       0       0        0     0          0          0
## 5       341.0000     26      26       0        0     0          0          0
## 6       493.7222     18       0       0        0     0          0          0
##   cylinders displ                      drive engId eng_dscr feScore fuelCost08
## 1         4   2.3                            60060 CA model      -1       2100
## 2         6   4.0 4-Wheel or All-Wheel Drive     0     SOHC      -1       2650
## 3         6   3.0           Rear-Wheel Drive   540     SIDI       5       2100
## 4         6   4.0 4-Wheel or All-Wheel Drive     0               -1       2850
## 5         3   1.5            All-Wheel Drive    40     SIDI       5       1900
## 6         8   4.4           Rear-Wheel Drive     0               -1       2200
##   fuelCostA08 fuelType        fuelType1 ghgScore ghgScoreA highway08 highway08U
## 1           0  Regular Regular Gasoline       -1        -1        22          0
## 2           0  Regular Regular Gasoline       -1        -1        18          0
## 3           0  Premium Premium Gasoline        5        -1        30         30
## 4           0  Regular Regular Gasoline       -1        -1        16          0
## 5           0  Premium Premium Gasoline        5        -1        31         30
## 6           0  Regular Regular Gasoline       -1        -1        22          0
##   highwayA08 highwayA08U                      VClass highwayCD highwayE
## 1          0           0      Midsize Station Wagons         0        0
## 2          0           0 Special Purpose Vehicle 4WD         0        0
## 3          0           0                Midsize Cars         0        0
## 4          0           0 Special Purpose Vehicle 4WD         0        0
## 5          0           0                Midsize Cars         0        0
## 6          0           0                  Large Cars         0        0
##   highwayUF hlv hpv    id lv2 lv4 mpgData phevBlended pv2 pv4 range rangeCity
## 1         0   0   0 28216   0  41       N       FALSE   0  89     0         0
## 2         0   0   0 14722   0   0       Y       FALSE   0   0     0         0
## 3         0   0   0 39230   0  14       N       FALSE   0  99     0         0
## 4         0   0   0 14749   0   0       Y       FALSE   0   0     0         0
## 5         0  18  92 39235   0   0       N       FALSE   0   0     0         0
## 6         0   0   0 15174   0  13       N       FALSE   0 107     0         0
##   rangeCityA rangeHwy rangeHwyA           trany   UCity UCityA UHighway
## 1          0        0         0 Automatic 4-spd 22.0000      0  31.0000
## 2          0        0         0 Automatic 5-spd 16.8405      0  24.5965
## 3          0        0         0  Automatic (S8) 26.3067      0  43.0230
## 4          0        0         0 Automatic 3-spd 16.2486      0  22.2888
## 5          0        0         0  Automatic (S8) 29.5784      0  44.1889
## 6          0        0         0 Automatic 5-spd 18.9000      0  30.0000
##   UHighwayA youSaveSpend guzzler trans_dscr tCharger sCharger atvType fuelType2
## 1         0        -3000                          NA                           
## 2         0        -5750              CLKUP       NA                           
## 3         0        -3000                        TRUE                           
## 4         0        -6750                          NA                           
## 5         0        -2000                        TRUE                           
## 6         0        -3500                          NA                           
##   rangeA evMotor mfrCode c240Dscr charge240b c240bDscr  createdOn modifiedOn
## 1                                          0           2013-01-01 2013-01-01
## 2                                          0           2013-01-01 2013-01-01
## 3                    BMX                   0           2017-08-10 2018-02-26
## 4                                          0           2013-01-01 2013-01-01
## 5                    BMX                   0           2017-08-10 2018-04-04
## 6                                          0           2013-01-01 2013-01-01
##   startStop phevCity phevHwy phevComb
## 1                  0       0        0
## 2                  0       0        0
## 3         Y        0       0        0
## 4                  0       0        0
## 5         N        0       0        0
## 6                  0       0        0

Split 2.

# Second row slice: rows 31 to 60 of `fuel`, all columns.
# The original row names (31, 32, ...) are kept, as the printout shows.
subset_15e <- fuel[31:60, ]
head(subset_15e)
##         make                model year barrels08 barrelsA08 charge120 charge240
## 31   Pontiac              Firefly 1993  9.988182          0         0         0
## 32    Subaru         Forester AWD 2018 13.733750          0         0         0
## 33    Jaguar F-Type S Convertible 2018 18.311667          0         0         0
## 34      Ford               Taurus 1993 16.480500          0         0         0
## 35 Chevrolet              Caprice 1993 21.974000          0         0         0
## 36     Volvo                  S70 1998 17.347895          0         0         0
##    city08 city08U cityA08 cityA08U cityCD cityE cityUF co2 co2A co2TailpipeAGpm
## 31     30       0       0        0      0     0      0  -1   -1               0
## 32     22      21       0        0      0     0      0 368   -1               0
## 33     15      15       0        0      0     0      0 485   -1               0
## 34     17       0       0        0      0     0      0  -1   -1               0
## 35     13       0       0        0      0     0      0  -1   -1               0
## 36     16       0       0        0      0     0      0  -1   -1               0
##    co2TailpipeGpm comb08 comb08U combA08 combA08U combE combinedCD combinedUF
## 31       269.3030     33       0       0        0     0          0          0
## 32       368.0000     24      24       0        0     0          0          0
## 33       485.0000     18      18       0        0     0          0          0
## 34       444.3500     20       0       0        0     0          0          0
## 35       592.4667     15       0       0        0     0          0          0
## 36       467.7368     19       0       0        0     0          0          0
##    cylinders displ             drive engId                          eng_dscr
## 31         3   1.0 Front-Wheel Drive     0                             (FFS)
## 32         4   2.5   All-Wheel Drive    16                                  
## 33         6   3.0  Rear-Wheel Drive   173                              SIDI
## 34         6   3.0 Front-Wheel Drive     0                             (FFS)
## 35         8   5.7  Rear-Wheel Drive     0 (350 V8) (GUZZLER) (POLICE) (FFS)
## 36         5   2.3 Front-Wheel Drive     0                                  
##    feScore fuelCost08 fuelCostA08 fuelType        fuelType1 ghgScore ghgScoreA
## 31      -1       1200           0  Regular Regular Gasoline       -1        -1
## 32       5       1650           0  Regular Regular Gasoline        5        -1
## 33       3       2800           0  Premium Premium Gasoline        3        -1
## 34      -1       2000           0  Regular Regular Gasoline       -1        -1
## 35      -1       2650           0  Regular Regular Gasoline       -1        -1
## 36      -1       2650           0  Premium Premium Gasoline       -1        -1
##    highway08 highway08U highwayA08 highwayA08U                          VClass
## 31        36          0          0           0                 Subcompact Cars
## 32        28         28          0           0 Small Sport Utility Vehicle 4WD
## 33        24         23          0           0                     Two Seaters
## 34        26          0          0           0                    Midsize Cars
## 35        19          0          0           0                      Large Cars
## 36        24          0          0           0                    Midsize Cars
##    highwayCD highwayE highwayUF hlv hpv    id lv2 lv4 mpgData phevBlended pv2
## 31         0        0         0   0   0 28981   0   0       N       FALSE   0
## 32         0        0         0   0   0 38645   0   0       Y       FALSE   0
## 33         0        0         0   0   0 38658   0   0       N       FALSE   0
## 34         0        0         0   0   0 29006   0   0       Y       FALSE   0
## 35         0        0         0   0   0 29023   0   0       N       FALSE   0
## 36         0        0         0   0   0 14370   0  14       Y       FALSE   0
##    pv4 range rangeCity rangeCityA rangeHwy rangeHwyA           trany   UCity
## 31   0     0         0          0        0         0 Automatic 3-spd 39.7000
## 32   0     0         0          0        0         0    Manual 6-spd 27.5000
## 33   0     0         0          0        0         0    Manual 6-spd 19.1000
## 34   0     0         0          0        0         0 Automatic 4-spd 21.5000
## 35   0     0         0          0        0         0 Automatic 4-spd 15.6000
## 36  98     0         0          0        0         0 Automatic 4-spd 20.1882
##    UCityA UHighway UHighwayA youSaveSpend guzzler  trans_dscr tCharger sCharger
## 31      0  50.9000         0         1500                           NA         
## 32      0  40.2000         0         -750                           NA         
## 33      0  33.0000         0        -6500                           NA        S
## 34      0  36.1000         0        -2500                           NA         
## 35      0  25.8000         0        -5750       T       CLKUP       NA         
## 36      0  33.3333         0        -5750         2MODE CLKUP     TRUE         
##    atvType fuelType2 rangeA evMotor mfrCode c240Dscr charge240b c240bDscr
## 31                                                            0          
## 32                                      FJX                   0          
## 33                                      JLX                   0          
## 34                                                            0          
## 35                                                            0          
## 36                                                            0          
##     createdOn modifiedOn startStop phevCity phevHwy phevComb
## 31 2013-01-01 2013-01-01                  0       0        0
## 32 2017-04-19 2018-01-24         N        0       0        0
## 33 2017-04-25 2017-08-10         Y        0       0        0
## 34 2013-01-01 2013-01-01                  0       0        0
## 35 2013-01-01 2013-01-01                  0       0        0
## 36 2013-01-01 2013-01-01                  0       0        0

Merge.

# Stack the two row slices on top of each other. Both come from `fuel`, so
# they share the same columns.
subset_15f <- rbind(
  subset_15d,
  subset_15e
)
head(subset_15f)
##    make                 model year barrels08 barrelsA08 charge120 charge240
## 1 Volvo 240 DL/GL/Turbo Wagon 1984  17.34789          0         0         0
## 2  Ford          Explorer 4WD 1998  21.97400          0         0         0
## 3   BMW                  540i 2018  13.73375          0         0         0
## 4  Jeep          Wrangler 4WD 1998  23.54357          0         0         0
## 5  MINI   Cooper Clubman All4 2018  12.67731          0         0         0
## 6   BMW                 740il 1999  18.31167          0         0         0
##   city08 city08U cityA08 cityA08U cityCD cityE cityUF co2 co2A co2TailpipeAGpm
## 1     18       0       0        0      0     0      0  -1   -1               0
## 2     14       0       0        0      0     0      0  -1   -1               0
## 3     21      20       0        0      0     0      0 367   -1               0
## 4     13       0       0        0      0     0      0  -1   -1               0
## 5     23      23       0        0      0     0      0 341   -1               0
## 6     15       0       0        0      0     0      0  -1   -1               0
##   co2TailpipeGpm comb08 comb08U combA08 combA08U combE combinedCD combinedUF
## 1       467.7368     19       0       0        0     0          0          0
## 2       592.4667     15       0       0        0     0          0          0
## 3       367.0000     24      24       0        0     0          0          0
## 4       634.7857     14       0       0        0     0          0          0
## 5       341.0000     26      26       0        0     0          0          0
## 6       493.7222     18       0       0        0     0          0          0
##   cylinders displ                      drive engId eng_dscr feScore fuelCost08
## 1         4   2.3                            60060 CA model      -1       2100
## 2         6   4.0 4-Wheel or All-Wheel Drive     0     SOHC      -1       2650
## 3         6   3.0           Rear-Wheel Drive   540     SIDI       5       2100
## 4         6   4.0 4-Wheel or All-Wheel Drive     0               -1       2850
## 5         3   1.5            All-Wheel Drive    40     SIDI       5       1900
## 6         8   4.4           Rear-Wheel Drive     0               -1       2200
##   fuelCostA08 fuelType        fuelType1 ghgScore ghgScoreA highway08 highway08U
## 1           0  Regular Regular Gasoline       -1        -1        22          0
## 2           0  Regular Regular Gasoline       -1        -1        18          0
## 3           0  Premium Premium Gasoline        5        -1        30         30
## 4           0  Regular Regular Gasoline       -1        -1        16          0
## 5           0  Premium Premium Gasoline        5        -1        31         30
## 6           0  Regular Regular Gasoline       -1        -1        22          0
##   highwayA08 highwayA08U                      VClass highwayCD highwayE
## 1          0           0      Midsize Station Wagons         0        0
## 2          0           0 Special Purpose Vehicle 4WD         0        0
## 3          0           0                Midsize Cars         0        0
## 4          0           0 Special Purpose Vehicle 4WD         0        0
## 5          0           0                Midsize Cars         0        0
## 6          0           0                  Large Cars         0        0
##   highwayUF hlv hpv    id lv2 lv4 mpgData phevBlended pv2 pv4 range rangeCity
## 1         0   0   0 28216   0  41       N       FALSE   0  89     0         0
## 2         0   0   0 14722   0   0       Y       FALSE   0   0     0         0
## 3         0   0   0 39230   0  14       N       FALSE   0  99     0         0
## 4         0   0   0 14749   0   0       Y       FALSE   0   0     0         0
## 5         0  18  92 39235   0   0       N       FALSE   0   0     0         0
## 6         0   0   0 15174   0  13       N       FALSE   0 107     0         0
##   rangeCityA rangeHwy rangeHwyA           trany   UCity UCityA UHighway
## 1          0        0         0 Automatic 4-spd 22.0000      0  31.0000
## 2          0        0         0 Automatic 5-spd 16.8405      0  24.5965
## 3          0        0         0  Automatic (S8) 26.3067      0  43.0230
## 4          0        0         0 Automatic 3-spd 16.2486      0  22.2888
## 5          0        0         0  Automatic (S8) 29.5784      0  44.1889
## 6          0        0         0 Automatic 5-spd 18.9000      0  30.0000
##   UHighwayA youSaveSpend guzzler trans_dscr tCharger sCharger atvType fuelType2
## 1         0        -3000                          NA                           
## 2         0        -5750              CLKUP       NA                           
## 3         0        -3000                        TRUE                           
## 4         0        -6750                          NA                           
## 5         0        -2000                        TRUE                           
## 6         0        -3500                          NA                           
##   rangeA evMotor mfrCode c240Dscr charge240b c240bDscr  createdOn modifiedOn
## 1                                          0           2013-01-01 2013-01-01
## 2                                          0           2013-01-01 2013-01-01
## 3                    BMX                   0           2017-08-10 2018-02-26
## 4                                          0           2013-01-01 2013-01-01
## 5                    BMX                   0           2017-08-10 2018-04-04
## 6                                          0           2013-01-01 2013-01-01
##   startStop phevCity phevHwy phevComb
## 1                  0       0        0
## 2                  0       0        0
## 3         Y        0       0        0
## 4                  0       0        0
## 5         N        0       0        0
## 6                  0       0        0

16. Compute new variables

# Work on a copy so `fuel` itself is left untouched.
subset_16 <- fuel

# Share of each row's `barrels08` in the column total.
# Despite the `_pct` suffix, the values are fractions (they sum to 1),
# not percentages scaled to 100.
subset_16[["barrels08_pct"]] <- with(subset_16, barrels08 / sum(barrels08))

summary(subset_16[["barrels08_pct"]])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 8.431e-08 2.014e-05 2.316e-05 2.428e-05 2.725e-05 6.617e-05

17. Merge datasets

Subset of df.

# Keep the identifying columns plus the fuel type.
# Columns are picked by name instead of by position (1:3 and 34), so the
# subset does not silently change if the column order of `fuel` changes.
subset_17a <- fuel[, c("make", "model", "year", "fuelType")]
head(subset_17a)
##    make                 model year fuelType
## 1 Volvo 240 DL/GL/Turbo Wagon 1984  Regular
## 2  Ford          Explorer 4WD 1998  Regular
## 3   BMW                  540i 2018  Premium
## 4  Jeep          Wrangler 4WD 1998  Regular
## 5  MINI   Cooper Clubman All4 2018  Premium
## 6   BMW                 740il 1999  Regular

Create a new df.

# Distinct fuel types, in order of first appearance in `subset_17a`.
fuelType <- unique(subset_17a[["fuelType"]])
fuelType
##  [1] "Regular"                     "Premium"                    
##  [3] "Electricity"                 "Premium and Electricity"    
##  [5] "Diesel"                      "Gasoline or E85"            
##  [7] "Midgrade"                    "CNG"                        
##  [9] "Premium or E85"              "Regular Gas and Electricity"
## [11] "Premium Gas or Electricity"  "Gasoline or natural gas"    
## [13] "Regular Gas or Electricity"  "Gasoline or propane"
# Dummy lookup value for each fuel type: 110, 120, 130, ...
# The length is taken from `fuelType` rather than hard-coding the end point
# (240), so `data.frame()` below does not fail with a length mismatch when the
# number of distinct fuel types changes. For 14 fuel types this gives exactly
# 110, 120, ..., 240.
blah_blah_blah <- seq(110, by = 10, length.out = length(fuelType))

# One row per fuel type, paired with its dummy value.
new_df <- data.frame(fuelType, blah_blah_blah)
new_df
##                       fuelType blah_blah_blah
## 1                      Regular            110
## 2                      Premium            120
## 3                  Electricity            130
## 4      Premium and Electricity            140
## 5                       Diesel            150
## 6              Gasoline or E85            160
## 7                     Midgrade            170
## 8                          CNG            180
## 9               Premium or E85            190
## 10 Regular Gas and Electricity            200
## 11  Premium Gas or Electricity            210
## 12     Gasoline or natural gas            220
## 13  Regular Gas or Electricity            230
## 14         Gasoline or propane            240

Merge.

# Attach the lookup value to every vehicle row by matching on `fuelType`.
# `all = FALSE` and `sort = TRUE` are merge()'s defaults, written out here:
# only matching rows are kept and the result is ordered by the key, which is
# why the printout starts with "CNG".
subset_17b <- merge(
  x = subset_17a,
  y = new_df,
  by = "fuelType",
  all = FALSE,
  sort = TRUE
)
head(subset_17b)
##   fuelType                  make                         model year
## 1      CNG                Toyota                     Camry CNG 1999
## 2      CNG             Chevrolet Express Cargo (dedicated CNG) 2004
## 3      CNG                 Honda                     Civic CNG 2009
## 4      CNG                Toyota                     Camry CNG 2001
## 5      CNG                 Dodge     Caravan/Grand Caravan 2WD 1995
## 6      CNG Mobility Ventures LLC                      MV-1 CNG 2014
##   blah_blah_blah
## 1            180
## 2            180
## 3            180
## 4            180
## 5            180
## 6            180
# Number of merged rows carrying each lookup value.
table(subset_17b[["blah_blah_blah"]])
## 
##   110   120   130   140   150   160   170   180   190   200   210   220   230 
## 26462 11562   206    53  1180  1318   106    60   127    42    36    20     4 
##   240 
##     8

18. Transpose

# Small 5 x 3 slice of `fuel`, so the transposed result stays readable.
subset_18 <- fuel[1:5, 1:3]
subset_18
##    make                 model year
## 1 Volvo 240 DL/GL/Turbo Wagon 1984
## 2  Ford          Explorer 4WD 1998
## 3   BMW                  540i 2018
## 4  Jeep          Wrangler 4WD 1998
## 5  MINI   Cooper Clubman All4 2018
# Swap rows and columns. The result is a matrix; because the columns mix text
# and numbers, every cell is shown as a character string (note the quoted
# years in the printout).
t(subset_18)
##       1                       2              3      4             
## make  "Volvo"                 "Ford"         "BMW"  "Jeep"        
## model "240 DL/GL/Turbo Wagon" "Explorer 4WD" "540i" "Wrangler 4WD"
## year  "1984"                  "1998"         "2018" "1998"        
##       5                    
## make  "MINI"               
## model "Cooper Clubman All4"
## year  "2018"