#Install ggplot2 and plyr packages if you haven't.
#install.packages("ggplot2", dependencies = TRUE)
#install.packages("plyr", dependencies = TRUE)
# 1. Load libraries and data ----------------------------------------------------------------------
#Load library and data. The data come from the ggplot2 package,
library(ggplot2)
library(plyr)
data(diamonds)
#Check data.
head(diamonds)
## # A tibble: 6 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
names(diamonds)
## [1] "carat" "cut" "color" "clarity" "depth" "table" "price"
## [8] "x" "y" "z"
# 2. Pivot tables ----------------------------------------------------------------------
#Aggregate average price by cut.
aggregate(price ~ cut, data = diamonds, FUN = mean)
## cut price
## 1 Fair 4358.758
## 2 Good 3928.864
## 3 Very Good 3981.760
## 4 Premium 4584.258
## 5 Ideal 3457.542
#Aggregate average price by cut then colour.
aggregate(price ~ cut + color, data = diamonds, FUN = mean)
## cut color price
## 1 Fair D 4291.061
## 2 Good D 3405.382
## 3 Very Good D 3470.467
## 4 Premium D 3631.293
## 5 Ideal D 2629.095
## 6 Fair E 3682.312
## 7 Good E 3423.644
## 8 Very Good E 3214.652
## 9 Premium E 3538.914
## 10 Ideal E 2597.550
## 11 Fair F 3827.003
## 12 Good F 3495.750
## 13 Very Good F 3778.820
## 14 Premium F 4324.890
## 15 Ideal F 3374.939
## 16 Fair G 4239.255
## 17 Good G 4123.482
## 18 Very Good G 3872.754
## 19 Premium G 4500.742
## 20 Ideal G 3720.706
## 21 Fair H 5135.683
## 22 Good H 4276.255
## 23 Very Good H 4535.390
## 24 Premium H 5216.707
## 25 Ideal H 3889.335
## 26 Fair I 4685.446
## 27 Good I 5078.533
## 28 Very Good I 5255.880
## 29 Premium I 5946.181
## 30 Ideal I 4451.970
## 31 Fair J 4975.655
## 32 Good J 4574.173
## 33 Very Good J 5103.513
## 34 Premium J 6294.592
## 35 Ideal J 4918.186
#Aggregate average price and average carat by cut.
aggregate(cbind(price,carat) ~ cut, data = diamonds, FUN = mean)
## cut price carat
## 1 Fair 4358.758 1.0461366
## 2 Good 3928.864 0.8491847
## 3 Very Good 3981.760 0.8063814
## 4 Premium 4584.258 0.8919549
## 5 Ideal 3457.542 0.7028370
#Aggregate average price and average carat by cut then colour.
aggregate(cbind(price,carat) ~ cut + color, data = diamonds, FUN = mean)
## cut color price carat
## 1 Fair D 4291.061 0.9201227
## 2 Good D 3405.382 0.7445166
## 3 Very Good D 3470.467 0.6964243
## 4 Premium D 3631.293 0.7215471
## 5 Ideal D 2629.095 0.5657657
## 6 Fair E 3682.312 0.8566071
## 7 Good E 3423.644 0.7451340
## 8 Very Good E 3214.652 0.6763167
## 9 Premium E 3538.914 0.7177450
## 10 Ideal E 2597.550 0.5784012
## 11 Fair F 3827.003 0.9047115
## 12 Good F 3495.750 0.7759296
## 13 Very Good F 3778.820 0.7409612
## 14 Premium F 4324.890 0.8270356
## 15 Ideal F 3374.939 0.6558285
## 16 Fair G 4239.255 1.0238217
## 17 Good G 4123.482 0.8508955
## 18 Very Good G 3872.754 0.7667986
## 19 Premium G 4500.742 0.8414877
## 20 Ideal G 3720.706 0.7007146
## 21 Fair H 5135.683 1.2191749
## 22 Good H 4276.255 0.9147293
## 23 Very Good H 4535.390 0.9159485
## 24 Premium H 5216.707 1.0164492
## 25 Ideal H 3889.335 0.7995249
## 26 Fair I 4685.446 1.1980571
## 27 Good I 5078.533 1.0572222
## 28 Very Good I 5255.880 1.0469518
## 29 Premium I 5946.181 1.1449370
## 30 Ideal I 4451.970 0.9130291
## 31 Fair J 4975.655 1.3411765
## 32 Good J 4574.173 1.0995440
## 33 Very Good J 5103.513 1.1332153
## 34 Premium J 6294.592 1.2930941
## 35 Ideal J 4918.186 1.0635937
#Aggregate average and median price by cut.
aggregate(price ~ cut, data = diamonds, each(mean, median))
## cut price.mean price.median
## 1 Fair 4358.758 3282.000
## 2 Good 3928.864 3050.500
## 3 Very Good 3981.760 2648.000
## 4 Premium 4584.258 3185.000
## 5 Ideal 3457.542 1810.000
# 3. Some computations ----------------------------------------------------------------------
#Aggregate percentage of total price by cut. The computation works but it adds little value in this case.
diamonds$price_pct = diamonds$price / sum(diamonds$price)
diamonds
## # A tibble: 53,940 x 11
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4.00 4.05 2.39
## # ... with 53,930 more rows, and 1 more variables: price_pct <dbl>
#Makes more sense to compute how far away is the price from the average.
diamonds$price_pct_avg = diamonds$price / mean(diamonds$price)
head(diamonds)
## # A tibble: 6 x 12
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## # ... with 2 more variables: price_pct <dbl>, price_pct_avg <dbl>
#Percentage difference in price from the average price by cut.
Diamonds_Pivot_Avg_Price <- aggregate(price ~ cut, data = diamonds, FUN = mean)
head(Diamonds_Pivot_Avg_Price)
## cut price
## 1 Fair 4358.758
## 2 Good 3928.864
## 3 Very Good 3981.760
## 4 Premium 4584.258
## 5 Ideal 3457.542
colnames(Diamonds_Pivot_Avg_Price)[2] <- "AvgCutPrice"
Merged_Diamonds <- merge(diamonds, Diamonds_Pivot_Avg_Price, by = "cut")
names(Merged_Diamonds)
## [1] "cut" "carat" "color" "clarity"
## [5] "depth" "table" "price" "x"
## [9] "y" "z" "price_pct" "price_pct_avg"
## [13] "AvgCutPrice"
Merged_Diamonds$price_pct_avg_cut = (Merged_Diamonds$price / Merged_Diamonds$AvgCutPrice)*100
names(Merged_Diamonds)
## [1] "cut" "carat" "color"
## [4] "clarity" "depth" "table"
## [7] "price" "x" "y"
## [10] "z" "price_pct" "price_pct_avg"
## [13] "AvgCutPrice" "price_pct_avg_cut"
head(Merged_Diamonds)
## cut carat color clarity depth table price x y z price_pct
## 1 Fair 0.70 D SI1 52.2 65 1895 6.04 5.99 3.14 8.932982e-06
## 2 Fair 0.30 F VS2 64.5 55 662 4.31 4.19 2.74 3.120651e-06
## 3 Fair 1.51 F I1 67.5 56 3734 7.17 7.05 4.80 1.760198e-05
## 4 Fair 1.03 D SI2 66.4 56 3743 6.31 6.19 4.15 1.764441e-05
## 5 Fair 0.98 G I1 69.7 58 1712 6.08 5.98 4.21 8.070324e-06
## 6 Fair 2.03 F I1 65.6 56 6753 7.89 7.86 5.16 3.183347e-05
## price_pct_avg AvgCutPrice price_pct_avg_cut
## 1 0.4818450 4358.758 43.47569
## 2 0.1683279 4358.758 15.18781
## 3 0.9494508 4358.758 85.66661
## 4 0.9517393 4358.758 85.87309
## 5 0.4353133 4358.758 39.27725
## 6 1.7170974 4358.758 154.92946
# 4. Export data ----------------------------------------------------------------------
#Export to csv.
Pivot_Table_1 <- aggregate(price ~ cut, data = diamonds,
each(mean, median))
write.csv(Pivot_Table_1, file = "Pivot_Table_1.csv")
write.csv(Merged_Diamonds, file = "Merged_Diamonds.csv")