#Install ggplot2 and plyr packages if you haven't.
#install.packages("ggplot2", dependencies = TRUE)
#install.packages("plyr", dependencies = TRUE)


# 1. Load libraries and data ----------------------------------------------------------------------

#Load library and data. The data come from the ggplot2 package,
library(ggplot2)
library(plyr)
data(diamonds)

#Check data.
head(diamonds)
## # A tibble: 6 x 10
##   carat       cut color clarity depth table price     x     y     z
##   <dbl>     <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23     Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
## 2  0.21   Premium     E     SI1  59.8    61   326  3.89  3.84  2.31
## 3  0.23      Good     E     VS1  56.9    65   327  4.05  4.07  2.31
## 4  0.29   Premium     I     VS2  62.4    58   334  4.20  4.23  2.63
## 5  0.31      Good     J     SI2  63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good     J    VVS2  62.8    57   336  3.94  3.96  2.48
names(diamonds)
##  [1] "carat"   "cut"     "color"   "clarity" "depth"   "table"   "price"  
##  [8] "x"       "y"       "z"
# 2. Pivot tables ----------------------------------------------------------------------

#Aggregate average price by cut.
aggregate(price ~ cut, data = diamonds, FUN = mean)
##         cut    price
## 1      Fair 4358.758
## 2      Good 3928.864
## 3 Very Good 3981.760
## 4   Premium 4584.258
## 5     Ideal 3457.542
#Aggregate average price by cut then colour.
aggregate(price ~ cut + color, data = diamonds, FUN = mean)
##          cut color    price
## 1       Fair     D 4291.061
## 2       Good     D 3405.382
## 3  Very Good     D 3470.467
## 4    Premium     D 3631.293
## 5      Ideal     D 2629.095
## 6       Fair     E 3682.312
## 7       Good     E 3423.644
## 8  Very Good     E 3214.652
## 9    Premium     E 3538.914
## 10     Ideal     E 2597.550
## 11      Fair     F 3827.003
## 12      Good     F 3495.750
## 13 Very Good     F 3778.820
## 14   Premium     F 4324.890
## 15     Ideal     F 3374.939
## 16      Fair     G 4239.255
## 17      Good     G 4123.482
## 18 Very Good     G 3872.754
## 19   Premium     G 4500.742
## 20     Ideal     G 3720.706
## 21      Fair     H 5135.683
## 22      Good     H 4276.255
## 23 Very Good     H 4535.390
## 24   Premium     H 5216.707
## 25     Ideal     H 3889.335
## 26      Fair     I 4685.446
## 27      Good     I 5078.533
## 28 Very Good     I 5255.880
## 29   Premium     I 5946.181
## 30     Ideal     I 4451.970
## 31      Fair     J 4975.655
## 32      Good     J 4574.173
## 33 Very Good     J 5103.513
## 34   Premium     J 6294.592
## 35     Ideal     J 4918.186
#Aggregate average price and average carat by cut.
aggregate(cbind(price,carat) ~ cut, data = diamonds, FUN = mean)
##         cut    price     carat
## 1      Fair 4358.758 1.0461366
## 2      Good 3928.864 0.8491847
## 3 Very Good 3981.760 0.8063814
## 4   Premium 4584.258 0.8919549
## 5     Ideal 3457.542 0.7028370
#Aggregate average price and average carat by cut then colour.
aggregate(cbind(price,carat) ~ cut + color, data = diamonds, FUN = mean)
##          cut color    price     carat
## 1       Fair     D 4291.061 0.9201227
## 2       Good     D 3405.382 0.7445166
## 3  Very Good     D 3470.467 0.6964243
## 4    Premium     D 3631.293 0.7215471
## 5      Ideal     D 2629.095 0.5657657
## 6       Fair     E 3682.312 0.8566071
## 7       Good     E 3423.644 0.7451340
## 8  Very Good     E 3214.652 0.6763167
## 9    Premium     E 3538.914 0.7177450
## 10     Ideal     E 2597.550 0.5784012
## 11      Fair     F 3827.003 0.9047115
## 12      Good     F 3495.750 0.7759296
## 13 Very Good     F 3778.820 0.7409612
## 14   Premium     F 4324.890 0.8270356
## 15     Ideal     F 3374.939 0.6558285
## 16      Fair     G 4239.255 1.0238217
## 17      Good     G 4123.482 0.8508955
## 18 Very Good     G 3872.754 0.7667986
## 19   Premium     G 4500.742 0.8414877
## 20     Ideal     G 3720.706 0.7007146
## 21      Fair     H 5135.683 1.2191749
## 22      Good     H 4276.255 0.9147293
## 23 Very Good     H 4535.390 0.9159485
## 24   Premium     H 5216.707 1.0164492
## 25     Ideal     H 3889.335 0.7995249
## 26      Fair     I 4685.446 1.1980571
## 27      Good     I 5078.533 1.0572222
## 28 Very Good     I 5255.880 1.0469518
## 29   Premium     I 5946.181 1.1449370
## 30     Ideal     I 4451.970 0.9130291
## 31      Fair     J 4975.655 1.3411765
## 32      Good     J 4574.173 1.0995440
## 33 Very Good     J 5103.513 1.1332153
## 34   Premium     J 6294.592 1.2930941
## 35     Ideal     J 4918.186 1.0635937
#Aggregate average and median price by cut.
aggregate(price ~ cut, data = diamonds, each(mean, median))
##         cut price.mean price.median
## 1      Fair   4358.758     3282.000
## 2      Good   3928.864     3050.500
## 3 Very Good   3981.760     2648.000
## 4   Premium   4584.258     3185.000
## 5     Ideal   3457.542     1810.000
# 3. Some computations ----------------------------------------------------------------------


#Aggregate percentage of total price by cut. The computation works but it adds little value in this case.
diamonds$price_pct = diamonds$price / sum(diamonds$price)
diamonds
## # A tibble: 53,940 x 11
##    carat       cut color clarity depth table price     x     y     z
##    <dbl>     <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.23     Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
##  2  0.21   Premium     E     SI1  59.8    61   326  3.89  3.84  2.31
##  3  0.23      Good     E     VS1  56.9    65   327  4.05  4.07  2.31
##  4  0.29   Premium     I     VS2  62.4    58   334  4.20  4.23  2.63
##  5  0.31      Good     J     SI2  63.3    58   335  4.34  4.35  2.75
##  6  0.24 Very Good     J    VVS2  62.8    57   336  3.94  3.96  2.48
##  7  0.24 Very Good     I    VVS1  62.3    57   336  3.95  3.98  2.47
##  8  0.26 Very Good     H     SI1  61.9    55   337  4.07  4.11  2.53
##  9  0.22      Fair     E     VS2  65.1    61   337  3.87  3.78  2.49
## 10  0.23 Very Good     H     VS1  59.4    61   338  4.00  4.05  2.39
## # ... with 53,930 more rows, and 1 more variables: price_pct <dbl>
#Makes more sense to compute how far away is the price from the average.
diamonds$price_pct_avg = diamonds$price / mean(diamonds$price)
head(diamonds)
## # A tibble: 6 x 12
##   carat       cut color clarity depth table price     x     y     z
##   <dbl>     <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23     Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
## 2  0.21   Premium     E     SI1  59.8    61   326  3.89  3.84  2.31
## 3  0.23      Good     E     VS1  56.9    65   327  4.05  4.07  2.31
## 4  0.29   Premium     I     VS2  62.4    58   334  4.20  4.23  2.63
## 5  0.31      Good     J     SI2  63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good     J    VVS2  62.8    57   336  3.94  3.96  2.48
## # ... with 2 more variables: price_pct <dbl>, price_pct_avg <dbl>
#Percentage difference in price from the average price by cut.

Diamonds_Pivot_Avg_Price <- aggregate(price ~ cut, data = diamonds, FUN = mean)
head(Diamonds_Pivot_Avg_Price)
##         cut    price
## 1      Fair 4358.758
## 2      Good 3928.864
## 3 Very Good 3981.760
## 4   Premium 4584.258
## 5     Ideal 3457.542
colnames(Diamonds_Pivot_Avg_Price)[2] <- "AvgCutPrice"

Merged_Diamonds <- merge(diamonds, Diamonds_Pivot_Avg_Price, by = "cut")
names(Merged_Diamonds)
##  [1] "cut"           "carat"         "color"         "clarity"      
##  [5] "depth"         "table"         "price"         "x"            
##  [9] "y"             "z"             "price_pct"     "price_pct_avg"
## [13] "AvgCutPrice"
Merged_Diamonds$price_pct_avg_cut = (Merged_Diamonds$price / Merged_Diamonds$AvgCutPrice)*100
names(Merged_Diamonds)
##  [1] "cut"               "carat"             "color"            
##  [4] "clarity"           "depth"             "table"            
##  [7] "price"             "x"                 "y"                
## [10] "z"                 "price_pct"         "price_pct_avg"    
## [13] "AvgCutPrice"       "price_pct_avg_cut"
head(Merged_Diamonds)
##    cut carat color clarity depth table price    x    y    z    price_pct
## 1 Fair  0.70     D     SI1  52.2    65  1895 6.04 5.99 3.14 8.932982e-06
## 2 Fair  0.30     F     VS2  64.5    55   662 4.31 4.19 2.74 3.120651e-06
## 3 Fair  1.51     F      I1  67.5    56  3734 7.17 7.05 4.80 1.760198e-05
## 4 Fair  1.03     D     SI2  66.4    56  3743 6.31 6.19 4.15 1.764441e-05
## 5 Fair  0.98     G      I1  69.7    58  1712 6.08 5.98 4.21 8.070324e-06
## 6 Fair  2.03     F      I1  65.6    56  6753 7.89 7.86 5.16 3.183347e-05
##   price_pct_avg AvgCutPrice price_pct_avg_cut
## 1     0.4818450    4358.758          43.47569
## 2     0.1683279    4358.758          15.18781
## 3     0.9494508    4358.758          85.66661
## 4     0.9517393    4358.758          85.87309
## 5     0.4353133    4358.758          39.27725
## 6     1.7170974    4358.758         154.92946
# 4. Export data ----------------------------------------------------------------------


#Export to csv.

Pivot_Table_1 <- aggregate(price ~ cut, data = diamonds, 
                           each(mean, median))
write.csv(Pivot_Table_1, file = "Pivot_Table_1.csv")

write.csv(Merged_Diamonds, file = "Merged_Diamonds.csv")