# 1. Load data ---------------------------------------------------------
# Data source: Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

wine <- read.csv("Wine.csv", header = TRUE)
head(wine)
##   Type Alcohol Malic_Acid  Ash Ash_Alcalinity Magnesium Total_Phenols
## 1    A   14.23       1.71 2.43           15.6       127          2.80
## 2    A   13.20       1.78 2.14           11.2       100          2.65
## 3    A   13.16       2.36 2.67           18.6       101          2.80
## 4    A   14.37       1.95 2.50           16.8       113          3.85
## 5    A   13.24       2.59 2.87           21.0       118          2.80
## 6    A   14.20       1.76 2.45           15.2       112          3.27
##   Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity  Hue
## 1       3.06                 0.28            2.29            5.64 1.04
## 2       2.76                 0.26            1.28            4.38 1.05
## 3       3.24                 0.30            2.81            5.68 1.03
## 4       3.49                 0.24            2.18            7.80 0.86
## 5       2.69                 0.39            1.82            4.32 1.04
## 6       3.39                 0.34            1.97            6.75 1.05
##   OD280_OD315 Proline
## 1        3.92    1065
## 2        3.40    1050
## 3        3.17    1185
## 4        3.45    1480
## 5        2.93     735
## 6        2.85    1450
names(wine)
##  [1] "Type"                 "Alcohol"              "Malic_Acid"          
##  [4] "Ash"                  "Ash_Alcalinity"       "Magnesium"           
##  [7] "Total_Phenols"        "Flavanoids"           "Nonflavanoid_Phenols"
## [10] "Proanthocyanins"      "Color_Intensity"      "Hue"                 
## [13] "OD280_OD315"          "Proline"
# 2. PCA raw data -------------------------------------------------

# The result is a list. 
# centre and scale are the mean and standard deviation of the variables used for normalisation prior to  PCA
# rotation is the principal component loading. 
# x is a matrix with the principal component score vectors.

wine_pca <- prcomp(wine[,-c(1)])
summary(wine_pca)
## Importance of components:
##                             PC1      PC2     PC3     PC4     PC5     PC6
## Standard deviation     314.9632 13.13527 3.07215 2.23409 1.10853 0.91710
## Proportion of Variance   0.9981  0.00174 0.00009 0.00005 0.00001 0.00001
## Cumulative Proportion    0.9981  0.99983 0.99992 0.99997 0.99998 0.99999
##                           PC7    PC8    PC9   PC10   PC11   PC12    PC13
## Standard deviation     0.5282 0.3891 0.3348 0.2678 0.1938 0.1452 0.09057
## Proportion of Variance 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.00000
## Cumulative Proportion  1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.00000
wine_pca$rot[,1:4]
##                                PC1           PC2          PC3          PC4
## Alcohol              -0.0016592647 -1.203406e-03 -0.016873809  0.141446778
## Malic_Acid            0.0006810156 -2.154982e-03 -0.122003373  0.160389543
## Ash                  -0.0001949057 -4.593693e-03 -0.051987430 -0.009772810
## Ash_Alcalinity        0.0046713006 -2.645039e-02 -0.938593003 -0.330965260
## Magnesium            -0.0178680075 -9.993442e-01  0.029780248 -0.005393756
## Total_Phenols        -0.0009898297 -8.779622e-04  0.040484644 -0.074584656
## Flavanoids           -0.0015672883  5.185073e-05  0.085443339 -0.169086724
## Nonflavanoid_Phenols  0.0001230867  1.354479e-03 -0.013510780  0.010805561
## Proanthocyanins      -0.0006006078 -5.004400e-03  0.024659382 -0.050120952
## Color_Intensity      -0.0023271432 -1.510035e-02 -0.291398464  0.878893693
## Hue                  -0.0001713800  7.626731e-04  0.025977662 -0.060034945
## OD280_OD315          -0.0007049316  3.495364e-03  0.070323969 -0.178200254
## Proline              -0.9998229365  1.777381e-02 -0.004528682 -0.003112916
# Investigate Proline variable

summary(wine$Proline)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   278.0   500.5   673.5   746.9   985.0  1680.0
typeof(wine$Proline)
## [1] "integer"
library(pastecs)
## Warning: package 'pastecs' was built under R version 3.4.4
options(scipen = 100)
options(digits = 2)
stat.desc(wine$Proline)
##      nbr.val     nbr.null       nbr.na          min          max 
##       178.00         0.00         0.00       278.00      1680.00 
##        range          sum       median         mean      SE.mean 
##      1402.00    132947.00       673.50       746.89        23.60 
## CI.mean.0.95          var      std.dev     coef.var 
##        46.58     99166.72       314.91         0.42
# 3. PCA on normalised data ---------------------------------------

# Identify numerical variables

names(wine)
##  [1] "Type"                 "Alcohol"              "Malic_Acid"          
##  [4] "Ash"                  "Ash_Alcalinity"       "Magnesium"           
##  [7] "Total_Phenols"        "Flavanoids"           "Nonflavanoid_Phenols"
## [10] "Proanthocyanins"      "Color_Intensity"      "Hue"                 
## [13] "OD280_OD315"          "Proline"
str(wine)
## 'data.frame':    178 obs. of  14 variables:
##  $ Type                : Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Alcohol             : num  14.2 13.2 13.2 14.4 13.2 ...
##  $ Malic_Acid          : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
##  $ Ash                 : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
##  $ Ash_Alcalinity      : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
##  $ Magnesium           : int  127 100 101 113 118 112 96 121 97 98 ...
##  $ Total_Phenols       : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
##  $ Flavanoids          : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
##  $ Nonflavanoid_Phenols: num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
##  $ Proanthocyanins     : num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
##  $ Color_Intensity     : num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
##  $ Hue                 : num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
##  $ OD280_OD315         : num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
##  $ Proline             : int  1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
# PCA using numerical variables only, normalised


wine_pca_nom <- prcomp(na.omit(wine[,-c(1,6,14)]), scale. = T)
summary(wine_pca_nom)
## Importance of components:
##                          PC1   PC2   PC3   PC4    PC5    PC6    PC7    PC8
## Standard deviation     2.086 1.412 1.189 0.914 0.7954 0.7736 0.5983 0.5420
## Proportion of Variance 0.396 0.181 0.129 0.076 0.0575 0.0544 0.0325 0.0267
## Cumulative Proportion  0.396 0.577 0.705 0.781 0.8387 0.8931 0.9257 0.9524
##                           PC9   PC10    PC11
## Standard deviation     0.4971 0.4136 0.32466
## Proportion of Variance 0.0225 0.0155 0.00958
## Cumulative Proportion  0.9749 0.9904 1.00000
wine_pca_nom$rot
##                         PC1    PC2    PC3    PC4    PC5    PC6    PC7
## Alcohol              -0.080  0.568 -0.235  0.266 -0.324  0.057 -0.376
## Malic_Acid            0.276  0.224  0.073 -0.590 -0.493  0.364 -0.135
## Ash                   0.045  0.352  0.622  0.268 -0.180 -0.177  0.055
## Ash_Alcalinity        0.236 -0.034  0.622 -0.142  0.166 -0.281 -0.245
## Total_Phenols        -0.401  0.223  0.125 -0.040  0.033  0.120  0.432
## Flavanoids           -0.438  0.148  0.135 -0.038 -0.013  0.037  0.201
## Nonflavanoid_Phenols  0.310 -0.043  0.218  0.360  0.146  0.772  0.228
## Proanthocyanins      -0.322  0.173  0.111 -0.367  0.576  0.312 -0.423
## Color_Intensity       0.162  0.570 -0.176  0.145  0.368 -0.116  0.066
## Hue                  -0.335 -0.267  0.117  0.423 -0.160  0.173 -0.533
## OD280_OD315          -0.412 -0.042  0.167 -0.148 -0.276  0.047  0.176
##                          PC8   PC9   PC10   PC11
## Alcohol               0.4005 -0.21  0.304  0.027
## Malic_Acid           -0.1606  0.29 -0.131  0.023
## Ash                  -0.5238 -0.25  0.043 -0.107
## Ash_Alcalinity        0.5669  0.20  0.041  0.075
## Total_Phenols         0.1396  0.47  0.331 -0.471
## Flavanoids           -0.0232  0.15  0.029  0.837
## Nonflavanoid_Phenols  0.1868 -0.11 -0.021  0.090
## Proanthocyanins      -0.1667 -0.26  0.085 -0.099
## Color_Intensity       0.0037  0.28 -0.604 -0.015
## Hue                  -0.1253  0.44 -0.258 -0.083
## OD280_OD315           0.3460 -0.42 -0.581 -0.185
# 4. Visualisations -------------------------------------------------------



# Plot PC1 vs PC2 with labels

library(plyr)
count(wine, "Type")
##   Type freq
## 1    A   59
## 2    B   71
## 3    C   48
plot(x = wine_pca_nom$x[,1], 
     y = wine_pca_nom$x[,2],
     xlab = "PC1",
     ylab = "PC2")

text(x = wine_pca_nom$x[,1],
     y = wine_pca_nom$x[,2],
     labels = wine$Type,
     col = ifelse(wine$Type == "A", "red", 
                  ifelse(wine$Type == "B", "blue", "green")))

# Plot PC1 vs PC2 with legend

plot(x = wine_pca_nom$x[,1], y = wine_pca_nom$x[,2],
     xlab = "PC1", ylab= "PC2",
     main = "Wine PC1 vs PC2",
     col=ifelse(wine$Type == "A", "red", 
                ifelse(wine$Type == "B", "blue", "green")))

legend(x="bottomleft", legend = levels(wine[, 1]),
            col = c("red", "blue", "green"), pch=1)
     
head(wine_pca_nom$x)
##    PC1   PC2   PC3   PC4     PC5   PC6    PC7   PC8   PC9  PC10   PC11
## 1 -2.8  1.40 -0.44  0.12 -0.3031  0.30 -0.158  0.27 -1.03 -0.40 -0.153
## 2 -2.0 -0.32 -1.94  0.31 -1.0003  0.13  0.934 -0.47 -0.26 -0.39 -0.010
## 3 -2.1  1.30  0.99 -0.61  0.6782  0.38 -0.386 -0.79 -0.42 -0.11  0.072
## 4 -2.8  2.81 -0.27 -0.17  0.0929 -0.13  0.859  0.44  0.14  0.23 -0.409
## 5 -0.8  0.89  1.83  0.29 -0.4798  0.19  0.079 -0.44 -0.13  0.31 -0.102
## 6 -2.2  1.84 -0.61  0.77  0.0081  0.51  0.365 -0.10  0.23  0.35  0.120
head(wine)
##   Type Alcohol Malic_Acid Ash Ash_Alcalinity Magnesium Total_Phenols
## 1    A      14        1.7 2.4             16       127           2.8
## 2    A      13        1.8 2.1             11       100           2.6
## 3    A      13        2.4 2.7             19       101           2.8
## 4    A      14        1.9 2.5             17       113           3.8
## 5    A      13        2.6 2.9             21       118           2.8
## 6    A      14        1.8 2.4             15       112           3.3
##   Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity  Hue
## 1        3.1                 0.28             2.3             5.6 1.04
## 2        2.8                 0.26             1.3             4.4 1.05
## 3        3.2                 0.30             2.8             5.7 1.03
## 4        3.5                 0.24             2.2             7.8 0.86
## 5        2.7                 0.39             1.8             4.3 1.04
## 6        3.4                 0.34             2.0             6.8 1.05
##   OD280_OD315 Proline
## 1         3.9    1065
## 2         3.4    1050
## 3         3.2    1185
## 4         3.4    1480
## 5         2.9     735
## 6         2.8    1450
str(wine_pca_nom$x)
##  num [1:178, 1:11] -2.752 -2.047 -2.104 -2.822 -0.801 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:178] "1" "2" "3" "4" ...
##   ..$ : chr [1:11] "PC1" "PC2" "PC3" "PC4" ...
# 5. Using ggplot ---------------------------------------------------------



# Convert matrix x to a data frame
wine_weights <- as.data.frame(wine_pca_nom$x)
head(wine_weights)
##    PC1   PC2   PC3   PC4     PC5   PC6    PC7   PC8   PC9  PC10   PC11
## 1 -2.8  1.40 -0.44  0.12 -0.3031  0.30 -0.158  0.27 -1.03 -0.40 -0.153
## 2 -2.0 -0.32 -1.94  0.31 -1.0003  0.13  0.934 -0.47 -0.26 -0.39 -0.010
## 3 -2.1  1.30  0.99 -0.61  0.6782  0.38 -0.386 -0.79 -0.42 -0.11  0.072
## 4 -2.8  2.81 -0.27 -0.17  0.0929 -0.13  0.859  0.44  0.14  0.23 -0.409
## 5 -0.8  0.89  1.83  0.29 -0.4798  0.19  0.079 -0.44 -0.13  0.31 -0.102
## 6 -2.2  1.84 -0.61  0.77  0.0081  0.51  0.365 -0.10  0.23  0.35  0.120
# Combine matrix x to the original data set

wine_weights_PC <- cbind(wine, wine_weights)
head(wine_weights_PC)
##   Type Alcohol Malic_Acid Ash Ash_Alcalinity Magnesium Total_Phenols
## 1    A      14        1.7 2.4             16       127           2.8
## 2    A      13        1.8 2.1             11       100           2.6
## 3    A      13        2.4 2.7             19       101           2.8
## 4    A      14        1.9 2.5             17       113           3.8
## 5    A      13        2.6 2.9             21       118           2.8
## 6    A      14        1.8 2.4             15       112           3.3
##   Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity  Hue
## 1        3.1                 0.28             2.3             5.6 1.04
## 2        2.8                 0.26             1.3             4.4 1.05
## 3        3.2                 0.30             2.8             5.7 1.03
## 4        3.5                 0.24             2.2             7.8 0.86
## 5        2.7                 0.39             1.8             4.3 1.04
## 6        3.4                 0.34             2.0             6.8 1.05
##   OD280_OD315 Proline  PC1   PC2   PC3   PC4     PC5   PC6    PC7   PC8
## 1         3.9    1065 -2.8  1.40 -0.44  0.12 -0.3031  0.30 -0.158  0.27
## 2         3.4    1050 -2.0 -0.32 -1.94  0.31 -1.0003  0.13  0.934 -0.47
## 3         3.2    1185 -2.1  1.30  0.99 -0.61  0.6782  0.38 -0.386 -0.79
## 4         3.4    1480 -2.8  2.81 -0.27 -0.17  0.0929 -0.13  0.859  0.44
## 5         2.9     735 -0.8  0.89  1.83  0.29 -0.4798  0.19  0.079 -0.44
## 6         2.8    1450 -2.2  1.84 -0.61  0.77  0.0081  0.51  0.365 -0.10
##     PC9  PC10   PC11
## 1 -1.03 -0.40 -0.153
## 2 -0.26 -0.39 -0.010
## 3 -0.42 -0.11  0.072
## 4  0.14  0.23 -0.409
## 5 -0.13  0.31 -0.102
## 6  0.23  0.35  0.120
summary(wine_weights_PC$Type)
##  A  B  C 
## 59 71 48
library(ggplot2)

ggplot() + geom_point(data = wine_weights_PC, aes(x = PC1,
                                                  y = PC2, color = Type))

# 6. Other plots/analyses -------------------------------------------------


biplot(wine_pca_nom, scale = 0)

# compute standard deviation of each principal component
std_dev <- wine_pca_nom$sdev

# compute variance
pca_var <- std_dev^2

# check variance
pca_var
##  [1] 4.35 1.99 1.41 0.84 0.63 0.60 0.36 0.29 0.25 0.17 0.11
# scree plot

plot(pca_var, xlab = "Principal Component",
     ylab = "Proportion of variance explained",
     type = "b")

# cummulative plot

plot(cumsum(pca_var), xlab = "Principal Component",
     ylab = "Cumulative proportion of variance explained",
     type = "b")