# 1. Load data ---------------------------------------------------------
# Data source: Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.
wine <- read.csv("Wine.csv", header = TRUE)
head(wine)
## Type Alcohol Malic_Acid Ash Ash_Alcalinity Magnesium Total_Phenols
## 1 A 14.23 1.71 2.43 15.6 127 2.80
## 2 A 13.20 1.78 2.14 11.2 100 2.65
## 3 A 13.16 2.36 2.67 18.6 101 2.80
## 4 A 14.37 1.95 2.50 16.8 113 3.85
## 5 A 13.24 2.59 2.87 21.0 118 2.80
## 6 A 14.20 1.76 2.45 15.2 112 3.27
## Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue
## 1 3.06 0.28 2.29 5.64 1.04
## 2 2.76 0.26 1.28 4.38 1.05
## 3 3.24 0.30 2.81 5.68 1.03
## 4 3.49 0.24 2.18 7.80 0.86
## 5 2.69 0.39 1.82 4.32 1.04
## 6 3.39 0.34 1.97 6.75 1.05
## OD280_OD315 Proline
## 1 3.92 1065
## 2 3.40 1050
## 3 3.17 1185
## 4 3.45 1480
## 5 2.93 735
## 6 2.85 1450
names(wine)
## [1] "Type" "Alcohol" "Malic_Acid"
## [4] "Ash" "Ash_Alcalinity" "Magnesium"
## [7] "Total_Phenols" "Flavanoids" "Nonflavanoid_Phenols"
## [10] "Proanthocyanins" "Color_Intensity" "Hue"
## [13] "OD280_OD315" "Proline"
# 2. PCA raw data -------------------------------------------------
# The result is a list.
# centre and scale are the mean and standard deviation of the variables used for normalisation prior to PCA
# rotation is the principal component loading.
# x is a matrix with the principal component score vectors.
wine_pca <- prcomp(wine[,-c(1)])
summary(wine_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 314.9632 13.13527 3.07215 2.23409 1.10853 0.91710
## Proportion of Variance 0.9981 0.00174 0.00009 0.00005 0.00001 0.00001
## Cumulative Proportion 0.9981 0.99983 0.99992 0.99997 0.99998 0.99999
## PC7 PC8 PC9 PC10 PC11 PC12 PC13
## Standard deviation 0.5282 0.3891 0.3348 0.2678 0.1938 0.1452 0.09057
## Proportion of Variance 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.00000
## Cumulative Proportion 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.00000
wine_pca$rot[,1:4]
## PC1 PC2 PC3 PC4
## Alcohol -0.0016592647 -1.203406e-03 -0.016873809 0.141446778
## Malic_Acid 0.0006810156 -2.154982e-03 -0.122003373 0.160389543
## Ash -0.0001949057 -4.593693e-03 -0.051987430 -0.009772810
## Ash_Alcalinity 0.0046713006 -2.645039e-02 -0.938593003 -0.330965260
## Magnesium -0.0178680075 -9.993442e-01 0.029780248 -0.005393756
## Total_Phenols -0.0009898297 -8.779622e-04 0.040484644 -0.074584656
## Flavanoids -0.0015672883 5.185073e-05 0.085443339 -0.169086724
## Nonflavanoid_Phenols 0.0001230867 1.354479e-03 -0.013510780 0.010805561
## Proanthocyanins -0.0006006078 -5.004400e-03 0.024659382 -0.050120952
## Color_Intensity -0.0023271432 -1.510035e-02 -0.291398464 0.878893693
## Hue -0.0001713800 7.626731e-04 0.025977662 -0.060034945
## OD280_OD315 -0.0007049316 3.495364e-03 0.070323969 -0.178200254
## Proline -0.9998229365 1.777381e-02 -0.004528682 -0.003112916
# Investigate Proline variable
summary(wine$Proline)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 278.0 500.5 673.5 746.9 985.0 1680.0
typeof(wine$Proline)
## [1] "integer"
library(pastecs)
## Warning: package 'pastecs' was built under R version 3.4.4
options(scipen = 100)
options(digits = 2)
stat.desc(wine$Proline)
## nbr.val nbr.null nbr.na min max
## 178.00 0.00 0.00 278.00 1680.00
## range sum median mean SE.mean
## 1402.00 132947.00 673.50 746.89 23.60
## CI.mean.0.95 var std.dev coef.var
## 46.58 99166.72 314.91 0.42
# 3. PCA on normalised data ---------------------------------------
# Identify numerical variables
names(wine)
## [1] "Type" "Alcohol" "Malic_Acid"
## [4] "Ash" "Ash_Alcalinity" "Magnesium"
## [7] "Total_Phenols" "Flavanoids" "Nonflavanoid_Phenols"
## [10] "Proanthocyanins" "Color_Intensity" "Hue"
## [13] "OD280_OD315" "Proline"
str(wine)
## 'data.frame': 178 obs. of 14 variables:
## $ Type : Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic_Acid : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Ash_Alcalinity : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Magnesium : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Total_Phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoid_Phenols: num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanthocyanins : num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color_Intensity : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ OD280_OD315 : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
# PCA using numerical variables only, normalised
wine_pca_nom <- prcomp(na.omit(wine[,-c(1,6,14)]), scale. = T)
summary(wine_pca_nom)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## Standard deviation 2.086 1.412 1.189 0.914 0.7954 0.7736 0.5983 0.5420
## Proportion of Variance 0.396 0.181 0.129 0.076 0.0575 0.0544 0.0325 0.0267
## Cumulative Proportion 0.396 0.577 0.705 0.781 0.8387 0.8931 0.9257 0.9524
## PC9 PC10 PC11
## Standard deviation 0.4971 0.4136 0.32466
## Proportion of Variance 0.0225 0.0155 0.00958
## Cumulative Proportion 0.9749 0.9904 1.00000
wine_pca_nom$rot
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Alcohol -0.080 0.568 -0.235 0.266 -0.324 0.057 -0.376
## Malic_Acid 0.276 0.224 0.073 -0.590 -0.493 0.364 -0.135
## Ash 0.045 0.352 0.622 0.268 -0.180 -0.177 0.055
## Ash_Alcalinity 0.236 -0.034 0.622 -0.142 0.166 -0.281 -0.245
## Total_Phenols -0.401 0.223 0.125 -0.040 0.033 0.120 0.432
## Flavanoids -0.438 0.148 0.135 -0.038 -0.013 0.037 0.201
## Nonflavanoid_Phenols 0.310 -0.043 0.218 0.360 0.146 0.772 0.228
## Proanthocyanins -0.322 0.173 0.111 -0.367 0.576 0.312 -0.423
## Color_Intensity 0.162 0.570 -0.176 0.145 0.368 -0.116 0.066
## Hue -0.335 -0.267 0.117 0.423 -0.160 0.173 -0.533
## OD280_OD315 -0.412 -0.042 0.167 -0.148 -0.276 0.047 0.176
## PC8 PC9 PC10 PC11
## Alcohol 0.4005 -0.21 0.304 0.027
## Malic_Acid -0.1606 0.29 -0.131 0.023
## Ash -0.5238 -0.25 0.043 -0.107
## Ash_Alcalinity 0.5669 0.20 0.041 0.075
## Total_Phenols 0.1396 0.47 0.331 -0.471
## Flavanoids -0.0232 0.15 0.029 0.837
## Nonflavanoid_Phenols 0.1868 -0.11 -0.021 0.090
## Proanthocyanins -0.1667 -0.26 0.085 -0.099
## Color_Intensity 0.0037 0.28 -0.604 -0.015
## Hue -0.1253 0.44 -0.258 -0.083
## OD280_OD315 0.3460 -0.42 -0.581 -0.185
# 4. Visualisations -------------------------------------------------------
# Plot PC1 vs PC2 with labels
library(plyr)
count(wine, "Type")
## Type freq
## 1 A 59
## 2 B 71
## 3 C 48
plot(x = wine_pca_nom$x[,1],
y = wine_pca_nom$x[,2],
xlab = "PC1",
ylab = "PC2")
text(x = wine_pca_nom$x[,1],
y = wine_pca_nom$x[,2],
labels = wine$Type,
col = ifelse(wine$Type == "A", "red",
ifelse(wine$Type == "B", "blue", "green")))
# Plot PC1 vs PC2 with legend
plot(x = wine_pca_nom$x[,1], y = wine_pca_nom$x[,2],
xlab = "PC1", ylab= "PC2",
main = "Wine PC1 vs PC2",
col=ifelse(wine$Type == "A", "red",
ifelse(wine$Type == "B", "blue", "green")))
legend(x="bottomleft", legend = levels(wine[, 1]),
col = c("red", "blue", "green"), pch=1)
head(wine_pca_nom$x)
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11
## 1 -2.8 1.40 -0.44 0.12 -0.3031 0.30 -0.158 0.27 -1.03 -0.40 -0.153
## 2 -2.0 -0.32 -1.94 0.31 -1.0003 0.13 0.934 -0.47 -0.26 -0.39 -0.010
## 3 -2.1 1.30 0.99 -0.61 0.6782 0.38 -0.386 -0.79 -0.42 -0.11 0.072
## 4 -2.8 2.81 -0.27 -0.17 0.0929 -0.13 0.859 0.44 0.14 0.23 -0.409
## 5 -0.8 0.89 1.83 0.29 -0.4798 0.19 0.079 -0.44 -0.13 0.31 -0.102
## 6 -2.2 1.84 -0.61 0.77 0.0081 0.51 0.365 -0.10 0.23 0.35 0.120
head(wine)
## Type Alcohol Malic_Acid Ash Ash_Alcalinity Magnesium Total_Phenols
## 1 A 14 1.7 2.4 16 127 2.8
## 2 A 13 1.8 2.1 11 100 2.6
## 3 A 13 2.4 2.7 19 101 2.8
## 4 A 14 1.9 2.5 17 113 3.8
## 5 A 13 2.6 2.9 21 118 2.8
## 6 A 14 1.8 2.4 15 112 3.3
## Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue
## 1 3.1 0.28 2.3 5.6 1.04
## 2 2.8 0.26 1.3 4.4 1.05
## 3 3.2 0.30 2.8 5.7 1.03
## 4 3.5 0.24 2.2 7.8 0.86
## 5 2.7 0.39 1.8 4.3 1.04
## 6 3.4 0.34 2.0 6.8 1.05
## OD280_OD315 Proline
## 1 3.9 1065
## 2 3.4 1050
## 3 3.2 1185
## 4 3.4 1480
## 5 2.9 735
## 6 2.8 1450
str(wine_pca_nom$x)
## num [1:178, 1:11] -2.752 -2.047 -2.104 -2.822 -0.801 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:178] "1" "2" "3" "4" ...
## ..$ : chr [1:11] "PC1" "PC2" "PC3" "PC4" ...
# 5. Using ggplot ---------------------------------------------------------
# Convert matrix x to a data frame
wine_weights <- as.data.frame(wine_pca_nom$x)
head(wine_weights)
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11
## 1 -2.8 1.40 -0.44 0.12 -0.3031 0.30 -0.158 0.27 -1.03 -0.40 -0.153
## 2 -2.0 -0.32 -1.94 0.31 -1.0003 0.13 0.934 -0.47 -0.26 -0.39 -0.010
## 3 -2.1 1.30 0.99 -0.61 0.6782 0.38 -0.386 -0.79 -0.42 -0.11 0.072
## 4 -2.8 2.81 -0.27 -0.17 0.0929 -0.13 0.859 0.44 0.14 0.23 -0.409
## 5 -0.8 0.89 1.83 0.29 -0.4798 0.19 0.079 -0.44 -0.13 0.31 -0.102
## 6 -2.2 1.84 -0.61 0.77 0.0081 0.51 0.365 -0.10 0.23 0.35 0.120
# Combine matrix x to the original data set
wine_weights_PC <- cbind(wine, wine_weights)
head(wine_weights_PC)
## Type Alcohol Malic_Acid Ash Ash_Alcalinity Magnesium Total_Phenols
## 1 A 14 1.7 2.4 16 127 2.8
## 2 A 13 1.8 2.1 11 100 2.6
## 3 A 13 2.4 2.7 19 101 2.8
## 4 A 14 1.9 2.5 17 113 3.8
## 5 A 13 2.6 2.9 21 118 2.8
## 6 A 14 1.8 2.4 15 112 3.3
## Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue
## 1 3.1 0.28 2.3 5.6 1.04
## 2 2.8 0.26 1.3 4.4 1.05
## 3 3.2 0.30 2.8 5.7 1.03
## 4 3.5 0.24 2.2 7.8 0.86
## 5 2.7 0.39 1.8 4.3 1.04
## 6 3.4 0.34 2.0 6.8 1.05
## OD280_OD315 Proline PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## 1 3.9 1065 -2.8 1.40 -0.44 0.12 -0.3031 0.30 -0.158 0.27
## 2 3.4 1050 -2.0 -0.32 -1.94 0.31 -1.0003 0.13 0.934 -0.47
## 3 3.2 1185 -2.1 1.30 0.99 -0.61 0.6782 0.38 -0.386 -0.79
## 4 3.4 1480 -2.8 2.81 -0.27 -0.17 0.0929 -0.13 0.859 0.44
## 5 2.9 735 -0.8 0.89 1.83 0.29 -0.4798 0.19 0.079 -0.44
## 6 2.8 1450 -2.2 1.84 -0.61 0.77 0.0081 0.51 0.365 -0.10
## PC9 PC10 PC11
## 1 -1.03 -0.40 -0.153
## 2 -0.26 -0.39 -0.010
## 3 -0.42 -0.11 0.072
## 4 0.14 0.23 -0.409
## 5 -0.13 0.31 -0.102
## 6 0.23 0.35 0.120
summary(wine_weights_PC$Type)
## A B C
## 59 71 48
library(ggplot2)
ggplot() + geom_point(data = wine_weights_PC, aes(x = PC1,
y = PC2, color = Type))
# 6. Other plots/analyses -------------------------------------------------
biplot(wine_pca_nom, scale = 0)
# compute standard deviation of each principal component
std_dev <- wine_pca_nom$sdev
# compute variance
pca_var <- std_dev^2
# check variance
pca_var
## [1] 4.35 1.99 1.41 0.84 0.63 0.60 0.36 0.29 0.25 0.17 0.11
# scree plot
plot(pca_var, xlab = "Principal Component",
ylab = "Proportion of variance explained",
type = "b")
# cummulative plot
plot(cumsum(pca_var), xlab = "Principal Component",
ylab = "Cumulative proportion of variance explained",
type = "b")