# Principal Component Analysis
# Load packages
```{r}
library(dplyr)
library(ggplot2)  # used for the plots throughout this file
library(PCAmixdata)
```
# Load data
Reimport the heart disease dataset.
```{r}
load("data/preprocessed.RData")
```
# Overview
## Unsupervised approaches
Unlike supervised approaches, we are not trying to predict the value of a target variable. Instead, the value of unsupervised machine learning is to see how the data separate based solely on the nature of their features. This is a major advantage: we can include all of the data at once and simply see how they sort! Unsupervised approaches are also useful for preprocessing inputs to other machine learning algorithms.
Principal component analysis (PCA) is a linear transformation technique for exploring patterns in data with many, often highly correlated, variables. It is useful for distilling the variation across many variables onto a reduced feature space, such as a two-dimensional scatterplot.
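To build intuition before we touch the heart disease data, here is a minimal sketch on simulated data (the `toy` object and its columns are made up for illustration). When the data are standardized, the components that `prcomp()` returns are the eigenvectors of the correlation matrix:
```{r}
# A minimal sketch of what PCA does, using simulated data
set.seed(1)
toy = data.frame(x = rnorm(100))
toy$y = toy$x + rnorm(100, sd = 0.3)  # y is highly correlated with x

# Rotate the data onto orthogonal axes of decreasing variance
toy_pca = prcomp(toy, center = TRUE, scale. = TRUE)

# The rotation matrix matches the eigenvectors of the correlation matrix
# (columns may differ by sign flips, which do not matter)
toy_pca$rotation
eigen(cor(toy))$vectors

# Most of the variance lands on the first component
summary(toy_pca)
```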
## Reclass variables
dplyr makes it easy to change the classes of multiple features at once :^)
## Scale numeric variables
```{r}
vars_to_scale = c("age", "trestbps", "chol", "thalach", "oldpeak")

# scale() returns a one-column matrix, so wrap it in as.numeric() to keep
# these features as plain numeric vectors
h = data_original %>%
  mutate(across(all_of(vars_to_scale), ~ as.numeric(scale(.x))))

head(h)
```
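As a sanity check, `scale()` with its defaults standardizes each column to mean 0 and standard deviation 1. A minimal sketch using `age`, one of the columns scaled above:
```{r}
# scale() subtracts the column mean and divides by the standard deviation
manual_age = (data_original$age - mean(data_original$age)) / sd(data_original$age)
all.equal(h$age, manual_age)

# All scaled columns should now have mean ~0 and sd 1
round(colMeans(h[vars_to_scale]), 10)
sapply(h[vars_to_scale], sd)
```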
## Factorize categorical variables
```{r}
# Quick rename of the target outcomes
h$target = ifelse(h$target == 1, "yes", "no")

vars_to_fac = c("sex", "cp", "fbs", "restecg", "exang",
                "slope", "ca", "thal", "target")
h = h %>% mutate(across(all_of(vars_to_fac), as.factor))

# Awesome!
sapply(h, class)

# Create a subset of the numeric-only data: combine the scaled numeric
# features with the target feature (the target column will be named "h.target")
ml_num = data.frame(subset(h, select = vars_to_scale), h$target)
head(ml_num)
```
# Fit model
```{r}
# splitmix() separates the quantitative and qualitative variables
# (note that this includes the target itself, so this PCA is exploratory)
split = splitmix(h)
X1 = split$X.quanti
X2 = split$X.quali

res.pcamix = PCAmix(X.quanti = X1,
                    X.quali = X2,
                    rename.level = TRUE,
                    graph = TRUE)

# Lots to unpack - inspect the pieces of the fitted model
names(res.pcamix)

# Eigenvalues and the percentage of variance explained per dimension
res.pcamix$eig
```
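A common question is how many dimensions to keep. One rule of thumb is to keep the first dimensions that cross some cumulative-variance threshold. A minimal sketch, assuming the third column of `res.pcamix$eig` is the cumulative percentage as in the printout above (the 70% threshold is an arbitrary illustrative choice):
```{r}
# How many dimensions explain at least 70% of the variance?
# (third column of eig = cumulative percentage of variance)
cum_var = res.pcamix$eig[, 3]
cum_var
which(cum_var >= 70)[1]
```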
## Screeplot
```{r}
# Percentage of variance explained by each dimension
barplot(res.pcamix$eig[, 2],
        ylim = c(0, 20), las = 2)
```
## ggplot coordinates
```{r}
# ?plot.PCAmix

# Convert the individual coordinates to a data frame and add the target column
pca1 = data.frame(res.pcamix$ind$coord, h$target)

ggplot(pca1, aes(x = dim.1, y = dim.2, color = h.target)) +
  geom_point() +
  theme_bw() +
  guides(color = guide_legend(title = "Has heart\ndisease?")) +
  ggtitle("PCA of heart disease") +
  xlab(paste0("Dimension 1 (", round(res.pcamix$eig[1, 2], 2), "%)")) +
  ylab(paste0("Dimension 2 (", round(res.pcamix$eig[2, 2], 2), "%)"))
```
## View squared loadings
```{r}
pca2 = data.frame(res.pcamix$sqload)
pca2

# Dimension 1
ggplot(pca2, aes(x = reorder(rownames(pca2), -dim.1), y = dim.1)) +
  geom_col() +
  xlab(NULL) +
  theme_bw() + ggtitle("Dimension 1 loadings") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Dimension 2
ggplot(pca2, aes(x = reorder(rownames(pca2), -dim.2), y = dim.2)) +
  geom_col() +
  xlab(NULL) +
  theme_bw() + ggtitle("Dimension 2 loadings") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
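Per the PCAmixdata documentation, `sqload` contains squared correlations for the quantitative variables and correlation ratios for the qualitative ones. A quick sanity check for one quantitative variable (`age`), assuming the row/column names shown in the printout above:
```{r}
# For a quantitative variable, the squared loading on dimension 1 should
# equal its squared correlation with the dimension 1 scores
cor(X1[, "age"], res.pcamix$ind$coord[, 1])^2
pca2["age", "dim.1"]
```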
# PCA for Machine Learning
Create a stratified 70/30 training/test split.
```{r}
# Set seed for reproducibility
set.seed(1)
# Create a stratified random split
training_rows = caret::createDataPartition(ml_num$h.target,
p = 0.70, list = FALSE)
# Partition training dataset
train_x_class = ml_num[training_rows, ]
# Partition test dataset
test_x_class = ml_num[-training_rows, ]
dim(train_x_class)
dim(test_x_class)
table(train_x_class$h.target)
table(test_x_class$h.target)
```
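Since `createDataPartition()` performs stratified sampling on the outcome, the class balance should be roughly preserved in both partitions. A quick check:
```{r}
# Class proportions should be similar in the full, training, and test sets
round(prop.table(table(ml_num$h.target)), 3)
round(prop.table(table(train_x_class$h.target)), 3)
round(prop.table(table(test_x_class$h.target)), 3)
```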
## Fit a PCA model to the training set (numeric features only)
```{r}
# ?prcomp

# center and scale. are FALSE because these features were already
# standardized with scale() earlier
pca_ml = prcomp(subset(train_x_class, select = -h.target),
                retx = TRUE,
                center = FALSE, scale. = FALSE)
pca_ml

# View the percentage of variance explained
summary(pca_ml)

# or compute it directly from the component standard deviations
expl.var = round(pca_ml$sdev ^ 2 / sum(pca_ml$sdev ^ 2) * 100, 4)
expl.var
```
## Generate predicted values of PCs for test dataset
```{r}
# Project the test observations onto the PCs learned from the training set
predicted_values = predict(pca_ml,
                           newdata = subset(test_x_class, select = -h.target))
head(predicted_values)
```
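Because we fit the PCA with `center = FALSE` and `scale. = FALSE`, `predict()` here reduces to multiplying the test matrix by the rotation (loadings) matrix. A quick check of that claim:
```{r}
# With no centering or scaling, projection is just a matrix product
manual_proj = as.matrix(subset(test_x_class, select = -h.target)) %*% pca_ml$rotation
all.equal(unname(manual_proj), unname(predicted_values))
```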
## Define plotting parameters
```{r}
# Assign one color to each condition
target_colors = 1:2

# Assign one shape to the training data and another to the test data
# (shape 0 = open square, shape 16 = filled circle)
target_shapes = c(0, 16)
# Squares = training data
# Circles = test data

# Which PCs to plot?
target_PCs = 1:2
```
## Store the scores inside of dataframes
```{r}
# Put the scores into data frames as before, keeping the target for plotting
gg_train = data.frame(pca_ml$x[, target_PCs], target = train_x_class$h.target)
head(gg_train)

gg_test = data.frame(predicted_values[, target_PCs], target = test_x_class$h.target)
head(gg_test)
```
## Visualize
We can plot the training and test data on the same plot!
```{r}
ggplot(gg_train, aes(x = PC1, y = PC2, color = target)) +
  # Training data: open squares, with 95% confidence ellipses
  geom_point(shape = 0, alpha = 0.5, stroke = 1, size = 3) +
  stat_ellipse(show.legend = FALSE, lwd = 0.5) +
  labs(color = "Has heart disease?",
       caption = "Squares = training data; circles = test data.
Ellipses are 95% confidence ellipses for the training data.") +
  xlab("Dimension 1") +
  ylab("Dimension 2") +
  xlim(c(-4, 4)) +
  ylim(c(-4, 4)) +
  theme_bw() +
  # Test data: filled circles
  geom_point(data = gg_test,
             mapping = aes(x = PC1, y = PC2, color = target),
             shape = 16, size = 3, alpha = 0.75) +
  theme(legend.position = "top") +
  ggtitle("Heart disease training/test data") +
  theme(plot.title = element_text(hjust = 0.5, size = 10),
        legend.title = element_text(size = 10),
        legend.text = element_text(size = 10))
```
## Save `ml_num` for use in 09-hclust.Rmd
```{r}
save(ml_num, file = "data/unsupervised.RData")
```