# Principal Component Analysis
# Load packages
```{r}
library(dplyr)
library(ggplot2)  # used for the plots throughout this file
library(PCAmixdata)
```
# Load data
Reimport the heart disease dataset.
```{r}
load("data/preprocessed.RData")
```
# Overview
## Unsupervised approaches
Unlike supervised approaches, we are not trying to predict the value of a target variable. Instead, the value of unsupervised machine learning is to see how the data separate based solely on the nature of their features. This is a major advantage: we can include all of the data at once and simply see how they sort! Unsupervised approaches are also useful for preprocessing inputs to other machine learning algorithms.
Principal component analysis (PCA) is a linear transformation technique for exploring patterns in data with many, often highly correlated, variables. It is useful for distilling the variation across many variables onto a reduced feature space, such as a two-dimensional scatterplot.
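To build intuition before we touch the heart disease data, here is a minimal sketch on simulated data (the `toy` object and its columns are made up for illustration). When the data are standardized, the components that `prcomp()` returns are the eigenvectors of the correlation matrix:
```{r}
# A minimal sketch of what PCA does, using simulated data
set.seed(1)
toy = data.frame(x = rnorm(100))
toy$y = toy$x + rnorm(100, sd = 0.3)  # y is highly correlated with x

# Rotate the data onto orthogonal axes of decreasing variance
toy_pca = prcomp(toy, center = TRUE, scale. = TRUE)

# The rotation matrix matches the eigenvectors of the correlation matrix
# (columns may differ by sign flips, which do not matter)
toy_pca$rotation
eigen(cor(toy))$vectors

# Most of the variance lands on the first component
summary(toy_pca)
```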
## Reclass variables
dplyr makes it easy to change the classes of multiple features at once :^)
## Scale numeric variables
```{r}
vars_to_scale = c("age", "trestbps", "chol", "thalach", "oldpeak")

# scale() returns a one-column matrix, so wrap it in as.numeric() to keep
# these features as plain numeric vectors
h = data_original %>%
  mutate(across(all_of(vars_to_scale), ~ as.numeric(scale(.x))))

head(h)
```
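As a sanity check, `scale()` with its defaults standardizes each column to mean 0 and standard deviation 1. A minimal sketch using `age`, one of the columns scaled above:
```{r}
# scale() subtracts the column mean and divides by the standard deviation
manual_age = (data_original$age - mean(data_original$age)) / sd(data_original$age)
all.equal(h$age, manual_age)

# All scaled columns should now have mean ~0 and sd 1
round(colMeans(h[vars_to_scale]), 10)
sapply(h[vars_to_scale], sd)
```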
## Factorize categorical variables
```{r}
# Quick rename of the target outcomes
h$target = ifelse(h$target == 1, "yes", "no")

vars_to_fac = c("sex", "cp", "fbs", "restecg", "exang",
                "slope", "ca", "thal", "target")
h = h %>% mutate(across(all_of(vars_to_fac), as.factor))

# Awesome!
sapply(h, class)

# Create a subset of the numeric-only data: combine the scaled numeric
# features with the target feature (the target column will be named "h.target")
ml_num = data.frame(subset(h, select = vars_to_scale), h$target)
head(ml_num)
```
# Fit model
```{r}
# splitmix() separates the quantitative and qualitative variables
# (note that this includes the target itself, so this PCA is exploratory)
split = splitmix(h)
X1 = split$X.quanti
X2 = split$X.quali

res.pcamix = PCAmix(X.quanti = X1,
                    X.quali = X2,
                    rename.level = TRUE,
                    graph = TRUE)

# Lots to unpack - inspect the pieces of the fitted model
names(res.pcamix)

# Eigenvalues and the percentage of variance explained per dimension
res.pcamix$eig
```
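A common question is how many dimensions to keep. One rule of thumb is to keep the first dimensions that cross some cumulative-variance threshold. A minimal sketch, assuming the third column of `res.pcamix$eig` is the cumulative percentage as in the printout above (the 70% threshold is an arbitrary illustrative choice):
```{r}
# How many dimensions explain at least 70% of the variance?
# (third column of eig = cumulative percentage of variance)
cum_var = res.pcamix$eig[, 3]
cum_var
which(cum_var >= 70)[1]
```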
## Screeplot
```{r}
# Percentage of variance explained by each dimension
barplot(res.pcamix$eig[, 2],
        ylim = c(0, 20), las = 2)
```
## ggplot coordinates
```{r}
# ?plot.PCAmix

# Convert the individual coordinates to a data frame and add the target column
pca1 = data.frame(res.pcamix$ind$coord, h$target)

ggplot(pca1, aes(x = dim.1, y = dim.2, color = h.target)) +
  geom_point() +
  theme_bw() +
  guides(color = guide_legend(title = "Has heart\ndisease?")) +
  ggtitle("PCA of heart disease") +
  xlab(paste0("Dimension 1 (", round(res.pcamix$eig[1, 2], 2), "%)")) +
  ylab(paste0("Dimension 2 (", round(res.pcamix$eig[2, 2], 2), "%)"))
```
## View squared loadings
```{r}
pca2 = data.frame(res.pcamix$sqload)
pca2

# Dimension 1
ggplot(pca2, aes(x = reorder(rownames(pca2), -dim.1), y = dim.1)) +
  geom_col() +
  xlab(NULL) +
  theme_bw() + ggtitle("Dimension 1 loadings") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Dimension 2
ggplot(pca2, aes(x = reorder(rownames(pca2), -dim.2), y = dim.2)) +
  geom_col() +
  xlab(NULL) +
  theme_bw() + ggtitle("Dimension 2 loadings") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
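Per the PCAmixdata documentation, `sqload` contains squared correlations for the quantitative variables and correlation ratios for the qualitative ones. A quick sanity check for one quantitative variable (`age`), assuming the row/column names shown in the printout above:
```{r}
# For a quantitative variable, the squared loading on dimension 1 should
# equal its squared correlation with the dimension 1 scores
cor(X1[, "age"], res.pcamix$ind$coord[, 1])^2
pca2["age", "dim.1"]
```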
# PCA for Machine Learning
Create a stratified 70/30 training/test split.
```{r}
# Set seed for reproducibility
set.seed(1)
# Create a stratified random split
training_rows = caret::createDataPartition(ml_num$h.target,
p = 0.70, list = FALSE)
# Partition training dataset
train_x_class = ml_num[training_rows, ]
# Partition test dataset
test_x_class = ml_num[-training_rows, ]
dim(train_x_class)
dim(test_x_class)
table(train_x_class$h.target)
table(test_x_class$h.target)
```
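Since `createDataPartition()` performs stratified sampling on the outcome, the class balance should be roughly preserved in both partitions. A quick check:
```{r}
# Class proportions should be similar in the full, training, and test sets
round(prop.table(table(ml_num$h.target)), 3)
round(prop.table(table(train_x_class$h.target)), 3)
round(prop.table(table(test_x_class$h.target)), 3)
```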
## Fit a PCA model to the training set (numeric features only)
```{r}
# ?prcomp

# center and scale. are FALSE because these features were already
# standardized with scale() earlier
pca_ml = prcomp(subset(train_x_class, select = -h.target),
                retx = TRUE,
                center = FALSE, scale. = FALSE)
pca_ml

# View the percentage of variance explained
summary(pca_ml)

# or compute it directly from the component standard deviations
expl.var = round(pca_ml$sdev ^ 2 / sum(pca_ml$sdev ^ 2) * 100, 4)
expl.var
```
## Generate predicted values of PCs for test dataset
```{r}
# Project the test observations onto the PCs learned from the training set
predicted_values = predict(pca_ml,
                           newdata = subset(test_x_class, select = -h.target))
head(predicted_values)
```
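Because we fit the PCA with `center = FALSE` and `scale. = FALSE`, `predict()` here reduces to multiplying the test matrix by the rotation (loadings) matrix. A quick check of that claim:
```{r}
# With no centering or scaling, projection is just a matrix product
manual_proj = as.matrix(subset(test_x_class, select = -h.target)) %*% pca_ml$rotation
all.equal(unname(manual_proj), unname(predicted_values))
```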
## Define plotting parameters
```{r}
# Assign one color to each condition
target_colors = 1:2

# Assign one shape to the training data and another to the test data
# (shape 0 = open square, shape 16 = filled circle)
target_shapes = c(0, 16)
# Squares = training data
# Circles = test data

# Which PCs to plot?
target_PCs = 1:2
```
## Store the scores inside of dataframes
```{r}
# Put the scores into data frames as before, keeping the target for plotting
gg_train = data.frame(pca_ml$x[, target_PCs], target = train_x_class$h.target)
head(gg_train)

gg_test = data.frame(predicted_values[, target_PCs], target = test_x_class$h.target)
head(gg_test)
```
## Visualize
We can plot the training and test data on the same plot!
```{r}
ggplot(gg_train, aes(x = PC1, y = PC2, color = target)) +
  # Training data: open squares, with 95% confidence ellipses
  geom_point(shape = 0, alpha = 0.5, stroke = 1, size = 3) +
  stat_ellipse(show.legend = FALSE, lwd = 0.5) +
  labs(color = "Has heart disease?",
       caption = "Squares = training data; circles = test data.
Ellipses are 95% confidence ellipses for the training data.") +
  xlab("Dimension 1") +
  ylab("Dimension 2") +
  xlim(c(-4, 4)) +
  ylim(c(-4, 4)) +
  theme_bw() +
  # Test data: filled circles
  geom_point(data = gg_test,
             mapping = aes(x = PC1, y = PC2, color = target),
             shape = 16, size = 3, alpha = 0.75) +
  theme(legend.position = "top") +
  ggtitle("Heart disease training/test data") +
  theme(plot.title = element_text(hjust = 0.5, size = 10),
        legend.title = element_text(size = 10),
        legend.text = element_text(size = 10))
```
## Save `ml_num` for use in 09-hclust.Rmd
```{r}
save(ml_num, file = "data/unsupervised.RData")
```