more on Mona PCA

friendly · Dec 10, 2023 · a9cf61f · a9cf61f
1 parent cd0d061
commit a9cf61f
Show file tree

Hide file tree

Showing 30 changed files with 224 additions and 66 deletions.
diff --git a/04-pca-biplot.qmd b/04-pca-biplot.qmd
@@ -875,11 +875,13 @@ pixels in B/W and 1200K pixels in color.
 The uses here include
 
 * image compression:  a process applied to a graphics file to minimize its size in bytes for storage or transmission, without degrading image quality below an acceptable threshold
-* image reconstruction: 
+* image enhancement: improving the quality of an image, with applications in Computer Vision tasks, remote sensing, and satellite imagery. 
 * facial recognition: classifying or matching a facial image against a large corpus of stored images.
 
-As an example, consider the black and white version of the Mona Lisa shown in @fig-MonaLisa,
-adapted from this [blog post](https://kieranhealy.org/blog/archives/2019/10/27/reconstructing-images-using-pca/)
+When PCA is used on facial images, you can think of the process as generating **eigenfaces**,
+a representation of the pixels in the image in terms of an eigenvalue decomposition.
+As an example, consider the black and white version of the Mona Lisa shown in @fig-MonaLisa.
+The idea and code for this is adapted from this [blog post](https://kieranhealy.org/blog/archives/2019/10/27/reconstructing-images-using-pca/)
 by Kieran Healy. 
 
 
@@ -896,6 +898,57 @@ The complete script for this example is contained in [PCA-MonaLisa.R](R/PCA-Mona
 
 **TODO**: Show the necessary parts, including the screeplot.
 
+An image can be imported using `imager::load.image()` which creates a `"cimg"` object, 
+a 4-dimensional array with dimensions
+named `x,y,z,c`. `x` and `y` are the usual spatial dimensions, `z` is a depth dimension (which would correspond to time in a movie), and `c` is a color dimension containing R, G, B values.
+```{r mona-load}
+library(imager)
+img <- imager::load.image("https://github.com/friendly/Vis-MLM-book/blob/master/images/MonaLisa-BW.jpg?raw=true")
+dim(img)
+```
+
+An `as.data.frame()` method converts this to a data frame with `x` and `y` coordinates.
+Each x-y pair is a location in the 640 by 954 pixel grid, and the `value` is a grayscale value ranging from zero to one.
+```{r mona-reshape-long}
+img_df_long <- as.data.frame(img)
+head(img_df_long)
+```
+
+However, to do a PCA we will need a matrix of data in wide format containing the grayscale pixel values.
+We can do this using `tidyr::pivot_wider()`, giving a result with 640 rows and 954 columns.
+```{r mona-reshape-wide}
+img_df <- pivot_wider(img_df_long, 
+                     names_from = y, 
+                     values_from = value) |>
+  select(-x)
+dim(img_df)
+```
+
+Mona's PCA is produced with `prcomp()`.
+```{r mona-pca}
+img_pca <- img_df |>
+  prcomp(scale = TRUE, center = TRUE)
+```
+
+Examine the eigenvalues:
+
+```{r mona-eig}
+img_pca |>
+  broom::tidy(matrix = "eigenvalues") |> head(10)
+```
+
+
+Make a scree plot:
+
+```{r}
+#| label: mona-screeplot
+#| fig-height: 4
+#| out-width: "100%"
+ggscreeplot(img_pca) +
+  scale_x_log10()
+```
+
+
 Finally, the recovered images, using 2, 3 , 4, 5, 10, 15, 20, 50, and 100 principal components are plotted
 using ggplot, giving @fig-mona-pca.
 ```{r}

diff --git a/R/PCA-MonaLisa.R b/R/PCA-MonaLisa.R
@@ -7,10 +7,12 @@
 library(imager)
 library(here)
 library(dplyr)
+library(tidyr)
 library(purrr)
 library(broom)
 library(ggplot2)
 library(gganimate)
+library(ggbiplot)
 
 img <- imager::load.image(here("images", "MonaLisa.jpg"))
 img <- imager::load.image("https://github.com/friendly/Vis-MLM-book/blob/master/images/MonaLisa.jpg?raw=true")
@@ -47,17 +49,27 @@ img_df <- tidyr::pivot_wider(img_df_long,
                              values_from = value)
 dim(img_df)
 
-img_pca <- img_df %>%
-  dplyr::select(-x) %>%
+img_pca <- img_df |>
+  dplyr::select(-x) |>
   prcomp(scale = TRUE, center = TRUE)
+
 
 pca_tidy <- tidy(img_pca, matrix = "pcs")
 
-pca_tidy %>%
+# variance proportions
+img_pca |>
+  broom::tidy(matrix = "eigenvalues") |> head(12)
+
+pca_tidy |>
     ggplot(aes(x = PC, y = percent)) +
     geom_line(linewidth = 2) +
     labs(x = "Principal Component", y = "Variance Explained") 
 
+ggscreeplot(img_pca) +
+  scale_x_log10()
+
+
+
 reverse_pca <- function(n_comp = 20, pca_object = img_pca){
   ## The pca_object is an object created by base R's prcomp() function.
 
@@ -86,12 +98,12 @@ reverse_pca <- function(n_comp = 20, pca_object = img_pca){
   colnames(recon_df) <- c("x", 1:(ncol(recon_df)-1))
 
   ## Return the data to long form 
-  recon_df_long <- recon_df %>%
+  recon_df_long <- recon_df |>
     tidyr::pivot_longer(cols = -x, 
                         names_to = "y", 
-                        values_to = "value") %>%
-    mutate(y = as.numeric(y)) %>%
-    arrange(y) %>%
+                        values_to = "value") |>
+    mutate(y = as.numeric(y)) |>
+    arrange(y) |>
     as.data.frame()
 
   recon_df_long
@@ -107,7 +119,7 @@ names(n_pcs) <- paste("First", n_pcs, "Components", sep = "_")
 ## map reverse_pca() 
 recovered_imgs <- map_dfr(n_pcs, 
                           reverse_pca, 
-                          .id = "pcs") %>%
+                          .id = "pcs") |>
   mutate(pcs = stringr::str_replace_all(pcs, "_", " "), 
          pcs = factor(pcs, levels = unique(pcs), ordered = TRUE))
 

diff --git a/bib/pkgs.bib b/bib/pkgs.bib
@@ -38,7 +38,7 @@ @Manual{R-car
   title = {car: Companion to Applied Regression},
   author = {John Fox and Sanford Weisberg and Brad Price},
   year = {2023},
-  note = {R package version 3.1-3},
+  note = {R package version 3.1-4},
   url = {https://r-forge.r-project.org/projects/car/},
 }
 
@@ -82,6 +82,14 @@ @Manual{R-corrplot
   url = {https://github.com/taiyun/corrplot},
 }
 
+@Manual{R-datasauRus,
+  title = {datasauRus: Datasets from the Datasaurus Dozen},
+  author = {Rhian Davies and Steph Locke and Lucy {D'Agostino McGowan}},
+  year = {2022},
+  note = {R package version 0.1.6},
+  url = {https://github.com/jumpingrivers/datasauRus},
+}
+
 @Manual{R-datawizard,
   title = {datawizard: Easy Data Wrangling and Statistical Transformations},
   author = {Indrajeet Patil and Etienne Bacher and Dominique Makowski and Daniel Lüdecke and Mattan S. Ben-Shachar and Brenton M. Wiernik},
@@ -147,13 +155,6 @@ @Manual{R-forcats
   url = {https://forcats.tidyverse.org/},
 }
 
-@Manual{R-genridge,
-  title = {genridge: Generalized Ridge Trace Plots for Ridge Regression},
-  author = {Michael Friendly},
-  year = {2023},
-  note = {R package version 0.7.0},
-  url = {https://friendly.github.io/genridge/},
-}
 
 @Manual{R-geomtextpath,
   title = {geomtextpath: Curved Text in ggplot2},
@@ -208,7 +209,7 @@ @Manual{R-heplots
   title = {heplots: Visualizing Hypothesis Tests in Multivariate Linear Models},
   author = {Michael Friendly and John Fox and Georges Monette},
   year = {2023},
-  note = {R package version 1.6.0},
+  note = {R package version 1.6.1},
   url = {http://friendly.github.io/heplots/},
 }
 

diff --git a/child/neuro.qmd b/child/neuro.qmd
@@ -116,7 +116,7 @@ about the relations among variables.
 A corrgram [@Friendly:02:corrgram] provides a useful reconnaisance plot of the bivariate correlations in the dataset. It suppresses details, and allows focus on the overall pattern.
 The `corrgram::corrgram()` function has the ability to enhance perception by
 permuting the variables in the order of their variable vectors in a biplot, so more highly correlated variables are adjacent in the plot, and example of _effect ordering_ for
-data displays [@FriendlyKwan:02:effect].
+data displays [@FriendlyKwan:03:effect].
 
 The plot below includes all variables except for `Dx` group.
 There are a number of `panel.*` functions for choosing how the correlation for each pair is