
Commit

build Ch11
friendly committed Dec 20, 2024
1 parent ad3544d commit 130036a
Showing 44 changed files with 8,037 additions and 437 deletions.
7 changes: 5 additions & 2 deletions 11-mlm-viz.qmd
@@ -117,7 +117,7 @@ I refer to this as _effect size scaling_, because it is similar to an effect size
univariate models, e.g., $ES = (\bar{y}_1 - \bar{y}_2) / s_e$ in a two-group, univariate design.

This is illustrated in ...
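For a quick numeric sketch of the univariate version (illustrative only; it simply evaluates the $ES$ formula above for two iris species, taking the pooled within-group standard deviation as $s_e$):

```{r}
# Univariate effect size, ES = (ybar1 - ybar2) / s_e, for two iris species
y1 <- iris$Sepal.Length[iris$Species == "setosa"]
y2 <- iris$Sepal.Length[iris$Species == "versicolor"]
# pooled within-group SD used as s_e
s_e <- sqrt(((length(y1) - 1) * var(y1) + (length(y2) - 1) * var(y2)) /
              (length(y1) + length(y2) - 2))
(ES <- (mean(y1) - mean(y2)) / s_e)
```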

<!--
```{r}
op <- par(mar = c(4, 4, 1, 1) + .5,
          mfrow = c(1, 2))
@@ -133,12 +133,15 @@ covEllipses(cbind(Sepal.Length, Sepal.Width) ~ Species, data=iris,
            label.pos = c(3, 1, 3, 0),
            xlim = c(4,8), ylim = c(2,4))
iris.mod <- lm(cbind(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) ~
                 Species, data=iris)
heplot(iris.mod, size = "effect",
       cex = 1.5, cex.lab = 1.5,
       fill=TRUE, fill.alpha=c(0.3,0.1),
       xlim = c(4,8), ylim = c(2,4))
par(op)
```
-->


The geometry of ellipsoids and multivariate tests allow us to go further with another re-scaling of the $\mat{H}$ ellipsoid
@@ -147,7 +150,7 @@ This is done simply by dividing $\mat{H} / df_e$ further
by the $\alpha$-critical value of the corresponding test statistic to show the strength of evidence against
the null hypothesis.
Among the various multivariate test statistics,
Roy's maximum root test, based on the largest eigenvalue $\lambda_1$ of $\mat{H} \mat{E}^{-1},
Roy's maximum root test, based on the largest eigenvalue $\lambda_1$ of $\mat{H} \mat{E}^{-1}$,
gives $\mat{H} / (\lambda_\alpha df_e)$
which has the visual property that the
scaled $\mat{H}$ ellipsoid will protrude _somewhere_ outside the standard $\mat{E}$ ellipsoid if and only if
the multivariate test is significant at level $\alpha$.
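A minimal sketch of these two scalings side by side, assuming the **heplots** package and the `iris.mod` model from the chunk above (`heplot()` accepts `size = "effect"` and `size = "evidence"`):

```{r}
library(heplots)
# multivariate model for the iris data, as in the chunk above
iris.mod <- lm(cbind(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) ~ Species,
               data = iris)
op <- par(mfrow = c(1, 2), mar = c(4, 4, 1, 1) + .5)
# effect-size scaling: H / df_e
heplot(iris.mod, size = "effect",   fill = TRUE, fill.alpha = c(0.3, 0.1))
# significance (evidence) scaling: H / (lambda_alpha * df_e);
# the H ellipse protrudes beyond E iff Roy's test is significant at level alpha
heplot(iris.mod, size = "evidence", fill = TRUE, fill.alpha = c(0.3, 0.1))
par(op)
```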
162 changes: 162 additions & 0 deletions R/digits-tSNE.R
@@ -0,0 +1,162 @@
# R tSNE with 3D plots
# https://www.appsilon.com/post/r-tsne

library(dplyr)
library(Rtsne)
library(ggplot2)
library(plotly)

# https://github.com/pjreddie/mnist-csv-png?tab=readme-ov-file
digits <- read.csv("mnist_train.csv", header = FALSE)
colnames(digits) <- c("digit", paste0("pixel", 1:784))

# print first row as a matrix
first_digit <- matrix(digits[1, ] |>
                        select(-digit) |>
                        unlist(),
                      nrow = 28, ncol = 28, byrow = TRUE)
first_digit

# visualize the digit
rotate <- function(x) t(apply(x, 2, rev))
image(rotate(first_digit), col = (gray(255:0 / 255)))

# Or, if you want to get a glimpse into a larger portion of the dataset, run the following snippet:
par(mfrow = c(5, 5))

for (i in sample(1:nrow(digits), 25)) {
  digit_matrix <- matrix(digits[i, ] |>
                           select(-digit) |>
                           unlist(),
                         nrow = 28, ncol = 28, byrow = TRUE)
  image(rotate(digit_matrix), col = gray(255:0 / 255), axes = FALSE, xlab = "", ylab = "")
}
par(mfrow = c(1, 1))

# That’s too much to include in a single chart, so we’ll reduce the per-class sample size to 100:
data_sample <- digits |>
  group_by(digit) |>
  sample_n(100) |>
  ungroup()

data_sample |>
  group_by(digit) |>
  count()

# let’s also make a feature/target split:
X <- data_sample |> select(-digit)
y <- data_sample |> select(digit)

# Run tSNE

tsne_results <- Rtsne(X, dims = 2, perplexity = 25, verbose = TRUE, max_iter = 1500)

tsne_df <- data.frame(
  X = tsne_results$Y[, 1],
  Y = tsne_results$Y[, 2],
  digit = y$digit
)
colors <- c("#E6194B", "#3CB44B", "#FFE119", "#4363D8", "#F58231", "#911EB4", "#46F0F0", "#F032E6", "#BCF60C", "#FABEBE")

ggplot(tsne_df, aes(x = X, y = Y, color = factor(digit))) +
  geom_point(size = 1.5) +
  scale_color_manual(values = colors) +
  labs(
    title = "t-SNE 2-Dimensional Digit Visualization",
    x = "t-SNE Dimension 1",
    y = "t-SNE Dimension 2"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 20)
  )

# Plotting t-SNE Results in 3 Dimensions

# The 3-D run reuses the feature/target split (X and y) created above
tsne_results <- Rtsne(X, dims = 3, perplexity = 30, verbose = TRUE, max_iter = 1500)

tsne_df <- data.frame(
  X = tsne_results$Y[, 1],
  Y = tsne_results$Y[, 2],
  Z = tsne_results$Y[, 3],
  digit = factor(y$digit)
)
head(tsne_df)

colors <- c("#E6194B", "#3CB44B", "#FFE119", "#4363D8", "#F58231", "#911EB4", "#46F0F0", "#F032E6", "#BCF60C", "#FABEBE")
# "<br>" line breaks (assumed here) format the plotly hover tooltip
hover_text <- paste(
  "Digit:", tsne_df$digit, "<br>",
  "Dimension 1:", round(tsne_df$X, 3), "<br>",
  "Dimension 2:", round(tsne_df$Y, 3), "<br>",
  "Dimension 3:", round(tsne_df$Z, 3)
)

plot_ly(
  data = tsne_df,
  x = ~X,
  y = ~Y,
  z = ~Z,
  type = "scatter3d",
  mode = "markers",
  marker = list(size = 6),
  text = hover_text,
  hoverinfo = "text",
  color = ~digit,
  colors = colors
) |>
  layout(
    title = "t-SNE 3-Dimensional Digit Visualization",
    scene = list(
      xaxis = list(title = "t-SNE Dimension 1"),
      yaxis = list(title = "t-SNE Dimension 2"),
      zaxis = list(title = "t-SNE Dimension 3")
    )
  )

# Perplexity Tuning

# Unlike PCA, the results of t-SNE will often vary (sometimes significantly) because of the tweakable parameters and the nature of gradient descent.

# This section demonstrates how to tune the most important parameter, perplexity, and shows just how different the results can be.
# Values for this parameter typically range from 5 to 50, so we'll cover that entire range with a step size of 5.

library(gganimate)

perplexity_values <- c(5, 10, 15, 20, 25, 30, 35, 40, 45, 50)

tsne_results_list <- lapply(perplexity_values, function(perp) {
  tsne <- Rtsne(X, dims = 2, perplexity = perp, verbose = TRUE, max_iter = 1500)
  data.frame(
    X = tsne$Y[, 1],
    Y = tsne$Y[, 2],
    digit = y$digit,
    perplexity = perp
  )
})

tsne_df <- do.call(rbind, tsne_results_list)

plot <- ggplot(tsne_df, aes(x = X, y = Y, color = factor(digit))) +
  geom_point(size = 1.5) +
  scale_color_manual(values = colors) +
  labs(
    title = "t-SNE 2-Dimensional Digit Visualization",
    subtitle = "Perplexity: {closest_state}", # This will display the perplexity value
    x = "t-SNE Dimension 1",
    y = "t-SNE Dimension 2"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 20),
    plot.subtitle = element_text(size = 16)
  ) +
  transition_states(perplexity, transition_length = 2, state_length = 1) +
  ease_aes("linear")

animate(
  plot,
  width = 800,
  height = 600,
  res = 100,
  nframes = 300,
  fps = 30,
  renderer = gifski_renderer(file = "tsne-2d-animated.gif")
)

19 changes: 5 additions & 14 deletions bib/pkgs.bib
@@ -34,8 +34,7 @@ @Manual{R-broom
}

@Manual{R-candisc,
title = {candisc: Visualizing Generalized Canonical Discriminant and Canonical
Correlation Analysis},
title = {candisc: Visualizing Generalized Canonical Discriminant and Canonical Correlation Analysis},
author = {Michael Friendly and John Fox},
year = {2024},
note = {R package version 0.9.0},
@@ -135,7 +134,7 @@ @Manual{R-effectsize
title = {effectsize: Indices of Effect Size},
author = {Mattan S. Ben-Shachar and Dominique Makowski and Daniel Lüdecke and Indrajeet Patil and Brenton M. Wiernik and Rémi Thériault and Philip Waggoner},
year = {2024},
note = {R package version 0.8.9},
note = {R package version 1.0.0},
url = {https://easystats.github.io/effectsize/},
}

@@ -207,7 +206,7 @@ @Manual{R-ggbiplot
title = {ggbiplot: A Grammar of Graphics Implementation of Biplots},
author = {Vincent Q. Vu and Michael Friendly},
year = {2024},
note = {R package version 0.6.3},
note = {R package version 0.6.2},
url = {https://github.com/friendly/ggbiplot},
}

@@ -333,19 +332,11 @@ @Manual{R-lattice
url = {https://lattice.r-forge.r-project.org/},
}

@Manual{R-liminal,
title = {liminal: Multivariate Data Visualization with Tours and Embeddings},
author = {Stuart Lee},
year = {2021},
note = {R package version 0.1.2},
url = {https://github.com/sa-lee/liminal/},
}

@Manual{R-lubridate,
title = {lubridate: Make Dealing with Dates a Little Easier},
author = {Vitalie Spinu and Garrett Grolemund and Hadley Wickham},
year = {2023},
note = {R package version 1.9.3},
year = {2024},
note = {R package version 1.9.4},
url = {https://lubridate.tidyverse.org},
}

27 changes: 27 additions & 0 deletions bib/pkgs.txt
@@ -116,3 +116,30 @@ knitr
matlib
patchwork
tidyr
broom
car
carData
dplyr
ggplot2
heplots
knitr
tidyr
broom
candisc
car
carData
dplyr
ggplot2
heplots
knitr
tidyr
broom
candisc
car
carData
corrgram
dplyr
ggplot2
heplots
knitr
tidyr
4 changes: 2 additions & 2 deletions docs/01-intro.html
@@ -378,7 +378,7 @@ <h1 class="title"><span id="sec-introduction" class="quarto-section-identifier">
</section><section id="visualization-is-harder" class="level2" data-number="1.4"><h2 data-number="1.4" class="anchored" data-anchor-id="visualization-is-harder">
<span class="header-section-number">1.4</span> Visualization is harder</h2>
<p>However, with two or more response variables, visualizations for multivariate models are not as simple as they are for their univariate counterparts for understanding the effects of predictors, model parameters, or model diagnostics. Consequently, the results of such studies are often explored and discussed solely in terms of coefficients and significance, and visualizations of the relationships are only provided for one response variable at a time, if at all. This tradition can mask important nuances, and lead researchers to draw erroneous conclusions.</p>
<p>The aim of this book is to describe and illustrate some central methods that we have developed over the last ten years that aid in the understanding and communication of the results of multivariate linear models <span class="citation" data-cites="Friendly-07-manova FriendlyMeyer:2016:DDAR">(<a href="#ref-Friendly-07-manova" role="doc-biblioref">Friendly, 2007</a>;<!-- @Friendly-etal:ellipses:2013; --> <a href="#ref-FriendlyMeyer:2016:DDAR" role="doc-biblioref">Friendly &amp; Meyer, 2016</a>)</span>. These methods rely on <em>data ellipsoids</em> as simple, minimally sufficient visualizations of variance that can be shown in 2D and 3D plots. As will be demonstrated, the <em>Hypothesis-Error (HE) plot</em> framework applies this idea to the results of multivariate tests of linear hypotheses. </p>
<p>The aim of this book is to describe and illustrate some central methods that we have developed over the last ten years that aid in the understanding and communication of the results of multivariate linear models <span class="citation" data-cites="Friendly-07-manova FriendlyMeyer:2016:DDAR">(<a href="95-references.html#ref-Friendly-07-manova" role="doc-biblioref">Friendly, 2007</a>;<!-- @Friendly-etal:ellipses:2013; --> <a href="95-references.html#ref-FriendlyMeyer:2016:DDAR" role="doc-biblioref">Friendly &amp; Meyer, 2016</a>)</span>. These methods rely on <em>data ellipsoids</em> as simple, minimally sufficient visualizations of variance that can be shown in 2D and 3D plots. As will be demonstrated, the <em>Hypothesis-Error (HE) plot</em> framework applies this idea to the results of multivariate tests of linear hypotheses. </p>
<p>Further, in the case where there are more than just a few outcome variables, the important nectar of their relationships to predictors can often be distilled in a multivariate juicer— a <strong>projection</strong> of the multivariate relationships to the predictors in the low-D space that captures most of the flavor. This idea can be applied using <em>canonical correlation plots</em> and with <em>canonical discriminant HE plots</em>. </p>
<div class="quarto-figure quarto-figure-center">
<figure class="figure"><p><img src="images/Cover-GBE.png" class="img-fluid figure-img"></p>
@@ -401,7 +401,7 @@ <h1 class="title"><span id="sec-introduction" class="quarto-section-identifier">
<!-- ## References {.unnumbered} -->


<div id="refs" class="references csl-bib-body hanging-indent" data-entry-spacing="0" data-line-spacing="2" role="list">
<div id="refs" class="references csl-bib-body hanging-indent" data-entry-spacing="0" data-line-spacing="2" role="list" style="display: none">
<div id="ref-Friendly-07-manova" class="csl-entry" role="listitem">
Friendly, M. (2007). <span>HE</span> plots for multivariate general linear models. <em>Journal of Computational and Graphical Statistics</em>, <em>16</em>(2), 421–444. <a href="https://doi.org/10.1198/106186007X208407">https://doi.org/10.1198/106186007X208407</a>
</div>