diff --git a/11-mlm-viz.qmd b/11-mlm-viz.qmd index 44eaa459..486e84a4 100644 --- a/11-mlm-viz.qmd +++ b/11-mlm-viz.qmd @@ -117,7 +117,7 @@ I refer to this as _effect size scaling_, because it is similar to an effect siz univariate models, e.g., $ES = (\bar{y}_1 - \bar{y}_2) / s_e$ in a two-group, univariate design. This is illustrated in ... - + The geometry of ellipsoids and multivariate tests allow us to go further with another re-scaling of the $\mat{H}$ ellipsoid @@ -147,7 +150,7 @@ This is done simply by dividing $\mat{H} / df_e$ further by the $\alpha$-critical value of the corresponding test statistic to show the strength of evidence against the null hypothesis. Among the various multivariate test statistics, -Roy's maximum root test, based on the largest eigenvalue $\lambda_1$ of $\mat{H} \mat{E}^{-1}, +Roy's maximum root test, based on the largest eigenvalue $\lambda_1$ of $\mat{H} \mat{E}^{-1}$, gives $\mat{H} / (\lambda_\alpha df_e)$ which has the visual property that the scaled $\mat{H}$ ellipsoid will protrude _somewhere_ outside the standard $\mat{E}$ ellipsoid if and only if diff --git a/R/digits-tSNE.R b/R/digits-tSNE.R new file mode 100644 index 00000000..e8b8e8ee --- /dev/null +++ b/R/digits-tSNE.R @@ -0,0 +1,162 @@ +# R tSNE with 3D plots +# https://www.appsilon.com/post/r-tsne + +library(dplyr) +library(Rtsne) +library(ggplot2) +library(plotly) + +# https://github.com/pjreddie/mnist-csv-png?tab=readme-ov-file +digits <- read.csv("mnist_train.csv", header = FALSE) +colnames(digits) <- c("digit", paste0("pixel", 1:784)) + +# print first row as a matrix +first_digit <- matrix(digits[1, ] |> + select(-digit) |> unlist(), nrow = 28, ncol = 28, byrow = TRUE) +first_digit + +# visualize the digit +rotate <- function(x) t(apply(x, 2, rev)) +image(rotate(first_digit), col = (gray(255:0 / 255))) + +# Or, if you want to get a glimpse into a larger portion of the dataset, run the following snippet: +par(mfrow = c(5, 5)) + +for (i in sample(1:nrow(digits), 
25)) { + digit_matrix <- matrix(digits[i, ] |> + select(-digit) |> + unlist(), nrow = 28, ncol = 28, byrow = TRUE) + image(rotate(digit_matrix), col = gray(255:0 / 255), axes = FALSE, xlab = "", ylab = "") +} +par(mfrow = c(1, 1)) + +# That’s too much to include in a single chart, so we’ll reduce the per-class sample size to 100: +data_sample <- digits |> + group_by(digit) |> + sample_n(100) |> + ungroup() + +data_sample |> + group_by(digit) |> + count() + +# let’s also make a feature/target split: +X <- data_sample |> select(-digit) +y <- data_sample |> select(digit) + +# Run tSNE + +tsne_results <- Rtsne(X, dims = 2, perplexity = 25, verbose = TRUE, max_iter = 1500) + +tsne_df <- data.frame( + X = tsne_results$Y[, 1], + Y = tsne_results$Y[, 2], + digit = y +) +colors <- c("#E6194B", "#3CB44B", "#FFE119", "#4363D8", "#F58231", "#911EB4", "#46F0F0", "#F032E6", "#BCF60C", "#FABEBE") + +ggplot(tsne_df, aes(x = X, y = Y, color = factor(digit))) + + geom_point(size = 1.5) + + scale_color_manual(values = colors) + + labs( + title = "t-SNE 2-Dimensional Digit Visualization", + x = "t-SNE Dimension 1", + y = "t-SNE Dimension 2" + ) + + theme_minimal() + + theme( + plot.title = element_text(size = 20) + ) + +# Plotting t-SNE Results in 3 Dimensions + +tsne_results <- Rtsne(X, dims = 3, perplexity = 30, verbose = TRUE, max_iter = 1500) + +tsne_df <- data.frame( + X = tsne_results$Y[, 1], + Y = tsne_results$Y[, 2], + Z = tsne_results$Y[, 3], + digit = factor(y$digit) +) +head(tsne_df) + +colors <- c("#E6194B", "#3CB44B", "#FFE119", "#4363D8", "#F58231", "#911EB4", "#46F0F0", "#F032E6", "#BCF60C", "#FABEBE") +hover_text <- paste( + "Digit:", tsne_df$digit, "", + "Dimension 1:", round(tsne_df$X, 3), + "Dimension 2:", round(tsne_df$Y, 3), + "Dimension 3:", round(tsne_df$Z, 3) +) + +plot_ly( + data = tsne_df, + x = ~X, + y = ~Y, + z = ~Z, + type = "scatter3d", + mode = "markers", + marker = list(size = 6), + text = hover_text, + hoverinfo = "text", + color = ~digit, + 
colors = colors +) |> + layout( + title = "t-SNE 3-Dimensional Digit Visualization", + scene = list( + xaxis = list(title = "t-SNE Dimension 1"), + yaxis = list(title = "t-SNE Dimension 2"), + zaxis = list(title = "t-SNE Dimension 3") + ) + ) + +# Perplexity Tuning + +# Unlike PCA, the results of t-SNE will often vary (sometimes significantly) because of the tweakable parameters and the nature of gradient descent. + +# This section demonstrates how to tweak the most important parameter - perplexity - and shows you just how different the results are. +# The values for this parameter typically range from 5 to 50, so we’ll go over this entire range with the step size of 5. + +library(gganimate) + +perplexity_values <- c(5, 10, 15, 20, 25, 30, 35, 40, 45, 50) + +tsne_results_list <- lapply(perplexity_values, function(perp) { + tsne <- Rtsne(X, dims = 2, perplexity = perp, verbose = TRUE, max_iter = 1500) + data.frame( + X = tsne$Y[, 1], + Y = tsne$Y[, 2], + digit = y, + perplexity = perp + ) +}) + +tsne_df <- do.call(rbind, tsne_results_list) + +plot <- ggplot(tsne_df, aes(x = X, y = Y, color = factor(digit))) + + geom_point(size = 1.5) + + scale_color_manual(values = colors) + + labs( + title = "t-SNE 2-Dimensional Digit Visualization", + subtitle = "Perplexity: {closest_state}", # This will display the perplexity value + x = "t-SNE Dimension 1", + y = "t-SNE Dimension 2" + ) + + theme_minimal() + + theme( + plot.title = element_text(size = 20), + plot.subtitle = element_text(size = 16) + ) + + transition_states(perplexity, transition_length = 2, state_length = 1) + + ease_aes("linear") + +animate( + plot, + width = 800, + height = 600, + res = 100, + nframes = 300, + fps = 30, + renderer = gifski_renderer(file = "tsne-2d-animated.gif") +) + diff --git a/bib/pkgs.bib b/bib/pkgs.bib index 4ad18e43..b1a988e1 100644 --- a/bib/pkgs.bib +++ b/bib/pkgs.bib @@ -34,8 +34,7 @@ @Manual{R-broom } @Manual{R-candisc, - title = {candisc: Visualizing Generalized Canonical 
Discriminant and Canonical -Correlation Analysis}, + title = {candisc: Visualizing Generalized Canonical Discriminant and Canonical Correlation Analysis}, author = {Michael Friendly and John Fox}, year = {2024}, note = {R package version 0.9.0}, @@ -135,7 +134,7 @@ @Manual{R-effectsize title = {effectsize: Indices of Effect Size}, author = {Mattan S. Ben-Shachar and Dominique Makowski and Daniel LĂĽdecke and Indrajeet Patil and Brenton M. Wiernik and RĂ©mi ThĂ©riault and Philip Waggoner}, year = {2024}, - note = {R package version 0.8.9}, + note = {R package version 1.0.0}, url = {https://easystats.github.io/effectsize/}, } @@ -207,7 +206,7 @@ @Manual{R-ggbiplot title = {ggbiplot: A Grammar of Graphics Implementation of Biplots}, author = {Vincent Q. Vu and Michael Friendly}, year = {2024}, - note = {R package version 0.6.3}, + note = {R package version 0.6.2}, url = {https://github.com/friendly/ggbiplot}, } @@ -333,19 +332,11 @@ @Manual{R-lattice url = {https://lattice.r-forge.r-project.org/}, } -@Manual{R-liminal, - title = {liminal: Multivariate Data Visualization with Tours and Embeddings}, - author = {Stuart Lee}, - year = {2021}, - note = {R package version 0.1.2}, - url = {https://github.com/sa-lee/liminal/}, -} - @Manual{R-lubridate, title = {lubridate: Make Dealing with Dates a Little Easier}, author = {Vitalie Spinu and Garrett Grolemund and Hadley Wickham}, - year = {2023}, - note = {R package version 1.9.3}, + year = {2024}, + note = {R package version 1.9.4}, url = {https://lubridate.tidyverse.org}, } diff --git a/bib/pkgs.txt b/bib/pkgs.txt index 935da98f..cc14beb4 100644 --- a/bib/pkgs.txt +++ b/bib/pkgs.txt @@ -116,3 +116,30 @@ knitr matlib patchwork tidyr +broom +car +carData +dplyr +ggplot2 +heplots +knitr +tidyr +broom +candisc +car +carData +dplyr +ggplot2 +heplots +knitr +tidyr +broom +candisc +car +carData +corrgram +dplyr +ggplot2 +heplots +knitr +tidyr diff --git a/docs/01-intro.html b/docs/01-intro.html index 52f81896..91279784 100644 --- 
a/docs/01-intro.html +++ b/docs/01-intro.html @@ -378,7 +378,7 @@
However, with two or more response variables, visualizations for multivariate models are not as simple as they are for their univariate counterparts for understanding the effects of predictors, model parameters, or model diagnostics. Consequently, the results of such studies are often explored and discussed solely in terms of coefficients and significance, and visualizations of the relationships are only provided for one response variable at a time, if at all. This tradition can mask important nuances, and lead researchers to draw erroneous conclusions.
-The aim of this book is to describe and illustrate some central methods that we have developed over the last ten years that aid in the understanding and communication of the results of multivariate linear models (Friendly, 2007; Friendly & Meyer, 2016). These methods rely on data ellipsoids as simple, minimally sufficient visualizations of variance that can be shown in 2D and 3D plots. As will be demonstrated, the Hypothesis-Error (HE) plot framework applies this idea to the results of multivariate tests of linear hypotheses.
+The aim of this book is to describe and illustrate some central methods that we have developed over the last ten years that aid in the understanding and communication of the results of multivariate linear models (Friendly, 2007; Friendly & Meyer, 2016). These methods rely on data ellipsoids as simple, minimally sufficient visualizations of variance that can be shown in 2D and 3D plots. As will be demonstrated, the Hypothesis-Error (HE) plot framework applies this idea to the results of multivariate tests of linear hypotheses.
Further, in the case where there are more than just a few outcome variables, the important nectar of their relationships to predictors can often be distilled in a multivariate juicer— a projection of the multivariate relationships to the predictors in the low-D space that captures most of the flavor. This idea can be applied using canonical correlation plots and with canonical discriminant HE plots.