diff --git a/11-mlm-viz.qmd b/11-mlm-viz.qmd
index 44eaa459..486e84a4 100644
--- a/11-mlm-viz.qmd
+++ b/11-mlm-viz.qmd
@@ -117,7 +117,7 @@
 I refer to this as _effect size scaling_, because it is similar to an effect size index used in
 univariate models, e.g., $ES = (\bar{y}_1 - \bar{y}_2) / s_e$ in a two-group, univariate design.
 This is illustrated in ...
-
+
 The geometry of ellipsoids and multivariate tests allow us to go further with another
 re-scaling of the $\mat{H}$ ellipsoid
@@ -147,7 +150,7 @@
 This is done simply by dividing $\mat{H} / df_e$ further by the $\alpha$-critical value
 of the corresponding test statistic to show the strength of evidence against the null hypothesis.
 Among the various multivariate test statistics,
-Roy's maximum root test, based on the largest eigenvalue $\lambda_1$ of $\mat{H} \mat{E}^{-1},
+Roy's maximum root test, based on the largest eigenvalue $\lambda_1$ of $\mat{H} \mat{E}^{-1}$,
 gives $\mat{H} / (\lambda_\alpha df_e)$
 which has the visual property that the scaled $\mat{H}$ ellipsoid will protrude _somewhere_
 outside the standard $\mat{E}$ ellipsoid if and only if
diff --git a/R/digits-tSNE.R b/R/digits-tSNE.R
new file mode 100644
index 00000000..e8b8e8ee
--- /dev/null
+++ b/R/digits-tSNE.R
@@ -0,0 +1,162 @@
+# R tSNE with 3D plots
+# https://www.appsilon.com/post/r-tsne
+
+library(dplyr)
+library(Rtsne)
+library(ggplot2)
+library(plotly)
+
+# MNIST in CSV form: https://github.com/pjreddie/mnist-csv-png?tab=readme-ov-file
+digits <- read.csv("mnist_train.csv", header = FALSE)
+colnames(digits) <- c("digit", paste0("pixel", 1:784))
+
+# print first row as a 28 x 28 matrix of pixel intensities
+first_digit <- matrix(digits[1, ] |>
+  select(-digit) |> unlist(), nrow = 28, ncol = 28, byrow = TRUE)
+first_digit
+
+# visualize the digit
+rotate <- function(x) t(apply(x, 2, rev))
+image(rotate(first_digit), col = gray(255:0 / 255))
+
+# Or, to get a glimpse of a larger portion of the dataset, plot a random sample of 25 digits:
+par(mfrow = c(5, 5))
+for (i in sample(1:nrow(digits), 25)) {
+  digit_matrix <- matrix(digits[i, ] |>
+    select(-digit) |>
+    unlist(), nrow = 28, ncol = 28, byrow = TRUE)
+  image(rotate(digit_matrix), col = gray(255:0 / 255), axes = FALSE, xlab = "", ylab = "")
+}
+par(mfrow = c(1, 1))
+
+# That's too much to include in a single chart, so we'll reduce the per-class sample size to 100:
+data_sample <- digits |>
+  group_by(digit) |>
+  sample_n(100) |>
+  ungroup()
+
+data_sample |>
+  group_by(digit) |>
+  count()
+
+# let's also make a feature/target split:
+X <- data_sample |> select(-digit)
+y <- data_sample |> select(digit)
+
+# Run tSNE in 2 dimensions
+tsne_results <- Rtsne(X, dims = 2, perplexity = 25, verbose = TRUE, max_iter = 1500)
+
+tsne_df <- data.frame(
+  X = tsne_results$Y[, 1],
+  Y = tsne_results$Y[, 2],
+  digit = y$digit
+)
+colors <- c("#E6194B", "#3CB44B", "#FFE119", "#4363D8", "#F58231",
+            "#911EB4", "#46F0F0", "#F032E6", "#BCF60C", "#FABEBE")
+
+ggplot(tsne_df, aes(x = X, y = Y, color = factor(digit))) +
+  geom_point(size = 1.5) +
+  scale_color_manual(values = colors) +
+  labs(
+    title = "t-SNE 2-Dimensional Digit Visualization",
+    x = "t-SNE Dimension 1",
+    y = "t-SNE Dimension 2"
+  ) +
+  theme_minimal() +
+  theme(
+    plot.title = element_text(size = 20)
+  )
+
+# Plotting t-SNE Results in 3 Dimensions
+tsne_results <- Rtsne(X, dims = 3, perplexity = 30, verbose = TRUE, max_iter = 1500)
+
+tsne_df <- data.frame(
+  X = tsne_results$Y[, 1],
+  Y = tsne_results$Y[, 2],
+  Z = tsne_results$Y[, 3],
+  digit = factor(y$digit)
+)
+head(tsne_df)
+
+hover_text <- paste(
+  "Digit:", tsne_df$digit, "",
+  "Dimension 1:", round(tsne_df$X, 3),
+  "Dimension 2:", round(tsne_df$Y, 3),
+  "Dimension 3:", round(tsne_df$Z, 3)
+)
+
+plot_ly(
+  data = tsne_df,
+  x = ~X,
+  y = ~Y,
+  z = ~Z,
+  type = "scatter3d",
+  mode = "markers",
+  marker = list(size = 6),
+  text = hover_text,
+  hoverinfo = "text",
+  color = ~digit,
+  colors = colors
+) |>
+  layout(
+    title = "t-SNE 3-Dimensional Digit Visualization",
+    scene = list(
+      xaxis = list(title = "t-SNE Dimension 1"),
+      yaxis = list(title = "t-SNE Dimension 2"),
+      zaxis = list(title = "t-SNE Dimension 3")
+    )
+  )
+
+# Perplexity Tuning
+
+# Unlike PCA, the results of t-SNE will often vary (sometimes significantly) because of the
+# tweakable parameters and the nature of gradient descent. This section demonstrates how to
+# tweak the most important parameter - perplexity - and shows just how different the results are.
+# Values for this parameter typically range from 5 to 50, so we go over this range in steps of 5.
+
+library(gganimate)
+
+perplexity_values <- c(5, 10, 15, 20, 25, 30, 35, 40, 45, 50)
+
+tsne_results_list <- lapply(perplexity_values, function(perp) {
+  tsne <- Rtsne(X, dims = 2, perplexity = perp, verbose = TRUE, max_iter = 1500)
+  data.frame(
+    X = tsne$Y[, 1],
+    Y = tsne$Y[, 2],
+    digit = y$digit,
+    perplexity = perp
+  )
+})
+
+tsne_df <- do.call(rbind, tsne_results_list)
+
+plot <- ggplot(tsne_df, aes(x = X, y = Y, color = factor(digit))) +
+  geom_point(size = 1.5) +
+  scale_color_manual(values = colors) +
+  labs(
+    title = "t-SNE 2-Dimensional Digit Visualization",
+    subtitle = "Perplexity: {closest_state}",  # displays the current perplexity value
+    x = "t-SNE Dimension 1",
+    y = "t-SNE Dimension 2"
+  ) +
+  theme_minimal() +
+  theme(
+    plot.title = element_text(size = 20),
+    plot.subtitle = element_text(size = 16)
+  ) +
+  transition_states(perplexity, transition_length = 2, state_length = 1) +
+  ease_aes("linear")
+
+animate(
+  plot,
+  width = 800,
+  height = 600,
+  res = 100,
+  nframes = 300,
+  fps = 30,
+  renderer = gifski_renderer(file = "tsne-2d-animated.gif")
+)
diff --git a/bib/pkgs.bib b/bib/pkgs.bib
index 4ad18e43..b1a988e1 100644
--- a/bib/pkgs.bib
+++ b/bib/pkgs.bib
@@ -34,8 +34,7 @@ @Manual{R-broom
 }
 
 @Manual{R-candisc,
-  title = {candisc: Visualizing Generalized Canonical Discriminant and Canonical
-Correlation Analysis},
+  title = {candisc: Visualizing Generalized Canonical Discriminant and Canonical Correlation Analysis},
   author = {Michael Friendly and John Fox},
   year = {2024},
   note = {R package version 0.9.0},
@@ -135,7 +134,7 @@ @Manual{R-effectsize
   title = {effectsize: Indices of Effect Size},
   author = {Mattan S. Ben-Shachar and Dominique Makowski and Daniel Lüdecke and Indrajeet Patil and Brenton M. Wiernik and Rémi Thériault and Philip Waggoner},
   year = {2024},
-  note = {R package version 0.8.9},
+  note = {R package version 1.0.0},
   url = {https://easystats.github.io/effectsize/},
 }
 
@@ -207,7 +206,7 @@ @Manual{R-ggbiplot
   title = {ggbiplot: A Grammar of Graphics Implementation of Biplots},
   author = {Vincent Q. Vu and Michael Friendly},
   year = {2024},
-  note = {R package version 0.6.3},
+  note = {R package version 0.6.2},
   url = {https://github.com/friendly/ggbiplot},
 }
 
@@ -333,19 +332,11 @@ @Manual{R-lattice
   url = {https://lattice.r-forge.r-project.org/},
 }
 
-@Manual{R-liminal,
-  title = {liminal: Multivariate Data Visualization with Tours and Embeddings},
-  author = {Stuart Lee},
-  year = {2021},
-  note = {R package version 0.1.2},
-  url = {https://github.com/sa-lee/liminal/},
-}
-
 @Manual{R-lubridate,
   title = {lubridate: Make Dealing with Dates a Little Easier},
   author = {Vitalie Spinu and Garrett Grolemund and Hadley Wickham},
-  year = {2023},
-  note = {R package version 1.9.3},
+  year = {2024},
+  note = {R package version 1.9.4},
   url = {https://lubridate.tidyverse.org},
 }
diff --git a/bib/pkgs.txt b/bib/pkgs.txt
index 935da98f..cc14beb4 100644
--- a/bib/pkgs.txt
+++ b/bib/pkgs.txt
@@ -116,3 +116,30 @@ knitr
 matlib
 patchwork
 tidyr
+broom
+car
+carData
+dplyr
+ggplot2
+heplots
+knitr
+tidyr
+broom
+candisc
+car
+carData
+dplyr
+ggplot2
+heplots
+knitr
+tidyr
+broom
+candisc
+car
+carData
+corrgram
+dplyr
+ggplot2
+heplots
+knitr
+tidyr
diff --git a/docs/01-intro.html b/docs/01-intro.html
index 52f81896..91279784 100644
--- a/docs/01-intro.html
+++ b/docs/01-intro.html
@@ -378,7 +378,7 @@

1.4 Visualization is harder

However, with two or more response variables, visualizations for understanding the effects of predictors, model parameters, or model diagnostics are not as simple for multivariate models as they are for their univariate counterparts. Consequently, the results of such studies are often explored and discussed solely in terms of coefficients and significance, and visualizations of the relationships are only provided for one response variable at a time, if at all. This tradition can mask important nuances and lead researchers to draw erroneous conclusions.

The aim of this book is to describe and illustrate some central methods that we have developed over the last ten years that aid in the understanding and communication of the results of multivariate linear models (Friendly, 2007; Friendly & Meyer, 2016). These methods rely on data ellipsoids as simple, minimally sufficient visualizations of variance that can be shown in 2D and 3D plots. As will be demonstrated, the Hypothesis-Error (HE) plot framework applies this idea to the results of multivariate tests of linear hypotheses.

Further, when there are more than just a few outcome variables, the important nectar of their relationships to predictors can often be distilled in a multivariate juicer: a projection of those relationships into a low-dimensional space that captures most of the flavor. This idea can be applied using canonical correlation plots and canonical discriminant HE plots.


As we can see, all four datasets have nearly identical univariate and bivariate statistical measures. You can only see how they differ in graphs, which show their true natures to be vastly different.

Figure 2.1 is an enhanced version of Anscombe’s plot of these data, adding helpful annotations to show visually the underlying statistical summaries.

This figure is produced as follows, using a single call to ggplot(), faceted by dataset. As we will see later (Section 3.2), the data ellipse (produced by stat_ellipse()) reflects the correlation between the variables.

desc <- tibble(
   dataset = 1:4,

The method Anscombe used to compose his quartet is unknown, but it turns out that there is a method to construct a wider collection of datasets with identical statistical properties. After all, in a bivariate dataset with \(n\) observations, the correlation has \((n-2)\) degrees of freedom, so it is possible to choose \(n-2\) of the \((x, y)\) pairs to yield any given value. As it happens, it is also possible to create any number of datasets with the same means, standard deviations and correlations with nearly any shape you like — even a dinosaur!

The Datasaurus Dozen was first publicized by Alberto Cairo in a blog post, and the datasets are available in the datasauRus package (Davies et al., 2022). As shown in Figure 2.2, the sets include a star, cross, circle, bullseye, horizontal and vertical lines, and, of course, the “dino”. The method (Matejka & Fitzmaurice, 2017) uses simulated annealing, an iterative process that perturbs the points in a scatterplot, moving them towards a given shape while keeping the statistical summaries close to the fixed target value.

The datasauRus package just contains the datasets, but a general method, called statistical metamers, for producing such datasets has been described by Elio Campitelli and implemented in the metamer package.


The essential idea of a statistical “quartet” is to illustrate four quite different datasets or circumstances that seem superficially the same, but are paradoxically very different when you look behind the scenes. For example, in the context of causal analysis, Gelman et al. (2023) illustrated sets of four graphs, within each of which all four represent the same average (latent) causal effect but with much different patterns of individual effects; McGowan et al. (2023) provide another illustration with four seemingly identical data sets, each generated by a different causal mechanism. As an example of machine learning models, Biecek et al. (2023) introduced the “Rashomon Quartet”, a synthetic dataset for which four models from different classes (linear model, regression tree, random forest, neural network) have practically identical predictive performance. In all cases, the paradox is resolved when visualization reveals the distinct ways of understanding structure in the data. The quartets package contains these and other variations on this theme.

2.1.2 One lousy point can ruin your day

In the mid 1980s, a consulting client had a strange problem.1 She was conducting a study of the relation between body image and weight preoccupation in exercising and non-exercising people (Davis, 1990). As part of the design, the researcher wanted to know if self-reported weight could be taken as a reliable indicator of true weight measured on a scale. It was expected that the correlations between reported and measured weight should be close to 1.0, and the slope of the regression lines for men and women should also be close to 1.0. The dataset is car::Davis.

She was therefore very surprised to see the following numerical results: For men, the correlation was nearly perfect, but not so for women.

data(Davis, package="carData")
In Figure 2.4, this discrepant observation again stands out like a sore thumb, but it makes very little difference in the fitted line for females. The reason is that this point is well within the range of the \(x\) variable (repwt). To impact the slope of the regression line, an observation must be unusual in both \(x\) and \(y\). We take up the topic of how to detect influential observations and what to do about them in Chapter 6.

The value of such plots is not only that they can reveal possible problems with an analysis, but also that they can help identify the reasons and suggest corrective action. What went wrong here? Examination of the original data showed that this person switched the values, recording her reported weight in the box for measured weight and vice versa.

2.1.3 Shaken, not stirred: The 1970 Draft Lottery

In an attempt to make the selection process also transparent, the proceedings were covered on radio, TV and film, and the dates were posted in order on a large display board. The first capsule—drawn by Congressman Alexander Pirnie (R-NY) of the House Armed Services Committee—contained the date September 14, so all men born on September 14 in any year between 1944 and 1950 were assigned lottery number 1, and would be drafted first. April 24 was drawn next, then December 30, February 14, and so on until June 8, selected last. At the time of the drawing, US officials stated that those with birthdays drawn in the first third would almost certainly be drafted, while those in the last third would probably avoid the draft (Fienberg, 1971).

I watched this unfold with considerable interest because I was eligible for the Draft that year. I was dismayed when my birthday, May 7, came up ranked 35. Ugh!

The data, from the official Selective Service listing, are contained in the dataset vcdExtra::Draft1970, ordered by Month and birthdate (Day), with Rank as the order in which the birthdates were drawn.


#> $ Rank  <int> 305, 159, 251, 215, 101, 224, 306, 199, 194, 325, 32…
#> $ Month <ord> Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Ja…

A basic scatterplot, slightly prettified, is shown in Figure 2.5. The points are colored by month, and month labels are shown at the bottom.

# make markers for months at their mid points
 months <- data.frame(

If you stare at the graph in Figure 2.5 long enough, you can make out a sparsity of points in the upper right corner and also in the lower left corner, compared to the opposite corners.

Visual smoothers

Fitting a linear regression line or a smoothed (loess) curve can bring out the signal lurking in the background of a field of nearly random points. Figure 2.6 shows a definite trend to lower ranks for birthdays toward the end of the year. Those born earlier in the year were more likely to be given lower ranks, calling them up sooner for the draft.

ggplot(Draft1970, aes(x = Day, y = Rank)) +
   geom_point(size = 2.5, shape = 21, 

Statistical summaries

Another way to enhance the signal-to-noise ratio of a graph is to plot summaries of the messy data points. For example, you might make boxplots of the ranks by month, or calculate and plot the mean or median rank by month and plot those together with some indication of variability within month.

Figure 2.7 plots the average Rank for each month, with error bars showing the mean \(\pm 1\) standard error, against the average Day. The message of rank decreasing nearly linearly with month is now more dramatic. The correlation between the means is \(r = -0.867\).

means <- Draft1970 |>
   group_by(Month) |>
The visual impression of a linearly decreasing trend in lottery rank is much stronger in Figure 2.7 than in Figure 2.6 for two reasons:

  • Replacing the data points with their means strengthens the signal in relation to noise.
  • The narrower vertical range (100–250) in the plot of means makes the slope of the line appear steeper. (However, the correlation of the data points, \(r = -0.231\), is much weaker than that of the means.)

What happened here?

Previous lotteries carried out by drawing capsules from a container had occasionally suffered the embarrassment that an empty capsule was selected because of vigorous mixing (Fienberg, 1971). So for the 1970 lottery, the birthdate capsules were put in cardboard boxes, one for each month, and these were carefully emptied into the glass container in order of month: Jan., Feb., … Dec., each batch gently shaken in atop the pile already there. All might have been well had the persons drawing the capsules put their hands in truly randomly, but generally they picked from toward the top of the container. Consequently, those born later in the year had a greater chance of being picked earlier.

There was considerable criticism of this procedure once the flaw had been revealed by analyses such as described here. In the following year, the Selective Service called upon the National Bureau of Standards to devise a better procedure. In 1971 they used two drums, one with the dates of the year and another with the rank numbers 1-366. As a date capsule was drawn randomly from the first drum, another from the numbers drum was picked simultaneously, giving a doubly-randomized sequence.

Of course, if they had R, the entire process could have been done using sample():
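The code chunk itself did not survive in this extract; a minimal sketch of such a simulation might be (the seed is arbitrary, chosen only for reproducibility):

set.seed(1157)
days <- seq(as.Date("1972-01-01"), as.Date("1972-12-31"), by = "day")  # any leap year gives all 366 birthdates
lottery <- data.frame(birthday = format(days, "%b %d"),
                      Rank = sample(366))   # a truly random permutation of ranks
head(lottery)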

In the simplest case, a data point \(\mathbf{x} = (x_1, x_2)\) in two dimensions can be represented geometrically as a vector from the origin as shown in Figure 3.35. This point can be projected on any one-dimensional axis \(\mathbf{p}\) by dropping a line perpendicular to \(\mathbf{p}\), which is the idea of a shadow. Mathematically, this is calculated as the product \(\mathbf{x}^\mathsf{T} \mathbf{p} = x_1 p_1 + x_2 p_2\) and suitably normalized to give the correct length. …
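To make the algebra concrete, here is a minimal sketch (not code from the book) of projecting one point onto a unit-length axis:

x <- c(3, 2)               # a data point in 2D
p <- c(1, 1) / sqrt(2)     # a unit vector defining the 1D axis
sum(x * p)                 # x'p : the length of the shadow of x on p
sum(x * p) * p             # the projected point itself, in the original coordinates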


Y2 <- X %*% P2

In this example, the matrix \(\mathbf{X}\) consists of 8 points at the vertices of a cube of size 10, as shown in Figure 3.36 (a). The projections \(\mathbf{Y}_1 = \mathbf{X} \mathbf{P}_1\) and \(\mathbf{Y}_2 = \mathbf{X} \mathbf{P}_2\) are shown in panels (b) and (c). To make it easier to relate the points in different views, shapes and colors are assigned so that each point has a unique combination of these attributes.6

pch <- rep(15:18, times = 2)
 colors <- c("red", "blue", "darkgreen", "brown")
But, if we are traveling in the projection space of \(\mathbf{Y}\), we need some signposts to tell us how the new dimensions relate to those of \(\mathbf{X}\). The answer is provided simply by plotting the rows of \(\mathbf{P}\) as vectors, as shown in Figure 3.37. In these plots, each row of \(\mathbf{P}_1\) and \(\mathbf{P}_2\) appears as a vector from the origin. Its direction shows the contribution each of \(\mathbf{x}_1, \mathbf{x}_2, \mathbf{x}_3\) makes to the new coordinates \(\mathbf{y}_1\) and \(\mathbf{y}_2\).

In \(\mathbf{P}_1\), the projected variable \(\mathbf{y}_1\) is related only to \(\mathbf{x}_1\), while \(\mathbf{y}_2\) is related only to \(\mathbf{x}_2\); \(\mathbf{x}_3\) makes no contribution, and appears at the origin. However, in the projection given by \(\mathbf{P}_2\), \(\mathbf{x}_1\) and \(\mathbf{x}_2\) make the same contribution to \(\mathbf{y}_1\), while \(\mathbf{x}_3\) has no contribution to that horizontal axis. The vertical axis, \(\mathbf{y}_2\), here is completely aligned with \(\mathbf{x}_3\); \(\mathbf{x}_1\) and \(\mathbf{x}_2\) have vertical components that are half of that for \(\mathbf{x}_3\) in absolute value.

library(matlib)
 op <- par(mar=c(4, 5, 1, 1)+.1)

3.7.1.1 Vector lengths

In Figure 3.37, the lengths of the \(\mathbf{x}\) vectors reflect the relative degree to which each variable is represented in the space of the projection, and this is important for interpretation. For the \(\mathbf{P}_1\) projection, \(\mathbf{x}_3\) is of length 0, while \(\mathbf{x}_1\) and \(\mathbf{x}_2\) fill the unit circle. In the projection given by \(\mathbf{P}_2\), all three \(\mathbf{x}\) are approximately the same length.

In algebra, the length of a vector \(\mathbf{x}\) is \(||\mathbf{x}|| = (\mathbf{x}^\mathsf{T} \mathbf{x})^{1/2} = \sqrt{\Sigma x_i^2}\), the Euclidean distance of the tip of the vector from the origin. In R, we calculate the lengths of row vectors in a projection matrix by transposing and using matlib::len().

P1 |> t() |> matlib::len()
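Equivalently, without matlib, these lengths are just the square roots of the row sums of squares; a quick check, assuming P1 is the projection matrix defined earlier:

sqrt(rowSums(P1^2))    # same result as P1 |> t() |> matlib::len()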

3.7.1.2 Joint-views

To interpret such projections, we want to see both the projected data and the signposts that tell us where we are in relation to the original variables. To do this, we can overlay the variable vectors represented by the rows of the projection matrix \(\mathbf{P}\) onto plots like Figure 3.36 (b) and Figure 3.36 (c) to see how the axes in a projection relate to those in the data. To place these together on the same plot, we can either center the columns of \(\mathbf{Y}\) at their means or shift the columns of \(\mathbf{P}\) to colMeans(Y). It is only the directions of the vectors that matter, so we are free to scale their lengths by any convenient factor.

Y2s <- scale(Y2, scale=FALSE)       # center Y2
 plot(Y2s, cex = 3, 

vectors(-vecs, labels = NULL, lty = 1, angle = 1, col = "gray")

The plot in Figure 3.38 illustrates this, centering \(\mathbf{Y}\), and multiplying the vectors in \(\mathbf{P}\) by 7. To check your understanding, try to see if you can relate what is shown in this plot to the 3D plot in Figure 3.36 (a).

The idea of viewing low-dimensional projections of data together with vectors representing the contributions of the original variables to the dimensions shown in a display is also the basis of biplot techniques (Section 4.3) we will use in relation to principal components analysis.

3.7.2 Touring methods

The trick of statistical touring methods is to generate a smooth sequence of interpolated projections \(\mathbf{P}_{(t)}\) indexed by time \(t\), \(\mathbf{P}_{(1)}, \mathbf{P}_{(2)}, \mathbf{P}_{(3)}, \dots, \mathbf{P}_{(T)}\). This gives a path of views \(\mathbf{Y}_{(t)} = \mathbf{X} \mathbf{P}_{(t)}\), that can be animated in successive frames, as shown schematically in Figure 3.39.

Asimov’s (1985) original idea of the grand tour was that of a random path, picking orthogonal projections \(\mathbf{P}_{(i)}\) at random. Given enough time, the grand tour gives a space-filling path and would eventually show every possible projection of the data. But it does so smoothly, by interpolating from one projection to the next. In the travel analogy, the path by road from London to Paris might go smoothly through Kent to Dover, thence via Amiens and Beauvais before reaching Paris. By air, the tour would follow a smoother geodesic path, and this is what the grand tour does. The sense in watching an animation of a statistical grand tour is that of continuous motion. The grand tour algorithm is described in detail by Buja et al. (2005) and Cook et al. (2008).

3.7.2.1 Guided tours

The next big idea was that rather than traveling randomly in projection space one could take a guided tour, following a path that leads to “interesting projections”, such as those that reveal clusters, gaps in data space or outliers. This idea, called projection pursuit (Cook et al., 1995), works by defining a measure of interestingness of a data projection. In a guided tour, the next projection is chosen to increase that index, so over time the projection moves toward one that maximizes that index.

In the time since Asimov (1985), there have been many implementations of touring visualization methods. XGobi (Swayne et al., 1998) for X-Windows displays on Linux systems provided a test-bed for dynamic, interactive graphic methods; its successor, GGobi (Cook & Swayne, 2007; Swayne et al., 2003) extended the range of touring methods to include a wider variety of projection pursuit indices.

3.7.2.2 tourr package

The current state of the art is best captured in the tourr package for R (Wickham et al., 2011; Wickham & Cook, 2024). It defines a tour to consist of three components:

  • data: An \((n \times p)\) numerical data matrix to be viewed.
  • …

  • Holes (holes()): This is sensitive to projections with separated clusters of points, with few points near the origin.

  • Central mass (cmass()): Sensitive to projections with lots of points in the center, but perhaps with some outliers.
  • Linear discriminant analysis (lda_pp()): For data with a grouping factor, optimizes a measure of separation of the group means as in MANOVA or linear discriminant analysis.
  • PDA analysis (pda_pp()): A penalized version of lda_pp() for cases of large \(p\) relative to sample size \(n\) (E.-K. Lee & Cook, 2009).

In addition, there is now a guided_anomaly_tour() that looks for the best projection of observations that are outside the data ellipsoid, finding a view showing observations with large Mahalanobis distances from the centroid.
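A guided tour is run with the same animation functions by swapping in a different tour path; a minimal runnable sketch using the flea data that ships with tourr:

library(tourr)
X <- scale(as.matrix(flea[, 1:6]))               # six numeric measurements
animate_xy(X, tour_path = guided_tour(holes()))  # steer toward projections scoring high on holes()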

3.7.2.3 Penguin tours

Penguins are a traveling species. They make yearly travels inland to breeding sites in early spring, repeating the patterns of their ancestors. Near the beginning of summer, adult penguins and their chicks return to the sea and spend the rest of the summer feeding there (Black et al., 2018). If they were also data scientists, they might wonder about the relations among their cousins of different species and take a tour of their measurements…

For example, using the Penguins dataset, the following calls produce grand tours in 2, 3, and 4 dimensions. The 2D tour is displayed as a scatterplot, the 3D tour using simulated depth as shown by variation in point size and transparency, and the 4D tour is shown using a parallel coordinate plot.
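Those calls are not shown in this extract; a sketch of what they plausibly look like, where peng_scaled and species are assumed names for the scaled numeric penguin measurements and the grouping factor:

library(tourr)
animate_xy(peng_scaled, tour_path = grand_tour(d = 2), col = species)  # 2D: scatterplot
animate_depth(peng_scaled, tour_path = grand_tour(d = 3))              # 3D: simulated depth
animate_pcp(peng_scaled, tour_path = grand_tour(d = 4))                # 4D: parallel coordinates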

Figure 3.42 shows three frames from this movie. The first (a) is the initial frame that shows the projection in the plane of bill depth and bill length. The variable vectors indicate that bill length differentiates Adelie penguins from the others. In frame (b), the three species are widely separated, with bill depth distinguishing Gentoo from the others. In frame (c) the three species are largely mixed, but two points stand out as outliers, with exceptionally long bills compared to the rest.


Figure caption: (a) The lda_pp() criterion optimizes the separation of the means for species relative to within-group variation; (b) the anomalies_index() optimizes the average Mahalanobis distance of points from the centroid.

These examples are intended to highlight what is possible with dynamic graphics for exploring high-dimensional data visually. Cook & Laa (2024) extend the discussion of these methods from Cook & Swayne (2007) (which used GGobi) to the tourr package. They illustrate dimension reduction, various cluster analysis methods, trees and random forests and some machine-learning techniques.

Ideally, we should be able to interact with a tour, …

3.8 Network diagrams

A major theme throughout this chapter has been to understand how to extend data visualization from simple bivariate scatterplots to increasingly more complex situations with larger datasets. With a moderate number of variables, techniques such as smoothing, summarizing with data ellipses and fitted curves, and visual thinning can be used to tame “big \(N\)” datasets with thousands of cases.

However, “big \(p\)” datasets, with more than a moderate number (\(p\)) of variables, still remain a challenge. It is hard to see how the more advanced methods (corrgrams, parallel coordinate plots) described earlier could cope with \(p = 20, 50, 100, 500, \dots\) variables. At some point, each of these begins to break down for the purpose of visualizing associations among many variables. We are forced to thin the information presented in graphs more and more as the number of variables increases.

It turns out that there is a way to increase the number of variables displayed dramatically, if we are mainly interested in the pairwise correlations for reasonably normally distributed data. A graphical network diagram portrays variables by nodes (vertices), connected by (weighted) edges whose properties reflect the strength of connections between pairs, such as a correlation. Such diagrams can reveal properties not readily seen by other means.

As an example, consider Figure 3.45, which portrays the correlations among 25 self-report items reflecting 5 factors (the “Big Five”) considered in personality psychology to represent the dominant aspects of all of personality. These factors are easily remembered by the acronym OCEAN: Openness, Conscientiousness, Extraversion, Agreeableness and Neuroticism. The dataset, psych::bfi, contains data from an online sample of \(n=2800\) with 5 items for each scale.

In this figure (taken from Rodrigues (2021)), the item nodes are labeled according to the OCEAN factor they are assumed to measure. For 25 items, there are \(25 \times 24 / 2 = 300\) correlations, way too much to see. A clearer picture arises when we reduce the number of edges shown according to some criterion. Here, edges are drawn only between nodes where the correlation is considered important by a method (“glasso” = graphical LASSO) designed to make the graph optimally sparse.

library(qgraph)

3.8.2 Partial correlations

Among the more important statistical applications of network graph theory is the idea that you can also use them to study the partial (conditional) associations among variables, with the contributions of all other variables removed, in what are called Graphical Gaussian Models (GGMs) (Højsgaard et al., 2012; Lauritzen, 1996). In a network diagram of these partial associations,

  • The edges between nodes represent the partial correlations between those variables.

  • The absence of an edge between two nodes indicates their variables are conditionally independent, given the other variables.

  • …

Let \(\mathbf{Z}\) denote all the other variables, and let \(\hat{x}_i\) and \(\hat{x}_j\) be the predicted values from the linear regressions of \(x_i\) on \(\mathbf{Z}\) and of \(x_j\) on \(\mathbf{Z}\), respectively. The partial correlation \(p_{ij}\) between \(x_i\) and \(x_j\) controlling for \(\mathbf{Z}\) is given by: \[ p_{x_i,x_j|\mathbf{Z}} = r( x_i, x_j \mid \text{others}) = \text{cor}[ (x_i - \hat{x}_i),\; (x_j - \hat{x}_j)] \tag{3.3}\]

But, rather than running all these linear regressions, they can all be computed from the inverse of the correlation matrix (Whittaker, 1990, Ch. 5), a relation first noted by Dempster (1972). Let \(\mathbf{R}\) be the correlation matrix of the variables. Then the matrix \(\mathbf{P}\) of partial correlations can be obtained from the negative inverse, \(-\mathbf{R}^{-1}\), standardized to a correlation matrix by dividing by the square root of the product of its diagonal elements, \[ P_{ij} = - \frac{R^{-1}_{ij}}{\sqrt{R^{-1}_{ii} \cdot R^{-1}_{jj}}} \:\: . \]
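This computation takes only a few lines in R; a minimal sketch, using mtcars purely as a convenient built-in example:

R <- cor(mtcars[, c("mpg", "disp", "hp", "wt")])
Rinv <- solve(R)                                   # R^{-1}
P <- -Rinv / sqrt(outer(diag(Rinv), diag(Rinv)))   # standardize the negative inverse
diag(P) <- 1                                       # diagonal set to 1 by convention
P                                                  # matrix of partial correlations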

    source("R/pvPlot.R")
    In the pvPlot for robbery and auto theft, New York stands out as an influential, high-leverage point (see Section 6.6); Massachusetts (MA) is noteworthy because auto theft in that state is considerably higher than what would be predicted from all other variables.

3.9 Multivariate thinking and visualization

TODO: These are just initial notes on a chapter summary, and pointing the way to dimension reduction methods in Chapter 4.

This chapter has covered a lot of ground. We started with simple scatterplots and how to enhance them with graphical summaries and annotations …

The two curses

Multivariate data is often said to suffer from the curse of dimensionality (ref: Bellman1957), meaning that as the dimensionality of data increases, the volume of the space increases so fast that the available data become sparse; the amount of data needed often grows exponentially with the dimensionality.



  1. Confidence bands allow us to visualize the uncertainty around a fitted regression curve, which can be of two types: pointwise intervals or simultaneous intervals. The default setting in ggplot2::geom_smooth() calculates pointwise intervals (using stats::predict.lm(..., interval = "confidence")) at a confidence level \(1-\alpha\) for the predicted response at each value \(x_i\) of a predictor; these have the frequentist interpretation that over repeated sampling only \(100\,\alpha\)% of the predictions at \(x_i\) will be outside that interval. In contrast, simultaneous intervals are calculated so that \(1 - \alpha\) is the probability that all of them cover their corresponding true values simultaneously. These are necessarily wider than pointwise intervals. Commonly used methods for constructing simultaneous confidence bands in regression are the Bonferroni and Scheffé methods, which control the family-wise error rate over all values of \(x_i\). See … for precise definitions of these terms. These are different from a prediction band, which is used to represent the uncertainty about the value of a new data-point on the curve, subject to the additional variance reflected in one observation.

  2. The classic study by Cleveland & McGill (1984); Cleveland & McGill (1985) shows that judgements of magnitude along a common scale are more accurate than those along separate, aligned scales.

  3. The dataset was collected by Bernard Blishen, William Carroll and Catherine Moore, but apparently unpublished. A version updated to the 1981 census is described in Blishen et al. (1987).

  4. Other implementations of parallel coordinate plots in R include: MASS::parcoord(), GGally::ggparcoord() and PairViz::pcp(). The ggpcp version used here is the most general.

  5. This example was modified from one used by Cook et al. (2008).

  6. Plot shapes given by pch = 15:18 correspond to: filled square (15), filled circle (16), filled triangle point-up (17), filled diamond (18).

diff --git a/docs/11-mlm-viz.html b/docs/11-mlm-viz.html
new file mode 100644

11  Visualizing Multivariate Models

Tests of multivariate models, including multivariate analysis of variance (MANOVA) for group differences and multivariate multiple regression (MMRA), can be easily visualized by plots of a hypothesis (“H”) data ellipse for the fitted values relative to the corresponding plot of the error ellipse (“E”) of the residuals, which I call the HE plot framework.


For more than a few response variables, these results can be projected onto a lower-dimensional “canonical discriminant” space, providing an even simpler description.


Packages


In this chapter we use the following packages. Load them now:


11.1 HE plot framework


Chapter 9 illustrated the basic ideas of the framework for visualizing multivariate linear models in the context of a simple two group design, using Hotelling’s \(T^2\). The main ideas were illustrated in Figure 9.9.


Having described the statistical ideas behind the MLM in Chapter 10, we can proceed to extend this framework to larger designs. Figure 11.1 illustrates these ideas using the simple one-way MANOVA design of the dogfood data from Section 10.2.1.

Figure 11.1: Dogfood quartet: Illustration of the conceptual ideas of the HE plot framework for the dogfood data. (a) Scatterplot of the data; (b) Summary using data ellipses; (c) HE plot shows the variation in the means in relation to pooled within-group variance; (d) Transformation from data space to canonical space.
  • In data space, each group is summarized by its data ellipse, representing the means and covariances.

  • Variation against the hypothesis of equal means can be seen by the \(\mathbf{H}\) ellipse in the HE plot, representing the data ellipse of the fitted values. Error variance is shown in the \(\mathbf{E}\) ellipse, representing the pooled within-group covariance matrix, \(\mathbf{S}_p\), and the data ellipse of the residuals from the model.

  • The MANOVA (or Hotelling’s \(T^2\)) is formally equivalent to a discriminant analysis, predicting group membership from the response variables, which can be seen in data space. (The main difference is emphasis and goals: MANOVA seeks to test differences among group means, while discriminant analysis aims at classification of the observations into groups.)

  • This effectively projects the \(p\)-dimensional space of the predictors into the smaller canonical space that shows the greatest differences among the groups.


For more complex models, such as MANOVA with multiple factors or multivariate multiple regression, there is one \(\mathbf{H}\) ellipse for each term in the model. …
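In code, an HE plot is produced by fitting a multivariate lm() and passing it to heplots::heplot(). A sketch for the one-way design above, assuming the dogfood data shipped with heplots uses the factor formula and responses start and amount:

library(heplots)
data(dogfood, package = "heplots")
dogfood.mlm <- lm(cbind(start, amount) ~ formula, data = dogfood)
heplot(dogfood.mlm, size = "effect.size")   # effect-size scaling of the H ellipse
heplot(dogfood.mlm, size = "evidence")      # significance (Roy test) scaling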


11.2 HE plot construction


The HE plot is constructed to allow a direct visualization of the “size” of hypothesized terms in a multivariate linear model in relation to unexplained error variation. These can be displayed in 2D or 3D plots, so I use the term “ellipsoid” below to cover all cases.


Error variation is represented by a standard 68% data ellipsoid of the \(\mathbf{E}\) matrix of the residuals in \(\boldsymbol{\Large\varepsilon}\). This is divided by the residual degrees of freedom, so the size of \(\mathbf{E} / \text{df}_e\) is analogous to a mean square error in univariate tests. The choice of 68% coverage allows you to “read” the residual standard deviation as the half-length of the shadow of the \(\mathbf{E}\) ellipsoid on any axis (see Figure 3.10). The \(\mathbf{E}\) ellipsoid is then translated to the overall (grand) means \(\bar{\mathbf{y}}\) of the variables plotted, which allows us to show the means for factor levels on the same scale, facilitating interpretation. In the notation of Equation 3.2, the error ellipsoid is given by \[ \mathcal{E}_c (\bar{\mathbf{y}}, \mathbf{E}) = \bar{\mathbf{y}} \; \oplus \; c\,\mathbf{E}^{1/2} \:\: , \] where \(c = \sqrt{2 F_{2, n-2}^{0.68}}\) for 2D plots and \(c = \sqrt{3 F_{3, n-3}^{0.68}}\) for 3D.
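The scaling constant \(c\) is just an F quantile; a small sketch in R (n is a hypothetical sample size):

n <- 30
c2 <- sqrt(2 * qf(0.68, 2, n - 2))   # radius multiplier for a 2D E ellipse
c3 <- sqrt(3 * qf(0.68, 3, n - 3))   # radius multiplier for a 3D E ellipsoid
c(c2, c3)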


An ellipsoid representing variation in the means of a factor (or any other term reflected in a general linear hypothesis test, Equation 10.6) in the \(\mathbf{H}\) matrix is simply the data ellipse of the fitted values for that term. Dividing the hypothesis matrix by the error degrees of freedom, giving \(\mathbf{H} / \text{df}_e\), puts this on the same scale as the \(\mathbf{E}\) ellipse. I refer to this as effect size scaling, because it is similar to an effect size index used in univariate models, e.g., \(ES = (\bar{y}_1 - \bar{y}_2) / s_e\) in a two-group, univariate design.


This is illustrated in …


The geometry of ellipsoids and multivariate tests allows us to go further with another re-scaling of the \(\mathbf{H}\) ellipsoid that gives a visual test of significance for any term in a MLM. This is done simply by dividing \(\mathbf{H} / df_e\) further by the \(\alpha\)-critical value of the corresponding test statistic to show the strength of evidence against the null hypothesis. Among the various multivariate test statistics, Roy’s maximum root test, based on the largest eigenvalue \(\lambda_1\) of \(\mathbf{H} \mathbf{E}^{-1}\), gives \(\mathbf{H} / (\lambda_\alpha df_e)\), which has the visual property that the scaled \(\mathbf{H}\) ellipsoid will protrude somewhere outside the standard \(\mathbf{E}\) ellipsoid if and only if Roy’s test is significant at significance level \(\alpha\). The critical value \(\lambda_\alpha\) for Roy’s test is \[ \lambda_\alpha = \left(\frac{\text{df}_1}{\text{df}_2}\right) \; F_{\text{df}_1, \text{df}_2}^{1-\alpha} \:\: , \] where \(\text{df}_1 = \max(p, \text{df}_h)\) and \(\text{df}_2 = \text{df}_h + \text{df}_e - \text{df}_1\).
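The critical value is easy to compute directly; a sketch with hypothetical dimensions (p = 2 responses, df_h = 3, df_e = 12, alpha = 0.05):

p <- 2; dfh <- 3; dfe <- 12
df1 <- max(p, dfh)
df2 <- dfh + dfe - df1
lambda_alpha <- (df1 / df2) * qf(1 - 0.05, df1, df2)
lambda_alpha    # H is then scaled by 1 / (lambda_alpha * dfe)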


For these data, the HE plot using significance scaling is shown in the right panel of …


11.3 Canonical discriminant analysis

#> Writing packages to  C:/R/Projects/Vis-MLM-book/bib/pkgs.txt
#> 8  packages used here:
#>  broom, car, carData, dplyr, ggplot2, heplots, knitr, tidyr
\ No newline at end of file
diff --git a/docs/12-eqcov.html b/docs/12-eqcov.html
new file mode 100644
index 00000000..5d86e854
--- /dev/null
+++ b/docs/12-eqcov.html
@@ -0,0 +1,1114 @@

12  Visualizing Equality of Covariance Matrices


To make the preliminary test on variances is rather like putting to sea in a rowing boat to find out whether conditions are sufficiently calm for an ocean liner to leave port. — G. E. P. Box (1953)


This chapter concerns the extension of tests of homogeneity of variance from the classical univariate ANOVA setting to the analogous multivariate (MANOVA) setting. Such tests are a routine but important aspect of data analysis, as particular violations can drastically impact model estimates and the conclusions that can appropriately be drawn (Lix & Keselman, 1996).


Beyond issues of model assumptions, the question of equality of covariance matrices is often of general interest itself. For instance, variability is often an important issue in studies of strict equivalence in laboratories comparing across multiple patient measurements and in other applied contexts (see Gastwirth et al., 2009 for other exemplars). Moreover, the outcomes of such tests often have important consequences for the details of a main method of analysis. Just as the Welch \(t\)-test (Welch, 1947) is now commonly used and reported for a two-group test of differences in means under unequal variances, a preliminary test of equality of covariance matrices is often used in discriminant analysis to decide whether linear (LDA) or quadratic discriminant analysis (QDA) should be applied in a given problem. In such cases, the data at hand should inform the choice of statistical analysis to utilize.


We provide some answers to the following questions:

  • Visualization: How can we visualize differences among group variances and covariance matrices, perhaps in a way that is analogous to what is done to visualize differences among group means? As will be illustrated, differences among covariance matrices can be comprised of spread in overall size (“scatter”) and shape (“orientation”). These can be seen in data space with data ellipses, particularly if the data is centered by shifting all groups to the grand mean.

  • Low-D views: When there are more than a few response variables, what low-dimensional views can show the most interesting properties related to the equality of covariance matrices? Projecting the data into the space of the principal components serves well again here. Surprisingly, we will see that the small dimensions contain useful information about differences among the group covariance matrices.

  • Other statistics: Box’s \(M\)-test is most widely used. Are there other worthwhile test statistics? We will see that graphics methods suggest alternatives.

The following subsections provide a capsule summary of the issues in this topic. Most of the discussion is couched in terms of a one-way design for simplicity, but the same ideas can apply to two-way (and higher) designs, where a “group” factor is defined as the product combination (interaction) of two or more factor variables. When there are also numeric covariates, this topic can be extended to the multivariate analysis of covariance (MANCOVA) setting. This can be accomplished by applying these techniques to the residuals from predictions by the covariates alone.


Packages


In this chapter we use the following packages. Load them now:

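The library() block itself did not survive extraction here. A minimal reconstruction of the loading step, assuming only the packages this chapter’s code visibly calls:

library(car)        # leveneTest()
library(dplyr)      # group_by()
library(ggplot2)    # ggplot() graphics
library(heplots)    # colDevs(), leveneTests(), covEllipses(), boxM()
library(tidyr)      # pivot_longer()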

12.1 Homogeneity of Variance in Univariate ANOVA


In classical (Gaussian) univariate ANOVA models, the main interest is typically on tests of mean differences in a response \(y\) according to one or more factors. The validity of the typical \(F\) test, however, relies on the assumption of homogeneity of variance: all groups have the same (or similar) variance, \[ \sigma_1^2 = \sigma_2^2 = \cdots = \sigma_g^2 \; . \]


It turns out that the \(F\) test for differences in means is relatively robust to violation of this assumption (Harwell et al., 1992), as long as the group sample sizes are roughly equal.1 This applies to Type I error \(\alpha\) rates, which are not much affected. However, unequal variance makes the ANOVA tests less efficient: you lose power to detect significant differences.


A variety of classical test statistics for homogeneity of variance are available, including Hartley’s \(F_{max}\) (Hartley, 1950), Cochran’s C (Cochran, 1941), and Bartlett’s test (Bartlett, 1937), but these have been found to have terrible statistical properties (Rogan & Keselman, 1977), which prompted Box’s famous quote.


Levene (1960) introduced a different form of test, based on the simple idea that when variances are equal across groups, the average absolute values of differences between the observations and group means will also be equal, i.e., substituting an \(L_1\) norm for the \(L_2\) norm of variance. In a one-way design, this is equivalent to a test of group differences in the means of the auxiliary variable \(z_{ij} = | y_{ij} - \bar{y}_i |\).


More robust versions of this test were proposed by Brown & Forsythe (1974). These tests substitute the group mean by either the group median or a trimmed mean in the ANOVA of the absolute deviations. Some suggest these should almost always be preferred to Levene’s version using the mean deviation. See Conover et al. (1981) for an early review and Gastwirth et al. (2009) for a general discussion of these tests. In what follows, we refer to this class of tests as “Levene-type” tests and suggest a multivariate extension described below (Section 12.2).


These deviations from a group central value can be calculated using heplots::colDevs(), and the central value can be a function, like mean, median, or an anonymous one like function(x) mean(x, trim = 0.1) that trims 10% off each side of the distribution. With a response Y, Levene’s test can then be performed “by hand” as follows:

# absolute deviations from the group means, then a one-way ANOVA of these
Z.mean <- abs( colDevs(Y, group) )
anova(lm(Z.mean ~ group))

# the more robust version: absolute deviations from the group medians
Z.med <- abs( colDevs(Y, group, median) )
anova(lm(Z.med ~ group))

The function car::leveneTest() does this, so we could examine whether the variances are equal in the Penguin variables, one at a time, like so:

data(peng, package = "heplots")
leveneTest(bill_length ~ species, data=peng)
#> Levene's Test for Homogeneity of Variance (center = median)
#>        Df F value Pr(>F)
#> group   2    2.29    0.1
#>       330
  # ...
leveneTest(body_mass ~ species, data=peng)
#> Levene's Test for Homogeneity of Variance (center = median)
#>        Df F value Pr(>F)   
#> group   2    5.13 0.0064 **
#>       330                  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

More conveniently, heplots::leveneTests(), with an “s”, does this for each of a set of response variables, specified in a data frame, a model formula or a "mlm" object. It also formats the results in a more pleasing way:

peng.mod <- lm(cbind(bill_length, bill_depth, flipper_length, body_mass) ~ species, 
               data = peng)
leveneTests(peng.mod)
#> Levene's Tests for Homogeneity of Variance (center = median)
#> 
#>                df1 df2 F value Pr(>F)   
#> bill_length      2 330    2.29 0.1033   
#> bill_depth       2 330    1.91 0.1494   
#> flipper_length   2 330    0.44 0.6426   
#> body_mass        2 330    5.13 0.0064 **
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

So, this tells us that the groups do not differ in variance on the first three variables, but they do for body_mass.

12.2 Visualizing Levene’s test

To gain some insight into the problem of homogeneity of variance, it is helpful to see how the situation looks in terms of the data. For the Penguin data, the simplest approach is just boxplots of the variables, trying to see whether the widths of the central 50% boxes seem to be the same, as in Figure 12.1. However, it is perceptually difficult to focus on differences in the widths of the boxes within each panel when their centers also differ from group to group.

See the code
source("R/penguin/penguin-colors.R")
col <- peng.colors("dark")
clr <- c(col, gray(.20))
peng_long <- peng |> 
  pivot_longer(bill_length:body_mass, 
               names_to = "variable", 
               values_to = "value") 

peng_long |>
  group_by(species) |> 
  ggplot(aes(value, species, fill = species)) +
  geom_boxplot() +
  facet_wrap(~ variable, scales = 'free_x') +
  theme_penguins() +
  theme_bw(base_size = 14) +
  theme(legend.position = 'none') 
Figure 12.1: Boxplots for the Penguin variables. For assessing homogeneity of variance, we should be looking for differences in width of the central 50% boxes in each panel, rather than differences in central tendency.

Instead, you can see more directly what is tested by the Levene test by graphing the absolute deviations from the group means or medians. This is another example of the graphical idea that visual comparisons are made easier by plotting the quantities of direct interest. You can calculate these values as follows:

vars <- c("bill_length", "bill_depth", "flipper_length", "body_mass")
pengDevs <- colDevs(peng[, vars], peng$species, median) |>
  abs()

From a boxplot of the absolute deviations in Figure 12.2, your eye can now focus on the central value, shown by the median (“|”) line, because Levene’s method is testing whether these differ across groups.

See the code
# calculate absolute differences from median
dev_long <- data.frame(species = peng$species, pengDevs) |> 
  pivot_longer(bill_length:body_mass, 
               names_to = "variable", 
               values_to = "value") 

dev_long |>
  group_by(species) |> 
  ggplot(aes(value, species, fill = species)) +
  geom_boxplot() +
  facet_wrap(~ variable, scales = 'free_x') +
  xlab("absolute median deviation") +
  theme_penguins() +
  theme_bw(base_size = 14) +
  theme(legend.position = 'none') 
Figure 12.2: Boxplots for absolute differences from group medians for the Penguin data.

It is now easy to see that the medians largely align for all the variables except for body_mass.


12.3 Homogeneity of variance in MANOVA


In the MANOVA context, the main emphasis, of course, is on differences among mean vectors, testing \[ \mathcal{H}_0 : \boldsymbol{\mu}_1 = \boldsymbol{\mu}_2 = \cdots = \boldsymbol{\mu}_g \; . \] However, the standard test statistics (Wilks’ Lambda, Hotelling-Lawley trace, Pillai-Bartlett trace, Roy’s maximum root) rely upon the analogous assumption that the within-group covariance matrices \(\boldsymbol{\Sigma}_i\) are equal for all groups, \[ \boldsymbol{\Sigma}_1 = \boldsymbol{\Sigma}_2 = \cdots = \boldsymbol{\Sigma}_g \; . \] This is much stronger than in the univariate case, because it also requires that all the correlations between pairs of variables are the same for all groups. For example, for two responses, there are three parameters (\(\rho, \sigma_1^2, \sigma_2^2\)) assumed equal across all groups; for \(p\) responses, there are \(p (p+1) / 2\) assumed equal.


To preview the main example, Figure 12.3 shows data ellipses for the main size variables in the palmerpenguins::penguins data.


To view the relations …

See the code
op <- par(mar = c(4, 4, 1, 1) + .5,
          mfrow = c(1, 2))
covEllipses(cbind(bill_length, bill_depth) ~ species, data=peng,
  fill = TRUE,
  fill.alpha = 0.1,
  lwd = 3,
  col = clr)

covEllipses(cbind(bill_length, bill_depth) ~ species, data=peng,
  center = TRUE, 
  fill = c(rep(FALSE,3), TRUE), 
  fill.alpha = .1, 
  lwd = 3,
  col = clr,
  label.pos = c(1:3,0))
par(op)
Figure 12.3: Data ellipses for bill length and bill depth in the penguins data, also showing the pooled covariance. Left: As is; right: centered at the grand means for easier comparison.

All pairs:

Code
clr <- c(peng.colors(), "black")
covEllipses(peng[,3:6], peng$species, 
  variables=1:4,
  col = clr,
  fill=TRUE, 
  fill.alpha=.1)
Figure 12.4: All pairwise covariance ellipses for the penguins data.

The covariance ellipses look pretty similar in size, shape and orientation. But what does Box’s M test (described below) say? As you can see, it concludes strongly against the null hypothesis.

boxM(cbind(bill_length, bill_depth, flipper_length, body_mass) ~ species, data=peng)
#> 
#>  Box's M-test for Homogeneity of Covariance Matrices
#> 
#> data:  Y
#> Chi-Sq (approx.) = 75, df = 20, p-value = 3e-08

12.4 Assessing heterogeneity of covariance matrices: Box’s M test


Box (1949) proposed the following likelihood-ratio test (LRT) statistic for testing the hypothesis of equal covariance matrices, \[ M = (N - g) \ln \;|\; \mathbf{S}_p \;|\; - \sum_{i=1}^g (n_i - 1) \ln \;|\; \mathbf{S}_i \;|\; \; , \]


where \(N = \sum n_i\) is the total sample size and \(\mathbf{S}_p = (N-g)^{-1} \sum_{i=1}^g (n_i - 1) \mathbf{S}_i\) is the pooled covariance matrix. \(M\) can thus be thought of as a ratio of the determinant of the pooled \(\mathbf{S}_p\) to the geometric mean of the determinants of the separate \(\mathbf{S}_i\).


In practice, there are various transformations of the value of \(M\) to yield a test statistic with an approximately known distribution (Timm, 1975). Roughly speaking, when each \(n_i > 20\), a \(\chi^2\) approximation is often used; otherwise an \(F\) approximation is known to be more accurate.


Asymptotically, \(-2 \ln (M)\) has a \(\chi^2\) distribution. The \(\chi^2\) approximation due to Box (1949, 1950) is that \[ X^2 = -2 (1-c_1) \ln (M) \quad \sim \quad \chi^2_{df} \] with \(df = (g-1) p (p+1)/2\) degrees of freedom, and a bias correction constant: \[ c_1 = \left( \sum_i \frac{1}{n_i -1} - \frac{1}{N-g} \right) \frac{2p^2 +3p -1}{6 (p+1)(g-1)} \; . \]
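
To make the pieces concrete, here is a minimal sketch of computing \(M\) and its \(\chi^2\) approximation “by hand” for the penguin size variables, assuming the peng data from heplots. With \(M\) already in the log form defined above, the correction amounts to multiplying by \((1 - c_1)\); compare with the boxM() output shown earlier.

library(heplots)
data(peng, package = "heplots")
Y   <- peng[, c("bill_length", "bill_depth", "flipper_length", "body_mass")]
grp <- peng$species

ni <- as.numeric(table(grp))   # group sample sizes
N  <- sum(ni); g <- length(ni); p <- ncol(Y)

S_i <- lapply(split(Y, grp), cov)                    # within-group covariance matrices
S_p <- Reduce(`+`, Map(`*`, S_i, ni - 1)) / (N - g)  # pooled covariance matrix

M  <- (N - g) * log(det(S_p)) -
      sum((ni - 1) * sapply(S_i, function(S) log(det(S))))
c1 <- (sum(1 / (ni - 1)) - 1 / (N - g)) *
      (2 * p^2 + 3 * p - 1) / (6 * (p + 1) * (g - 1))
X2 <- (1 - c1) * M                                   # chi-square approximation
df <- (g - 1) * p * (p + 1) / 2
pchisq(X2, df, lower.tail = FALSE)                   # compare: Chi-Sq approx. 75, df = 20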


In this form, Bartlett’s test for equality of variances in the univariate case is the special case of Box’s M when there is only one response variable, so Bartlett’s test is sometimes used as a univariate follow-up to determine which response variables show heterogeneity of variance.


Yet, like its univariate counterpart, Box’s test is well-known to be highly sensitive to violation of (multivariate) normality and the presence of outliers. For example, Tiku & Balakrishnan (1984) concluded from simulation studies that the normal-theory LRT provides poor control of Type I error under even modest departures from normality. O’Brien (1992) proposed some robust alternatives, and showed that Box’s normal theory approximation suffered both in controlling the null size of the test and in power. Zhang & Boos (1992) also carried out simulation studies with similar conclusions and used bootstrap methods to obtain corrected critical values.


12.5 Visualizing heterogeneity


The goal of this chapter is to use the above background as a platform for discussing approaches to visualizing and testing the heterogeneity of covariance matrices in multivariate designs. While researchers often rely on a single number to determine if their data have met a particular threshold, such compression will often obscure interesting information, particularly when a test concludes that differences exist, and one is left to wonder “why?”. It is within this context that, again, visualizations often reign supreme. In fact, it is somewhat surprising that this issue has not been addressed graphically before in any systematic way. TODO: cut this down


In what follows, we propose three visualization-based approaches to questions of heterogeneity of covariance in MANOVA designs:

  1. direct visualization of the information in the \(\mathbf{S}_i\) and \(\mathbf{S}_p\) using data ellipsoids to show size and shape as minimal schematic summaries;

  2. a simple dotplot of the components of Box’s M test: the log determinants of the \(\mathbf{S}_i\) together with that of the pooled \(\mathbf{S}_p\) (a sketch follows this list). Extensions of these simple plots raise the question of whether measures of heterogeneity other than that captured in Box’s test might also be useful; and,

  3. the connection between Levene-type tests and an ANOVA (of centered absolute differences) suggests a parallel with a multivariate extension of Levene-type tests and a MANOVA. We explore this with a version of Hypothesis-Error (HE) plots we have found useful for visualizing mean differences in MANOVA designs.
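
As a concrete version of the second item, heplots provides a plot() method for "boxM" objects. A minimal sketch for the penguin data:

peng.boxm <- boxM(cbind(bill_length, bill_depth, flipper_length, body_mass) ~ species,
                  data = peng)
plot(peng.boxm)   # dotplot of the log |S_i|, together with the pooled log |S_p|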

  1. If group sizes are greatly unequal and homogeneity of variance is violated, then the \(F\) statistic is too liberal (\(p\) values too large) when large sample variances are associated with small group sizes. Conversely, the \(F\) statistic is too conservative if large variances are associated with large group sizes.↩︎
13  Case studies

This chapter presents some complete analyses of datasets that will be prominent in the book. Some of this material may later be moved to earlier chapters.


Packages


In this chapter we use the following packages. Load them now:

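The library() block was again lost in extraction. A minimal reconstruction, assuming only the packages this chapter’s code visibly calls:

library(candisc)    # candisc()
library(car)        # Anova(), linearHypothesis(), scatterplotMatrix()
library(corrgram)   # corrgram()
library(dplyr)      # select(), group_by(), sample_n()
library(ggplot2)    # ggplot() graphics
library(heplots)    # heplot(), cqplot(), datasets
library(tidyr)      # gather()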

13.1 Neuro- and Social-cognitive measures in psychiatric groups


A Ph.D. dissertation by Laura Hartman (2016) at York University was designed to evaluate whether and how clinical patients diagnosed (on the DSM-IV) as schizophrenic or with schizoaffective disorder could be distinguished from each other and from a normal, control sample on collections of standardized tests in the following domains:

  • Neuro-cognitive: processing speed, attention, verbal learning, visual learning and problem solving;

  • Social-cognitive: managing emotions, theory of mind, externalizing, personalizing bias.

The study is an important contribution to clinical research because the two diagnostic categories are subtly different and their symptoms often overlap. Yet they are distinct disorders and often require different treatments. A key difference between schizoaffective disorder and schizophrenia is the prominence of mood disorder involving bipolar, manic and depressive moods. With schizoaffective disorder, mood disorders are front and center. With schizophrenia, that is not a dominant part of the disorder, but psychotic ideation (hearing voices, seeing imaginary people) is.

13.1.1 Research questions


This example is concerned with the following substantive questions:

  • To what extent can patients diagnosed as schizophrenic or with schizoaffective disorder be distinguished from each other and from a normal control sample using a well-validated, comprehensive neurocognitive battery specifically designed for individuals with psychosis (Heinrichs et al., 2015)?

  • If the groups differ, do any of the cognitive domains show particularly larger or smaller differences among these groups?

  • Do the neurocognitive measures discriminate among the groups in the same or different ways? If different, how many separate aspects or dimensions are distinguished?

Apart from the research interest, it could aid diagnosis and treatment if these similar mental disorders could be distinguished by tests in the cognitive domain.


13.1.2 Data


The clinical sample comprised 116 male and female patients who met the following criteria: 1) a diagnosis of schizophrenia (\(n\) = 70) or schizoaffective disorder (\(n\) = 46) confirmed by the Structured Clinical Interview for DSM-IV-TR Axis I Disorders; 2) outpatient status; 3) a history free of developmental or learning disability; 4) age 18-65; 5) a history free of neurological or endocrine disorder; and 6) no concurrent diagnosis of substance use disorder. Non-psychiatric control participants (\(n\) = 146) were screened for medical and psychiatric illness and history of substance abuse and were recruited from three outpatient clinics.

data(NeuroCog, package="heplots")
glimpse(NeuroCog)
#> Rows: 242
#> Columns: 10
#> $ Dx        <fct> Schizophrenia, Schizophrenia, Schizophrenia, Sch…
#> $ Speed     <int> 19, 8, 14, 7, 21, 31, -1, 17, 7, 37, 30, 26, 32,…
#> $ Attention <int> 9, 25, 23, 18, 9, 10, 8, 20, 30, 15, 27, 20, 23,…
#> $ Memory    <int> 19, 15, 15, 14, 35, 26, 3, 27, 26, 17, 28, 22, 2…
#> $ Verbal    <int> 33, 28, 20, 34, 28, 29, 20, 30, 26, 33, 34, 33, …
#> $ Visual    <int> 24, 24, 13, 16, 29, 21, 12, 32, 27, 21, 19, 18, …
#> $ ProbSolv  <int> 39, 40, 32, 31, 45, 33, 29, 29, 30, 33, 30, 39, …
#> $ SocialCog <int> 28, 37, 24, 36, 28, 28, 28, 44, 39, 24, 32, 36, …
#> $ Age       <int> 44, 26, 55, 53, 51, 21, 53, 56, 48, 46, 48, 31, …
#> $ Sex       <fct> Female, Male, Female, Male, Male, Male, Male, Fe…

The diagnostic classification variable is called Dx in the dataset. To facilitate answering questions regarding group differences, the following contrasts were applied: the first column compares the control group to the average of the diagnosed groups, the second compares the schizophrenia group against the schizoaffective group.

contrasts(NeuroCog$Dx)
#>                 [,1] [,2]
#> Schizophrenia   -0.5    1
#> Schizoaffective -0.5   -1
#> Control          1.0    0

In this analysis, we ignore the SocialCog variable. The primary focus is on the variables Speed : ProbSolv.


13.1.3 A first look


As always, plot the data first! We want an overview of the distributions of the variables to see the centers, spread, shape and possible outliers for each group on each variable.


The plot below combines the use of boxplots and violin plots to give an informative display. As we saw earlier (Chapter XXX), doing this with ggplot2 requires reshaping the data to long format.

# Reshape from wide to long
NC_long <- NeuroCog |>
  dplyr::select(-SocialCog, -Age, -Sex) |>
  tidyr::gather(key = response, value = "value", Speed:ProbSolv)
# view 3 observations per group and measure
NC_long |>
  group_by(Dx) |>
  sample_n(3) |> ungroup()
#> # A tibble: 9 × 3
#>   Dx              response  value
#>   <fct>           <chr>     <int>
#> 1 Schizophrenia   Speed        39
#> 2 Schizophrenia   Visual       21
#> 3 Schizophrenia   Memory       40
#> 4 Schizoaffective ProbSolv     40
#> 5 Schizoaffective Speed        25
#> 6 Schizoaffective Verbal       48
#> 7 Control         Speed        33
#> 8 Control         ProbSolv     43
#> 9 Control         Attention    37

In the plot, we take care to adjust the transparency (alpha) values for the points, violin plots and boxplots so that all can be seen. Options for geom_boxplot() are used to give these greater visual prominence.

Code
ggplot(NC_long, aes(x=Dx, y=value, fill=Dx)) +
  geom_jitter(shape=16, alpha=0.8, size=1, width=0.2) +
  geom_violin(alpha = 0.1) +
  geom_boxplot(width=0.5, alpha=0.4, 
               outlier.alpha=1, outlier.size = 3, outlier.color = "red") +
  scale_x_discrete(labels = c("Schizo", "SchizAff", "Control")) +
  facet_wrap(~response, scales = "free_y", as.table = FALSE) +
  theme_bw() +
  theme(legend.position="bottom",
        axis.title = element_text(size = rel(1.2)),
        axis.text  = element_text(face = "bold"),
        strip.text = element_text(size = rel(1.2)))
Figure 13.1: Boxplots and violin plots of the NeuroCog data.

We can see that the control participants score higher on all measures, but there is no consistent pattern of medians for the two patient groups. But these univariate summaries do not inform about the relations among variables.


13.1.4 Bivariate views


Corrgram


A corrgram (Friendly, 2002) provides a useful reconnaissance plot of the bivariate correlations in the dataset. It suppresses details, and allows focus on the overall pattern. The corrgram::corrgram() function has the ability to enhance perception by permuting the variables in the order of their variable vectors in a biplot, so more highly correlated variables are adjacent in the plot, an example of effect ordering for data displays (Friendly & Kwan, 2003).


The plot below includes all variables except for Dx group. There are a number of panel.* functions for choosing how the correlation for each pair is rendered.

NeuroCog |>
  select(-Dx) |>
  corrgram(order = TRUE,
           diag.panel = panel.density,
           upper.panel = panel.pie)
Figure 13.2: corrgram of the NeuroCog data. The upper and lower triangles use two different ways of encoding the value of the correlation for each pair of variables.

In this plot you can see that adjacent variables are more highly correlated than those more widely separated. The diagonal panels show that most variables are reasonably symmetric in their distributions. Age, not included in this analysis, is negatively correlated with the others: older participants tend to do less well on these tests.


Scatterplot matrix


A scatterplot matrix gives a more detailed overview of all pairwise relations. The plot below suppresses the data points and summarizes the relation using data ellipses and regression lines. The model syntax, ~ Speed + ... | Dx, treats Dx as a conditioning variable (similar to the use of the color aesthetic in ggplot2), giving a separate data ellipse and regression line for each group. (The legend is suppressed here. The groups are Schizophrenic, SchizoAffective, Normal.)

scatterplotMatrix(~ Speed + Attention + Memory + Verbal + Visual + ProbSolv | Dx,
  data=NeuroCog,
  plot.points = FALSE,
  smooth = FALSE,
  legend = FALSE,
  col = scales::hue_pal()(3),
  ellipse=list(levels=0.68))
Figure 13.3: Scatterplot matrix of the NeuroCog data. Points are suppressed here, focusing on the data ellipses and regression lines. Colors for the groups: Schizophrenic (red), SchizoAffective (green), Normal (blue)

In this figure, we can see that the regression lines have similar slopes and similar data ellipses for the groups, though with a few exceptions.


TODO: Should we add biplot here?


13.2 Fitting the MLM


We proceed to fit the one-way MANOVA model.

NC.mlm <- lm(cbind(Speed, Attention, Memory, Verbal, Visual, ProbSolv) ~ Dx,
             data=NeuroCog)
Anova(NC.mlm)
#> 
#> Type II MANOVA Tests: Pillai test statistic
#>    Df test stat approx F num Df den Df  Pr(>F)    
#> Dx  2     0.299     6.89     12    470 1.6e-11 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The first research question is captured by the contrasts for the Dx factor shown above. We can test these with car::linearHypothesis(). The contrast Dx1 for control vs. the diagnosed groups is highly significant,

# control vs. patients
print(linearHypothesis(NC.mlm, "Dx1"), SSP=FALSE)
#> 
#> Multivariate Tests: 
#>                  Df test stat approx F num Df den Df  Pr(>F)    
#> Pillai            1     0.289     15.9      6    234 2.8e-15 ***
#> Wilks             1     0.711     15.9      6    234 2.8e-15 ***
#> Hotelling-Lawley  1     0.407     15.9      6    234 2.8e-15 ***
#> Roy               1     0.407     15.9      6    234 2.8e-15 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

but the second contrast, Dx2, comparing the schizophrenic and schizoaffective group, is not.

# Schizo vs SchizAff
print(linearHypothesis(NC.mlm, "Dx2"), SSP=FALSE)
#> 
#> Multivariate Tests: 
#>                  Df test stat approx F num Df den Df Pr(>F)
#> Pillai            1     0.006    0.249      6    234   0.96
#> Wilks             1     0.994    0.249      6    234   0.96
#> Hotelling-Lawley  1     0.006    0.249      6    234   0.96
#> Roy               1     0.006    0.249      6    234   0.96

13.2.1 HE plot


So the question becomes: how to understand these results. heplot() shows the visualization of the multivariate model in the space of two response variables (the first two by default). The result (Figure 13.4) tells a very simple story: the control group performs higher on both measures than the diagnosed groups, which do not differ between themselves.


(For technical reasons, to abbreviate the group labels in the plot, we need to update() the MLM model after the labels are reassigned.)

# abbreviate levels for plots
NeuroCog$Dx <- factor(NeuroCog$Dx, 
                      labels = c("Schiz", "SchAff", "Contr"))
NC.mlm <- update(NC.mlm)
op <- par(mar=c(5,4,1,1)+.1)
heplot(NC.mlm, 
       fill=TRUE, fill.alpha=0.1,
       cex.lab=1.3, cex=1.25)
par(op)
Figure 13.4: HE plot of Speed and Attention in the MLM for the NeuroCog data. The labeled points show the means of the groups on the two variables. The blue H ellipse for groups indicates the strong positive correlation of the group means.

This pattern is consistent across all of the response variables, as we see from a plot of pairs(NC.mlm):

pairs(NC.mlm, 
      fill=TRUE, fill.alpha=0.1,
      var.cex=2)
Figure 13.5: HE plot matrix of the MLM for NeuroCog data.

13.2.2 Canonical space


We can gain further insight, and a simplified plot showing all the response variables, by projecting the MANOVA into the canonical space, which is entirely 2-dimensional (because \(df_h=2\)). However, the output from candisc() shows that 98.5% of the mean differences among groups can be accounted for in one canonical dimension.

NC.can <- candisc(NC.mlm)
NC.can
#> 
#> Canonical Discriminant Analysis for Dx:
#> 
#>    CanRsq Eigenvalue Difference Percent Cumulative
#> 1 0.29295    0.41433      0.408    98.5       98.5
#> 2 0.00625    0.00629      0.408     1.5      100.0
#> 
#> Test of H0: The canonical correlations in the 
#> current row and all that follow are zero
#> 
#>   LR test stat approx F numDF denDF Pr(> F)    
#> 1        0.703     7.53    12   468   9e-13 ***
#> 2        0.994     0.30     5   235    0.91    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1



Figure 13.6 is the result of the plot() method for class "candisc" objects, that is, the result of calling plot(NC.can, ...). It plots the two canonical scores, \(\mathbf{Z}_{n \times 2}\) for the subjects, together with data ellipses for each of the three groups.

pos <- c(4, 1, 4, 4, 1, 3)
col <- c("red", "darkgreen", "blue")
op <- par(mar=c(5,4,1,1)+.1)
plot(NC.can, 
     ellipse=TRUE, 
     rev.axes=c(TRUE,FALSE), 
     pch=c(7,9,10),
     var.cex=1.2, cex.lab=1.5, var.lwd=2,  scale=4.5, 
     col=col,
     var.col="black", var.pos=pos,
     prefix="Canonical dimension ")
par(op)
Figure 13.6: Canonical discriminant plot for the NeuroCog data MANOVA. Scores on the two canonical dimensions are plotted, together with 68% data ellipses for each group.

The interpretation of Figure 13.6 is again fairly straightforward. As noted earlier (REF???), the projections of the variable vectors in this plot on the coordinate axes are proportional to the correlations of the responses with the canonical scores. From this, we see that the normal group differs from the two patient groups, having higher scores on all the neurocognitive variables, most of which are highly correlated. The problem solving measure is slightly different, and this, compared to the cluster of memory, verbal and attention, is what distinguishes the schizophrenic group from the schizoaffectives.
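
These correlations are available directly in the "candisc" object as structure coefficients, so they can be checked numerically (assuming the NC.can object created above):

NC.can$structure   # correlations of each response with the canonical dimensions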


The separation of the groups is essentially one-dimensional, with the control group higher on all measures. Moreover, the variables processing speed and visual memory are the purest measures of this dimension, but all variables contribute positively. The second canonical dimension accounts for only 1.5% of group mean differences and is non-significant (by a likelihood ratio test). Yet, if we were to interpret it, we would note that the schizophrenia group is slightly higher on this dimension, scoring better in problem solving and slightly worse on working memory, attention, and verbal learning tasks.


Summary


This analysis gives a very simple description of the data, in relation to the research questions posed earlier:

  • On the basis of these neurocognitive tests, the schizophrenic and schizoaffective groups do not differ significantly overall, but these groups differ greatly from the normal controls.

  • All cognitive domains distinguish the groups in the same direction, with the greatest differences shown for the variables most closely aligned with the horizontal axis in Figure 13.6.

13.3 Social cognitive measures


The social cognitive measures were designed to tap various aspects of the perception and cognitive processing of emotions of others. Emotion perception was assessed using a Managing Emotions score from the MCCB. A “theory of mind” (ToM) score assessed ability to read the emotions of others from photographs of the eye region of male and female faces. Two other measures, externalizing bias (ExtBias) and personalizing bias (PersBias) were calculated from a scale measuring the degree to which individuals attribute internal, personal or situational causal attributions to positive and negative social events.


The analysis of the SocialCog data proceeds in a similar way: first we fit the MANOVA model, then test the overall differences among groups using Anova(). We find that the overall multivariate test is again significant,

data(SocialCog, package="heplots")
SC.mlm <-  lm(cbind(MgeEmotions,ToM, ExtBias, PersBias) ~ Dx,
               data=SocialCog)
Anova(SC.mlm)
#> 
#> Type II MANOVA Tests: Pillai test statistic
#>    Df test stat approx F num Df den Df  Pr(>F)    
#> Dx  2     0.212     3.97      8    268 0.00018 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Testing the same two contrasts using linearHypothesis() (results not shown), we find that the overall multivariate test is again significant, but now both contrasts are significant (Dx1: \(F(4, 133)=5.21, p < 0.001\); Dx2: \(F(4, 133)=2.49, p = 0.0461\)), the test for Dx2 just barely.

# control vs. patients
print(linearHypothesis(SC.mlm, "Dx1"), SSP=FALSE)
# Schizo vs. SchizAff
print(linearHypothesis(SC.mlm, "Dx2"), SSP=FALSE)

These results are important, because, if they are reliable and make sense substantively, they imply that patients with schizophrenia and schizoaffective diagnoses can be distinguished by their performance on tasks assessing social perception and cognition. This was potentially a new finding in the literature on schizophrenia.


As we did above, it is useful to visualize the nature of these differences among groups with HE plots for the SC.mlm model. Each contrast has a corresponding \(\mathbf{H}\) ellipse, which we can show in the plot using the hypotheses argument. With a single degree of freedom, these degenerate ellipses plot as lines.

op <- par(mar=c(5,4,1,1)+.1)
heplot(SC.mlm, 
       hypotheses=list("Dx1"="Dx1", "Dx2"="Dx2"),
       fill=TRUE, fill.alpha=.1,
       cex.lab=1.5, cex=1.2)
par(op)
Figure 13.7: HE plot of MgeEmotions and ToM in the MLM for the SocialCog data. The labeled points show the means of the groups on the two variables. The lines for Dx1 and Dx2 show the tests of the contrasts among groups.

It can be seen that the three group means are approximately equally spaced on the ToM measure, whereas for MgeEmotions, the control and schizoaffective groups are quite similar, and both are higher than the schizophrenic group. This ordering of the three groups was somewhat similar for the other responses, as we could see in a pairs(SC.mlm) plot.


13.3.1 Model checking


Normally, we would continue this analysis, and consider other HE and canonical discriminant plots to further interpret the results, in particular the relations of the cognitive measures to group differences, or perhaps an analysis of the relationships between the neuro- and social-cognitive measures. We don’t pursue this here for reasons of length, but this example actually has a more important lesson to demonstrate.


Before beginning the MANOVA analyses, extensive data screening was done by the client using SPSS, in which all the response and predictor variables were checked for univariate normality and multivariate normality (MVN) for both sets. This traditional approach yielded a huge amount of tabular output and no graphs, and did not indicate any major violation of assumptions.1


A simple visual test of MVN and the possible presence of multivariate outliers is related to the theory of the data ellipse: Under MVN, the squared Mahalanobis distances \(D^2_M (\mathbf{y}) = (\mathbf{y} - \bar{\mathbf{y}})' \, \mathbf{S}^{-1} \, (\mathbf{y} - \bar{\mathbf{y}})\) should follow a \(\chi^2_p\) distribution. Thus, a quantile-quantile plot of the ordered \(D^2_M\) values vs. corresponding quantiles of the \(\chi^2\) distribution should approximate a straight line (Cox, 1968; Healy, 1968). Note that this should be applied to the residuals from the model – residuals(SC.mlm) – and not to the response variables directly.
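
A by-hand version of this idea makes the theory concrete. The sketch below uses classical estimates of center and covariance (unlike the robust MVE estimate used by cqplot() below):

res <- residuals(SC.mlm)
D2  <- mahalanobis(res, center = colMeans(res), cov = cov(res))
qqplot(qchisq(ppoints(nrow(res)), df = ncol(res)), D2,
       xlab = "Chi-square quantiles",
       ylab = "Squared Mahalanobis distance")
abline(0, 1)   # points should follow this line under multivariate normality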


heplots::cqplot() implements this for "mlm" objects. Calling this function for the model SC.mlm produces Figure 13.8. It is immediately apparent that there is one extreme multivariate outlier; three other points are identified, but the remaining observations are nearly all within the 95% confidence envelope (using a robust MVE estimate of \(\mathbf{S}\)).

op <- par(mar=c(5,4,1,1)+.1)
cqplot(SC.mlm, method="mve", 
       id.n=4, 
       main="", 
       cex.lab=1.25)
par(op)
Figure 13.8: Chi-square quantile-quantile plot for residuals from the model SC.mlm. The confidence band gives a point-wise 95% envelope, providing information about uncertainty. One extreme multivariate outlier is highlighted.

Further checking revealed that this was a data entry error where one case (15) in the schizophrenia group had a score of -33 recorded on the ExtBias measure, whose valid range was (-10, +10). In R, it is very easy to re-fit a model to a subset of observations (rather than modifying the dataset itself) using update(). The result of the overall Anova and the test of Dx1 were unchanged; however, the multivariate test for the most interesting contrast Dx2 comparing the schizophrenia and schizoaffective groups became non-significant at the \(\alpha=0.05\) level (\(F(4, 133)=2.18, p = 0.0742\)).

SC.mlm1 <- update(SC.mlm, 
                  subset=rownames(SocialCog)!="15")

Anova(SC.mlm1)
print(linearHypothesis(SC.mlm1, "Dx1"), SSP=FALSE)
print(linearHypothesis(SC.mlm1, "Dx2"), SSP=FALSE)

13.3.2 Canonical HE plot


This outcome creates a bit of a quandary for further analysis (do univariate follow-up tests? try a robust model?) and reporting (what to claim about the Dx2 contrast?) that we don’t explore here. Rather, we proceed to attempt to interpret the MLM with the aid of canonical analysis and a canonical HE plot. The canonical analysis of the model SC.mlm1 now shows that the first canonical dimension is clearly significant, and the two dimensions account for 83.9% and 16.1% of between-group mean differences, respectively.

SC.can1 <- candisc(SC.mlm1)
SC.can1
#> 
#> Canonical Discriminant Analysis for Dx:
#> 
#>   CanRsq Eigenvalue Difference Percent Cumulative
#> 1 0.1645     0.1969      0.159    83.9       83.9
#> 2 0.0364     0.0378      0.159    16.1      100.0
#> 
#> Test of H0: The canonical correlations in the 
#> current row and all that follow are zero
#> 
#>   LR test stat approx F numDF denDF Pr(> F)    
#> 1        0.805     3.78     8   264 0.00032 ***
#> 2        0.964     1.68     3   133 0.17537    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
op <- par(mar=c(5,4,1,1)+.1)
heplot(SC.can1, 
  fill=TRUE, fill.alpha=.1,
  hypotheses=list("Dx1"="Dx1", "Dx2"="Dx2"),
  lwd = c(1, 2, 3, 3),
  col=c("red", "blue", "darkgreen", "darkgreen"),
  var.lwd=2, 
  var.col="black", 
  label.pos=c(3,1), 
  var.cex=1.2, 
  cex=1.25, cex.lab=1.2, 
  scale=2.8,
  prefix="Canonical dimension ")
par(op)
Figure 13.9: Canonical HE plot for the corrected SocialCog MANOVA. The variable vectors show the correlations of the responses with the canonical variables. The embedded green lines show the projections of the H ellipses for the contrasts Dx1 and Dx2 in canonical space.

The HE plot version of this canonical plot is shown in Figure 13.9. Because the heplot() method for a "candisc" object refits the original model to the \(\mathbf{Z}\) canonical scores, it is easy to also project other linear hypotheses into this space. Note that in this view, both the Dx1 and Dx2 contrasts project outside the \(\mathbf{E}\) ellipse.2


This canonical HE plot has a very simple description:

  • Dimension 1 orders the groups from control to schizoaffective to schizophrenia, while dimension 2 separates the schizoaffective group from the others;

  • Externalizing bias and theory of mind contribute most to the first dimension, while personal bias and managing emotions are more aligned with the second; and,

  • The relations of the two contrasts to group differences and to the response variables can be easily read from this plot.

  1. Actually, multivariate normality of the predictors in \(\mathbf{X}\) is not required in the MLM. This assumption applies only to the conditional values \(\mathbf{Y} \;|\; \mathbf{X}\), i.e., that the errors \(\boldsymbol{\epsilon}_{i}' \sim \mathcal{N}_{p}(\mathbf{0},\boldsymbol{\Sigma})\) with constant covariance matrix. Moreover, the widely used MVN test statistics, such as Mardia’s (1970) test based on multivariate skewness and kurtosis, are known to be quite sensitive to mild departures in kurtosis (Mardia, 1974) which do not threaten the validity of the multivariate tests.↩︎

  2. The direct application of significance tests to canonical scores probably requires some adjustment because these are computed to have the optimal between-group discrimination.↩︎

Colophon


This book was produced using R version 4.4.1 (2024-06-14 ucrt). Fundamental to this was the framework for reproducible documents provided by Yihui Xie’s knitr package.


Quarto was used to compile and render the book in HTML and PDF formats. [** Don’t really need all this**]

Quarto 1.5.53
[>] Checking versions of quarto binary dependencies...
      Pandoc version 3.2.0: OK
      Dart Sass version 1.70.0: OK
      Deno version 1.41.0: OK
      Typst version 0.11.0: OK
[>] Checking versions of quarto dependencies......OK
[>] Checking Quarto installation......OK
      Version: 1.5.53
      CodePage: 1252
[>] Checking tools....................OK
      TinyTeX: (not installed)
      Chromium: (not installed)
[>] Checking LaTeX....................OK
      Tex:  (not detected)
[>] Checking basic markdown render....OK
[>] Checking Python 3 installation....(None)
      Unable to locate an installed version of Python 3.
      Install Python 3 from https://www.python.org/downloads/
[>] Checking R installation...........OK
      Version: 4.4.1
      LibPaths:
        - C:/R/R-4.4.1/library
      knitr: 1.49
      rmarkdown: 2.29
[>] Checking Knitr engine render......OK

Package versions


The principal R package versions used in examples and illustrations are listed below. These were captured via sessioninfo:::package_info() from all library() commands in the text, by scripts that also updated the references to packages.
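
For example, the version information for a handful of packages could be captured like this (a sketch with a hypothetical subset):

pkgs <- c("car", "heplots", "candisc")                 # hypothetical subset
sessioninfo::package_info(pkgs, dependencies = FALSE)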


At the time of writing, most of these were current on CRAN repositories but some development versions are indicated as “local” in the source column.

package          version   date         source
bayestestR       0.15.0    2024-10-17   CRAN
broom            1.0.7     2024-09-26   CRAN
candisc          0.9.0     2024-10-31   local
car              3.1-3     2024-09-27   CRAN
carData          3.0-5     2022-01-06   CRAN
corpcor          1.6.10    2021-09-16   CRAN
correlation      0.8.6     2024-10-26   CRAN
corrgram         1.14      2021-04-29   CRAN
corrplot         0.95      2024-10-14   CRAN
datawizard       0.13.0    2024-10-05   CRAN
dplyr            1.1.4     2023-11-17   CRAN
easystats        0.7.3     2024-07-22   CRAN
effects          4.2-2     2022-07-13   CRAN
effectsize       1.0.0     2024-12-10   CRAN
factoextra       1.0.7     2020-04-01   CRAN
FactoMineR       2.11      2024-04-20   CRAN
forcats          1.0.0     2023-01-29   CRAN
genridge         0.8.0     2024-12-02   CRAN
GGally           2.2.1     2024-02-14   CRAN
gganimate        1.0.9     2024-02-27   CRAN
ggbiplot         0.6.2     2024-01-08   CRAN
ggdensity        1.0.0     2023-02-09   CRAN
ggeffects        2.0.0     2024-11-27   CRAN
ggpcp            0.2.0     2022-11-28   CRAN
ggplot2          3.5.1     2024-04-23   CRAN
ggpubr           0.6.0     2023-02-10   CRAN
ggrepel          0.9.6     2024-09-07   CRAN
ggstats          0.7.0     2024-09-22   CRAN
heplots          1.7.3     2024-12-20   local
Hotelling        1.0-8     2021-09-09   CRAN
imager           1.0.2     2024-05-13   CRAN
insight          1.0.0     2024-11-26   CRAN
knitr            1.49      2024-11-08   CRAN
lubridate        1.9.4     2024-12-08   CRAN
magrittr         2.0.3     2022-03-30   CRAN
marginaleffects  0.24.0    2024-11-25   CRAN
MASS             7.3-61    2024-06-13   CRAN
matlib           1.0.1     2024-10-23   local
modelbased       0.8.9     2024-10-26   CRAN
modelsummary     2.2.0     2024-09-02   CRAN
parameters       0.24.0    2024-11-27   CRAN
patchwork        1.3.0     2024-09-16   CRAN
performance      0.12.4    2024-10-18   CRAN
purrr            1.0.2     2023-08-10   CRAN
qgraph           1.9.8     2023-11-03   CRAN
readr            2.1.5     2024-01-10   CRAN
report           0.5.9     2024-07-10   CRAN
Rtsne            0.17      2023-12-07   CRAN
see              0.9.0     2024-09-06   CRAN
stringr          1.5.1     2023-11-14   CRAN
tibble           3.2.1     2023-03-20   CRAN
tidyr            1.3.1     2024-01-24   CRAN
tidyverse        2.0.0     2023-02-22   CRAN
tourr            1.2.0     2024-04-20   CRAN
vcd              1.4-13    2024-09-16   CRAN
VisCollin        0.1.2     2023-09-05   CRAN

References

+
+
+ + +
+ + + + +
+ + +
+ +
+ + + +
+
+Abbott, E. A. (1884). Flatland: A romance of many dimensions. +Buccaneer Books. +
+
+Adler, D., & Murdoch, D. (2023). Rgl: 3D visualization using +OpenGL. https://CRAN.R-project.org/package=rgl +
+
+Aluja, T., Morineau, A., & Sanchez, G. (2018). Principal +component analysis for data science. https://pca4ds.github.io/ +
+
+Andrews, D. F. (1972). Plots of high dimensional data. +Biometrics, 28, 123–136. +
+
+Anscombe, F. J. (1973). Graphs in statistical analysis. The American +Statistician, 27, 17–21. +
+
+Arel-Bundock, V. (2024a). Marginaleffects: Predictions, comparisons, +slopes, marginal means, and hypothesis tests. https://marginaleffects.com/ +
+
+Arel-Bundock, V. (2024b). Modelsummary: Summary tables and plots for +statistical models and data: Beautiful, customizable, and +publication-ready. https://modelsummary.com +
+
+Asimov, D. (1985). Grand tour. SIAM Journal of Scientific and +Statistical Computing, 6(1), 128–143. +
+
+Barab’asi, A.-L. (2016). Network science. Cambridge University +Press. +
+
+Bartlett, M. S. (1937). Properties of sufficiency and statistical tests. +Proceedings of the Royal Society of London. Series A, +160(901), 268–282. https://doi.org/10.2307/96803 +
+
+Becker, R. A., Cleveland, W. S., & Shyu, M.-J. (1996). The visual +design and control of trellis display. Journal of Computational and +Graphical Statistics, 5(2), 123–155. +
+
+Belsley, D. A. (1991). Conditioning diagnostics: Collinearity and +weak data in regression. Wiley. +
+
+Belsley, D. A., Kuh, E., & Welsch, R. E. (1980). Regression +diagnostics: Identifying influential data and sources of +collinearity. John Wiley; Sons. +
+
+Biecek, P., Baniecki, H., Krzyzinski, M., & Cook, D. (2023). +Performance is not enough: A story of the rashomon’s quartet. +https://arxiv.org/abs/2302.13356 +
+
+Black, C., Southwell, C., Emmerson, L., Lunn, D., & Hart, T. (2018). +Time-lapse imagery of adélie penguins reveals differential winter +strategies and breeding site occupation. PLOS ONE, +13(3), e0193532. https://doi.org/10.1371/journal.pone.0193532 +
+
+Blishen, B., Carroll, W., & Moore, C. (1987). The 1981 socioeconomic +index for occupations in canada. Canadian Review of Sociology/Revue +Canadienne de Sociologie, 24(4), 465–488. https://doi.org/10.1111/j.1755-618x.1987.tb00639.x +
+
+Bock, R. D. (1963). Programming univariate and multivariate analysis of +variance. Technometrics, 5(1), 95–117. https://doi.org/10.1080/00401706.1963.10490061 +
+
+Bock, R. D. (1964). A computer program forunivariate and multivariate +analysis of variance. Proceedings of Scientific Symposium on +Statistics. +
+
+Bondy, J. A., & Murty, U. S. R. (2008). Graph theory. +Springer. +
+
+Borg, I., & Groenen, P. J. F. (2005). Modern Multidimensional Scaling: Theory and +Applications. Springer. +
+
+Borg, I., Groenen, P. J. F., & Mair, P. (2018). Applied +multidimensional scaling and unfolding. In SpringerBriefs in +Statistics. Springer International Publishing. https://doi.org/10.1007/978-3-319-73471-2 +
+
+Box, G. E. P. (1949). A general distribution theory for a class of +likelihood criteria. Biometrika, 36(3-4), 317–346. https://doi.org/10.1093/biomet/36.3-4.317 +
+
+Box, G. E. P. (1950). Problems in the analysis of growth and +wear curves. Biometrics, 6, 362–389. +
+
+Box, G. E. P. (1953). Non-normality and tests on variances. +Biometrika, 40(3/4), 318–335. https://doi.org/10.2307/2333350 +
+
+Brown, M. B., & Forsythe, A. B. (1974). Robust tests for equality of +variances. Journal of the American Statistical Association, +69(346), 364–367. https://doi.org/10.1080/01621459.1974.10482955 +
+
+Brown, P. J., & Zidek, J. V. (1980). Adaptive multivariate ridge +regression. The Annals of Statistics, 8(1), 64–74. http://www.jstor.org/stable/2240743 +
+
+Buja, A., Cook, D., Asimov, D., & Hurley, C. (2005). Computational +methods for high-dimensional rotations in data visualization. In J. S. +CR Rao EJ Wegman (Ed.), Handbook of statistics (pp. 391–413). +Elsevier. https://doi.org/10.1016/s0169-7161(04)24014-7 +
+
+cagne, M. (1885). Coordonnées parallèles +et axiales: Méthode de transformation +géométrique et +procédé nouveau de calcul graphique +déduits de la considération des +coordonnées parallèlles. +Gauthier-Villars. http://historical.library.cornell.edu/cgi-bin/cul.math/docviewer?did=00620001&seq=3 +
+
+Cajori, F. (1926). Origins of fourth dimension concepts. The +American Mathematical Monthly, 33(8), 397–406. https://doi.org/10.1080/00029890.1926.11986607 +
+
+Cattell, R. B. (1966). The scree test for the number of factors. +Multivariate Behavioral Research, 1(2), 245–276. https://doi.org/10.1207/s15327906mbr0102_10 +
+
+Chambers, J. M., & Hastie, T. J. (1991). Statistical models in +s (p. 624). Chapman & Hall/CRC. +
+
+Cleveland, W. S. (1979). Robust locally weighted regression and +smoothing scatterplots. Journal of the American Statistical +Association, 74, 829–836. +
+
+Cleveland, W. S. (1985). The elements of graphing data. +Wadsworth Advanced Books. +
+
+Cleveland, W. S., & Devlin, S. J. (1988). Locally weighted +regression: An approach to regression analysis by local fitting. +Journal of the American Statistical Association, 83, +596–610. +
+
+Cleveland, W. S., & McGill, R. (1984). Graphical perception: Theory, +experimentation and application to the development of graphical methods. +Journal of the American Statistical Association, 79, +531–554. +
+
+Cleveland, W. S., & McGill, R. (1985). Graphical perception and +graphical methods for analyzing scientific data. Science, +229, 828–833. +
+
+Clyde, D. J., Cramer, E. M., & Sherin, R. J. (1966). +Multivariate statistical programs. Biometric +Laboratory,University of Miami. +
+
+Cochran, W. G. (1941). The distribution of the largest of a set of +estimated variances as a fraction of their total. Annals of +Eugenics, 11(1), 47–52. https://doi.org/10.1111/j.1469-1809.1941.tb02271.x +
+
+Conover, W. J., Johnson, M. E., & Johnson, M. M. (1981). A +comparative study of tests for homogeneity of variances, with +applications to the outer continental shelf bidding data. +Technometrics, 23(4), 351–361. https://doi.org/10.1080/00401706.1981.10487680 +
+
+Cook, D., Buja, A., Cabrera, J., & Hurley, C. (1995). Grand tour and +projection pursuit. Journal of Computational and Graphical +Statistics, 4(3), 155. https://doi.org/10.2307/1390844 +
+
+Cook, D., Buja, A., Lee, E.-K., & Wickham, H. (2008). Grand tours, +projection pursuit guided tours, and manual controls. In Handbook of +data visualization (pp. 295–314). Springer Berlin Heidelberg. https://doi.org/10.1007/978-3-540-33037-0_13 +
+
+Cook, D., & Laa, U. (2024). Interactively exploring +high-dimensional data and models in R. Online. https://dicook.github.io/mulgar_book/ +
+
+Cook, D., & Swayne, D. F. (2007). Interactive and dynamic +graphics for data analysis : With R and +GGobi. Springer. http://www.ggobi.org/book/ +
+
+Cook, R. D. (1977). Detection of influential observation in linear +regression. Technometrics, 19(1), 15–18. http://links.jstor.org/sici?sici=0040-1706%28197702%2919%3A1%3C15%3ADOIOIL%3E2.0.CO%3B2-8 +
+
+Cook, R. D. (1993). Exploring partial residual plots. +Technometrics, 35(4), 351–362. +
+
+Cook, R. D. (1996). Added-variable plots and curvature in linear +regression. Technometrics, 38(3), 275–278. https://doi.org/10.1080/00401706.1996.10484507 +
+
+Cook, R. D., & Weisberg, S. (1982). Residuals and influence in +regression. Chapman; Hall. +
+
+Cook, R. D., & Weisberg, S. (1994). ARES plots for generalized +linear models. Computational Statistics & Data Analysis, +17(3), 303–315. https://doi.org/10.1016/0167-9473(92)00075-3 +
+
+Costantini, G., Epskamp, S., Borsboom, D., Perugini, M., Mõttus, R., +Waldorp, L. J., & Cramer, A. O. J. (2015). State of the aRt personality research: A tutorial on network +analysis of personality data in R. Journal of Research +in Personality, 54, 13–29. https://doi.org/10.1016/j.jrp.2014.07.003 +
+
+Cotton, R. (2013). Learning R. O’Reilly Media. +
+
+Cox, D. R. (1968). Notes on some aspects of regression analysis. +Journal of the Royal Statistical Society Series A, +131, 265–279. +
+
+Csárdi, G., Nepusz, T., Traag, V., Horvát, S., Zanini, F., Noom, D., +& Müller, K. (2024). igraph: Network +analysis and visualization in r. https://doi.org/10.5281/zenodo.7682609 +
+
+Curran, J., & Hersh, T. (2021). Hotelling: Hotelling’s t^2 test +and variants. https://CRAN.R-project.org/package=Hotelling +
+
+Davies, R., Locke, S., & D’Agostino McGowan, L. (2022). +datasauRus: Datasets from the datasaurus dozen. https://CRAN.R-project.org/package=datasauRus +
+
+Davis, C. (1990). Body image and weight preoccupation: A comparison +between exercising and non-exercising women. Appetite, +16(1), 84. https://doi.org/10.1016/0195-6663(91)90115-9 +
+
+Dempster, A. P. (1969). Elements of continuous multivariate +analysis. Addison-Wesley. +
+
+Dempster, A. P. (1972). Covariance selection. Biometrics, +28(1), 157–175. +
+
+Dixon, W. J. (1965). BMD biomedical computer programs. Health +Sciences Computing Facility, School of Medicine, University of +California; Health Sciences Computing Faculty. +
+
+Dray, S., Siberchicot, A., & Jean Thioulouse. Based on earlier work +by Alice Julien-Laferrière., with contributions from. (2023). +Adegraphics: An S4 lattice-based package for the representation of +multivariate data. http://pbil.univ-lyon1.fr/ADE-4/ +
+
+Duncan, O. D. (1961). A socioeconomic index for all occupations. In Jr. +A. J. Reiss, P. K. H. O. D. Duncan, & C. C. North (Eds.), +Occupations and social status. The Free Press. +
+
+Efron, B., Hastie, T., Johnstone, I., & Tibshirani, R. (2004). Least +angle regression. The Annals of Statistics, 32(2), +407–499. +
+
+Emerson, J. W., Green, W. A., Schloerke, B., Crowley, J., Cook, D., +Hofmann, H., & Wickham, H. (2013). The generalized pairs plot. +Journal of Computational and Graphical Statistics, +22(1), 79–91. http://www.tandfonline.com/doi/ref/10.1080/10618600.2012.694762 +
+
+Euler, L. (1758). Elementa doctrinae solidorum. Novi Commentarii +Academiae Scientiarum Petropolitanae, 4, 109–140. https://scholarlycommons.pacific.edu/euler-works/230/ +
+
+Farquhar, A. B., & Farquhar, H. (1891). Economic and industrial +delusions: A discourse of the case for protection. Putnam. +
+
+Fienberg, S. E. (1971). Randomization and social affairs: The 1970 draft +lottery. Science, 171, 255–261. +
+
+Finn, J. D. (1967). MULTIVARIANCE: Fortran program for +univariate and multivariate analysis of variance and covariance. +School of Education, State University of New York at Buffalo. +
+
+Fisher, R. A. (1923). Studies in crop variation. II. The manurial +response of different potato varieties. The Journal of Agricultural +Science, 13(2), 311–320. https://hdl.handle.net/2440/15179 +
+
+Fisher, R. A. (1925b). Statistical methods for research +workers. Oliver & Boyd. +
+
+Fisher, R. A. (1925a). Statistical methods for research workers +(6th ed.). Oliver & Boyd. +
+
+Fisher, R. A. (1936). The use of multiple measurements in taxonomic +problems. Annals of Eugenics, 7(2), 179–188. https://doi.org/10.1111/j.1469-1809.1936.tb02137.x +
+
+Fisherkeller, M. A., Friedman, J. H., & Tukey, J. W. (1974). PRIM-9, an interactive multidimensional data display and analysis system. Proceedings of the Pacific ACM Regional Conference.
+
+Flury, B., & Riedwyl, H. (1988). Multivariate statistics: A +practical approach. Chapman & Hall. +
+
+Fox, J. (1987). Effect displays for generalized linear models. In C. C. +Clogg (Ed.), Sociological methodology, 1987 (pp. 347–361). +Jossey-Bass. +
+
+Fox, J. (2003). Effect displays in R for generalized linear +models. Journal of Statistical Software, 8(15), 1–27. +
+
+Fox, J. (2016). Applied regression analysis and generalized linear models (3rd ed.). SAGE.
+
+Fox, J. (2020). Regression diagnostics (2nd ed.). +SAGE Publications, Inc. https://doi.org/10.4135/9781071878651 +
+
+Fox, J. (2021). A mathematical primer for social statistics +(2nd ed.). SAGE Publications, Inc. https://doi.org/10.4135/9781071878835 +
+
+Fox, J., & Monette, G. (1992). Generalized collinearity diagnostics. +Journal of the American Statistical Association, +87(417), 178–183. +
+
+Fox, J., & Weisberg, S. (2018a). An R companion to applied regression (3rd ed.). SAGE Publications. https://books.google.ca/books?id=uPNrDwAAQBAJ
+
+Fox, J., & Weisberg, S. (2018b). Visualizing fit and lack of fit in +complex regression models with predictor effect plots and partial +residuals. Journal of Statistical Software, 87(9). https://doi.org/10.18637/jss.v087.i09 +
+
+Fox, J., Weisberg, S., & Price, B. (2023). Car: Companion to +applied regression. https://CRAN.R-project.org/package=car +
+
+Fox, J., Weisberg, S., Price, B., Friendly, M., & Hong, J. (2022). +Effects: Effect displays for linear, generalized linear, and other +models. https://www.r-project.org +
+
+Friedman, J., Hastie, T., Tibshirani, R., Narasimhan, B., Tay, K., +Simon, N., & Yang, J. (2023). Glmnet: Lasso and elastic-net +regularized generalized linear models. https://glmnet.stanford.edu +
+
+Friendly, M. (1991). SAS System for statistical graphics (1st ed.). SAS Institute. http://www.sas.com/service/doc/pubcat/uspubcat/ind_files/56143.html
+
+Friendly, M. (1994). Mosaic displays for multi-way contingency tables. +Journal of the American Statistical Association, 89, +190–200. http://www.jstor.org/stable/2291215 +
+
+Friendly, M. (1999). Extending mosaic displays: Marginal, conditional, +and partial views of categorical data. Journal of Computational and +Graphical Statistics, 8(3), 373–395. http://datavis.ca/papers/drew/drew.pdf +
+
+Friendly, M. (2002). Corrgrams: Exploratory displays for correlation +matrices. The American Statistician, 56(4), 316–324. +https://doi.org/10.1198/000313002533 +
+
+Friendly, M. (2007). HE plots for multivariate general +linear models. Journal of Computational and Graphical +Statistics, 16(2), 421–444. https://doi.org/10.1198/106186007X208407 +
+
+Friendly, M. (2008). The Golden Age of statistical +graphics. Statistical Science, 23(4), 502–535. https://doi.org/10.1214/08-STS268 +
+
+Friendly, M. (2011). Generalized ridge trace plots: Visualizing bias +and precision with the genridge R package. SCS +Seminar. +
+
+Friendly, M. (2013). The generalized ridge trace plot: Visualizing bias +and precision. Journal of Computational and Graphical +Statistics, 22(1), 50–68. https://doi.org/10.1080/10618600.2012.681237 +
+
+Friendly, M. (2022). The life and works of André-Michel Guerry, revisited. Sociological Spectrum, 42(4–6), 233–259. https://doi.org/10.1080/02732173.2022.2078450
+
+Friendly, M. (2023). vcdExtra: Vcd extensions and additions. https://friendly.github.io/vcdExtra/ +
+
+Friendly, M. (2024). Genridge: Generalized ridge trace plots for +ridge regression. https://github.com/friendly/genridge +
+
+Friendly, M., Fox, J., & Chalmers, P. (2024). Matlib: Matrix +functions for teaching and learning linear algebra and multivariate +statistics. https://github.com/friendly/matlib +
+
+Friendly, M., & Kwan, E. (2003). Effect ordering for data displays. +Computational Statistics and Data Analysis, 43(4), +509–539. https://doi.org/10.1016/S0167-9473(02)00290-6 +
+
+Friendly, M., & Kwan, E. (2009). Where’s Waldo: +Visualizing collinearity diagnostics. The American +Statistician, 63(1), 56–65. https://doi.org/10.1198/tast.2009.0012 +
+
+Friendly, M., & Meyer, D. (2016). Discrete data analysis with +R: Visualization and modeling techniques for categorical +and count data. Chapman & Hall/CRC. +
+
+Friendly, M., Monette, G., & Fox, J. (2013). Elliptical insights: +Understanding statistical methods through elliptical geometry. +Statistical Science, 28(1), 1–39. https://doi.org/10.1214/12-STS402 +
+
+Friendly, M., & Wainer, H. (2021). A history of data +visualization and graphic communication. Harvard University Press. +https://doi.org/10.4159/9780674259034 +
+
+Fuller, W. (2006). Measurement error models (2nd ed.). John +Wiley & Sons. +
+
+Funkhouser, H. G. (1937). Historical development of the graphical +representation of statistical data. Osiris, 3(1), +269–405. http://tinyurl.com/32ema9 +
+
+Gabriel, K. R. (1971). The biplot graphic display of matrices with application to principal components analysis. Biometrika, 58(3), 453–467. https://doi.org/10.2307/2334381
+
+Gabriel, K. R. (1981). Biplot display of multivariate matrices for inspection of data and diagnosis. In V. Barnett (Ed.), Interpreting multivariate data (pp. 147–173). John Wiley & Sons.
+
+Galton, F. (1863). Meteorographica, or methods of mapping the +weather. Macmillan. http://www.mugu.com/galton/books/meteorographica/index.htm +
+
+Galton, F. (1886). Regression towards mediocrity in hereditary stature. +Journal of the Anthropological Institute, 15, 246–263. +http://www.jstor.org/cgi-bin/jstor/viewitem/09595295/dm995266/99p0374f/0 +
+
+Galton, F. (1889). Natural inheritance. Macmillan. http://galton.org/books/natural-inheritance/pdf/galton-nat-inh-1up-clean.pdf +
+
+Gannett, H. (1898). Statistical atlas of the United States, eleventh (1890) census. U.S. Government Printing Office.
+
+Gastwirth, J. L., Gel, Y. R., & Miao, W. (2009). The impact of Levene’s test of equality of variances on +statistical theory and practice. Statistical Science, +24(3), 343–360. https://doi.org/10.1214/09-STS301 +
+
+Gelman, A., Hullman, J., & Kennedy, L. (2023). Causal quartets: +Different ways to attain the same average treatment effect. http://www.stat.columbia.edu/~gelman/research/unpublished/causal_quartets.pdf +
+
+Goeman, J., Meijer, R., Chaturvedi, N., & Lueder, M. (2022). Penalized: L1 (lasso and fused lasso) and L2 (ridge) penalized estimation in GLMs and in the Cox model. https://CRAN.R-project.org/package=penalized
+
+Gorman, K. B., Williams, T. D., & Fraser, W. R. (2014). Ecological sexual dimorphism and environmental variability within a community of Antarctic penguins (genus Pygoscelis). PLoS ONE, 9(3), e90081. https://doi.org/10.1371/journal.pone.0090081
+
+Gower, J. C., & Hand, D. J. (1996). Biplots. Chapman & +Hall. +
+
+Gower, J. C., Lubbe, S. G., & Le Roux, N. J. (2011). Understanding biplots. Wiley. http://books.google.ca/books?id=66gQCi5JOKYC
+
+Grandjean, M. (2016). A social network analysis of Twitter: Mapping the digital humanities community. Cogent Arts & Humanities, 3(1), 1171458. https://doi.org/10.1080/23311983.2016.1171458
+
+Graybill, F. A. (1961). An introduction to linear statistical +models. McGraw-Hill. +
+
+Greenacre, M. (1984). Theory and applications of correspondence +analysis. Academic Press. +
+
+Greenacre, M. (2010). Biplots in practice. Fundación BBVA. https://books.google.ca/books?id=dv4LrFP7U_EC
+
+Guerry, A.-M. (1833). Essai sur la statistique morale de la +France. Crochard. +
+
+Hahsler, M., Buchta, C., & Hornik, K. (2024). Seriation: +Infrastructure for ordering objects using seriation. https://github.com/mhahsler/seriation +
+
+Haitovsky, Y. (1987). On multivariate ridge regression. +Biometrika, 74(3), 563–570. https://doi.org/10.1093/biomet/74.3.563 +
+
+Harrison, P. (2023). Langevitour: Smooth interactive touring of high +dimensions, demonstrated with scRNA-seq data. The R Journal, +15(2), 206–219. https://doi.org/10.32614/RJ-2023-046 +
+
+Harrison, P. (2024). Langevitour: Langevin tour. https://logarithmic.net/langevitour/ +
+
+Hart, C., & Wang, E. (2022). Detourr: Portable and performant +tour animations. https://CRAN.R-project.org/package=detourr +
+
+Hartigan, J. A. (1975a). Clustering algorithms. John Wiley & Sons.
+
+Hartigan, J. A. (1975b). Printer graphics for clustering. Journal of Statistical Computation and Simulation, 4, 187–213.
+
+Hartley, H. O. (1950). The use of range in analysis of variance. +Biometrika, 37(3–4), 271–280. https://doi.org/10.1093/biomet/37.3-4.271 +
+
+Hartman, L. I. (2016). Schizophrenia and schizoaffective disorder: +One condition or two? [PhD dissertation]. York University. +
+
+Harwell, M. R., Rubinstein, E. N., Hayes, W. S., & Olds, C. C. (1992). Summarizing Monte Carlo results in methodological research: The one- and two-factor fixed effects ANOVA cases. Journal of Educational and Behavioral Statistics, 17(4), 315–339. https://doi.org/10.3102/10769986017004315
+
+Hastie, T., Tibshirani, R., & Friedman, J. (2009). The elements +of statistical learning: Data mining, inference and prediction (2nd +ed.). Springer. http://www-stat.stanford.edu/~tibs/ElemStatLearn/ +
+
+Healy, M. J. R. (1968). Multivariate normal plotting. Journal of the +Royal Statistical Society Series C, 17(2), 157–161. +
+
+Heinrichs, R. W., Pinnock, F., Muharib, E., Hartman, L., Goldberg, J., +& McDermid Vaz, S. (2015). Neurocognitive normality in schizophrenia +revisited. Schizophrenia Research: Cognition, 2(4), +227–232. https://doi.org/10.1016/j.scog.2015.09.001 +
+
+Herschel, J. F. W. (1833). On the investigation of the orbits of +revolving double stars: Being a supplement to a paper entitled +"micrometrical measures of 364 double stars". Memoirs of the Royal +Astronomical Society, 5, 171–222. +
+
+Hoaglin, D. C., & Welsch, R. E. (1978). The hat matrix in regression +and ANOVA. The American Statistician, +32(1), 17–22. https://doi.org/10.1080/00031305.1978.10479237 +
+
+Hocking, R. R. (2013). Methods and applications of linear models: +Regression and the analysis of variance. Wiley. https://books.google.ca/books?id=iq2J-1iS6HcC +
+
+Hoerl, A. E., & Kennard, R. W. (1970). Ridge regression: +Biased estimation for nonorthogonal problems. +Technometrics, 12, 55–67. +
+
+Hoerl, A. E., Kennard, R. W., & Baldwin, K. F. (1975). Ridge +regression: Some simulations. Communications in Statistics, +4(2), 105–123. https://doi.org/10.1080/03610927508827232 +
+
+Hofmann, H., VanderPlas, S., & Ge, Y. (2022). Ggpcp: Parallel +coordinate plots in the ggplot2 framework. https://github.com/heike/ggpcp +
+
+Hofstadter, D. R. (1979). Gödel, Escher, Bach: An eternal golden braid. Basic Books.
+
+Højsgaard, S., Edwards, D., & Lauritzen, S. (2012). Graphical +models with R. Springer Science & Business Media. +
+
+Horst, A., Hill, A., & Gorman, K. (2022). palmerpenguins: Palmer Archipelago (Antarctica) penguin data. https://allisonhorst.github.io/palmerpenguins/
+
+Hotelling, H. (1931). The generalization of Student’s ratio. The Annals of Mathematical +Statistics, 2(3), 360–378. https://doi.org/10.1214/aoms/1177732979 +
+
+Husson, F., Josse, J., Le, S., & Mazet, J. (2024). FactoMineR: +Multivariate exploratory data analysis and data mining. http://factominer.free.fr +
+
+Husson, F., Le, S., & Pagès, J. (2017). Exploratory multivariate analysis by example using R. Chapman & Hall. https://doi.org/10.1201/b21874
+
+IBM. (1965). Proceedings of the IBM scientific computing symposium +on statistics: Oct 21-23, 1963 (L. Robinson, Ed.). IBM. https://www.amazon.com/Proceedings-Scientific-Computing-Symposium-Statistics/dp/B000GL5RLU +
+
+Inselberg, A. (1985). The plane with parallel coordinates. The +Visual Computer, 1, 69–91. +
+
+Isvoranu, A.-M., Epskamp, S., Waldorp, L. J., & Borsboom, D. (2022). Network psychometrics with R: A guide for behavioral and social scientists. Routledge. https://doi.org/10.4324/9781003111238
+
+Kassambara, A., & Mundt, F. (2020). Factoextra: Extract and +visualize the results of multivariate data analyses. http://www.sthda.com/english/rpkgs/factoextra +
+
+Kastellec, J. P., & Leoni, E. L. (2007). Using graphs instead of +tables in political science. Perspectives on Politics, +5(04), 755–771. https://doi.org/10.1017/S1537592707072209 +
+
+Krijthe, J. (2023). Rtsne: T-distributed stochastic neighbor embedding using a Barnes-Hut implementation. https://github.com/jkrijthe/Rtsne
+
+Kruskal, J. B. (1964). Multidimensional scaling by optimizing goodness +of fit to a nonmetric hypothesis. Psychometrika, +29(1), 1–27. https://doi.org/10.1007/bf02289565 +
+
+Kwan, E., Lu, I. R. R., & Friendly, M. (2009). Tableplot: A new tool +for assessing precise predictions. Zeitschrift für +Psychologie / Journal of Psychology, 217(1), 38–48. https://doi.org/10.1027/0044-3409.217.1.38 +
+
+Larmarange, J. (2024). Ggstats: Extension to ggplot2 for plotting +stats. https://larmarange.github.io/ggstats/ +
+
+Larsen, W. A., & McCleary, S. J. (1972). The use of partial residual +plots in regression analysis. Technometrics, 14, +781–790. +
+
+Lauritzen, S. L. (1996). Graphical models. Oxford University +Press. +
+
+Lawless, J. F., & Wang, P. (1976). A simulation study of ridge and +other regression estimators. Communications in Statistics, +5, 307–323. +
+
+Lee, E.-K., & Cook, D. (2009). A projection pursuit index for large +p small n data. Statistics and Computing, 20(3), +381–392. https://doi.org/10.1007/s11222-009-9131-1 +
+
+Lee, S. (2021). Liminal: Multivariate data visualization with tours +and embeddings. https://CRAN.R-project.org/package=liminal +
+
+Levene, H. (1960). Robust tests for equality of variances. In I. Olkin, +S. G. Ghurye, W. Hoeffding, W. G. Madow, & H. B. Mann (Eds.), +Contributions to probability and statistics: Essays in honor of +Harold Hotelling (pp. 278–292). Stanford University +Press. +
+
+Lix, L. M., Keselman, J. C., & Keselman, H. J. (1996). Consequences of assumption violations revisited: A quantitative review of alternatives to the one-way analysis of variance F test. Review of Educational Research, 66(4), 579–619. https://doi.org/10.3102/00346543066004579
+
+Longley, J. W. (1967). An appraisal of least squares programs for the electronic computer from the point of view of the user. Journal of the American Statistical Association, 62, 819–841. https://doi.org/10.1080/01621459.1967.10500896
+
+Lüdecke, D. (2024). Ggeffects: Create tidy data frames of marginal effects for ggplot from model outputs. https://strengejacke.github.io/ggeffects/
+
+Lüdecke, D., Ben-Shachar, M. S., Patil, I., Waggoner, P., & Makowski, D. (2021). performance: An R package for assessment, comparison and testing of statistical models. Journal of Open Source Software, 6(60), 3139. https://doi.org/10.21105/joss.03139
+
+Lüdecke, D., Ben-Shachar, M. S., Patil, I., Wiernik, B. M., & Makowski, D. (2022). Easystats: Framework for easy statistical modeling, visualization, and reporting. CRAN. https://easystats.github.io/easystats/
+
+Maaten, L. van der, & Hinton, G. (2008). Visualizing data using +t-SNE. Journal of Machine Learning +Research, 9, 2579–2605. http://www.jmlr.org/papers/v9/vandermaaten08a.html +
+
+Mardia, K. V. (1970). Measures of multivariate skewness and kurtosis with applications. Biometrika, 57(3), 519–530. https://doi.org/10.2307/2334770
+
+Mardia, K. V. (1974). Applications of some measures of multivariate +skewness and kurtosis in testing normality and robustness studies. +Sankhya: The Indian Journal of Statistics, Series B, +36(2), 115–128. http://www.jstor.org/stable/25051892 +
+
+Marquardt, D. W. (1970). Generalized inverses, ridge regression, biased +linear estimation, and nonlinear estimation. Technometrics, +12, 591–612. +
+
+Marquardt, D. W., & Snee, R. D. (1975). Ridge regression in +practice. The American Statistician, 29(1), 3–20. https://doi.org/10.1080/00031305.1975.10479105 +
+
+Martí, R., & Laguna, M. (2003). Heuristics and meta-heuristics for +2-layer straight line crossing minimization. Discrete Applied +Mathematics, 127(3), 665–678. +
+
+Matejka, J., & Fitzmaurice, G. (2017, May). Same stats, different +graphs. Proceedings of the 2017 CHI Conference on Human +Factors in Computing Systems. https://doi.org/10.1145/3025453.3025912 +
+
+Matloff, N. (2011). The art of R programming: +A tour of statistical software design. No Starch +Press. +
+
+McDonald, G. C. (2009). Ridge regression. Wiley Interdisciplinary +Reviews: Computational Statistics, 1(1), 93–100. https://doi.org/10.1002/wics.14 +
+
+McGowan, L. D., Gerke, T., & Barrett, M. (2023). Causal inference is +not just a statistics problem. Journal of Statistics and Data +Science Education, 1–9. https://doi.org/10.1080/26939169.2023.2276446 +
+
+Meyer, D., Zeileis, A., Hornik, K., & Friendly, M. (2024). Vcd: +Visualizing categorical data. https://CRAN.R-project.org/package=vcd +
+
+Meyers, L. S., Gamst, G., & Guarino, A. J. (2006). Applied +multivariate research: Design and interpretation. SAGE +Publications. +
+
+Monette, G. (1990). Geometry of multiple regression and interactive +3-D graphics. In J. Fox & S. Long (Eds.), Modern +methods of data analysis (pp. 209–256). SAGE Publications. +
+
+O’Brien, P. C. (1992). Robust procedures for testing equality of +covariance matrices. Biometrics, 48(3), 819–827. http://www.jstor.org/stable/2532347 +
+
+Oksanen, J., Simpson, G. L., Blanchet, F. G., Kindt, R., Legendre, P., +Minchin, P. R., O’Hara, R. B., Solymos, P., Stevens, M. H. H., Szoecs, +E., Wagner, H., Barbour, M., Bedward, M., Bolker, B., Borcard, D., +Carvalho, G., Chirico, M., De Caceres, M., Durand, S., … Weedon, J. +(2024). Vegan: Community ecology package. https://github.com/vegandevs/vegan +
+
+Otto, J., & Kahle, D. (2023). Ggdensity: Interpretable bivariate +density visualization with ggplot2. https://jamesotto852.github.io/ggdensity/ +
+
+Pearson, K. (1896). Contributions to the mathematical theory of +evolution—III, regression, heredity and panmixia. +Philosophical Transactions of the Royal Society of London, +187, 253–318. +
+
+Pearson, K. (1901). On lines and planes of closest fit to systems of +points in space. Philosophical Magazine, 6(2), +559–572. +
+
+Pearson, K. (1903). I. Mathematical contributions to the theory of +evolution. —XI. On the influence of natural selection on the variability +and correlation of organs. Philosophical Transactions of the Royal +Society of London, 200(321–330), 1–66. https://doi.org/10.1098/rsta.1903.0001 +
+
+Pedersen, T. L., & Robinson, D. (2024). Gganimate: A grammar of +animated graphics. https://gganimate.com +
+
+Pineo, P. O., & Porter, J. (1967). Occupational prestige in Canada. Canadian Review of Sociology, 4(1), 24–40. https://doi.org/10.1111/j.1755-618X.1967.tb00472.x
+
+Pineo, P. O., & Porter, J. (2008). Occupational prestige in Canada. Canadian Review of Sociology, 4(1), 24–40. https://doi.org/10.1111/j.1755-618x.1967.tb00472.x
+
+Playfair, W. (1786). Commercial and political atlas: Representing, by copper-plate charts, the progress of the commerce, revenues, expenditure, and debts of England, during the whole of the eighteenth century. Debrett; Robinson; Sewell. http://ucpj.uchicago.edu/Isis/journal/demo/v000n000/000000/000000.fg4.html
+
+Playfair, W. (1801). Statistical breviary; shewing, on a principle +entirely new, the resources of every state and kingdom in +Europe. Wallis. +
+
+Reaven, G. M., & Miller, R. G. (1968). Study of the relationship +between glucose and insulin responses to an oral glucose load in man. +Diabetes, 17(9), 560–569. https://doi.org/10.2337/diab.17.9.560 +
+
+Reaven, G. M., & Miller, R. G. (1979). An attempt to define the +nature of chemical diabetes using a multidimensional analysis. +Diabetologia, 16, 17–24. +
+
+Robinaugh, D. J., Hoekstra, R. H. A., Toner, E. R., & Borsboom, D. +(2019). The network approach to psychopathology: A review of the +literature 2008–2018 and an agenda for future research. +Psychological Medicine, 50(3), 353–366. https://doi.org/10.1017/s0033291719003404 +
+
+Rogan, J. C., & Keselman, H. J. (1977). Is the ANOVA +f-test robust to variance heterogeneity when sample sizes are equal?: An +investigation via a coefficient of variation. American Educational +Research Journal, 14(4), 493–498. https://doi.org/10.3102/00028312014004493 +
+
+Sarkar, D. (2024). Lattice: Trellis graphics for R. https://lattice.r-forge.r-project.org/
+
+Scheffé, H. A. (1960). The analysis of variance. Wiley. +
+
+Schloerke, B., Cook, D., Larmarange, J., Briatte, F., Marbach, M., +Thoen, E., Elberg, A., & Crowley, J. (2024). GGally: Extension +to ggplot2. https://ggobi.github.io/ggally/ +
+
+Scott, D. W. (1992). Multivariate density estimation: Theory, +practice, and visualization. Wiley. +
+
+Searle, S. R., Speed, F. M., & Milliken, G. A. (1980). Population +marginal means in the linear model: An alternative to least squares +means. The American Statistician, 34(4), 216–221. +
+
+Shapiro, S. S., & Wilk, M. B. (1965). An analysis of variance test +for normality (complete samples). Biometrika, 52(3–4), +591–611. https://doi.org/10.1093/biomet/52.3-4.591 +
+
+Shepard, R. N. (1962a). The analysis of proximities: Multidimensional scaling with an unknown distance function. I. Psychometrika, 27(2), 125–140. https://doi.org/10.1007/bf02289630
+
+Shepard, R. N. (1962b). The analysis of proximities: Multidimensional +scaling with an unknown distance function. II. Psychometrika, +27(3), 219–246. https://doi.org/10.1007/bf02289621 +
+
+Shepard, R. N., Romney, A. K., Nerlove, S. B., & Board, M. S. S. (1972a). Multidimensional scaling: Theory and applications in the behavioral sciences: Vol. II. Applications. Seminar Press. https://books.google.ca/books?id=PpFAAQAAIAAJ
+
+Shepard, R. N., Romney, A. K., Nerlove, S. B., & Board, M. S. S. (1972b). Multidimensional scaling: Theory and applications in the behavioral sciences: Vol. I. Theory. Seminar Press. https://books.google.ca/books?id=pJRAAQAAIAAJ
+
+Shoben, E. J. (1983). Applications of multidimensional scaling in +cognitive psychology. Applied Psychological Measurement, +7(4), 473–490. https://doi.org/10.1177/014662168300700406 +
+
+Silverman, B. W. (1986). Density estimation for statistics and data +analysis. Chapman & Hall. +
+
+Simpson, E. H. (1951). The interpretation of interaction in contingency tables. Journal of the Royal Statistical Society, Series B, 13, 238–241.
+
+Swayne, D. F., Cook, D., & Buja, A. (1998). XGobi: Interactive dynamic data visualization in the X Window System. Journal of Computational and Graphical Statistics, 7(1), 113–130. https://doi.org/10.1080/10618600.1998.10474764
+
+Swayne, D. F., Lang, D. T., Buja, A., & Cook, D. (2003). GGobi: Evolving from XGobi into an extensible framework for interactive data visualization. Computational Statistics & Data Analysis, 43(4), 423–444. https://doi.org/10.1016/s0167-9473(02)00286-4
+
+Teetor, P. (2011). R cookbook. +O’Reilly Media. +
+
+Tibshirani, R. (1996). Regression shrinkage and selection via the lasso. +Journal of the Royal Statistical Society, Series B: +Methodological, 58, 267–288. +
+
+Tiku, M. L., & Balakrishnan, N. (1984). Testing equality of +population variances the robust way. Communications in Statistics - +Theory and Methods, 13(17), 2143–2159. https://doi.org/10.1080/03610928408828818 +
+
+Timm, N. H. (1975). Multivariate analysis with applications in +education and psychology. Wadsworth (Brooks/Cole). +
+
+Torgerson, W. S. (1952). Multidimensional scaling: I. Theory and method. +Psychometrika, 17(4), 401–419. https://doi.org/10.1007/bf02288916 +
+
+VanderPlas, S., Ge, Y., Unwin, A., & Hofmann, H. (2023). Penguins go +parallel: A grammar of graphics framework for generalized parallel +coordinate plots. Journal of Computational and Graphical +Statistics, 1–16. https://doi.org/10.1080/10618600.2023.2195462 +
+
+Velleman, P. F., & Welsch, R. E. (1981). Efficient computing of regression diagnostics. The American Statistician, 35(4), 234–242.
+
+Vinod, H. D. (1978). A survey of ridge regression and related techniques +for improvements over ordinary least squares. The Review of +Economics and Statistics, 60(1), 121–131. http://www.jstor.org/stable/1924340 +
+
+Waddell, A., & Oldford, R. W. (2023). Loon: Interactive +statistical data visualization. https://CRAN.R-project.org/package=loon +
+
+Warne, R. T. (2014). A primer on multivariate analysis of variance (MANOVA) for behavioral scientists. Practical Assessment, Research & Evaluation, 19(1). https://scholarworks.umass.edu/pare/vol19/iss1/17/
+
+Wegman, E. J. (1990). Hyperdimensional data analysis using parallel +coordinates. Journal of the American Statistical Association, +85(411), 664–675. +
+
+Wei, T., & Simko, V. (2024). Corrplot: Visualization of a +correlation matrix. https://github.com/taiyun/corrplot +
+
+Welch, B. L. (1947). The generalization of “Student’s” problem when several different population variances are involved. Biometrika, 34(1–2), 28–35. https://doi.org/10.1093/biomet/34.1-2.28
+
+West, D. B. (2001). Introduction to graph theory. Prentice Hall.
+
+Whittaker, J. (1990). Graphical models in applied multivariate statistics. John Wiley & Sons.
+
+Wickham, H. (2014). Advanced R. Chapman and +Hall/CRC. +
+
+Wickham, H., & Cook, D. (2024). Tourr: Tour methods for +multivariate data visualisation. https://github.com/ggobi/tourr +
+
+Wickham, H., Cook, D., Hofmann, H., & Buja, A. (2011). Tourr: An +R package for exploring multivariate data with projections. +Journal of Statistical Software, 40(2). https://doi.org/10.18637/jss.v040.i02 +
+
+Wilkinson, G. N., & Rogers, C. E. (1973). Symbolic description of +factorial models for analysis of variance. Applied Statistics, +22(3), 392. https://doi.org/10.2307/2346786 +
+
+Winer, B. J. (1962). Statistical principles in experimental +design. McGraw-Hill. +
+
+Wood, S. N. (2006). Generalized additive models: An introduction with R. Chapman & Hall/CRC Press.
+
+Wright, K. (2021). Corrgram: Plot a correlogram. https://kwstat.github.io/corrgram/ +
+
+Xie, Y. (2021). Animation: A gallery of animations in statistics and +utilities to create animations. https://yihui.org/animation/ +
+
+Xu, Z., & Oldford, R. W. (2021). loon.tourr: Tour in ’loon’. https://cran.r-project.org/package=loon.tourr
+
+Zhang, J., & Boos, D. D. (1992). Bootstrap critical values for +testing homogeneity of covariance matrices. Journal of the American +Statistical Association, 87(418), 425–429. http://www.jstor.org/stable/2290273 +
+
+

Packages used

(Binary file changes elided: new and updated figure PNGs under docs/figs/case-studies/, docs/figs/ch04/, docs/figs/ch06/, docs/figs/ch12/, and docs/images/dogfood-quartet.png.)

docs/index.html:

Visualizing Multivariate Data and Models in R

To comport oneself with perfect propriety in Polygonal society, one ought to be a Polygon oneself. — Edwin A. Abbott, Flatland

In 1884, an English schoolmaster, Edwin Abbott Abbott, shook the world of Victorian culture with a slim volume, Flatland: A Romance of Many Dimensions (Abbott, 1884). He described a two-dimensional world, Flatland, inhabited entirely by geometric figures in the plane. His purpose was satirical, to poke fun at the social and gender class system of the time: women were mere line segments, while men were represented as polygons with varying numbers of sides; a triangle was a working man, but acute isosceles triangles of very small angle were soldiers or criminals; gentlemen and professionals had more sides. Abbott published this under the pseudonym, “A Square”, suggesting his place in the hierarchy.

True, said the Sphere; it appears to you a Plane, because you are not accustomed to light and shade and perspective; just as in Flatland a Hexagon would appear a Straight Line to one who has not the Art of Sight Recognition. But in reality it is a Solid, as you shall learn by the sense of Feeling. — Edwin A. Abbott, Flatland

But how did it feel to be a member of a flatland society? How could a point (a newborn child?) understand a line (a woman)? How does a Triangle “see” a Hexagon or even an infinitely-sided Circle? Abbott introduces the very idea of different dimensions of existence through dreams and visions:

  • A Square dreams of visiting a one-dimensional Lineland where men appear as lines, and women are merely “illustrious points”, but the inhabitants can only see the Square as lines.

  • In a vision, the Square is visited by a Sphere, to illustrate what a 2D Flatlander could understand of a 3D sphere (Figure 1) that passes through the plane he inhabits. It is a large circle when seen at the moment of its greatest extent. As the Sphere rises, it becomes progressively smaller, until it becomes a point, and then vanishes. (The simple geometry behind this appearance is sketched just below.)
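
A hedged aside on the geometry (my gloss, not Abbott's): a sphere of radius $R$ whose center sits at height $h$ above the plane meets Flatland in a circle of radius
$$ r(h) = \sqrt{R^2 - h^2}, \qquad |h| \le R , $$
which is largest ($r = R$) when the center lies in the plane, and shrinks to a point before vanishing as $|h| \to R$.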

Figure 1: A 2D Flatlander seeing a sphere as it passes through Flatland. The line labeled ‘My Eye’ indicates what the Flatlander would see. Source: Abbott (1884)

In One Dimension, did not a moving Point produce a Line with two terminal points? In Two Dimensions, did not a moving Line produce a Square with four terminal points? In Three Dimensions, did not a moving Square produce - did not the eyes of mine behold it - that blessed being, a Cube, with eight terminal points? And in Four Dimensions, shall not a moving Cube - alas, for Analogy, and alas for the Progress of Truth if it be not so - shall not, I say, the motion of a divine Cube result in a still more divine organization with sixteen terminal points? — Edwin A. Abbott

For Abbott, the way for a citizen of any world to imagine one more dimension was to consider how a higher-dimensional object would change over time.1 A line moved over time could produce a rectangle as shown in Figure 2; that rectangle moving in another direction over time would produce a 3D figure, and so forth.
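
Abbott's counting (2, 4, 8, 16 terminal points) is just the statement that an n-cube built this way has 2^n corners; a throwaway check in R (mine, not the book's):

Code
# terminal points (corners) of the n-cube, for n = 0, ..., 4
setNames(2^(0:4), c("point", "line", "square", "cube", "tesseract"))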


But wait! Where does that 4D thing (a tesseract) come from? To really see a tesseract it helps to view it in an animation over time (Figure 3). But like the Square, contemplating 3D from a 2D world, it takes some imagination.


Figure 4: Four views of the pollen data, zooming in, clockwise from the upper left to discover the word “EUREKA”.

This can be seen better in a 3D animation. The rgl package (Adler & Murdoch, 2023) is used to create a 3D scatterplot of the first three variables. Then the animation package (Xie, 2021) is used to record a sequence of images, adjusting the rgl::par3d(zoom) value.

Code
library(animation)
library(rgl)
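
The diff elides the rest of that chunk, so here is a minimal sketch of the technique just described. The object name pollen3 and the use of gifski to assemble the frames are my assumptions, not the book's code.

Code
# Sketch: shrink rgl's zoom step by step, snapshot each frame to a PNG,
# then stitch the frames into a GIF. `pollen3` is assumed to be a matrix
# holding the first three variables of the pollen data.
plot3d(pollen3, col = "steelblue", size = 2)
zooms <- seq(1.0, 0.02, length.out = 40)
files <- sprintf("zoom-%03d.png", seq_along(zooms))
for (i in seq_along(zooms)) {
  par3d(zoom = zooms[i])   # move the virtual camera closer
  snapshot3d(files[i])     # save the current rgl scene
}
gifski::gifski(files, gif_file = "pollen-zoom.gif", delay = 0.15)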

Multivariate scientific discoveries

Lest this example seem contrived (which it admittedly is), multivariate visualization has played an important role in quite a few scientific discoveries. Among these, Francis Galton’s (1863) discovery of the anti-cyclonic pattern of wind direction in relation to barometric pressure, from many weather measures recorded systematically across all weather stations, lighthouses and observatories in Europe in December 1861, stands out as the best example of a scientific discovery achieved almost entirely through graphical means: something that was totally unexpected, and purely the product of his use of remarkably novel high-dimensional graphs (Friendly & Wainer, 2021, pp. 170–173).

A more recent example is the discovery of two general classes in the development of Type 2 diabetes by Reaven & Miller (1979), using PRIM-9 (Fisherkeller et al., 1974), the first computer system for high-dimensional visualization2. In an earlier study, Reaven & Miller (1968) examined the relation between blood glucose levels and the production of insulin in normal subjects and in patients with varying degrees of hyperglycemia (elevated blood sugar level). They found a peculiar “horse shoe” shape in this relation (shown in Figure 6), about which they could only speculate: perhaps individuals with the best glucose tolerance also had the lowest levels of insulin as a response to an oral dose of glucose; perhaps those with low glucose response could secrete higher levels of insulin; perhaps those who were low on both glucose and insulin responses followed some other mechanism. In 2D plots, this was a mystery.

data(Diabetes, package="heplots")
 plot(instest ~ glutest, data=Diabetes, 
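
The plot() call above is cut off by the diff hunk; a minimal runnable completion follows. The plotting character and axis labels are my guesses, not necessarily the book's; only glutest and instest come from heplots::Diabetes.

Code
data(Diabetes, package = "heplots")
plot(instest ~ glutest, data = Diabetes,
     pch = 16,
     xlab = "glucose response (glutest)",
     ylab = "insulin response (instest)")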

Figure 6: Reproduction of a graph similar to that from Reaven & Miller (1968) on the relationship between glucose and insulin response to being given an oral dose of glucose.

An answer to their questions came ten years later, when they were able to visualize similar but new data in 3D using the PRIM-9 system. In a carefully controlled study, they also measured ‘steady state plasma glucose’ (SSPG), a measure of the efficiency of use of insulin in the body, where large values mean insulin resistance, as well as other variables. PRIM-9 allowed them to explore various sets of three variables, and, more importantly, to rotate a given plot in three dimensions to search for interesting features. One plot that stood out concerned the relation between plasma glucose response, plasma insulin response and SSPG response, shown in Figure 7.


Figure 7: Artist’s rendition of data from Reaven & Miller (1979) as seen in three dimensions using the PRIM-9 system. Labels for the clusters have been added, identifying the three groups of patients. Source: Reaven & Miller (1979).

From this graphical insight, they were able to classify the participants into three groups, based on clinical levels of glucose and insulin. The people in the wing on the left in Figure 7 were considered to have overt diabetes, the most advanced form, characterized by elevated fasting blood glucose concentration and classical diabetic symptoms. Those in the right wing were classified as latent or chemical diabetics, with no symptoms of diabetes but demonstrable abnormality of oral or intravenous glucose tolerance. Those in the central blob were classified as normal.

Previous thinking was that Type 2 diabetes (when the body cannot make enough insulin, as opposed to Type I, an autoimmune condition where the pancreatic cells have been destroyed) progressed from the chemical stage to an overt one in a smooth transition. However, it was clear from Figure 7 that the only “path” from one to the other led through the cluster of normal patients near the origin, so that explanation must be wrong. Instead, this suggested that the chemical and overt diabetics were distinct classes. Indeed, longitudinal studies showed that patients classified as chemical diabetics rarely developed the overt form. The understanding of the etiology of Type 2 diabetes was altered dramatically by the power of high-D interactive graphics.
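
A modern reader can re-create something like this PRIM-9 view directly. The sketch below is my re-creation, not the book's code; the colors and point size are arbitrary choices. It plots the same three responses from heplots::Diabetes in an rgl window that can be rotated by hand.

Code
# Re-creation sketch of the Figure 7 view: three diabetes groups in 3D
library(rgl)
data(Diabetes, package = "heplots")
cols <- c("darkgreen", "blue", "red")[Diabetes$group]   # one color per group
plot3d(Diabetes$glutest, Diabetes$instest, Diabetes$sspg,
       col = cols, size = 5,
       xlab = "glucose response", ylab = "insulin response", zlab = "SSPG")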

What I assume

It is assumed that the reader has a background in applied intermediate statistics, including material on univariate linear models: analysis of variance (ANOVA) and multiple regression. This means you should be familiar with … TODO: Complete this required background

There will also be some mathematics in the book where words and diagrams are not enough. The mathematical level will be intermediate, mostly consisting of simple algebra. No derivations, proofs, theorems here! For multivariate methods, it will be useful to express ideas using matrix notation to simplify presentation, where a single symbol can stand for an entire vector or matrix. I’m using math to express ideas, and all you will need is a reading-level of understanding. For this, the first chapter of Fox (2021), A mathematical primer for social statistics, is excellent. If you want to learn something of using matrix algebra for data analysis and statistics, I recommend our package matlib (Friendly et al., 2024).
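
As a small, hedged illustration of why matrix notation pays off (my example, not the book's): with matlib, a system of equations can be displayed and solved as the single matrix statement Ax = b.

Code
library(matlib)
A <- matrix(c(1, 2, 3,
              2, 5, 3,
              1, 0, 8), nrow = 3, byrow = TRUE)
b <- c(6, 10, 9)
showEqn(A, b)   # print the three equations represented by A x = b
Solve(A, b)     # solve the system by Gaussian elimination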

I also assume the reader to have at least a basic familiarity with R. While R fundamentals are outside the scope of the book, I believe that this language provides a rich set of resources, far beyond that offered by other statistical software packages, and is well worth learning.

For those not familiar with R, I recommend Matloff (2011), Wickham (2014), and Cotton (2013) for introductions to programming in the language. Fox & Weisberg (2018) and Teetor (2011) are great for learning about how to conduct basic statistical analyses in R. TODO: Revise this list.

TODO: Add stuff on general books about graphics

Conventions used in this book

TODO: Some stuff below is just for testing… Revise.


  • italic : indicates terms to be emphasized or defined in the text, …

  • bold : is used for names of R packages. Or, better yet: bold monospace, but I’d rather this be in a different color. Perhaps I can use “r colorize(”lattice”, “green”)” inline -> lattice will do this? This does bold & color, but can’t use monospace.

    I can now use inline ‘pkg(“lattice”)’ generating lattice, or also with a citation, pkg("lattice", cite=TRUE) -> lattice (Sarkar, 2024). Can also refer to the matlib package (Friendly et al., 2024), including “package” between the name and citation. (A sketch of one possible pkg() helper appears after this list.)

  • fixed-width : is used in program listings as well as in text to refer to variable and function names, R statement elements and keywords.

  • R code in program listings and output is presented in monospaced (typewriter) font, Fira Mono
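
For concreteness, here is a minimal sketch of the kind of inline pkg() helper described in the list above. The body is my assumption about one way to implement it in the Quarto source, not the book's actual code; the [@R-name] keys match the style of the pkgs.bib entries.

Code
# Hypothetical inline helper: bold the package name and, optionally,
# append a citation key such as [@R-lattice].
pkg <- function(name, cite = FALSE) {
  ref <- if (cite) sprintf(" [@R-%s]", name) else ""
  sprintf("**%s**%s", name, ref)
}
# Used inline in the .qmd source as: `r pkg("lattice", cite = TRUE)`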


    1. In his famous TV series, Cosmos, Carl Sagan provides an intriguing video presentation Flatland and the 4th dimension. However, as far back as 1754 (Cajori, 1926), the idea of adding a fourth dimension appears in Jean le Rond d’Alembert’s “Dimensions”, and one realization of a four-dimensional object is a tesseract, shown in Figure 2.↩︎

    2. PRIM-9 is an acronym for Picturing, Rotation, Isolation and Masking in up to 9 dimensions. These operations are fundamental to interactive and dynamic data visualization.↩︎