
Commit

edits to Ch08 re MT comments
friendly committed Nov 23, 2024
1 parent e5bbf6d commit 4e55846
Showing 10 changed files with 269 additions and 20 deletions.
41 changes: 28 additions & 13 deletions 08-collinearity-ridge.qmd
@@ -25,7 +25,7 @@ and presents some novel visual tools for these purposes using the `r pkg("VisCollin")` package.

One class of solutions for collinearity involves _regularization methods_ such as ridge regression.
Another collection of graphical methods, generalized ridge trace plots, implemented
in the `r pkg("genridge")` package, sheds further light on what is accomplished by this technique.
More generally, the methods of this chapter are further examples of how data and confidence
ellipsoids can be used to visualize bias **and** precision of regression estimates.

@@ -109,8 +109,8 @@ The sampling variances and covariances of the estimated coefficients are
$\text{Var} (\widehat{\mathbf{b}}) = \sigma_\epsilon^2 \times (\mathbf{X}^\mathsf{T} \mathbf{X})^{-1}$, where $\sigma_\epsilon^2$ is the variance of the residuals $\mathbf{\epsilon}$, estimated by the
mean squared error (MSE).
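
As a quick numerical check of this formula, here is a hedged sketch assuming some model `mod` has already been fit with `lm()`:

```r
# Var(b-hat) = MSE * (X'X)^{-1}, compared against R's built-in vcov()
X   <- model.matrix(mod)
MSE <- sum(residuals(mod)^2) / df.residual(mod)
V   <- MSE * solve(t(X) %*% X)
all.equal(V, vcov(mod), check.attributes = FALSE)   # should be TRUE
```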

In the limiting case, collinearity becomes particularly problematic when one $x_i$ is _perfectly_ predictable from the other $x$s, i.e., $R^2 (x_i | \text{other }x) = 1$. This case is troublesome because:

* there is no _unique_ solution for the regression coefficients
$\mathbf{b} = (\mathbf{X}^\mathsf{T} \mathbf{X})^{-1} \mathbf{X} \mathbf{y}$;
@@ -121,10 +121,13 @@ This extreme case reflects a situation when one or more predictors are effectively redundant, as
when you include two variables $x$ and $y$ and their sum $z = x + y$ in a model.
For instance, a dataset may include variables for income, expenses, and savings. But
income is the sum of expenses and savings, so not all three should be used as predictors.
A more subtle case is the use of _ipsatized_ scores, defined as
scores that sum to a constant, such as proportions of a total. You might have scores on
tests of reading, math, spelling and geography. With ipsatized scores, any one of these
is necessarily 1 $-$ the sum of the others: if reading is 0.5 and math and spelling are
both 0.15, then geography must be 0.2. Once three of the four scores are known, the last
provides no new information.
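
A minimal sketch of how this breaks least squares (using made-up scores, not data from the book): when four proportions sum to 1, `lm()` finds one of them aliased and reports an `NA` coefficient.

```r
set.seed(42)
n <- 100
reading   <- runif(n, 0.20, 0.50)
math      <- runif(n, 0.10, 0.25)
spelling  <- runif(n, 0.05, 0.15)
geography <- 1 - (reading + math + spelling)   # ipsatized: the four scores sum to 1
y <- 2 * reading + math + rnorm(n)

coef(lm(y ~ reading + math + spelling + geography))
# the coefficient for geography is NA (aliased): it is perfectly
# predictable from the other three scores
```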

More generally, collinearity refers to the case when there are very high
**multiple correlations** among the predictors, such as $R^2 (x_i | \text{other }x) \ge 0.9$.
@@ -141,7 +144,7 @@ so a coefficient that would be significant if the predictors were uncorrelated becomes non-significant
when collinearity is present.
* Thus you may find a situation where an overall model is highly significant (large $F$-statistic), while
no (or few) of the individual predictors are. This is a puzzlement!
* Beyond this, the least squares solution may have poor numerical accuracy [@Longley:1967], because
the solution depends inversely on the determinant $|\,\mathbf{X}^\mathsf{T} \mathbf{X}\,|$, which approaches 0 as multiple correlations increase.
* There is an interpretive problem as well. Recall that the coefficients $\hat{b}$ are _partial coefficients_, meaning that they estimate
change $\Delta y$ in $y$ when $x$ changes by one unit $\Delta x$, but holding **all other variables
@@ -216,8 +219,8 @@ beta <- c(3, 3) # true coefficients
# Specify a covariance matrix, with standard deviations
# s[1], s[2] and correlation r
Cov <- function(s, r){
  matrix(c(s[1]^2,        r * s[1]*s[2],   # variances on the diagonal,
           r * s[1]*s[2], s[2]^2),         # covariance off the diagonal
         nrow = 2, ncol = 2)
}
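# Note (added): with unit SDs, Cov(c(1, 1), r) is just the correlation matrix;
# its determinant, 1 - r^2, shrinks toward 0 as r -> 1, which is the source of
# the variance inflation illustrated below; e.g., det(Cov(c(1, 1), 0.9)) = 0.19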
# Generate a dataframe of X, y for each rho
Expand Down Expand Up @@ -270,8 +273,8 @@ do_plots <- function(XY, mod, r) {
confidenceEllipse(mod,
col = "red",
fill = TRUE, fill.alpha = 0.1,
xlab = "x1 coefficient",
ylab = "x2 coefficient",
xlab = expression(paste("x1 coefficient, ", beta[1])),
ylab = expression(paste("x2 coefficient, ", beta[2])),
xlim = c(-5, 10),
ylim = c(-5, 10),
asp = 1)
@@ -360,7 +363,7 @@ of such terms, relative to what would be obtained with uncorrelated data. Visually, this can be seen
by comparing the areas of the ellipses in the bottom row of @fig-collin-data-beta.
Because the magnitude of the GVIF increases with the number of degrees of freedom for the set of parameters,
Fox & Monette suggest the analog $\sqrt{\text{GVIF}^{1/2 \text{df}}}$ as the measure of impact on standard
errors.
errors. This is what `car::vif()` calculates for a factor or other term with more than 1 df.

**Example**: This example uses the `cars` dataset in the `VisCollin` package
containing various measures of size and performance on 406 models of automobiles from 1982. Interest is focused on predicting gas mileage, `mpg`.
@@ -375,11 +378,13 @@ accelerate from 0 -- 60 mph and model year (1970--1982). Perhaps surprisingly, only a few of these
significantly predict gas mileage. What's going on here?

```{r cars-mod}
cars.mod <- lm (mpg ~ cylinder + engine + horse + weight + accel + year,
cars.mod <- lm (mpg ~ cylinder + engine + horse +
weight + accel + year,
data=cars)
Anova(cars.mod)
```


We check the variance inflation factors, using `car::vif()`. We see that most predictors have very high
VIFs, indicating moderately severe multicollinearity.

@@ -390,10 +395,20 @@ sqrt(vif(cars.mod))
```

According to $\sqrt{\text{VIF}}$, the standard error of `cylinder` has been
multiplied by $\sqrt{10.63} = 3.26$ and its $t$-value is divided by this number,
compared with the case when all predictors are
uncorrelated. `engine`, `horse` and `weight` suffer a similar fate.
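
A hedged way to see where these numbers come from (assuming the `cars` data and `cars.mod` above): each VIF is $1 / (1 - R^2_j)$, where $R^2_j$ comes from regressing that predictor on all the others.

```r
# Sketch: reproduce the VIF for `weight` from its auxiliary regression.
# (May differ slightly from vif(cars.mod) if the two fits drop different
# incomplete cases.)
aux <- lm(weight ~ cylinder + engine + horse + accel + year, data = cars)
1 / (1 - summary(aux)$r.squared)    # compare with vif(cars.mod)["weight"]
```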

If we also included the factor `origin` in the model, we would get the generalized VIF (GVIF):

```{r cars-mod2}
cars.mod2 <- lm (mpg ~ cylinder + engine + horse +
weight + accel + year + origin
data=cars)
vif(cars.mod2)
```
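
Because `origin` is a factor with more than 1 df, `vif()` now reports columns for GVIF, Df, and $\text{GVIF}^{1/(2\,\text{Df})}$; the last of these is the value comparable to $\sqrt{\text{VIF}}$ for a 1-df term.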


::: {.callout-tip title="Connection with inverse of correlation matrix"}

In the linear regression model with standardized predictors,
8 changes: 6 additions & 2 deletions 10-mlm-review.qmd
@@ -673,7 +673,10 @@ Then, we can illustrate @eq-H-contrasts by extracting the 1 df $\mathbf{H}$ matrices
from the results of `linearHypothesis`.
<!-- this doesn't work -->
```{r dogfood-Eqn}
#| results: asis
#| echo: false
#| eval: false
options(print.latexMatrix = list(display.labels=FALSE))
SSP_H1 <- H1$SSPH |> round(digits=2)
SSP_H2 <- H2$SSPH |> round(digits=2)
@@ -814,7 +817,8 @@ In @fig-parenting-boxpl, I've also plotted the group means with white dots.
#| code-summary: "See the ggplot code"
#| fig-cap: Faceted boxplots of scores on the three parenting scales, showing also the mean for each.
parenting_long <- Parenting |>
  tidyr::pivot_longer(cols=caring:play,
                      names_to = "variable")
ggplot(parenting_long,
aes(x=group, y=value, fill=group)) +
22 changes: 19 additions & 3 deletions 11-mlm-viz.qmd
@@ -28,15 +28,17 @@ library(tidyr)
## HE plot framework {#sec-he-framework}

@sec-Hotelling illustrated the basic ideas of the framework for visualizing multivariate
linear models in the context of a simple two-group design, using Hotelling's $T^2$. The main ideas are illustrated in @fig-HE-framework.

* In data space, each group is summarized by its **data ellipse**, representing the means and covariances.

* Variation against the hypothesis of equal means can be seen by the $\mathbf{H}$ ellipse in the **HE plot**, representing the data ellipse of the fitted values. Error variance is shown in the $\mathbf{E}$ ellipse,
representing the pooled within-group covariance matrix, $\mathbf{S}_p$, which is the data ellipse of the residuals from the model.

* The MANOVA (or Hotelling's $T^2$) is formally equivalent to a **discriminant analysis**, predicting
group membership from the response variables, which can be seen in data space. (The main difference is
one of emphasis and goals: MANOVA seeks to test differences among group means, while discriminant analysis aims
at classification of the observations into groups.)

* This effectively projects the $p$-dimensional
space of the predictors into the smaller **canonical space** that shows the greatest differences among
@@ -47,10 +49,24 @@ the groups.
#| echo: false
#| fig-align: center
#| out-width: "100%"
#| fig-cap: "The Hypothesis Error plot framework. _Source_: author"
#| fig-cap: "The Hypothesis Error plot framework for a two-group design. Above: Data ellipses can be summarized in an HE plot showing the pooled within-group error ($\\mathbf{E}$) ellipse and the $\\mathbf{H}$ "ellipse" for the group means.
#| Below: Observations projected on the line joining the means give discriminant scores which correpond to a one-dimensional canonical space."
knitr::include_graphics("images/HE-framework.png")
```

Having described the statistical ideas behind the MLM in @sec-mlm-review, we can proceed to
extend this framework for visualization ...
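
As a hedged illustration of this pairing (using the `iris` data as a stand-in rather than one of this book's examples), the `heplots` and `candisc` packages produce both views from a single multivariate `lm()`:

```r
library(heplots)
library(candisc)

# One-way MANOVA: four responses, one grouping factor
iris.mlm <- lm(cbind(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) ~ Species,
               data = iris)

heplot(iris.mlm)                            # H and E ellipses for two responses
iris.can <- candisc(iris.mlm, term = "Species")
plot(iris.can)                              # the same effect in canonical space
```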

```{r}
#| label: fig-arcmanov1
#| echo: false
#| fig-align: center
#| out-width: "100%"
#| fig-cap: "Conceptual plots showing the essential ideas behind multivariate tests, in terms of the hypothesis
#| ($\\mathbf{H}$) and error ($\\mathbf{E}$) matrices for a 1-way MANOVA design with two response variables, $Y_1$ and $Y_2$"
knitr::include_graphics("images/arcmanov.png")
```

For more complex models such as MANOVA with multiple factors or multivariate multiple regression,
there is one $\mathbf{H}$ ellipse for each term in the model. ...

181 changes: 181 additions & 0 deletions R/arcmanov.sas
@@ -0,0 +1,181 @@
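/* Header comment (added): this SAS/GRAPH program appears to generate the four
   conceptual panels behind images/arcmanov.png:
   (a) individual group scatter, (b) between (H) and within (E) scatter,
   (c) H standardized by E, giving HE^{-1}, (d) principal axes of HE^{-1}. */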
%include goptions;
*include arc ;
options ls=95;
goptions htext=1.7 htitle=2.5;
%let ht=1.7;
*goptions hsize=7. vsize=7.5;
Data means;
Input Group Y1 Y2;
R = .4 + .5*uniform(123131);
cards;
1 8 10
2 10 5
3 12 14
4 17 20
5 18 11
6 20 16
7 25 20
8 27 26
;
%macro fig1;
Data Fig1;
*annocms;
%annomac;
%dclanno;
%system(2,2,4);
Set means;
If _N_=1 then do;
%label(2,36.5,'Scatter around group means ',BLACK,0,0, &ht ,,6);
%label(2,35,'represented by each ellipse',BLACK,0,0,&ht,,6);
%label(38,2,'(a)',BLACK, 0,0,&ht,,4)
End;
* color = 'H' || put(Group*45,HEX3.);
color = 'BLACK';
RX=3.3; RY=3*R;
tilt=35 + 10*uniform(0);
%ellipse(Y1,Y2,RX,RY,tilt,color=black,line=1+_N_,width=2);
%label(Y1+.5,Y2,put(_N_,2.),black,0,0,1.5,,6);
Title '(a) Individual group scatter';
symbol1 v=dot h=1.5 i=none c=black r=8;

proc gplot data=means;
plot y2 * y1=group / nolegend frame anno=fig1
vaxis=axis1 haxis=axis2 hm=1 vm=1;

axis1 order=(0 to 40 by 10) /* length=6.5 in */;
axis2 order=(0 to 40 by 10) /* length=6.5 in */;

Proc Summary data=means;
Class GROUP;
Var Y1 Y2;
Output out=GRAND mean=Y1 Y2;
Proc Print data=GRAND;
%mend;

%macro fig2;
Data Fig2;
%dclanno;
%system(2,2,4);
Length TEXT $30;
Set GRAND;
drop a rad xp yp Y1 Y2 group _type_ _freq_ rot tilt;
If _TYPE_=0;
%ellipse(Y1,Y2,16, 8, 50,color=BLACK,line=1,width=2);
%ellipse(Y1,Y2, 4,2.5,37,color=BLACK,line=1,width=2);
X=Y1; Y=Y2; SIZE=1.8; TEXT='+';
FUNCTION='SYMBOL'; output;
rad = arcos(-1)/180;
Do a = 0 to 270 by 90;
ang = 37+a;
XP = Y1 + 6*cos(rad*ang);
YP = Y2 + 6*sin(rad*ang);
%line(Y1,Y2, XP,YP,*,2,1);
End;
%label(26,29,'H matrix',BLACK,0,0,&ht,,6);
%label(21,15,'E matrix',BLACK,0,0,&ht,,6);
%label(1,37.5,'Deviations of group means from',BLACK,0,0,&ht,,6);
%label(1,36.0,'grand mean (outer) and pooled ',BLACK,0,0,&ht,,6);
%label(1,34.5,'within-group (inner) ellipses.',BLACK,0,0,&ht,,6);
%label(39,37,'How big is H relative to E? ',BLACK,0,0,2,,4);
%label(38,2,'(b)',BLACK, 0,0, &ht,,4)
Title '(b) Between and Within Scatter';
Proc print;

Proc GPLOT data=means;
Plot Y2 * Y1 / anno=Fig2 frame
vaxis=axis1 haxis=axis2 vminor=1 hminor=1;
axis1 order=(0 to 40 by 10);
axis2 order=(0 to 40 by 10);
run; quit;
%mend;

%macro fig3;
Data Can;
Label CAN1='First Canonical dimension'
CAN2='Second Canonical dimension';
CAN1=0; CAN2=0;
output;
Data Fig3;
Set Can;
%dclanno;
Length TEXT $30;
%system(2,2,4);
%ellipse(CAN1,CAN2, 5, 2.5, 103,color=black,line=1,width=2); /* H* */
%ellipse(CAN1,CAN2, 4.6, 1.56, 113,color=red ,line=2,width=1); /* HE-1 */
%ellipse(CAN1,CAN2, 1, 1.6, 0,color=black,line=1,width=2); /* E* */
%ellipse(CAN1,CAN2, 1, 1, 0,color=red ,line=2,width=1); /* EE-1 = I */
%label(-2.1,5.0,'H* ',BLACK,0,0,&ht,,6);
%label(-2.0,4.0,'HE-1 ',RED ,0,0,&ht,,6);
%label( 0.0,2.0,'E* ',BLACK,0,0,&ht,,6);
%label( .0, .8,'I ',RED ,0,0,&ht,,6);
%label( 5.9,5.8,'The E matrix is orthogonalized',BLACK,0,0,&ht,,4);
%label( 5.9,5.4,'by its principal components. ',BLACK,0,0,&ht,,4);
%label( 5.9,4.8,'The same transformation is ',BLACK,0,0,&ht,,4);
%label( 5.9,4.4,'applied to the H matrix. ',BLACK,0,0,&ht,,4);
%label( 5.8,-5.8,'(c)',BLACK, 0,0,&ht,,A)

Title '(c) H Matrix standardized by E matrix, giving HE'
m=(-.1,+.8) H=1.8 '-1';
proc gplot data=can;
plot can2 * can1/ anno=fig3 frame
vaxis=axis1 haxis=axis2 vminor=1 hminor=1;
axis1 order=(-6 to 6 by 2)
label=(a=90 r=0);
axis2 order=(-6 to 6 by 2)
;
run; quit;
%mend;
%macro fig4;
Data Fig4;
Set Can;
%dclanno;
Length TEXT $30;
Drop rad a XP YP;
%system(2,2,4);
%ellipse(CAN1,CAN2, 4.6, 1.56, 113,color=RED ,line=2,width=2); /* HE-1 */
%ellipse(CAN1,CAN2, 1, 1, 0,color=RED ,line=2,width=2); /* EE-1 = I */
rad = arcos(-1)/180;
Do a = 0 to 270 by 90;
ang =113+a;
If a=0 | a=180 then len=4.6;
else len=1.56;
XP = CAN1 + len*cos(rad*ang);
YP = CAN2 + len*sin(rad*ang);
%line(CAN1,CAN2, XP,YP,black,1,1);
End;
%label(-2.5,4.0,'HE-1 ',RED ,0,0,&ht,,4);
%label( .0, .8,'I ',RED ,0,0,&ht,,6);
%label( -0.8,2.5,'l',BLACK, 0,0,1.7,CGREEK,0);
X=.;Y=.; TEXT='1'; SIZE=.9; output;
%label( 1.1,.02, 'l',BLACK, 0,0,1.7,CGREEK,0);
X=.;Y=.; TEXT='2'; SIZE=.9; output;
%label( 5.9,5.8,'The size of HE-1 is now shown ',BLACK,0,0,&ht,,4);
%label( 5.9,5.4,'directly by the size of its ',BLACK,0,0,&ht,,4);
%label( 5.9,5.0,'latent roots. ',BLACK,0,0,&ht,,4);
%label( 5.8,-5.8,'(d)',BLACK, 0,0,&ht,,A)

Title '(d) Principal axes of HE' m=(-.1,+.8) H=1.8 '-1';
proc gplot data=can;
plot can2 * can1/ anno=fig4 frame
vaxis=axis1 haxis=axis2 vminor=1 hminor=1;
axis1 order=(-6 to 6 by 2)
label=(a=90 r=0);
axis2 order=(-6 to 6 by 2)
;
run; quit;
%mend;
%gdispla(OFF);
%fig1;
%fig2;
%fig3;
%fig4;

%gdispla(ON);

%panels(rows=2, cols=2, order=down);

/*
%panels(rows=1, cols=2);
%gskip;
%panels(rows=1, cols=2, replay=1:3 2:4);
*/
22 changes: 22 additions & 0 deletions R/cars-colldiag.R
@@ -25,6 +25,11 @@ cars.mod <- lm (mpg ~ cylinder + engine + horse + weight + accel + year,
data=cars)
vif(cars.mod)

cars.mod2 <- lm (mpg ~ cylinder + engine + horse + weight + accel + year + origin,
data=cars)
vif(cars.mod2)


# SAS: / collin option
#colldiag(cars.mod)
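
# Hedged sketch (added): condition indexes and variance-decomposition
# proportions, assuming VisCollin::colldiag() and its print method's `fuzz`
# argument, which hides small proportions
cd <- colldiag(cars.mod, center = TRUE)
print(cd, fuzz = 0.3)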

@@ -121,3 +126,20 @@ ggp + geom_text_repel(data = scores[dsq > qchisq(0.95, df = 6),],
size = 5)


library(ggbiplot)

cars.pca <- reflect(cars.pca.save, columns = 5:6)
# why no ellipse?
ggbiplot(cars.pca,
choices = 6:5,
obs.scale = 1, var.scale = 1,
point.size = 1,
var.factor =10,
varname.size = 4,
varname.color = "blue",
ellipse = TRUE,
ellipse.prob = 0.5, ellipse.alpha = 0.1,
axis.title = "Dim") +
theme_bw(base_size = 14)


