From 6386ae15a5d2515afcef290e917532aca989b5bb Mon Sep 17 00:00:00 2001 From: Michael Friendly Date: Fri, 25 Oct 2024 17:54:17 -0400 Subject: [PATCH] Ch03 variable ordering in crime data --- 03-multivariate_plots.qmd | 94 +++++-- R/crime/crime-correlation.R | 81 ++++-- R/crime/crime-network.R | 41 +-- R/pvPlot.R | 52 ++++ docs/01-intro.html | 95 +++---- docs/index.html | 117 +++----- docs/search.json | 2 +- docs/site_libs/bootstrap/bootstrap.min.css | 4 +- docs/site_libs/kePrint-0.0.1/kePrint.js | 8 + docs/site_libs/lightable-0.0.1/lightable.css | 272 +++++++++++++++++++ 10 files changed, 557 insertions(+), 209 deletions(-) create mode 100644 R/pvPlot.R create mode 100644 docs/site_libs/kePrint-0.0.1/kePrint.js create mode 100644 docs/site_libs/lightable-0.0.1/lightable.css diff --git a/03-multivariate_plots.qmd b/03-multivariate_plots.qmd index 68d4a3a9..17d9833d 100644 --- a/03-multivariate_plots.qmd +++ b/03-multivariate_plots.qmd @@ -1487,7 +1487,7 @@ the nonparametric smooth doesn't differ much from the linear regression line. Exceptions to this appear mainly in the columns for `robbery` and `auto` (auto theft). -### Corrgrams {#sec-corrgram} +## Corrgrams {#sec-corrgram} What if you want to summarize the data even further, for example to show only the value of the correlation for each pair of variables? A @@ -1502,7 +1502,9 @@ ordering the levels of factors and variables in graphic displays to make important features most apparent. For variables, this means that we can arrange the variables in a matrix-like display in such a way as to make the pattern of relationships easiest to see. Methods to achieve this -include using principal components and cluster analysis to put the most +include using principal components +and cluster analysis +to put the most related variables together as described in @sec-pca-biplot. ```{r} @@ -1521,38 +1523,42 @@ panels as illustrated in @fig-corrgram-renderings. For example, a corrgram similar to @fig-crime-spm can be produced as follows (not shown here): -```{r} +```{r corrgram} #| eval: false -crime |> - select(where(is.numeric)) |> - corrgram(lower.panel = panel.ellipse, - upper.panel = panel.ellipse, - diag.panel = panel.density) +crime.cor <- crime |> + dplyr::select(where(is.numeric)) |> + cor() + +corrgram(crime.cor, + lower.panel = panel.ellipse, + upper.panel = panel.ellipse, + diag.panel = panel.density) ``` -`corrplot::corrplot()` provides the rendering methods +With the **corrplot** package, `corrplot()` provides the rendering methods `c("circle", "square", "ellipse", "number", "shade", "color", "pie")`, but only one can be used at a time. The function -`corrplot::corrplot.mixed()` allows different options to be selected for -the lower and upper triangles. The iconic shape is colored with a +`corrplot.mixed()` allows different options to be selected for +the lower and upper triangles. The iconic rendering shape is colored with a gradient in relation to the correlation value. +For comparison, @fig-crime-corrplot uses ellipses below the diagonal and +filled pie charts below the diagonal using a gradient of the fill color +in both cases. ```{r} #| label: fig-crime-corrplot #| fig-width: 8 #| fig-height: 8 #| out-width: "100%" -#| fig-cap: "Corrplot of the `crime` data, showing the correlation between each pair of variables with an ellipse (lower) and a pie chart symbol (upper), all shaded in proportion to the correlation value, also shown numerically." -crime |> - select(where(is.numeric)) |> - cor() |> - corrplot.mixed( - lower = "ellipse", - upper = "pie", - tl.col = "black", - tl.srt = 0, - addCoef.col = "black", - addCoefasPercent = TRUE) +#| fig-cap: "Mixed corrplot of the `crime` data, showing the correlation between each pair of variables with an ellipse (lower) and a pie chart symbol (upper), all shaded in proportion to the correlation value, also shown numerically." +corrplot.mixed(crime.cor, + lower = "ellipse", + upper = "pie", + tl.col = "black", + tl.srt = 0, + tl.cex = 1.25, + addCoef.col = "black", + addCoefasPercent = TRUE) ``` The combination of renderings shown in @fig-crime-corrplot is @@ -1562,6 +1568,50 @@ the values for murder with larceny and auto theft in row 1, columns 6-7 with those in column 1, rows 6-7, where the former are easier to distinguish. The shading color adds another visual cue. +The variables in @fig-crime-spm and @fig-crime-corrplot are arranged +by their order in the dataset, which is not often the most useful. +A better idea is to arrange the variables so that the most highly +correlated variables are adjacent. + +A general method described in @sec-var-order orders the variables according to the +angles of the eigenvectors from a principal components analysis +around a unit circle. `corrMatOrder()` provides several methods +(`order = c("AOE", "FPC", "hclust", "alphabet")`), +of which this is +`order = "AOE"`. Using this in `corrplot()` produces @fig-crime-corrplot-AOE. + +```{r corMatOrder} +ord <- corrMatOrder(crime.cor, order = "AOE") +rownames(crime.cor)[ord] +``` + + + + + + +```{r} +#| label: fig-crime-corrplot-AOE +#| fig-width: 8 +#| fig-height: 8 +#| out-width: "100%" +#| fig-cap: "Corrplot of the `crime` data with the variables reordered according to the angles of variable eigenvectors." +corrplot.mixed(crime.cor, + order = "AOE", + lower = "ellipse", + upper = "ellipse", + tl.col = "black", + tl.srt = 0, + tl.cex = 1.25, + addCoef.col = "black", + addCoefasPercent = TRUE) +``` + +In this case, where the correlations among the crime variables are all positive, the effect of +variable re-ordering is subtle, but ... +@fig-mtcars-corrplot-varorder and @fig-mtcars-corrplot-pcaorder provide a more dramatic example. + + Variations of corrgrams are worthy replacements for a numeric table of correlations, which are often presented in publications only for archival value. Including the numeric value (rounded here, for diff --git a/R/crime/crime-correlation.R b/R/crime/crime-correlation.R index 13cbd4dc..8dcbf172 100644 --- a/R/crime/crime-correlation.R +++ b/R/crime/crime-correlation.R @@ -8,37 +8,64 @@ library(dplyr) data(crime, package="ggbiplot") +crime.cor <- crime |> + dplyr::select(where(is.numeric)) |> + cor() + # similar to Fig 3.24 -crime |> - select(where(is.numeric)) |> - corrgram(lower.panel = panel.ellipse, - upper.panel = panel.ellipse, - diag.panel = panel.density) +corrgram(crime.cor, + lower.panel = panel.ellipse, + upper.panel = panel.ellipse, + diag.panel = panel.density) # show representation of ellipse and correlation value -crime |> - select(where(is.numeric)) |> - cor() |> - corrplot(diag = FALSE, - method = "ellipse", - tl.col = "black", - tl.srt = 0, - addCoef.col = "black", - addCoefasPercent = TRUE) - -crime |> - select(where(is.numeric)) |> - cor() |> - corrplot.mixed( - lower = "ellipse", - upper = "pie", - tl.col = "black", - tl.srt = 0, - tl.cex = 1.3, - addCoef.col = "black", - addCoefasPercent = TRUE) - +corrplot(crime.cor, + diag = FALSE, + method = "ellipse", + tl.col = "black", + tl.srt = 0, + addCoef.col = "black", + addCoefasPercent = TRUE) + +# use correlation ordering ("AOE") +corrplot(crime.cor, + diag = FALSE, + order = "AOE", + method = "ellipse", + tl.col = "black", + tl.srt = 0, + addCoef.col = "black", + addCoefasPercent = TRUE) + + +corrplot.mixed(crime.cor, + lower = "ellipse", + upper = "pie", + tl.col = "black", + tl.srt = 0, + tl.cex = 1.25, + addCoef.col = "black", + addCoefasPercent = TRUE) + +corrplot.mixed(crime.cor, + order = "AOE", #"FPC", + lower = "ellipse", + upper = "ellipse", + tl.col = "black", + tl.srt = 0, + tl.cex = 1.25, + addCoef.col = "black", + addCoefasPercent = TRUE) + +ord <- corrMatOrder(crime.cor, order = "AOE") +rownames(crime.cor)[ord] + +library(seriation) + +ord <- seriate(crime.cor, method = "PCA_angle") +# what's the order ? +permute(crime.cor, ord) |> rownames() diff --git a/R/crime/crime-network.R b/R/crime/crime-network.R index 98f22135..def1691e 100644 --- a/R/crime/crime-network.R +++ b/R/crime/crime-network.R @@ -8,29 +8,29 @@ library(qgraph) #' data(crime, package = "ggbiplot") -corrmat <- crime |> +crime.cor <- crime |> dplyr::select(where(is.numeric)) |> cor() # ### "association graph": network of correlations -qgraph(corrmat, +qgraph(crime.cor, title = "Crime data, correlations", title.cex = 1.25, graph = "cor", - minimum = "sig", sampleSize = nrow(crime), + minimum = "sig", sampleSize = nrow(crime), alpha = 0.01, color = grey(.9), vsize = 12, - labels = rownames(corrmat), + labels = rownames(crime.cor), posCol = "blue") # ### "concentration graph": network of partial correlations # Correlations between variables that cannot be explained by other variables in the network -qgraph(corrmat, +qgraph(crime.cor, title = "Crime data, partial correlations", title.cex = 1.25, graph = "pcor", - minimum = "sig", sampleSize = nrow(crime), + minimum = "sig", sampleSize = nrow(crime), alpha = 0.01, color = grey(.9), vsize = 14, - labels = rownames(corrmat), + labels = rownames(crime.cor), edge.labels = TRUE, edge.label.cex = 1.7, posCol = "blue") @@ -38,43 +38,48 @@ qgraph(corrmat, #' ### variable ordering: reorder variables by PC1 & PC2 angles library(seriation) -ord <- seriate(corrmat, method = "PCA_angle") +ord <- seriate(crime.cor, method = "PCA_angle") # what's the order ? -permute(corrmat, ord) |> rownames() +permute(crime.cor, ord) |> rownames() -qgraph(permute(corrmat, ord), +qgraph(permute(crime.cor, ord), title = "Crime data, correlations", title.cex = 1.25, graph = "cor", - minimum = "sig", sampleSize = nrow(crime), + minimum = "sig", sampleSize = nrow(crime), alpha = 0.01, color = grey(.9), vsize = 12, - labels = rownames(permute(corrmat, ord)), + labels = rownames(permute(crime.cor, ord)), edge.labels = TRUE, edge.label.cex = 1.3, posCol = "blue") +#' to understand the partial correlations, make scatterplots of the residuals from the +#' models where each x_i, x_j are predicted by all others. I've never seen such a plot, +#' but could be done by modifying AVplot +#' - -#' `mtcars` data +#' ## `mtcars` data +#' Try the same things for the mtcars data data(mtcars) -corrmat <- cor(mtcars) +cars.cor <- cor(mtcars) -qgraph(corrmat, +qgraph(cars.cor, graph = "cor", minimum = "sig", sampleSize = nrow(mtcars), color = grey(.9), vsize = 12, labels = names(mtcars), +# edge.labels = TRUE, edge.label.cex = 1.3, posCol = "blue") -qgraph(corrmat, +qgraph(cars.cor, graph = "pcor", minimum = "sig", sampleSize = nrow(mtcars), color = grey(.9), vsize = 12, labels = names(mtcars), + edge.labels = TRUE, edge.label.cex = 1.3, posCol = "blue") -# same, for crime data diff --git a/R/pvPlot.R b/R/pvPlot.R new file mode 100644 index 00000000..a8b32954 --- /dev/null +++ b/R/pvPlot.R @@ -0,0 +1,52 @@ +# Partial variables plot + +# To understand the partial correlations, make scatterplots of the residuals from the +# models where each x_i, x_j are predicted by all others. + +# see also: pairs version +# https://stackoverflow.com/questions/35591033/plot-scatterplot-matrix-with-partial-correlation-coefficients-in-r + +pvPlot <- function(X, vars = 1:2, + col = "black", + pch = 16, + cex = par("cex"), + axes = TRUE, + ...) { + nv <- ncol(X) + nr <- nrow(X) + v1 <- vars[1] + v2 <- vars[2] + all <- if(is.numeric(vars)) seq_along(nv) else names(X) + others <- setdiff(all, vars) + res <- X[, vars] + res[, 1] <- lsfit(X[, others], X[, v1])$residuals + res[, 2] <- lsfit(X[, others], X[, v2])$residuals + plot(res, + col = col, pch = pch, cex = cex, ...) + if (axes) + abline(h = 0, v = 0, col = "gray") + invisible(res) +} + +if(FALSE) { + data(crime, package = "ggbiplot") + res <- crime |> + tibble::column_to_rownames("st") |> + dplyr::select(where(is.numeric)) |> + pvPlot(vars = c("burglary", "larceny")) + + head(res) + car::scatterplot(larceny ~ burglary, data = res, + xlab = "burglary residual", + ylab = "larceny residual", + pch = 16, col = "black", + smooth = FALSE, boxplots = FALSE, + grid = FALSE, + id = list(n=5)) + abline(h = 0, v = 0, col = "gray") + text(-600, 1300, + label = paste("partial r =", + round(cor(res[,1], res[,2]), 3)), + pos = 4) + +} \ No newline at end of file diff --git a/docs/01-intro.html b/docs/01-intro.html index 91279784..9bf698e4 100644 --- a/docs/01-intro.html +++ b/docs/01-intro.html @@ -2,9 +2,9 @@ - + -1  Introduction – Visualizing Multivariate Data and Models in R +Visualizing Multivariate Data and Models in R - 1  Introduction