diff --git a/03-multivariate_plots.qmd b/03-multivariate_plots.qmd index ace2e391..6fe5adbe 100644 --- a/03-multivariate_plots.qmd +++ b/03-multivariate_plots.qmd @@ -461,8 +461,8 @@ The next step, by John Emerson and others [@Emerson-etal:2013] was to recognize that combinations of continuous and discrete, categorical variables could be plotted in different ways. -* a pair of continuous variables ... -* a pair of one continuous and one categorical variable ... +* two continuous variables can be shown as a standard scatterplot of points and/or bivariate density contours, or simply by numeric summaries such as a correlation value; +* a pair of one continuous and one categorical variable can be shown as side-by-side boxplots or violin plots, histograms or density plots * two categorical variables could be shown in a mosaic plot or by grouped bar plots. In the `ggplot2` framework, these displays are implemented in the diff --git a/R/penguin/peng-ggally.R b/R/penguin/peng-ggally.R new file mode 100644 index 00000000..e1229f10 --- /dev/null +++ b/R/penguin/peng-ggally.R @@ -0,0 +1,93 @@ +#' --- +#' title: Penguin data, GGally ggpairs +#' --- +#' + +library(ggplot2) +library(GGally) + +load(here::here("data", "peng.RData")) +str(peng) + +theme_set(theme_bw(base_size = 16)) + +# basic plot +ggpairs(peng, columns=3:6, + aes(color=species, alpha=0.5)) + + +# use panel functions + +# my_panel <- function(data, mapping, ...){ +# p <- ggplot(data = data, mapping = mapping) + +# geom_point() + +# geom_smooth(method=loess, formula = y ~ x, +# fill="red", color="red", ...) + +# geom_smooth(method=lm, formula = y ~ x, +# fill="blue", color="blue", ...) +# p +# } + +my_panel <- function(data, mapping, ...){ + p <- ggplot(data = data, mapping = mapping) + + geom_point() + + geom_smooth(method=lm, formula = y ~ x, se = FALSE, ...) + + geom_smooth(method=loess, formula = y ~ x, se = FALSE, ...) + p +} + + + +ggpairs(peng, columns=3:6, + mapping = aes(color=species, alpha=0.2), + lower = list(continuous = my_panel), + upper = list(continuous = my_panel), + progress = FALSE) + +# only regression line & data ellipse +my_panel1 <- function(data, mapping, ...){ + p <- ggplot(data = data, mapping = mapping) + + geom_smooth(method=lm, formula = y ~ x, se = FALSE, ...) + + stat_ellipse(geom = "polygon", level = 0.68, ...) + p +} + +ggpairs(peng, columns=3:6, + mapping = aes(color=species, fill = species, alpha=0.2), + lower = list(continuous = my_panel1), + upper = list(continuous = my_panel1), + progress = FALSE) + + theme(panel.grid.major = element_blank(), + panel.grid.minor = element_blank()) + + + +# make a general panel function, with elements optional +gg_panel <- function(data, mapping, ..., + points = FALSE, + lm = TRUE, + loess = FALSE, + ellipse = TRUE){ + + pts <- if(points) geom_point else NULL + lml <- if(lm) geom_smooth(method=lm, formula = y ~ x, se = FALSE, ...) else NULL + sml <- if(loess) geom_smooth(method=loess, formula = y ~ x, se = FALSE, ...) else NULL + ell <- if(ellipse) stat_ellipse() else NULL + + p <- ggplot(data = data, mapping = mapping) + + pts + + lml + + sml + + ell + p +} + +ggpairs(peng, columns=3:6, + mapping = aes(color=species, alpha=0.2), + lower = list(continuous = wrap(gg_panel)), + upper = list(continuous = wrap(gg_panel)), + progress = FALSE) + + + + diff --git a/R/penguin/peng-ggplot.R b/R/penguin/peng-ggplot.R index 62d29b3c..d50cd969 100644 --- a/R/penguin/peng-ggplot.R +++ b/R/penguin/peng-ggplot.R @@ -5,6 +5,9 @@ library(dplyr) library(ggplot2) library(car) +if(!require("ggdensity")) install.packages("ggdensity") +library(ggdensity) +library(patchwork) load(here::here("data", "peng.RData")) str(peng) @@ -25,9 +28,28 @@ ggplot(peng, # remove points ggplot(peng, aes(x = bill_length, y = bill_depth, - color = species, shape = species, fill=species)) + + color = species, fill=species)) + geom_smooth(method = "lm", se=FALSE, linewidth=2) + stat_ellipse(geom = "polygon", level = 0.95, alpha = 0.2) + stat_ellipse(geom = "polygon", level = 0.68, alpha = 0.2) + stat_ellipse(geom = "polygon", level = 0.40, alpha = 0.2) + theme(legend.position = c(0.85, 0.15)) + +# use bivariate contours +p1 <- ggplot(peng, + aes(x = bill_length, y = bill_depth, + color = species)) + + geom_smooth(method = "lm", se=FALSE, linewidth=2) + + geom_density_2d(linewidth = 1.2, bins = 8) + + ggtitle("geom_density_2d") + + theme(legend.position = c(0.85, 0.15)) + +p2 <- ggplot(peng, + aes(x = bill_length, y = bill_depth, + color = species, fill = species)) + + geom_smooth(method = "lm", se=FALSE, linewidth=2) + + geom_hdr(probs = c(0.95, 0.68, 0.4), show.legend = FALSE) + + ggtitle("ggdensity::geom_hdr") + + theme(legend.position = c(0.85, 0.15)) + +p1 + p2 diff --git a/R/penguin/peng-scat2.R b/R/penguin/peng-scat2.R index c425f3b0..3190386e 100644 --- a/R/penguin/peng-scat2.R +++ b/R/penguin/peng-scat2.R @@ -50,7 +50,7 @@ scatterplotMatrix(~ bill_length + bill_depth + flipper_length + body_mass | spec smooth = FALSE) scatterplotMatrix(~ bill_length + bill_depth + flipper_length + body_mass | species, - data = peng, col = col, legend=FALSE, + data = peng, col = col, legend=FALSE, cex.labels = 2.5, ellipse = list(levels = 0.68), smooth = FALSE, plot.points = FALSE) @@ -66,3 +66,5 @@ covEllipses(peng[3:6], peng$species, col = col, fill=TRUE, fill.alpha=.1) + + diff --git a/R/penguin/penguins-lm.R b/R/penguin/penguins-lm.R index 1714dda6..6777fad3 100644 --- a/R/penguin/penguins-lm.R +++ b/R/penguin/penguins-lm.R @@ -9,17 +9,23 @@ library(effects) library(palmerpenguins) # clean up variable names, etc. -peng <- penguins |> - rename( - bill_length = bill_length_mm, - bill_depth = bill_depth_mm, - flipper_length = flipper_length_mm, - body_mass = body_mass_g - ) |> - mutate(species = as.factor(species), - island = as.factor(island), - sex = as.factor(substr(sex,1,1))) |> - tidyr::drop_na() +# peng <- penguins |> +# rename( +# bill_length = bill_length_mm, +# bill_depth = bill_depth_mm, +# flipper_length = flipper_length_mm, +# body_mass = body_mass_g +# ) |> +# mutate(species = as.factor(species), +# island = as.factor(island), +# sex = as.factor(substr(sex,1,1))) |> +# tidyr::drop_na() + +load(here::here("data", "peng.RData")) +str(peng) + +theme_set(theme_bw(base_size = 16)) + str(peng) #View(peng) @@ -34,7 +40,7 @@ ggplotColours <- function(n = 6, h = c(0, 360) + 15){ # scatterplot matrix scatterplotMatrix(~ bill_length + bill_depth + flipper_length + body_mass | species, data=peng, - col = ggplotColours(3), + col = scales::hue_pal()(3), ellipse=list(levels=0.68)) # same, with GGally diff --git a/child/03-data-ellipse.qmd b/child/03-data-ellipse.qmd index 19aba353..1cb43708 100644 --- a/child/03-data-ellipse.qmd +++ b/child/03-data-ellipse.qmd @@ -420,6 +420,9 @@ as shown in @fig-peng-ggplot2. This idea, of **visual thinning** a graph to focus on what should be seen, becomes increasingly useful as the data becomes more complex. The `ggplot2` framework encourages this, because we can think of various components as layers, to be included or not. +Here I chose to include only the regression line and +add data ellipses of 40%, 68% and 95% coverage to highlight the increasing bivariate +density around the group means. ```{r} #| label: fig-peng-ggplot2 @@ -435,3 +438,42 @@ ggplot(peng, theme(legend.position = c(0.85, 0.15)) ``` +While I emphasize data ellipses (because I like their beautiful geometry), other visual +summaries of the bivariate density are possible and often useful. + +For a single variable, `stats::density()` and `ggplot2::geom_density()` +calculate a smoothed estimate of the density using kernel methods whose smoothness +is controlled by a bandwidth parameter, analogous to the span in a loess smoother. +This idea extends to two (and more) variables ... + +`ggplot2` provides `geom_density_2d()` which takes horizontal slices of the 3d surface at equally-spaced heights ... +The **ggdensity** package [@R-ggdensity] extends this ... + +```{r} +#| label: fig-peng-ggdensity +#| out-width: "100%" +#| fig-cap: "**Bivariate density**: ." +library(ggdensity) +library(patchwork) +p1 <- ggplot(peng, + aes(x = bill_length, y = bill_depth, + color = species)) + + geom_smooth(method = "lm", se=FALSE, linewidth=2) + + geom_density_2d(linewidth = 1.2, bins = 8) + + ggtitle("geom_density_2d") + + theme(legend.position = c(0.85, 0.15)) + +p2 <- ggplot(peng, + aes(x = bill_length, y = bill_depth, + color = species, fill = species)) + + geom_smooth(method = "lm", se=FALSE, linewidth=2) + + geom_hdr(probs = c(0.95, 0.68, 0.4), show.legend = FALSE) + + ggtitle("ggdensity::geom_hdr") + + theme(legend.position = c(0.85, 0.15)) + +p1 + p2 +``` + + + +