Skip to content

Commit

Permalink
penguins get bivariate density plots
Browse files Browse the repository at this point in the history
  • Loading branch information
friendly committed Nov 11, 2023
1 parent 5920954 commit f8d8bbb
Show file tree
Hide file tree
Showing 6 changed files with 181 additions and 16 deletions.
4 changes: 2 additions & 2 deletions 03-multivariate_plots.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -461,8 +461,8 @@ The next step, by John Emerson and others [@Emerson-etal:2013] was to recognize
that combinations of continuous and discrete, categorical variables could be plotted
in different ways.

* a pair of continuous variables ...
* a pair of one continuous and one categorical variable ...
* two continuous variables can be shown as a standard scatterplot of points and/or bivariate density contours, or simply by numeric summaries such as a correlation value;
* a pair of one continuous and one categorical variable can be shown as side-by-side boxplots or violin plots, histograms or density plots
* two categorical variables could be shown in a mosaic plot or by grouped bar plots.

In the `ggplot2` framework, these displays are implemented in the
Expand Down
93 changes: 93 additions & 0 deletions R/penguin/peng-ggally.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#' ---
#' title: Penguin data, GGally ggpairs
#' ---
#'

library(ggplot2)
library(GGally)

load(here::here("data", "peng.RData"))
str(peng)

theme_set(theme_bw(base_size = 16))

# basic plot
ggpairs(peng, columns=3:6,
aes(color=species, alpha=0.5))


# use panel functions

# my_panel <- function(data, mapping, ...){
# p <- ggplot(data = data, mapping = mapping) +
# geom_point() +
# geom_smooth(method=loess, formula = y ~ x,
# fill="red", color="red", ...) +
# geom_smooth(method=lm, formula = y ~ x,
# fill="blue", color="blue", ...)
# p
# }

my_panel <- function(data, mapping, ...){
p <- ggplot(data = data, mapping = mapping) +
geom_point() +
geom_smooth(method=lm, formula = y ~ x, se = FALSE, ...) +
geom_smooth(method=loess, formula = y ~ x, se = FALSE, ...)
p
}



ggpairs(peng, columns=3:6,
mapping = aes(color=species, alpha=0.2),
lower = list(continuous = my_panel),
upper = list(continuous = my_panel),
progress = FALSE)

# only regression line & data ellipse
my_panel1 <- function(data, mapping, ...){
p <- ggplot(data = data, mapping = mapping) +
geom_smooth(method=lm, formula = y ~ x, se = FALSE, ...) +
stat_ellipse(geom = "polygon", level = 0.68, ...)
p
}

ggpairs(peng, columns=3:6,
mapping = aes(color=species, fill = species, alpha=0.2),
lower = list(continuous = my_panel1),
upper = list(continuous = my_panel1),
progress = FALSE) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())



# make a general panel function, with elements optional
gg_panel <- function(data, mapping, ...,
points = FALSE,
lm = TRUE,
loess = FALSE,
ellipse = TRUE){

pts <- if(points) geom_point else NULL
lml <- if(lm) geom_smooth(method=lm, formula = y ~ x, se = FALSE, ...) else NULL
sml <- if(loess) geom_smooth(method=loess, formula = y ~ x, se = FALSE, ...) else NULL
ell <- if(ellipse) stat_ellipse() else NULL

p <- ggplot(data = data, mapping = mapping) +
pts +
lml +
sml +
ell
p
}

ggpairs(peng, columns=3:6,
mapping = aes(color=species, alpha=0.2),
lower = list(continuous = wrap(gg_panel)),
upper = list(continuous = wrap(gg_panel)),
progress = FALSE)




24 changes: 23 additions & 1 deletion R/penguin/peng-ggplot.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
library(dplyr)
library(ggplot2)
library(car)
if(!require("ggdensity")) install.packages("ggdensity")
library(ggdensity)
library(patchwork)

load(here::here("data", "peng.RData"))
str(peng)
Expand All @@ -25,9 +28,28 @@ ggplot(peng,
# remove points
ggplot(peng,
aes(x = bill_length, y = bill_depth,
color = species, shape = species, fill=species)) +
color = species, fill=species)) +
geom_smooth(method = "lm", se=FALSE, linewidth=2) +
stat_ellipse(geom = "polygon", level = 0.95, alpha = 0.2) +
stat_ellipse(geom = "polygon", level = 0.68, alpha = 0.2) +
stat_ellipse(geom = "polygon", level = 0.40, alpha = 0.2) +
theme(legend.position = c(0.85, 0.15))

# use bivariate contours
p1 <- ggplot(peng,
aes(x = bill_length, y = bill_depth,
color = species)) +
geom_smooth(method = "lm", se=FALSE, linewidth=2) +
geom_density_2d(linewidth = 1.2, bins = 8) +
ggtitle("geom_density_2d") +
theme(legend.position = c(0.85, 0.15))

p2 <- ggplot(peng,
aes(x = bill_length, y = bill_depth,
color = species, fill = species)) +
geom_smooth(method = "lm", se=FALSE, linewidth=2) +
geom_hdr(probs = c(0.95, 0.68, 0.4), show.legend = FALSE) +
ggtitle("ggdensity::geom_hdr") +
theme(legend.position = c(0.85, 0.15))

p1 + p2
4 changes: 3 additions & 1 deletion R/penguin/peng-scat2.R
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ scatterplotMatrix(~ bill_length + bill_depth + flipper_length + body_mass | spec
smooth = FALSE)

scatterplotMatrix(~ bill_length + bill_depth + flipper_length + body_mass | species,
data = peng, col = col, legend=FALSE,
data = peng, col = col, legend=FALSE, cex.labels = 2.5,
ellipse = list(levels = 0.68),
smooth = FALSE,
plot.points = FALSE)
Expand All @@ -66,3 +66,5 @@ covEllipses(peng[3:6], peng$species,
col = col,
fill=TRUE,
fill.alpha=.1)


30 changes: 18 additions & 12 deletions R/penguin/penguins-lm.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,23 @@ library(effects)
library(palmerpenguins)

# clean up variable names, etc.
peng <- penguins |>
rename(
bill_length = bill_length_mm,
bill_depth = bill_depth_mm,
flipper_length = flipper_length_mm,
body_mass = body_mass_g
) |>
mutate(species = as.factor(species),
island = as.factor(island),
sex = as.factor(substr(sex,1,1))) |>
tidyr::drop_na()
# peng <- penguins |>
# rename(
# bill_length = bill_length_mm,
# bill_depth = bill_depth_mm,
# flipper_length = flipper_length_mm,
# body_mass = body_mass_g
# ) |>
# mutate(species = as.factor(species),
# island = as.factor(island),
# sex = as.factor(substr(sex,1,1))) |>
# tidyr::drop_na()

load(here::here("data", "peng.RData"))
str(peng)

theme_set(theme_bw(base_size = 16))


str(peng)
#View(peng)
Expand All @@ -34,7 +40,7 @@ ggplotColours <- function(n = 6, h = c(0, 360) + 15){
# scatterplot matrix
scatterplotMatrix(~ bill_length + bill_depth + flipper_length + body_mass | species,
data=peng,
col = ggplotColours(3),
col = scales::hue_pal()(3),
ellipse=list(levels=0.68))

# same, with GGally
Expand Down
42 changes: 42 additions & 0 deletions child/03-data-ellipse.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,9 @@ as shown in @fig-peng-ggplot2.
This idea, of **visual thinning** a graph to focus on what should be seen,
becomes increasingly useful as the data becomes more complex. The `ggplot2` framework encourages this,
because we can think of various components as layers, to be included or not.
Here I chose to include only the regression line and
add data ellipses of 40%, 68% and 95% coverage to highlight the increasing bivariate
density around the group means.

```{r}
#| label: fig-peng-ggplot2
Expand All @@ -435,3 +438,42 @@ ggplot(peng,
theme(legend.position = c(0.85, 0.15))
```

While I emphasize data ellipses (because I like their beautiful geometry), other visual
summaries of the bivariate density are possible and often useful.

For a single variable, `stats::density()` and `ggplot2::geom_density()`
calculate a smoothed estimate of the density using kernel methods whose smoothness
is controlled by a bandwidth parameter, analogous to the span in a loess smoother.
This idea extends to two (and more) variables ...

`ggplot2` provides `geom_density_2d()` which takes horizontal slices of the 3d surface at equally-spaced heights ...
The **ggdensity** package [@R-ggdensity] extends this ...

```{r}
#| label: fig-peng-ggdensity
#| out-width: "100%"
#| fig-cap: "**Bivariate density**: ."
library(ggdensity)
library(patchwork)
p1 <- ggplot(peng,
aes(x = bill_length, y = bill_depth,
color = species)) +
geom_smooth(method = "lm", se=FALSE, linewidth=2) +
geom_density_2d(linewidth = 1.2, bins = 8) +
ggtitle("geom_density_2d") +
theme(legend.position = c(0.85, 0.15))
p2 <- ggplot(peng,
aes(x = bill_length, y = bill_depth,
color = species, fill = species)) +
geom_smooth(method = "lm", se=FALSE, linewidth=2) +
geom_hdr(probs = c(0.95, 0.68, 0.4), show.legend = FALSE) +
ggtitle("ggdensity::geom_hdr") +
theme(legend.position = c(0.85, 0.15))
p1 + p2
```




0 comments on commit f8d8bbb

Please sign in to comment.