From 015dc649fd7eb24ff05bf6a024879e83b3ef21e5 Mon Sep 17 00:00:00 2001 From: hanneoberman Date: Wed, 16 Oct 2024 12:36:23 +0200 Subject: [PATCH 1/5] make `plot_corr()` work on `mids` objects --- R/plot_corr.R | 79 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 29 deletions(-) diff --git a/R/plot_corr.R b/R/plot_corr.R index 2d290e0..e50a3b6 100644 --- a/R/plot_corr.R +++ b/R/plot_corr.R @@ -39,45 +39,58 @@ plot_corr <- if (is.matrix(data) && ncol(data) > 1) { data <- as.data.frame(data) } - verify_data(data = data, df = TRUE) + verify_data(data = data, df = TRUE, imp = TRUE) vrb <- rlang::enexpr(vrb) - vrb_matched <- match_vrb(vrb, names(data)) + vrbs_in_data <- if (mice::is.mids(data)) { + names(data$imp) + } else { + names(data) + } + vrb_matched <- match_vrb(vrb, vrbs_in_data) if (length(vrb_matched) < 2) { cli::cli_abort("The number of variables should be two or more to compute correlations.") } - # check if any column is constant - constants <- apply(data[, vrb_matched], MARGIN = 2, function(x) { - all(is.na(x)) || max(x, na.rm = TRUE) == min(x, na.rm = TRUE) - }) - if (any(constants)) { - vrb_matched <- vrb_matched[!constants] - cli::cli_inform( - c( - "No correlations computed for variable(s):", - " " = paste(names(constants[which(constants)]), collapse = ", "), - "i" = "Correlations are undefined for constants." + if (is.data.frame(data)) { + # for data: check if any column is constant + constants <- apply(data[, vrb_matched], MARGIN = 2, function(x) { + all(is.na(x)) || max(x, na.rm = TRUE) == min(x, na.rm = TRUE) + }) + if (any(constants)) { + vrb_matched <- vrb_matched[!constants] + cli::cli_inform( + c( + "No correlations computed for variable(s):", + " " = paste(names(constants[which(constants)]), collapse = ", "), + "i" = "Correlations are undefined for constants." + ) ) - ) + } + # compute correlations + corr <- stats::cov2cor(stats::cov( + data.matrix(data[, vrb_matched]), + use = "pairwise.complete.obs" + )) + } + if (mice::is.mids(data)) { + # check constatnts etc. + imps <- mice::complete(data, "all") + corrs <- purrr::map(imps, ~ { + stats::cor(.x) + }) + corr <- Reduce("+", corrs) / length(corrs) } - # compute correlations p <- length(vrb_matched) - corrs <- data.frame( + long <- data.frame( vrb = rep(vrb_matched, each = p), prd = vrb_matched, - corr = matrix( - round(stats::cov2cor( - stats::cov(data.matrix(data[, vrb_matched]), use = "pairwise.complete.obs") - ), 2), - nrow = p * p, - byrow = TRUE - ) + corr = matrix(round(corr, 2), nrow = p * p, byrow = TRUE) ) if (!diagonal) { - corrs[corrs$vrb == corrs$prd, "corr"] <- NA + long[long$vrb == long$prd, "corr"] <- NA } # create plot gg <- - ggplot2::ggplot(corrs, + ggplot2::ggplot(long, ggplot2::aes( x = .data$prd, y = .data$vrb, @@ -95,18 +108,26 @@ plot_corr <- limits = c(-1, 1) ) + theme_minimice() + lab_x <- "Imputation model predictor" + if (mice::is.mids(data)) { + lab_y <- "Column name" + lab_note <- "*pooled across imputations" + } else { + lab_y <- "Column name" + lab_note <- "*pairwise complete observations" + } if (caption) { gg <- gg + ggplot2::labs( - x = "Imputation model predictor", - y = "Variable to impute", + x = lab_x, + y = lab_y, fill = "Correlation* ", - caption = "*pairwise complete observations" + caption = lab_note ) } else { gg <- gg + - ggplot2::labs(x = "Imputation model predictor", y = "Variable to impute", fill = "Correlation") + ggplot2::labs(x = lab_x, y = lab_y, fill = "Correlation") } if (label) { gg <- From 145b367157a8165501e9aedd6ffd5d39b89b2e64 Mon Sep 17 00:00:00 2001 From: hanneoberman Date: Wed, 16 Oct 2024 14:55:02 +0200 Subject: [PATCH 2/5] functionality to plot difference between observed and imputed data --- R/plot_corr.R | 143 +++++++++++++++++++++++++++++++------------------- 1 file changed, 90 insertions(+), 53 deletions(-) diff --git a/R/plot_corr.R b/R/plot_corr.R index e50a3b6..0772afc 100644 --- a/R/plot_corr.R +++ b/R/plot_corr.R @@ -2,6 +2,7 @@ #' #' @param data A dataset of class `data.frame`, `tibble`, or `matrix`. #' @param vrb String, vector, or unquoted expression with variable name(s), default is "all". +#' @param diff Logical indicating whether the difference between the observed and imputed data is plotted. #' @param label Logical indicating whether correlation values should be displayed. #' @param square Logical indicating whether the plot tiles should be squares. #' @param diagonal Logical indicating whether the correlation of each variable with itself should be displayed. @@ -31,62 +32,95 @@ plot_corr <- function(data, vrb = "all", + diff = FALSE, label = FALSE, square = TRUE, diagonal = FALSE, rotate = FALSE, caption = TRUE) { + # process inputs if (is.matrix(data) && ncol(data) > 1) { data <- as.data.frame(data) } verify_data(data = data, df = TRUE, imp = TRUE) + if (diff && !mice::is.mids(data)) { + cli::cli_abort("Difference in correlations can only be computed with imputed data.") + } vrb <- rlang::enexpr(vrb) - vrbs_in_data <- if (mice::is.mids(data)) { - names(data$imp) + if (mice::is.mids(data)) { + imp <- TRUE + mids <- data + data <- data$data } else { - names(data) + imp <- FALSE } + vrbs_in_data <- names(data) vrb_matched <- match_vrb(vrb, vrbs_in_data) + # check if any column is constant + constants <- apply(data[, vrb_matched], MARGIN = 2, function(x) { + all(is.na(x)) || max(x, na.rm = TRUE) == min(x, na.rm = TRUE) + }) + if (any(constants)) { + vrb_matched <- vrb_matched[!constants] + cli::cli_inform( + c( + "No correlations computed for variable(s):", + " " = paste(names(constants[which(constants)]), collapse = ", "), + "i" = "Correlations are undefined for constants." + ) + ) + } if (length(vrb_matched) < 2) { cli::cli_abort("The number of variables should be two or more to compute correlations.") } - if (is.data.frame(data)) { - # for data: check if any column is constant - constants <- apply(data[, vrb_matched], MARGIN = 2, function(x) { - all(is.na(x)) || max(x, na.rm = TRUE) == min(x, na.rm = TRUE) - }) - if (any(constants)) { - vrb_matched <- vrb_matched[!constants] - cli::cli_inform( - c( - "No correlations computed for variable(s):", - " " = paste(names(constants[which(constants)]), collapse = ", "), - "i" = "Correlations are undefined for constants." - ) - ) - } - # compute correlations - corr <- stats::cov2cor(stats::cov( - data.matrix(data[, vrb_matched]), - use = "pairwise.complete.obs" - )) + # create plot labels + lab_x <- "Imputation model predictor" + lab_y <- "Imputation target" + lab_fill <- "Correlation* + " + if (!imp) { + lab_note <- "*pairwise complete observations" } - if (mice::is.mids(data)) { - # check constatnts etc. - imps <- mice::complete(data, "all") + if (imp) { + lab_note <- "*pooled across imputations" + } + if (diff) { + lab_fill <- "Difference in correlations* +" + lab_note <- "*observed minus imputed (pooled across imputations)" + } + # compute correlations + if (!imp | diff) { + corr <- stats::cov2cor(stats::cov(data.matrix(data[, vrb_matched]), use = "pairwise.complete.obs")) + } + if (imp) { + imps <- mice::complete(mids, "all") corrs <- purrr::map(imps, ~ { - stats::cor(.x) + stats::cov2cor(stats::cov(data.matrix(.x[, vrb_matched]), use = "pairwise.complete.obs")) }) - corr <- Reduce("+", corrs) / length(corrs) + if (diff) { + corr <- corr - (Reduce("+", corrs) / length(corrs)) + } else { + corr <- Reduce("+", corrs) / length(corrs) + } } + # convert correlations into plotting object p <- length(vrb_matched) long <- data.frame( vrb = rep(vrb_matched, each = p), prd = vrb_matched, corr = matrix(round(corr, 2), nrow = p * p, byrow = TRUE) ) + if (!diff) { + long$text <- long$corr + } + if (diff) { + long$text <- sprintf("%+.2f", long$corr) + long$text[long$corr < 0] <- long$corr[long$corr < 0] + } if (!diagonal) { long[long$vrb == long$prd, "corr"] <- NA + long[long$vrb == long$prd, "text"] <- "" } # create plot gg <- @@ -94,40 +128,43 @@ plot_corr <- ggplot2::aes( x = .data$prd, y = .data$vrb, - label = .data$corr, + label = .data$text, fill = .data$corr )) + ggplot2::geom_tile(color = "black", alpha = 0.6) + ggplot2::scale_x_discrete(limits = vrb_matched, position = "top") + ggplot2::scale_y_discrete(limits = rev(vrb_matched)) + - ggplot2::scale_fill_gradient2( - low = ggplot2::alpha("deepskyblue", 0.6), - mid = "lightyellow", - high = ggplot2::alpha("orangered", 0.6), - na.value = "grey90", - limits = c(-1, 1) - ) + + ggplot2::labs( + x = lab_x, + y = lab_y, + fill = lab_fill, + caption = lab_note + ) + theme_minimice() - lab_x <- "Imputation model predictor" - if (mice::is.mids(data)) { - lab_y <- "Column name" - lab_note <- "*pooled across imputations" - } else { - lab_y <- "Column name" - lab_note <- "*pairwise complete observations" - } - if (caption) { + # edit plot to match function arguments + if (!diff) { gg <- gg + - ggplot2::labs( - x = lab_x, - y = lab_y, - fill = "Correlation* - ", - caption = lab_note + ggplot2::scale_fill_gradient2( + low = ggplot2::alpha("deepskyblue", 0.6), + mid = "lightyellow", + high = ggplot2::alpha("orangered", 0.6), + na.value = "grey90", + limits = c(-1, 1) ) - } else { + } + if (diff) { + gg <- gg + ggplot2::scale_fill_gradient2( + low = ggplot2::alpha("deepskyblue", 1), + mid = "lightyellow", + high = ggplot2::alpha("orangered", 1), + na.value = "grey90", + limits = c(-2, 2) + ) + } + if (!caption) { + lab_fill <- substring(lab_fill, 1, nchar(lab_fill) - 2) gg <- gg + - ggplot2::labs(x = lab_x, y = lab_y, fill = "Correlation") + ggplot2::labs(fill = lab_fill, caption = NULL) } if (label) { gg <- From 73239114be4bfc629956813f40c660ef2e542c2e Mon Sep 17 00:00:00 2001 From: hanneoberman Date: Fri, 18 Oct 2024 14:10:50 +0200 Subject: [PATCH 3/5] update documentation for new function argument --- man/plot_corr.Rd | 3 +++ 1 file changed, 3 insertions(+) diff --git a/man/plot_corr.Rd b/man/plot_corr.Rd index a12af75..1222cac 100644 --- a/man/plot_corr.Rd +++ b/man/plot_corr.Rd @@ -7,6 +7,7 @@ plot_corr( data, vrb = "all", + diff = FALSE, label = FALSE, square = TRUE, diagonal = FALSE, @@ -19,6 +20,8 @@ plot_corr( \item{vrb}{String, vector, or unquoted expression with variable name(s), default is "all".} +\item{diff}{Logical indicating whether the difference between the observed and imputed data is plotted.} + \item{label}{Logical indicating whether correlation values should be displayed.} \item{square}{Logical indicating whether the plot tiles should be squares.} From 75b9958401aebd594acd55c5bb9e248e35193fb6 Mon Sep 17 00:00:00 2001 From: hanneoberman Date: Fri, 18 Oct 2024 14:41:10 +0200 Subject: [PATCH 4/5] change difference plot to imp-obs (instead of obs-imp) --- R/plot_corr.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/plot_corr.R b/R/plot_corr.R index 0772afc..44a001b 100644 --- a/R/plot_corr.R +++ b/R/plot_corr.R @@ -87,7 +87,7 @@ plot_corr <- if (diff) { lab_fill <- "Difference in correlations* " - lab_note <- "*observed minus imputed (pooled across imputations)" + lab_note <- "*pooled imputed minus observed" } # compute correlations if (!imp | diff) { @@ -99,7 +99,7 @@ plot_corr <- stats::cov2cor(stats::cov(data.matrix(.x[, vrb_matched]), use = "pairwise.complete.obs")) }) if (diff) { - corr <- corr - (Reduce("+", corrs) / length(corrs)) + corr <- (Reduce("+", corrs) / length(corrs)) - corr } else { corr <- Reduce("+", corrs) / length(corrs) } From 0aea399dbbad3ba373faacc58310196311014ff5 Mon Sep 17 00:00:00 2001 From: hanneoberman Date: Wed, 23 Oct 2024 16:19:43 +0200 Subject: [PATCH 5/5] make differences in correlations more pronounced in color --- R/plot_corr.R | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/R/plot_corr.R b/R/plot_corr.R index 44a001b..c079845 100644 --- a/R/plot_corr.R +++ b/R/plot_corr.R @@ -153,10 +153,15 @@ plot_corr <- ) } if (diff) { - gg <- gg + ggplot2::scale_fill_gradient2( - low = ggplot2::alpha("deepskyblue", 1), - mid = "lightyellow", - high = ggplot2::alpha("orangered", 1), + gg <- gg + ggplot2::scale_fill_gradientn( + colors = c( + "deepskyblue4", + "deepskyblue", + "lightyellow", + "orangered", + "orangered4" + ), + values = c(0, 0.475, 0.5, 0.525, 1), na.value = "grey90", limits = c(-2, 2) )