From e38ad6325fe9daa08f5e3ef565ae1bca0865ef24 Mon Sep 17 00:00:00 2001 From: rmflight Date: Thu, 18 Jul 2024 19:49:43 -0400 Subject: [PATCH] whole lot of changes - added tests for everything - added cor_fast - updated kt_fast --- DESCRIPTION | 2 +- NAMESPACE | 2 + NEWS.md | 7 + R/kendalltau.R | 273 ++++++++---------- R/left_censorship.R | 2 + R/other_correlations.R | 174 +++++++++++ R/utils.R | 57 +++- README.Rmd | 24 +- README.html | 84 ++++-- README.md | 75 ++++- docs/404.html | 2 +- docs/CODE_OF_CONDUCT.html | 2 +- docs/LICENSE-text.html | 2 +- docs/LICENSE.html | 2 +- docs/articles/ici-kendalltau.html | 10 +- docs/articles/index.html | 2 +- .../articles/testing-for-left-censorship.html | 2 +- docs/authors.html | 6 +- docs/index.html | 67 ++++- docs/news/index.html | 9 +- docs/pkgdown.yml | 4 +- docs/reference/add_uniform_noise.html | 2 +- docs/reference/calculate_matrix_medians.html | 2 +- docs/reference/cor_fast.html | 139 +++++++++ docs/reference/cor_matrix_2_long_df.html | 2 +- docs/reference/disable_logging.html | 2 +- docs/reference/enable_logging.html | 2 +- docs/reference/ici_kendalltau.html | 16 +- docs/reference/ici_kendalltau_ref.html | 6 +- docs/reference/ici_kt.html | 2 +- docs/reference/index.html | 11 +- docs/reference/kt_fast.html | 30 +- docs/reference/log_memory.html | 2 +- docs/reference/log_message.html | 2 +- docs/reference/long_df_2_cor_matrix.html | 2 +- docs/reference/missing_dataset.html | 2 +- docs/reference/pairwise_completeness.html | 2 +- docs/reference/rank_order_data.html | 2 +- docs/reference/show_progress.html | 2 +- docs/reference/test_left_censorship.html | 2 +- docs/reference/yeast_missing.html | 2 +- docs/search.json | 2 +- docs/sitemap.xml | 3 + man/cor_fast.Rd | 50 ++++ man/ici_kendalltau.Rd | 6 + man/ici_kendalltau_ref.Rd | 2 +- man/kt_fast.Rd | 20 +- tests/testthat/_snaps/kendall-tau.md | 24 +- tests/testthat/test-cor-fast.R | 47 +++ tests/testthat/test-kendall-tau.R | 50 +++- 50 files changed, 982 insertions(+), 262 deletions(-) create mode 100644 R/other_correlations.R create mode 100644 docs/reference/cor_fast.html create mode 100644 man/cor_fast.Rd create mode 100644 tests/testthat/test-cor-fast.R diff --git a/DESCRIPTION b/DESCRIPTION index 5967173..d7fd6e6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ICIKendallTau Title: Calculates information-content-informed Kendall-tau -Version: 1.1.7 +Version: 1.2.0 Authors@R: c( person( given = c("Robert", "M"), diff --git a/NAMESPACE b/NAMESPACE index dc81870..49752fb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export(add_uniform_noise) export(calculate_matrix_medians) +export(cor_fast) export(cor_matrix_2_long_df) export(disable_logging) export(enable_logging) @@ -16,4 +17,5 @@ export(rank_order_data) export(show_progress) export(test_left_censorship) importFrom(Rcpp,sourceCpp) +importFrom(stats,median) useDynLib(ICIKendallTau) diff --git a/NEWS.md b/NEWS.md index f8e0b3b..db7dfc6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# ICIKendallTau 1.2.0 + +- Refactored much of `ici_kendalltau`, making the code more consistent and easier to extend, as well as providing more informative error messages. Thanks to @njtierney for suggestions. +- Also refactored `kt_fast` to be more consistent and use more functions internally. **Note**: now returns matrix or data.frame regardless of whether passing simply two vectors or a matrix input. +- Added `alternative` and `continuity` arguments to both `ici_kendalltau` and `kt_fast`. +- Added `cor_fast` to allow running many iterations of `cor.test` on large matrix inputs if desired, with parallel processing to speed things up. + # ICIKendallTau 1.1.3 - makes `rank_order_data` take a sample class argument to enable splitting out by class. diff --git a/R/kendalltau.R b/R/kendalltau.R index 2353f53..ac09494 100644 --- a/R/kendalltau.R +++ b/R/kendalltau.R @@ -9,6 +9,8 @@ #' @param scale_max logical, should everything be scaled compared to the maximum correlation? #' @param diag_good logical, should the diagonal entries reflect how many entries in the sample were "good"? #' @param include_only only run the correlations that include the members (as a vector) or combinations (as a list or data.frame) +#' @param alternative what is the alternative for the p-value test? +#' @param continuity should a continuity correction be applied? #' @param check_timing logical to determine should we try to estimate run time for full dataset? (default is FALSE) #' @param return_matrix logical, should the data.frame or matrix result be returned? #' @@ -96,6 +98,8 @@ ici_kendalltau = function(data_matrix, scale_max = TRUE, diag_good = TRUE, include_only = NULL, + alternative = "two.sided", + continuity = FALSE, check_timing = FALSE, return_matrix = TRUE){ @@ -129,6 +133,8 @@ ici_kendalltau = function(data_matrix, which = "icikt", include_arg = include_arg) + n_todo = sum(purrr::map_int(split_comparisons, nrow)) + if (check_timing) { sample_compare = sample(nrow(split_comparisons[[1]]), 5) tmp_pairwise = split_comparisons[[1]][, sample_compare] @@ -145,7 +151,7 @@ ici_kendalltau = function(data_matrix, t1 = Sys.time() # note here, this takes our list of comparisons, and then calls the do_split # function above on each of them. - split_cor = computation$split_fun(split_comparisons, ici_split, exclude_data, perspective, do_log_memory) + split_cor = computation$split_fun(split_comparisons, ici_split, exclude_data, perspective, do_log_memory, alternative, continuity) t2 = Sys.time() t_diff = as.numeric(difftime(t2, t1, units = "secs")) @@ -242,13 +248,22 @@ setup_comparisons = function(samples, } else if (which %in% "completeness") { named_comparisons$missingness = Inf named_comparisons$completeness = Inf + } else if (which %in% "kt") { + named_comparisons$tau = Inf + named_comparisons$pvalue = Inf + } else if (which %in% "pearson") { + named_comparisons$rho = Inf + named_comparisons$pvalue = Inf + } else if (which %in% "spearman") { + named_comparisons$rho = Inf + named_comparisons$pvalue = Inf } split_comparisons = split(named_comparisons, named_comparisons$core) return(split_comparisons) } -ici_split = function(do_comparisons, exclude_data, perspective, do_log_memory) { +ici_split = function(do_comparisons, exclude_data, perspective, do_log_memory, alternative, continuity) { #seq_range = seq(in_range[1], in_range[2]) #print(seq_range) @@ -259,7 +274,8 @@ ici_split = function(do_comparisons, exclude_data, perspective, do_log_memory) { for (irow in seq_len(nrow(do_comparisons))) { iloc = do_comparisons[irow, 1] jloc = do_comparisons[irow, 2] - ici_res = ici_kt(exclude_data[, iloc], exclude_data[, jloc], perspective = perspective) + ici_res = ici_kt(exclude_data[, iloc], exclude_data[, jloc], perspective = perspective, + alternative = alternative, continuity = continuity) raw[irow] = ici_res["tau"] pvalue[irow] = ici_res["pvalue"] taumax[irow] = ici_res["tau_max"] @@ -274,6 +290,53 @@ ici_split = function(do_comparisons, exclude_data, perspective, do_log_memory) { do_comparisons } +kt_split = function(do_comparisons, x, na_method, do_log_memory) { + #seq_range = seq(in_range[1], in_range[2]) + #print(seq_range) + tau = vector("numeric", nrow(do_comparisons)) + pvalue = tau + + for (irow in seq(1, nrow(do_comparisons))) { + return_na = FALSE + iloc = do_comparisons[irow, 1] + jloc = do_comparisons[irow, 2] + + tmp_x = x[, iloc] + tmp_y = x[, jloc] + if (na_method %in% "pairwise.complete.obs") { + pair_good = !is.na(tmp_x) & !is.na(tmp_y) + if (sum(pair_good) == 0) { + return_na = TRUE + } else { + return_na = FALSE + tmp_x = tmp_x[pair_good] + tmp_y = tmp_y[pair_good] + } + } else if (na_method %in% c("everything", "all.obs")) { + return_na = any(is.na(c(tmp_x, tmp_y))) + } + + if (return_na) { + tau[irow] = as.double(NA) + pvalue[irow] = as.double(NA) + + } else { + ici_res = ici_kt(tmp_x, tmp_y) + tau[irow] = ici_res["tau"] + pvalue[irow] = ici_res["pvalue"] + + } + if (do_log_memory && ((irow %% 100) == 0)) { + log_memory() + } + + } + do_comparisons$tau = tau + do_comparisons$pvalue = pvalue + do_comparisons +} + + scale_and_reshape = function(split_cor, n_good = n_good, scale_max = scale_max, @@ -334,7 +397,7 @@ scale_and_reshape = function(split_cor, } } -#' fast kendall tau +#' Fast kendall tau #' #' Uses the underlying c++ implementation of `ici_kt` to provide a fast version #' of Kendall-tau correlation. @@ -342,184 +405,92 @@ scale_and_reshape = function(split_cor, #' @param x a numeric vector, matrix, or data frame. #' @param y NULL (default) or a vector. #' @param use an optional character string giving a method for computing correlations in the presence of missing values. This must be (an abbreviation of) one of the strings "everything", "all.obs", "complete.obs", or "pairwise.complete.obs". +#' @param alternative the type of test +#' @param continuity should a continuity correction be applied #' @param return_matrix Should the matrices of values be returned, or a long data.frame #' #' @details Although the interface is *mostly* identical to the built-in #' [stats::cor()] method, there are some differences. #' +#' * if only `x` is provided as a matrix or data.frame, the columns must be named. #' * if providing both `x` and `y`, it is assumed they are both #' single vectors. #' * if `NA` values are present, this function does not error, but will either remove them #' or return `NA`, depending on the option. #' * "na.or.complete" is not a valid option for `use`. -#' * A named vector or a named list with matrices is returned, with the `tau` and `pvalue` values. +#' * A named list with matrices or data.frame is returned, with the `tau` and `pvalue` values. #' -#' @return a named vector or list of matrices. +#' @return a list of matrices, tau, pvalue, or a data.frame. #' @export -kt_fast = function(x, y = NULL, use = "everything", return_matrix = TRUE) +kt_fast = function(x, y = NULL, use = "everything", + alternative = "two.sided", + continuity = FALSE, return_matrix = TRUE) { do_log_memory = get("memory", envir = icikt_logger) - # checking types + use_arg = rlang::caller_arg(use) + x_arg = rlang::caller_arg(x) + y_arg = rlang::caller_arg(y) + + # checking use na_method = match.arg(use, c("all.obs", "complete.obs", "pairwise.complete.obs", "everything", "na.or.complete")) #message(na_method) - if (na_method %in% "na.or.complete") { - stop("'na.or.complete' is not a supported use option.") - } - if (is.data.frame(y)) { - y = as.matrix(y) - } - - if (is.data.frame(x)) { - x = as.matrix(x) - } - - if (!is.matrix(x) && is.null(y)) { - stop("supply both 'x' and 'y' or a matrix-like 'x'") - } - - if (!(is.numeric(x) || is.logical(x))) { - stop("'x' must be numeric") - } - - if (!is.null(y)) { - if (!(is.numeric(y) || is.logical(y))) - stop("'y' must be numeric") - stopifnot(is.atomic(y)) - } + # all of this top stuff can probably be wrapped in a function to parse + # the logic, and can borrow some of the other new helpers. + check_na_method(na_method) + + x = check_x_y(x = x, y = y, + x_arg = x_arg, + y_arg = y_arg) + + na_vals = is.na(x) + any_na = any(na_vals) + no_na_rows = rowSums(na_vals) == 0 + sum_nona = sum(no_na_rows) + + computation = check_furrr() + + split_comparisons = setup_comparisons(colnames(x), + include_only = NULL, + diag_good = FALSE, + ncore = computation$ncore, + which = "kt", + include_arg = NULL) - if (!is.null(y)) { - if ((is.null(ncol(x))) && (is.null(ncol(y)))) { - x = x - any_na = any(is.na(c(x, y))) - if (na_method %in% c("everything", "all.obs")) { - if (any_na) { - return(c("tau" = NA, "pvalue" = NA)) - } else { - kt_vals = ici_kt(x, y) - return(kt_vals[c("tau", "pvalue")]) - } - } else if (na_method %in% c("complete.obs", "pairwise.complete.obs")) { - na_x = is.na(x) - na_y = is.na(y) - not_na = !(na_x | na_y) - if (length(not_na) == 0) { - return(c("tau" = NA, "pvalue" = NA)) - } - return(ici_kt(x[not_na], y[not_na])) - } + do_computation = TRUE + if (na_method %in% c("everything", "all.obs")) { + if (any_na) { + do_computation = FALSE } } - - n_sample = ncol(x) - - # figure out if we are using furrr to process these - if ("furrr" %in% utils::installed.packages()) { - ncore = future::nbrOfWorkers() - names(ncore) = NULL - split_fun = furrr::future_map - } else { - ncore = 1 - split_fun = purrr::map - } - - log_message("Figuring out comparisons to do ...") - # generate the array of comparisons, 2 x ..., - # where each column is a comparison between two columns of data - pairwise_comparisons = utils::combn(n_sample, 2) - - extra_comparisons = matrix(rep(seq(1, n_sample), each = 2), nrow = 2, ncol = n_sample, byrow = FALSE) - pairwise_comparisons = cbind(pairwise_comparisons, extra_comparisons) - - if (is.null(colnames(x))) { - colnames(x) = seq(1, ncol(x)) - } - named_comparisons = data.frame(s1 = colnames(x)[pairwise_comparisons[1, ]], - s2 = colnames(x)[pairwise_comparisons[2, ]]) - - n_todo = nrow(named_comparisons) - log_message("Splitting up across compute ...") - n_each = ceiling(n_todo / ncore) - - which_core = rep(seq(1, ncore), each = n_each) - which_core = which_core[1:nrow(named_comparisons)] - - named_comparisons$core = which_core - named_comparisons$tau = Inf - named_comparisons$pvalue = Inf - - na_vals = is.na(x) - do_calculation = TRUE - if (na_method %in% "complete.obs") { - no_na_rows = rowSums(na_vals) == 0 - if (sum(no_na_rows) == 0) { - named_comparisons$tau = NA - named_comparisons$pvalue = NA - split_cor = split(named_comparisons, named_comparisons$core) - do_calculation = FALSE + + if (na_method %in% c("complete.obs", "pariwise.complete.obs")) { + if (sum_nona == 0) { + do_computation = FALSE } else { x = x[no_na_rows, , drop = FALSE] } } + + if (!do_computation) { + split_cor = purrr::map(split_comparisons, \(in_split){ + in_split$tau = NA + in_split$pvalue = NA + in_split + }) + } + - split_comparisons = split(named_comparisons, named_comparisons$core) - - - do_split = function(do_comparisons, x, na_method, do_log_memory) { - #seq_range = seq(in_range[1], in_range[2]) - #print(seq_range) - tau = vector("numeric", nrow(do_comparisons)) - pvalue = tau - - for (irow in seq(1, nrow(do_comparisons))) { - return_na = FALSE - iloc = do_comparisons[irow, 1] - jloc = do_comparisons[irow, 2] - - tmp_x = x[, iloc] - tmp_y = x[, jloc] - if (na_method %in% "pairwise.complete.obs") { - pair_good = !is.na(tmp_x) & !is.na(tmp_y) - if (sum(pair_good) == 0) { - return_na = TRUE - } else { - return_na = FALSE - tmp_x = tmp_x[pair_good] - tmp_y = tmp_y[pair_good] - } - } else if (na_method %in% c("everything", "all.obs")) { - return_na = any(is.na(c(tmp_x, tmp_y))) - } - - if (return_na) { - tau[irow] = as.double(NA) - pvalue[irow] = as.double(NA) - - } else { - ici_res = ici_kt(tmp_x, tmp_y) - tau[irow] = ici_res["tau"] - pvalue[irow] = ici_res["pvalue"] - - } - if (do_log_memory && ((irow %% 100) == 0)) { - log_memory() - } - - } - do_comparisons$tau = tau - do_comparisons$pvalue = pvalue - do_comparisons - } # we record how much time is actually spent doing ICI-Kt # itself, as some of the other operations will add a bit of time # - if (do_calculation) { + if (do_computation) { t1 = Sys.time() # note here, this takes our list of comparisons, and then calls the do_split # function above on each of them. - split_cor = split_fun(split_comparisons, do_split, x, na_method, do_log_memory) + split_cor = computation$split_fun(split_comparisons, kt_split, x, na_method, do_log_memory) t2 = Sys.time() t_diff = as.numeric(difftime(t2, t1, units = "secs")) } else { @@ -540,9 +511,6 @@ kt_fast = function(x, y = NULL, use = "everything", return_matrix = TRUE) tau_matrix[one_way_index] = all_cor$tau tau_matrix[back_way_index] = all_cor$tau - tau_matrix[one_way_index] = all_cor$tau - tau_matrix[back_way_index] = all_cor$tau - pvalue_matrix[one_way_index] = all_cor$pvalue pvalue_matrix[back_way_index] = all_cor$pvalue @@ -575,6 +543,7 @@ pairwise_completeness = function(data_matrix, return_matrix = TRUE){ data_arg = rlang::caller_arg(data_matrix) + include_arg = rlang::caller_arg(include_only) check_if_colnames_null(data_matrix, arg = data_arg) data_matrix = transform_to_matrix(data_matrix, arg = data_arg) diff --git a/R/left_censorship.R b/R/left_censorship.R index da375f6..5cd1498 100644 --- a/R/left_censorship.R +++ b/R/left_censorship.R @@ -28,6 +28,8 @@ #' missingness$values #' missingness$binomial_test #' +#' @importFrom stats median +#' #' @export #' @return data.frame of trials / successes, and binom.test result test_left_censorship = function(data_matrix, diff --git a/R/other_correlations.R b/R/other_correlations.R new file mode 100644 index 0000000..7504313 --- /dev/null +++ b/R/other_correlations.R @@ -0,0 +1,174 @@ +#' Fast correlation with test +#' +#' Allows to run `cor.test` on a matrix of inputs. +#' +#' @param x a numeric vector, matrix, or data frame. +#' @param y NULL (default) or a vector. +#' @param use an optional character string giving a method for computing correlations in the presence of missing values. This must be (an abbreviation of) one of the strings "everything", "all.obs", "complete.obs", or "pairwise.complete.obs". +#' @param method which correlation method to use, "pearson" or "spearman" +#' @param alternative how to perform the statistical test +#' @param continuity should a continuity correction be applied +#' @param return_matrix should the matrices of values be returned, or a long data.frame +#' +#' @details Although the interface is *mostly* identical to the built-in +#' [stats::cor.test()] method, there are some differences. +#' +#' * if only `x` is provided as a matrix, the columns must be named. +#' * if providing both `x` and `y`, it is assumed they are both +#' single vectors. +#' * if `NA` values are present, this function does not error, but will either remove them +#' or return `NA`, depending on the option. +#' * "na.or.complete" is not a valid option for `use`. +#' * A named list with matrices or data.frame is returned, with the `rho` and `pvalue` values. +#' +#' @return a list of matrices, rho, pvalue, or a data.frame. +#' @export +cor_fast = function(x, y = NULL, use = "everything", + method = "pearson", + alternative = "two.sided", + continuity = FALSE, + return_matrix = TRUE) +{ + do_log_memory = get("memory", envir = icikt_logger) + use_arg = rlang::caller_arg(use) + x_arg = rlang::caller_arg(x) + y_arg = rlang::caller_arg(y) + + # checking use + na_method = match.arg(use, c("all.obs", "complete.obs", "pairwise.complete.obs", + "everything", "na.or.complete")) + + #message(na_method) + # all of this top stuff can probably be wrapped in a function to parse + # the logic, and can borrow some of the other new helpers. + check_na_method(na_method) + + x = check_x_y(x = x, y = y, + x_arg = x_arg, + y_arg = y_arg) + + na_vals = is.na(x) + any_na = any(na_vals) + no_na_rows = rowSums(na_vals) == 0 + sum_nona = sum(no_na_rows) + + computation = check_furrr() + + split_comparisons = setup_comparisons(colnames(x), + include_only = NULL, + diag_good = FALSE, + ncore = computation$ncore, + which = method, + include_arg = NULL) + + do_computation = TRUE + if (na_method %in% c("everything", "all.obs")) { + if (any_na) { + do_computation = FALSE + } + } + + if (na_method %in% c("complete.obs", "pariwise.complete.obs")) { + if (sum_nona == 0) { + do_computation = FALSE + } else { + x = x[no_na_rows, , drop = FALSE] + } + } + + if (!do_computation) { + split_cor = purrr::map(split_comparisons, \(in_split){ + in_split$tau = NA + in_split$pvalue = NA + in_split + }) + } + + + + # we record how much time is actually spent doing ICI-Kt + # itself, as some of the other operations will add a bit of time + # + if (do_computation) { + t1 = Sys.time() + # note here, this takes our list of comparisons, and then calls the do_split + # function above on each of them. + split_cor = computation$split_fun(split_comparisons, cor_split, x, na_method, do_log_memory, method, alternative, continuity) + t2 = Sys.time() + t_diff = as.numeric(difftime(t2, t1, units = "secs")) + } else { + t_diff = 0 + } + + all_cor = purrr::list_rbind(split_cor) + if (return_matrix) { + log_message("Generating the output matrix ...") + cor_matrix = matrix(0, nrow = ncol(x), ncol = ncol(x)) + rownames(cor_matrix) = colnames(cor_matrix) = colnames(x) + rho_matrix = cor_matrix + pvalue_matrix = cor_matrix + + one_way_index = cbind(all_cor$s1, all_cor$s2) + back_way_index = cbind(all_cor$s2, all_cor$s1) + + rho_matrix[one_way_index] = all_cor$rho + rho_matrix[back_way_index] = all_cor$rho + + pvalue_matrix[one_way_index] = all_cor$pvalue + pvalue_matrix[back_way_index] = all_cor$pvalue + + return(list(rho = rho_matrix, pvalue = pvalue_matrix, run_time = t_diff)) + } else { + return(list(rho = all_cor, run_time = t_diff)) + } + +} + +cor_split = function(do_comparisons, x, na_method, do_log_memory, method, alternative, continuity) { + #seq_range = seq(in_range[1], in_range[2]) + #print(seq_range) + rho = vector("numeric", nrow(do_comparisons)) + pvalue = rho + + for (irow in seq(1, nrow(do_comparisons))) { + return_na = FALSE + iloc = do_comparisons[irow, 1] + jloc = do_comparisons[irow, 2] + + tmp_x = x[, iloc] + tmp_y = x[, jloc] + if (na_method %in% "pairwise.complete.obs") { + pair_good = !is.na(tmp_x) & !is.na(tmp_y) + if (sum(pair_good) == 0) { + return_na = TRUE + } else { + return_na = FALSE + tmp_x = tmp_x[pair_good] + tmp_y = tmp_y[pair_good] + } + } else if (na_method %in% c("everything", "all.obs")) { + return_na = any(is.na(c(tmp_x, tmp_y))) + } + + if (return_na) { + rho[irow] = as.double(NA) + pvalue[irow] = as.double(NA) + + } else { + cor_res = stats::cor.test(tmp_x, tmp_y, + alternative = alternative, + method = method, + continuity = continuity) + rho[irow] = cor_res[["estimate"]][[1]] + pvalue[irow] = cor_res[["p.value"]] + + } + if (do_log_memory && ((irow %% 100) == 0)) { + log_memory() + } + + } + do_comparisons$rho = rho + do_comparisons$pvalue = pvalue + do_comparisons +} diff --git a/R/utils.R b/R/utils.R index 6eed12f..10402ea 100644 --- a/R/utils.R +++ b/R/utils.R @@ -45,7 +45,7 @@ check_if_numeric = function(data_matrix, cli::cli_abort( message = c('{.arg {arg}} must be a numeric type.', 'x' = 'Currently, {.code class({arg}[1])} returns \\ - {class(data_matrix[1])}'), + {.obj_type_friendly {data_matrix[1]}}'), call = call ) } @@ -77,4 +77,59 @@ check_furrr = function() } return(list(ncore = ncore, split_fun = split_fun)) +} + +check_na_method = function(na_method, + na_arg = rlang::caller_arg(na_method), + na_call = rlang::caller_env()) +{ + if (na_method %in% 'na.or.complete') { + cli::cli_abort(message = c( + '\'na.or.complete\' is not a supported value for {.arg use}.', + 'i' = 'Please use one of {.emph all.obs complete.obs pairwise.complete everthing}.' + ), + call = na_call) + } +} + +check_x_y = function(x, y, + x_arg = rlang::caller_arg(x), + y_arg = rlang::caller_arg(y), + x_call = rlang::caller_env()) +{ + dim_x = dim(x) + + if (is.null(dim_x) && is.null(y)) { + cli::cli_abort(message = c( + '{.arg {x_arg}} and {.arg y} should both be provided as vectors, or {.arg {x_arg}} should be matrix-like.', + 'i' = '{.arg {x_arg}} is a single vector, and {.arg y} is `NULL`.' + ), + call = x_call) + } + + + if (!is.null(dim_x)) { + check_if_colnames_null(x, x_arg, x_call) + } + x = transform_to_matrix(x, x_arg, x_call) + check_if_numeric(x, x_arg, x_call) + + if (!is.null(y)) { + dim_y = dim(y) + + if (!is.null(dim_x) || !is.null(dim_y)) { + cli::cli_abort(message = c( + 'Both {.arg {x_arg}} and {.arg {y_arg}} must be vectors.', + 'i' = '{.code ncol({x_arg})} returns {ncol(x)}, {.code ncol({y_arg})} returns {ncol(y)}.' + )) + } + + y = transform_to_matrix(y, y_arg, x_call) + check_if_numeric(y, y_arg, x_call) + + x = cbind(x, y) + colnames(x) = make.names(c(x_arg, y_arg)) + + } + x } \ No newline at end of file diff --git a/README.Rmd b/README.Rmd index 0630658..fd1f560 100644 --- a/README.Rmd +++ b/README.Rmd @@ -118,8 +118,6 @@ y = rnorm(1000) x2 = rnorm(40000) y2 = rnorm(40000) -library(microbenchmark) - microbenchmark( cor(x, y, method = "kendall"), ici_kt(x, y, "global"), @@ -170,6 +168,28 @@ r_4 = ici_kendalltau(matrix_2, return_matrix = FALSE) r_4 ``` +## Other Correlations + +`ici_kendalltau` and `ici_kt` calculate the p-value of the correlation as part of the overall calculation. +`stats::cor` does not, and `stats::cor.test` can only calculate the p-value for a single comparison of two vectors. +It is sometimes advantageous to obtain p-values for a large number of correlations. +We provide `cor_fast`, which works analogously to `kt_fast`, with the ability to choose `pearson` or `spearman` as the method. +Note that if a matrix is provided, the columns must be named. + +```{r} +#| label: cor-fast +r_5 = cor_fast(x, y, method = "pearson") +r_5 +``` + +```{r} +#| label: cor-fast2 +m_3 = cbind(x, y, x) +colnames(m_3) = c("s1", "s2", "s3") +r_6 = cor_fast(m_3) +r_6 +``` + ## Code of Conduct Please note that the ICIKendallTau project is released with a [Contributor Code of Conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html). By contributing to this project, you agree to abide by its terms. diff --git a/README.html b/README.html index 8dfafa2..09ed201 100644 --- a/README.html +++ b/README.html @@ -606,7 +606,7 @@

ICIKendallTau

-

ICIKendallTau status badge

+

ICIKendallTau status badge

You can see the pkgdown site here.

@@ -720,19 +720,17 @@

Is It Fast?

x2 = rnorm(40000) y2 = rnorm(40000) -library(microbenchmark) - -microbenchmark( - cor(x, y, method = "kendall"), - ici_kt(x, y, "global"), - ici_kt(x2, y2, "global"), - times = 5 -) -#> Unit: microseconds -#> expr min lq mean median uq max neval -#> cor(x, y, method = "kendall") 11666.371 11671.671 12405.0886 12084.801 13276.95 13325.649 5 -#> ici_kt(x, y, "global") 253.826 255.717 430.9002 277.985 317.24 1049.733 5 -#> ici_kt(x2, y2, "global") 13405.302 13731.770 15208.7108 14693.928 15415.66 18796.894 5 +microbenchmark( + cor(x, y, method = "kendall"), + ici_kt(x, y, "global"), + ici_kt(x2, y2, "global"), + times = 5 +) +#> Unit: microseconds +#> expr min lq mean median uq max neval +#> cor(x, y, method = "kendall") 11755.570 11928.248 12437.300 12409.265 12841.010 13252.408 5 +#> ici_kt(x, y, "global") 239.267 248.566 413.841 263.888 269.249 1048.235 5 +#> ici_kt(x2, y2, "global") 13897.996 13991.561 15903.593 15191.136 18077.771 18359.500 5

In the case of 40,000 features, the average time on a modern CPU is 14 milliseconds.

Of course, if you want to use it to calculate Kendall-tau-b without @@ -745,8 +743,18 @@

Is It Fast?

stats::cor.

k_tau_fast = kt_fast(x, y)
 k_tau_fast
-#>          tau       pvalue 
-#> -0.003411411  0.871672260
+#> $tau +#> x y +#> x 1.000000000 -0.003411411 +#> y -0.003411411 1.000000000 +#> +#> $pvalue +#> x y +#> x 0.0000000 0.8716723 +#> y 0.8716723 0.0000000 +#> +#> $run_time +#> [1] 0.02890801

Parallelism

If you have {future} and the {furrr} packages installed, then it is also possible to split up the a set of matrix comparisons across compute @@ -774,7 +782,49 @@

Many Many Comparisons

#> 3 s4 s4 0 1.0000000 0 1.000000 1.0000000 #> #> $run_time -#> [1] 0.01783729 +#> [1] 0.02034473 +

Other Correlations

+

ici_kendalltau and ici_kt calculate the +p-value of the correlation as part of the overall calculation. +stats::cor does not, and stats::cor.test can +only calculate the p-value for a single comparison of two vectors. It is +sometimes advantageous to obtain p-values for a large number of +correlations. We provide cor_fast, which works analogously +to kt_fast, with the ability to choose pearson +or spearman as the method. Note that if a matrix is +provided, the columns must be named.

+
r_5 = cor_fast(x, y, method = "pearson")
+r_5
+#> $rho
+#>            x          y
+#> x 1.00000000 0.00720612
+#> y 0.00720612 1.00000000
+#> 
+#> $pvalue
+#>           x         y
+#> x 0.0000000 0.8199608
+#> y 0.8199608 0.0000000
+#> 
+#> $run_time
+#> [1] 0.02450991
+
m_3 = cbind(x, y, x)
+colnames(m_3) = c("s1", "s2", "s3")
+r_6 = cor_fast(m_3)
+r_6
+#> $rho
+#>            s1         s2         s3
+#> s1 1.00000000 0.00720612 1.00000000
+#> s2 0.00720612 1.00000000 0.00720612
+#> s3 1.00000000 0.00720612 1.00000000
+#> 
+#> $pvalue
+#>           s1        s2        s3
+#> s1 0.0000000 0.8199608 0.0000000
+#> s2 0.8199608 0.0000000 0.8199608
+#> s3 0.0000000 0.8199608 0.0000000
+#> 
+#> $run_time
+#> [1] 0.0231781

Code of Conduct

Please note that the ICIKendallTau project is released with a Contributor Code of Conduct. By contributing to this project, you agree to abide diff --git a/README.md b/README.md index 680ba20..97f7af9 100644 --- a/README.md +++ b/README.md @@ -136,8 +136,6 @@ y = rnorm(1000) x2 = rnorm(40000) y2 = rnorm(40000) -library(microbenchmark) - microbenchmark( cor(x, y, method = "kendall"), ici_kt(x, y, "global"), @@ -145,10 +143,10 @@ microbenchmark( times = 5 ) #> Unit: microseconds -#> expr min lq mean median uq max neval -#> cor(x, y, method = "kendall") 11666.371 11671.671 12405.0886 12084.801 13276.95 13325.649 5 -#> ici_kt(x, y, "global") 253.826 255.717 430.9002 277.985 317.24 1049.733 5 -#> ici_kt(x2, y2, "global") 13405.302 13731.770 15208.7108 14693.928 15415.66 18796.894 5 +#> expr min lq mean median uq max neval +#> cor(x, y, method = "kendall") 11755.570 11928.248 12437.300 12409.265 12841.010 13252.408 5 +#> ici_kt(x, y, "global") 239.267 248.566 413.841 263.888 269.249 1048.235 5 +#> ici_kt(x2, y2, "global") 13897.996 13991.561 15903.593 15191.136 18077.771 18359.500 5 ``` In the case of 40,000 features, the average time on a modern CPU is 14 @@ -169,8 +167,18 @@ treats `NA` values similarly to `stats::cor`. ``` r k_tau_fast = kt_fast(x, y) k_tau_fast -#> tau pvalue -#> -0.003411411 0.871672260 +#> $tau +#> x y +#> x 1.000000000 -0.003411411 +#> y -0.003411411 1.000000000 +#> +#> $pvalue +#> x y +#> x 0.0000000 0.8716723 +#> y 0.8716723 0.0000000 +#> +#> $run_time +#> [1] 0.02890801 ``` ## Parallelism @@ -207,7 +215,56 @@ r_4 #> 3 s4 s4 0 1.0000000 0 1.000000 1.0000000 #> #> $run_time -#> [1] 0.01783729 +#> [1] 0.02034473 +``` + +## Other Correlations + +`ici_kendalltau` and `ici_kt` calculate the p-value of the correlation +as part of the overall calculation. `stats::cor` does not, and +`stats::cor.test` can only calculate the p-value for a single comparison +of two vectors. It is sometimes advantageous to obtain p-values for a +large number of correlations. We provide `cor_fast`, which works +analogously to `kt_fast`, with the ability to choose `pearson` or +`spearman` as the method. Note that if a matrix is provided, the columns +must be named. + +``` r +r_5 = cor_fast(x, y, method = "pearson") +r_5 +#> $rho +#> x y +#> x 1.00000000 0.00720612 +#> y 0.00720612 1.00000000 +#> +#> $pvalue +#> x y +#> x 0.0000000 0.8199608 +#> y 0.8199608 0.0000000 +#> +#> $run_time +#> [1] 0.02450991 +``` + +``` r +m_3 = cbind(x, y, x) +colnames(m_3) = c("s1", "s2", "s3") +r_6 = cor_fast(m_3) +r_6 +#> $rho +#> s1 s2 s3 +#> s1 1.00000000 0.00720612 1.00000000 +#> s2 0.00720612 1.00000000 0.00720612 +#> s3 1.00000000 0.00720612 1.00000000 +#> +#> $pvalue +#> s1 s2 s3 +#> s1 0.0000000 0.8199608 0.0000000 +#> s2 0.8199608 0.0000000 0.8199608 +#> s3 0.0000000 0.8199608 0.0000000 +#> +#> $run_time +#> [1] 0.0231781 ``` ## Code of Conduct diff --git a/docs/404.html b/docs/404.html index ad1ac45..6427b5a 100644 --- a/docs/404.html +++ b/docs/404.html @@ -24,7 +24,7 @@ ICIKendallTau - 1.1.3 + 1.2.0 + +

+ + + +
+
+
+ +
+

Allows to run cor.test on a matrix of inputs.

+
+ +
+

Usage

+
cor_fast(
+  x,
+  y = NULL,
+  use = "everything",
+  method = "pearson",
+  alternative = "two.sided",
+  continuity = FALSE,
+  return_matrix = TRUE
+)
+
+ +
+

Arguments

+
x
+

a numeric vector, matrix, or data frame.

+ + +
y
+

NULL (default) or a vector.

+ + +
use
+

an optional character string giving a method for computing correlations in the presence of missing values. This must be (an abbreviation of) one of the strings "everything", "all.obs", "complete.obs", or "pairwise.complete.obs".

+ + +
method
+

which correlation method to use, "pearson" or "spearman"

+ + +
alternative
+

how to perform the statistical test

+ + +
continuity
+

should a continuity correction be applied

+ + +
return_matrix
+

should the matrices of values be returned, or a long data.frame

+ +
+
+

Value

+ + +

a list of matrices, rho, pvalue, or a data.frame.

+
+
+

Details

+

Although the interface is mostly identical to the built-in +stats::cor.test() method, there are some differences.

  • if only x is provided as a matrix, the columns must be named.

  • +
  • if providing both x and y, it is assumed they are both +single vectors.

  • +
  • if NA values are present, this function does not error, but will either remove them +or return NA, depending on the option.

  • +
  • "na.or.complete" is not a valid option for use.

  • +
  • A named list with matrices or data.frame is returned, with the rho and pvalue values.

  • +
+ +
+ + +
+ + + +
+ + + + + + + diff --git a/docs/reference/cor_matrix_2_long_df.html b/docs/reference/cor_matrix_2_long_df.html index 0893901..b1fdcc4 100644 --- a/docs/reference/cor_matrix_2_long_df.html +++ b/docs/reference/cor_matrix_2_long_df.html @@ -10,7 +10,7 @@ ICIKendallTau - 1.1.3 + 1.2.0