From e38ad6325fe9daa08f5e3ef565ae1bca0865ef24 Mon Sep 17 00:00:00 2001
From: rmflight
Date: Thu, 18 Jul 2024 19:49:43 -0400
Subject: [PATCH] whole lot of changes
- added tests for everything
- added cor_fast
- updated kt_fast
---
DESCRIPTION | 2 +-
NAMESPACE | 2 +
NEWS.md | 7 +
R/kendalltau.R | 273 ++++++++----------
R/left_censorship.R | 2 +
R/other_correlations.R | 174 +++++++++++
R/utils.R | 57 +++-
README.Rmd | 24 +-
README.html | 84 ++++--
README.md | 75 ++++-
docs/404.html | 2 +-
docs/CODE_OF_CONDUCT.html | 2 +-
docs/LICENSE-text.html | 2 +-
docs/LICENSE.html | 2 +-
docs/articles/ici-kendalltau.html | 10 +-
docs/articles/index.html | 2 +-
.../articles/testing-for-left-censorship.html | 2 +-
docs/authors.html | 6 +-
docs/index.html | 67 ++++-
docs/news/index.html | 9 +-
docs/pkgdown.yml | 4 +-
docs/reference/add_uniform_noise.html | 2 +-
docs/reference/calculate_matrix_medians.html | 2 +-
docs/reference/cor_fast.html | 139 +++++++++
docs/reference/cor_matrix_2_long_df.html | 2 +-
docs/reference/disable_logging.html | 2 +-
docs/reference/enable_logging.html | 2 +-
docs/reference/ici_kendalltau.html | 16 +-
docs/reference/ici_kendalltau_ref.html | 6 +-
docs/reference/ici_kt.html | 2 +-
docs/reference/index.html | 11 +-
docs/reference/kt_fast.html | 30 +-
docs/reference/log_memory.html | 2 +-
docs/reference/log_message.html | 2 +-
docs/reference/long_df_2_cor_matrix.html | 2 +-
docs/reference/missing_dataset.html | 2 +-
docs/reference/pairwise_completeness.html | 2 +-
docs/reference/rank_order_data.html | 2 +-
docs/reference/show_progress.html | 2 +-
docs/reference/test_left_censorship.html | 2 +-
docs/reference/yeast_missing.html | 2 +-
docs/search.json | 2 +-
docs/sitemap.xml | 3 +
man/cor_fast.Rd | 50 ++++
man/ici_kendalltau.Rd | 6 +
man/ici_kendalltau_ref.Rd | 2 +-
man/kt_fast.Rd | 20 +-
tests/testthat/_snaps/kendall-tau.md | 24 +-
tests/testthat/test-cor-fast.R | 47 +++
tests/testthat/test-kendall-tau.R | 50 +++-
50 files changed, 982 insertions(+), 262 deletions(-)
create mode 100644 R/other_correlations.R
create mode 100644 docs/reference/cor_fast.html
create mode 100644 man/cor_fast.Rd
create mode 100644 tests/testthat/test-cor-fast.R
diff --git a/DESCRIPTION b/DESCRIPTION
index 5967173..d7fd6e6 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
Package: ICIKendallTau
Title: Calculates information-content-informed Kendall-tau
-Version: 1.1.7
+Version: 1.2.0
Authors@R: c(
person(
given = c("Robert", "M"),
diff --git a/NAMESPACE b/NAMESPACE
index dc81870..49752fb 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -2,6 +2,7 @@
export(add_uniform_noise)
export(calculate_matrix_medians)
+export(cor_fast)
export(cor_matrix_2_long_df)
export(disable_logging)
export(enable_logging)
@@ -16,4 +17,5 @@ export(rank_order_data)
export(show_progress)
export(test_left_censorship)
importFrom(Rcpp,sourceCpp)
+importFrom(stats,median)
useDynLib(ICIKendallTau)
diff --git a/NEWS.md b/NEWS.md
index f8e0b3b..db7dfc6 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,10 @@
+# ICIKendallTau 1.2.0
+
+- Refactored much of `ici_kendalltau`, making the code more consistent and easier to extend, as well as providing more informative error messages. Thanks to @njtierney for suggestions.
+- Also refactored `kt_fast` to be more consistent and use more functions internally. **Note**: now returns matrix or data.frame regardless of whether passing simply two vectors or a matrix input.
+- Added `alternative` and `continuity` arguments to both `ici_kendalltau` and `kt_fast`.
+- Added `cor_fast` to allow running many iterations of `cor.test` on large matrix inputs if desired, with parallel processing to speed things up.
+
# ICIKendallTau 1.1.3
- makes `rank_order_data` take a sample class argument to enable splitting out by class.
diff --git a/R/kendalltau.R b/R/kendalltau.R
index 2353f53..ac09494 100644
--- a/R/kendalltau.R
+++ b/R/kendalltau.R
@@ -9,6 +9,8 @@
#' @param scale_max logical, should everything be scaled compared to the maximum correlation?
#' @param diag_good logical, should the diagonal entries reflect how many entries in the sample were "good"?
#' @param include_only only run the correlations that include the members (as a vector) or combinations (as a list or data.frame)
+#' @param alternative what is the alternative for the p-value test?
+#' @param continuity should a continuity correction be applied?
#' @param check_timing logical to determine should we try to estimate run time for full dataset? (default is FALSE)
#' @param return_matrix logical, should the data.frame or matrix result be returned?
#'
@@ -96,6 +98,8 @@ ici_kendalltau = function(data_matrix,
scale_max = TRUE,
diag_good = TRUE,
include_only = NULL,
+ alternative = "two.sided",
+ continuity = FALSE,
check_timing = FALSE,
return_matrix = TRUE){
@@ -129,6 +133,8 @@ ici_kendalltau = function(data_matrix,
which = "icikt",
include_arg = include_arg)
+ n_todo = sum(purrr::map_int(split_comparisons, nrow))
+
if (check_timing) {
sample_compare = sample(nrow(split_comparisons[[1]]), 5)
tmp_pairwise = split_comparisons[[1]][, sample_compare]
@@ -145,7 +151,7 @@ ici_kendalltau = function(data_matrix,
t1 = Sys.time()
# note here, this takes our list of comparisons, and then calls the do_split
# function above on each of them.
- split_cor = computation$split_fun(split_comparisons, ici_split, exclude_data, perspective, do_log_memory)
+ split_cor = computation$split_fun(split_comparisons, ici_split, exclude_data, perspective, do_log_memory, alternative, continuity)
t2 = Sys.time()
t_diff = as.numeric(difftime(t2, t1, units = "secs"))
@@ -242,13 +248,22 @@ setup_comparisons = function(samples,
} else if (which %in% "completeness") {
named_comparisons$missingness = Inf
named_comparisons$completeness = Inf
+ } else if (which %in% "kt") {
+ named_comparisons$tau = Inf
+ named_comparisons$pvalue = Inf
+ } else if (which %in% "pearson") {
+ named_comparisons$rho = Inf
+ named_comparisons$pvalue = Inf
+ } else if (which %in% "spearman") {
+ named_comparisons$rho = Inf
+ named_comparisons$pvalue = Inf
}
split_comparisons = split(named_comparisons, named_comparisons$core)
return(split_comparisons)
}
-ici_split = function(do_comparisons, exclude_data, perspective, do_log_memory) {
+ici_split = function(do_comparisons, exclude_data, perspective, do_log_memory, alternative, continuity) {
#seq_range = seq(in_range[1], in_range[2])
#print(seq_range)
@@ -259,7 +274,8 @@ ici_split = function(do_comparisons, exclude_data, perspective, do_log_memory) {
for (irow in seq_len(nrow(do_comparisons))) {
iloc = do_comparisons[irow, 1]
jloc = do_comparisons[irow, 2]
- ici_res = ici_kt(exclude_data[, iloc], exclude_data[, jloc], perspective = perspective)
+ ici_res = ici_kt(exclude_data[, iloc], exclude_data[, jloc], perspective = perspective,
+ alternative = alternative, continuity = continuity)
raw[irow] = ici_res["tau"]
pvalue[irow] = ici_res["pvalue"]
taumax[irow] = ici_res["tau_max"]
@@ -274,6 +290,53 @@ ici_split = function(do_comparisons, exclude_data, perspective, do_log_memory) {
do_comparisons
}
+kt_split = function(do_comparisons, x, na_method, do_log_memory) {
+ #seq_range = seq(in_range[1], in_range[2])
+ #print(seq_range)
+ tau = vector("numeric", nrow(do_comparisons))
+ pvalue = tau
+
+ for (irow in seq(1, nrow(do_comparisons))) {
+ return_na = FALSE
+ iloc = do_comparisons[irow, 1]
+ jloc = do_comparisons[irow, 2]
+
+ tmp_x = x[, iloc]
+ tmp_y = x[, jloc]
+ if (na_method %in% "pairwise.complete.obs") {
+ pair_good = !is.na(tmp_x) & !is.na(tmp_y)
+ if (sum(pair_good) == 0) {
+ return_na = TRUE
+ } else {
+ return_na = FALSE
+ tmp_x = tmp_x[pair_good]
+ tmp_y = tmp_y[pair_good]
+ }
+ } else if (na_method %in% c("everything", "all.obs")) {
+ return_na = any(is.na(c(tmp_x, tmp_y)))
+ }
+
+ if (return_na) {
+ tau[irow] = as.double(NA)
+ pvalue[irow] = as.double(NA)
+
+ } else {
+ ici_res = ici_kt(tmp_x, tmp_y)
+ tau[irow] = ici_res["tau"]
+ pvalue[irow] = ici_res["pvalue"]
+
+ }
+ if (do_log_memory && ((irow %% 100) == 0)) {
+ log_memory()
+ }
+
+ }
+ do_comparisons$tau = tau
+ do_comparisons$pvalue = pvalue
+ do_comparisons
+}
+
+
scale_and_reshape = function(split_cor,
n_good = n_good,
scale_max = scale_max,
@@ -334,7 +397,7 @@ scale_and_reshape = function(split_cor,
}
}
-#' fast kendall tau
+#' Fast kendall tau
#'
#' Uses the underlying c++ implementation of `ici_kt` to provide a fast version
#' of Kendall-tau correlation.
@@ -342,184 +405,92 @@ scale_and_reshape = function(split_cor,
#' @param x a numeric vector, matrix, or data frame.
#' @param y NULL (default) or a vector.
#' @param use an optional character string giving a method for computing correlations in the presence of missing values. This must be (an abbreviation of) one of the strings "everything", "all.obs", "complete.obs", or "pairwise.complete.obs".
+#' @param alternative the type of test
+#' @param continuity should a continuity correction be applied
#' @param return_matrix Should the matrices of values be returned, or a long data.frame
#'
#' @details Although the interface is *mostly* identical to the built-in
#' [stats::cor()] method, there are some differences.
#'
+#' * if only `x` is provided as a matrix or data.frame, the columns must be named.
#' * if providing both `x` and `y`, it is assumed they are both
#' single vectors.
#' * if `NA` values are present, this function does not error, but will either remove them
#' or return `NA`, depending on the option.
#' * "na.or.complete" is not a valid option for `use`.
-#' * A named vector or a named list with matrices is returned, with the `tau` and `pvalue` values.
+#' * A named list with matrices or data.frame is returned, with the `tau` and `pvalue` values.
#'
-#' @return a named vector or list of matrices.
+#' @return a list of matrices, tau, pvalue, or a data.frame.
#' @export
-kt_fast = function(x, y = NULL, use = "everything", return_matrix = TRUE)
+kt_fast = function(x, y = NULL, use = "everything",
+ alternative = "two.sided",
+ continuity = FALSE, return_matrix = TRUE)
{
do_log_memory = get("memory", envir = icikt_logger)
- # checking types
+ use_arg = rlang::caller_arg(use)
+ x_arg = rlang::caller_arg(x)
+ y_arg = rlang::caller_arg(y)
+
+ # checking use
na_method = match.arg(use, c("all.obs", "complete.obs", "pairwise.complete.obs",
"everything", "na.or.complete"))
#message(na_method)
- if (na_method %in% "na.or.complete") {
- stop("'na.or.complete' is not a supported use option.")
- }
- if (is.data.frame(y)) {
- y = as.matrix(y)
- }
-
- if (is.data.frame(x)) {
- x = as.matrix(x)
- }
-
- if (!is.matrix(x) && is.null(y)) {
- stop("supply both 'x' and 'y' or a matrix-like 'x'")
- }
-
- if (!(is.numeric(x) || is.logical(x))) {
- stop("'x' must be numeric")
- }
-
- if (!is.null(y)) {
- if (!(is.numeric(y) || is.logical(y)))
- stop("'y' must be numeric")
- stopifnot(is.atomic(y))
- }
+ # all of this top stuff can probably be wrapped in a function to parse
+ # the logic, and can borrow some of the other new helpers.
+ check_na_method(na_method)
+
+ x = check_x_y(x = x, y = y,
+ x_arg = x_arg,
+ y_arg = y_arg)
+
+ na_vals = is.na(x)
+ any_na = any(na_vals)
+ no_na_rows = rowSums(na_vals) == 0
+ sum_nona = sum(no_na_rows)
+
+ computation = check_furrr()
+
+ split_comparisons = setup_comparisons(colnames(x),
+ include_only = NULL,
+ diag_good = FALSE,
+ ncore = computation$ncore,
+ which = "kt",
+ include_arg = NULL)
- if (!is.null(y)) {
- if ((is.null(ncol(x))) && (is.null(ncol(y)))) {
- x = x
- any_na = any(is.na(c(x, y)))
- if (na_method %in% c("everything", "all.obs")) {
- if (any_na) {
- return(c("tau" = NA, "pvalue" = NA))
- } else {
- kt_vals = ici_kt(x, y)
- return(kt_vals[c("tau", "pvalue")])
- }
- } else if (na_method %in% c("complete.obs", "pairwise.complete.obs")) {
- na_x = is.na(x)
- na_y = is.na(y)
- not_na = !(na_x | na_y)
- if (length(not_na) == 0) {
- return(c("tau" = NA, "pvalue" = NA))
- }
- return(ici_kt(x[not_na], y[not_na]))
- }
+ do_computation = TRUE
+ if (na_method %in% c("everything", "all.obs")) {
+ if (any_na) {
+ do_computation = FALSE
}
}
-
- n_sample = ncol(x)
-
- # figure out if we are using furrr to process these
- if ("furrr" %in% utils::installed.packages()) {
- ncore = future::nbrOfWorkers()
- names(ncore) = NULL
- split_fun = furrr::future_map
- } else {
- ncore = 1
- split_fun = purrr::map
- }
-
- log_message("Figuring out comparisons to do ...")
- # generate the array of comparisons, 2 x ...,
- # where each column is a comparison between two columns of data
- pairwise_comparisons = utils::combn(n_sample, 2)
-
- extra_comparisons = matrix(rep(seq(1, n_sample), each = 2), nrow = 2, ncol = n_sample, byrow = FALSE)
- pairwise_comparisons = cbind(pairwise_comparisons, extra_comparisons)
-
- if (is.null(colnames(x))) {
- colnames(x) = seq(1, ncol(x))
- }
- named_comparisons = data.frame(s1 = colnames(x)[pairwise_comparisons[1, ]],
- s2 = colnames(x)[pairwise_comparisons[2, ]])
-
- n_todo = nrow(named_comparisons)
- log_message("Splitting up across compute ...")
- n_each = ceiling(n_todo / ncore)
-
- which_core = rep(seq(1, ncore), each = n_each)
- which_core = which_core[1:nrow(named_comparisons)]
-
- named_comparisons$core = which_core
- named_comparisons$tau = Inf
- named_comparisons$pvalue = Inf
-
- na_vals = is.na(x)
- do_calculation = TRUE
- if (na_method %in% "complete.obs") {
- no_na_rows = rowSums(na_vals) == 0
- if (sum(no_na_rows) == 0) {
- named_comparisons$tau = NA
- named_comparisons$pvalue = NA
- split_cor = split(named_comparisons, named_comparisons$core)
- do_calculation = FALSE
+
+ if (na_method %in% c("complete.obs", "pariwise.complete.obs")) {
+ if (sum_nona == 0) {
+ do_computation = FALSE
} else {
x = x[no_na_rows, , drop = FALSE]
}
}
+
+ if (!do_computation) {
+ split_cor = purrr::map(split_comparisons, \(in_split){
+ in_split$tau = NA
+ in_split$pvalue = NA
+ in_split
+ })
+ }
+
- split_comparisons = split(named_comparisons, named_comparisons$core)
-
-
- do_split = function(do_comparisons, x, na_method, do_log_memory) {
- #seq_range = seq(in_range[1], in_range[2])
- #print(seq_range)
- tau = vector("numeric", nrow(do_comparisons))
- pvalue = tau
-
- for (irow in seq(1, nrow(do_comparisons))) {
- return_na = FALSE
- iloc = do_comparisons[irow, 1]
- jloc = do_comparisons[irow, 2]
-
- tmp_x = x[, iloc]
- tmp_y = x[, jloc]
- if (na_method %in% "pairwise.complete.obs") {
- pair_good = !is.na(tmp_x) & !is.na(tmp_y)
- if (sum(pair_good) == 0) {
- return_na = TRUE
- } else {
- return_na = FALSE
- tmp_x = tmp_x[pair_good]
- tmp_y = tmp_y[pair_good]
- }
- } else if (na_method %in% c("everything", "all.obs")) {
- return_na = any(is.na(c(tmp_x, tmp_y)))
- }
-
- if (return_na) {
- tau[irow] = as.double(NA)
- pvalue[irow] = as.double(NA)
-
- } else {
- ici_res = ici_kt(tmp_x, tmp_y)
- tau[irow] = ici_res["tau"]
- pvalue[irow] = ici_res["pvalue"]
-
- }
- if (do_log_memory && ((irow %% 100) == 0)) {
- log_memory()
- }
-
- }
- do_comparisons$tau = tau
- do_comparisons$pvalue = pvalue
- do_comparisons
- }
# we record how much time is actually spent doing ICI-Kt
# itself, as some of the other operations will add a bit of time
#
- if (do_calculation) {
+ if (do_computation) {
t1 = Sys.time()
# note here, this takes our list of comparisons, and then calls the do_split
# function above on each of them.
- split_cor = split_fun(split_comparisons, do_split, x, na_method, do_log_memory)
+ split_cor = computation$split_fun(split_comparisons, kt_split, x, na_method, do_log_memory)
t2 = Sys.time()
t_diff = as.numeric(difftime(t2, t1, units = "secs"))
} else {
@@ -540,9 +511,6 @@ kt_fast = function(x, y = NULL, use = "everything", return_matrix = TRUE)
tau_matrix[one_way_index] = all_cor$tau
tau_matrix[back_way_index] = all_cor$tau
- tau_matrix[one_way_index] = all_cor$tau
- tau_matrix[back_way_index] = all_cor$tau
-
pvalue_matrix[one_way_index] = all_cor$pvalue
pvalue_matrix[back_way_index] = all_cor$pvalue
@@ -575,6 +543,7 @@ pairwise_completeness = function(data_matrix,
return_matrix = TRUE){
data_arg = rlang::caller_arg(data_matrix)
+ include_arg = rlang::caller_arg(include_only)
check_if_colnames_null(data_matrix, arg = data_arg)
data_matrix = transform_to_matrix(data_matrix, arg = data_arg)
diff --git a/R/left_censorship.R b/R/left_censorship.R
index da375f6..5cd1498 100644
--- a/R/left_censorship.R
+++ b/R/left_censorship.R
@@ -28,6 +28,8 @@
#' missingness$values
#' missingness$binomial_test
#'
+#' @importFrom stats median
+#'
#' @export
#' @return data.frame of trials / successes, and binom.test result
test_left_censorship = function(data_matrix,
diff --git a/R/other_correlations.R b/R/other_correlations.R
new file mode 100644
index 0000000..7504313
--- /dev/null
+++ b/R/other_correlations.R
@@ -0,0 +1,174 @@
+#' Fast correlation with test
+#'
+#' Allows to run `cor.test` on a matrix of inputs.
+#'
+#' @param x a numeric vector, matrix, or data frame.
+#' @param y NULL (default) or a vector.
+#' @param use an optional character string giving a method for computing correlations in the presence of missing values. This must be (an abbreviation of) one of the strings "everything", "all.obs", "complete.obs", or "pairwise.complete.obs".
+#' @param method which correlation method to use, "pearson" or "spearman"
+#' @param alternative how to perform the statistical test
+#' @param continuity should a continuity correction be applied
+#' @param return_matrix should the matrices of values be returned, or a long data.frame
+#'
+#' @details Although the interface is *mostly* identical to the built-in
+#' [stats::cor.test()] method, there are some differences.
+#'
+#' * if only `x` is provided as a matrix, the columns must be named.
+#' * if providing both `x` and `y`, it is assumed they are both
+#' single vectors.
+#' * if `NA` values are present, this function does not error, but will either remove them
+#' or return `NA`, depending on the option.
+#' * "na.or.complete" is not a valid option for `use`.
+#' * A named list with matrices or data.frame is returned, with the `rho` and `pvalue` values.
+#'
+#' @return a list of matrices, rho, pvalue, or a data.frame.
+#' @export
+cor_fast = function(x, y = NULL, use = "everything",
+ method = "pearson",
+ alternative = "two.sided",
+ continuity = FALSE,
+ return_matrix = TRUE)
+{
+ do_log_memory = get("memory", envir = icikt_logger)
+ use_arg = rlang::caller_arg(use)
+ x_arg = rlang::caller_arg(x)
+ y_arg = rlang::caller_arg(y)
+
+ # checking use
+ na_method = match.arg(use, c("all.obs", "complete.obs", "pairwise.complete.obs",
+ "everything", "na.or.complete"))
+
+ #message(na_method)
+ # all of this top stuff can probably be wrapped in a function to parse
+ # the logic, and can borrow some of the other new helpers.
+ check_na_method(na_method)
+
+ x = check_x_y(x = x, y = y,
+ x_arg = x_arg,
+ y_arg = y_arg)
+
+ na_vals = is.na(x)
+ any_na = any(na_vals)
+ no_na_rows = rowSums(na_vals) == 0
+ sum_nona = sum(no_na_rows)
+
+ computation = check_furrr()
+
+ split_comparisons = setup_comparisons(colnames(x),
+ include_only = NULL,
+ diag_good = FALSE,
+ ncore = computation$ncore,
+ which = method,
+ include_arg = NULL)
+
+ do_computation = TRUE
+ if (na_method %in% c("everything", "all.obs")) {
+ if (any_na) {
+ do_computation = FALSE
+ }
+ }
+
+ if (na_method %in% c("complete.obs", "pariwise.complete.obs")) {
+ if (sum_nona == 0) {
+ do_computation = FALSE
+ } else {
+ x = x[no_na_rows, , drop = FALSE]
+ }
+ }
+
+ if (!do_computation) {
+ split_cor = purrr::map(split_comparisons, \(in_split){
+ in_split$tau = NA
+ in_split$pvalue = NA
+ in_split
+ })
+ }
+
+
+
+ # we record how much time is actually spent doing ICI-Kt
+ # itself, as some of the other operations will add a bit of time
+ #
+ if (do_computation) {
+ t1 = Sys.time()
+ # note here, this takes our list of comparisons, and then calls the do_split
+ # function above on each of them.
+ split_cor = computation$split_fun(split_comparisons, cor_split, x, na_method, do_log_memory, method, alternative, continuity)
+ t2 = Sys.time()
+ t_diff = as.numeric(difftime(t2, t1, units = "secs"))
+ } else {
+ t_diff = 0
+ }
+
+ all_cor = purrr::list_rbind(split_cor)
+ if (return_matrix) {
+ log_message("Generating the output matrix ...")
+ cor_matrix = matrix(0, nrow = ncol(x), ncol = ncol(x))
+ rownames(cor_matrix) = colnames(cor_matrix) = colnames(x)
+ rho_matrix = cor_matrix
+ pvalue_matrix = cor_matrix
+
+ one_way_index = cbind(all_cor$s1, all_cor$s2)
+ back_way_index = cbind(all_cor$s2, all_cor$s1)
+
+ rho_matrix[one_way_index] = all_cor$rho
+ rho_matrix[back_way_index] = all_cor$rho
+
+ pvalue_matrix[one_way_index] = all_cor$pvalue
+ pvalue_matrix[back_way_index] = all_cor$pvalue
+
+ return(list(rho = rho_matrix, pvalue = pvalue_matrix, run_time = t_diff))
+ } else {
+ return(list(rho = all_cor, run_time = t_diff))
+ }
+
+}
+
+cor_split = function(do_comparisons, x, na_method, do_log_memory, method, alternative, continuity) {
+ #seq_range = seq(in_range[1], in_range[2])
+ #print(seq_range)
+ rho = vector("numeric", nrow(do_comparisons))
+ pvalue = rho
+
+ for (irow in seq(1, nrow(do_comparisons))) {
+ return_na = FALSE
+ iloc = do_comparisons[irow, 1]
+ jloc = do_comparisons[irow, 2]
+
+ tmp_x = x[, iloc]
+ tmp_y = x[, jloc]
+ if (na_method %in% "pairwise.complete.obs") {
+ pair_good = !is.na(tmp_x) & !is.na(tmp_y)
+ if (sum(pair_good) == 0) {
+ return_na = TRUE
+ } else {
+ return_na = FALSE
+ tmp_x = tmp_x[pair_good]
+ tmp_y = tmp_y[pair_good]
+ }
+ } else if (na_method %in% c("everything", "all.obs")) {
+ return_na = any(is.na(c(tmp_x, tmp_y)))
+ }
+
+ if (return_na) {
+ rho[irow] = as.double(NA)
+ pvalue[irow] = as.double(NA)
+
+ } else {
+ cor_res = stats::cor.test(tmp_x, tmp_y,
+ alternative = alternative,
+ method = method,
+ continuity = continuity)
+ rho[irow] = cor_res[["estimate"]][[1]]
+ pvalue[irow] = cor_res[["p.value"]]
+
+ }
+ if (do_log_memory && ((irow %% 100) == 0)) {
+ log_memory()
+ }
+
+ }
+ do_comparisons$rho = rho
+ do_comparisons$pvalue = pvalue
+ do_comparisons
+}
diff --git a/R/utils.R b/R/utils.R
index 6eed12f..10402ea 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -45,7 +45,7 @@ check_if_numeric = function(data_matrix,
cli::cli_abort(
message = c('{.arg {arg}} must be a numeric type.',
'x' = 'Currently, {.code class({arg}[1])} returns \\
- {class(data_matrix[1])}'),
+ {.obj_type_friendly {data_matrix[1]}}'),
call = call
)
}
@@ -77,4 +77,59 @@ check_furrr = function()
}
return(list(ncore = ncore,
split_fun = split_fun))
+}
+
+check_na_method = function(na_method,
+ na_arg = rlang::caller_arg(na_method),
+ na_call = rlang::caller_env())
+{
+ if (na_method %in% 'na.or.complete') {
+ cli::cli_abort(message = c(
+ '\'na.or.complete\' is not a supported value for {.arg use}.',
+ 'i' = 'Please use one of {.emph all.obs complete.obs pairwise.complete everthing}.'
+ ),
+ call = na_call)
+ }
+}
+
+check_x_y = function(x, y,
+ x_arg = rlang::caller_arg(x),
+ y_arg = rlang::caller_arg(y),
+ x_call = rlang::caller_env())
+{
+ dim_x = dim(x)
+
+ if (is.null(dim_x) && is.null(y)) {
+ cli::cli_abort(message = c(
+ '{.arg {x_arg}} and {.arg y} should both be provided as vectors, or {.arg {x_arg}} should be matrix-like.',
+ 'i' = '{.arg {x_arg}} is a single vector, and {.arg y} is `NULL`.'
+ ),
+ call = x_call)
+ }
+
+
+ if (!is.null(dim_x)) {
+ check_if_colnames_null(x, x_arg, x_call)
+ }
+ x = transform_to_matrix(x, x_arg, x_call)
+ check_if_numeric(x, x_arg, x_call)
+
+ if (!is.null(y)) {
+ dim_y = dim(y)
+
+ if (!is.null(dim_x) || !is.null(dim_y)) {
+ cli::cli_abort(message = c(
+ 'Both {.arg {x_arg}} and {.arg {y_arg}} must be vectors.',
+ 'i' = '{.code ncol({x_arg})} returns {ncol(x)}, {.code ncol({y_arg})} returns {ncol(y)}.'
+ ))
+ }
+
+ y = transform_to_matrix(y, y_arg, x_call)
+ check_if_numeric(y, y_arg, x_call)
+
+ x = cbind(x, y)
+ colnames(x) = make.names(c(x_arg, y_arg))
+
+ }
+ x
}
\ No newline at end of file
diff --git a/README.Rmd b/README.Rmd
index 0630658..fd1f560 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -118,8 +118,6 @@ y = rnorm(1000)
x2 = rnorm(40000)
y2 = rnorm(40000)
-library(microbenchmark)
-
microbenchmark(
cor(x, y, method = "kendall"),
ici_kt(x, y, "global"),
@@ -170,6 +168,28 @@ r_4 = ici_kendalltau(matrix_2, return_matrix = FALSE)
r_4
```
+## Other Correlations
+
+`ici_kendalltau` and `ici_kt` calculate the p-value of the correlation as part of the overall calculation.
+`stats::cor` does not, and `stats::cor.test` can only calculate the p-value for a single comparison of two vectors.
+It is sometimes advantageous to obtain p-values for a large number of correlations.
+We provide `cor_fast`, which works analogously to `kt_fast`, with the ability to choose `pearson` or `spearman` as the method.
+Note that if a matrix is provided, the columns must be named.
+
+```{r}
+#| label: cor-fast
+r_5 = cor_fast(x, y, method = "pearson")
+r_5
+```
+
+```{r}
+#| label: cor-fast2
+m_3 = cbind(x, y, x)
+colnames(m_3) = c("s1", "s2", "s3")
+r_6 = cor_fast(m_3)
+r_6
+```
+
## Code of Conduct
Please note that the ICIKendallTau project is released with a [Contributor Code of Conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html). By contributing to this project, you agree to abide by its terms.
diff --git a/README.html b/README.html
index 8dfafa2..09ed201 100644
--- a/README.html
+++ b/README.html
@@ -606,7 +606,7 @@
ICIKendallTau
-
+
You can see the pkgdown site here .
@@ -720,19 +720,17 @@ Is It Fast?
x2 = rnorm (40000 )
y2 = rnorm (40000 )
-library (microbenchmark)
-
-microbenchmark (
- cor (x, y, method = "kendall" ),
- ici_kt (x, y, "global" ),
- ici_kt (x2, y2, "global" ),
- times = 5
- )
-#> Unit: microseconds
-#> expr min lq mean median uq max neval
-#> cor(x, y, method = "kendall") 11666.371 11671.671 12405.0886 12084.801 13276.95 13325.649 5
-#> ici_kt(x, y, "global") 253.826 255.717 430.9002 277.985 317.24 1049.733 5
-#> ici_kt(x2, y2, "global") 13405.302 13731.770 15208.7108 14693.928 15415.66 18796.894 5
+microbenchmark (
+ cor (x, y, method = "kendall" ),
+ ici_kt (x, y, "global" ),
+ ici_kt (x2, y2, "global" ),
+ times = 5
+ )
+#> Unit: microseconds
+#> expr min lq mean median uq max neval
+#> cor(x, y, method = "kendall") 11755.570 11928.248 12437.300 12409.265 12841.010 13252.408 5
+#> ici_kt(x, y, "global") 239.267 248.566 413.841 263.888 269.249 1048.235 5
+#> ici_kt(x2, y2, "global") 13897.996 13991.561 15903.593 15191.136 18077.771 18359.500 5
In the case of 40,000 features, the average time on a modern CPU is
14 milliseconds.
Of course, if you want to use it to calculate Kendall-tau-b without
@@ -745,8 +743,18 @@
Is It Fast?
stats::cor
.
k_tau_fast = kt_fast (x, y)
k_tau_fast
-#> tau pvalue
-#> -0.003411411 0.871672260
+#> $tau
+#> x y
+#> x 1.000000000 -0.003411411
+#> y -0.003411411 1.000000000
+#>
+#> $pvalue
+#> x y
+#> x 0.0000000 0.8716723
+#> y 0.8716723 0.0000000
+#>
+#> $run_time
+#> [1] 0.02890801
Parallelism
If you have {future} and the {furrr} packages installed, then it is
also possible to split up the a set of matrix comparisons across compute
@@ -774,7 +782,49 @@
Many Many Comparisons
#> 3 s4 s4 0 1.0000000 0 1.000000 1.0000000
#>
#> $run_time
-#> [1] 0.01783729
+#> [1] 0.02034473
+Other Correlations
+ici_kendalltau
and ici_kt
calculate the
+p-value of the correlation as part of the overall calculation.
+stats::cor
does not, and stats::cor.test
can
+only calculate the p-value for a single comparison of two vectors. It is
+sometimes advantageous to obtain p-values for a large number of
+correlations. We provide cor_fast
, which works analogously
+to kt_fast
, with the ability to choose pearson
+or spearman
as the method. Note that if a matrix is
+provided, the columns must be named.
+ r_5 = cor_fast (x, y, method = "pearson" )
+ r_5
+#> $rho
+#> x y
+#> x 1.00000000 0.00720612
+#> y 0.00720612 1.00000000
+#>
+#> $pvalue
+#> x y
+#> x 0.0000000 0.8199608
+#> y 0.8199608 0.0000000
+#>
+#> $run_time
+#> [1] 0.02450991
+ m_3 = cbind (x, y, x)
+colnames (m_3) = c ("s1" , "s2" , "s3" )
+ r_6 = cor_fast (m_3)
+ r_6
+#> $rho
+#> s1 s2 s3
+#> s1 1.00000000 0.00720612 1.00000000
+#> s2 0.00720612 1.00000000 0.00720612
+#> s3 1.00000000 0.00720612 1.00000000
+#>
+#> $pvalue
+#> s1 s2 s3
+#> s1 0.0000000 0.8199608 0.0000000
+#> s2 0.8199608 0.0000000 0.8199608
+#> s3 0.0000000 0.8199608 0.0000000
+#>
+#> $run_time
+#> [1] 0.0231781
Code of Conduct
Please note that the ICIKendallTau project is released with a Contributor
Code of Conduct . By contributing to this project, you agree to abide
diff --git a/README.md b/README.md
index 680ba20..97f7af9 100644
--- a/README.md
+++ b/README.md
@@ -136,8 +136,6 @@ y = rnorm(1000)
x2 = rnorm(40000)
y2 = rnorm(40000)
-library(microbenchmark)
-
microbenchmark(
cor(x, y, method = "kendall"),
ici_kt(x, y, "global"),
@@ -145,10 +143,10 @@ microbenchmark(
times = 5
)
#> Unit: microseconds
-#> expr min lq mean median uq max neval
-#> cor(x, y, method = "kendall") 11666.371 11671.671 12405.0886 12084.801 13276.95 13325.649 5
-#> ici_kt(x, y, "global") 253.826 255.717 430.9002 277.985 317.24 1049.733 5
-#> ici_kt(x2, y2, "global") 13405.302 13731.770 15208.7108 14693.928 15415.66 18796.894 5
+#> expr min lq mean median uq max neval
+#> cor(x, y, method = "kendall") 11755.570 11928.248 12437.300 12409.265 12841.010 13252.408 5
+#> ici_kt(x, y, "global") 239.267 248.566 413.841 263.888 269.249 1048.235 5
+#> ici_kt(x2, y2, "global") 13897.996 13991.561 15903.593 15191.136 18077.771 18359.500 5
```
In the case of 40,000 features, the average time on a modern CPU is 14
@@ -169,8 +167,18 @@ treats `NA` values similarly to `stats::cor`.
``` r
k_tau_fast = kt_fast(x, y)
k_tau_fast
-#> tau pvalue
-#> -0.003411411 0.871672260
+#> $tau
+#> x y
+#> x 1.000000000 -0.003411411
+#> y -0.003411411 1.000000000
+#>
+#> $pvalue
+#> x y
+#> x 0.0000000 0.8716723
+#> y 0.8716723 0.0000000
+#>
+#> $run_time
+#> [1] 0.02890801
```
## Parallelism
@@ -207,7 +215,56 @@ r_4
#> 3 s4 s4 0 1.0000000 0 1.000000 1.0000000
#>
#> $run_time
-#> [1] 0.01783729
+#> [1] 0.02034473
+```
+
+## Other Correlations
+
+`ici_kendalltau` and `ici_kt` calculate the p-value of the correlation
+as part of the overall calculation. `stats::cor` does not, and
+`stats::cor.test` can only calculate the p-value for a single comparison
+of two vectors. It is sometimes advantageous to obtain p-values for a
+large number of correlations. We provide `cor_fast`, which works
+analogously to `kt_fast`, with the ability to choose `pearson` or
+`spearman` as the method. Note that if a matrix is provided, the columns
+must be named.
+
+``` r
+r_5 = cor_fast(x, y, method = "pearson")
+r_5
+#> $rho
+#> x y
+#> x 1.00000000 0.00720612
+#> y 0.00720612 1.00000000
+#>
+#> $pvalue
+#> x y
+#> x 0.0000000 0.8199608
+#> y 0.8199608 0.0000000
+#>
+#> $run_time
+#> [1] 0.02450991
+```
+
+``` r
+m_3 = cbind(x, y, x)
+colnames(m_3) = c("s1", "s2", "s3")
+r_6 = cor_fast(m_3)
+r_6
+#> $rho
+#> s1 s2 s3
+#> s1 1.00000000 0.00720612 1.00000000
+#> s2 0.00720612 1.00000000 0.00720612
+#> s3 1.00000000 0.00720612 1.00000000
+#>
+#> $pvalue
+#> s1 s2 s3
+#> s1 0.0000000 0.8199608 0.0000000
+#> s2 0.8199608 0.0000000 0.8199608
+#> s3 0.0000000 0.8199608 0.0000000
+#>
+#> $run_time
+#> [1] 0.0231781
```
## Code of Conduct
diff --git a/docs/404.html b/docs/404.html
index ad1ac45..6427b5a 100644
--- a/docs/404.html
+++ b/docs/404.html
@@ -24,7 +24,7 @@
ICIKendallTau
- 1.1.3
+ 1.2.0
diff --git a/docs/CODE_OF_CONDUCT.html b/docs/CODE_OF_CONDUCT.html
index 9425034..8a1c41e 100644
--- a/docs/CODE_OF_CONDUCT.html
+++ b/docs/CODE_OF_CONDUCT.html
@@ -10,7 +10,7 @@
ICIKendallTau
- 1.1.3
+ 1.2.0
diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html
index ab4f9e4..90ed8ba 100644
--- a/docs/LICENSE-text.html
+++ b/docs/LICENSE-text.html
@@ -10,7 +10,7 @@
ICIKendallTau
- 1.1.3
+ 1.2.0
diff --git a/docs/LICENSE.html b/docs/LICENSE.html
index 18edbcb..ab6fb93 100644
--- a/docs/LICENSE.html
+++ b/docs/LICENSE.html
@@ -10,7 +10,7 @@
ICIKendallTau
- 1.1.3
+ 1.2.0
diff --git a/docs/articles/ici-kendalltau.html b/docs/articles/ici-kendalltau.html
index f056162..0c7711f 100644
--- a/docs/articles/ici-kendalltau.html
+++ b/docs/articles/ici-kendalltau.html
@@ -26,7 +26,7 @@
ICIKendallTau
- 1.1.3
+ 1.2.0
@@ -329,11 +329,11 @@