diff --git a/DESCRIPTION b/DESCRIPTION index f8fdbae..9e2498f 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: tglkmeans Title: Efficient Implementation of K-Means++ Algorithm -Version: 0.4.0 +Version: 0.5.0 Authors@R: c( person("Aviezer", "Lifshitz", , "aviezer.lifshitz@weizmann.ac.il", role = c("aut", "cre")), person("Amos", "Tanay", role = "aut"), @@ -30,6 +30,8 @@ Imports: future, ggplot2 (>= 2.2.0), magrittr, + Matrix, + methods, parallel (>= 3.3.2), plyr (>= 1.8.4), purrr (>= 0.2.0), diff --git a/NAMESPACE b/NAMESPACE index afb2449..4e4625d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,7 @@ export("%>%") export(TGL_kmeans) export(TGL_kmeans_tidy) +export(downsample_matrix) export(simulate_data) export(tglkmeans.set_parallel) import(dplyr) diff --git a/NEWS.md b/NEWS.md index 7844b04..938561e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# tglkmeans 0.5.0 + +* Added `dowsample_matrix` function to downsample the columns of a count matrix to a target number. + # tglkmeans 0.4.0 * Default of `id_column` parameter was changed to `FALSE`. Note that this is a breaking change, and if you want to use an id column, you need to set it explicitly to `TRUE`. diff --git a/R/RcppExports.R b/R/RcppExports.R index 8e3121b..ff2cd0c 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -13,3 +13,11 @@ TGL_kmeans_cpp <- function(ids, mat, k, metric, max_iter = 40, min_delta = 0.000 .Call('_tglkmeans_TGL_kmeans_cpp', PACKAGE = 'tglkmeans', ids, mat, k, metric, max_iter, min_delta, use_cpp_random, seed) } +downsample_matrix_cpp <- function(input, samples, random_seed) { + .Call('_tglkmeans_downsample_matrix_cpp', PACKAGE = 'tglkmeans', input, samples, random_seed) +} + +rcpp_downsample_sparse <- function(matrix, samples, random_seed) { + .Call('_tglkmeans_rcpp_downsample_sparse', PACKAGE = 'tglkmeans', matrix, samples, random_seed) +} + diff --git a/R/downsample.R b/R/downsample.R new file mode 100644 index 0000000..d6d3f4b --- /dev/null +++ b/R/downsample.R @@ -0,0 +1,109 @@ +#' Downsample the columns of a matrix to a target number +#' +#' @description This function takes a matrix and downsamples it to a target number of samples. +#' It uses a random seed for reproducibility and allows for removing columns with +#' small sums. +#' +#' @param mat An integer matrix to be downsampled. Can be a matrix or sparse matrix (dgCMatrix). +#' If the matrix contains NAs, the function will run significantly slower. Values that are +#' not integers will be coerced to integers using {.code floor()}. +#' @param target_n The target number of samples to downsample to. +#' @param target_q A target quantile of sums to downsample to. Only one of {.field target_n} or {.field target_q} can be provided. +#' @param seed The random seed for reproducibility (default is NULL) +#' @param remove_columns Logical indicating whether to remove columns with small sums (default is FALSE) +#' +#' @return The downsampled matrix +#' +#' @examples +#' \dontshow{ +#' tglkmeans.set_parallel(1) +#' } +#' +#' mat <- matrix(1:12, nrow = 4) +#' downsample_matrix(mat, 2) +#' +#' # Remove columns with small sums +#' downsample_matrix(mat, 12, remove_columns = TRUE) +#' +#' # sparse matrix +#' mat_sparse <- Matrix::Matrix(mat, sparse = TRUE) +#' downsample_matrix(mat_sparse, 2) +#' +#' # with a quantile +#' downsample_matrix(mat, target_q = 0.5) +#' +#' @export +downsample_matrix <- function(mat, target_n = NULL, target_q = NULL, seed = NULL, remove_columns = FALSE) { + if (is.null(target_n) && is.null(target_q)) { + cli_abort("Either {.field target_n} or {.field target_q} must be provided.") + } else if (!is.null(target_n) && !is.null(target_q)) { + cli_abort("Only one of {.field target_n} or {.field target_q} can be provided.") + } + + sums <- colsums_matrix(mat) + if (!is.null(target_q)) { + target_n <- round(stats::quantile(sums, target_q)) + cli::cli_alert_info("Using {.val {target_n}} as the target number.") + } + + if (is.null(seed)) { + seed <- sample(1:10000, 1) + cli::cli_alert_warning("No seed provided. Using {.val {seed}}.") + } else if (!is.numeric(seed) || seed <= 0 || seed != as.integer(seed)) { + cli_abort("{.field seed} must be a positive integer.") + } + + if (!is.logical(remove_columns)) { + cli_abort("{.field remove_columns} must be a logical value.") + } + + if (target_n <= 0 || target_n != as.integer(target_n)) { + cli_abort("{.field target_n} must be a positive integer.") + } + + # replace NAs with 0s for the cpp code + has_nas <- FALSE + if (any(is.na(mat))) { + has_nas <- TRUE + cli_warn("Input matrix contains NAs. Processing would be significantly slower.") + orig_mat <- mat + mat[is.na(mat)] <- 0 + } + + if (methods::is(mat, "dgCMatrix")) { + ds_mat <- rcpp_downsample_sparse(mat, target_n, seed) + } else if (is.matrix(mat)) { + ds_mat <- downsample_matrix_cpp(mat, target_n, seed) + } + + small_cols <- sums < target_n + if (any(small_cols)) { + if (remove_columns) { + ds_mat <- ds_mat[, !small_cols, drop = FALSE] + if (has_nas) { + orig_mat <- orig_mat[, !small_cols, drop = FALSE] + } + cli_alert_info("Removed {.val {sum(small_cols)}} columns with a sum smaller than {.val {target_n}}.") + } else { + cli_warn("{.val {sum(small_cols)}} columns have a sum smaller than {.val {target_n}}. These columns were not changed. To remove them, set {.field remove_columns=TRUE}.") + } + } + + if (has_nas) { + # put back the NAs + ds_mat[is.na(orig_mat)] <- NA + } + + + return(ds_mat) +} + +colsums_matrix <- function(mat) { + if (methods::is(mat, "dgCMatrix")) { + return(Matrix::colSums(mat, na.rm = TRUE)) + } else if (is.matrix(mat)) { + return(colSums(mat, na.rm = TRUE)) + } else { + cli_abort("Input must be a matrix or a sparse matrix (dgCMatrix). class of {.field mat} is {.val {class(mat)}}.") + } +} diff --git a/_pkgdown.yml b/_pkgdown.yml index 30c405b..f55ec4c 100755 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -27,6 +27,10 @@ reference: - contents: - TGL_kmeans - TGL_kmeans_tidy +- title: Matrix + desc: matrix utility functions +- contents: + - downsample_matrix - title: misc desc: utility functions - contents: diff --git a/man/TGL_kmeans.Rd b/man/TGL_kmeans.Rd index a3ef07a..f3679b2 100644 --- a/man/TGL_kmeans.Rd +++ b/man/TGL_kmeans.Rd @@ -9,7 +9,7 @@ TGL_kmeans( k, metric = "euclid", max_iter = 40, - min_delta = 1e-04, + min_delta = 0.0001, verbose = FALSE, keep_log = FALSE, id_column = FALSE, diff --git a/man/TGL_kmeans_tidy.Rd b/man/TGL_kmeans_tidy.Rd index f304ba8..567060e 100644 --- a/man/TGL_kmeans_tidy.Rd +++ b/man/TGL_kmeans_tidy.Rd @@ -9,7 +9,7 @@ TGL_kmeans_tidy( k, metric = "euclid", max_iter = 40, - min_delta = 1e-04, + min_delta = 0.0001, verbose = FALSE, keep_log = FALSE, id_column = FALSE, diff --git a/man/downsample_matrix.Rd b/man/downsample_matrix.Rd new file mode 100644 index 0000000..db41148 --- /dev/null +++ b/man/downsample_matrix.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/downsample.R +\name{downsample_matrix} +\alias{downsample_matrix} +\title{Downsample the columns of a matrix to a target number} +\usage{ +downsample_matrix( + mat, + target_n = NULL, + target_q = NULL, + seed = NULL, + remove_columns = FALSE +) +} +\arguments{ +\item{mat}{An integer matrix to be downsampled. Can be a matrix or sparse matrix (dgCMatrix). +If the matrix contains NAs, the function will run significantly slower. Values that are +not integers will be coerced to integers using {.code floor()}.} + +\item{target_n}{The target number of samples to downsample to.} + +\item{target_q}{A target quantile of sums to downsample to. Only one of {.field target_n} or {.field target_q} can be provided.} + +\item{seed}{The random seed for reproducibility (default is NULL)} + +\item{remove_columns}{Logical indicating whether to remove columns with small sums (default is FALSE)} +} +\value{ +The downsampled matrix +} +\description{ +This function takes a matrix and downsamples it to a target number of samples. +It uses a random seed for reproducibility and allows for removing columns with +small sums. +} +\examples{ +\dontshow{ +tglkmeans.set_parallel(1) +} + +mat <- matrix(1:12, nrow = 4) +downsample_matrix(mat, 2) + +# Remove columns with small sums +downsample_matrix(mat, 12, remove_columns = TRUE) + +# sparse matrix +mat_sparse <- Matrix::Matrix(mat, sparse = TRUE) +downsample_matrix(mat_sparse, 2) + +# with a quantile +downsample_matrix(mat, target_q = 0.5) + +} diff --git a/src/DownsampleWorker.cpp b/src/DownsampleWorker.cpp new file mode 100644 index 0000000..b4e6507 --- /dev/null +++ b/src/DownsampleWorker.cpp @@ -0,0 +1,152 @@ +#include +#include +#include +#include +#include +#include +#include "DownsampleWorker.h" + +typedef float float32_t; +typedef double float64_t; +typedef unsigned char uint8_t; +typedef unsigned int uint_t; + +static size_t +ceil_power_of_two(const size_t size) { + return size_t(1) << size_t(ceil(log2(float64_t(size)))); +} + +template +static void initialize_tree(const std::vector& input, std::vector& tree) { + assert(input.size() >= 2); + + size_t input_size = ceil_power_of_two(input.size()); + tree.resize(2 * input_size - 1); // Resize the tree to the necessary size + + // Copy input to the beginning of the tree and fill the rest with zeros + std::copy(input.begin(), input.end(), tree.begin()); + std::fill(tree.begin() + input.size(), tree.begin() + input_size, 0); + + // Iteratively build the tree + size_t tree_index = 0; + while (input_size > 1) { + size_t half_size = input_size / 2; + for (size_t index = 0; index < half_size; ++index) { + const auto left = tree[tree_index + index * 2]; + const auto right = tree[tree_index + index * 2 + 1]; + tree[tree_index + input_size + index] = left + right; + + assert(left >= 0); + assert(right >= 0); + assert(tree[tree_index + input_size + index] == size_t(left) + size_t(right)); + } + tree_index += input_size; + input_size = half_size; + } + assert(tree.size() == 2 * input.size() - 1); +} + +static size_t random_sample(std::vector& tree, ssize_t random) { + size_t size_of_level = 1; + ssize_t base_of_level = tree.size() - 1; + size_t index_in_level = 0; + size_t index_in_tree = base_of_level + index_in_level; + + while (true) { + assert(index_in_tree == base_of_level + index_in_level); + assert(tree[index_in_tree] > random); + + --tree[index_in_tree]; + size_of_level *= 2; + base_of_level -= size_of_level; + + if (base_of_level < 0) { + return index_in_level; + } + + index_in_level *= 2; + index_in_tree = base_of_level + index_in_level; + ssize_t right_random = random - ssize_t(tree[index_in_tree]); + + assert(tree[base_of_level + index_in_level] + tree[base_of_level + index_in_level + 1] == + tree[base_of_level + size_of_level + index_in_level / 2] + 1); + + if (right_random >= 0) { + ++index_in_level; + ++index_in_tree; + assert(index_in_level < size_of_level); + random = right_random; + } + } +} + +template +static void downsample_slice(const std::vector& input, std::vector& output, const int32_t samples, const size_t random_seed) { + assert(output.size() == input.size()); + + if (samples < 0 || input.size() == 0) { + return; + } + + if (input.size() == 1) { + output[0] = O(static_cast(samples) < static_cast(input[0]) ? samples : input[0]); + return; + } + + size_t input_size = ceil_power_of_two(input.size()); + std::vector tree(2 * input_size - 1); + initialize_tree(input, tree); + size_t& total = tree[tree.size() - 1]; + + if (total <= static_cast(samples)) { + std::copy(input.begin(), input.end(), output.begin()); + return; + } + + std::fill(output.begin(), output.end(), O(0)); + + std::minstd_rand random(random_seed); + for (size_t index = 0; index < static_cast(samples); ++index) { + size_t sampled_index = random_sample(tree, random() % total); + if (sampled_index < output.size()) { + ++output[sampled_index]; + } + } +} + +DownsampleWorker::DownsampleWorker(const Rcpp::IntegerMatrix& input, Rcpp::IntegerMatrix& output, int samples, unsigned int random_seed) + : input_matrix(input), output_matrix(output), samples(samples), random_seed(random_seed) {} + +void DownsampleWorker::operator()(std::size_t begin, std::size_t end) { + for (std::size_t col = begin; col < end; ++col) { + std::vector input_vec(input_matrix.column(col).begin(), input_matrix.column(col).end()); + std::vector output_vec(input_vec.size(), 0); + + downsample_slice(input_vec, output_vec, samples, random_seed); + + std::copy(output_vec.begin(), output_vec.end(), output_matrix.column(col).begin()); + } +} + +DownsampleWorkerSparse::DownsampleWorkerSparse(const Rcpp::IntegerVector& i, const Rcpp::IntegerVector& p, const Rcpp::IntegerVector& x, + Rcpp::IntegerVector& out_x, int samples, unsigned int random_seed) + : input_i(i), input_p(p), input_x(x), output_x(out_x), samples(samples), random_seed(random_seed) {} + +void DownsampleWorkerSparse::operator()(std::size_t begin, std::size_t end) { + for (std::size_t col = begin; col < end; ++col) { + // Extract the current column from the sparse matrix + std::vector input_vec; + for (int idx = input_p[col]; idx < input_p[col + 1]; ++idx) { + input_vec.push_back(input_x[idx]); + } + + std::vector output_vec(input_vec.size(), 0); + + downsample_slice(input_vec, output_vec, samples, random_seed); + + // Store results in the output sparse matrix + for (int idx = input_p[col], out_idx = 0; idx < input_p[col + 1]; ++idx, ++out_idx) { + output_x[idx] = output_vec[out_idx]; + } + } +} diff --git a/src/DownsampleWorker.h b/src/DownsampleWorker.h new file mode 100644 index 0000000..53c944c --- /dev/null +++ b/src/DownsampleWorker.h @@ -0,0 +1,40 @@ +#ifndef DOWNSAMPLEWORKER_H +#define DOWNSAMPLEWORKER_H + +#include +#include +#include + +class DownsampleWorker : public RcppParallel::Worker { +private: + RcppParallel::RMatrix input_matrix; + RcppParallel::RMatrix output_matrix; + int samples; + unsigned int random_seed; + +public: + DownsampleWorker(const Rcpp::IntegerMatrix& input, Rcpp::IntegerMatrix& output, int samples, unsigned int random_seed); + + // Parallel operator + void operator()(std::size_t begin, std::size_t end) override; +}; + +class DownsampleWorkerSparse : public RcppParallel::Worker { +private: + Rcpp::IntegerVector input_i; + Rcpp::IntegerVector input_p; + Rcpp::IntegerVector input_x; + Rcpp::IntegerVector output_x; + int samples; + unsigned int random_seed; + +public: + DownsampleWorkerSparse(const Rcpp::IntegerVector& i, const Rcpp::IntegerVector& p, const Rcpp::IntegerVector& x, + Rcpp::IntegerVector& out_x, int samples, unsigned int random_seed); + + // Parallel operator + void operator()(std::size_t begin, std::size_t end) override; +}; + + +#endif // DOWNSAMPLEWORKER_H diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index d189d69..d5474ab 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -51,11 +51,39 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// downsample_matrix_cpp +Rcpp::IntegerMatrix downsample_matrix_cpp(Rcpp::IntegerMatrix input, int samples, unsigned int random_seed); +RcppExport SEXP _tglkmeans_downsample_matrix_cpp(SEXP inputSEXP, SEXP samplesSEXP, SEXP random_seedSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Rcpp::IntegerMatrix >::type input(inputSEXP); + Rcpp::traits::input_parameter< int >::type samples(samplesSEXP); + Rcpp::traits::input_parameter< unsigned int >::type random_seed(random_seedSEXP); + rcpp_result_gen = Rcpp::wrap(downsample_matrix_cpp(input, samples, random_seed)); + return rcpp_result_gen; +END_RCPP +} +// rcpp_downsample_sparse +Rcpp::S4 rcpp_downsample_sparse(Rcpp::S4 matrix, int samples, unsigned int random_seed); +RcppExport SEXP _tglkmeans_rcpp_downsample_sparse(SEXP matrixSEXP, SEXP samplesSEXP, SEXP random_seedSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Rcpp::S4 >::type matrix(matrixSEXP); + Rcpp::traits::input_parameter< int >::type samples(samplesSEXP); + Rcpp::traits::input_parameter< unsigned int >::type random_seed(random_seedSEXP); + rcpp_result_gen = Rcpp::wrap(rcpp_downsample_sparse(matrix, samples, random_seed)); + return rcpp_result_gen; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_tglkmeans_reduce_coclust", (DL_FUNC) &_tglkmeans_reduce_coclust, 3}, {"_tglkmeans_reduce_num_trials", (DL_FUNC) &_tglkmeans_reduce_num_trials, 2}, {"_tglkmeans_TGL_kmeans_cpp", (DL_FUNC) &_tglkmeans_TGL_kmeans_cpp, 8}, + {"_tglkmeans_downsample_matrix_cpp", (DL_FUNC) &_tglkmeans_downsample_matrix_cpp, 3}, + {"_tglkmeans_rcpp_downsample_sparse", (DL_FUNC) &_tglkmeans_rcpp_downsample_sparse, 3}, {NULL, NULL, 0} }; diff --git a/src/downsample.cpp b/src/downsample.cpp new file mode 100644 index 0000000..b2da02f --- /dev/null +++ b/src/downsample.cpp @@ -0,0 +1,47 @@ +#include +#include +#include +#include "DownsampleWorker.h" + +typedef float float32_t; +typedef double float64_t; +typedef unsigned char uint8_t; +typedef unsigned int uint_t; + +// [[Rcpp::export]] +Rcpp::IntegerMatrix downsample_matrix_cpp(Rcpp::IntegerMatrix input, int samples, unsigned int random_seed) { + Rcpp::IntegerMatrix output(input.nrow(), input.ncol()); + + DownsampleWorker worker(input, output, samples, random_seed); + RcppParallel::parallelFor(0, input.ncol(), worker); + + return output; +} + +// [[Rcpp::export]] +Rcpp::S4 rcpp_downsample_sparse(Rcpp::S4 matrix, int samples, unsigned int random_seed) { + // Extract components of the dgCMatrix + Rcpp::IntegerVector i = matrix.slot("i"); + Rcpp::IntegerVector p = matrix.slot("p"); + Rcpp::IntegerVector x = matrix.slot("x"); + + int nrows = Rcpp::as(matrix.slot("Dim"))[0]; + int ncols = Rcpp::as(matrix.slot("Dim"))[1]; + + // Prepare output vector + Rcpp::IntegerVector out_x(x.size()); + + // Create and run the DownsampleWorkerSparse + DownsampleWorkerSparse worker(i, p, x, out_x, samples, random_seed); + RcppParallel::parallelFor(0, ncols, worker); + + // Create a new dgCMatrix object for the output + Rcpp::S4 out_matrix("dgCMatrix"); + out_matrix.slot("i") = i; + out_matrix.slot("p") = p; + Rcpp::NumericVector out_x_double = Rcpp::as(out_x); + out_matrix.slot("x") = out_x_double; + out_matrix.slot("Dim") = Rcpp::IntegerVector::create(nrows, ncols); + + return out_matrix; +} \ No newline at end of file diff --git a/tests/testthat/test-downsample.R b/tests/testthat/test-downsample.R new file mode 100644 index 0000000..8a04d8b --- /dev/null +++ b/tests/testthat/test-downsample.R @@ -0,0 +1,204 @@ +test_that("downsample_matrix returns the correct number of samples", { + mat <- matrix(1:12, nrow = 4) + target_n <- 2 + ds_mat <- downsample_matrix(mat, target_n) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) + expect_true(all(mat >= ds_mat)) +}) + +test_that("downsample_matrix removes columns with small sums when remove_columns is TRUE", { + mat <- matrix(1:12, nrow = 4) + target_n <- 12 + ds_mat <- downsample_matrix(mat, target_n, remove_columns = TRUE) + expect_equal(ncol(ds_mat), 2) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) +}) + +test_that("downsample_matrix does not remove columns with small sums when remove_columns is FALSE", { + mat <- matrix(1:12, nrow = 4) + target_n <- 12 + expect_warning(ds_mat <- downsample_matrix(mat, target_n, remove_columns = FALSE)) + expect_equal(ncol(ds_mat), ncol(mat)) + expect_true(all(colSums(ds_mat[, -1], na.rm = TRUE) == target_n)) +}) + +test_that("downsample_matrix returns the correct number of samples when there are NAs", { + mat <- matrix(1:12, nrow = 4) + mat[1, 1] <- NA + target_n <- 2 + expect_warning(ds_mat <- downsample_matrix(mat, target_n)) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) + + # make sure the NAs are still there + expect_true(all(is.na(ds_mat[1, 1]))) +}) + +test_that("downsample_matrix returns the correct number of samples when there are NAs and remove_columns is TRUE", { + mat <- matrix(1:12, nrow = 4) + mat[1, 1] <- NA + target_n <- 2 + expect_warning(ds_mat <- downsample_matrix(mat, target_n, remove_columns = TRUE)) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) + + # make sure the NAs are still there + expect_true(all(is.na(ds_mat[1, 1]))) +}) + +test_that("downsample_matrix returns the correct number of samples when there are NAs and remove_columns is FALSE", { + mat <- matrix(1:12, nrow = 4) + mat[1, 1] <- NA + target_n <- 2 + expect_warning(ds_mat <- downsample_matrix(mat, target_n, remove_columns = FALSE)) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) + expect_true(all(mat[!is.na(mat)] >= ds_mat[!is.na(ds_mat)])) + + # make sure the NAs are still there + expect_true(all(is.na(ds_mat[1, 1]))) +}) + +test_that("downsample_matrix returns the correct number of samples when there are NAs and remove_columns is TRUE and the matrix is sparse", { + mat <- Matrix::Matrix(matrix(1:12, nrow = 4), sparse = TRUE) + mat[1, 1] <- NA + target_n <- 2 + expect_warning(ds_mat <- downsample_matrix(mat, target_n, remove_columns = TRUE)) + expect_true(all(Matrix::colSums(ds_mat, na.rm = TRUE) == target_n)) + + # make sure the NAs are still there + expect_true(all(is.na(ds_mat[1, 1]))) +}) + +test_that("downsample_matrix works with all zeros matrix", { + mat <- matrix(0, nrow = 4) + target_n <- 2 + expect_warning(ds_mat <- downsample_matrix(mat, target_n)) + expect_true(all(mat == ds_mat)) + + ds_mat <- downsample_matrix(mat, target_n, remove_columns = TRUE) + expect_equal(ncol(ds_mat), 0) +}) + +# Test with Different Matrix Sizes +test_that("downsample_matrix works with larger matrix", { + mat <- matrix(1:1e3, nrow = 10) + target_n <- 5 + ds_mat <- downsample_matrix(mat, target_n) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) +}) + +test_that("downsample_matrix works with single-column matrix", { + mat <- matrix(1:10, nrow = 10) + target_n <- 5 + ds_mat <- downsample_matrix(mat, target_n) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) +}) + +test_that("downsample_matrix works with single-row matrix", { + mat <- matrix(1:10, nrow = 1) + target_n <- 5 + ds_mat <- downsample_matrix(mat, target_n, remove_columns = TRUE) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) + expect_equal(ncol(ds_mat), 6) +}) + +# Test with Different target_n Values +test_that("downsample_matrix with target_n equal to number of rows", { + mat <- matrix(1:12, nrow = 4) + target_n <- 4 + ds_mat <- downsample_matrix(mat, target_n) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) +}) + +test_that("downsample_matrix with target_n greater than number of rows", { + mat <- matrix(1:12, nrow = 4) + target_n <- 6 + ds_mat <- downsample_matrix(mat, target_n) + expect_true(all(colSums(ds_mat, na.rm = TRUE) <= target_n)) +}) + +test_that("downsample_matrix with target_n equal to 1", { + mat <- matrix(1:12, nrow = 4) + target_n <- 1 + ds_mat <- downsample_matrix(mat, target_n) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) +}) + +test_that("downsample_matrix with target_n equal to 0", { + mat <- matrix(1:12, nrow = 4) + target_n <- 0 + expect_error(downsample_matrix(mat, target_n)) +}) + +# Test with Different Matrix Types +test_that("downsample_matrix with sparse matrix", { + mat <- Matrix::Matrix(matrix(1:12, nrow = 4), sparse = TRUE) + target_n <- 2 + ds_mat <- downsample_matrix(mat, target_n) + expect_true(all(Matrix::colSums(ds_mat, na.rm = TRUE) == target_n)) + expect_true(all(mat >= ds_mat)) +}) + +test_that("downsample_matrix with invalid matrix type", { + mat <- 1:12 + target_n <- 2 + expect_error(downsample_matrix(mat, target_n)) + + mat <- Matrix::Matrix(matrix(1:12, nrow = 4), sparse = FALSE) + target_n <- 2 + expect_error(downsample_matrix(mat, target_n)) +}) + +# Test with non-integer values +test_that("downsample_matrix with non-integer values", { + mat <- matrix(1:12, nrow = 4) + mat[1, 1] <- 1.5 + target_n <- 2 + ds_mat <- downsample_matrix(mat, target_n) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) + expect_true(all(mat >= ds_mat)) +}) + +# Test with Various Seed Values +test_that("downsample_matrix with fixed seed", { + mat <- matrix(1:12, nrow = 4) + target_n <- 2 + seed <- 123 + ds_mat1 <- downsample_matrix(mat, target_n, seed = seed) + ds_mat2 <- downsample_matrix(mat, target_n, seed = seed) + expect_equal(ds_mat1, ds_mat2) +}) + +test_that("downsample_matrix without specifying seed", { + mat <- matrix(1:12, nrow = 4) + target_n <- 2 + ds_mat <- downsample_matrix(mat, target_n) + expect_true(is.matrix(ds_mat)) +}) + +test_that("downsample_matrix with invalid seed", { + mat <- matrix(1:12, nrow = 4) + target_n <- 2 + expect_error(downsample_matrix(mat, target_n, seed = -1)) + expect_error(downsample_matrix(mat, target_n, seed = "not a number")) +}) + +test_that("downsample_matrix correctly downsamples matrix using target_q", { + mat <- matrix(1:12, nrow = 4) + target_q <- 0.5 + ds_mat <- downsample_matrix(mat, target_q = target_q, remove_columns = TRUE) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == round(stats::quantile(colSums(mat), target_q)))) + + expect_warning(ds_mat <- downsample_matrix(mat, target_q = target_q, remove_columns = FALSE)) + expect_true(all(mat >= ds_mat)) +}) + +test_that("Cannot provide both target_n and target_q", { + mat <- matrix(1:12, nrow = 4) + target_n <- 2 + target_q <- 0.5 + expect_error(downsample_matrix(mat, target_n = target_n, target_q = target_q)) +}) + +test_that("Fail when no target_n or target_q is provided", { + mat <- matrix(1:12, nrow = 4) + expect_error(downsample_matrix(mat)) +})