From 2a21c4e25544498e7bb3d72aa5ab72ef24b8cb24 Mon Sep 17 00:00:00 2001 From: aviezerl Date: Thu, 28 Dec 2023 23:27:56 +0200 Subject: [PATCH 01/10] Added downsampling for dense matrices --- NAMESPACE | 1 + R/RcppExports.R | 4 + R/downsample.R | 40 ++++++++++ man/TGL_kmeans.Rd | 2 +- man/TGL_kmeans_tidy.Rd | 2 +- man/downsample_matrix.Rd | 33 ++++++++ src/DownsampleWorker.cpp | 131 +++++++++++++++++++++++++++++++ src/DownsampleWorker.h | 22 ++++++ src/RcppExports.cpp | 14 ++++ src/downsample.cpp | 19 +++++ tests/testthat/test-downsample.R | 23 ++++++ 11 files changed, 289 insertions(+), 2 deletions(-) create mode 100644 R/downsample.R create mode 100644 man/downsample_matrix.Rd create mode 100644 src/DownsampleWorker.cpp create mode 100644 src/DownsampleWorker.h create mode 100644 src/downsample.cpp create mode 100644 tests/testthat/test-downsample.R diff --git a/NAMESPACE b/NAMESPACE index afb2449..4e4625d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,7 @@ export("%>%") export(TGL_kmeans) export(TGL_kmeans_tidy) +export(downsample_matrix) export(simulate_data) export(tglkmeans.set_parallel) import(dplyr) diff --git a/R/RcppExports.R b/R/RcppExports.R index 8e3121b..a09d6a4 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -13,3 +13,7 @@ TGL_kmeans_cpp <- function(ids, mat, k, metric, max_iter = 40, min_delta = 0.000 .Call('_tglkmeans_TGL_kmeans_cpp', PACKAGE = 'tglkmeans', ids, mat, k, metric, max_iter, min_delta, use_cpp_random, seed) } +downsample_matrix_cpp <- function(input, samples, random_seed) { + .Call('_tglkmeans_downsample_matrix_cpp', PACKAGE = 'tglkmeans', input, samples, random_seed) +} + diff --git a/R/downsample.R b/R/downsample.R new file mode 100644 index 0000000..c8ce4ec --- /dev/null +++ b/R/downsample.R @@ -0,0 +1,40 @@ +#' Downsample a matrix to a target number of in each column +#' +#' @description This function takes a matrix and downsamples it to a target number of samples. +#' It uses a random seed for reproducibility and allows for removing columns with +#' small sums. +#' +#' @param mat The input matrix to be downsampled +#' @param target_n The target number of samples to downsample to +#' @param seed The random seed for reproducibility (default is NULL) +#' @param remove_columns Logical indicating whether to remove columns with small sums (default is FALSE) +#' +#' @return The downsampled matrix +#' +#' @examples +#' mat <- matrix(1:12, nrow = 4) +#' downsample_matrix(mat, 2) +#' +#' # Remove columns with small sums +#' downsample_matrix(mat, 12, remove_columns = TRUE) +#' +#' @export +downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE) { + if (is.null(seed)) { + seed <- sample(1:10000, 1) + } + + ds_mat <- downsample_matrix_cpp(mat, target_n, seed) + + sums <- colSums(ds_mat, na.rm = TRUE) + small_cols <- sums < target_n + if (any(small_cols)) { + if (remove_columns) { + ds_mat <- ds_mat[, !small_cols] + } else { + cli_warn("Some columns ({which(small_cols)}) have a sum<{.val {target_n}}. These columns were not changed. To remove them, set {.field remove_columns=TRUE}.") + } + } + + return(ds_mat) +} diff --git a/man/TGL_kmeans.Rd b/man/TGL_kmeans.Rd index a3ef07a..f3679b2 100644 --- a/man/TGL_kmeans.Rd +++ b/man/TGL_kmeans.Rd @@ -9,7 +9,7 @@ TGL_kmeans( k, metric = "euclid", max_iter = 40, - min_delta = 1e-04, + min_delta = 0.0001, verbose = FALSE, keep_log = FALSE, id_column = FALSE, diff --git a/man/TGL_kmeans_tidy.Rd b/man/TGL_kmeans_tidy.Rd index f304ba8..567060e 100644 --- a/man/TGL_kmeans_tidy.Rd +++ b/man/TGL_kmeans_tidy.Rd @@ -9,7 +9,7 @@ TGL_kmeans_tidy( k, metric = "euclid", max_iter = 40, - min_delta = 1e-04, + min_delta = 0.0001, verbose = FALSE, keep_log = FALSE, id_column = FALSE, diff --git a/man/downsample_matrix.Rd b/man/downsample_matrix.Rd new file mode 100644 index 0000000..a697c77 --- /dev/null +++ b/man/downsample_matrix.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/downsample.R +\name{downsample_matrix} +\alias{downsample_matrix} +\title{Downsample a matrix to a target number of in each column} +\usage{ +downsample_matrix(mat, target_n, seed = NULL, remove_columns = FALSE) +} +\arguments{ +\item{mat}{The input matrix to be downsampled} + +\item{target_n}{The target number of samples to downsample to} + +\item{seed}{The random seed for reproducibility (default is NULL)} + +\item{remove_columns}{Logical indicating whether to remove columns with small sums (default is FALSE)} +} +\value{ +The downsampled matrix +} +\description{ +This function takes a matrix and downsamples it to a target number of samples. +It uses a random seed for reproducibility and allows for removing columns with +small sums. +} +\examples{ +mat <- matrix(1:12, nrow = 4) +downsample_matrix(mat, 2) + +# Remove columns with small sums +downsample_matrix(mat, 12, remove_columns = TRUE) + +} diff --git a/src/DownsampleWorker.cpp b/src/DownsampleWorker.cpp new file mode 100644 index 0000000..ce1070a --- /dev/null +++ b/src/DownsampleWorker.cpp @@ -0,0 +1,131 @@ +#include +#include +#include +#include +#include +#include +#include "DownsampleWorker.h" + +typedef float float32_t; +typedef double float64_t; +typedef unsigned char uint8_t; +typedef unsigned int uint_t; + +static size_t +ceil_power_of_two(const size_t size) { + return size_t(1) << size_t(ceil(log2(float64_t(size)))); +} + +template +static void initialize_tree(const std::vector& input, std::vector& tree) { + assert(input.size() >= 2); + + size_t input_size = ceil_power_of_two(input.size()); + tree.resize(2 * input_size - 1); // Resize the tree to the necessary size + + // Copy input to the beginning of the tree and fill the rest with zeros + std::copy(input.begin(), input.end(), tree.begin()); + std::fill(tree.begin() + input.size(), tree.begin() + input_size, 0); + + // Iteratively build the tree + size_t tree_index = 0; + while (input_size > 1) { + size_t half_size = input_size / 2; + for (size_t index = 0; index < half_size; ++index) { + const auto left = tree[tree_index + index * 2]; + const auto right = tree[tree_index + index * 2 + 1]; + tree[tree_index + input_size + index] = left + right; + + assert(left >= 0); + assert(right >= 0); + assert(tree[tree_index + input_size + index] == size_t(left) + size_t(right)); + } + tree_index += input_size; + input_size = half_size; + } + assert(tree.size() == 2 * input.size() - 1); +} + +static size_t random_sample(std::vector& tree, ssize_t random) { + size_t size_of_level = 1; + ssize_t base_of_level = tree.size() - 1; + size_t index_in_level = 0; + size_t index_in_tree = base_of_level + index_in_level; + + while (true) { + assert(index_in_tree == base_of_level + index_in_level); + assert(tree[index_in_tree] > random); + + --tree[index_in_tree]; + size_of_level *= 2; + base_of_level -= size_of_level; + + if (base_of_level < 0) { + return index_in_level; + } + + index_in_level *= 2; + index_in_tree = base_of_level + index_in_level; + ssize_t right_random = random - ssize_t(tree[index_in_tree]); + + assert(tree[base_of_level + index_in_level] + tree[base_of_level + index_in_level + 1] == + tree[base_of_level + size_of_level + index_in_level / 2] + 1); + + if (right_random >= 0) { + ++index_in_level; + ++index_in_tree; + assert(index_in_level < size_of_level); + random = right_random; + } + } +} + +template +static void downsample_slice(const std::vector& input, std::vector& output, const int32_t samples, const size_t random_seed) { + assert(output.size() == input.size()); // Replacing FastAssertCompare + + if (samples < 0 || input.size() == 0) { + return; + } + + if (input.size() == 1) { + output[0] = O(static_cast(samples) < static_cast(input[0]) ? samples : input[0]); + return; + } + + size_t input_size = ceil_power_of_two(input.size()); + std::vector tree(2 * input_size - 1); + initialize_tree(input, tree); + size_t& total = tree[tree.size() - 1]; + + if (total <= static_cast(samples)) { + std::copy(input.begin(), input.end(), output.begin()); + return; + } + + std::fill(output.begin(), output.end(), O(0)); + + std::minstd_rand random(random_seed); + for (size_t index = 0; index < static_cast(samples); ++index) { + size_t sampled_index = random_sample(tree, random() % total); + if (sampled_index < output.size()) { + ++output[sampled_index]; + } + } +} + + +DownsampleWorker::DownsampleWorker(const Rcpp::IntegerMatrix& input, Rcpp::IntegerMatrix& output, int samples, unsigned int random_seed) + : input_matrix(input), output_matrix(output), samples(samples), random_seed(random_seed) {} + +void DownsampleWorker::operator()(std::size_t begin, std::size_t end) { + for (std::size_t col = begin; col < end; ++col) { + std::vector input_vec(input_matrix.column(col).begin(), input_matrix.column(col).end()); + std::vector output_vec(input_vec.size(), 0); + + downsample_slice(input_vec, output_vec, samples, random_seed); + + std::copy(output_vec.begin(), output_vec.end(), output_matrix.column(col).begin()); + } +} + diff --git a/src/DownsampleWorker.h b/src/DownsampleWorker.h new file mode 100644 index 0000000..f480568 --- /dev/null +++ b/src/DownsampleWorker.h @@ -0,0 +1,22 @@ +#ifndef DOWNSAMPLEWORKER_H +#define DOWNSAMPLEWORKER_H + +#include +#include +#include + +class DownsampleWorker : public RcppParallel::Worker { +private: + RcppParallel::RMatrix input_matrix; + RcppParallel::RMatrix output_matrix; + int samples; + unsigned int random_seed; + +public: + DownsampleWorker(const Rcpp::IntegerMatrix& input, Rcpp::IntegerMatrix& output, int samples, unsigned int random_seed); + + // Parallel operator + void operator()(std::size_t begin, std::size_t end) override; +}; + +#endif // DOWNSAMPLEWORKER_H diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index d189d69..4b964d6 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -51,11 +51,25 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// downsample_matrix_cpp +Rcpp::IntegerMatrix downsample_matrix_cpp(Rcpp::IntegerMatrix input, int samples, unsigned int random_seed); +RcppExport SEXP _tglkmeans_downsample_matrix_cpp(SEXP inputSEXP, SEXP samplesSEXP, SEXP random_seedSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Rcpp::IntegerMatrix >::type input(inputSEXP); + Rcpp::traits::input_parameter< int >::type samples(samplesSEXP); + Rcpp::traits::input_parameter< unsigned int >::type random_seed(random_seedSEXP); + rcpp_result_gen = Rcpp::wrap(downsample_matrix_cpp(input, samples, random_seed)); + return rcpp_result_gen; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_tglkmeans_reduce_coclust", (DL_FUNC) &_tglkmeans_reduce_coclust, 3}, {"_tglkmeans_reduce_num_trials", (DL_FUNC) &_tglkmeans_reduce_num_trials, 2}, {"_tglkmeans_TGL_kmeans_cpp", (DL_FUNC) &_tglkmeans_TGL_kmeans_cpp, 8}, + {"_tglkmeans_downsample_matrix_cpp", (DL_FUNC) &_tglkmeans_downsample_matrix_cpp, 3}, {NULL, NULL, 0} }; diff --git a/src/downsample.cpp b/src/downsample.cpp new file mode 100644 index 0000000..4a43833 --- /dev/null +++ b/src/downsample.cpp @@ -0,0 +1,19 @@ +#include +#include +#include +#include "DownsampleWorker.h" + +typedef float float32_t; +typedef double float64_t; +typedef unsigned char uint8_t; +typedef unsigned int uint_t; + +// [[Rcpp::export]] +Rcpp::IntegerMatrix downsample_matrix_cpp(Rcpp::IntegerMatrix input, int samples, unsigned int random_seed) { + Rcpp::IntegerMatrix output(input.nrow(), input.ncol()); + + DownsampleWorker worker(input, output, samples, random_seed); + RcppParallel::parallelFor(0, input.ncol(), worker); + + return output; +} diff --git a/tests/testthat/test-downsample.R b/tests/testthat/test-downsample.R new file mode 100644 index 0000000..e2b359c --- /dev/null +++ b/tests/testthat/test-downsample.R @@ -0,0 +1,23 @@ + +test_that("downsample_matrix returns the correct number of samples", { + mat <- matrix(1:12, nrow = 4) + target_n <- 2 + ds_mat <- downsample_matrix(mat, target_n) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) +}) + +test_that("downsample_matrix removes columns with small sums when remove_columns is TRUE", { + mat <- matrix(1:12, nrow = 4) + target_n <- 12 + ds_mat <- downsample_matrix(mat, target_n, remove_columns = TRUE) + expect_equal(ncol(ds_mat), 2) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) +}) + +test_that("downsample_matrix does not remove columns with small sums when remove_columns is FALSE", { + mat <- matrix(1:12, nrow = 4) + target_n <- 12 + expect_warning(ds_mat <- downsample_matrix(mat, target_n, remove_columns = FALSE)) + expect_equal(ncol(ds_mat), ncol(mat)) + expect_true(all(colSums(ds_mat[, -1], na.rm = TRUE) == target_n)) +}) \ No newline at end of file From 803eab43cd93651978c66ea2cf777b243ce52f7c Mon Sep 17 00:00:00 2001 From: aviezerl Date: Thu, 28 Dec 2023 23:41:25 +0200 Subject: [PATCH 02/10] deal with NA --- R/downsample.R | 9 ++++++++- src/DownsampleWorker.cpp | 2 +- tests/testthat/test-downsample.R | 14 ++++++++++++-- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/R/downsample.R b/R/downsample.R index c8ce4ec..29e50c3 100644 --- a/R/downsample.R +++ b/R/downsample.R @@ -24,17 +24,24 @@ downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE seed <- sample(1:10000, 1) } + # replace NAs with 0s for the cpp code + orig_mat <- mat + mat[is.na(mat)] <- 0 ds_mat <- downsample_matrix_cpp(mat, target_n, seed) sums <- colSums(ds_mat, na.rm = TRUE) small_cols <- sums < target_n if (any(small_cols)) { if (remove_columns) { - ds_mat <- ds_mat[, !small_cols] + ds_mat <- ds_mat[, !small_cols, drop = FALSE] + orig_mat <- orig_mat[, !small_cols, drop = FALSE] } else { cli_warn("Some columns ({which(small_cols)}) have a sum<{.val {target_n}}. These columns were not changed. To remove them, set {.field remove_columns=TRUE}.") } } + # put back the NAs + ds_mat[is.na(orig_mat)] <- NA + return(ds_mat) } diff --git a/src/DownsampleWorker.cpp b/src/DownsampleWorker.cpp index ce1070a..8ec03fe 100644 --- a/src/DownsampleWorker.cpp +++ b/src/DownsampleWorker.cpp @@ -82,7 +82,7 @@ static size_t random_sample(std::vector& tree, ssize_t random) { template static void downsample_slice(const std::vector& input, std::vector& output, const int32_t samples, const size_t random_seed) { - assert(output.size() == input.size()); // Replacing FastAssertCompare + assert(output.size() == input.size()); if (samples < 0 || input.size() == 0) { return; diff --git a/tests/testthat/test-downsample.R b/tests/testthat/test-downsample.R index e2b359c..1d3d89f 100644 --- a/tests/testthat/test-downsample.R +++ b/tests/testthat/test-downsample.R @@ -1,4 +1,3 @@ - test_that("downsample_matrix returns the correct number of samples", { mat <- matrix(1:12, nrow = 4) target_n <- 2 @@ -20,4 +19,15 @@ test_that("downsample_matrix does not remove columns with small sums when remove expect_warning(ds_mat <- downsample_matrix(mat, target_n, remove_columns = FALSE)) expect_equal(ncol(ds_mat), ncol(mat)) expect_true(all(colSums(ds_mat[, -1], na.rm = TRUE) == target_n)) -}) \ No newline at end of file +}) + +test_that("downsample_matrix returns the correct number of samples when there are NAs", { + mat <- matrix(1:12, nrow = 4) + mat[1, 1] <- NA + target_n <- 2 + ds_mat <- downsample_matrix(mat, target_n) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) + + # make sure the NAs are still there + expect_true(all(is.na(ds_mat[1, 1]))) +}) From 5ea568eb663a16673893e7a06d9c7241f9187c67 Mon Sep 17 00:00:00 2001 From: aviezerl Date: Fri, 29 Dec 2023 00:34:09 +0200 Subject: [PATCH 03/10] Added support for dgCMatrix --- DESCRIPTION | 2 ++ R/RcppExports.R | 4 ++++ R/downsample.R | 39 +++++++++++++++++++++++++------- man/downsample_matrix.Rd | 6 ++++- src/DownsampleWorker.cpp | 23 ++++++++++++++++++- src/DownsampleWorker.h | 20 ++++++++++++++++ src/RcppExports.cpp | 14 ++++++++++++ src/downsample.cpp | 28 +++++++++++++++++++++++ tests/testthat/test-downsample.R | 2 +- 9 files changed, 127 insertions(+), 11 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f8fdbae..2d6cff1 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -30,6 +30,8 @@ Imports: future, ggplot2 (>= 2.2.0), magrittr, + Matrix, + methods, parallel (>= 3.3.2), plyr (>= 1.8.4), purrr (>= 0.2.0), diff --git a/R/RcppExports.R b/R/RcppExports.R index a09d6a4..ff2cd0c 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -17,3 +17,7 @@ downsample_matrix_cpp <- function(input, samples, random_seed) { .Call('_tglkmeans_downsample_matrix_cpp', PACKAGE = 'tglkmeans', input, samples, random_seed) } +rcpp_downsample_sparse <- function(matrix, samples, random_seed) { + .Call('_tglkmeans_rcpp_downsample_sparse', PACKAGE = 'tglkmeans', matrix, samples, random_seed) +} + diff --git a/R/downsample.R b/R/downsample.R index 29e50c3..9e6d72d 100644 --- a/R/downsample.R +++ b/R/downsample.R @@ -4,7 +4,7 @@ #' It uses a random seed for reproducibility and allows for removing columns with #' small sums. #' -#' @param mat The input matrix to be downsampled +#' @param mat The input matrix to be downsampled. Can be a matrix or sparse matrix (dgCMatrix). If the matrix contains NAs, the function will run significantly slower. #' @param target_n The target number of samples to downsample to #' @param seed The random seed for reproducibility (default is NULL) #' @param remove_columns Logical indicating whether to remove columns with small sums (default is FALSE) @@ -18,6 +18,10 @@ #' # Remove columns with small sums #' downsample_matrix(mat, 12, remove_columns = TRUE) #' +#' # sparse matrix +#' mat_sparse <- Matrix::Matrix(mat, sparse = TRUE) +#' downsample_matrix(mat_sparse, 2) +#' #' @export downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE) { if (is.null(seed)) { @@ -25,23 +29,42 @@ downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE } # replace NAs with 0s for the cpp code - orig_mat <- mat - mat[is.na(mat)] <- 0 - ds_mat <- downsample_matrix_cpp(mat, target_n, seed) + has_nas <- FALSE + if (any(is.na(mat))) { + has_nas <- TRUE + cli_warn("Input matrix contains NAs. Processing would be significantly slower.") + orig_mat <- mat + mat[is.na(mat)] <- 0 + } + + + if (methods::is(mat, "dgCMatrix")) { + ds_mat <- rcpp_downsample_sparse(mat, target_n, seed) + sums <- Matrix::colSums(ds_mat, na.rm = TRUE) + } else if (is.matrix(mat)) { + ds_mat <- downsample_matrix_cpp(mat, target_n, seed) + sums <- colSums(ds_mat, na.rm = TRUE) + } else { + cli_abort("Input must be a matrix or a sparse matrix (dgCMatrix). class of {.field mat} is {.val {class(mat)}}.") + } - sums <- colSums(ds_mat, na.rm = TRUE) small_cols <- sums < target_n if (any(small_cols)) { if (remove_columns) { ds_mat <- ds_mat[, !small_cols, drop = FALSE] - orig_mat <- orig_mat[, !small_cols, drop = FALSE] + if (has_nas) { + orig_mat <- orig_mat[, !small_cols, drop = FALSE] + } } else { cli_warn("Some columns ({which(small_cols)}) have a sum<{.val {target_n}}. These columns were not changed. To remove them, set {.field remove_columns=TRUE}.") } } - # put back the NAs - ds_mat[is.na(orig_mat)] <- NA + if (has_nas) { + # put back the NAs + ds_mat[is.na(orig_mat)] <- NA + } + return(ds_mat) } diff --git a/man/downsample_matrix.Rd b/man/downsample_matrix.Rd index a697c77..3aa07cf 100644 --- a/man/downsample_matrix.Rd +++ b/man/downsample_matrix.Rd @@ -7,7 +7,7 @@ downsample_matrix(mat, target_n, seed = NULL, remove_columns = FALSE) } \arguments{ -\item{mat}{The input matrix to be downsampled} +\item{mat}{The input matrix to be downsampled. Can be a matrix or sparse matrix (dgCMatrix). If the matrix contains NAs, the function will run significantly slower.} \item{target_n}{The target number of samples to downsample to} @@ -30,4 +30,8 @@ downsample_matrix(mat, 2) # Remove columns with small sums downsample_matrix(mat, 12, remove_columns = TRUE) +# sparse matrix +mat_sparse <- Matrix::Matrix(mat, sparse = TRUE) +downsample_matrix(mat_sparse, 2) + } diff --git a/src/DownsampleWorker.cpp b/src/DownsampleWorker.cpp index 8ec03fe..fc1b93e 100644 --- a/src/DownsampleWorker.cpp +++ b/src/DownsampleWorker.cpp @@ -114,7 +114,6 @@ static void downsample_slice(const std::vector& input, std::vector& output } } - DownsampleWorker::DownsampleWorker(const Rcpp::IntegerMatrix& input, Rcpp::IntegerMatrix& output, int samples, unsigned int random_seed) : input_matrix(input), output_matrix(output), samples(samples), random_seed(random_seed) {} @@ -129,3 +128,25 @@ void DownsampleWorker::operator()(std::size_t begin, std::size_t end) { } } +DownsampleWorkerSparse::DownsampleWorkerSparse(const Rcpp::IntegerVector& i, const Rcpp::IntegerVector& p, const Rcpp::IntegerVector& x, + Rcpp::IntegerVector& out_x, int nrows, int ncols, int samples, unsigned int random_seed) + : input_i(i), input_p(p), input_x(x), output_x(out_x), nrows(nrows), ncols(ncols), samples(samples), random_seed(random_seed) {} + +void DownsampleWorkerSparse::operator()(std::size_t begin, std::size_t end) { + for (std::size_t col = begin; col < end; ++col) { + // Extract the current column from the sparse matrix + std::vector input_vec; + for (int idx = input_p[col]; idx < input_p[col + 1]; ++idx) { + input_vec.push_back(input_x[idx]); + } + + std::vector output_vec(input_vec.size(), 0); + + downsample_slice(input_vec, output_vec, samples, random_seed); + + // Store results in the output sparse matrix + for (int idx = input_p[col], out_idx = 0; idx < input_p[col + 1]; ++idx, ++out_idx) { + output_x[idx] = output_vec[out_idx]; + } + } +} diff --git a/src/DownsampleWorker.h b/src/DownsampleWorker.h index f480568..5c6cf08 100644 --- a/src/DownsampleWorker.h +++ b/src/DownsampleWorker.h @@ -19,4 +19,24 @@ class DownsampleWorker : public RcppParallel::Worker { void operator()(std::size_t begin, std::size_t end) override; }; +class DownsampleWorkerSparse : public RcppParallel::Worker { +private: + Rcpp::IntegerVector input_i; + Rcpp::IntegerVector input_p; + Rcpp::IntegerVector input_x; + Rcpp::IntegerVector output_x; + int nrows; + int ncols; + int samples; + unsigned int random_seed; + +public: + DownsampleWorkerSparse(const Rcpp::IntegerVector& i, const Rcpp::IntegerVector& p, const Rcpp::IntegerVector& x, + Rcpp::IntegerVector& out_x, int nrows, int ncols, int samples, unsigned int random_seed); + + // Parallel operator + void operator()(std::size_t begin, std::size_t end) override; +}; + + #endif // DOWNSAMPLEWORKER_H diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 4b964d6..d5474ab 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -64,12 +64,26 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// rcpp_downsample_sparse +Rcpp::S4 rcpp_downsample_sparse(Rcpp::S4 matrix, int samples, unsigned int random_seed); +RcppExport SEXP _tglkmeans_rcpp_downsample_sparse(SEXP matrixSEXP, SEXP samplesSEXP, SEXP random_seedSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Rcpp::S4 >::type matrix(matrixSEXP); + Rcpp::traits::input_parameter< int >::type samples(samplesSEXP); + Rcpp::traits::input_parameter< unsigned int >::type random_seed(random_seedSEXP); + rcpp_result_gen = Rcpp::wrap(rcpp_downsample_sparse(matrix, samples, random_seed)); + return rcpp_result_gen; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_tglkmeans_reduce_coclust", (DL_FUNC) &_tglkmeans_reduce_coclust, 3}, {"_tglkmeans_reduce_num_trials", (DL_FUNC) &_tglkmeans_reduce_num_trials, 2}, {"_tglkmeans_TGL_kmeans_cpp", (DL_FUNC) &_tglkmeans_TGL_kmeans_cpp, 8}, {"_tglkmeans_downsample_matrix_cpp", (DL_FUNC) &_tglkmeans_downsample_matrix_cpp, 3}, + {"_tglkmeans_rcpp_downsample_sparse", (DL_FUNC) &_tglkmeans_rcpp_downsample_sparse, 3}, {NULL, NULL, 0} }; diff --git a/src/downsample.cpp b/src/downsample.cpp index 4a43833..6019819 100644 --- a/src/downsample.cpp +++ b/src/downsample.cpp @@ -17,3 +17,31 @@ Rcpp::IntegerMatrix downsample_matrix_cpp(Rcpp::IntegerMatrix input, int samples return output; } + +// [[Rcpp::export]] +Rcpp::S4 rcpp_downsample_sparse(Rcpp::S4 matrix, int samples, unsigned int random_seed) { + // Extract components of the dgCMatrix + Rcpp::IntegerVector i = matrix.slot("i"); + Rcpp::IntegerVector p = matrix.slot("p"); + Rcpp::IntegerVector x = matrix.slot("x"); + + int nrows = Rcpp::as(matrix.slot("Dim"))[0]; + int ncols = Rcpp::as(matrix.slot("Dim"))[1]; + + // Prepare output vector + Rcpp::IntegerVector out_x(x.size()); + + // Create and run the DownsampleWorkerSparse + DownsampleWorkerSparse worker(i, p, x, out_x, nrows, ncols, samples, random_seed); + RcppParallel::parallelFor(0, ncols, worker); + + // Create a new dgCMatrix object for the output + Rcpp::S4 out_matrix("dgCMatrix"); + out_matrix.slot("i") = i; + out_matrix.slot("p") = p; + Rcpp::NumericVector out_x_double = Rcpp::as(out_x); + out_matrix.slot("x") = out_x_double; + out_matrix.slot("Dim") = Rcpp::IntegerVector::create(nrows, ncols); + + return out_matrix; +} \ No newline at end of file diff --git a/tests/testthat/test-downsample.R b/tests/testthat/test-downsample.R index 1d3d89f..1b2267c 100644 --- a/tests/testthat/test-downsample.R +++ b/tests/testthat/test-downsample.R @@ -25,7 +25,7 @@ test_that("downsample_matrix returns the correct number of samples when there ar mat <- matrix(1:12, nrow = 4) mat[1, 1] <- NA target_n <- 2 - ds_mat <- downsample_matrix(mat, target_n) + expect_warning(ds_mat <- downsample_matrix(mat, target_n)) expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) # make sure the NAs are still there From 1fabce245584a0273c41f2df6e7e85fe5af97aab Mon Sep 17 00:00:00 2001 From: aviezerl Date: Fri, 29 Dec 2023 10:15:01 +0200 Subject: [PATCH 04/10] added tests --- R/downsample.R | 16 +++- man/downsample_matrix.Rd | 6 +- tests/testthat/test-downsample.R | 149 +++++++++++++++++++++++++++++++ 3 files changed, 167 insertions(+), 4 deletions(-) diff --git a/R/downsample.R b/R/downsample.R index 9e6d72d..ca93e78 100644 --- a/R/downsample.R +++ b/R/downsample.R @@ -4,8 +4,10 @@ #' It uses a random seed for reproducibility and allows for removing columns with #' small sums. #' -#' @param mat The input matrix to be downsampled. Can be a matrix or sparse matrix (dgCMatrix). If the matrix contains NAs, the function will run significantly slower. -#' @param target_n The target number of samples to downsample to +#' @param mat An integer matrix to be downsampled. Can be a matrix or sparse matrix (dgCMatrix). +#' If the matrix contains NAs, the function will run significantly slower. Values that are +#' not integers will be coerced to integers using {.code floor()}. +#' @param target_n The target number of samples to downsample to. #' @param seed The random seed for reproducibility (default is NULL) #' @param remove_columns Logical indicating whether to remove columns with small sums (default is FALSE) #' @@ -26,6 +28,8 @@ downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE) { if (is.null(seed)) { seed <- sample(1:10000, 1) + } else if (!is.numeric(seed) || seed <= 0 || seed != as.integer(seed)) { + cli_abort("{.field seed} must be a positive integer.") } # replace NAs with 0s for the cpp code @@ -37,6 +41,14 @@ downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE mat[is.na(mat)] <- 0 } + if (!is.logical(remove_columns)) { + cli_abort("{.field remove_columns} must be a logical value.") + } + + if (target_n <= 0 || target_n != as.integer(target_n)) { + cli_abort("{.field target_n} must be a positive integer.") + } + if (methods::is(mat, "dgCMatrix")) { ds_mat <- rcpp_downsample_sparse(mat, target_n, seed) diff --git a/man/downsample_matrix.Rd b/man/downsample_matrix.Rd index 3aa07cf..dfdb81d 100644 --- a/man/downsample_matrix.Rd +++ b/man/downsample_matrix.Rd @@ -7,9 +7,11 @@ downsample_matrix(mat, target_n, seed = NULL, remove_columns = FALSE) } \arguments{ -\item{mat}{The input matrix to be downsampled. Can be a matrix or sparse matrix (dgCMatrix). If the matrix contains NAs, the function will run significantly slower.} +\item{mat}{An integer matrix to be downsampled. Can be a matrix or sparse matrix (dgCMatrix). +If the matrix contains NAs, the function will run significantly slower. Values that are +not integers will be coerced to integers using {.code floor()}.} -\item{target_n}{The target number of samples to downsample to} +\item{target_n}{The target number of samples to downsample to.} \item{seed}{The random seed for reproducibility (default is NULL)} diff --git a/tests/testthat/test-downsample.R b/tests/testthat/test-downsample.R index 1b2267c..631e990 100644 --- a/tests/testthat/test-downsample.R +++ b/tests/testthat/test-downsample.R @@ -3,6 +3,7 @@ test_that("downsample_matrix returns the correct number of samples", { target_n <- 2 ds_mat <- downsample_matrix(mat, target_n) expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) + expect_true(all(mat >= ds_mat)) }) test_that("downsample_matrix removes columns with small sums when remove_columns is TRUE", { @@ -31,3 +32,151 @@ test_that("downsample_matrix returns the correct number of samples when there ar # make sure the NAs are still there expect_true(all(is.na(ds_mat[1, 1]))) }) + +test_that("downsample_matrix returns the correct number of samples when there are NAs and remove_columns is TRUE", { + mat <- matrix(1:12, nrow = 4) + mat[1, 1] <- NA + target_n <- 2 + expect_warning(ds_mat <- downsample_matrix(mat, target_n, remove_columns = TRUE)) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) + + # make sure the NAs are still there + expect_true(all(is.na(ds_mat[1, 1]))) +}) + +test_that("downsample_matrix returns the correct number of samples when there are NAs and remove_columns is FALSE", { + mat <- matrix(1:12, nrow = 4) + mat[1, 1] <- NA + target_n <- 2 + expect_warning(ds_mat <- downsample_matrix(mat, target_n, remove_columns = FALSE)) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) + expect_true(all(mat[!is.na(mat)] >= ds_mat[!is.na(ds_mat)])) + + # make sure the NAs are still there + expect_true(all(is.na(ds_mat[1, 1]))) +}) + +test_that("downsample_matrix returns the correct number of samples when there are NAs and remove_columns is TRUE and the matrix is sparse", { + mat <- Matrix::Matrix(matrix(1:12, nrow = 4), sparse = TRUE) + mat[1, 1] <- NA + target_n <- 2 + expect_warning(ds_mat <- downsample_matrix(mat, target_n, remove_columns = TRUE)) + expect_true(all(Matrix::colSums(ds_mat, na.rm = TRUE) == target_n)) + + # make sure the NAs are still there + expect_true(all(is.na(ds_mat[1, 1]))) +}) + +test_that("downsample_matrix works with all zeros matrix", { + mat <- matrix(0, nrow = 4) + target_n <- 2 + expect_warning(ds_mat <- downsample_matrix(mat, target_n)) + expect_true(all(mat == ds_mat)) + + ds_mat <- downsample_matrix(mat, target_n, remove_columns = TRUE) + expect_equal(ncol(ds_mat), 0) +}) + +# Test with Different Matrix Sizes +test_that("downsample_matrix works with larger matrix", { + mat <- matrix(1:1e3, nrow = 10) + target_n <- 5 + ds_mat <- downsample_matrix(mat, target_n) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) +}) + +test_that("downsample_matrix works with single-column matrix", { + mat <- matrix(1:10, nrow = 10) + target_n <- 5 + ds_mat <- downsample_matrix(mat, target_n) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) +}) + +test_that("downsample_matrix works with single-row matrix", { + mat <- matrix(1:10, nrow = 1) + target_n <- 5 + ds_mat <- downsample_matrix(mat, target_n, remove_columns = TRUE) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) + expect_equal(ncol(ds_mat), 6) +}) + +# Test with Different target_n Values +test_that("downsample_matrix with target_n equal to number of rows", { + mat <- matrix(1:12, nrow = 4) + target_n <- 4 + ds_mat <- downsample_matrix(mat, target_n) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) +}) + +test_that("downsample_matrix with target_n greater than number of rows", { + mat <- matrix(1:12, nrow = 4) + target_n <- 6 + ds_mat <- downsample_matrix(mat, target_n) + expect_true(all(colSums(ds_mat, na.rm = TRUE) <= target_n)) +}) + +test_that("downsample_matrix with target_n equal to 1", { + mat <- matrix(1:12, nrow = 4) + target_n <- 1 + ds_mat <- downsample_matrix(mat, target_n) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) +}) + +test_that("downsample_matrix with target_n equal to 0", { + mat <- matrix(1:12, nrow = 4) + target_n <- 0 + expect_error(downsample_matrix(mat, target_n)) +}) + +# Test with Different Matrix Types +test_that("downsample_matrix with sparse matrix", { + mat <- Matrix::Matrix(matrix(1:12, nrow = 4), sparse = TRUE) + target_n <- 2 + ds_mat <- downsample_matrix(mat, target_n) + expect_true(all(Matrix::colSums(ds_mat, na.rm = TRUE) == target_n)) + expect_true(all(mat >= ds_mat)) +}) + +test_that("downsample_matrix with invalid matrix type", { + mat <- 1:12 + target_n <- 2 + expect_error(downsample_matrix(mat, target_n)) + + mat <- Matrix::Matrix(matrix(1:12, nrow = 4), sparse = FALSE) + target_n <- 2 + expect_error(downsample_matrix(mat, target_n)) +}) + +# Test with non-integer values +test_that("downsample_matrix with non-integer values", { + mat <- matrix(1:12, nrow = 4) + mat[1, 1] <- 1.5 + target_n <- 2 + ds_mat <- downsample_matrix(mat, target_n) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n)) + expect_true(all(mat >= ds_mat)) +}) + +# Test with Various Seed Values +test_that("downsample_matrix with fixed seed", { + mat <- matrix(1:12, nrow = 4) + target_n <- 2 + seed <- 123 + ds_mat1 <- downsample_matrix(mat, target_n, seed = seed) + ds_mat2 <- downsample_matrix(mat, target_n, seed = seed) + expect_equal(ds_mat1, ds_mat2) +}) + +test_that("downsample_matrix without specifying seed", { + mat <- matrix(1:12, nrow = 4) + target_n <- 2 + ds_mat <- downsample_matrix(mat, target_n) + expect_true(is.matrix(ds_mat)) +}) + +test_that("downsample_matrix with invalid seed", { + mat <- matrix(1:12, nrow = 4) + target_n <- 2 + expect_error(downsample_matrix(mat, target_n, seed = -1)) + expect_error(downsample_matrix(mat, target_n, seed = "not a number")) +}) From 981e3ec0399953158b01d1f7117b7f74a04a9d2c Mon Sep 17 00:00:00 2001 From: aviezerl Date: Fri, 29 Dec 2023 10:35:32 +0200 Subject: [PATCH 05/10] show the user the random seed --- R/downsample.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/downsample.R b/R/downsample.R index ca93e78..f0537db 100644 --- a/R/downsample.R +++ b/R/downsample.R @@ -28,6 +28,7 @@ downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE) { if (is.null(seed)) { seed <- sample(1:10000, 1) + cli::cli_alert_warning("No seed provided. Using {.val {seed}}.") } else if (!is.numeric(seed) || seed <= 0 || seed != as.integer(seed)) { cli_abort("{.field seed} must be a positive integer.") } From 63064f229299e943c502a6af12b74698543fcdc6 Mon Sep 17 00:00:00 2001 From: aviezerl Date: Fri, 29 Dec 2023 10:58:14 +0200 Subject: [PATCH 06/10] added target_q parameter --- R/downsample.R | 52 +++++++++++++++++++++++--------- man/downsample_matrix.Rd | 13 +++++++- tests/testthat/test-downsample.R | 22 ++++++++++++++ 3 files changed, 71 insertions(+), 16 deletions(-) diff --git a/R/downsample.R b/R/downsample.R index f0537db..292f277 100644 --- a/R/downsample.R +++ b/R/downsample.R @@ -8,6 +8,7 @@ #' If the matrix contains NAs, the function will run significantly slower. Values that are #' not integers will be coerced to integers using {.code floor()}. #' @param target_n The target number of samples to downsample to. +#' @param target_q A target quantile of sums to downsample to. Only one of {.field target_n} or {.field target_q} can be provided. #' @param seed The random seed for reproducibility (default is NULL) #' @param remove_columns Logical indicating whether to remove columns with small sums (default is FALSE) #' @@ -24,8 +25,23 @@ #' mat_sparse <- Matrix::Matrix(mat, sparse = TRUE) #' downsample_matrix(mat_sparse, 2) #' +#' # with a quantile +#' downsample_matrix(mat, target_q = 0.5) +#' #' @export -downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE) { +downsample_matrix <- function(mat, target_n = NULL, target_q = NULL, seed = NULL, remove_columns = FALSE) { + if (is.null(target_n) && is.null(target_q)) { + cli_abort("Either {.field target_n} or {.field target_q} must be provided.") + } else if (!is.null(target_n) && !is.null(target_q)) { + cli_abort("Only one of {.field target_n} or {.field target_q} can be provided.") + } + + sums <- colsums_matrix(mat) + if (!is.null(target_q)) { + target_n <- round(stats::quantile(sums, target_q)) + cli::cli_alert_info("Using {.val {target_n}} as the target number.") + } + if (is.null(seed)) { seed <- sample(1:10000, 1) cli::cli_alert_warning("No seed provided. Using {.val {seed}}.") @@ -33,15 +49,6 @@ downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE cli_abort("{.field seed} must be a positive integer.") } - # replace NAs with 0s for the cpp code - has_nas <- FALSE - if (any(is.na(mat))) { - has_nas <- TRUE - cli_warn("Input matrix contains NAs. Processing would be significantly slower.") - orig_mat <- mat - mat[is.na(mat)] <- 0 - } - if (!is.logical(remove_columns)) { cli_abort("{.field remove_columns} must be a logical value.") } @@ -50,15 +57,19 @@ downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE cli_abort("{.field target_n} must be a positive integer.") } + # replace NAs with 0s for the cpp code + has_nas <- FALSE + if (any(is.na(mat))) { + has_nas <- TRUE + cli_warn("Input matrix contains NAs. Processing would be significantly slower.") + orig_mat <- mat + mat[is.na(mat)] <- 0 + } if (methods::is(mat, "dgCMatrix")) { ds_mat <- rcpp_downsample_sparse(mat, target_n, seed) - sums <- Matrix::colSums(ds_mat, na.rm = TRUE) } else if (is.matrix(mat)) { ds_mat <- downsample_matrix_cpp(mat, target_n, seed) - sums <- colSums(ds_mat, na.rm = TRUE) - } else { - cli_abort("Input must be a matrix or a sparse matrix (dgCMatrix). class of {.field mat} is {.val {class(mat)}}.") } small_cols <- sums < target_n @@ -68,8 +79,9 @@ downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE if (has_nas) { orig_mat <- orig_mat[, !small_cols, drop = FALSE] } + cli_alert_info("Removed {.val {sum(small_cols)}} columns with a sum smaller than {.val {target_n}}.") } else { - cli_warn("Some columns ({which(small_cols)}) have a sum<{.val {target_n}}. These columns were not changed. To remove them, set {.field remove_columns=TRUE}.") + cli_warn("{.val {sum(small_cols)}} columns have a sum smaller than {.val {target_n}}. These columns were not changed. To remove them, set {.field remove_columns=TRUE}.") } } @@ -81,3 +93,13 @@ downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE return(ds_mat) } + +colsums_matrix <- function(mat) { + if (methods::is(mat, "dgCMatrix")) { + return(Matrix::colSums(mat, na.rm = TRUE)) + } else if (is.matrix(mat)) { + return(colSums(mat, na.rm = TRUE)) + } else { + cli_abort("Input must be a matrix or a sparse matrix (dgCMatrix). class of {.field mat} is {.val {class(mat)}}.") + } +} diff --git a/man/downsample_matrix.Rd b/man/downsample_matrix.Rd index dfdb81d..f590673 100644 --- a/man/downsample_matrix.Rd +++ b/man/downsample_matrix.Rd @@ -4,7 +4,13 @@ \alias{downsample_matrix} \title{Downsample a matrix to a target number of in each column} \usage{ -downsample_matrix(mat, target_n, seed = NULL, remove_columns = FALSE) +downsample_matrix( + mat, + target_n = NULL, + target_q = NULL, + seed = NULL, + remove_columns = FALSE +) } \arguments{ \item{mat}{An integer matrix to be downsampled. Can be a matrix or sparse matrix (dgCMatrix). @@ -13,6 +19,8 @@ not integers will be coerced to integers using {.code floor()}.} \item{target_n}{The target number of samples to downsample to.} +\item{target_q}{A target quantile of sums to downsample to. Only one of {.field target_n} or {.field target_q} can be provided.} + \item{seed}{The random seed for reproducibility (default is NULL)} \item{remove_columns}{Logical indicating whether to remove columns with small sums (default is FALSE)} @@ -36,4 +44,7 @@ downsample_matrix(mat, 12, remove_columns = TRUE) mat_sparse <- Matrix::Matrix(mat, sparse = TRUE) downsample_matrix(mat_sparse, 2) +# with a quantile +downsample_matrix(mat, target_q = 0.5) + } diff --git a/tests/testthat/test-downsample.R b/tests/testthat/test-downsample.R index 631e990..8a04d8b 100644 --- a/tests/testthat/test-downsample.R +++ b/tests/testthat/test-downsample.R @@ -180,3 +180,25 @@ test_that("downsample_matrix with invalid seed", { expect_error(downsample_matrix(mat, target_n, seed = -1)) expect_error(downsample_matrix(mat, target_n, seed = "not a number")) }) + +test_that("downsample_matrix correctly downsamples matrix using target_q", { + mat <- matrix(1:12, nrow = 4) + target_q <- 0.5 + ds_mat <- downsample_matrix(mat, target_q = target_q, remove_columns = TRUE) + expect_true(all(colSums(ds_mat, na.rm = TRUE) == round(stats::quantile(colSums(mat), target_q)))) + + expect_warning(ds_mat <- downsample_matrix(mat, target_q = target_q, remove_columns = FALSE)) + expect_true(all(mat >= ds_mat)) +}) + +test_that("Cannot provide both target_n and target_q", { + mat <- matrix(1:12, nrow = 4) + target_n <- 2 + target_q <- 0.5 + expect_error(downsample_matrix(mat, target_n = target_n, target_q = target_q)) +}) + +test_that("Fail when no target_n or target_q is provided", { + mat <- matrix(1:12, nrow = 4) + expect_error(downsample_matrix(mat)) +}) From ed2002e7bd4763eb31a0b6016edba34b5b9f447e Mon Sep 17 00:00:00 2001 From: aviezerl Date: Fri, 29 Dec 2023 11:06:49 +0200 Subject: [PATCH 07/10] bumped version to 0.5.0 --- DESCRIPTION | 2 +- NEWS.md | 4 ++++ R/downsample.R | 6 +++++- man/downsample_matrix.Rd | 6 +++++- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2d6cff1..9e2498f 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: tglkmeans Title: Efficient Implementation of K-Means++ Algorithm -Version: 0.4.0 +Version: 0.5.0 Authors@R: c( person("Aviezer", "Lifshitz", , "aviezer.lifshitz@weizmann.ac.il", role = c("aut", "cre")), person("Amos", "Tanay", role = "aut"), diff --git a/NEWS.md b/NEWS.md index 7844b04..938561e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# tglkmeans 0.5.0 + +* Added `dowsample_matrix` function to downsample the columns of a count matrix to a target number. + # tglkmeans 0.4.0 * Default of `id_column` parameter was changed to `FALSE`. Note that this is a breaking change, and if you want to use an id column, you need to set it explicitly to `TRUE`. diff --git a/R/downsample.R b/R/downsample.R index 292f277..d6d3f4b 100644 --- a/R/downsample.R +++ b/R/downsample.R @@ -1,4 +1,4 @@ -#' Downsample a matrix to a target number of in each column +#' Downsample the columns of a matrix to a target number #' #' @description This function takes a matrix and downsamples it to a target number of samples. #' It uses a random seed for reproducibility and allows for removing columns with @@ -15,6 +15,10 @@ #' @return The downsampled matrix #' #' @examples +#' \dontshow{ +#' tglkmeans.set_parallel(1) +#' } +#' #' mat <- matrix(1:12, nrow = 4) #' downsample_matrix(mat, 2) #' diff --git a/man/downsample_matrix.Rd b/man/downsample_matrix.Rd index f590673..db41148 100644 --- a/man/downsample_matrix.Rd +++ b/man/downsample_matrix.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/downsample.R \name{downsample_matrix} \alias{downsample_matrix} -\title{Downsample a matrix to a target number of in each column} +\title{Downsample the columns of a matrix to a target number} \usage{ downsample_matrix( mat, @@ -34,6 +34,10 @@ It uses a random seed for reproducibility and allows for removing columns with small sums. } \examples{ +\dontshow{ +tglkmeans.set_parallel(1) +} + mat <- matrix(1:12, nrow = 4) downsample_matrix(mat, 2) From 38f5298b00c75f8f273c6503ec7d4315ad30cf86 Mon Sep 17 00:00:00 2001 From: aviezerl Date: Fri, 29 Dec 2023 11:11:31 +0200 Subject: [PATCH 08/10] removed unused private member --- src/DownsampleWorker.cpp | 4 ++-- src/DownsampleWorker.h | 3 +-- src/downsample.cpp | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/DownsampleWorker.cpp b/src/DownsampleWorker.cpp index fc1b93e..4f047f7 100644 --- a/src/DownsampleWorker.cpp +++ b/src/DownsampleWorker.cpp @@ -129,8 +129,8 @@ void DownsampleWorker::operator()(std::size_t begin, std::size_t end) { } DownsampleWorkerSparse::DownsampleWorkerSparse(const Rcpp::IntegerVector& i, const Rcpp::IntegerVector& p, const Rcpp::IntegerVector& x, - Rcpp::IntegerVector& out_x, int nrows, int ncols, int samples, unsigned int random_seed) - : input_i(i), input_p(p), input_x(x), output_x(out_x), nrows(nrows), ncols(ncols), samples(samples), random_seed(random_seed) {} + Rcpp::IntegerVector& out_x, int ncols, int samples, unsigned int random_seed) + : input_i(i), input_p(p), input_x(x), output_x(out_x), ncols(ncols), samples(samples), random_seed(random_seed) {} void DownsampleWorkerSparse::operator()(std::size_t begin, std::size_t end) { for (std::size_t col = begin; col < end; ++col) { diff --git a/src/DownsampleWorker.h b/src/DownsampleWorker.h index 5c6cf08..b7936b4 100644 --- a/src/DownsampleWorker.h +++ b/src/DownsampleWorker.h @@ -25,14 +25,13 @@ class DownsampleWorkerSparse : public RcppParallel::Worker { Rcpp::IntegerVector input_p; Rcpp::IntegerVector input_x; Rcpp::IntegerVector output_x; - int nrows; int ncols; int samples; unsigned int random_seed; public: DownsampleWorkerSparse(const Rcpp::IntegerVector& i, const Rcpp::IntegerVector& p, const Rcpp::IntegerVector& x, - Rcpp::IntegerVector& out_x, int nrows, int ncols, int samples, unsigned int random_seed); + Rcpp::IntegerVector& out_x, int ncols, int samples, unsigned int random_seed); // Parallel operator void operator()(std::size_t begin, std::size_t end) override; diff --git a/src/downsample.cpp b/src/downsample.cpp index 6019819..f4c2484 100644 --- a/src/downsample.cpp +++ b/src/downsample.cpp @@ -32,7 +32,7 @@ Rcpp::S4 rcpp_downsample_sparse(Rcpp::S4 matrix, int samples, unsigned int rando Rcpp::IntegerVector out_x(x.size()); // Create and run the DownsampleWorkerSparse - DownsampleWorkerSparse worker(i, p, x, out_x, nrows, ncols, samples, random_seed); + DownsampleWorkerSparse worker(i, p, x, out_x, ncols, samples, random_seed); RcppParallel::parallelFor(0, ncols, worker); // Create a new dgCMatrix object for the output From f3f1e2c8a1a8159608dbc6fbb555164e53b50786 Mon Sep 17 00:00:00 2001 From: aviezerl Date: Fri, 29 Dec 2023 11:13:54 +0200 Subject: [PATCH 09/10] removed another unused variable --- src/DownsampleWorker.cpp | 4 ++-- src/DownsampleWorker.h | 3 +-- src/downsample.cpp | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/DownsampleWorker.cpp b/src/DownsampleWorker.cpp index 4f047f7..b4e6507 100644 --- a/src/DownsampleWorker.cpp +++ b/src/DownsampleWorker.cpp @@ -129,8 +129,8 @@ void DownsampleWorker::operator()(std::size_t begin, std::size_t end) { } DownsampleWorkerSparse::DownsampleWorkerSparse(const Rcpp::IntegerVector& i, const Rcpp::IntegerVector& p, const Rcpp::IntegerVector& x, - Rcpp::IntegerVector& out_x, int ncols, int samples, unsigned int random_seed) - : input_i(i), input_p(p), input_x(x), output_x(out_x), ncols(ncols), samples(samples), random_seed(random_seed) {} + Rcpp::IntegerVector& out_x, int samples, unsigned int random_seed) + : input_i(i), input_p(p), input_x(x), output_x(out_x), samples(samples), random_seed(random_seed) {} void DownsampleWorkerSparse::operator()(std::size_t begin, std::size_t end) { for (std::size_t col = begin; col < end; ++col) { diff --git a/src/DownsampleWorker.h b/src/DownsampleWorker.h index b7936b4..53c944c 100644 --- a/src/DownsampleWorker.h +++ b/src/DownsampleWorker.h @@ -25,13 +25,12 @@ class DownsampleWorkerSparse : public RcppParallel::Worker { Rcpp::IntegerVector input_p; Rcpp::IntegerVector input_x; Rcpp::IntegerVector output_x; - int ncols; int samples; unsigned int random_seed; public: DownsampleWorkerSparse(const Rcpp::IntegerVector& i, const Rcpp::IntegerVector& p, const Rcpp::IntegerVector& x, - Rcpp::IntegerVector& out_x, int ncols, int samples, unsigned int random_seed); + Rcpp::IntegerVector& out_x, int samples, unsigned int random_seed); // Parallel operator void operator()(std::size_t begin, std::size_t end) override; diff --git a/src/downsample.cpp b/src/downsample.cpp index f4c2484..b2da02f 100644 --- a/src/downsample.cpp +++ b/src/downsample.cpp @@ -32,7 +32,7 @@ Rcpp::S4 rcpp_downsample_sparse(Rcpp::S4 matrix, int samples, unsigned int rando Rcpp::IntegerVector out_x(x.size()); // Create and run the DownsampleWorkerSparse - DownsampleWorkerSparse worker(i, p, x, out_x, ncols, samples, random_seed); + DownsampleWorkerSparse worker(i, p, x, out_x, samples, random_seed); RcppParallel::parallelFor(0, ncols, worker); // Create a new dgCMatrix object for the output From c27516d143953a2253f962d0a077e4d64b7fefa9 Mon Sep 17 00:00:00 2001 From: aviezerl Date: Fri, 29 Dec 2023 11:15:16 +0200 Subject: [PATCH 10/10] updated _pkgdown --- _pkgdown.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/_pkgdown.yml b/_pkgdown.yml index 30c405b..f55ec4c 100755 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -27,6 +27,10 @@ reference: - contents: - TGL_kmeans - TGL_kmeans_tidy +- title: Matrix + desc: matrix utility functions +- contents: + - downsample_matrix - title: misc desc: utility functions - contents: