Merge pull request #15 from tanaylab/feat@downsample-matrix

Feat@downsample matrix
tanaylab · Dec 30, 2023 · c65b008 · c65b008
2 parents bad1652 + c27516d
commit c65b008
Show file tree

Hide file tree

Showing 14 changed files with 656 additions and 3 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: tglkmeans
 Title: Efficient Implementation of K-Means++ Algorithm
-Version: 0.4.0
+Version: 0.5.0
 Authors@R: c(
     person("Aviezer", "Lifshitz", , "[email protected]", role = c("aut", "cre")),
     person("Amos", "Tanay", role = "aut"),
@@ -30,6 +30,8 @@ Imports:
     future,
     ggplot2 (>= 2.2.0),
     magrittr,
+    Matrix,
+    methods,
     parallel (>= 3.3.2),
     plyr (>= 1.8.4),
     purrr (>= 0.2.0),

diff --git a/NAMESPACE b/NAMESPACE
@@ -3,6 +3,7 @@
 export("%>%")
 export(TGL_kmeans)
 export(TGL_kmeans_tidy)
+export(downsample_matrix)
 export(simulate_data)
 export(tglkmeans.set_parallel)
 import(dplyr)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
+# tglkmeans 0.5.0
+
+* Added `dowsample_matrix` function to downsample the columns of a count matrix to a target number. 
+
 # tglkmeans 0.4.0
 
 * Default of `id_column` parameter was changed to `FALSE`. Note that this is a breaking change, and if you want to use an id column, you need to set it explicitly to `TRUE`.

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -13,3 +13,11 @@ TGL_kmeans_cpp <- function(ids, mat, k, metric, max_iter = 40, min_delta = 0.000
     .Call('_tglkmeans_TGL_kmeans_cpp', PACKAGE = 'tglkmeans', ids, mat, k, metric, max_iter, min_delta, use_cpp_random, seed)
 }
 
+downsample_matrix_cpp <- function(input, samples, random_seed) {
+    .Call('_tglkmeans_downsample_matrix_cpp', PACKAGE = 'tglkmeans', input, samples, random_seed)
+}
+
+rcpp_downsample_sparse <- function(matrix, samples, random_seed) {
+    .Call('_tglkmeans_rcpp_downsample_sparse', PACKAGE = 'tglkmeans', matrix, samples, random_seed)
+}
+
diff --git a/R/downsample.R b/R/downsample.R
@@ -0,0 +1,109 @@
+#' Downsample the columns of a matrix to a target number
+#'
+#' @description This function takes a matrix and downsamples it to a target number of samples.
+#' It uses a random seed for reproducibility and allows for removing columns with
+#' small sums.
+#'
+#' @param mat An integer matrix to be downsampled. Can be a matrix or sparse matrix (dgCMatrix).
+#' If the matrix contains NAs, the function will run significantly slower. Values that are
+#' not integers will be coerced to integers using {.code floor()}.
+#' @param target_n The target number of samples to downsample to.
+#' @param target_q A target quantile of sums to downsample to. Only one of {.field target_n} or {.field target_q} can be provided.
+#' @param seed The random seed for reproducibility (default is NULL)
+#' @param remove_columns Logical indicating whether to remove columns with small sums (default is FALSE)
+#'
+#' @return The downsampled matrix
+#'
+#' @examples
+#' \dontshow{
+#' tglkmeans.set_parallel(1)
+#' }
+#'
+#' mat <- matrix(1:12, nrow = 4)
+#' downsample_matrix(mat, 2)
+#'
+#' # Remove columns with small sums
+#' downsample_matrix(mat, 12, remove_columns = TRUE)
+#'
+#' # sparse matrix
+#' mat_sparse <- Matrix::Matrix(mat, sparse = TRUE)
+#' downsample_matrix(mat_sparse, 2)
+#'
+#' # with a quantile
+#' downsample_matrix(mat, target_q = 0.5)
+#'
+#' @export
+downsample_matrix <- function(mat, target_n = NULL, target_q = NULL, seed = NULL, remove_columns = FALSE) {
+    if (is.null(target_n) && is.null(target_q)) {
+        cli_abort("Either {.field target_n} or {.field target_q} must be provided.")
+    } else if (!is.null(target_n) && !is.null(target_q)) {
+        cli_abort("Only one of {.field target_n} or {.field target_q} can be provided.")
+    }
+
+    sums <- colsums_matrix(mat)
+    if (!is.null(target_q)) {
+        target_n <- round(stats::quantile(sums, target_q))
+        cli::cli_alert_info("Using {.val {target_n}} as the target number.")
+    }
+
+    if (is.null(seed)) {
+        seed <- sample(1:10000, 1)
+        cli::cli_alert_warning("No seed provided. Using {.val {seed}}.")
+    } else if (!is.numeric(seed) || seed <= 0 || seed != as.integer(seed)) {
+        cli_abort("{.field seed} must be a positive integer.")
+    }
+
+    if (!is.logical(remove_columns)) {
+        cli_abort("{.field remove_columns} must be a logical value.")
+    }
+
+    if (target_n <= 0 || target_n != as.integer(target_n)) {
+        cli_abort("{.field target_n} must be a positive integer.")
+    }
+
+    # replace NAs with 0s for the cpp code
+    has_nas <- FALSE
+    if (any(is.na(mat))) {
+        has_nas <- TRUE
+        cli_warn("Input matrix contains NAs. Processing would be significantly slower.")
+        orig_mat <- mat
+        mat[is.na(mat)] <- 0
+    }
+
+    if (methods::is(mat, "dgCMatrix")) {
+        ds_mat <- rcpp_downsample_sparse(mat, target_n, seed)
+    } else if (is.matrix(mat)) {
+        ds_mat <- downsample_matrix_cpp(mat, target_n, seed)
+    }
+
+    small_cols <- sums < target_n
+    if (any(small_cols)) {
+        if (remove_columns) {
+            ds_mat <- ds_mat[, !small_cols, drop = FALSE]
+            if (has_nas) {
+                orig_mat <- orig_mat[, !small_cols, drop = FALSE]
+            }
+            cli_alert_info("Removed {.val {sum(small_cols)}} columns with a sum smaller than {.val {target_n}}.")
+        } else {
+            cli_warn("{.val {sum(small_cols)}} columns have a sum smaller than {.val {target_n}}. These columns were not changed. To remove them, set {.field remove_columns=TRUE}.")
+        }
+    }
+
+    if (has_nas) {
+        # put back the NAs
+        ds_mat[is.na(orig_mat)] <- NA
+    }
+
+
+    return(ds_mat)
+}
+
+colsums_matrix <- function(mat) {
+    if (methods::is(mat, "dgCMatrix")) {
+        return(Matrix::colSums(mat, na.rm = TRUE))
+    } else if (is.matrix(mat)) {
+        return(colSums(mat, na.rm = TRUE))
+    } else {
+        cli_abort("Input must be a matrix or a sparse matrix (dgCMatrix). class of {.field mat} is {.val {class(mat)}}.")
+    }
+}
diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -27,6 +27,10 @@ reference:
 - contents:
   - TGL_kmeans
   - TGL_kmeans_tidy
+- title: Matrix
+  desc: matrix utility functions
+- contents: 
+  - downsample_matrix
 - title: misc
   desc: utility functions
 - contents: 

diff --git a/man/TGL_kmeans.Rd b/man/TGL_kmeans.Rd
diff --git a/man/TGL_kmeans_tidy.Rd b/man/TGL_kmeans_tidy.Rd
diff --git a/man/downsample_matrix.Rd b/man/downsample_matrix.Rd
diff --git a/src/DownsampleWorker.cpp b/src/DownsampleWorker.cpp
@@ -0,0 +1,152 @@
+#include <vector>
+#include <algorithm>
+#include <cassert>
+#include <random>
+#include <Rcpp.h>
+#include <RcppParallel.h>
+#include "DownsampleWorker.h"
+
+typedef float float32_t;
+typedef double float64_t;
+typedef unsigned char uint8_t;
+typedef unsigned int uint_t;
+
+static size_t
+ceil_power_of_two(const size_t size) {
+    return size_t(1) << size_t(ceil(log2(float64_t(size))));
+}
+
+template<typename D>
+static void initialize_tree(const std::vector<D>& input, std::vector<size_t>& tree) {
+    assert(input.size() >= 2); 
+
+    size_t input_size = ceil_power_of_two(input.size());
+    tree.resize(2 * input_size - 1); // Resize the tree to the necessary size
+
+    // Copy input to the beginning of the tree and fill the rest with zeros
+    std::copy(input.begin(), input.end(), tree.begin());
+    std::fill(tree.begin() + input.size(), tree.begin() + input_size, 0);
+
+    // Iteratively build the tree
+    size_t tree_index = 0;
+    while (input_size > 1) {
+        size_t half_size = input_size / 2;
+        for (size_t index = 0; index < half_size; ++index) {
+            const auto left = tree[tree_index + index * 2];
+            const auto right = tree[tree_index + index * 2 + 1];
+            tree[tree_index + input_size + index] = left + right;
+
+            assert(left >= 0);
+            assert(right >= 0);
+            assert(tree[tree_index + input_size + index] == size_t(left) + size_t(right));
+        }
+        tree_index += input_size;
+        input_size = half_size;
+    }
+    assert(tree.size() == 2 * input.size() - 1);
+}
+
+static size_t random_sample(std::vector<size_t>& tree, ssize_t random) {
+    size_t size_of_level = 1;
+    ssize_t base_of_level = tree.size() - 1;
+    size_t index_in_level = 0;
+    size_t index_in_tree = base_of_level + index_in_level;
+
+    while (true) {
+        assert(index_in_tree == base_of_level + index_in_level);
+        assert(tree[index_in_tree] > random);
+
+        --tree[index_in_tree];
+        size_of_level *= 2;
+        base_of_level -= size_of_level;
+
+        if (base_of_level < 0) {
+            return index_in_level;
+        }
+
+        index_in_level *= 2;
+        index_in_tree = base_of_level + index_in_level;
+        ssize_t right_random = random - ssize_t(tree[index_in_tree]);
+
+        assert(tree[base_of_level + index_in_level] + tree[base_of_level + index_in_level + 1] ==
+               tree[base_of_level + size_of_level + index_in_level / 2] + 1);
+
+        if (right_random >= 0) {
+            ++index_in_level;
+            ++index_in_tree;
+            assert(index_in_level < size_of_level);
+            random = right_random;
+        }
+    }
+}
+
+template<typename D, typename O>
+static void downsample_slice(const std::vector<D>& input, std::vector<O>& output, const int32_t samples, const size_t random_seed) {
+    assert(output.size() == input.size()); 
+
+    if (samples < 0 || input.size() == 0) {
+        return;
+    }
+
+    if (input.size() == 1) {
+        output[0] = O(static_cast<double>(samples) < static_cast<double>(input[0]) ? samples : input[0]);
+        return;
+    }
+
+    size_t input_size = ceil_power_of_two(input.size());
+    std::vector<size_t> tree(2 * input_size - 1);
+    initialize_tree(input, tree);
+    size_t& total = tree[tree.size() - 1];
+
+    if (total <= static_cast<size_t>(samples)) {
+        std::copy(input.begin(), input.end(), output.begin());
+        return;
+    }
+
+    std::fill(output.begin(), output.end(), O(0));
+
+    std::minstd_rand random(random_seed);
+    for (size_t index = 0; index < static_cast<size_t>(samples); ++index) {
+        size_t sampled_index = random_sample(tree, random() % total);
+        if (sampled_index < output.size()) {
+            ++output[sampled_index];
+        }
+    }
+}
+
+DownsampleWorker::DownsampleWorker(const Rcpp::IntegerMatrix& input, Rcpp::IntegerMatrix& output, int samples, unsigned int random_seed)
+    : input_matrix(input), output_matrix(output), samples(samples), random_seed(random_seed) {}
+
+void DownsampleWorker::operator()(std::size_t begin, std::size_t end) {
+    for (std::size_t col = begin; col < end; ++col) {        
+        std::vector<int> input_vec(input_matrix.column(col).begin(), input_matrix.column(col).end());
+        std::vector<int> output_vec(input_vec.size(), 0);
+
+        downsample_slice(input_vec, output_vec, samples, random_seed);
+
+        std::copy(output_vec.begin(), output_vec.end(), output_matrix.column(col).begin());
+    }
+}
+
+DownsampleWorkerSparse::DownsampleWorkerSparse(const Rcpp::IntegerVector& i, const Rcpp::IntegerVector& p, const Rcpp::IntegerVector& x, 
+                                               Rcpp::IntegerVector& out_x, int samples, unsigned int random_seed)
+    : input_i(i), input_p(p), input_x(x), output_x(out_x), samples(samples), random_seed(random_seed) {}
+
+void DownsampleWorkerSparse::operator()(std::size_t begin, std::size_t end) {
+    for (std::size_t col = begin; col < end; ++col) {
+        // Extract the current column from the sparse matrix
+        std::vector<int> input_vec;
+        for (int idx = input_p[col]; idx < input_p[col + 1]; ++idx) {
+            input_vec.push_back(input_x[idx]);
+        }
+
+        std::vector<int> output_vec(input_vec.size(), 0);
+
+        downsample_slice(input_vec, output_vec, samples, random_seed);
+
+        // Store results in the output sparse matrix
+        for (int idx = input_p[col], out_idx = 0; idx < input_p[col + 1]; ++idx, ++out_idx) {
+            output_x[idx] = output_vec[out_idx];
+        }
+    }
+}