From 2a21c4e25544498e7bb3d72aa5ab72ef24b8cb24 Mon Sep 17 00:00:00 2001
From: aviezerl <aviezer.lifshitz@weizmann.ac.il>
Date: Thu, 28 Dec 2023 23:27:56 +0200
Subject: [PATCH 01/10] Added downsampling for dense matrices

---
 NAMESPACE                        |   1 +
 R/RcppExports.R                  |   4 +
 R/downsample.R                   |  40 ++++++++++
 man/TGL_kmeans.Rd                |   2 +-
 man/TGL_kmeans_tidy.Rd           |   2 +-
 man/downsample_matrix.Rd         |  33 ++++++++
 src/DownsampleWorker.cpp         | 131 +++++++++++++++++++++++++++++++
 src/DownsampleWorker.h           |  22 ++++++
 src/RcppExports.cpp              |  14 ++++
 src/downsample.cpp               |  19 +++++
 tests/testthat/test-downsample.R |  23 ++++++
 11 files changed, 289 insertions(+), 2 deletions(-)
 create mode 100644 R/downsample.R
 create mode 100644 man/downsample_matrix.Rd
 create mode 100644 src/DownsampleWorker.cpp
 create mode 100644 src/DownsampleWorker.h
 create mode 100644 src/downsample.cpp
 create mode 100644 tests/testthat/test-downsample.R

diff --git a/NAMESPACE b/NAMESPACE
index afb2449..4e4625d 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -3,6 +3,7 @@
 export("%>%")
 export(TGL_kmeans)
 export(TGL_kmeans_tidy)
+export(downsample_matrix)
 export(simulate_data)
 export(tglkmeans.set_parallel)
 import(dplyr)
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 8e3121b..a09d6a4 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -13,3 +13,7 @@ TGL_kmeans_cpp <- function(ids, mat, k, metric, max_iter = 40, min_delta = 0.000
     .Call('_tglkmeans_TGL_kmeans_cpp', PACKAGE = 'tglkmeans', ids, mat, k, metric, max_iter, min_delta, use_cpp_random, seed)
 }
 
+downsample_matrix_cpp <- function(input, samples, random_seed) {
+    .Call('_tglkmeans_downsample_matrix_cpp', PACKAGE = 'tglkmeans', input, samples, random_seed)
+}
+
diff --git a/R/downsample.R b/R/downsample.R
new file mode 100644
index 0000000..c8ce4ec
--- /dev/null
+++ b/R/downsample.R
@@ -0,0 +1,40 @@
+#' Downsample a matrix to a target number of in each column
+#'
+#' @description This function takes a matrix and downsamples it to a target number of samples.
+#' It uses a random seed for reproducibility and allows for removing columns with
+#' small sums.
+#'
+#' @param mat The input matrix to be downsampled
+#' @param target_n The target number of samples to downsample to
+#' @param seed The random seed for reproducibility (default is NULL)
+#' @param remove_columns Logical indicating whether to remove columns with small sums (default is FALSE)
+#'
+#' @return The downsampled matrix
+#'
+#' @examples
+#' mat <- matrix(1:12, nrow = 4)
+#' downsample_matrix(mat, 2)
+#'
+#' # Remove columns with small sums
+#' downsample_matrix(mat, 12, remove_columns = TRUE)
+#'
+#' @export
+downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE) {
+    if (is.null(seed)) {
+        seed <- sample(1:10000, 1)
+    }
+
+    ds_mat <- downsample_matrix_cpp(mat, target_n, seed)
+
+    sums <- colSums(ds_mat, na.rm = TRUE)
+    small_cols <- sums < target_n
+    if (any(small_cols)) {
+        if (remove_columns) {
+            ds_mat <- ds_mat[, !small_cols]
+        } else {
+            cli_warn("Some columns ({which(small_cols)}) have a sum<{.val {target_n}}. These columns were not changed. To remove them, set {.field remove_columns=TRUE}.")
+        }
+    }
+
+    return(ds_mat)
+}
diff --git a/man/TGL_kmeans.Rd b/man/TGL_kmeans.Rd
index a3ef07a..f3679b2 100644
--- a/man/TGL_kmeans.Rd
+++ b/man/TGL_kmeans.Rd
@@ -9,7 +9,7 @@ TGL_kmeans(
   k,
   metric = "euclid",
   max_iter = 40,
-  min_delta = 1e-04,
+  min_delta = 0.0001,
   verbose = FALSE,
   keep_log = FALSE,
   id_column = FALSE,
diff --git a/man/TGL_kmeans_tidy.Rd b/man/TGL_kmeans_tidy.Rd
index f304ba8..567060e 100644
--- a/man/TGL_kmeans_tidy.Rd
+++ b/man/TGL_kmeans_tidy.Rd
@@ -9,7 +9,7 @@ TGL_kmeans_tidy(
   k,
   metric = "euclid",
   max_iter = 40,
-  min_delta = 1e-04,
+  min_delta = 0.0001,
   verbose = FALSE,
   keep_log = FALSE,
   id_column = FALSE,
diff --git a/man/downsample_matrix.Rd b/man/downsample_matrix.Rd
new file mode 100644
index 0000000..a697c77
--- /dev/null
+++ b/man/downsample_matrix.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/downsample.R
+\name{downsample_matrix}
+\alias{downsample_matrix}
+\title{Downsample a matrix to a target number of in each column}
+\usage{
+downsample_matrix(mat, target_n, seed = NULL, remove_columns = FALSE)
+}
+\arguments{
+\item{mat}{The input matrix to be downsampled}
+
+\item{target_n}{The target number of samples to downsample to}
+
+\item{seed}{The random seed for reproducibility (default is NULL)}
+
+\item{remove_columns}{Logical indicating whether to remove columns with small sums (default is FALSE)}
+}
+\value{
+The downsampled matrix
+}
+\description{
+This function takes a matrix and downsamples it to a target number of samples.
+It uses a random seed for reproducibility and allows for removing columns with
+small sums.
+}
+\examples{
+mat <- matrix(1:12, nrow = 4)
+downsample_matrix(mat, 2)
+
+# Remove columns with small sums
+downsample_matrix(mat, 12, remove_columns = TRUE)
+
+}
diff --git a/src/DownsampleWorker.cpp b/src/DownsampleWorker.cpp
new file mode 100644
index 0000000..ce1070a
--- /dev/null
+++ b/src/DownsampleWorker.cpp
@@ -0,0 +1,131 @@
+#include <vector>
+#include <algorithm>
+#include <cassert>
+#include <random>
+#include <Rcpp.h>
+#include <RcppParallel.h>
+#include "DownsampleWorker.h"
+
+typedef float float32_t;
+typedef double float64_t;
+typedef unsigned char uint8_t;
+typedef unsigned int uint_t;
+
+static size_t
+ceil_power_of_two(const size_t size) {
+    return size_t(1) << size_t(ceil(log2(float64_t(size))));
+}
+
+template<typename D>
+static void initialize_tree(const std::vector<D>& input, std::vector<size_t>& tree) {
+    assert(input.size() >= 2); 
+
+    size_t input_size = ceil_power_of_two(input.size());
+    tree.resize(2 * input_size - 1); // Resize the tree to the necessary size
+
+    // Copy input to the beginning of the tree and fill the rest with zeros
+    std::copy(input.begin(), input.end(), tree.begin());
+    std::fill(tree.begin() + input.size(), tree.begin() + input_size, 0);
+
+    // Iteratively build the tree
+    size_t tree_index = 0;
+    while (input_size > 1) {
+        size_t half_size = input_size / 2;
+        for (size_t index = 0; index < half_size; ++index) {
+            const auto left = tree[tree_index + index * 2];
+            const auto right = tree[tree_index + index * 2 + 1];
+            tree[tree_index + input_size + index] = left + right;
+
+            assert(left >= 0);
+            assert(right >= 0);
+            assert(tree[tree_index + input_size + index] == size_t(left) + size_t(right));
+        }
+        tree_index += input_size;
+        input_size = half_size;
+    }
+    assert(tree.size() == 2 * input.size() - 1);
+}
+
+static size_t random_sample(std::vector<size_t>& tree, ssize_t random) {
+    size_t size_of_level = 1;
+    ssize_t base_of_level = tree.size() - 1;
+    size_t index_in_level = 0;
+    size_t index_in_tree = base_of_level + index_in_level;
+
+    while (true) {
+        assert(index_in_tree == base_of_level + index_in_level);
+        assert(tree[index_in_tree] > random);
+
+        --tree[index_in_tree];
+        size_of_level *= 2;
+        base_of_level -= size_of_level;
+
+        if (base_of_level < 0) {
+            return index_in_level;
+        }
+
+        index_in_level *= 2;
+        index_in_tree = base_of_level + index_in_level;
+        ssize_t right_random = random - ssize_t(tree[index_in_tree]);
+
+        assert(tree[base_of_level + index_in_level] + tree[base_of_level + index_in_level + 1] ==
+               tree[base_of_level + size_of_level + index_in_level / 2] + 1);
+
+        if (right_random >= 0) {
+            ++index_in_level;
+            ++index_in_tree;
+            assert(index_in_level < size_of_level);
+            random = right_random;
+        }
+    }
+}
+
+template<typename D, typename O>
+static void downsample_slice(const std::vector<D>& input, std::vector<O>& output, const int32_t samples, const size_t random_seed) {
+    assert(output.size() == input.size()); // Replacing FastAssertCompare
+
+    if (samples < 0 || input.size() == 0) {
+        return;
+    }
+
+    if (input.size() == 1) {
+        output[0] = O(static_cast<double>(samples) < static_cast<double>(input[0]) ? samples : input[0]);
+        return;
+    }
+
+    size_t input_size = ceil_power_of_two(input.size());
+    std::vector<size_t> tree(2 * input_size - 1);
+    initialize_tree(input, tree);
+    size_t& total = tree[tree.size() - 1];
+
+    if (total <= static_cast<size_t>(samples)) {
+        std::copy(input.begin(), input.end(), output.begin());
+        return;
+    }
+
+    std::fill(output.begin(), output.end(), O(0));
+
+    std::minstd_rand random(random_seed);
+    for (size_t index = 0; index < static_cast<size_t>(samples); ++index) {
+        size_t sampled_index = random_sample(tree, random() % total);
+        if (sampled_index < output.size()) {
+            ++output[sampled_index];
+        }
+    }
+}
+
+
+DownsampleWorker::DownsampleWorker(const Rcpp::IntegerMatrix& input, Rcpp::IntegerMatrix& output, int samples, unsigned int random_seed)
+    : input_matrix(input), output_matrix(output), samples(samples), random_seed(random_seed) {}
+
+void DownsampleWorker::operator()(std::size_t begin, std::size_t end) {
+    for (std::size_t col = begin; col < end; ++col) {        
+        std::vector<int> input_vec(input_matrix.column(col).begin(), input_matrix.column(col).end());
+        std::vector<int> output_vec(input_vec.size(), 0);
+
+        downsample_slice(input_vec, output_vec, samples, random_seed);
+
+        std::copy(output_vec.begin(), output_vec.end(), output_matrix.column(col).begin());
+    }
+}
+
diff --git a/src/DownsampleWorker.h b/src/DownsampleWorker.h
new file mode 100644
index 0000000..f480568
--- /dev/null
+++ b/src/DownsampleWorker.h
@@ -0,0 +1,22 @@
+#ifndef DOWNSAMPLEWORKER_H
+#define DOWNSAMPLEWORKER_H
+
+#include <Rcpp.h>
+#include <RcppParallel.h>
+#include <vector>
+
+class DownsampleWorker : public RcppParallel::Worker {
+private:
+    RcppParallel::RMatrix<int> input_matrix;
+    RcppParallel::RMatrix<int> output_matrix;
+    int samples;
+    unsigned int random_seed;
+
+public:    
+    DownsampleWorker(const Rcpp::IntegerMatrix& input, Rcpp::IntegerMatrix& output, int samples, unsigned int random_seed);
+
+    // Parallel operator
+    void operator()(std::size_t begin, std::size_t end) override;
+};
+
+#endif // DOWNSAMPLEWORKER_H
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index d189d69..4b964d6 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -51,11 +51,25 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// downsample_matrix_cpp
+Rcpp::IntegerMatrix downsample_matrix_cpp(Rcpp::IntegerMatrix input, int samples, unsigned int random_seed);
+RcppExport SEXP _tglkmeans_downsample_matrix_cpp(SEXP inputSEXP, SEXP samplesSEXP, SEXP random_seedSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< Rcpp::IntegerMatrix >::type input(inputSEXP);
+    Rcpp::traits::input_parameter< int >::type samples(samplesSEXP);
+    Rcpp::traits::input_parameter< unsigned int >::type random_seed(random_seedSEXP);
+    rcpp_result_gen = Rcpp::wrap(downsample_matrix_cpp(input, samples, random_seed));
+    return rcpp_result_gen;
+END_RCPP
+}
 
 static const R_CallMethodDef CallEntries[] = {
     {"_tglkmeans_reduce_coclust", (DL_FUNC) &_tglkmeans_reduce_coclust, 3},
     {"_tglkmeans_reduce_num_trials", (DL_FUNC) &_tglkmeans_reduce_num_trials, 2},
     {"_tglkmeans_TGL_kmeans_cpp", (DL_FUNC) &_tglkmeans_TGL_kmeans_cpp, 8},
+    {"_tglkmeans_downsample_matrix_cpp", (DL_FUNC) &_tglkmeans_downsample_matrix_cpp, 3},
     {NULL, NULL, 0}
 };
 
diff --git a/src/downsample.cpp b/src/downsample.cpp
new file mode 100644
index 0000000..4a43833
--- /dev/null
+++ b/src/downsample.cpp
@@ -0,0 +1,19 @@
+#include <vector>
+#include <Rcpp.h>
+#include <RcppParallel.h>
+#include "DownsampleWorker.h"
+
+typedef float float32_t;
+typedef double float64_t;
+typedef unsigned char uint8_t;
+typedef unsigned int uint_t;
+
+// [[Rcpp::export]]
+Rcpp::IntegerMatrix downsample_matrix_cpp(Rcpp::IntegerMatrix input, int samples, unsigned int random_seed) {
+    Rcpp::IntegerMatrix output(input.nrow(), input.ncol());
+
+    DownsampleWorker worker(input, output, samples, random_seed);
+    RcppParallel::parallelFor(0, input.ncol(), worker);
+
+    return output;
+}
diff --git a/tests/testthat/test-downsample.R b/tests/testthat/test-downsample.R
new file mode 100644
index 0000000..e2b359c
--- /dev/null
+++ b/tests/testthat/test-downsample.R
@@ -0,0 +1,23 @@
+
+test_that("downsample_matrix returns the correct number of samples", {
+    mat <- matrix(1:12, nrow = 4)
+    target_n <- 2
+    ds_mat <- downsample_matrix(mat, target_n)
+    expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n))
+})
+
+test_that("downsample_matrix removes columns with small sums when remove_columns is TRUE", {
+    mat <- matrix(1:12, nrow = 4)
+    target_n <- 12
+    ds_mat <- downsample_matrix(mat, target_n, remove_columns = TRUE)
+    expect_equal(ncol(ds_mat), 2)
+    expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n))
+})
+
+test_that("downsample_matrix does not remove columns with small sums when remove_columns is FALSE", {
+    mat <- matrix(1:12, nrow = 4)
+    target_n <- 12
+    expect_warning(ds_mat <- downsample_matrix(mat, target_n, remove_columns = FALSE))
+    expect_equal(ncol(ds_mat), ncol(mat))
+    expect_true(all(colSums(ds_mat[, -1], na.rm = TRUE) == target_n))
+})
\ No newline at end of file

From 803eab43cd93651978c66ea2cf777b243ce52f7c Mon Sep 17 00:00:00 2001
From: aviezerl <aviezer.lifshitz@weizmann.ac.il>
Date: Thu, 28 Dec 2023 23:41:25 +0200
Subject: [PATCH 02/10] deal with NA

---
 R/downsample.R                   |  9 ++++++++-
 src/DownsampleWorker.cpp         |  2 +-
 tests/testthat/test-downsample.R | 14 ++++++++++++--
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/R/downsample.R b/R/downsample.R
index c8ce4ec..29e50c3 100644
--- a/R/downsample.R
+++ b/R/downsample.R
@@ -24,17 +24,24 @@ downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE
         seed <- sample(1:10000, 1)
     }
 
+    # replace NAs with 0s for the cpp code
+    orig_mat <- mat
+    mat[is.na(mat)] <- 0
     ds_mat <- downsample_matrix_cpp(mat, target_n, seed)
 
     sums <- colSums(ds_mat, na.rm = TRUE)
     small_cols <- sums < target_n
     if (any(small_cols)) {
         if (remove_columns) {
-            ds_mat <- ds_mat[, !small_cols]
+            ds_mat <- ds_mat[, !small_cols, drop = FALSE]
+            orig_mat <- orig_mat[, !small_cols, drop = FALSE]
         } else {
             cli_warn("Some columns ({which(small_cols)}) have a sum<{.val {target_n}}. These columns were not changed. To remove them, set {.field remove_columns=TRUE}.")
         }
     }
 
+    # put back the NAs
+    ds_mat[is.na(orig_mat)] <- NA
+
     return(ds_mat)
 }
diff --git a/src/DownsampleWorker.cpp b/src/DownsampleWorker.cpp
index ce1070a..8ec03fe 100644
--- a/src/DownsampleWorker.cpp
+++ b/src/DownsampleWorker.cpp
@@ -82,7 +82,7 @@ static size_t random_sample(std::vector<size_t>& tree, ssize_t random) {
 
 template<typename D, typename O>
 static void downsample_slice(const std::vector<D>& input, std::vector<O>& output, const int32_t samples, const size_t random_seed) {
-    assert(output.size() == input.size()); // Replacing FastAssertCompare
+    assert(output.size() == input.size()); 
 
     if (samples < 0 || input.size() == 0) {
         return;
diff --git a/tests/testthat/test-downsample.R b/tests/testthat/test-downsample.R
index e2b359c..1d3d89f 100644
--- a/tests/testthat/test-downsample.R
+++ b/tests/testthat/test-downsample.R
@@ -1,4 +1,3 @@
-
 test_that("downsample_matrix returns the correct number of samples", {
     mat <- matrix(1:12, nrow = 4)
     target_n <- 2
@@ -20,4 +19,15 @@ test_that("downsample_matrix does not remove columns with small sums when remove
     expect_warning(ds_mat <- downsample_matrix(mat, target_n, remove_columns = FALSE))
     expect_equal(ncol(ds_mat), ncol(mat))
     expect_true(all(colSums(ds_mat[, -1], na.rm = TRUE) == target_n))
-})
\ No newline at end of file
+})
+
+test_that("downsample_matrix returns the correct number of samples when there are NAs", {
+    mat <- matrix(1:12, nrow = 4)
+    mat[1, 1] <- NA
+    target_n <- 2
+    ds_mat <- downsample_matrix(mat, target_n)
+    expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n))
+
+    # make sure the NAs are still there
+    expect_true(all(is.na(ds_mat[1, 1])))
+})

From 5ea568eb663a16673893e7a06d9c7241f9187c67 Mon Sep 17 00:00:00 2001
From: aviezerl <aviezer.lifshitz@weizmann.ac.il>
Date: Fri, 29 Dec 2023 00:34:09 +0200
Subject: [PATCH 03/10] Added support for dgCMatrix

---
 DESCRIPTION                      |  2 ++
 R/RcppExports.R                  |  4 ++++
 R/downsample.R                   | 39 +++++++++++++++++++++++++-------
 man/downsample_matrix.Rd         |  6 ++++-
 src/DownsampleWorker.cpp         | 23 ++++++++++++++++++-
 src/DownsampleWorker.h           | 20 ++++++++++++++++
 src/RcppExports.cpp              | 14 ++++++++++++
 src/downsample.cpp               | 28 +++++++++++++++++++++++
 tests/testthat/test-downsample.R |  2 +-
 9 files changed, 127 insertions(+), 11 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index f8fdbae..2d6cff1 100755
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -30,6 +30,8 @@ Imports:
     future,
     ggplot2 (>= 2.2.0),
     magrittr,
+    Matrix,
+    methods,
     parallel (>= 3.3.2),
     plyr (>= 1.8.4),
     purrr (>= 0.2.0),
diff --git a/R/RcppExports.R b/R/RcppExports.R
index a09d6a4..ff2cd0c 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -17,3 +17,7 @@ downsample_matrix_cpp <- function(input, samples, random_seed) {
     .Call('_tglkmeans_downsample_matrix_cpp', PACKAGE = 'tglkmeans', input, samples, random_seed)
 }
 
+rcpp_downsample_sparse <- function(matrix, samples, random_seed) {
+    .Call('_tglkmeans_rcpp_downsample_sparse', PACKAGE = 'tglkmeans', matrix, samples, random_seed)
+}
+
diff --git a/R/downsample.R b/R/downsample.R
index 29e50c3..9e6d72d 100644
--- a/R/downsample.R
+++ b/R/downsample.R
@@ -4,7 +4,7 @@
 #' It uses a random seed for reproducibility and allows for removing columns with
 #' small sums.
 #'
-#' @param mat The input matrix to be downsampled
+#' @param mat The input matrix to be downsampled. Can be a matrix or sparse matrix (dgCMatrix). If the matrix contains NAs, the function will run significantly slower.
 #' @param target_n The target number of samples to downsample to
 #' @param seed The random seed for reproducibility (default is NULL)
 #' @param remove_columns Logical indicating whether to remove columns with small sums (default is FALSE)
@@ -18,6 +18,10 @@
 #' # Remove columns with small sums
 #' downsample_matrix(mat, 12, remove_columns = TRUE)
 #'
+#' # sparse matrix
+#' mat_sparse <- Matrix::Matrix(mat, sparse = TRUE)
+#' downsample_matrix(mat_sparse, 2)
+#'
 #' @export
 downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE) {
     if (is.null(seed)) {
@@ -25,23 +29,42 @@ downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE
     }
 
     # replace NAs with 0s for the cpp code
-    orig_mat <- mat
-    mat[is.na(mat)] <- 0
-    ds_mat <- downsample_matrix_cpp(mat, target_n, seed)
+    has_nas <- FALSE
+    if (any(is.na(mat))) {
+        has_nas <- TRUE
+        cli_warn("Input matrix contains NAs. Processing would be significantly slower.")
+        orig_mat <- mat
+        mat[is.na(mat)] <- 0
+    }
+
+
+    if (methods::is(mat, "dgCMatrix")) {
+        ds_mat <- rcpp_downsample_sparse(mat, target_n, seed)
+        sums <- Matrix::colSums(ds_mat, na.rm = TRUE)
+    } else if (is.matrix(mat)) {
+        ds_mat <- downsample_matrix_cpp(mat, target_n, seed)
+        sums <- colSums(ds_mat, na.rm = TRUE)
+    } else {
+        cli_abort("Input must be a matrix or a sparse matrix (dgCMatrix). class of {.field mat} is {.val {class(mat)}}.")
+    }
 
-    sums <- colSums(ds_mat, na.rm = TRUE)
     small_cols <- sums < target_n
     if (any(small_cols)) {
         if (remove_columns) {
             ds_mat <- ds_mat[, !small_cols, drop = FALSE]
-            orig_mat <- orig_mat[, !small_cols, drop = FALSE]
+            if (has_nas) {
+                orig_mat <- orig_mat[, !small_cols, drop = FALSE]
+            }
         } else {
             cli_warn("Some columns ({which(small_cols)}) have a sum<{.val {target_n}}. These columns were not changed. To remove them, set {.field remove_columns=TRUE}.")
         }
     }
 
-    # put back the NAs
-    ds_mat[is.na(orig_mat)] <- NA
+    if (has_nas) {
+        # put back the NAs
+        ds_mat[is.na(orig_mat)] <- NA
+    }
+
 
     return(ds_mat)
 }
diff --git a/man/downsample_matrix.Rd b/man/downsample_matrix.Rd
index a697c77..3aa07cf 100644
--- a/man/downsample_matrix.Rd
+++ b/man/downsample_matrix.Rd
@@ -7,7 +7,7 @@
 downsample_matrix(mat, target_n, seed = NULL, remove_columns = FALSE)
 }
 \arguments{
-\item{mat}{The input matrix to be downsampled}
+\item{mat}{The input matrix to be downsampled. Can be a matrix or sparse matrix (dgCMatrix). If the matrix contains NAs, the function will run significantly slower.}
 
 \item{target_n}{The target number of samples to downsample to}
 
@@ -30,4 +30,8 @@ downsample_matrix(mat, 2)
 # Remove columns with small sums
 downsample_matrix(mat, 12, remove_columns = TRUE)
 
+# sparse matrix
+mat_sparse <- Matrix::Matrix(mat, sparse = TRUE)
+downsample_matrix(mat_sparse, 2)
+
 }
diff --git a/src/DownsampleWorker.cpp b/src/DownsampleWorker.cpp
index 8ec03fe..fc1b93e 100644
--- a/src/DownsampleWorker.cpp
+++ b/src/DownsampleWorker.cpp
@@ -114,7 +114,6 @@ static void downsample_slice(const std::vector<D>& input, std::vector<O>& output
     }
 }
 
-
 DownsampleWorker::DownsampleWorker(const Rcpp::IntegerMatrix& input, Rcpp::IntegerMatrix& output, int samples, unsigned int random_seed)
     : input_matrix(input), output_matrix(output), samples(samples), random_seed(random_seed) {}
 
@@ -129,3 +128,25 @@ void DownsampleWorker::operator()(std::size_t begin, std::size_t end) {
     }
 }
 
+DownsampleWorkerSparse::DownsampleWorkerSparse(const Rcpp::IntegerVector& i, const Rcpp::IntegerVector& p, const Rcpp::IntegerVector& x, 
+                                               Rcpp::IntegerVector& out_x, int nrows, int ncols, int samples, unsigned int random_seed)
+    : input_i(i), input_p(p), input_x(x), output_x(out_x), nrows(nrows), ncols(ncols), samples(samples), random_seed(random_seed) {}
+
+void DownsampleWorkerSparse::operator()(std::size_t begin, std::size_t end) {
+    for (std::size_t col = begin; col < end; ++col) {
+        // Extract the current column from the sparse matrix
+        std::vector<int> input_vec;
+        for (int idx = input_p[col]; idx < input_p[col + 1]; ++idx) {
+            input_vec.push_back(input_x[idx]);
+        }
+
+        std::vector<int> output_vec(input_vec.size(), 0);
+
+        downsample_slice(input_vec, output_vec, samples, random_seed);
+
+        // Store results in the output sparse matrix
+        for (int idx = input_p[col], out_idx = 0; idx < input_p[col + 1]; ++idx, ++out_idx) {
+            output_x[idx] = output_vec[out_idx];
+        }
+    }
+}
diff --git a/src/DownsampleWorker.h b/src/DownsampleWorker.h
index f480568..5c6cf08 100644
--- a/src/DownsampleWorker.h
+++ b/src/DownsampleWorker.h
@@ -19,4 +19,24 @@ class DownsampleWorker : public RcppParallel::Worker {
     void operator()(std::size_t begin, std::size_t end) override;
 };
 
+class DownsampleWorkerSparse : public RcppParallel::Worker {
+private:
+    Rcpp::IntegerVector input_i;
+    Rcpp::IntegerVector input_p;
+    Rcpp::IntegerVector input_x;
+    Rcpp::IntegerVector output_x;
+    int nrows;
+    int ncols;
+    int samples;
+    unsigned int random_seed;
+
+public:
+    DownsampleWorkerSparse(const Rcpp::IntegerVector& i, const Rcpp::IntegerVector& p, const Rcpp::IntegerVector& x, 
+                           Rcpp::IntegerVector& out_x, int nrows, int ncols, int samples, unsigned int random_seed);
+
+    // Parallel operator
+    void operator()(std::size_t begin, std::size_t end) override;
+};
+
+
 #endif // DOWNSAMPLEWORKER_H
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 4b964d6..d5474ab 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -64,12 +64,26 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// rcpp_downsample_sparse
+Rcpp::S4 rcpp_downsample_sparse(Rcpp::S4 matrix, int samples, unsigned int random_seed);
+RcppExport SEXP _tglkmeans_rcpp_downsample_sparse(SEXP matrixSEXP, SEXP samplesSEXP, SEXP random_seedSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< Rcpp::S4 >::type matrix(matrixSEXP);
+    Rcpp::traits::input_parameter< int >::type samples(samplesSEXP);
+    Rcpp::traits::input_parameter< unsigned int >::type random_seed(random_seedSEXP);
+    rcpp_result_gen = Rcpp::wrap(rcpp_downsample_sparse(matrix, samples, random_seed));
+    return rcpp_result_gen;
+END_RCPP
+}
 
 static const R_CallMethodDef CallEntries[] = {
     {"_tglkmeans_reduce_coclust", (DL_FUNC) &_tglkmeans_reduce_coclust, 3},
     {"_tglkmeans_reduce_num_trials", (DL_FUNC) &_tglkmeans_reduce_num_trials, 2},
     {"_tglkmeans_TGL_kmeans_cpp", (DL_FUNC) &_tglkmeans_TGL_kmeans_cpp, 8},
     {"_tglkmeans_downsample_matrix_cpp", (DL_FUNC) &_tglkmeans_downsample_matrix_cpp, 3},
+    {"_tglkmeans_rcpp_downsample_sparse", (DL_FUNC) &_tglkmeans_rcpp_downsample_sparse, 3},
     {NULL, NULL, 0}
 };
 
diff --git a/src/downsample.cpp b/src/downsample.cpp
index 4a43833..6019819 100644
--- a/src/downsample.cpp
+++ b/src/downsample.cpp
@@ -17,3 +17,31 @@ Rcpp::IntegerMatrix downsample_matrix_cpp(Rcpp::IntegerMatrix input, int samples
 
     return output;
 }
+
+// [[Rcpp::export]]
+Rcpp::S4 rcpp_downsample_sparse(Rcpp::S4 matrix, int samples, unsigned int random_seed) {
+    // Extract components of the dgCMatrix
+    Rcpp::IntegerVector i = matrix.slot("i");
+    Rcpp::IntegerVector p = matrix.slot("p");
+    Rcpp::IntegerVector x = matrix.slot("x");
+
+    int nrows = Rcpp::as<Rcpp::IntegerVector>(matrix.slot("Dim"))[0];
+    int ncols = Rcpp::as<Rcpp::IntegerVector>(matrix.slot("Dim"))[1];
+
+    // Prepare output vector
+    Rcpp::IntegerVector out_x(x.size());
+
+    // Create and run the DownsampleWorkerSparse
+    DownsampleWorkerSparse worker(i, p, x, out_x, nrows, ncols, samples, random_seed);
+    RcppParallel::parallelFor(0, ncols, worker);
+
+    // Create a new dgCMatrix object for the output
+    Rcpp::S4 out_matrix("dgCMatrix");
+    out_matrix.slot("i") = i;
+    out_matrix.slot("p") = p;    
+    Rcpp::NumericVector out_x_double = Rcpp::as<Rcpp::NumericVector>(out_x);
+    out_matrix.slot("x") = out_x_double;
+    out_matrix.slot("Dim") = Rcpp::IntegerVector::create(nrows, ncols);
+
+    return out_matrix;
+}
\ No newline at end of file
diff --git a/tests/testthat/test-downsample.R b/tests/testthat/test-downsample.R
index 1d3d89f..1b2267c 100644
--- a/tests/testthat/test-downsample.R
+++ b/tests/testthat/test-downsample.R
@@ -25,7 +25,7 @@ test_that("downsample_matrix returns the correct number of samples when there ar
     mat <- matrix(1:12, nrow = 4)
     mat[1, 1] <- NA
     target_n <- 2
-    ds_mat <- downsample_matrix(mat, target_n)
+    expect_warning(ds_mat <- downsample_matrix(mat, target_n))
     expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n))
 
     # make sure the NAs are still there

From 1fabce245584a0273c41f2df6e7e85fe5af97aab Mon Sep 17 00:00:00 2001
From: aviezerl <aviezer.lifshitz@weizmann.ac.il>
Date: Fri, 29 Dec 2023 10:15:01 +0200
Subject: [PATCH 04/10] added tests

---
 R/downsample.R                   |  16 +++-
 man/downsample_matrix.Rd         |   6 +-
 tests/testthat/test-downsample.R | 149 +++++++++++++++++++++++++++++++
 3 files changed, 167 insertions(+), 4 deletions(-)

diff --git a/R/downsample.R b/R/downsample.R
index 9e6d72d..ca93e78 100644
--- a/R/downsample.R
+++ b/R/downsample.R
@@ -4,8 +4,10 @@
 #' It uses a random seed for reproducibility and allows for removing columns with
 #' small sums.
 #'
-#' @param mat The input matrix to be downsampled. Can be a matrix or sparse matrix (dgCMatrix). If the matrix contains NAs, the function will run significantly slower.
-#' @param target_n The target number of samples to downsample to
+#' @param mat An integer matrix to be downsampled. Can be a matrix or sparse matrix (dgCMatrix).
+#' If the matrix contains NAs, the function will run significantly slower. Values that are
+#' not integers will be coerced to integers using {.code floor()}.
+#' @param target_n The target number of samples to downsample to.
 #' @param seed The random seed for reproducibility (default is NULL)
 #' @param remove_columns Logical indicating whether to remove columns with small sums (default is FALSE)
 #'
@@ -26,6 +28,8 @@
 downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE) {
     if (is.null(seed)) {
         seed <- sample(1:10000, 1)
+    } else if (!is.numeric(seed) || seed <= 0 || seed != as.integer(seed)) {
+        cli_abort("{.field seed} must be a positive integer.")
     }
 
     # replace NAs with 0s for the cpp code
@@ -37,6 +41,14 @@ downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE
         mat[is.na(mat)] <- 0
     }
 
+    if (!is.logical(remove_columns)) {
+        cli_abort("{.field remove_columns} must be a logical value.")
+    }
+
+    if (target_n <= 0 || target_n != as.integer(target_n)) {
+        cli_abort("{.field target_n} must be a positive integer.")
+    }
+
 
     if (methods::is(mat, "dgCMatrix")) {
         ds_mat <- rcpp_downsample_sparse(mat, target_n, seed)
diff --git a/man/downsample_matrix.Rd b/man/downsample_matrix.Rd
index 3aa07cf..dfdb81d 100644
--- a/man/downsample_matrix.Rd
+++ b/man/downsample_matrix.Rd
@@ -7,9 +7,11 @@
 downsample_matrix(mat, target_n, seed = NULL, remove_columns = FALSE)
 }
 \arguments{
-\item{mat}{The input matrix to be downsampled. Can be a matrix or sparse matrix (dgCMatrix). If the matrix contains NAs, the function will run significantly slower.}
+\item{mat}{An integer matrix to be downsampled. Can be a matrix or sparse matrix (dgCMatrix).
+If the matrix contains NAs, the function will run significantly slower. Values that are
+not integers will be coerced to integers using {.code floor()}.}
 
-\item{target_n}{The target number of samples to downsample to}
+\item{target_n}{The target number of samples to downsample to.}
 
 \item{seed}{The random seed for reproducibility (default is NULL)}
 
diff --git a/tests/testthat/test-downsample.R b/tests/testthat/test-downsample.R
index 1b2267c..631e990 100644
--- a/tests/testthat/test-downsample.R
+++ b/tests/testthat/test-downsample.R
@@ -3,6 +3,7 @@ test_that("downsample_matrix returns the correct number of samples", {
     target_n <- 2
     ds_mat <- downsample_matrix(mat, target_n)
     expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n))
+    expect_true(all(mat >= ds_mat))
 })
 
 test_that("downsample_matrix removes columns with small sums when remove_columns is TRUE", {
@@ -31,3 +32,151 @@ test_that("downsample_matrix returns the correct number of samples when there ar
     # make sure the NAs are still there
     expect_true(all(is.na(ds_mat[1, 1])))
 })
+
+test_that("downsample_matrix returns the correct number of samples when there are NAs and remove_columns is TRUE", {
+    mat <- matrix(1:12, nrow = 4)
+    mat[1, 1] <- NA
+    target_n <- 2
+    expect_warning(ds_mat <- downsample_matrix(mat, target_n, remove_columns = TRUE))
+    expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n))
+
+    # make sure the NAs are still there
+    expect_true(all(is.na(ds_mat[1, 1])))
+})
+
+test_that("downsample_matrix returns the correct number of samples when there are NAs and remove_columns is FALSE", {
+    mat <- matrix(1:12, nrow = 4)
+    mat[1, 1] <- NA
+    target_n <- 2
+    expect_warning(ds_mat <- downsample_matrix(mat, target_n, remove_columns = FALSE))
+    expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n))
+    expect_true(all(mat[!is.na(mat)] >= ds_mat[!is.na(ds_mat)]))
+
+    # make sure the NAs are still there
+    expect_true(all(is.na(ds_mat[1, 1])))
+})
+
+test_that("downsample_matrix returns the correct number of samples when there are NAs and remove_columns is TRUE and the matrix is sparse", {
+    mat <- Matrix::Matrix(matrix(1:12, nrow = 4), sparse = TRUE)
+    mat[1, 1] <- NA
+    target_n <- 2
+    expect_warning(ds_mat <- downsample_matrix(mat, target_n, remove_columns = TRUE))
+    expect_true(all(Matrix::colSums(ds_mat, na.rm = TRUE) == target_n))
+
+    # make sure the NAs are still there
+    expect_true(all(is.na(ds_mat[1, 1])))
+})
+
+test_that("downsample_matrix works with all zeros matrix", {
+    mat <- matrix(0, nrow = 4)
+    target_n <- 2
+    expect_warning(ds_mat <- downsample_matrix(mat, target_n))
+    expect_true(all(mat == ds_mat))
+
+    ds_mat <- downsample_matrix(mat, target_n, remove_columns = TRUE)
+    expect_equal(ncol(ds_mat), 0)
+})
+
+# Test with Different Matrix Sizes
+test_that("downsample_matrix works with larger matrix", {
+    mat <- matrix(1:1e3, nrow = 10)
+    target_n <- 5
+    ds_mat <- downsample_matrix(mat, target_n)
+    expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n))
+})
+
+test_that("downsample_matrix works with single-column matrix", {
+    mat <- matrix(1:10, nrow = 10)
+    target_n <- 5
+    ds_mat <- downsample_matrix(mat, target_n)
+    expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n))
+})
+
+test_that("downsample_matrix works with single-row matrix", {
+    mat <- matrix(1:10, nrow = 1)
+    target_n <- 5
+    ds_mat <- downsample_matrix(mat, target_n, remove_columns = TRUE)
+    expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n))
+    expect_equal(ncol(ds_mat), 6)
+})
+
+# Test with Different target_n Values
+test_that("downsample_matrix with target_n equal to number of rows", {
+    mat <- matrix(1:12, nrow = 4)
+    target_n <- 4
+    ds_mat <- downsample_matrix(mat, target_n)
+    expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n))
+})
+
+test_that("downsample_matrix with target_n greater than number of rows", {
+    mat <- matrix(1:12, nrow = 4)
+    target_n <- 6
+    ds_mat <- downsample_matrix(mat, target_n)
+    expect_true(all(colSums(ds_mat, na.rm = TRUE) <= target_n))
+})
+
+test_that("downsample_matrix with target_n equal to 1", {
+    mat <- matrix(1:12, nrow = 4)
+    target_n <- 1
+    ds_mat <- downsample_matrix(mat, target_n)
+    expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n))
+})
+
+test_that("downsample_matrix with target_n equal to 0", {
+    mat <- matrix(1:12, nrow = 4)
+    target_n <- 0
+    expect_error(downsample_matrix(mat, target_n))
+})
+
+# Test with Different Matrix Types
+test_that("downsample_matrix with sparse matrix", {
+    mat <- Matrix::Matrix(matrix(1:12, nrow = 4), sparse = TRUE)
+    target_n <- 2
+    ds_mat <- downsample_matrix(mat, target_n)
+    expect_true(all(Matrix::colSums(ds_mat, na.rm = TRUE) == target_n))
+    expect_true(all(mat >= ds_mat))
+})
+
+test_that("downsample_matrix with invalid matrix type", {
+    mat <- 1:12
+    target_n <- 2
+    expect_error(downsample_matrix(mat, target_n))
+
+    mat <- Matrix::Matrix(matrix(1:12, nrow = 4), sparse = FALSE)
+    target_n <- 2
+    expect_error(downsample_matrix(mat, target_n))
+})
+
+# Test with non-integer values
+test_that("downsample_matrix with non-integer values", {
+    mat <- matrix(1:12, nrow = 4)
+    mat[1, 1] <- 1.5
+    target_n <- 2
+    ds_mat <- downsample_matrix(mat, target_n)
+    expect_true(all(colSums(ds_mat, na.rm = TRUE) == target_n))
+    expect_true(all(mat >= ds_mat))
+})
+
+# Test with Various Seed Values
+test_that("downsample_matrix with fixed seed", {
+    mat <- matrix(1:12, nrow = 4)
+    target_n <- 2
+    seed <- 123
+    ds_mat1 <- downsample_matrix(mat, target_n, seed = seed)
+    ds_mat2 <- downsample_matrix(mat, target_n, seed = seed)
+    expect_equal(ds_mat1, ds_mat2)
+})
+
+test_that("downsample_matrix without specifying seed", {
+    mat <- matrix(1:12, nrow = 4)
+    target_n <- 2
+    ds_mat <- downsample_matrix(mat, target_n)
+    expect_true(is.matrix(ds_mat))
+})
+
+test_that("downsample_matrix with invalid seed", {
+    mat <- matrix(1:12, nrow = 4)
+    target_n <- 2
+    expect_error(downsample_matrix(mat, target_n, seed = -1))
+    expect_error(downsample_matrix(mat, target_n, seed = "not a number"))
+})

From 981e3ec0399953158b01d1f7117b7f74a04a9d2c Mon Sep 17 00:00:00 2001
From: aviezerl <aviezer.lifshitz@weizmann.ac.il>
Date: Fri, 29 Dec 2023 10:35:32 +0200
Subject: [PATCH 05/10] show the user the random seed

---
 R/downsample.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/downsample.R b/R/downsample.R
index ca93e78..f0537db 100644
--- a/R/downsample.R
+++ b/R/downsample.R
@@ -28,6 +28,7 @@
 downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE) {
     if (is.null(seed)) {
         seed <- sample(1:10000, 1)
+        cli::cli_alert_warning("No seed provided. Using {.val {seed}}.")
     } else if (!is.numeric(seed) || seed <= 0 || seed != as.integer(seed)) {
         cli_abort("{.field seed} must be a positive integer.")
     }

From 63064f229299e943c502a6af12b74698543fcdc6 Mon Sep 17 00:00:00 2001
From: aviezerl <aviezer.lifshitz@weizmann.ac.il>
Date: Fri, 29 Dec 2023 10:58:14 +0200
Subject: [PATCH 06/10] added target_q parameter

---
 R/downsample.R                   | 52 +++++++++++++++++++++++---------
 man/downsample_matrix.Rd         | 13 +++++++-
 tests/testthat/test-downsample.R | 22 ++++++++++++++
 3 files changed, 71 insertions(+), 16 deletions(-)

diff --git a/R/downsample.R b/R/downsample.R
index f0537db..292f277 100644
--- a/R/downsample.R
+++ b/R/downsample.R
@@ -8,6 +8,7 @@
 #' If the matrix contains NAs, the function will run significantly slower. Values that are
 #' not integers will be coerced to integers using {.code floor()}.
 #' @param target_n The target number of samples to downsample to.
+#' @param target_q A target quantile of sums to downsample to. Only one of {.field target_n} or {.field target_q} can be provided.
 #' @param seed The random seed for reproducibility (default is NULL)
 #' @param remove_columns Logical indicating whether to remove columns with small sums (default is FALSE)
 #'
@@ -24,8 +25,23 @@
 #' mat_sparse <- Matrix::Matrix(mat, sparse = TRUE)
 #' downsample_matrix(mat_sparse, 2)
 #'
+#' # with a quantile
+#' downsample_matrix(mat, target_q = 0.5)
+#'
 #' @export
-downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE) {
+downsample_matrix <- function(mat, target_n = NULL, target_q = NULL, seed = NULL, remove_columns = FALSE) {
+    if (is.null(target_n) && is.null(target_q)) {
+        cli_abort("Either {.field target_n} or {.field target_q} must be provided.")
+    } else if (!is.null(target_n) && !is.null(target_q)) {
+        cli_abort("Only one of {.field target_n} or {.field target_q} can be provided.")
+    }
+
+    sums <- colsums_matrix(mat)
+    if (!is.null(target_q)) {
+        target_n <- round(stats::quantile(sums, target_q))
+        cli::cli_alert_info("Using {.val {target_n}} as the target number.")
+    }
+
     if (is.null(seed)) {
         seed <- sample(1:10000, 1)
         cli::cli_alert_warning("No seed provided. Using {.val {seed}}.")
@@ -33,15 +49,6 @@ downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE
         cli_abort("{.field seed} must be a positive integer.")
     }
 
-    # replace NAs with 0s for the cpp code
-    has_nas <- FALSE
-    if (any(is.na(mat))) {
-        has_nas <- TRUE
-        cli_warn("Input matrix contains NAs. Processing would be significantly slower.")
-        orig_mat <- mat
-        mat[is.na(mat)] <- 0
-    }
-
     if (!is.logical(remove_columns)) {
         cli_abort("{.field remove_columns} must be a logical value.")
     }
@@ -50,15 +57,19 @@ downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE
         cli_abort("{.field target_n} must be a positive integer.")
     }
 
+    # replace NAs with 0s for the cpp code
+    has_nas <- FALSE
+    if (any(is.na(mat))) {
+        has_nas <- TRUE
+        cli_warn("Input matrix contains NAs. Processing would be significantly slower.")
+        orig_mat <- mat
+        mat[is.na(mat)] <- 0
+    }
 
     if (methods::is(mat, "dgCMatrix")) {
         ds_mat <- rcpp_downsample_sparse(mat, target_n, seed)
-        sums <- Matrix::colSums(ds_mat, na.rm = TRUE)
     } else if (is.matrix(mat)) {
         ds_mat <- downsample_matrix_cpp(mat, target_n, seed)
-        sums <- colSums(ds_mat, na.rm = TRUE)
-    } else {
-        cli_abort("Input must be a matrix or a sparse matrix (dgCMatrix). class of {.field mat} is {.val {class(mat)}}.")
     }
 
     small_cols <- sums < target_n
@@ -68,8 +79,9 @@ downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE
             if (has_nas) {
                 orig_mat <- orig_mat[, !small_cols, drop = FALSE]
             }
+            cli_alert_info("Removed {.val {sum(small_cols)}} columns with a sum smaller than {.val {target_n}}.")
         } else {
-            cli_warn("Some columns ({which(small_cols)}) have a sum<{.val {target_n}}. These columns were not changed. To remove them, set {.field remove_columns=TRUE}.")
+            cli_warn("{.val {sum(small_cols)}} columns have a sum smaller than {.val {target_n}}. These columns were not changed. To remove them, set {.field remove_columns=TRUE}.")
         }
     }
 
@@ -81,3 +93,13 @@ downsample_matrix <- function(mat, target_n, seed = NULL, remove_columns = FALSE
 
     return(ds_mat)
 }
+
+colsums_matrix <- function(mat) {
+    if (methods::is(mat, "dgCMatrix")) {
+        return(Matrix::colSums(mat, na.rm = TRUE))
+    } else if (is.matrix(mat)) {
+        return(colSums(mat, na.rm = TRUE))
+    } else {
+        cli_abort("Input must be a matrix or a sparse matrix (dgCMatrix). class of {.field mat} is {.val {class(mat)}}.")
+    }
+}
diff --git a/man/downsample_matrix.Rd b/man/downsample_matrix.Rd
index dfdb81d..f590673 100644
--- a/man/downsample_matrix.Rd
+++ b/man/downsample_matrix.Rd
@@ -4,7 +4,13 @@
 \alias{downsample_matrix}
 \title{Downsample a matrix to a target number of in each column}
 \usage{
-downsample_matrix(mat, target_n, seed = NULL, remove_columns = FALSE)
+downsample_matrix(
+  mat,
+  target_n = NULL,
+  target_q = NULL,
+  seed = NULL,
+  remove_columns = FALSE
+)
 }
 \arguments{
 \item{mat}{An integer matrix to be downsampled. Can be a matrix or sparse matrix (dgCMatrix).
@@ -13,6 +19,8 @@ not integers will be coerced to integers using {.code floor()}.}
 
 \item{target_n}{The target number of samples to downsample to.}
 
+\item{target_q}{A target quantile of sums to downsample to. Only one of {.field target_n} or {.field target_q} can be provided.}
+
 \item{seed}{The random seed for reproducibility (default is NULL)}
 
 \item{remove_columns}{Logical indicating whether to remove columns with small sums (default is FALSE)}
@@ -36,4 +44,7 @@ downsample_matrix(mat, 12, remove_columns = TRUE)
 mat_sparse <- Matrix::Matrix(mat, sparse = TRUE)
 downsample_matrix(mat_sparse, 2)
 
+# with a quantile
+downsample_matrix(mat, target_q = 0.5)
+
 }
diff --git a/tests/testthat/test-downsample.R b/tests/testthat/test-downsample.R
index 631e990..8a04d8b 100644
--- a/tests/testthat/test-downsample.R
+++ b/tests/testthat/test-downsample.R
@@ -180,3 +180,25 @@ test_that("downsample_matrix with invalid seed", {
     expect_error(downsample_matrix(mat, target_n, seed = -1))
     expect_error(downsample_matrix(mat, target_n, seed = "not a number"))
 })
+
+test_that("downsample_matrix correctly downsamples matrix using target_q", {
+    mat <- matrix(1:12, nrow = 4)
+    target_q <- 0.5
+    ds_mat <- downsample_matrix(mat, target_q = target_q, remove_columns = TRUE)
+    expect_true(all(colSums(ds_mat, na.rm = TRUE) == round(stats::quantile(colSums(mat), target_q))))
+
+    expect_warning(ds_mat <- downsample_matrix(mat, target_q = target_q, remove_columns = FALSE))
+    expect_true(all(mat >= ds_mat))
+})
+
+test_that("Cannot provide both target_n and target_q", {
+    mat <- matrix(1:12, nrow = 4)
+    target_n <- 2
+    target_q <- 0.5
+    expect_error(downsample_matrix(mat, target_n = target_n, target_q = target_q))
+})
+
+test_that("Fail when no target_n or target_q is provided", {
+    mat <- matrix(1:12, nrow = 4)
+    expect_error(downsample_matrix(mat))
+})

From ed2002e7bd4763eb31a0b6016edba34b5b9f447e Mon Sep 17 00:00:00 2001
From: aviezerl <aviezer.lifshitz@weizmann.ac.il>
Date: Fri, 29 Dec 2023 11:06:49 +0200
Subject: [PATCH 07/10] bumped version to 0.5.0

---
 DESCRIPTION              | 2 +-
 NEWS.md                  | 4 ++++
 R/downsample.R           | 6 +++++-
 man/downsample_matrix.Rd | 6 +++++-
 4 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 2d6cff1..9e2498f 100755
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: tglkmeans
 Title: Efficient Implementation of K-Means++ Algorithm
-Version: 0.4.0
+Version: 0.5.0
 Authors@R: c(
     person("Aviezer", "Lifshitz", , "aviezer.lifshitz@weizmann.ac.il", role = c("aut", "cre")),
     person("Amos", "Tanay", role = "aut"),
diff --git a/NEWS.md b/NEWS.md
index 7844b04..938561e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,7 @@
+# tglkmeans 0.5.0
+
+* Added `dowsample_matrix` function to downsample the columns of a count matrix to a target number. 
+
 # tglkmeans 0.4.0
 
 * Default of `id_column` parameter was changed to `FALSE`. Note that this is a breaking change, and if you want to use an id column, you need to set it explicitly to `TRUE`.
diff --git a/R/downsample.R b/R/downsample.R
index 292f277..d6d3f4b 100644
--- a/R/downsample.R
+++ b/R/downsample.R
@@ -1,4 +1,4 @@
-#' Downsample a matrix to a target number of in each column
+#' Downsample the columns of a matrix to a target number
 #'
 #' @description This function takes a matrix and downsamples it to a target number of samples.
 #' It uses a random seed for reproducibility and allows for removing columns with
@@ -15,6 +15,10 @@
 #' @return The downsampled matrix
 #'
 #' @examples
+#' \dontshow{
+#' tglkmeans.set_parallel(1)
+#' }
+#'
 #' mat <- matrix(1:12, nrow = 4)
 #' downsample_matrix(mat, 2)
 #'
diff --git a/man/downsample_matrix.Rd b/man/downsample_matrix.Rd
index f590673..db41148 100644
--- a/man/downsample_matrix.Rd
+++ b/man/downsample_matrix.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/downsample.R
 \name{downsample_matrix}
 \alias{downsample_matrix}
-\title{Downsample a matrix to a target number of in each column}
+\title{Downsample the columns of a matrix to a target number}
 \usage{
 downsample_matrix(
   mat,
@@ -34,6 +34,10 @@ It uses a random seed for reproducibility and allows for removing columns with
 small sums.
 }
 \examples{
+\dontshow{
+tglkmeans.set_parallel(1)
+}
+
 mat <- matrix(1:12, nrow = 4)
 downsample_matrix(mat, 2)
 

From 38f5298b00c75f8f273c6503ec7d4315ad30cf86 Mon Sep 17 00:00:00 2001
From: aviezerl <aviezer.lifshitz@weizmann.ac.il>
Date: Fri, 29 Dec 2023 11:11:31 +0200
Subject: [PATCH 08/10] removed unused private member

---
 src/DownsampleWorker.cpp | 4 ++--
 src/DownsampleWorker.h   | 3 +--
 src/downsample.cpp       | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/DownsampleWorker.cpp b/src/DownsampleWorker.cpp
index fc1b93e..4f047f7 100644
--- a/src/DownsampleWorker.cpp
+++ b/src/DownsampleWorker.cpp
@@ -129,8 +129,8 @@ void DownsampleWorker::operator()(std::size_t begin, std::size_t end) {
 }
 
 DownsampleWorkerSparse::DownsampleWorkerSparse(const Rcpp::IntegerVector& i, const Rcpp::IntegerVector& p, const Rcpp::IntegerVector& x, 
-                                               Rcpp::IntegerVector& out_x, int nrows, int ncols, int samples, unsigned int random_seed)
-    : input_i(i), input_p(p), input_x(x), output_x(out_x), nrows(nrows), ncols(ncols), samples(samples), random_seed(random_seed) {}
+                                               Rcpp::IntegerVector& out_x, int ncols, int samples, unsigned int random_seed)
+    : input_i(i), input_p(p), input_x(x), output_x(out_x), ncols(ncols), samples(samples), random_seed(random_seed) {}
 
 void DownsampleWorkerSparse::operator()(std::size_t begin, std::size_t end) {
     for (std::size_t col = begin; col < end; ++col) {
diff --git a/src/DownsampleWorker.h b/src/DownsampleWorker.h
index 5c6cf08..b7936b4 100644
--- a/src/DownsampleWorker.h
+++ b/src/DownsampleWorker.h
@@ -25,14 +25,13 @@ class DownsampleWorkerSparse : public RcppParallel::Worker {
     Rcpp::IntegerVector input_p;
     Rcpp::IntegerVector input_x;
     Rcpp::IntegerVector output_x;
-    int nrows;
     int ncols;
     int samples;
     unsigned int random_seed;
 
 public:
     DownsampleWorkerSparse(const Rcpp::IntegerVector& i, const Rcpp::IntegerVector& p, const Rcpp::IntegerVector& x, 
-                           Rcpp::IntegerVector& out_x, int nrows, int ncols, int samples, unsigned int random_seed);
+                           Rcpp::IntegerVector& out_x, int ncols, int samples, unsigned int random_seed);
 
     // Parallel operator
     void operator()(std::size_t begin, std::size_t end) override;
diff --git a/src/downsample.cpp b/src/downsample.cpp
index 6019819..f4c2484 100644
--- a/src/downsample.cpp
+++ b/src/downsample.cpp
@@ -32,7 +32,7 @@ Rcpp::S4 rcpp_downsample_sparse(Rcpp::S4 matrix, int samples, unsigned int rando
     Rcpp::IntegerVector out_x(x.size());
 
     // Create and run the DownsampleWorkerSparse
-    DownsampleWorkerSparse worker(i, p, x, out_x, nrows, ncols, samples, random_seed);
+    DownsampleWorkerSparse worker(i, p, x, out_x, ncols, samples, random_seed);
     RcppParallel::parallelFor(0, ncols, worker);
 
     // Create a new dgCMatrix object for the output

From f3f1e2c8a1a8159608dbc6fbb555164e53b50786 Mon Sep 17 00:00:00 2001
From: aviezerl <aviezer.lifshitz@weizmann.ac.il>
Date: Fri, 29 Dec 2023 11:13:54 +0200
Subject: [PATCH 09/10] removed another unused variable

---
 src/DownsampleWorker.cpp | 4 ++--
 src/DownsampleWorker.h   | 3 +--
 src/downsample.cpp       | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/DownsampleWorker.cpp b/src/DownsampleWorker.cpp
index 4f047f7..b4e6507 100644
--- a/src/DownsampleWorker.cpp
+++ b/src/DownsampleWorker.cpp
@@ -129,8 +129,8 @@ void DownsampleWorker::operator()(std::size_t begin, std::size_t end) {
 }
 
 DownsampleWorkerSparse::DownsampleWorkerSparse(const Rcpp::IntegerVector& i, const Rcpp::IntegerVector& p, const Rcpp::IntegerVector& x, 
-                                               Rcpp::IntegerVector& out_x, int ncols, int samples, unsigned int random_seed)
-    : input_i(i), input_p(p), input_x(x), output_x(out_x), ncols(ncols), samples(samples), random_seed(random_seed) {}
+                                               Rcpp::IntegerVector& out_x, int samples, unsigned int random_seed)
+    : input_i(i), input_p(p), input_x(x), output_x(out_x), samples(samples), random_seed(random_seed) {}
 
 void DownsampleWorkerSparse::operator()(std::size_t begin, std::size_t end) {
     for (std::size_t col = begin; col < end; ++col) {
diff --git a/src/DownsampleWorker.h b/src/DownsampleWorker.h
index b7936b4..53c944c 100644
--- a/src/DownsampleWorker.h
+++ b/src/DownsampleWorker.h
@@ -25,13 +25,12 @@ class DownsampleWorkerSparse : public RcppParallel::Worker {
     Rcpp::IntegerVector input_p;
     Rcpp::IntegerVector input_x;
     Rcpp::IntegerVector output_x;
-    int ncols;
     int samples;
     unsigned int random_seed;
 
 public:
     DownsampleWorkerSparse(const Rcpp::IntegerVector& i, const Rcpp::IntegerVector& p, const Rcpp::IntegerVector& x, 
-                           Rcpp::IntegerVector& out_x, int ncols, int samples, unsigned int random_seed);
+                           Rcpp::IntegerVector& out_x, int samples, unsigned int random_seed);
 
     // Parallel operator
     void operator()(std::size_t begin, std::size_t end) override;
diff --git a/src/downsample.cpp b/src/downsample.cpp
index f4c2484..b2da02f 100644
--- a/src/downsample.cpp
+++ b/src/downsample.cpp
@@ -32,7 +32,7 @@ Rcpp::S4 rcpp_downsample_sparse(Rcpp::S4 matrix, int samples, unsigned int rando
     Rcpp::IntegerVector out_x(x.size());
 
     // Create and run the DownsampleWorkerSparse
-    DownsampleWorkerSparse worker(i, p, x, out_x, ncols, samples, random_seed);
+    DownsampleWorkerSparse worker(i, p, x, out_x, samples, random_seed);
     RcppParallel::parallelFor(0, ncols, worker);
 
     // Create a new dgCMatrix object for the output

From c27516d143953a2253f962d0a077e4d64b7fefa9 Mon Sep 17 00:00:00 2001
From: aviezerl <aviezer.lifshitz@weizmann.ac.il>
Date: Fri, 29 Dec 2023 11:15:16 +0200
Subject: [PATCH 10/10] updated _pkgdown

---
 _pkgdown.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/_pkgdown.yml b/_pkgdown.yml
index 30c405b..f55ec4c 100755
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -27,6 +27,10 @@ reference:
 - contents:
   - TGL_kmeans
   - TGL_kmeans_tidy
+- title: Matrix
+  desc: matrix utility functions
+- contents: 
+  - downsample_matrix
 - title: misc
   desc: utility functions
 - contents: