Skip to content

Commit

Permalink
Merge pull request #15 from tanaylab/feat@downsample-matrix
Browse files Browse the repository at this point in the history
Feat@downsample matrix
  • Loading branch information
aviezerl committed Dec 30, 2023
2 parents bad1652 + c27516d commit c65b008
Show file tree
Hide file tree
Showing 14 changed files with 656 additions and 3 deletions.
4 changes: 3 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: tglkmeans
Title: Efficient Implementation of K-Means++ Algorithm
Version: 0.4.0
Version: 0.5.0
Authors@R: c(
person("Aviezer", "Lifshitz", , "[email protected]", role = c("aut", "cre")),
person("Amos", "Tanay", role = "aut"),
Expand Down Expand Up @@ -30,6 +30,8 @@ Imports:
future,
ggplot2 (>= 2.2.0),
magrittr,
Matrix,
methods,
parallel (>= 3.3.2),
plyr (>= 1.8.4),
purrr (>= 0.2.0),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
export("%>%")
export(TGL_kmeans)
export(TGL_kmeans_tidy)
export(downsample_matrix)
export(simulate_data)
export(tglkmeans.set_parallel)
import(dplyr)
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# tglkmeans 0.5.0

* Added `dowsample_matrix` function to downsample the columns of a count matrix to a target number.

# tglkmeans 0.4.0

* Default of `id_column` parameter was changed to `FALSE`. Note that this is a breaking change, and if you want to use an id column, you need to set it explicitly to `TRUE`.
Expand Down
8 changes: 8 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,11 @@ TGL_kmeans_cpp <- function(ids, mat, k, metric, max_iter = 40, min_delta = 0.000
.Call('_tglkmeans_TGL_kmeans_cpp', PACKAGE = 'tglkmeans', ids, mat, k, metric, max_iter, min_delta, use_cpp_random, seed)
}

downsample_matrix_cpp <- function(input, samples, random_seed) {
.Call('_tglkmeans_downsample_matrix_cpp', PACKAGE = 'tglkmeans', input, samples, random_seed)
}

rcpp_downsample_sparse <- function(matrix, samples, random_seed) {
.Call('_tglkmeans_rcpp_downsample_sparse', PACKAGE = 'tglkmeans', matrix, samples, random_seed)
}

109 changes: 109 additions & 0 deletions R/downsample.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#' Downsample the columns of a matrix to a target number
#'
#' @description This function takes a matrix and downsamples it to a target number of samples.
#' It uses a random seed for reproducibility and allows for removing columns with
#' small sums.
#'
#' @param mat An integer matrix to be downsampled. Can be a matrix or sparse matrix (dgCMatrix).
#' If the matrix contains NAs, the function will run significantly slower. Values that are
#' not integers will be coerced to integers using {.code floor()}.
#' @param target_n The target number of samples to downsample to.
#' @param target_q A target quantile of sums to downsample to. Only one of {.field target_n} or {.field target_q} can be provided.
#' @param seed The random seed for reproducibility (default is NULL)
#' @param remove_columns Logical indicating whether to remove columns with small sums (default is FALSE)
#'
#' @return The downsampled matrix
#'
#' @examples
#' \dontshow{
#' tglkmeans.set_parallel(1)
#' }
#'
#' mat <- matrix(1:12, nrow = 4)
#' downsample_matrix(mat, 2)
#'
#' # Remove columns with small sums
#' downsample_matrix(mat, 12, remove_columns = TRUE)
#'
#' # sparse matrix
#' mat_sparse <- Matrix::Matrix(mat, sparse = TRUE)
#' downsample_matrix(mat_sparse, 2)
#'
#' # with a quantile
#' downsample_matrix(mat, target_q = 0.5)
#'
#' @export
downsample_matrix <- function(mat, target_n = NULL, target_q = NULL, seed = NULL, remove_columns = FALSE) {
if (is.null(target_n) && is.null(target_q)) {
cli_abort("Either {.field target_n} or {.field target_q} must be provided.")
} else if (!is.null(target_n) && !is.null(target_q)) {
cli_abort("Only one of {.field target_n} or {.field target_q} can be provided.")
}

sums <- colsums_matrix(mat)
if (!is.null(target_q)) {
target_n <- round(stats::quantile(sums, target_q))
cli::cli_alert_info("Using {.val {target_n}} as the target number.")
}

if (is.null(seed)) {
seed <- sample(1:10000, 1)
cli::cli_alert_warning("No seed provided. Using {.val {seed}}.")
} else if (!is.numeric(seed) || seed <= 0 || seed != as.integer(seed)) {
cli_abort("{.field seed} must be a positive integer.")
}

if (!is.logical(remove_columns)) {
cli_abort("{.field remove_columns} must be a logical value.")
}

if (target_n <= 0 || target_n != as.integer(target_n)) {
cli_abort("{.field target_n} must be a positive integer.")
}

# replace NAs with 0s for the cpp code
has_nas <- FALSE
if (any(is.na(mat))) {
has_nas <- TRUE
cli_warn("Input matrix contains NAs. Processing would be significantly slower.")
orig_mat <- mat
mat[is.na(mat)] <- 0
}

if (methods::is(mat, "dgCMatrix")) {
ds_mat <- rcpp_downsample_sparse(mat, target_n, seed)
} else if (is.matrix(mat)) {
ds_mat <- downsample_matrix_cpp(mat, target_n, seed)
}

small_cols <- sums < target_n
if (any(small_cols)) {
if (remove_columns) {
ds_mat <- ds_mat[, !small_cols, drop = FALSE]
if (has_nas) {
orig_mat <- orig_mat[, !small_cols, drop = FALSE]
}
cli_alert_info("Removed {.val {sum(small_cols)}} columns with a sum smaller than {.val {target_n}}.")
} else {
cli_warn("{.val {sum(small_cols)}} columns have a sum smaller than {.val {target_n}}. These columns were not changed. To remove them, set {.field remove_columns=TRUE}.")
}
}

if (has_nas) {
# put back the NAs
ds_mat[is.na(orig_mat)] <- NA
}


return(ds_mat)
}

colsums_matrix <- function(mat) {
if (methods::is(mat, "dgCMatrix")) {
return(Matrix::colSums(mat, na.rm = TRUE))
} else if (is.matrix(mat)) {
return(colSums(mat, na.rm = TRUE))
} else {
cli_abort("Input must be a matrix or a sparse matrix (dgCMatrix). class of {.field mat} is {.val {class(mat)}}.")
}
}
4 changes: 4 additions & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ reference:
- contents:
- TGL_kmeans
- TGL_kmeans_tidy
- title: Matrix
desc: matrix utility functions
- contents:
- downsample_matrix
- title: misc
desc: utility functions
- contents:
Expand Down
2 changes: 1 addition & 1 deletion man/TGL_kmeans.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/TGL_kmeans_tidy.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

54 changes: 54 additions & 0 deletions man/downsample_matrix.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

152 changes: 152 additions & 0 deletions src/DownsampleWorker.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#include <vector>
#include <algorithm>
#include <cassert>
#include <random>
#include <Rcpp.h>
#include <RcppParallel.h>
#include "DownsampleWorker.h"

typedef float float32_t;
typedef double float64_t;
typedef unsigned char uint8_t;
typedef unsigned int uint_t;

static size_t
ceil_power_of_two(const size_t size) {
return size_t(1) << size_t(ceil(log2(float64_t(size))));
}

template<typename D>
static void initialize_tree(const std::vector<D>& input, std::vector<size_t>& tree) {
assert(input.size() >= 2);

size_t input_size = ceil_power_of_two(input.size());
tree.resize(2 * input_size - 1); // Resize the tree to the necessary size

// Copy input to the beginning of the tree and fill the rest with zeros
std::copy(input.begin(), input.end(), tree.begin());
std::fill(tree.begin() + input.size(), tree.begin() + input_size, 0);

// Iteratively build the tree
size_t tree_index = 0;
while (input_size > 1) {
size_t half_size = input_size / 2;
for (size_t index = 0; index < half_size; ++index) {
const auto left = tree[tree_index + index * 2];
const auto right = tree[tree_index + index * 2 + 1];
tree[tree_index + input_size + index] = left + right;

assert(left >= 0);
assert(right >= 0);
assert(tree[tree_index + input_size + index] == size_t(left) + size_t(right));
}
tree_index += input_size;
input_size = half_size;
}
assert(tree.size() == 2 * input.size() - 1);
}

static size_t random_sample(std::vector<size_t>& tree, ssize_t random) {
size_t size_of_level = 1;
ssize_t base_of_level = tree.size() - 1;
size_t index_in_level = 0;
size_t index_in_tree = base_of_level + index_in_level;

while (true) {
assert(index_in_tree == base_of_level + index_in_level);
assert(tree[index_in_tree] > random);

--tree[index_in_tree];
size_of_level *= 2;
base_of_level -= size_of_level;

if (base_of_level < 0) {
return index_in_level;
}

index_in_level *= 2;
index_in_tree = base_of_level + index_in_level;
ssize_t right_random = random - ssize_t(tree[index_in_tree]);

assert(tree[base_of_level + index_in_level] + tree[base_of_level + index_in_level + 1] ==
tree[base_of_level + size_of_level + index_in_level / 2] + 1);

if (right_random >= 0) {
++index_in_level;
++index_in_tree;
assert(index_in_level < size_of_level);
random = right_random;
}
}
}

template<typename D, typename O>
static void downsample_slice(const std::vector<D>& input, std::vector<O>& output, const int32_t samples, const size_t random_seed) {
assert(output.size() == input.size());

if (samples < 0 || input.size() == 0) {
return;
}

if (input.size() == 1) {
output[0] = O(static_cast<double>(samples) < static_cast<double>(input[0]) ? samples : input[0]);
return;
}

size_t input_size = ceil_power_of_two(input.size());
std::vector<size_t> tree(2 * input_size - 1);
initialize_tree(input, tree);
size_t& total = tree[tree.size() - 1];

if (total <= static_cast<size_t>(samples)) {
std::copy(input.begin(), input.end(), output.begin());
return;
}

std::fill(output.begin(), output.end(), O(0));

std::minstd_rand random(random_seed);
for (size_t index = 0; index < static_cast<size_t>(samples); ++index) {
size_t sampled_index = random_sample(tree, random() % total);
if (sampled_index < output.size()) {
++output[sampled_index];
}
}
}

DownsampleWorker::DownsampleWorker(const Rcpp::IntegerMatrix& input, Rcpp::IntegerMatrix& output, int samples, unsigned int random_seed)
: input_matrix(input), output_matrix(output), samples(samples), random_seed(random_seed) {}

void DownsampleWorker::operator()(std::size_t begin, std::size_t end) {
for (std::size_t col = begin; col < end; ++col) {
std::vector<int> input_vec(input_matrix.column(col).begin(), input_matrix.column(col).end());
std::vector<int> output_vec(input_vec.size(), 0);

downsample_slice(input_vec, output_vec, samples, random_seed);

std::copy(output_vec.begin(), output_vec.end(), output_matrix.column(col).begin());
}
}

DownsampleWorkerSparse::DownsampleWorkerSparse(const Rcpp::IntegerVector& i, const Rcpp::IntegerVector& p, const Rcpp::IntegerVector& x,
Rcpp::IntegerVector& out_x, int samples, unsigned int random_seed)
: input_i(i), input_p(p), input_x(x), output_x(out_x), samples(samples), random_seed(random_seed) {}

void DownsampleWorkerSparse::operator()(std::size_t begin, std::size_t end) {
for (std::size_t col = begin; col < end; ++col) {
// Extract the current column from the sparse matrix
std::vector<int> input_vec;
for (int idx = input_p[col]; idx < input_p[col + 1]; ++idx) {
input_vec.push_back(input_x[idx]);
}

std::vector<int> output_vec(input_vec.size(), 0);

downsample_slice(input_vec, output_vec, samples, random_seed);

// Store results in the output sparse matrix
for (int idx = input_p[col], out_idx = 0; idx < input_p[col + 1]; ++idx, ++out_idx) {
output_x[idx] = output_vec[out_idx];
}
}
}
Loading

0 comments on commit c65b008

Please sign in to comment.