-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #15 from tanaylab/feat@downsample-matrix
Feat@downsample matrix
- Loading branch information
Showing
14 changed files
with
656 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
Package: tglkmeans | ||
Title: Efficient Implementation of K-Means++ Algorithm | ||
Version: 0.4.0 | ||
Version: 0.5.0 | ||
Authors@R: c( | ||
person("Aviezer", "Lifshitz", , "[email protected]", role = c("aut", "cre")), | ||
person("Amos", "Tanay", role = "aut"), | ||
|
@@ -30,6 +30,8 @@ Imports: | |
future, | ||
ggplot2 (>= 2.2.0), | ||
magrittr, | ||
Matrix, | ||
methods, | ||
parallel (>= 3.3.2), | ||
plyr (>= 1.8.4), | ||
purrr (>= 0.2.0), | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
#' Downsample the columns of a matrix to a target number | ||
#' | ||
#' @description This function takes a matrix and downsamples it to a target number of samples. | ||
#' It uses a random seed for reproducibility and allows for removing columns with | ||
#' small sums. | ||
#' | ||
#' @param mat An integer matrix to be downsampled. Can be a matrix or sparse matrix (dgCMatrix). | ||
#' If the matrix contains NAs, the function will run significantly slower. Values that are | ||
#' not integers will be coerced to integers using {.code floor()}. | ||
#' @param target_n The target number of samples to downsample to. | ||
#' @param target_q A target quantile of sums to downsample to. Only one of {.field target_n} or {.field target_q} can be provided. | ||
#' @param seed The random seed for reproducibility (default is NULL) | ||
#' @param remove_columns Logical indicating whether to remove columns with small sums (default is FALSE) | ||
#' | ||
#' @return The downsampled matrix | ||
#' | ||
#' @examples | ||
#' \dontshow{ | ||
#' tglkmeans.set_parallel(1) | ||
#' } | ||
#' | ||
#' mat <- matrix(1:12, nrow = 4) | ||
#' downsample_matrix(mat, 2) | ||
#' | ||
#' # Remove columns with small sums | ||
#' downsample_matrix(mat, 12, remove_columns = TRUE) | ||
#' | ||
#' # sparse matrix | ||
#' mat_sparse <- Matrix::Matrix(mat, sparse = TRUE) | ||
#' downsample_matrix(mat_sparse, 2) | ||
#' | ||
#' # with a quantile | ||
#' downsample_matrix(mat, target_q = 0.5) | ||
#' | ||
#' @export | ||
downsample_matrix <- function(mat, target_n = NULL, target_q = NULL, seed = NULL, remove_columns = FALSE) { | ||
if (is.null(target_n) && is.null(target_q)) { | ||
cli_abort("Either {.field target_n} or {.field target_q} must be provided.") | ||
} else if (!is.null(target_n) && !is.null(target_q)) { | ||
cli_abort("Only one of {.field target_n} or {.field target_q} can be provided.") | ||
} | ||
|
||
sums <- colsums_matrix(mat) | ||
if (!is.null(target_q)) { | ||
target_n <- round(stats::quantile(sums, target_q)) | ||
cli::cli_alert_info("Using {.val {target_n}} as the target number.") | ||
} | ||
|
||
if (is.null(seed)) { | ||
seed <- sample(1:10000, 1) | ||
cli::cli_alert_warning("No seed provided. Using {.val {seed}}.") | ||
} else if (!is.numeric(seed) || seed <= 0 || seed != as.integer(seed)) { | ||
cli_abort("{.field seed} must be a positive integer.") | ||
} | ||
|
||
if (!is.logical(remove_columns)) { | ||
cli_abort("{.field remove_columns} must be a logical value.") | ||
} | ||
|
||
if (target_n <= 0 || target_n != as.integer(target_n)) { | ||
cli_abort("{.field target_n} must be a positive integer.") | ||
} | ||
|
||
# replace NAs with 0s for the cpp code | ||
has_nas <- FALSE | ||
if (any(is.na(mat))) { | ||
has_nas <- TRUE | ||
cli_warn("Input matrix contains NAs. Processing would be significantly slower.") | ||
orig_mat <- mat | ||
mat[is.na(mat)] <- 0 | ||
} | ||
|
||
if (methods::is(mat, "dgCMatrix")) { | ||
ds_mat <- rcpp_downsample_sparse(mat, target_n, seed) | ||
} else if (is.matrix(mat)) { | ||
ds_mat <- downsample_matrix_cpp(mat, target_n, seed) | ||
} | ||
|
||
small_cols <- sums < target_n | ||
if (any(small_cols)) { | ||
if (remove_columns) { | ||
ds_mat <- ds_mat[, !small_cols, drop = FALSE] | ||
if (has_nas) { | ||
orig_mat <- orig_mat[, !small_cols, drop = FALSE] | ||
} | ||
cli_alert_info("Removed {.val {sum(small_cols)}} columns with a sum smaller than {.val {target_n}}.") | ||
} else { | ||
cli_warn("{.val {sum(small_cols)}} columns have a sum smaller than {.val {target_n}}. These columns were not changed. To remove them, set {.field remove_columns=TRUE}.") | ||
} | ||
} | ||
|
||
if (has_nas) { | ||
# put back the NAs | ||
ds_mat[is.na(orig_mat)] <- NA | ||
} | ||
|
||
|
||
return(ds_mat) | ||
} | ||
|
||
colsums_matrix <- function(mat) { | ||
if (methods::is(mat, "dgCMatrix")) { | ||
return(Matrix::colSums(mat, na.rm = TRUE)) | ||
} else if (is.matrix(mat)) { | ||
return(colSums(mat, na.rm = TRUE)) | ||
} else { | ||
cli_abort("Input must be a matrix or a sparse matrix (dgCMatrix). class of {.field mat} is {.val {class(mat)}}.") | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
#include <vector> | ||
#include <algorithm> | ||
#include <cassert> | ||
#include <random> | ||
#include <Rcpp.h> | ||
#include <RcppParallel.h> | ||
#include "DownsampleWorker.h" | ||
|
||
typedef float float32_t; | ||
typedef double float64_t; | ||
typedef unsigned char uint8_t; | ||
typedef unsigned int uint_t; | ||
|
||
static size_t | ||
ceil_power_of_two(const size_t size) { | ||
return size_t(1) << size_t(ceil(log2(float64_t(size)))); | ||
} | ||
|
||
template<typename D> | ||
static void initialize_tree(const std::vector<D>& input, std::vector<size_t>& tree) { | ||
assert(input.size() >= 2); | ||
|
||
size_t input_size = ceil_power_of_two(input.size()); | ||
tree.resize(2 * input_size - 1); // Resize the tree to the necessary size | ||
|
||
// Copy input to the beginning of the tree and fill the rest with zeros | ||
std::copy(input.begin(), input.end(), tree.begin()); | ||
std::fill(tree.begin() + input.size(), tree.begin() + input_size, 0); | ||
|
||
// Iteratively build the tree | ||
size_t tree_index = 0; | ||
while (input_size > 1) { | ||
size_t half_size = input_size / 2; | ||
for (size_t index = 0; index < half_size; ++index) { | ||
const auto left = tree[tree_index + index * 2]; | ||
const auto right = tree[tree_index + index * 2 + 1]; | ||
tree[tree_index + input_size + index] = left + right; | ||
|
||
assert(left >= 0); | ||
assert(right >= 0); | ||
assert(tree[tree_index + input_size + index] == size_t(left) + size_t(right)); | ||
} | ||
tree_index += input_size; | ||
input_size = half_size; | ||
} | ||
assert(tree.size() == 2 * input.size() - 1); | ||
} | ||
|
||
static size_t random_sample(std::vector<size_t>& tree, ssize_t random) { | ||
size_t size_of_level = 1; | ||
ssize_t base_of_level = tree.size() - 1; | ||
size_t index_in_level = 0; | ||
size_t index_in_tree = base_of_level + index_in_level; | ||
|
||
while (true) { | ||
assert(index_in_tree == base_of_level + index_in_level); | ||
assert(tree[index_in_tree] > random); | ||
|
||
--tree[index_in_tree]; | ||
size_of_level *= 2; | ||
base_of_level -= size_of_level; | ||
|
||
if (base_of_level < 0) { | ||
return index_in_level; | ||
} | ||
|
||
index_in_level *= 2; | ||
index_in_tree = base_of_level + index_in_level; | ||
ssize_t right_random = random - ssize_t(tree[index_in_tree]); | ||
|
||
assert(tree[base_of_level + index_in_level] + tree[base_of_level + index_in_level + 1] == | ||
tree[base_of_level + size_of_level + index_in_level / 2] + 1); | ||
|
||
if (right_random >= 0) { | ||
++index_in_level; | ||
++index_in_tree; | ||
assert(index_in_level < size_of_level); | ||
random = right_random; | ||
} | ||
} | ||
} | ||
|
||
template<typename D, typename O> | ||
static void downsample_slice(const std::vector<D>& input, std::vector<O>& output, const int32_t samples, const size_t random_seed) { | ||
assert(output.size() == input.size()); | ||
|
||
if (samples < 0 || input.size() == 0) { | ||
return; | ||
} | ||
|
||
if (input.size() == 1) { | ||
output[0] = O(static_cast<double>(samples) < static_cast<double>(input[0]) ? samples : input[0]); | ||
return; | ||
} | ||
|
||
size_t input_size = ceil_power_of_two(input.size()); | ||
std::vector<size_t> tree(2 * input_size - 1); | ||
initialize_tree(input, tree); | ||
size_t& total = tree[tree.size() - 1]; | ||
|
||
if (total <= static_cast<size_t>(samples)) { | ||
std::copy(input.begin(), input.end(), output.begin()); | ||
return; | ||
} | ||
|
||
std::fill(output.begin(), output.end(), O(0)); | ||
|
||
std::minstd_rand random(random_seed); | ||
for (size_t index = 0; index < static_cast<size_t>(samples); ++index) { | ||
size_t sampled_index = random_sample(tree, random() % total); | ||
if (sampled_index < output.size()) { | ||
++output[sampled_index]; | ||
} | ||
} | ||
} | ||
|
||
DownsampleWorker::DownsampleWorker(const Rcpp::IntegerMatrix& input, Rcpp::IntegerMatrix& output, int samples, unsigned int random_seed) | ||
: input_matrix(input), output_matrix(output), samples(samples), random_seed(random_seed) {} | ||
|
||
void DownsampleWorker::operator()(std::size_t begin, std::size_t end) { | ||
for (std::size_t col = begin; col < end; ++col) { | ||
std::vector<int> input_vec(input_matrix.column(col).begin(), input_matrix.column(col).end()); | ||
std::vector<int> output_vec(input_vec.size(), 0); | ||
|
||
downsample_slice(input_vec, output_vec, samples, random_seed); | ||
|
||
std::copy(output_vec.begin(), output_vec.end(), output_matrix.column(col).begin()); | ||
} | ||
} | ||
|
||
DownsampleWorkerSparse::DownsampleWorkerSparse(const Rcpp::IntegerVector& i, const Rcpp::IntegerVector& p, const Rcpp::IntegerVector& x, | ||
Rcpp::IntegerVector& out_x, int samples, unsigned int random_seed) | ||
: input_i(i), input_p(p), input_x(x), output_x(out_x), samples(samples), random_seed(random_seed) {} | ||
|
||
void DownsampleWorkerSparse::operator()(std::size_t begin, std::size_t end) { | ||
for (std::size_t col = begin; col < end; ++col) { | ||
// Extract the current column from the sparse matrix | ||
std::vector<int> input_vec; | ||
for (int idx = input_p[col]; idx < input_p[col + 1]; ++idx) { | ||
input_vec.push_back(input_x[idx]); | ||
} | ||
|
||
std::vector<int> output_vec(input_vec.size(), 0); | ||
|
||
downsample_slice(input_vec, output_vec, samples, random_seed); | ||
|
||
// Store results in the output sparse matrix | ||
for (int idx = input_p[col], out_idx = 0; idx < input_p[col + 1]; ++idx, ++out_idx) { | ||
output_x[idx] = output_vec[out_idx]; | ||
} | ||
} | ||
} |
Oops, something went wrong.