Skip to content

Commit

Permalink
Exposed and documented the underlying counts() function.
Browse files Browse the repository at this point in the history
  • Loading branch information
LTLA committed Mar 27, 2024
1 parent 0c5b63a commit 22dbdb8
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 32 deletions.
77 changes: 46 additions & 31 deletions include/tatami_stats/counts.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <vector>
#include <algorithm>
#include <cmath>
#include <type_traits>

/**
* @file counts.hpp
Expand All @@ -16,46 +17,60 @@
namespace tatami_stats {

/**
* @cond
* Count the number of values in each dimension element that satisfy the `condition`.
*
* @tparam Value_ Type of the matrix value, should be numeric.
* @tparam Index_ Type of the row/column indices.
* @tparam Output_ Type of the output value.
*
* @param row Whether to count in each row.
* @param p Pointer to a `tatami::Matrix`.
* @param[out] output Pointer to an array of length equal to the number of rows (if `row = true`) or columns (otherwise).
* On output, this will contain the row/column counts.
* @param threads Number of threads to use.
* @param condition Function that accepts a `Value_` and returns a boolean.
* If NaNs might be present in `p`, this should be handled by `condition`.
*/
namespace count_internal {

template<bool row_, typename Output_, typename Value_, typename Index_, class Function_>
void dimension_counts(const tatami::Matrix<Value_, Index_>* p, int threads, Output_* output, Function_ fun) {
auto dim = (row_ ? p->nrow() : p->ncol());
auto otherdim = (row_ ? p->ncol() : p->nrow());
template<typename Value_, typename Index_, typename Output_, class Condition_>
void counts(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads, Condition_ condition) {
auto dim = (row ? p->nrow() : p->ncol());
auto otherdim = (row ? p->ncol() : p->nrow());
std::fill(output, output + dim, 0);

if (p->prefer_rows() == row_) {
if (p->prefer_rows() == row) {
if (p->sparse()) {
tatami::Options opt;
opt.sparse_ordered_index = false;
Output_ zerocount = fun(0);
bool count_zero = condition(0);

tatami::parallelize([&](int, Index_ start, Index_ len) -> void {
std::vector<Value_> xbuffer(otherdim);
std::vector<Index_> ibuffer(otherdim);
auto ext = tatami::consecutive_extractor<true>(p, row_, start, len, opt);
auto ext = tatami::consecutive_extractor<true>(p, row, start, len, opt);

for (Index_ x = 0; x < len; ++x) {
auto range = ext->fetch(xbuffer.data(), ibuffer.data());
Output_ target = 0;
for (Index_ j = 0; j < range.number; ++j) {
target += fun(range.value[j]);
target += condition(range.value[j]);
}
if (count_zero) {
target += otherdim - range.number;
}
output[x + start] = target + zerocount * (otherdim - range.number);
output[x + start] = target;
}
}, dim, threads);

} else {
tatami::parallelize([&](int, Index_ start, Index_ len) -> void {
std::vector<Value_> xbuffer(otherdim);
auto ext = tatami::consecutive_extractor<false>(p, row_, start, len);
auto ext = tatami::consecutive_extractor<false>(p, row, start, len);

for (Index_ x = 0; x < len; ++x) {
auto ptr = ext->fetch(xbuffer.data());
Output_ target = 0;
for (Index_ j = 0; j < otherdim; ++j) {
target += fun(ptr[j]);
target += condition(ptr[j]);
}
output[x + start] = target;
}
Expand All @@ -74,37 +89,42 @@ void dimension_counts(const tatami::Matrix<Value_, Index_>* p, int threads, Outp
if (p->sparse()) {
tatami::Options opt;
opt.sparse_ordered_index = false;
Output_ zerocount = fun(0);
bool count_zero = condition(0);

tatami::parallelize([&](int t, Index_ start, Index_ len) -> void {
std::vector<Value_> xbuffer(dim);
std::vector<Index_> ibuffer(dim);
auto ext = tatami::consecutive_extractor<true>(p, !row_, start, len, opt);
auto ext = tatami::consecutive_extractor<true>(p, !row, start, len, opt);

auto curoutput = threaded_output_ptrs[t];
std::vector<Index_> nonzeros(dim);

for (Index_ x = 0; x < len; ++x) {
auto range = ext->fetch(xbuffer.data(), ibuffer.data());
for (Index_ j = 0; j < range.number; ++j) {
curoutput[range.index[j]] += fun(range.value[j]);
++(nonzeros[range.index[j]]);
auto idx = range.index[j];
curoutput[idx] += condition(range.value[j]);
++(nonzeros[idx]);
}
}

for (int d = 0; d < dim; ++d) {
curoutput[d] += zerocount * (len - nonzeros[d]);
if (count_zero) {
for (int d = 0; d < dim; ++d) {
curoutput[d] += len - nonzeros[d];
}
}
}, otherdim, threads);

} else {
tatami::parallelize([&](int t, Index_ start, Index_ len) -> void {
std::vector<Value_> xbuffer(dim);
auto ext = tatami::consecutive_extractor<false>(p, !row_, start, len);
auto ext = tatami::consecutive_extractor<false>(p, !row, start, len);
auto curoutput = threaded_output_ptrs[t];

for (Index_ x = 0; x < len; ++x) {
auto ptr = ext->fetch(xbuffer.data());
for (Index_ j = 0; j < dim; ++j) {
curoutput[j] += fun(ptr[j]);
curoutput[j] += condition(ptr[j]);
}
}
}, otherdim, threads);
Expand All @@ -119,11 +139,6 @@ void dimension_counts(const tatami::Matrix<Value_, Index_>* p, int threads, Outp
}
}

}
/**
* @endcond
*/

/**
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
Expand All @@ -136,7 +151,7 @@ void dimension_counts(const tatami::Matrix<Value_, Index_>* p, int threads, Outp
*/
template<typename Value_, typename Index_, typename Output_>
void row_nan_counts(const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads = 1) {
count_internal::dimension_counts<true>(p, threads, output, [](Value_ x) -> bool { return std::isnan(x); });
counts(true, p, output, threads, [](Value_ x) -> bool { return std::isnan(x); });
}

/**
Expand Down Expand Up @@ -168,7 +183,7 @@ std::vector<Output_> row_nan_counts(const tatami::Matrix<Value_, Index_>* p, int
*/
template<typename Value_, typename Index_, typename Output_>
void column_nan_counts(const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads = 1) {
count_internal::dimension_counts<false>(p, threads, output, [](Value_ x) -> bool { return std::isnan(x); });
counts(false, p, output, threads, [](Value_ x) -> bool { return std::isnan(x); });
}

/**
Expand Down Expand Up @@ -200,7 +215,7 @@ std::vector<Output_> column_nan_counts(const tatami::Matrix<Value_, Index_>* p,
*/
template<typename Value_, typename Index_, typename Output_>
void row_zero_counts(const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads = 1) {
count_internal::dimension_counts<true>(p, threads, output, [](Value_ x) -> bool { return x == 0; });
counts(true, p, output, threads, [](Value_ x) -> bool { return x == 0; });
}

/**
Expand Down Expand Up @@ -232,7 +247,7 @@ std::vector<Output_> row_zero_counts(const tatami::Matrix<Value_, Index_>* p, in
*/
template<typename Value_, typename Index_, typename Output_>
void column_zero_counts(const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads = 1) {
count_internal::dimension_counts<false>(p, threads, output, [](Value_ x) -> bool { return x == 0; });
counts(false, p, output, threads, [](Value_ x) -> bool { return x == 0; });
}

/**
Expand Down
52 changes: 51 additions & 1 deletion tests/src/counts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,32 @@ TEST(ComputingDimCounts, RowZeroCounts) {
EXPECT_EQ(ref, tatami_stats::row_zero_counts(unsorted_column.get()));
}

TEST(ComputingDimCounts, ColumZeroCount) {
TEST(ComputingDimCounts, RowZeroCountsWithNan) {
size_t NR = 52, NC = 83;
auto dump = tatami_test::simulate_sparse_vector<double>(NR * NC, 0.1);
for (size_t r = 0; r < NR; ++r) { // Injecting a NaN at the start of each row.
dump[r * NC] = std::numeric_limits<double>::quiet_NaN();
}

auto dense_row = std::unique_ptr<tatami::NumericMatrix>(new tatami::DenseRowMatrix<double>(NR, NC, dump));
auto dense_column = tatami::convert_to_dense<false>(dense_row.get());
auto sparse_row = tatami::convert_to_compressed_sparse<true>(dense_row.get());
auto sparse_column = tatami::convert_to_compressed_sparse<false>(dense_row.get());

std::vector<int> ref(NR);
for (size_t r = 0; r < NR; ++r) {
for (size_t c = 1; c < NC; ++c) { // skipping the first element.
ref[r] += (dump[c + r * NC] == 0);
}
}

EXPECT_EQ(ref, tatami_stats::row_zero_counts(dense_row.get()));
EXPECT_EQ(ref, tatami_stats::row_zero_counts(dense_column.get()));
EXPECT_EQ(ref, tatami_stats::row_zero_counts(sparse_row.get()));
EXPECT_EQ(ref, tatami_stats::row_zero_counts(sparse_column.get()));
}

TEST(ComputingDimCounts, ColumnZeroCounts) {
size_t NR = 79, NC = 62;
auto dump = tatami_test::simulate_sparse_vector<double>(NR * NC, 0.1);

Expand Down Expand Up @@ -155,3 +180,28 @@ TEST(ComputingDimCounts, ColumZeroCount) {
std::shared_ptr<tatami::NumericMatrix> unsorted_column(new tatami_test::UnsortedWrapper<double, int>(sparse_column));
EXPECT_EQ(ref, tatami_stats::column_zero_counts(unsorted_column.get()));
}

TEST(ComputingDimCounts, ColumnZeroCountsWithNan) {
size_t NR = 82, NC = 33;
auto dump = tatami_test::simulate_sparse_vector<double>(NR * NC, 0.1);
for (size_t c = 0; c < NC; ++c) { // Injecting a NaN at the start of each column.
dump[c] = std::numeric_limits<double>::quiet_NaN();
}

auto dense_row = std::unique_ptr<tatami::NumericMatrix>(new tatami::DenseRowMatrix<double>(NR, NC, dump));
auto dense_column = tatami::convert_to_dense<false>(dense_row.get());
auto sparse_row = tatami::convert_to_compressed_sparse<true>(dense_row.get());
auto sparse_column = tatami::convert_to_compressed_sparse<false>(dense_row.get());

std::vector<int> ref(NC);
for (size_t c = 0; c < NC; ++c) {
for (size_t r = 1; r < NR; ++r) { // skipping the first row.
ref[c] += dump[c + r * NC] == 0;
}
}

EXPECT_EQ(ref, tatami_stats::column_zero_counts(dense_row.get()));
EXPECT_EQ(ref, tatami_stats::column_zero_counts(dense_column.get()));
EXPECT_EQ(ref, tatami_stats::column_zero_counts(sparse_row.get()));
EXPECT_EQ(ref, tatami_stats::column_zero_counts(sparse_column.get()));
}

0 comments on commit 22dbdb8

Please sign in to comment.