Added NaN-skipping capability for counts.
Also exposed and documented the underlying counts() function.
LTLA committed Mar 27, 2024
1 parent 0c5b63a commit 3737787
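A minimal sketch of how the new option might be used after this change; the matrix contents below and the umbrella header path are illustrative assumptions, not part of the commit:

#include "tatami/tatami.hpp" // assumed umbrella header providing tatami::DenseRowMatrix
#include "tatami_stats/counts.hpp"

#include <limits>
#include <vector>

int main() {
    // A small 2x3 dense matrix containing one NaN, purely for illustration.
    std::vector<double> values {
        0.0, std::numeric_limits<double>::quiet_NaN(), 1.0,
        0.0, 2.0, 0.0
    };
    tatami::DenseRowMatrix<double> mat(2, 3, std::move(values));

    // Default behavior, as before this commit: values are used as-is.
    auto zeros = tatami_stats::row_zero_counts(&mat);

    // New in this commit: set the skip_nan_ template parameter to ignore NaNs while counting.
    auto zeros_skipping_nan = tatami_stats::row_zero_counts<true>(&mat);
    return 0;
}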
Showing 2 changed files with 137 additions and 38 deletions.
123 changes: 86 additions & 37 deletions include/tatami_stats/counts.hpp
@@ -6,6 +6,7 @@
#include <vector>
#include <algorithm>
#include <cmath>
#include <type_traits>

/**
* @file counts.hpp
@@ -16,46 +17,72 @@
namespace tatami_stats {

/**
* @cond
* Count the number of values in each dimension element that satisfy the `condition`.
*
* @tparam skip_nan_ Whether to check for (and skip) NaNs.
* @tparam Value_ Type of the matrix value, should be numeric.
* @tparam Index_ Type of the row/column indices.
* @tparam Output_ Type of the output value.
*
* @param row Whether to compute the counts for each row (if `true`) or each column (otherwise).
* @param p Pointer to a `tatami::Matrix`.
* @param[out] output Pointer to an array of length equal to the number of rows (if `row = true`) or columns (otherwise).
* On output, this will contain the number of values in each row/column that satisfy `condition`.
* @param threads Number of threads to use.
* @param condition Function that accepts a `Value_` and returns a boolean.
*/
namespace count_internal {

template<bool row_, typename Output_, typename Value_, typename Index_, class Function_>
void dimension_counts(const tatami::Matrix<Value_, Index_>* p, int threads, Output_* output, Function_ fun) {
auto dim = (row_ ? p->nrow() : p->ncol());
auto otherdim = (row_ ? p->ncol() : p->nrow());
template<bool skip_nan_ = false, typename Value_, typename Index_, typename Output_, class Condition_>
void counts(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads, Condition_ condition) {
auto dim = (row ? p->nrow() : p->ncol());
auto otherdim = (row ? p->ncol() : p->nrow());
std::fill(output, output + dim, 0);

if (p->prefer_rows() == row_) {
if (p->prefer_rows() == row) {
if (p->sparse()) {
tatami::Options opt;
opt.sparse_ordered_index = false;
Output_ zerocount = fun(0);
bool count_zero = condition(0);

tatami::parallelize([&](int, Index_ start, Index_ len) -> void {
std::vector<Value_> xbuffer(otherdim);
std::vector<Index_> ibuffer(otherdim);
auto ext = tatami::consecutive_extractor<true>(p, row_, start, len, opt);
auto ext = tatami::consecutive_extractor<true>(p, row, start, len, opt);

for (Index_ x = 0; x < len; ++x) {
auto range = ext->fetch(xbuffer.data(), ibuffer.data());
Output_ target = 0;
for (Index_ j = 0; j < range.number; ++j) {
target += fun(range.value[j]);
auto val = range.value[j];
if constexpr(skip_nan_) {
if (std::isnan(val)) {
continue;
}
}
target += condition(val);
}
if (count_zero) {
target += otherdim - range.number;
}
output[x + start] = target + zerocount * (otherdim - range.number);
output[x + start] = target;
}
}, dim, threads);

} else {
tatami::parallelize([&](int, Index_ start, Index_ len) -> void {
std::vector<Value_> xbuffer(otherdim);
auto ext = tatami::consecutive_extractor<false>(p, row_, start, len);
auto ext = tatami::consecutive_extractor<false>(p, row, start, len);

for (Index_ x = 0; x < len; ++x) {
auto ptr = ext->fetch(xbuffer.data());
Output_ target = 0;
for (Index_ j = 0; j < otherdim; ++j) {
target += fun(ptr[j]);
auto val = ptr[j];
if constexpr(skip_nan_) {
if (std::isnan(val)) {
continue;
}
}
target += condition(val);
}
output[x + start] = target;
}
@@ -74,37 +101,60 @@ void dimension_counts(const tatami::Matrix<Value_, Index_>* p, int threads, Outp
if (p->sparse()) {
tatami::Options opt;
opt.sparse_ordered_index = false;
Output_ zerocount = fun(0);
bool count_zero = condition(0);

tatami::parallelize([&](int t, Index_ start, Index_ len) -> void {
std::vector<Value_> xbuffer(dim);
std::vector<Index_> ibuffer(dim);
auto ext = tatami::consecutive_extractor<true>(p, !row_, start, len, opt);
auto ext = tatami::consecutive_extractor<true>(p, !row, start, len, opt);

auto curoutput = threaded_output_ptrs[t];
std::vector<Index_> nonzeros(dim);
typename std::conditional<skip_nan_, std::vector<Index_>, Index_>::type lost(count_zero ? dim : 0);

for (Index_ x = 0; x < len; ++x) {
auto range = ext->fetch(xbuffer.data(), ibuffer.data());
for (Index_ j = 0; j < range.number; ++j) {
curoutput[range.index[j]] += fun(range.value[j]);
++(nonzeros[range.index[j]]);
auto val = range.value[j];
auto idx = range.index[j];
if constexpr(skip_nan_) {
if (std::isnan(val)) {
if (count_zero) {
++(lost[idx]);
}
continue;
}
}
curoutput[idx] += condition(val);
++(nonzeros[idx]);
}
}

for (int d = 0; d < dim; ++d) {
curoutput[d] += zerocount * (len - nonzeros[d]);
if (count_zero) {
for (int d = 0; d < dim; ++d) {
auto zeros = len - nonzeros[d];
if constexpr(skip_nan_) {
zeros -= lost[d];
}
curoutput[d] += zeros;
}
}
}, otherdim, threads);

} else {
tatami::parallelize([&](int t, Index_ start, Index_ len) -> void {
std::vector<Value_> xbuffer(dim);
auto ext = tatami::consecutive_extractor<false>(p, !row_, start, len);
auto ext = tatami::consecutive_extractor<false>(p, !row, start, len);
auto curoutput = threaded_output_ptrs[t];

for (Index_ x = 0; x < len; ++x) {
auto ptr = ext->fetch(xbuffer.data());
for (Index_ j = 0; j < dim; ++j) {
curoutput[j] += fun(ptr[j]);
auto val = ptr[j];
if constexpr(skip_nan_) {
if (std::isnan(val)) {
continue;
}
}
curoutput[j] += condition(val);
}
}
}, otherdim, threads);
@@ -119,11 +169,6 @@ void dimension_counts(const tatami::Matrix<Value_, Index_>* p, int threads, Outp
}
}

}
/**
* @endcond
*/

/**
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
@@ -136,7 +181,7 @@ void dimension_counts(const tatami::Matrix<Value_, Index_>* p, int threads, Outp
*/
template<typename Value_, typename Index_, typename Output_>
void row_nan_counts(const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads = 1) {
count_internal::dimension_counts<true>(p, threads, output, [](Value_ x) -> bool { return std::isnan(x); });
counts<false>(true, p, output, threads, [](Value_ x) -> bool { return std::isnan(x); });
}

/**
@@ -168,7 +213,7 @@ std::vector<Output_> row_nan_counts(const tatami::Matrix<Value_, Index_>* p, int
*/
template<typename Value_, typename Index_, typename Output_>
void column_nan_counts(const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads = 1) {
count_internal::dimension_counts<false>(p, threads, output, [](Value_ x) -> bool { return std::isnan(x); });
counts<false>(false, p, output, threads, [](Value_ x) -> bool { return std::isnan(x); });
}

/**
@@ -189,6 +234,7 @@ std::vector<Output_> column_nan_counts(const tatami::Matrix<Value_, Index_>* p,
}

/**
* @tparam skip_nan_ Whether to check for (and skip) NaNs.
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
* @tparam Output_ Type of the output value.
@@ -198,12 +244,13 @@ std::vector<Output_> column_nan_counts(const tatami::Matrix<Value_, Index_>* p,
* On output, this will store the number of zeros in each row.
* @param threads Number of threads to use.
*/
template<typename Value_, typename Index_, typename Output_>
template<bool skip_nan_ = false, typename Value_, typename Index_, typename Output_>
void row_zero_counts(const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads = 1) {
count_internal::dimension_counts<true>(p, threads, output, [](Value_ x) -> bool { return x == 0; });
counts<skip_nan_>(true, p, output, threads, [](Value_ x) -> bool { return x == 0; });
}

/**
* @tparam skip_nan_ Whether to check for (and skip) NaNs.
* @tparam Output_ Type of the output value.
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
@@ -213,14 +260,15 @@ void row_zero_counts(const tatami::Matrix<Value_, Index_>* p, Output_* output, i
*
* @return A vector of length equal to the number of rows, containing the number of zeros in each row.
*/
template<typename Output_ = int, typename Value_, typename Index_>
template<bool skip_nan_ = false, typename Output_ = int, typename Value_, typename Index_>
std::vector<Output_> row_zero_counts(const tatami::Matrix<Value_, Index_>* p, int threads = 1) {
std::vector<Output_> output(p->nrow());
row_zero_counts(p, output.data(), threads);
row_zero_counts<skip_nan_>(p, output.data(), threads);
return output;
}

/**
* @tparam skip_nan_ Whether to check for (and skip) NaNs.
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
* @tparam Output_ Type of the output value.
@@ -230,12 +278,13 @@ std::vector<Output_> row_zero_counts(const tatami::Matrix<Value_, Index_>* p, in
* On output, this will store the number of zeros in each column.
* @param threads Number of threads to use.
*/
template<typename Value_, typename Index_, typename Output_>
template<bool skip_nan_ = false, typename Value_, typename Index_, typename Output_>
void column_zero_counts(const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads = 1) {
count_internal::dimension_counts<false>(p, threads, output, [](Value_ x) -> bool { return x == 0; });
counts<skip_nan_>(false, p, output, threads, [](Value_ x) -> bool { return x == 0; });
}

/**
* @tparam skip_nan_ Whether to check for (and skip) NaNs.
* @tparam Output_ Type of the output value.
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
@@ -245,10 +294,10 @@ void column_zero_counts(const tatami::Matrix<Value_, Index_>* p, Output_* output
*
* @return A vector of length equal to the number of columns, containing the number of zeros in each column.
*/
template<typename Output_ = int, typename Value_, typename Index_>
template<bool skip_nan_ = false, typename Output_ = int, typename Value_, typename Index_>
std::vector<Output_> column_zero_counts(const tatami::Matrix<Value_, Index_>* p, int threads = 1) {
std::vector<Output_> output(p->ncol());
column_zero_counts(p, output.data(), threads);
column_zero_counts<skip_nan_>(p, output.data(), threads);
return output;
}

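Since counts() is now part of the public interface (rather than hidden in count_internal), it can be called directly with a user-supplied condition. A hedged sketch under the same header-path assumption as above; the helper name and the threshold of 10 are made up for illustration:

#include "tatami/tatami.hpp" // assumed umbrella header providing tatami::NumericMatrix
#include "tatami_stats/counts.hpp"

#include <vector>

// Hypothetical helper: count entries below a made-up threshold in each column,
// skipping NaNs via the new skip_nan_ template parameter.
inline std::vector<int> count_below_ten(const tatami::NumericMatrix* mat, int threads = 1) {
    std::vector<int> output(mat->ncol());
    tatami_stats::counts<true>(
        false,          // row = false, i.e. one count per column
        mat,
        output.data(),
        threads,
        [](double x) -> bool { return x < 10; }
    );
    return output;
}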
52 changes: 51 additions & 1 deletion tests/src/counts.cpp
@@ -122,7 +122,32 @@ TEST(ComputingDimCounts, RowZeroCounts) {
EXPECT_EQ(ref, tatami_stats::row_zero_counts(unsorted_column.get()));
}

TEST(ComputingDimCounts, ColumZeroCount) {
TEST(ComputingDimCounts, RowZeroCountsWithNan) {
size_t NR = 52, NC = 83;
auto dump = tatami_test::simulate_sparse_vector<double>(NR * NC, 0.1);
for (size_t r = 0; r < NR; ++r) { // Inject a NaN at the start of each row.
dump[r * NC] = std::numeric_limits<double>::quiet_NaN();
}

auto dense_row = std::unique_ptr<tatami::NumericMatrix>(new tatami::DenseRowMatrix<double>(NR, NC, dump));
auto dense_column = tatami::convert_to_dense<false>(dense_row.get());
auto sparse_row = tatami::convert_to_compressed_sparse<true>(dense_row.get());
auto sparse_column = tatami::convert_to_compressed_sparse<false>(dense_row.get());

std::vector<int> ref(NR);
for (size_t r = 0; r < NR; ++r) {
for (size_t c = 1; c < NC; ++c) { // skipping the first element.
ref[r] += (dump[c + r * NC] == 0);
}
}

EXPECT_EQ(ref, tatami_stats::row_zero_counts<true>(dense_row.get()));
EXPECT_EQ(ref, tatami_stats::row_zero_counts<true>(dense_column.get()));
EXPECT_EQ(ref, tatami_stats::row_zero_counts<true>(sparse_row.get()));
EXPECT_EQ(ref, tatami_stats::row_zero_counts<true>(sparse_column.get()));
}

TEST(ComputingDimCounts, ColumnZeroCounts) {
size_t NR = 79, NC = 62;
auto dump = tatami_test::simulate_sparse_vector<double>(NR * NC, 0.1);

@@ -155,3 +180,28 @@ TEST(ComputingDimCounts, ColumZeroCount) {
std::shared_ptr<tatami::NumericMatrix> unsorted_column(new tatami_test::UnsortedWrapper<double, int>(sparse_column));
EXPECT_EQ(ref, tatami_stats::column_zero_counts(unsorted_column.get()));
}

TEST(ComputingDimCounts, ColumnZeroCountsWithNan) {
size_t NR = 82, NC = 33;
auto dump = tatami_test::simulate_sparse_vector<double>(NR * NC, 0.1);
for (size_t c = 0; c < NC; ++c) { // Inject a NaN at the start of each column.
dump[c] = std::numeric_limits<double>::quiet_NaN();
}

auto dense_row = std::unique_ptr<tatami::NumericMatrix>(new tatami::DenseRowMatrix<double>(NR, NC, dump));
auto dense_column = tatami::convert_to_dense<false>(dense_row.get());
auto sparse_row = tatami::convert_to_compressed_sparse<true>(dense_row.get());
auto sparse_column = tatami::convert_to_compressed_sparse<false>(dense_row.get());

std::vector<int> ref(NC);
for (size_t c = 0; c < NC; ++c) {
for (size_t r = 1; r < NR; ++r) { // skipping the first row.
ref[c] += dump[c + r * NC] == 0;
}
}

EXPECT_EQ(ref, tatami_stats::column_zero_counts<true>(dense_row.get()));
EXPECT_EQ(ref, tatami_stats::column_zero_counts<true>(dense_column.get()));
EXPECT_EQ(ref, tatami_stats::column_zero_counts<true>(sparse_row.get()));
EXPECT_EQ(ref, tatami_stats::column_zero_counts<true>(sparse_column.get()));
}
