Added NaN-skipping capability for counts.
Also exposed and documented the underlying counts() function.
LTLA committed Mar 27, 2024
1 parent 0c5b63a commit 3737787
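A minimal sketch of how the new option might be used after this change; the matrix contents below and the umbrella header path are illustrative assumptions, not part of the commit:

#include "tatami/tatami.hpp" // assumed umbrella header providing tatami::DenseRowMatrix
#include "tatami_stats/counts.hpp"

#include <limits>
#include <vector>

int main() {
    // A small 2x3 dense matrix containing one NaN, purely for illustration.
    std::vector<double> values {
        0.0, std::numeric_limits<double>::quiet_NaN(), 1.0,
        0.0, 2.0, 0.0
    };
    tatami::DenseRowMatrix<double> mat(2, 3, std::move(values));

    // Default behavior, as before this commit: values are used as-is.
    auto zeros = tatami_stats::row_zero_counts(&mat);

    // New in this commit: set the skip_nan_ template parameter to ignore NaNs while counting.
    auto zeros_skipping_nan = tatami_stats::row_zero_counts<true>(&mat);
    return 0;
}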
Showing 2 changed files with 137 additions and 38 deletions.
123 changes: 86 additions & 37 deletions include/tatami_stats/counts.hpp
@@ -6,6 +6,7 @@
#include <vector>
#include <algorithm>
#include <cmath>
#include <type_traits>

/**
* @file counts.hpp
@@ -16,46 +17,72 @@
namespace tatami_stats {

/**
* @cond
* Count the number of values in each dimension element that satisfy the `condition`.
*
* @tparam skip_nan_ Whether to check for (and skip) NaNs.
* @tparam Value_ Type of the matrix value, should be numeric.
* @tparam Index_ Type of the row/column indices.
* @tparam Output_ Type of the output value.
*
* @param row Whether to compute the counts for each row (if `true`) or each column (otherwise).
* @param p Pointer to a `tatami::Matrix`.
* @param[out] output Pointer to an array of length equal to the number of rows (if `row = true`) or columns (otherwise).
* On output, this will contain the number of values in each row/column that satisfy `condition`.
* @param threads Number of threads to use.
* @param condition Function that accepts a `Value_` and returns a boolean.
*/
namespace count_internal {

template<bool row_, typename Output_, typename Value_, typename Index_, class Function_>
void dimension_counts(const tatami::Matrix<Value_, Index_>* p, int threads, Output_* output, Function_ fun) {
auto dim = (row_ ? p->nrow() : p->ncol());
auto otherdim = (row_ ? p->ncol() : p->nrow());
template<bool skip_nan_ = false, typename Value_, typename Index_, typename Output_, class Condition_>
void counts(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads, Condition_ condition) {
auto dim = (row ? p->nrow() : p->ncol());
auto otherdim = (row ? p->ncol() : p->nrow());
std::fill(output, output + dim, 0);

if (p->prefer_rows() == row_) {
if (p->prefer_rows() == row) {
if (p->sparse()) {
tatami::Options opt;
opt.sparse_ordered_index = false;
Output_ zerocount = fun(0);
bool count_zero = condition(0);

tatami::parallelize([&](int, Index_ start, Index_ len) -> void {
std::vector<Value_> xbuffer(otherdim);
std::vector<Index_> ibuffer(otherdim);
auto ext = tatami::consecutive_extractor<true>(p, row_, start, len, opt);
auto ext = tatami::consecutive_extractor<true>(p, row, start, len, opt);

for (Index_ x = 0; x < len; ++x) {
auto range = ext->fetch(xbuffer.data(), ibuffer.data());
Output_ target = 0;
for (Index_ j = 0; j < range.number; ++j) {
target += fun(range.value[j]);
auto val = range.value[j];
if constexpr(skip_nan_) {
if (std::isnan(val)) {
continue;
}
}
target += condition(val);
}
if (count_zero) {
target += otherdim - range.number;
}
output[x + start] = target + zerocount * (otherdim - range.number);
output[x + start] = target;
}
}, dim, threads);

} else {
tatami::parallelize([&](int, Index_ start, Index_ len) -> void {
std::vector<Value_> xbuffer(otherdim);
auto ext = tatami::consecutive_extractor<false>(p, row_, start, len);
auto ext = tatami::consecutive_extractor<false>(p, row, start, len);

for (Index_ x = 0; x < len; ++x) {
auto ptr = ext->fetch(xbuffer.data());
Output_ target = 0;
for (Index_ j = 0; j < otherdim; ++j) {
target += fun(ptr[j]);
auto val = ptr[j];
if constexpr(skip_nan_) {
if (std::isnan(val)) {
continue;
}
}
target += condition(val);
}
output[x + start] = target;
}
@@ -74,37 +101,60 @@ void dimension_counts(const tatami::Matrix<Value_, Index_>* p, int threads, Outp
if (p->sparse()) {
tatami::Options opt;
opt.sparse_ordered_index = false;
Output_ zerocount = fun(0);
bool count_zero = condition(0);

tatami::parallelize([&](int t, Index_ start, Index_ len) -> void {
std::vector<Value_> xbuffer(dim);
std::vector<Index_> ibuffer(dim);
auto ext = tatami::consecutive_extractor<true>(p, !row_, start, len, opt);
auto ext = tatami::consecutive_extractor<true>(p, !row, start, len, opt);

auto curoutput = threaded_output_ptrs[t];
std::vector<Index_> nonzeros(dim);
typename std::conditional<skip_nan_, std::vector<Index_>, Index_>::type lost(count_zero ? dim : 0);

for (Index_ x = 0; x < len; ++x) {
auto range = ext->fetch(xbuffer.data(), ibuffer.data());
for (Index_ j = 0; j < range.number; ++j) {
curoutput[range.index[j]] += fun(range.value[j]);
++(nonzeros[range.index[j]]);
auto val = range.value[j];
auto idx = range.index[j];
if constexpr(skip_nan_) {
if (std::isnan(val)) {
if (count_zero) {
++(lost[idx]);
}
continue;
}
}
curoutput[idx] += condition(val);
++(nonzeros[idx]);
}
}

for (int d = 0; d < dim; ++d) {
curoutput[d] += zerocount * (len - nonzeros[d]);
if (count_zero) {
for (int d = 0; d < dim; ++d) {
auto zeros = len - nonzeros[d];
if constexpr(skip_nan_) {
zeros -= lost[d];
}
curoutput[d] += zeros;
}
}
}, otherdim, threads);

} else {
tatami::parallelize([&](int t, Index_ start, Index_ len) -> void {
std::vector<Value_> xbuffer(dim);
auto ext = tatami::consecutive_extractor<false>(p, !row_, start, len);
auto ext = tatami::consecutive_extractor<false>(p, !row, start, len);
auto curoutput = threaded_output_ptrs[t];

for (Index_ x = 0; x < len; ++x) {
auto ptr = ext->fetch(xbuffer.data());
for (Index_ j = 0; j < dim; ++j) {
curoutput[j] += fun(ptr[j]);
auto val = ptr[j];
if constexpr(skip_nan_) {
if (std::isnan(val)) {
continue;
}
}
curoutput[j] += condition(val);
}
}
}, otherdim, threads);
@@ -119,11 +169,6 @@ void dimension_counts(const tatami::Matrix<Value_, Index_>* p, int threads, Outp
}
}

}
/**
* @endcond
*/

/**
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
@@ -136,7 +181,7 @@ void dimension_counts(const tatami::Matrix<Value_, Index_>* p, int threads, Outp
*/
template<typename Value_, typename Index_, typename Output_>
void row_nan_counts(const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads = 1) {
count_internal::dimension_counts<true>(p, threads, output, [](Value_ x) -> bool { return std::isnan(x); });
counts<false>(true, p, output, threads, [](Value_ x) -> bool { return std::isnan(x); });
}

/**
@@ -168,7 +213,7 @@ std::vector<Output_> row_nan_counts(const tatami::Matrix<Value_, Index_>* p, int
*/
template<typename Value_, typename Index_, typename Output_>
void column_nan_counts(const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads = 1) {
count_internal::dimension_counts<false>(p, threads, output, [](Value_ x) -> bool { return std::isnan(x); });
counts<false>(false, p, output, threads, [](Value_ x) -> bool { return std::isnan(x); });
}

/**
@@ -189,6 +234,7 @@ std::vector<Output_> column_nan_counts(const tatami::Matrix<Value_, Index_>* p,
}

/**
* @tparam skip_nan_ Whether to check for (and skip) NaNs.
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
* @tparam Output_ Type of the output value.
@@ -198,12 +244,13 @@ std::vector<Output_> column_nan_counts(const tatami::Matrix<Value_, Index_>* p,
* On output, this will store the number of zeros in each row.
* @param threads Number of threads to use.
*/
template<typename Value_, typename Index_, typename Output_>
template<bool skip_nan_ = false, typename Value_, typename Index_, typename Output_>
void row_zero_counts(const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads = 1) {
count_internal::dimension_counts<true>(p, threads, output, [](Value_ x) -> bool { return x == 0; });
counts<skip_nan_>(true, p, output, threads, [](Value_ x) -> bool { return x == 0; });
}

/**
* @tparam skip_nan_ Whether to check for (and skip) NaNs.
* @tparam Output_ Type of the output value.
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
@@ -213,14 +260,15 @@ void row_zero_counts(const tatami::Matrix<Value_, Index_>* p, Output_* output, i
*
* @return A vector of length equal to the number of rows, containing the number of zeros in each row.
*/
template<typename Output_ = int, typename Value_, typename Index_>
template<bool skip_nan_ = false, typename Output_ = int, typename Value_, typename Index_>
std::vector<Output_> row_zero_counts(const tatami::Matrix<Value_, Index_>* p, int threads = 1) {
std::vector<Output_> output(p->nrow());
row_zero_counts(p, output.data(), threads);
row_zero_counts<skip_nan_>(p, output.data(), threads);
return output;
}

/**
* @tparam skip_nan_ Whether to check for (and skip) NaNs.
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
* @tparam Output_ Type of the output value.
@@ -230,12 +278,13 @@ std::vector<Output_> row_zero_counts(const tatami::Matrix<Value_, Index_>* p, in
* On output, this will store the number of zeros in each column.
* @param threads Number of threads to use.
*/
template<typename Value_, typename Index_, typename Output_>
template<bool skip_nan_ = false, typename Value_, typename Index_, typename Output_>
void column_zero_counts(const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads = 1) {
count_internal::dimension_counts<false>(p, threads, output, [](Value_ x) -> bool { return x == 0; });
counts<skip_nan_>(false, p, output, threads, [](Value_ x) -> bool { return x == 0; });
}

/**
* @tparam skip_nan_ Whether to check for (and skip) NaNs.
* @tparam Output_ Type of the output value.
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
@@ -245,10 +294,10 @@ void column_zero_counts(const tatami::Matrix<Value_, Index_>* p, Output_* output
*
* @return A vector of length equal to the number of columns, containing the number of zeros in each column.
*/
template<typename Output_ = int, typename Value_, typename Index_>
template<bool skip_nan_ = false, typename Output_ = int, typename Value_, typename Index_>
std::vector<Output_> column_zero_counts(const tatami::Matrix<Value_, Index_>* p, int threads = 1) {
std::vector<Output_> output(p->ncol());
column_zero_counts(p, output.data(), threads);
column_zero_counts<skip_nan_>(p, output.data(), threads);
return output;
}

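Since counts() is now part of the public interface (rather than hidden in count_internal), it can be called directly with a user-supplied condition. A hedged sketch under the same header-path assumption as above; the helper name and the threshold of 10 are made up for illustration:

#include "tatami/tatami.hpp" // assumed umbrella header providing tatami::NumericMatrix
#include "tatami_stats/counts.hpp"

#include <vector>

// Hypothetical helper: count entries below a made-up threshold in each column,
// skipping NaNs via the new skip_nan_ template parameter.
inline std::vector<int> count_below_ten(const tatami::NumericMatrix* mat, int threads = 1) {
    std::vector<int> output(mat->ncol());
    tatami_stats::counts<true>(
        false,          // row = false, i.e. one count per column
        mat,
        output.data(),
        threads,
        [](double x) -> bool { return x < 10; }
    );
    return output;
}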
52 changes: 51 additions & 1 deletion tests/src/counts.cpp
@@ -122,7 +122,32 @@ TEST(ComputingDimCounts, RowZeroCounts) {
EXPECT_EQ(ref, tatami_stats::row_zero_counts(unsorted_column.get()));
}

TEST(ComputingDimCounts, ColumZeroCount) {
TEST(ComputingDimCounts, RowZeroCountsWithNan) {
size_t NR = 52, NC = 83;
auto dump = tatami_test::simulate_sparse_vector<double>(NR * NC, 0.1);
for (size_t r = 0; r < NR; ++r) { // Inject a NaN at the start of each row.
dump[r * NC] = std::numeric_limits<double>::quiet_NaN();
}

auto dense_row = std::unique_ptr<tatami::NumericMatrix>(new tatami::DenseRowMatrix<double>(NR, NC, dump));
auto dense_column = tatami::convert_to_dense<false>(dense_row.get());
auto sparse_row = tatami::convert_to_compressed_sparse<true>(dense_row.get());
auto sparse_column = tatami::convert_to_compressed_sparse<false>(dense_row.get());

std::vector<int> ref(NR);
for (size_t r = 0; r < NR; ++r) {
for (size_t c = 1; c < NC; ++c) { // skipping the first element.
ref[r] += (dump[c + r * NC] == 0);
}
}

EXPECT_EQ(ref, tatami_stats::row_zero_counts<true>(dense_row.get()));
EXPECT_EQ(ref, tatami_stats::row_zero_counts<true>(dense_column.get()));
EXPECT_EQ(ref, tatami_stats::row_zero_counts<true>(sparse_row.get()));
EXPECT_EQ(ref, tatami_stats::row_zero_counts<true>(sparse_column.get()));
}

TEST(ComputingDimCounts, ColumnZeroCounts) {
size_t NR = 79, NC = 62;
auto dump = tatami_test::simulate_sparse_vector<double>(NR * NC, 0.1);

@@ -155,3 +180,28 @@ TEST(ComputingDimCounts, ColumZeroCount) {
std::shared_ptr<tatami::NumericMatrix> unsorted_column(new tatami_test::UnsortedWrapper<double, int>(sparse_column));
EXPECT_EQ(ref, tatami_stats::column_zero_counts(unsorted_column.get()));
}

TEST(ComputingDimCounts, ColumnZeroCountsWithNan) {
size_t NR = 82, NC = 33;
auto dump = tatami_test::simulate_sparse_vector<double>(NR * NC, 0.1);
for (size_t c = 0; c < NC; ++c) { // Inject a NaN at the start of each column.
dump[c] = std::numeric_limits<double>::quiet_NaN();
}

auto dense_row = std::unique_ptr<tatami::NumericMatrix>(new tatami::DenseRowMatrix<double>(NR, NC, dump));
auto dense_column = tatami::convert_to_dense<false>(dense_row.get());
auto sparse_row = tatami::convert_to_compressed_sparse<true>(dense_row.get());
auto sparse_column = tatami::convert_to_compressed_sparse<false>(dense_row.get());

std::vector<int> ref(NC);
for (size_t c = 0; c < NC; ++c) {
for (size_t r = 1; r < NR; ++r) { // skipping the first row.
ref[c] += dump[c + r * NC] == 0;
}
}

EXPECT_EQ(ref, tatami_stats::column_zero_counts<true>(dense_row.get()));
EXPECT_EQ(ref, tatami_stats::column_zero_counts<true>(dense_column.get()));
EXPECT_EQ(ref, tatami_stats::column_zero_counts<true>(sparse_row.get()));
EXPECT_EQ(ref, tatami_stats::column_zero_counts<true>(sparse_column.get()));
}
