Switch to Neumaier's summation for dimension sums.
This is more numerically stable than the naive sum we were doing before.

We also add NaN-ignoring support for summation, and we expose a sums() function
that takes the dimension as a run-time argument.
LTLA committed Mar 26, 2024
1 parent 58e66af commit f8dc69e
Showing 3 changed files with 304 additions and 53 deletions.
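For orientation, here is a rough usage sketch of the interface introduced by this commit; it is not part of the diff and assumes tatami's DenseRowMatrix and the header paths shown below.

#include "tatami/tatami.hpp"
#include "tatami_stats/sums.hpp"
#include <cmath>
#include <vector>

int main() {
    // A 2x3 row-major matrix; the middle entry of the second row is NaN.
    std::vector<double> values { 1e16, 1.0, -1e16, 0.5, std::nan(""), 2.5 };
    tatami::DenseRowMatrix<double, int> mat(2, 3, std::move(values));

    // NaN-ignoring row sums via the new skip_nan_ template parameter.
    auto rs = tatami_stats::row_sums<true>(&mat); // { 1.0, 3.0 }

    // Dimension chosen at run time via the new sums() function.
    std::vector<double> cs(mat.ncol());
    tatami_stats::sums<true>(/* row = */ false, &mat, cs.data(), /* threads = */ 1);
    return 0;
}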
251 changes: 219 additions & 32 deletions include/tatami_stats/sums.hpp
@@ -16,67 +16,259 @@
namespace tatami_stats {

/**
- * @cond
* @brief Functions for computing dimension-wise sums.
* @namespace tatami_stats::sum
*/
-namespace sum_internal {
namespace sum {

-template<bool row_, typename Value_, typename Index_, typename Output_>
-void dimension_sums(const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads) {
-auto dim = (row_ ? p->nrow() : p->ncol());
-auto otherdim = (row_ ? p->ncol() : p->nrow());
-const bool direct = p->prefer_rows() == row_;
/**
* Add the next value for Neumaier summation.
* This function should be called once per value to accumulate the running sum and its error;
* after the last call, the error should be added to the sum to obtain the final compensated sum.
*
* @tparam skip_nan_ Whether to check for (and skip) NaNs.
* @tparam Output_ Type of the output data.
* @tparam Value_ Type of the input data.
*
* @param sum Current value of the sum.
* This should be set to zero before the first call to this function.
* @param error Current value of the error.
* This should be set to zero before the first call to this function.
* @param val Value to be added to `sum`.
*/
template<bool skip_nan_ = false, typename Output_, typename Value_>
void add(Output_& sum, Output_& error, Value_ val) {
if constexpr(skip_nan_) {
if (std::isnan(val)) {
return;
}
}

auto t = sum + val;
if (std::abs(sum) >= std::abs(val)) {
error += (sum - t) + val;
} else {
error += (val - t) + sum;
}
sum = t;
}

/**
* Perform Neumaier summation on an array of values.
*
* @tparam skip_nan_ Whether to check for (and skip) NaNs.
* @tparam Output_ Type of the output data.
* @tparam Value_ Type of the input data.
* @tparam Index_ Type of the row/column index.
*
* @param[in] ptr Pointer to an array of values of length `num`.
* @param num Size of the array.
* @return The compensated sum.
*/
template<bool skip_nan_ = false, typename Output_, typename Value_, typename Index_>
Output_ compute(const Value_* ptr, Index_ num) {
Output_ sum = 0, error = 0;
for (Index_ i = 0; i < num; ++i) {
add<skip_nan_>(sum, error, ptr[i]);
}
return sum + error;
}
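To see what the compensation buys, here is a minimal sketch (assuming only this header plus the standard library) comparing sum::compute() against a naive accumulation on a classic cancellation example:

#include "tatami_stats/sums.hpp"
#include <numeric>
#include <iostream>

int main() {
    double values[] = { 1e16, 1.0, -1e16 };

    // Naive left-to-right summation: the 1.0 is absorbed into 1e16 and lost.
    double naive = std::accumulate(values, values + 3, 0.0); // 0

    // Neumaier summation keeps the lost low-order bits in the error term.
    double compensated = tatami_stats::sum::compute<false, double>(values, 3); // 1
    std::cout << naive << " vs " << compensated << std::endl;
    return 0;
}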

/**
* @brief Running sums from dense data.
*
* Compute running sums from dense data using Neumaier summation.
* This considers a scenario with a set of equilength "target" vectors [V1, V2, V3, ..., Vn],
* but data are only available for "observed" vectors [P1, P2, P3, ..., Pm],
* where Pi[j] contains the i-th element of target vector Vj.
* The idea is to call `add()` once per observed vector (from 0 to m - 1, with `ptr` pointing to that vector),
* and then call `finish()` to obtain the sum for each target vector.
*
* @tparam skip_nan_ Whether to check for (and skip) NaNs.
* @tparam Output_ Type of the output data.
* @tparam Index_ Type of the row/column indices.
*/
template<bool skip_nan_ = false, typename Output_ = double, typename Index_ = int>
struct RunningDense {
/**
* @param num Number of target vectors, i.e., n.
* @param[out] sum Pointer to an output array of length `num`.
* This should be zeroed on input.
* Once `finish()` is called, this array will contain the sums for each target vector.
*/
RunningDense(Index_ num, Output_* sum) : num(num), sum(sum), error(num) {}

/**
* Add the next observed vector to the sum calculation.
* @tparam Value_ Type of the input data.
* @param[in] ptr Pointer to an array of values of length `num`, corresponding to an observed vector.
*/
template<typename Value_>
void add(const Value_* ptr) {
for (Index_ i = 0; i < num; ++i) {
sum::add<skip_nan_>(sum[i], error[i], ptr[i]);
}
}

/**
* Finish the compensated sum calculation once all observed vectors have been passed to `add()`.
* Sums are stored in the `sum` array passed to the `RunningDense` constructor.
*/
void finish() {
for (Index_ i = 0; i < num; ++i) {
sum[i] += error[i];
}
}

private:
Index_ num;
Output_* sum;
std::vector<Output_> error;
};
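As a rough sketch of the running protocol described above, assuming each observed vector arrives as a dense array of length `num`:

#include "tatami_stats/sums.hpp"
#include <vector>

int main() {
    // Three target vectors (n = 3), fed two observed vectors (m = 2).
    std::vector<double> out(3); // zero-initialized, as required
    tatami_stats::sum::RunningDense<false, double, int> runner(3, out.data());

    std::vector<double> observed1 { 1.0, 2.0, 3.0 }; // P1[j] holds the first element of Vj
    std::vector<double> observed2 { 0.5, 0.5, 0.5 }; // P2[j] holds the second element of Vj
    runner.add(observed1.data());
    runner.add(observed2.data());
    runner.finish(); // out is now { 1.5, 2.5, 3.5 }
    return 0;
}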

/**
* @brief Running sums from sparse data.
*
* Compute running sums from sparse data using Neumaier summation.
* This is the sparse counterpart of `RunningDense`, performing the same calculation on sparse observed vectors.
*
* @tparam skip_nan_ Whether to check for (and skip) NaNs.
* @tparam Output_ Type of the output data.
* @tparam Index_ Type of the row/column indices.
*/
template<bool skip_nan_ = false, typename Output_ = double, typename Index_ = int>
struct RunningSparse {
/**
* @param num Number of target vectors.
* @param[out] sum Pointer to an output array of length `num`.
* This should be zeroed on input.
* Once `finish()` is called, this array will contain the sums for each target vector.
* @param subtract Offset to subtract from each element of `index` before using it to index into `sum`.
* Only relevant if `sum` holds the sums for a contiguous subset of target vectors,
* e.g., during task allocation for parallelization.
*/
RunningSparse(Index_ num, Output_* sum, Index_ subtract = 0) : num(num), sum(sum), error(num), subtract(subtract) {}

/**
* Add the next observed vector to the sum.
* @tparam Value_ Type of the input value.
* @param[in] value Pointer to an array of values of the structural non-zero elements.
* @param[in] index Pointer to an array of indices of the structural non-zero elements.
* @param number Number of structural non-zero elements, i.e., the length of `value` and `index`.
*/
template<typename Value_>
void add(const Value_* value, const Index_* index, Index_ number) {
for (Index_ i = 0; i < number; ++i) {
auto idx = index[i] - subtract;
sum::add<skip_nan_>(sum[idx], error[idx], value[i]);
}
}

/**
* Finish the compensated sum calculation once all observed vectors have been passed to `add()`.
* Sums are stored in the `sum` array passed to the `RunningSparse` constructor.
*/
void finish() {
for (Index_ i = 0; i < num; ++i) {
sum[i] += error[i];
}
}

private:
Index_ num;
Output_* sum;
std::vector<Output_> error;
Index_ subtract;
};
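And a similar sketch for the sparse case, showing how the `subtract` offset maps global indices into a worker's local output block (the values here are arbitrary):

#include "tatami_stats/sums.hpp"
#include <vector>

int main() {
    // This worker handles target vectors 5, 6 and 7, so its output block has
    // length 3 and global indices are shifted down by 5 before use.
    std::vector<double> out(3);
    tatami_stats::sum::RunningSparse<false, double, int> runner(3, out.data(), /* subtract = */ 5);

    std::vector<double> values { 2.0, -1.0 };
    std::vector<int> indices { 5, 7 }; // global indices of the structural non-zeros
    runner.add(values.data(), indices.data(), 2);
    runner.finish(); // out is now { 2.0, 0.0, -1.0 }
    return 0;
}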

}

/**
* Compute sums for each element of a chosen dimension of a `tatami::Matrix`,
* using Neumaier's method for numerical stability.
*
* @internal
* We don't use pairwise summation as this is tricky to implement consistently for sparse data.
* Namely, should the recursion be done on the full dimension extent, or just the non-zero counts?
* We can't use the latter because the running sum doesn't know the number of non-zeros per target vector.
* But if we use the full dimension extent, the direct sum needs to check the index of each element,
* which involves an additional look-up and comparison.
*
* Neumaier summation is slower but much simpler, and we get stable sums in return.
* We can also better handle variable numbers of NaNs.
* @endinternal
*
* @tparam skip_nan_ Whether to check for (and skip) NaNs.
* @tparam Value_ Type of the matrix value, should be numeric.
* @tparam Index_ Type of the row/column indices.
* @tparam Output_ Type of the output value.
*
* @param row Whether to compute sums for the rows.
* @param p Pointer to a `tatami::Matrix`.
* @param[out] output Pointer to an array of length equal to the number of rows (if `row = true`) or columns (otherwise).
* On output, this will contain the row/column sums.
* @param threads Number of threads to use.
*/
template<bool skip_nan_, typename Value_, typename Index_, typename Output_>
void sums(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads) {
auto dim = (row ? p->nrow() : p->ncol());
auto otherdim = (row ? p->ncol() : p->nrow());
const bool direct = p->prefer_rows() == row;

if (p->sparse()) {
if (direct) {
tatami::Options opt;
opt.sparse_extract_index = false;

tatami::parallelize([&](size_t, Index_ s, Index_ l) {
-auto ext = tatami::consecutive_extractor<true>(p, row_, s, l, opt);
auto ext = tatami::consecutive_extractor<true>(p, row, s, l, opt);
std::vector<Value_> vbuffer(otherdim);
for (Index_ x = 0; x < l; ++x) {
auto out = ext->fetch(vbuffer.data(), NULL);
-output[x + s] = std::accumulate(out.value, out.value + out.number, static_cast<Output_>(0));
output[x + s] = sum::compute<skip_nan_, Output_>(out.value, out.number);
}
}, dim, threads);

} else {
tatami::Options opt;
opt.sparse_ordered_index = false;
std::fill(output, output + dim, static_cast<Output_>(0));

tatami::parallelize([&](size_t, Index_ s, Index_ l) {
-auto ext = tatami::consecutive_extractor<true>(p, !row_, 0, otherdim, s, l);
auto ext = tatami::consecutive_extractor<true>(p, !row, 0, otherdim, s, l, opt);
std::vector<Value_> vbuffer(l);
std::vector<Index_> ibuffer(l);
sum::RunningSparse<skip_nan_, Output_, Index_> runner(l, output + s, s);

for (Index_ x = 0; x < otherdim; ++x) {
auto out = ext->fetch(vbuffer.data(), ibuffer.data());
-for (Index_ j = 0; j < out.number; ++j) {
-output[out.index[j]] += out.value[j];
-}
runner.add(out.value, out.index, out.number);
}
}, dim, threads);
}

} else {
if (direct) {
tatami::parallelize([&](size_t, Index_ s, Index_ l) {
-auto ext = tatami::consecutive_extractor<false>(p, row_, s, l);
auto ext = tatami::consecutive_extractor<false>(p, row, s, l);
std::vector<Value_> buffer(otherdim);
for (Index_ x = 0; x < l; ++x) {
auto out = ext->fetch(buffer.data());
-output[x + s] = std::accumulate(out, out + otherdim, static_cast<Output_>(0));
output[x + s] = sum::compute<skip_nan_, Output_>(out, otherdim);
}
}, dim, threads);

} else {
std::fill(output, output + dim, static_cast<Output_>(0));

tatami::parallelize([&](size_t, Index_ s, Index_ l) {
-auto ext = tatami::consecutive_extractor<false>(p, !row_, 0, otherdim, s, l);
auto ext = tatami::consecutive_extractor<false>(p, !row, 0, otherdim, s, l);
std::vector<Value_> buffer(l);
sum::RunningDense<skip_nan_, Output_, Index_> runner(l, output + s);

for (Index_ x = 0; x < otherdim; ++x) {
auto out = ext->fetch(buffer.data());
-for (Index_ j = 0; j < l; ++j) {
-output[s + j] += out[j];
-}
runner.add(out);
}
}, dim, threads);
}
@@ -85,11 +277,6 @@ void dimension_sums(const tatami::Matrix<Value_, Index_>* p, Output_* output, in
return;
}

-}
-/**
- * @endcond
- */

/**
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
@@ -100,9 +287,9 @@ void dimension_sums(const tatami::Matrix<Value_, Index_>* p, Output_* output, in
* On output, this will store the sum of values for each column.
* @param threads Number of threads to use.
*/
template<typename Value_, typename Index_, typename Output_>
template<bool skip_nan_ = false, typename Value_, typename Index_, typename Output_>
void column_sums(const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads = 1) {
sum_internal::dimension_sums<false>(p, output, threads);
sums<skip_nan_>(false, p, output, threads);
return;
}

@@ -116,10 +303,10 @@ void column_sums(const tatami::Matrix<Value_, Index_>* p, Output_* output, int t
*
* @return A vector of length equal to the number of columns, containing the column sums.
*/
template<typename Output_ = double, typename Value_, typename Index_>
template<bool skip_nan_ = false, typename Output_ = double, typename Value_, typename Index_>
std::vector<Output_> column_sums(const tatami::Matrix<Value_, Index_>* p, int threads = 1) {
std::vector<Output_> output(p->ncol());
tatami_stats::column_sums(p, output.data(), threads);
tatami_stats::column_sums<skip_nan_>(p, output.data(), threads);
return output;
}

@@ -133,9 +320,9 @@ std::vector<Output_> column_sums(const tatami::Matrix<Value_, Index_>* p, int th
* On output, this will contain the row sums.
* @param threads Number of threads to use.
*/
template<typename Output_ = double, typename Value_, typename Index_>
template<bool skip_nan_ = false, typename Output_ = double, typename Value_, typename Index_>
void row_sums(const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads = 1) {
sum_internal::dimension_sums<true>(p, output, threads);
sums<skip_nan_>(true, p, output, threads);
return;
}

@@ -149,10 +336,10 @@ void row_sums(const tatami::Matrix<Value_, Index_>* p, Output_* output, int thre
*
* @return A vector of length equal to the number of rows, containing the row sums.
*/
template<typename Output_ = double, typename Value_, typename Index_>
template<bool skip_nan_ = false, typename Output_ = double, typename Value_, typename Index_>
std::vector<Output_> row_sums(const tatami::Matrix<Value_, Index_>* p, int threads = 1) {
std::vector<Output_> output(p->nrow());
tatami_stats::row_sums(p, output.data(), threads);
tatami_stats::row_sums<skip_nan_>(p, output.data(), threads);
return output;
}

3 changes: 2 additions & 1 deletion tests/src/grouped_sums.cpp
@@ -10,6 +10,7 @@
#include "tatami_stats/grouped_sums.hpp"
#include "tatami_stats/sums.hpp"
#include "tatami_test/tatami_test.hpp"
#include "utils.h"

TEST(GroupedSums, ByRow) {
size_t NR = 99, NC = 155;
@@ -37,7 +38,7 @@ TEST(GroupedSums, ByRow) {
expected[g + r * ngroup] = per_group[r];
}
}
-EXPECT_EQ(expected, rref);
compare_double_vectors(expected, rref);

EXPECT_EQ(rref, tatami_stats::row_sums_by_group(dense_column.get(), cgroups.data()));
EXPECT_EQ(rref, tatami_stats::row_sums_by_group(sparse_row.get(), cgroups.data()));
