Skip to content

Commit

Permalink
Overhauled organization of functions within each category.
Browse files Browse the repository at this point in the history
Each statistic now gets its own namespace with a common apply(), direct() and
Running* classes (if applicable). The row/column functions are now moved inside
this namespace and are called by_row and by_column. NaN skipping and the number
of threads are now passed via a separate statistic-specific Options class,
which avoids the need to decide on NaN handling at compile time.
  • Loading branch information
LTLA committed Mar 27, 2024
1 parent 111e912 commit f7c491c
Show file tree
Hide file tree
Showing 13 changed files with 977 additions and 1,042 deletions.
182 changes: 133 additions & 49 deletions include/tatami_stats/counts.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@

namespace tatami_stats {

/**
* @brief Functions for computing dimension-wise counts.
* @namespace tatami_stats::counts
*/
namespace counts {

/**
* Count the number of values in each dimension element that satisfy the `condition`.
*
Expand All @@ -28,12 +34,12 @@ namespace tatami_stats {
* @param p Pointer to a `tatami::Matrix`.
* @param[out] output Pointer to an array of length equal to the number of rows (if `row = true`) or columns (otherwise).
* On output, this will contain the number of values in each row/column that satisfy `condition`.
* @param threads Number of threads to use.
* @param num_threads Number of threads to use.
* @param condition Function that accepts a `Value_` and returns a boolean.
* If NaNs might be present in `p`, this should be handled by `condition`.
* This function is also responsible for handling any NaNs that might be present in `p`.
*/
template<typename Value_, typename Index_, typename Output_, class Condition_>
void counts(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads, Condition_ condition) {
void apply(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output, int num_threads, Condition_ condition) {
auto dim = (row ? p->nrow() : p->ncol());
auto otherdim = (row ? p->ncol() : p->nrow());
std::fill(output, output + dim, 0);
Expand All @@ -60,7 +66,7 @@ void counts(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output,
}
output[x + start] = target;
}
}, dim, threads);
}, dim, num_threads);

} else {
tatami::parallelize([&](int, Index_ start, Index_ len) -> void {
Expand All @@ -75,13 +81,13 @@ void counts(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output,
}
output[x + start] = target;
}
}, dim, threads);
}, dim, num_threads);
}

} else {
std::vector<Output_*> threaded_output_ptrs(threads, output);
std::vector<std::vector<Output_> > threaded_output(threads - 1);
for (int t = 1; t < threads; ++t) {
std::vector<Output_*> threaded_output_ptrs(num_threads, output);
std::vector<std::vector<Output_> > threaded_output(num_threads - 1);
for (int t = 1; t < num_threads; ++t) {
auto& curout = threaded_output[t - 1];
curout.resize(dim);
threaded_output_ptrs[t] = curout.data();
Expand Down Expand Up @@ -114,7 +120,7 @@ void counts(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output,
curoutput[d] += len - nonzeros[d];
}
}
}, otherdim, threads);
}, otherdim, num_threads);

} else {
tatami::parallelize([&](int t, Index_ start, Index_ len) -> void {
Expand All @@ -128,10 +134,10 @@ void counts(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output,
curoutput[j] += condition(ptr[j]);
}
}
}, otherdim, threads);
}, otherdim, num_threads);
}

for (int t = 1; t < threads; ++t) {
for (int t = 1; t < num_threads; ++t) {
auto curoutput = threaded_output_ptrs[t];
for (Index_ d = 0; d < dim; ++d) {
output[d] += curoutput[d];
Expand All @@ -140,141 +146,219 @@ void counts(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output,
}
}

/**
* @brief Functions for counting NaNs on each dimension.
* @namespace tatami_stats::counts::nan
*/
namespace nan {

/**
* @brief NaN-counting options.
*/
struct Options {
    /**
     * How many threads to use while collecting counts over a `tatami::Matrix`.
     * Defaults to a single thread.
     */
    int num_threads{1};
};

/**
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
* @tparam Output_ Type of the output value.
* This should be at least large enough to hold the dimensions of `p`.
*
* @param row Whether to obtain a count for each row.
* @param p Pointer to a `tatami::Matrix`.
* @param[out] output Pointer to an array of length equal to the number of rows (if `row = true`) or columns (otherwise).
* On output, this will store the number of NaNs in each row/column.
* @param threads Number of threads to use.
* @param nopt Counting options.
*/
template<typename Value_, typename Index_, typename Output_>
void row_nan_counts(const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads = 1) {
counts(true, p, output, threads, [](Value_ x) -> bool { return std::isnan(x); });
void apply(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output, const Options& nopt) {
counts::apply(row, p, output, nopt.num_threads, [](Value_ x) -> bool { return std::isnan(x); });
}

/**
* Wrapper around `apply()` for row NaN counts.
*
* @tparam Output_ Type of the output value.
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
*
* @param p Pointer to a `tatami::Matrix`.
* @param threads Number of threads to use.
* @param nopt Counting options.
*
* @return A vector of length equal to the number of rows, containing the number of NaNs in each row.
*/
template<typename Output_ = int, typename Value_, typename Index_>
std::vector<Output_> row_nan_counts(const tatami::Matrix<Value_, Index_>* p, int threads = 1) {
std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p, const Options& nopt) {
std::vector<Output_> output(p->nrow());
row_nan_counts(p, output.data(), threads);
apply(true, p, output.data(), nopt);
return output;
}

/**
* Overload with default options.
*
* @tparam Output_ Type of the output value.
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
* @tparam Output_ Type of the output value.
* This should be at least large enough to hold the dimensions of `p`.
*
* @param p Pointer to a `tatami::Matrix`.
* @param[out] output Pointer to an array of length equal to the number of columns.
* On output, this will store the number of NaNs in each column.
* @param threads Number of threads to use.
* @return A vector of length equal to the number of rows, containing the number of NaNs in each row.
*/
template<typename Value_, typename Index_, typename Output_>
void column_nan_counts(const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads = 1) {
counts(false, p, output, threads, [](Value_ x) -> bool { return std::isnan(x); });
template<typename Output_ = int, typename Value_, typename Index_>
std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p) {
    // Convenience overload: delegate to the two-argument by_row() with
    // default-constructed Options.
    // Output_ has to be forwarded explicitly. Without it the callee falls back
    // to its own default (int), and the resulting std::vector<int> cannot be
    // converted to std::vector<Output_> for any other requested Output_.
    return by_row<Output_>(p, Options());
}

/**
* Wrapper around `apply()` for column NaN counts.
*
* @tparam Output_ Type of the output value.
* This should be at least large enough to hold the dimensions of `p`.
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
*
* @param p Pointer to a `tatami::Matrix`.
* @param threads Number of threads to use.
* @param nopt Counting options.
*
* @return A vector of length equal to the number of columns, containing the number of NaNs in each column.
*/
template<typename Output_ = int, typename Value_, typename Index_>
std::vector<Output_> column_nan_counts(const tatami::Matrix<Value_, Index_>* p, int threads = 1) {
std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p, const Options& nopt) {
std::vector<Output_> output(p->ncol());
column_nan_counts(p, output.data(), threads);
apply(false, p, output.data(), nopt);
return output;
}

/**
 * Overload of `by_column()` that uses default-constructed `Options`.
 *
 * @tparam Output_ Type of the output value.
 * This should be at least large enough to hold the dimensions of `p`.
 * @tparam Value_ Type of the matrix value, should be summable.
 * @tparam Index_ Type of the row/column indices.
 *
 * @param p Pointer to a `tatami::Matrix`.
 *
 * @return A vector of length equal to the number of columns, containing the number of NaNs in each column.
 */
template<typename Output_ = int, typename Value_, typename Index_>
std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p) {
    // Output_ has to be forwarded explicitly. Without it the callee falls back
    // to its own default (int), and the resulting std::vector<int> cannot be
    // converted to std::vector<Output_> for any other requested Output_.
    return by_column<Output_>(p, Options());
}

}

/**
* @brief Functions for counting zeros on each dimension.
* @namespace tatami_stats::counts::zero
*/
namespace zero {

/**
* @brief Zero-counting options.
*/
struct Options {
    /**
     * How many threads to use while collecting counts over a `tatami::Matrix`.
     * Defaults to a single thread.
     */
    int num_threads{1};
};

/**
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
* @tparam Output_ Type of the output value.
* This should be at least large enough to hold the dimensions of `p`.
*
* @param row Whether to obtain a count for each row.
* @param p Pointer to a `tatami::Matrix`.
* @param[out] output Pointer to an array of length equal to the number of rows (if `row = true`) or columns (otherwise).
* On output, this will store the number of zeros in each row/column.
* @param threads Number of threads to use.
* @param zopt Counting options.
*/
template<typename Value_, typename Index_, typename Output_>
void row_zero_counts(const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads = 1) {
counts(true, p, output, threads, [](Value_ x) -> bool { return x == 0; });
void apply(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output, const Options& zopt) {
counts::apply(row, p, output, zopt.num_threads, [](Value_ x) -> bool { return x == 0; });
}

/**
* @tparam Output_ Type of the output value.
* This should be at least large enough to hold the dimensions of `p`.
* Wrapper around `apply()` for row-wise zero counts.
*
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
* @tparam Output_ Type of the output value.
* This should be at least large enough to hold the dimensions of `p`.
*
* @param p Pointer to a `tatami::Matrix`.
* @param threads Number of threads to use.
*
* @return A vector of length equal to the number of rows, containing the number of zeros in each row.
* @param zopt Counting options.
*/
template<typename Output_ = int, typename Value_, typename Index_>
std::vector<Output_> row_zero_counts(const tatami::Matrix<Value_, Index_>* p, int threads = 1) {
std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p, const Options& zopt) {
std::vector<Output_> output(p->nrow());
row_zero_counts(p, output.data(), threads);
apply(true, p, output.data(), zopt);
return output;
}

/**
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
* Overload with default options.
*
* @tparam Output_ Type of the output value.
* This should be at least large enough to hold the dimensions of `p`.
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
*
* @param p Pointer to a `tatami::Matrix`.
* @param[out] output Pointer to an array of length equal to the number of columns.
* On output, this will store the number of zeros in each column.
* @param threads Number of threads to use.
*
* @return A vector of length equal to the number of rows, containing the number of zeros in each row.
*/
template<typename Value_, typename Index_, typename Output_>
void column_zero_counts(const tatami::Matrix<Value_, Index_>* p, Output_* output, int threads = 1) {
counts(false, p, output, threads, [](Value_ x) -> bool { return x == 0; });
template<typename Output_ = int, typename Value_, typename Index_>
std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p) {
    // Convenience overload: delegate to the two-argument by_row() with
    // default-constructed Options.
    // Output_ has to be forwarded explicitly. Without it the callee falls back
    // to its own default (int), and the resulting std::vector<int> cannot be
    // converted to std::vector<Output_> for any other requested Output_.
    return by_row<Output_>(p, Options());
}

/**
* Wrapper around `apply()` for column-wise zero counts.
*
* @tparam Output_ Type of the output value.
* This should be at least large enough to hold the dimensions of `p`.
* @tparam Value_ Type of the matrix value, should be summable.
* @tparam Index_ Type of the row/column indices.
*
* @param p Pointer to a `tatami::Matrix`.
* @param threads Number of threads to use.
* @param zopt Counting options.
*
* @return A vector of length equal to the number of columns, containing the number of zeros in each column.
*/
template<typename Output_ = int, typename Value_, typename Index_>
std::vector<Output_> column_zero_counts(const tatami::Matrix<Value_, Index_>* p, int threads = 1) {
std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p, const Options& zopt) {
std::vector<Output_> output(p->ncol());
column_zero_counts(p, output.data(), threads);
apply(false, p, output.data(), zopt);
return output;
}

/**
 * Overload of `by_column()` that uses default-constructed `Options`.
 *
 * @tparam Output_ Type of the output value.
 * This should be at least large enough to hold the dimensions of `p`.
 * @tparam Value_ Type of the matrix value, should be summable.
 * @tparam Index_ Type of the row/column indices.
 *
 * @param p Pointer to a `tatami::Matrix`.
 *
 * @return A vector of length equal to the number of columns, containing the number of zeros in each column.
 */
template<typename Output_ = int, typename Value_, typename Index_>
std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p) {
    // Output_ has to be forwarded explicitly. Without it the callee falls back
    // to its own default (int), and the resulting std::vector<int> cannot be
    // converted to std::vector<Output_> for any other requested Output_.
    return by_column<Output_>(p, Options());
}

}

}

}

#endif
4 changes: 2 additions & 2 deletions include/tatami_stats/grouped_medians.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ void grouped_medians(const tatami::Matrix<Value_, Index_>* p, const Group_* grou

for (size_t g = 0; g < ngroups; ++g, ++curoutput) {
auto& w = workspace[g];
*curoutput = median::compute(w.data(), w.size(), static_cast<size_t>(group_sizes[g]));
*curoutput = medians::direct(w.data(), w.size(), static_cast<size_t>(group_sizes[g]), false);
w.clear();
}
}
Expand All @@ -65,7 +65,7 @@ void grouped_medians(const tatami::Matrix<Value_, Index_>* p, const Group_* grou
}

for (auto& w : workspace) {
*curoutput = median::compute(w.data(), w.size());
*curoutput = medians::direct(w.data(), w.size(), false);
++curoutput;
w.clear();
}
Expand Down
Loading

0 comments on commit f7c491c

Please sign in to comment.