Skip to content

Commit

Permalink
Merge pull request #157 from eseiler/misc/estimate
Browse files Browse the repository at this point in the history
[MISC] More robust estimation
  • Loading branch information
smehringer authored Oct 9, 2024
2 parents 3772205 + f78ba23 commit 9e1394b
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 23 deletions.
29 changes: 14 additions & 15 deletions src/estimate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ void check_ibf(min_arguments const & args,
IBFType const & ibf,
std::vector<uint16_t> & estimations_i,
seqan3::dna4_vector const & seq,
std::vector<uint32_t> & prev_counts,
std::vector<float> & prev_counts,
exp_t const & expressions,
uint16_t const level,
std::vector<double> const fprs,
Expand All @@ -40,7 +40,7 @@ void check_ibf(min_arguments const & args,
static constexpr bool multiple_expressions = std::same_as<exp_t, std::vector<std::vector<uint16_t>>>;

// Count minimisers in ibf of current level
std::vector<uint32_t> counter(ibf.bin_count());
std::vector<float> counter(ibf.bin_count());
uint64_t minimiser_count{};
auto agent = ibf.membership_agent();
for (auto minHash : seqan3::views::minimiser_hash(seq, args.shape, args.w_size, args.s))
Expand All @@ -50,7 +50,7 @@ void check_ibf(min_arguments const & args,
}

// Defines where the median should be
double const minimiser_pos = minimiser_count / 2.0;
float const minimiser_pos = minimiser_count / 2.0;

// Check every experiment by going over the number of bins in the ibf.
for (size_t bin = 0; bin < counter.size(); ++bin)
Expand All @@ -59,14 +59,13 @@ void check_ibf(min_arguments const & args,
continue;

// Correction by substracting the expected number of false positives
double const expected_false_positives = minimiser_count * fprs[bin];
double const corrected_count = counter[bin] - expected_false_positives;
double const normalized_count = corrected_count / (1.0 - fprs[bin]);

// TODO: Rounding?
// `normalized_count` could be ceiled
// `estimate` further down could be ceiled
counter[bin] = normalized_count < 0.0 ? 0u : normalized_count;
counter[bin] = [&]()
{
float const expected_false_positives = minimiser_count * fprs[bin];
float const corrected_count = counter[bin] - expected_false_positives;
float const normalized_count = std::max(0.0, corrected_count / (1.0 - fprs[bin]));
return normalized_count;
}();

// Check if considering previously seen minimisers and minimisers found at current level equal to or are greater
// than the minimiser_pow, which gives the median position.
Expand All @@ -86,8 +85,8 @@ void check_ibf(min_arguments const & args,
assert(minimiser_pos > prev_counts[bin]);
// Should never fail. This would mean that prev_counts[bin] was enough by itself and we should already
// have estimated on the previous level.
assert(counter[bin] > 0u);
double const normalized_minimiser_pos = (minimiser_pos - prev_counts[bin]) / counter[bin];
assert(counter[bin] > 0.0);
float const normalized_minimiser_pos = (minimiser_pos - prev_counts[bin]) / counter[bin];

// Actually calculate estimation, in the else case level stands for the prev_expression
if constexpr (multiple_expressions)
Expand Down Expand Up @@ -139,9 +138,9 @@ void estimate(estimate_ibf_arguments & args,
{
std::vector<std::string> ids;
std::vector<seqan3::dna4_vector> seqs;
std::vector<uint32_t> counter;
std::vector<float> counter;
std::vector<uint16_t> counter_est;
std::vector<std::vector<uint32_t>> prev_counts;
std::vector<std::vector<float>> prev_counts;
uint64_t prev_expression;
std::vector<std::vector<uint16_t>> estimations;
std::vector<std::vector<uint16_t>> expressions;
Expand Down
19 changes: 13 additions & 6 deletions src/ibf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -813,6 +813,13 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
// ^^ why divide? --> estimate the number of minimisers
}

auto divide_and_ceil = [](uint64_t const dividend, uint64_t const divisor) -> uint64_t
{
assert(dividend >= 1u || divisor >= 1u);
assert(divisor != 0u);
return (dividend + divisor - 1u) / divisor;
};

// If set_expression_thresholds_samplewise is not set the expressions as determined by the first file are used for
// all files.
if constexpr (samplewise)
Expand All @@ -821,9 +828,9 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
for (int c = 0; c < ibf_args.number_expression_thresholds - 1; c++)
{
diff = diff * 2;
sizes[i].push_back(filesize / diff);
sizes[i].push_back(divide_and_ceil(filesize, diff));
}
sizes[i].push_back(filesize / diff);
sizes[i].push_back(divide_and_ceil(filesize, diff));
}
else if constexpr (minimiser_files_given)
{
Expand All @@ -839,10 +846,10 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
float diff{1};
for (int c = 0; c < ibf_args.number_expression_thresholds - 1; c++)
{
diff = ibf_args.expression_thresholds[c + 1] / ibf_args.expression_thresholds[c];
sizes[i].push_back(filesize / diff);
diff = ibf_args.expression_thresholds[c + 1] / static_cast<float>(ibf_args.expression_thresholds[c]);
sizes[i].push_back(std::ceil(filesize / diff));
}
sizes[i].push_back(filesize / diff);
sizes[i].push_back(std::ceil(filesize / diff));
}
}

Expand All @@ -857,7 +864,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
size += sizes[i][j];
}

if (size == 0u)
if (size == num_files)
{
throw std::invalid_argument{
std::string("[Error]. The chosen expression threshold is not well picked. If you use the automatic ")
Expand Down
4 changes: 2 additions & 2 deletions test/api/estimate_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ TEST_F(estimate_test, example)

std::ifstream output_file("Single_expression.out");
std::string line;
std::string expected{"GeneA\t9\t32\t"};
std::string expected{"GeneA\t10\t32\t"};
if (output_file.is_open())
{
while (std::getline(output_file, line))
Expand Down Expand Up @@ -287,7 +287,7 @@ TEST_F(estimate_test, example_multiple_threads)

std::ifstream output_file("Multiple_expression.out");
std::string line;
std::string expected{"GeneA\t9\t32\t"};
std::string expected{"GeneA\t10\t32\t"};
if (output_file.is_open())
{
while (std::getline(output_file, line))
Expand Down

0 comments on commit 9e1394b

Please sign in to comment.