Skip to content

Commit

Permalink
Merge remote-tracking branch 'xian/lr-giraffe' into lr-giraffe
Browse files Browse the repository at this point in the history
  • Loading branch information
adamnovak committed May 23, 2024
2 parents d3326dd + 9f40a4e commit 74c0b4d
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 24 deletions.
4 changes: 4 additions & 0 deletions src/minimizer_mapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3506,6 +3506,10 @@ std::vector<MinimizerMapper::Seed> MinimizerMapper::find_seeds(const std::vector
? 0
: aln.sequence().size() / this->minimizer_downsampling_window_count;

// Cap the window length at minimizer_downsampling_max_window_length
minimizer_downsampling_window_size = std::min(minimizer_downsampling_window_size,
this->minimizer_downsampling_max_window_length);

if (minimizer_downsampling_window_size != 0) {
for (auto& kv : minimizers_in_read_order_by_length) {
auto& length = kv.first;
Expand Down
5 changes: 4 additions & 1 deletion src/minimizer_mapper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,13 @@ class MinimizerMapper : public AlignerClient {
static constexpr double default_minimizer_score_fraction = 0.9;
double minimizer_score_fraction = default_minimizer_score_fraction;

/// Window size for minimizer downsampling
/// Window count for minimizer downsampling
static constexpr size_t default_minimizer_downsampling_window_count = 0;
size_t minimizer_downsampling_window_count = default_minimizer_downsampling_window_count;

static constexpr size_t default_minimizer_downsampling_max_window_length = std::numeric_limits<size_t>::max();
size_t minimizer_downsampling_max_window_length = default_minimizer_downsampling_max_window_length;

/// Maximum number of distinct minimizers to take
static constexpr size_t default_max_unique_min = 500;
size_t max_unique_min = default_max_unique_min;
Expand Down
67 changes: 44 additions & 23 deletions src/subcommand/giraffe_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,10 +184,16 @@ static std::unique_ptr<GroupedOptionGroup> get_options() {
"use maximum of number minimizers calculated by READ_LENGTH / INT and --max-min"
);
comp_opts.add_range(
"downsample-min",
"downsample-window-count",
&MinimizerMapper::minimizer_downsampling_max_window_length,
MinimizerMapper::default_minimizer_downsampling_max_window_length,
"downsample minimizers with windows of length read_length/INT, 0 for no downsampling"
);
comp_opts.add_range(
"downsample-window-length",
&MinimizerMapper::minimizer_downsampling_window_count,
MinimizerMapper::default_minimizer_downsampling_window_count,
"downsample minimizers with windows of length read_length/INT, 0 for no downsampling"
"maximum window length for downsampling"
);
comp_opts.add_range(
"distance-limit", 'D',
Expand Down Expand Up @@ -839,7 +845,8 @@ int main_giraffe(int argc, char** argv) {
// Use downsampling instead of max unique minimizer count
.add_entry<size_t>("max-min", 0)
.add_entry<size_t>("num-bp-per-min", 1000)
.add_entry<size_t>("downsample-min", 125)
.add_entry<size_t>("downsample-window-count", 125)
.add_entry<size_t>("downsample-window-length", 120)
// Don't use the hit-cap||score-fraction filter because it doesn't do anything after downsampling
.add_entry<size_t>("hit-cap", 0)
.add_entry<double>("score-fraction", 1.0)
Expand Down Expand Up @@ -890,7 +897,8 @@ int main_giraffe(int argc, char** argv) {
// Use downsampling instead of max unique minimizer count
.add_entry<size_t>("max-min", 100)
.add_entry<size_t>("num-bp-per-min", 500)
.add_entry<size_t>("downsample-min", 500)
.add_entry<size_t>("downsample-window-count", 500)
.add_entry<size_t>("downsample-window-length", 20)
// Don't use the hit-cap||score-fraction filter because it doesn't do anything after downsampling
.add_entry<size_t>("hit-cap", 0)
.add_entry<double>("score-fraction", 1.0)
Expand Down Expand Up @@ -945,48 +953,61 @@ int main_giraffe(int argc, char** argv) {
.add_entry<bool>("explored-cap", true)
// Cap minimizers at a number we won't reach.
.add_entry<size_t>("max-min", 500)
.add_entry<size_t>("num-bp-per-min", 500)
// Don't downsample
.add_entry<size_t>("downsample-min", 0)
.add_entry<size_t>("downsample-window-count", 0)
.add_entry<size_t>("downsample-window-length", std::numeric_limits<size_t>::max())
// Use the hit-cap||score-fraction filter
.add_entry<size_t>("hit-cap", 10)
.add_entry<size_t>("hit-cap", 15)
.add_entry<double>("score-fraction", 0.9)
.add_entry<size_t>("hard-hit-cap", 500) // Default: 500
.add_entry<size_t>("hard-hit-cap", 1000) // Default: 500
// Grab the best trees
.add_entry<size_t>("min-to-fragment", 2)
.add_entry<size_t>("max-to-fragment", 800)
.add_entry<double>("zipcode-tree-score-threshold", 50)
.add_entry<double>("pad-zipcode-tree-score-threshold", 20)
.add_entry<double>("zipcode-tree-coverage-threshold", 0.3)
.add_entry<size_t>("min-to-fragment", 4)
.add_entry<size_t>("max-to-fragment", 1000)
.add_entry<double>("zipcode-tree-scale", 0.75)
.add_entry<double>("zipcode-tree-score-threshold", 20)
.add_entry<double>("pad-zipcode-tree-score-threshold", 50)
.add_entry<double>("zipcode-tree-coverage-threshold", 0.5)
// And extend them
.add_entry<size_t>("gapless-extension-limit", std::numeric_limits<size_t>::max())
// Allowing a lot of mismatches because we chop later
.add_entry<size_t>("max-extension-mismatches", 10)
.add_entry<size_t>("max-extension-mismatches", 15)
// And fragment them
.add_entry<double>("fragment-gap-scale", 4.0)
.add_entry<double>("gap-scale", 4.0)
.add_entry<double>("fragment-gap-scale", 5.0)
.add_entry<double>("gap-scale", 5.0)
.add_entry<size_t>("fragment-max-lookback-bases", 275)
.add_entry<double>("fragment-max-lookback-bases-per-base", 0)
.add_entry<size_t>("fragment-max-indel-bases", 2500)
.add_entry<double>("fragment-max-indel-bases-per-base", 0)
// And take those to chains
.add_entry<double>("fragment-score-fraction", 0.7)
.add_entry<double>("fragment-min-score", 0)
.add_entry<double>("fragment-score-fraction", 0.5)
.add_entry<double>("fragment-min-score", 20)
.add_entry<double>("fragment-set-score-threshold", std::numeric_limits<double>::max())
.add_entry<int>("min-chaining-problems", 5)
.add_entry<int>("min-chaining-problems", 10)
.add_entry<int>("max-chaining-problems", std::numeric_limits<int>::max())
.add_entry<size_t>("max-lookback-bases", 3000)
.add_entry<double>("max-lookback-bases-per-base", 0)
.add_entry<size_t>("max-indel-bases", 2000)
.add_entry<double>("max-indel-bases-per-base", 0)
.add_entry<double>("chain-score-threshold", 100.0)
.add_entry<double>("min-chain-score-per-base", 0.01)
.add_entry<int>("max-min-chain-score", 200.0)
.add_entry<int>("item-bonus", 0)
.add_entry<int>("item-scale", 1.0)
.add_entry<int>("min-chains", 3)
.add_entry<size_t>("max-chains-per-tree", 5)
.add_entry<size_t>("max-alignments", 5)
.add_entry<size_t>("max-alignments", 4)
// Don't use the WFAExtender to connect anchors because it can take tenths of seconds sometimes.
.add_entry<size_t>("max-chain-connection", 0)
.add_entry<size_t>("max-tail-gap", 100)
.add_entry<double>("mapq-score-scale", 1.0);
.add_entry<size_t>("max-chain-connection", 85)
.add_entry<size_t>("max-tail-gap", 115)
.add_entry<double>("mapq-score-scale", 1.5);
presets["srold"]
.add_entry<bool>("align-from-chains", true)
.add_entry<bool>("explored-cap", false)
// Use downsampling instead of max unique minimizer count
.add_entry<size_t>("max-min", 0)
.add_entry<size_t>("downsample-min", 100)
.add_entry<size_t>("downsample-window-count", 100)
.add_entry<size_t>("downsample-window-length", std::numeric_limits<size_t>::max())
// Don't use the hit-cap||score-fraction filter because it doesn't do anything after downsampling
.add_entry<size_t>("hit-cap", 0)
.add_entry<double>("score-fraction", 1.0)
Expand Down

1 comment on commit 74c0b4d

@adamnovak
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch lr-giraffe. View the full report here.

0 tests passed, 16 tests failed and 0 tests skipped in 526 seconds

Failed tests:

  • test_sim_chr21_snp1kg (32 seconds)
  • test_full_brca2_cactus (34 seconds)
  • test_full_brca2_primary (29 seconds)
  • test_full_brca2_snp1kg (30 seconds)
  • test_map_brca1_cactus (30 seconds)
  • test_map_brca1_primary (29 seconds)
  • test_map_brca1_snp1kg (30 seconds)
  • test_map_brca1_snp1kg_mpmap (30 seconds)
  • test_map_mhc_primary (30 seconds)
  • test_map_mhc_snp1kg (31 seconds)
  • test_sim_mhc_cactus (25 seconds)
  • test_sim_mhc_snp1kg (28 seconds)
  • test_sim_mhc_snp1kg_mpmap (25 seconds)
  • test_call_chr21_snp1kg (29 seconds)
  • test_sim_chr21_snp1kg_trained (41 seconds)
  • test_sim_yeast_cactus (52 seconds)

Please sign in to comment.