From 042fb16fac0569251ddbd929c6d6a6557cf83711 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Thu, 12 Sep 2024 09:01:33 -0700 Subject: [PATCH] remove threads that visit same exon multiple times --- src/transcriptome.cpp | 96 +++++++++++++++++++++++++++++-------------- 1 file changed, 66 insertions(+), 30 deletions(-) diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp index 58658a4d15c..7fb73623602 100644 --- a/src/transcriptome.cpp +++ b/src/transcriptome.cpp @@ -1628,14 +1628,18 @@ list Transcriptome::project_transcript_gbwt(const Transcri assert(haplotype_index.bidirectional()); + // the return value list edited_transcript_paths; + // boundary nodes for the exons (we save them for later while walking out the haplotypes) vector > exon_node_ids; exon_node_ids.reserve(cur_transcript.exons.size()); + // exon paths for each exon and the threads that follow them, which we will use in the phase of construcint the edited transcript pathsw vector, thread_ids_t> > haplotypes; + // for each haplotype, the index of its exons in the haplotypes vector and the next expected exon (only used for constructing haplotypes) multimap > haplotype_id_index; - + for (size_t exon_idx = 0; exon_idx < cur_transcript.exons.size(); ++exon_idx) { const Exon & cur_exon = cur_transcript.exons.at(exon_idx); @@ -1650,38 +1654,33 @@ list Transcriptome::project_transcript_gbwt(const Transcri // first position in downstream intron). auto exon_haplotypes = get_exon_haplotypes(exon_node_ids.back().first, exon_node_ids.back().second, haplotype_index, reference_samples, expected_length); - + if (haplotypes.empty()) { - for (auto & exon_haplotype: exon_haplotypes) { haplotypes.emplace_back(vector(1, exon_haplotype.first), exon_haplotype.second); haplotypes.back().first.reserve(cur_transcript.exons.size()); for (auto & haplotype_id: exon_haplotype.second) { - haplotype_id_index.emplace(haplotype_id, make_pair(haplotypes.size() - 1, exon_idx + 1)); } } - - } else { + } else { for (auto & exon_haplotype: exon_haplotypes) { assert(!exon_haplotype.first.empty()); spp::sparse_hash_map extended_haplotypes; for (auto & haplotype_id: exon_haplotype.second) { - + auto haplotype_id_index_it_range = haplotype_id_index.equal_range(haplotype_id); auto haplotype_id_index_it = haplotype_id_index_it_range.first; while (haplotype_id_index_it != haplotype_id_index_it_range.second) { - if (exon_idx != haplotype_id_index_it->second.second) { assert(haplotype_id_index_it->second.second < exon_idx); - haplotype_id_index_it = haplotype_id_index.erase(haplotype_id_index_it); continue; } @@ -1690,36 +1689,36 @@ list Transcriptome::project_transcript_gbwt(const Transcri pair, thread_ids_t> * cur_haplotype = &haplotypes.at(haplotype_id_index_it->second.first); if (extended_haplotypes.find(haplotype_id_index_it->second.first) != extended_haplotypes.end()) { - + // we have already started to extend the haplotypes from this haplotype's previous exon haplotype set assert(cur_haplotype->first.size() == exon_idx + 1); haplotypes.at(extended_haplotypes.at(haplotype_id_index_it->second.first)).second.emplace_back(haplotype_id); - haplotype_id_index_it->second.first = extended_haplotypes.at(haplotype_id_index_it->second.first); - + haplotype_id_index_it->second.first = extended_haplotypes.at(haplotype_id_index_it->second.first); } else if (cur_haplotype->first.size() == exon_idx) { - + // the next expected exon is this one, so we extend it by this exon and mark the extension as conly containing this thread so far cur_haplotype->first.emplace_back(exon_haplotype.first); cur_haplotype->second = {haplotype_id}; assert(extended_haplotypes.emplace(haplotype_id_index_it->second.first, haplotype_id_index_it->second.first).second); - - } else if (cur_haplotype->first.size() == exon_idx + 1) { + } else if (cur_haplotype->first.size() == exon_idx + 1) { + // the next expected exon is the following one, so we must have already added this one, so we must need divide this + // haplotype group into two haplotypes.emplace_back(vector(cur_haplotype->first.begin(), cur_haplotype->first.end() - 1), thread_ids_t(1, haplotype_id)); haplotypes.back().first.emplace_back(exon_haplotype.first); assert(extended_haplotypes.emplace(haplotype_id_index_it->second.first, haplotypes.size() - 1).second); - haplotype_id_index_it->second.first = haplotypes.size() - 1; - - } else { + haplotype_id_index_it->second.first = haplotypes.size() - 1; + } else { + // we must have missed an exon, so we have to erase this haplotype from the results haplotype_id_index_it = haplotype_id_index.erase(haplotype_id_index_it); continue; - } + } ++haplotype_id_index_it; } } } - } + } } for (auto & haplotype: haplotypes) { @@ -1853,7 +1852,7 @@ list Transcriptome::project_transcript_gbwt(const Transcri vector > Transcriptome::get_exon_haplotypes(const vg::id_t start_node, const vg::id_t end_node, const gbwt::GBWT & haplotype_index, const unordered_set& reference_samples, const int32_t expected_length) const { assert(expected_length > 0); - + // Calculate the expected upperbound of the length between the two // nodes (number of nodes). const int32_t expected_length_upperbound = 1.1 * expected_length; @@ -1888,7 +1887,7 @@ vector > Transcriptome::get_exon_haplotypes(con // Stop current extension if end node is reached. if (gbwt::Node::id(cur_exon_haplotype.first.back()) == end_node) { - + exon_haplotypes.emplace_back(cur_exon_haplotype.first, haplotype_index.locate(cur_exon_haplotype.second)); assert(exon_haplotypes.back().second.size() <= cur_exon_haplotype.second.size()); @@ -1928,7 +1927,6 @@ vector > Transcriptome::get_exon_haplotypes(con } if (!has_relevant_haplotype) { - exon_haplotype_queue.pop(); continue; } @@ -1938,13 +1936,12 @@ vector > Transcriptome::get_exon_haplotypes(con // End current extension if no outgoing edges exist. if (out_edges.empty()) { - exon_haplotype_queue.pop(); continue; } auto out_edges_it = out_edges.begin(); - ++out_edges_it; + ++out_edges_it; // skip the first edge (we will follow it in-place on the queue after the following loop) while (out_edges_it != out_edges.end()) { @@ -1952,9 +1949,8 @@ vector > Transcriptome::get_exon_haplotypes(con if (out_edges_it->first != gbwt::ENDMARKER) { auto extended_search = haplotype_index.extend(cur_exon_haplotype.second, out_edges_it->first); - // Add new extension to queue if not empty (haplotypes found). - if (!extended_search.empty()) { + if (!extended_search.empty()) { exon_haplotype_queue.push(make_pair(cur_exon_haplotype.first, extended_search)); exon_haplotype_queue.back().first.emplace_back(out_edges_it->first); @@ -1966,18 +1962,58 @@ vector > Transcriptome::get_exon_haplotypes(con // Do not extend haplotypes that end within the exon. if (out_edges.begin()->first != gbwt::ENDMARKER) { - cur_exon_haplotype.first.emplace_back(out_edges.begin()->first); cur_exon_haplotype.second = haplotype_index.extend(cur_exon_haplotype.second, out_edges.begin()->first); // End current extension if empty (no haplotypes found). - if (cur_exon_haplotype.second.empty()) { exon_haplotype_queue.pop(); } + if (cur_exon_haplotype.second.empty()) { + exon_haplotype_queue.pop(); } } else { - exon_haplotype_queue.pop(); } } + + // remove haplotypes that visit this exon more than once + // FIXME: we should instead use the R-index to get reachability information in addition to thread IDs + { + unordered_map> seen_position; + bool found_duplicate = false; + for (size_t i = 0; i < exon_haplotypes.size(); ++i) { + + auto& exon_haplotype_set = exon_haplotypes[i]; + + for (size_t j = 0; j < exon_haplotype_set.second.size(); ++j) { + auto it = seen_position.find(exon_haplotype_set.second[j]); + if (it != seen_position.end()) { + // mark this and the previous instance of this haplotype for removal + exon_haplotypes[it->second.first].second[it->second.second] = -1; + exon_haplotype_set.second[j] = -1; + found_duplicate = true; + } + else { + // there is no previous instance of this haplotype + seen_position[exon_haplotype_set.second[j]] = make_pair(i, j); + } + } + } + + if (found_duplicate) { + // remove any duplicate thread IDs + for (auto& exon_haplotype_set : exon_haplotypes) { + auto new_end = remove_if(exon_haplotype_set.second.begin(), exon_haplotype_set.second.end(), + [](gbwt::size_type thread_id) { + return thread_id == (gbwt::size_type) -1; + }); + exon_haplotype_set.second.resize(new_end - exon_haplotype_set.second.begin()); + } + // remove any haplotype sets that are now empty + auto new_end = remove_if(exon_haplotypes.begin(), exon_haplotypes.end(), [](const pair& a) { + return a.second.empty(); + }); + exon_haplotypes.resize(new_end - exon_haplotypes.begin()); + } + } return exon_haplotypes; }