From 042fb16fac0569251ddbd929c6d6a6557cf83711 Mon Sep 17 00:00:00 2001
From: Jordan Eizenga <joeizeng@gmail.com>
Date: Thu, 12 Sep 2024 09:01:33 -0700
Subject: [PATCH] remove threads that visit same exon multiple times

---
 src/transcriptome.cpp | 96 +++++++++++++++++++++++++++++--------------
 1 file changed, 66 insertions(+), 30 deletions(-)

diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp
index 58658a4d15c..7fb73623602 100644
--- a/src/transcriptome.cpp
+++ b/src/transcriptome.cpp
@@ -1628,14 +1628,18 @@ list<EditedTranscriptPath> Transcriptome::project_transcript_gbwt(const Transcri
     
     assert(haplotype_index.bidirectional());
 
+    // the return value
     list<EditedTranscriptPath> edited_transcript_paths;
 
+    // boundary nodes for the exons (we save them for later while walking out the haplotypes)
     vector<pair<vg::id_t, vg::id_t> > exon_node_ids;
     exon_node_ids.reserve(cur_transcript.exons.size());
 
+    // exon paths for each exon and the threads that follow them, which we will use in the phase of construcint the edited transcript pathsw
     vector<pair<vector<exon_nodes_t>, thread_ids_t> > haplotypes;
+    // for each haplotype, the index of its exons in the haplotypes vector and the next expected exon (only used for constructing haplotypes)
     multimap<int32_t, pair<int32_t, int32_t> > haplotype_id_index;
-
+    
     for (size_t exon_idx = 0; exon_idx < cur_transcript.exons.size(); ++exon_idx) {
 
         const Exon & cur_exon = cur_transcript.exons.at(exon_idx);
@@ -1650,38 +1654,33 @@ list<EditedTranscriptPath> Transcriptome::project_transcript_gbwt(const Transcri
         // first position in downstream intron).
         auto exon_haplotypes = get_exon_haplotypes(exon_node_ids.back().first, exon_node_ids.back().second, haplotype_index,
                                                    reference_samples, expected_length);
-
+        
         if (haplotypes.empty()) {
-
             for (auto & exon_haplotype: exon_haplotypes) {
 
                 haplotypes.emplace_back(vector<exon_nodes_t>(1, exon_haplotype.first), exon_haplotype.second);
                 haplotypes.back().first.reserve(cur_transcript.exons.size());
 
                 for (auto & haplotype_id: exon_haplotype.second) {
-
                     haplotype_id_index.emplace(haplotype_id, make_pair(haplotypes.size() - 1, exon_idx + 1));
                 }
             }
-            
-        } else {
 
+        } else {
             for (auto & exon_haplotype: exon_haplotypes) {
 
                 assert(!exon_haplotype.first.empty());
                 spp::sparse_hash_map<int32_t, uint32_t> extended_haplotypes;
 
                 for (auto & haplotype_id: exon_haplotype.second) {
-
+                    
                     auto haplotype_id_index_it_range = haplotype_id_index.equal_range(haplotype_id);
                     auto haplotype_id_index_it = haplotype_id_index_it_range.first;
 
                     while (haplotype_id_index_it != haplotype_id_index_it_range.second) {
-
                         if (exon_idx != haplotype_id_index_it->second.second) {
 
                             assert(haplotype_id_index_it->second.second < exon_idx);
-
                             haplotype_id_index_it = haplotype_id_index.erase(haplotype_id_index_it);
                             continue;
                         }
@@ -1690,36 +1689,36 @@ list<EditedTranscriptPath> Transcriptome::project_transcript_gbwt(const Transcri
                         pair<vector<exon_nodes_t>, thread_ids_t> * cur_haplotype = &haplotypes.at(haplotype_id_index_it->second.first);
 
                         if (extended_haplotypes.find(haplotype_id_index_it->second.first) != extended_haplotypes.end()) {
-
+                            // we have already started to extend the haplotypes from this haplotype's previous exon haplotype set
                             assert(cur_haplotype->first.size() == exon_idx + 1);
                             haplotypes.at(extended_haplotypes.at(haplotype_id_index_it->second.first)).second.emplace_back(haplotype_id);
-                            haplotype_id_index_it->second.first = extended_haplotypes.at(haplotype_id_index_it->second.first);                        
-                        
+                            haplotype_id_index_it->second.first = extended_haplotypes.at(haplotype_id_index_it->second.first);
                         } else if (cur_haplotype->first.size() == exon_idx) {
-
+                            // the next expected exon is this one, so we extend it by this exon and mark the extension as conly containing this thread so far
                             cur_haplotype->first.emplace_back(exon_haplotype.first);
                             cur_haplotype->second = {haplotype_id};
                             assert(extended_haplotypes.emplace(haplotype_id_index_it->second.first, haplotype_id_index_it->second.first).second);
-                        
-                        } else if (cur_haplotype->first.size() == exon_idx + 1) {
 
+                        } else if (cur_haplotype->first.size() == exon_idx + 1) {
+                            // the next expected exon is the following one, so we must have already added this one, so we must need divide this
+                            // haplotype group into two
                             haplotypes.emplace_back(vector<exon_nodes_t>(cur_haplotype->first.begin(), cur_haplotype->first.end() - 1), thread_ids_t(1, haplotype_id));
                             haplotypes.back().first.emplace_back(exon_haplotype.first);
 
                             assert(extended_haplotypes.emplace(haplotype_id_index_it->second.first, haplotypes.size() - 1).second);
-                            haplotype_id_index_it->second.first = haplotypes.size() - 1;                
-                        
-                        } else {
+                            haplotype_id_index_it->second.first = haplotypes.size() - 1;
 
+                        } else {
+                            // we must have missed an exon, so we have to erase this haplotype from the results
                             haplotype_id_index_it = haplotype_id_index.erase(haplotype_id_index_it);
                             continue;
-                        } 
+                        }
 
                         ++haplotype_id_index_it;
                     }
                 }
             }
-        }  
+        }
     }
 
     for (auto & haplotype: haplotypes) {
@@ -1853,7 +1852,7 @@ list<EditedTranscriptPath> Transcriptome::project_transcript_gbwt(const Transcri
 vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(const vg::id_t start_node, const vg::id_t end_node, const gbwt::GBWT & haplotype_index, const unordered_set<string>& reference_samples, const int32_t expected_length) const {
 
     assert(expected_length > 0);
-
+    
     // Calculate the expected upperbound of the length between the two 
     // nodes (number of nodes). 
     const int32_t expected_length_upperbound = 1.1 * expected_length;
@@ -1888,7 +1887,7 @@ vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(con
 
         // Stop current extension if end node is reached.
         if (gbwt::Node::id(cur_exon_haplotype.first.back()) == end_node) {
-
+            
             exon_haplotypes.emplace_back(cur_exon_haplotype.first, haplotype_index.locate(cur_exon_haplotype.second));
             assert(exon_haplotypes.back().second.size() <= cur_exon_haplotype.second.size());
             
@@ -1928,7 +1927,6 @@ vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(con
             }
 
             if (!has_relevant_haplotype) {
-
                 exon_haplotype_queue.pop();
                 continue;                
             }
@@ -1938,13 +1936,12 @@ vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(con
 
         // End current extension if no outgoing edges exist.
         if (out_edges.empty()) {
-
             exon_haplotype_queue.pop();
             continue;
         }
 
         auto out_edges_it = out_edges.begin(); 
-        ++out_edges_it;
+        ++out_edges_it; // skip the first edge (we will follow it in-place on the queue after the following loop)
 
         while (out_edges_it != out_edges.end()) {
 
@@ -1952,9 +1949,8 @@ vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(con
             if (out_edges_it->first != gbwt::ENDMARKER) {
 
                 auto extended_search = haplotype_index.extend(cur_exon_haplotype.second, out_edges_it->first);
-
                 // Add new extension to queue if not empty (haplotypes found).
-                if (!extended_search.empty()) { 
+                if (!extended_search.empty()) {
 
                     exon_haplotype_queue.push(make_pair(cur_exon_haplotype.first, extended_search));
                     exon_haplotype_queue.back().first.emplace_back(out_edges_it->first);
@@ -1966,18 +1962,58 @@ vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(con
 
         // Do not extend haplotypes that end within the exon.
         if (out_edges.begin()->first != gbwt::ENDMARKER) {
-
             cur_exon_haplotype.first.emplace_back(out_edges.begin()->first);
             cur_exon_haplotype.second = haplotype_index.extend(cur_exon_haplotype.second, out_edges.begin()->first);        
 
             // End current extension if empty (no haplotypes found). 
-            if (cur_exon_haplotype.second.empty()) { exon_haplotype_queue.pop(); }
+            if (cur_exon_haplotype.second.empty()) {
+                exon_haplotype_queue.pop(); }
     
         } else {
-
             exon_haplotype_queue.pop();
         }
     }
+    
+    // remove haplotypes that visit this exon more than once
+    // FIXME: we should instead use the R-index to get reachability information in addition to thread IDs
+    {
+        unordered_map<gbwt::size_type, pair<size_t, size_t>> seen_position;
+        bool found_duplicate = false;
+        for (size_t i = 0; i < exon_haplotypes.size(); ++i) {
+            
+            auto& exon_haplotype_set = exon_haplotypes[i];
+            
+            for (size_t j = 0; j < exon_haplotype_set.second.size(); ++j) {
+                auto it = seen_position.find(exon_haplotype_set.second[j]);
+                if (it != seen_position.end()) {
+                    // mark this and the previous instance of this haplotype for removal
+                    exon_haplotypes[it->second.first].second[it->second.second] = -1;
+                    exon_haplotype_set.second[j] = -1;
+                    found_duplicate = true;
+                }
+                else {
+                    // there is no previous instance of this haplotype
+                    seen_position[exon_haplotype_set.second[j]] = make_pair(i, j);
+                }
+            }
+        }
+        
+        if (found_duplicate) {
+            // remove any duplicate thread IDs
+            for (auto& exon_haplotype_set : exon_haplotypes) {
+                auto new_end = remove_if(exon_haplotype_set.second.begin(), exon_haplotype_set.second.end(),
+                                         [](gbwt::size_type thread_id) {
+                    return thread_id == (gbwt::size_type) -1;
+                });
+                exon_haplotype_set.second.resize(new_end - exon_haplotype_set.second.begin());
+            }
+            // remove any haplotype sets that are now empty
+            auto new_end = remove_if(exon_haplotypes.begin(), exon_haplotypes.end(), [](const pair<exon_nodes_t, thread_ids_t>& a) {
+                return a.second.empty();
+            });
+            exon_haplotypes.resize(new_end - exon_haplotypes.begin());
+        }
+    }
 
     return exon_haplotypes;
 }