Skip to content

Commit

Permalink
Merge pull request #4397 from vgteam/rna-tandem-dup-exon
Browse files Browse the repository at this point in the history
Hotfix for transcript projection onto haplotypes with tandem duplications in vg rna
  • Loading branch information
jeizenga authored Sep 13, 2024
2 parents 2220b91 + 042fb16 commit 4db75c2
Showing 1 changed file with 66 additions and 30 deletions.
96 changes: 66 additions & 30 deletions src/transcriptome.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1628,14 +1628,18 @@ list<EditedTranscriptPath> Transcriptome::project_transcript_gbwt(const Transcri

assert(haplotype_index.bidirectional());

// the return value
list<EditedTranscriptPath> edited_transcript_paths;

// boundary nodes for the exons (we save them for later while walking out the haplotypes)
vector<pair<vg::id_t, vg::id_t> > exon_node_ids;
exon_node_ids.reserve(cur_transcript.exons.size());

// exon paths for each exon and the threads that follow them, which we will use in the phase of construcint the edited transcript pathsw
vector<pair<vector<exon_nodes_t>, thread_ids_t> > haplotypes;
// for each haplotype, the index of its exons in the haplotypes vector and the next expected exon (only used for constructing haplotypes)
multimap<int32_t, pair<int32_t, int32_t> > haplotype_id_index;

for (size_t exon_idx = 0; exon_idx < cur_transcript.exons.size(); ++exon_idx) {

const Exon & cur_exon = cur_transcript.exons.at(exon_idx);
Expand All @@ -1650,38 +1654,33 @@ list<EditedTranscriptPath> Transcriptome::project_transcript_gbwt(const Transcri
// first position in downstream intron).
auto exon_haplotypes = get_exon_haplotypes(exon_node_ids.back().first, exon_node_ids.back().second, haplotype_index,
reference_samples, expected_length);

if (haplotypes.empty()) {

for (auto & exon_haplotype: exon_haplotypes) {

haplotypes.emplace_back(vector<exon_nodes_t>(1, exon_haplotype.first), exon_haplotype.second);
haplotypes.back().first.reserve(cur_transcript.exons.size());

for (auto & haplotype_id: exon_haplotype.second) {

haplotype_id_index.emplace(haplotype_id, make_pair(haplotypes.size() - 1, exon_idx + 1));
}
}

} else {

} else {
for (auto & exon_haplotype: exon_haplotypes) {

assert(!exon_haplotype.first.empty());
spp::sparse_hash_map<int32_t, uint32_t> extended_haplotypes;

for (auto & haplotype_id: exon_haplotype.second) {

auto haplotype_id_index_it_range = haplotype_id_index.equal_range(haplotype_id);
auto haplotype_id_index_it = haplotype_id_index_it_range.first;

while (haplotype_id_index_it != haplotype_id_index_it_range.second) {

if (exon_idx != haplotype_id_index_it->second.second) {

assert(haplotype_id_index_it->second.second < exon_idx);

haplotype_id_index_it = haplotype_id_index.erase(haplotype_id_index_it);
continue;
}
Expand All @@ -1690,36 +1689,36 @@ list<EditedTranscriptPath> Transcriptome::project_transcript_gbwt(const Transcri
pair<vector<exon_nodes_t>, thread_ids_t> * cur_haplotype = &haplotypes.at(haplotype_id_index_it->second.first);

if (extended_haplotypes.find(haplotype_id_index_it->second.first) != extended_haplotypes.end()) {

// we have already started to extend the haplotypes from this haplotype's previous exon haplotype set
assert(cur_haplotype->first.size() == exon_idx + 1);
haplotypes.at(extended_haplotypes.at(haplotype_id_index_it->second.first)).second.emplace_back(haplotype_id);
haplotype_id_index_it->second.first = extended_haplotypes.at(haplotype_id_index_it->second.first);

haplotype_id_index_it->second.first = extended_haplotypes.at(haplotype_id_index_it->second.first);
} else if (cur_haplotype->first.size() == exon_idx) {

// the next expected exon is this one, so we extend it by this exon and mark the extension as conly containing this thread so far
cur_haplotype->first.emplace_back(exon_haplotype.first);
cur_haplotype->second = {haplotype_id};
assert(extended_haplotypes.emplace(haplotype_id_index_it->second.first, haplotype_id_index_it->second.first).second);

} else if (cur_haplotype->first.size() == exon_idx + 1) {

} else if (cur_haplotype->first.size() == exon_idx + 1) {
// the next expected exon is the following one, so we must have already added this one, so we must need divide this
// haplotype group into two
haplotypes.emplace_back(vector<exon_nodes_t>(cur_haplotype->first.begin(), cur_haplotype->first.end() - 1), thread_ids_t(1, haplotype_id));
haplotypes.back().first.emplace_back(exon_haplotype.first);

assert(extended_haplotypes.emplace(haplotype_id_index_it->second.first, haplotypes.size() - 1).second);
haplotype_id_index_it->second.first = haplotypes.size() - 1;

} else {
haplotype_id_index_it->second.first = haplotypes.size() - 1;

} else {
// we must have missed an exon, so we have to erase this haplotype from the results
haplotype_id_index_it = haplotype_id_index.erase(haplotype_id_index_it);
continue;
}
}

++haplotype_id_index_it;
}
}
}
}
}
}

for (auto & haplotype: haplotypes) {
Expand Down Expand Up @@ -1853,7 +1852,7 @@ list<EditedTranscriptPath> Transcriptome::project_transcript_gbwt(const Transcri
vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(const vg::id_t start_node, const vg::id_t end_node, const gbwt::GBWT & haplotype_index, const unordered_set<string>& reference_samples, const int32_t expected_length) const {

assert(expected_length > 0);

// Calculate the expected upperbound of the length between the two
// nodes (number of nodes).
const int32_t expected_length_upperbound = 1.1 * expected_length;
Expand Down Expand Up @@ -1888,7 +1887,7 @@ vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(con

// Stop current extension if end node is reached.
if (gbwt::Node::id(cur_exon_haplotype.first.back()) == end_node) {

exon_haplotypes.emplace_back(cur_exon_haplotype.first, haplotype_index.locate(cur_exon_haplotype.second));
assert(exon_haplotypes.back().second.size() <= cur_exon_haplotype.second.size());

Expand Down Expand Up @@ -1928,7 +1927,6 @@ vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(con
}

if (!has_relevant_haplotype) {

exon_haplotype_queue.pop();
continue;
}
Expand All @@ -1938,23 +1936,21 @@ vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(con

// End current extension if no outgoing edges exist.
if (out_edges.empty()) {

exon_haplotype_queue.pop();
continue;
}

auto out_edges_it = out_edges.begin();
++out_edges_it;
++out_edges_it; // skip the first edge (we will follow it in-place on the queue after the following loop)

while (out_edges_it != out_edges.end()) {

// Do not extend haplotypes that end within the exon.
if (out_edges_it->first != gbwt::ENDMARKER) {

auto extended_search = haplotype_index.extend(cur_exon_haplotype.second, out_edges_it->first);

// Add new extension to queue if not empty (haplotypes found).
if (!extended_search.empty()) {
if (!extended_search.empty()) {

exon_haplotype_queue.push(make_pair(cur_exon_haplotype.first, extended_search));
exon_haplotype_queue.back().first.emplace_back(out_edges_it->first);
Expand All @@ -1966,18 +1962,58 @@ vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(con

// Do not extend haplotypes that end within the exon.
if (out_edges.begin()->first != gbwt::ENDMARKER) {

cur_exon_haplotype.first.emplace_back(out_edges.begin()->first);
cur_exon_haplotype.second = haplotype_index.extend(cur_exon_haplotype.second, out_edges.begin()->first);

// End current extension if empty (no haplotypes found).
if (cur_exon_haplotype.second.empty()) { exon_haplotype_queue.pop(); }
if (cur_exon_haplotype.second.empty()) {
exon_haplotype_queue.pop(); }

} else {

exon_haplotype_queue.pop();
}
}

// remove haplotypes that visit this exon more than once
// FIXME: we should instead use the R-index to get reachability information in addition to thread IDs
{
unordered_map<gbwt::size_type, pair<size_t, size_t>> seen_position;
bool found_duplicate = false;
for (size_t i = 0; i < exon_haplotypes.size(); ++i) {

auto& exon_haplotype_set = exon_haplotypes[i];

for (size_t j = 0; j < exon_haplotype_set.second.size(); ++j) {
auto it = seen_position.find(exon_haplotype_set.second[j]);
if (it != seen_position.end()) {
// mark this and the previous instance of this haplotype for removal
exon_haplotypes[it->second.first].second[it->second.second] = -1;
exon_haplotype_set.second[j] = -1;
found_duplicate = true;
}
else {
// there is no previous instance of this haplotype
seen_position[exon_haplotype_set.second[j]] = make_pair(i, j);
}
}
}

if (found_duplicate) {
// remove any duplicate thread IDs
for (auto& exon_haplotype_set : exon_haplotypes) {
auto new_end = remove_if(exon_haplotype_set.second.begin(), exon_haplotype_set.second.end(),
[](gbwt::size_type thread_id) {
return thread_id == (gbwt::size_type) -1;
});
exon_haplotype_set.second.resize(new_end - exon_haplotype_set.second.begin());
}
// remove any haplotype sets that are now empty
auto new_end = remove_if(exon_haplotypes.begin(), exon_haplotypes.end(), [](const pair<exon_nodes_t, thread_ids_t>& a) {
return a.second.empty();
});
exon_haplotypes.resize(new_end - exon_haplotypes.begin());
}
}

return exon_haplotypes;
}
Expand Down

1 comment on commit 4db75c2

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17323 seconds

Please sign in to comment.