Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hotfix for transcript projection onto haplotypes with tandem duplications in vg rna #4397

Merged
merged 1 commit into from
Sep 13, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 66 additions & 30 deletions src/transcriptome.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1628,14 +1628,18 @@ list<EditedTranscriptPath> Transcriptome::project_transcript_gbwt(const Transcri

assert(haplotype_index.bidirectional());

// the return value
list<EditedTranscriptPath> edited_transcript_paths;

// boundary nodes for the exons (we save them for later while walking out the haplotypes)
vector<pair<vg::id_t, vg::id_t> > exon_node_ids;
exon_node_ids.reserve(cur_transcript.exons.size());

// exon paths for each exon and the threads that follow them, which we will use in the phase of construcint the edited transcript pathsw
vector<pair<vector<exon_nodes_t>, thread_ids_t> > haplotypes;
// for each haplotype, the index of its exons in the haplotypes vector and the next expected exon (only used for constructing haplotypes)
multimap<int32_t, pair<int32_t, int32_t> > haplotype_id_index;

for (size_t exon_idx = 0; exon_idx < cur_transcript.exons.size(); ++exon_idx) {

const Exon & cur_exon = cur_transcript.exons.at(exon_idx);
Expand All @@ -1650,38 +1654,33 @@ list<EditedTranscriptPath> Transcriptome::project_transcript_gbwt(const Transcri
// first position in downstream intron).
auto exon_haplotypes = get_exon_haplotypes(exon_node_ids.back().first, exon_node_ids.back().second, haplotype_index,
reference_samples, expected_length);

if (haplotypes.empty()) {

for (auto & exon_haplotype: exon_haplotypes) {

haplotypes.emplace_back(vector<exon_nodes_t>(1, exon_haplotype.first), exon_haplotype.second);
haplotypes.back().first.reserve(cur_transcript.exons.size());

for (auto & haplotype_id: exon_haplotype.second) {

haplotype_id_index.emplace(haplotype_id, make_pair(haplotypes.size() - 1, exon_idx + 1));
}
}

} else {

} else {
for (auto & exon_haplotype: exon_haplotypes) {

assert(!exon_haplotype.first.empty());
spp::sparse_hash_map<int32_t, uint32_t> extended_haplotypes;

for (auto & haplotype_id: exon_haplotype.second) {

auto haplotype_id_index_it_range = haplotype_id_index.equal_range(haplotype_id);
auto haplotype_id_index_it = haplotype_id_index_it_range.first;

while (haplotype_id_index_it != haplotype_id_index_it_range.second) {

if (exon_idx != haplotype_id_index_it->second.second) {

assert(haplotype_id_index_it->second.second < exon_idx);

haplotype_id_index_it = haplotype_id_index.erase(haplotype_id_index_it);
continue;
}
Expand All @@ -1690,36 +1689,36 @@ list<EditedTranscriptPath> Transcriptome::project_transcript_gbwt(const Transcri
pair<vector<exon_nodes_t>, thread_ids_t> * cur_haplotype = &haplotypes.at(haplotype_id_index_it->second.first);

if (extended_haplotypes.find(haplotype_id_index_it->second.first) != extended_haplotypes.end()) {

// we have already started to extend the haplotypes from this haplotype's previous exon haplotype set
assert(cur_haplotype->first.size() == exon_idx + 1);
haplotypes.at(extended_haplotypes.at(haplotype_id_index_it->second.first)).second.emplace_back(haplotype_id);
haplotype_id_index_it->second.first = extended_haplotypes.at(haplotype_id_index_it->second.first);

haplotype_id_index_it->second.first = extended_haplotypes.at(haplotype_id_index_it->second.first);
} else if (cur_haplotype->first.size() == exon_idx) {

// the next expected exon is this one, so we extend it by this exon and mark the extension as conly containing this thread so far
cur_haplotype->first.emplace_back(exon_haplotype.first);
cur_haplotype->second = {haplotype_id};
assert(extended_haplotypes.emplace(haplotype_id_index_it->second.first, haplotype_id_index_it->second.first).second);

} else if (cur_haplotype->first.size() == exon_idx + 1) {

} else if (cur_haplotype->first.size() == exon_idx + 1) {
// the next expected exon is the following one, so we must have already added this one, so we must need divide this
// haplotype group into two
haplotypes.emplace_back(vector<exon_nodes_t>(cur_haplotype->first.begin(), cur_haplotype->first.end() - 1), thread_ids_t(1, haplotype_id));
haplotypes.back().first.emplace_back(exon_haplotype.first);

assert(extended_haplotypes.emplace(haplotype_id_index_it->second.first, haplotypes.size() - 1).second);
haplotype_id_index_it->second.first = haplotypes.size() - 1;

} else {
haplotype_id_index_it->second.first = haplotypes.size() - 1;

} else {
// we must have missed an exon, so we have to erase this haplotype from the results
haplotype_id_index_it = haplotype_id_index.erase(haplotype_id_index_it);
continue;
}
}

++haplotype_id_index_it;
}
}
}
}
}
}

for (auto & haplotype: haplotypes) {
Expand Down Expand Up @@ -1853,7 +1852,7 @@ list<EditedTranscriptPath> Transcriptome::project_transcript_gbwt(const Transcri
vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(const vg::id_t start_node, const vg::id_t end_node, const gbwt::GBWT & haplotype_index, const unordered_set<string>& reference_samples, const int32_t expected_length) const {

assert(expected_length > 0);

// Calculate the expected upperbound of the length between the two
// nodes (number of nodes).
const int32_t expected_length_upperbound = 1.1 * expected_length;
Expand Down Expand Up @@ -1888,7 +1887,7 @@ vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(con

// Stop current extension if end node is reached.
if (gbwt::Node::id(cur_exon_haplotype.first.back()) == end_node) {

exon_haplotypes.emplace_back(cur_exon_haplotype.first, haplotype_index.locate(cur_exon_haplotype.second));
assert(exon_haplotypes.back().second.size() <= cur_exon_haplotype.second.size());

Expand Down Expand Up @@ -1928,7 +1927,6 @@ vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(con
}

if (!has_relevant_haplotype) {

exon_haplotype_queue.pop();
continue;
}
Expand All @@ -1938,23 +1936,21 @@ vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(con

// End current extension if no outgoing edges exist.
if (out_edges.empty()) {

exon_haplotype_queue.pop();
continue;
}

auto out_edges_it = out_edges.begin();
++out_edges_it;
++out_edges_it; // skip the first edge (we will follow it in-place on the queue after the following loop)

while (out_edges_it != out_edges.end()) {

// Do not extend haplotypes that end within the exon.
if (out_edges_it->first != gbwt::ENDMARKER) {

auto extended_search = haplotype_index.extend(cur_exon_haplotype.second, out_edges_it->first);

// Add new extension to queue if not empty (haplotypes found).
if (!extended_search.empty()) {
if (!extended_search.empty()) {

exon_haplotype_queue.push(make_pair(cur_exon_haplotype.first, extended_search));
exon_haplotype_queue.back().first.emplace_back(out_edges_it->first);
Expand All @@ -1966,18 +1962,58 @@ vector<pair<exon_nodes_t, thread_ids_t> > Transcriptome::get_exon_haplotypes(con

// Do not extend haplotypes that end within the exon.
if (out_edges.begin()->first != gbwt::ENDMARKER) {

cur_exon_haplotype.first.emplace_back(out_edges.begin()->first);
cur_exon_haplotype.second = haplotype_index.extend(cur_exon_haplotype.second, out_edges.begin()->first);

// End current extension if empty (no haplotypes found).
if (cur_exon_haplotype.second.empty()) { exon_haplotype_queue.pop(); }
if (cur_exon_haplotype.second.empty()) {
exon_haplotype_queue.pop(); }

} else {

exon_haplotype_queue.pop();
}
}

// remove haplotypes that visit this exon more than once
// FIXME: we should instead use the R-index to get reachability information in addition to thread IDs
{
unordered_map<gbwt::size_type, pair<size_t, size_t>> seen_position;
bool found_duplicate = false;
for (size_t i = 0; i < exon_haplotypes.size(); ++i) {

auto& exon_haplotype_set = exon_haplotypes[i];

for (size_t j = 0; j < exon_haplotype_set.second.size(); ++j) {
auto it = seen_position.find(exon_haplotype_set.second[j]);
if (it != seen_position.end()) {
// mark this and the previous instance of this haplotype for removal
exon_haplotypes[it->second.first].second[it->second.second] = -1;
exon_haplotype_set.second[j] = -1;
found_duplicate = true;
}
else {
// there is no previous instance of this haplotype
seen_position[exon_haplotype_set.second[j]] = make_pair(i, j);
}
}
}

if (found_duplicate) {
// remove any duplicate thread IDs
for (auto& exon_haplotype_set : exon_haplotypes) {
auto new_end = remove_if(exon_haplotype_set.second.begin(), exon_haplotype_set.second.end(),
[](gbwt::size_type thread_id) {
return thread_id == (gbwt::size_type) -1;
});
exon_haplotype_set.second.resize(new_end - exon_haplotype_set.second.begin());
}
// remove any haplotype sets that are now empty
auto new_end = remove_if(exon_haplotypes.begin(), exon_haplotypes.end(), [](const pair<exon_nodes_t, thread_ids_t>& a) {
return a.second.empty();
});
exon_haplotypes.resize(new_end - exon_haplotypes.begin());
}
}

return exon_haplotypes;
}
Expand Down
Loading