Skip to content

Commit

Permalink
Merge pull request #3538 from broadinstitute/hail-backend-selected-transcript
Browse files: browse the repository at this point in the history

Hail backend selected transcript
  • Loading branch information
hanars authored Aug 9, 2023
2 parents fd385a2 + 63cd81b commit 0c77c97
Show file tree
Hide file tree
Showing 18 changed files with 74 additions and 10 deletions.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.109-b71b065e4bb6
Created at 2023/07/26 13:13:09
Created at 2023/08/04 12:47:37
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
42 changes: 39 additions & 3 deletions hail_search/hail_search_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,11 @@ def _filter_annotated_table(self, gene_ids=None, rs_ids=None, frequencies=None,

def _filter_by_gene_ids(self, gene_ids):
    """Keep only the rows of ``self._ht`` that match one of ``gene_ids``.

    The row predicate is supplied by ``self._get_gene_id_filter`` so that each
    query class decides how a row is matched to a gene, instead of this
    method hard-coding a lookup on ``sorted_transcript_consequences``.

    :param gene_ids: Python collection of gene IDs; converted to a Hail set
        before being handed to the hook.
    """
    gene_ids = hl.set(gene_ids)
    # Build the predicate in its own statement BEFORE touching self._ht.filter:
    # the hook is allowed to rebind self._ht (e.g. to add an annotation), and
    # the filter must run against that rebound table, not the previous one.
    gene_id_filter = self._get_gene_id_filter(gene_ids)
    self._ht = self._ht.filter(gene_id_filter)

def _get_gene_id_filter(self, gene_ids):
raise NotImplementedError

def _filter_rs_ids(self, rs_ids):
rs_id_set = hl.set(rs_ids)
Expand Down Expand Up @@ -602,6 +606,9 @@ class VariantHailTableQuery(BaseHailTableQuery):
'alt': lambda r: r.alleles[1],
'genotypeFilters': lambda r: hl.str(' ,').join(r.filters),
'mainTranscriptId': lambda r: r.sorted_transcript_consequences.first().transcript_id,
'selectedMainTranscriptId': lambda r: hl.or_missing(
r.selected_transcript != r.sorted_transcript_consequences.first(), r.selected_transcript.transcript_id,
),
}
BASE_ANNOTATION_FIELDS.update(BaseHailTableQuery.BASE_ANNOTATION_FIELDS)
ENUM_ANNOTATION_FIELDS = {
Expand All @@ -618,18 +625,43 @@ class VariantHailTableQuery(BaseHailTableQuery):
},
}

@staticmethod
def _selected_main_transcript_expr(ht):
    """Build the expression for the transcript to report as "selected" per row.

    Preference order, depending on which optional row fields exist on ``ht``:

    1. ``gene_transcripts`` and ``allowed_transcripts`` both present: the first
       gene transcript whose ``transcript_id`` also appears among the allowed
       transcripts, falling back to the first gene transcript.
    2. only ``gene_transcripts`` present: its first element.
    3. only ``allowed_transcripts`` present: its first element.
    4. neither present: the first of ``sorted_transcript_consequences``.

    Whatever is chosen, a missing value falls back to the first of
    ``sorted_transcript_consequences``.

    :param ht: table (or row-like object) exposing ``row`` and the fields above.
    :return: Hail expression for the selected transcript.
    """
    fallback = ht.sorted_transcript_consequences.first()
    allowed_present = 'allowed_transcripts' in ht.row
    gene_present = 'gene_transcripts' in ht.row

    if not gene_present:
        # No gene filter was applied: use the allowed list if there is one.
        candidate = ht.allowed_transcripts.first() if allowed_present else fallback
        return hl.or_else(candidate, fallback)

    candidate = ht.gene_transcripts.first()
    if allowed_present:
        allowed_ids = hl.set(ht.allowed_transcripts.map(lambda tx: tx.transcript_id))
        # Prefer a transcript satisfying both filters; otherwise keep the
        # first gene-matched transcript chosen above.
        candidate = hl.or_else(
            ht.gene_transcripts.find(lambda tx: allowed_ids.contains(tx.transcript_id)),
            candidate,
        )

    return hl.or_else(candidate, fallback)

def import_filtered_table(self, *args, **kwargs):
    """Run the parent import once, then re-key the table by variant ID.

    All positional and keyword arguments are forwarded unchanged to the
    parent implementation. Afterwards ``self._ht`` is keyed by
    ``VARIANT_KEY_FIELD``, populated from the table's ``variant_id`` field.
    """
    # Single parent call: invoking it through both the legacy
    # super(VariantHailTableQuery, self) form and super() would import twice.
    super().import_filtered_table(*args, **kwargs)
    self._ht = self._ht.key_by(**{VARIANT_KEY_FIELD: self._ht.variant_id})

def _format_transcript_args(self):
    """Return the parent's transcript-formatting arguments with variant overrides.

    Adds/overrides two entries on the dict returned by the parent:

    * ``annotate_value``: callable producing ``major_consequence`` from the
      first element of a transcript's ``consequence_terms``; any extra
      positional arguments are ignored.
    * ``drop_fields``: ``['consequence_terms']``.

    :return: the updated arguments dict.
    """
    # One parent call only; the extra-args placeholder is named ``_`` so it
    # does not shadow the dict being built.
    transcript_args = super()._format_transcript_args()
    transcript_args.update({
        'annotate_value': lambda transcript, *_: {'major_consequence': transcript.consequence_terms.first()},
        'drop_fields': ['consequence_terms'],
    })
    return transcript_args

def _get_gene_id_filter(self, gene_ids):
    """Annotate gene-matched transcripts and return the matching-row predicate.

    Side effect: rebinds ``self._ht`` to a table carrying a new
    ``gene_transcripts`` field, holding the entries of
    ``sorted_transcript_consequences`` whose ``gene_id`` is in ``gene_ids``.

    :param gene_ids: Hail set expression of gene IDs.
    :return: boolean expression, true where ``gene_transcripts`` has a first element.
    """
    def in_requested_gene(transcript):
        return gene_ids.contains(transcript.gene_id)

    matching_transcripts = self._ht.sorted_transcript_consequences.filter(in_requested_gene)
    self._ht = self._ht.annotate(gene_transcripts=matching_transcripts)
    # Read the field back from the re-annotated table so the returned
    # expression belongs to the table the caller will filter.
    return hl.is_defined(self._ht.gene_transcripts.first())

def _annotate_allowed_consequences(self, annotations, annotation_filters):
allowed_consequences = {
ann for field, anns in annotations.items()
Expand Down Expand Up @@ -686,6 +718,10 @@ def _has_terms_range_expr(self, terms, field, subfield, range_configs):
value = self._ht[field][f'{subfield}_id']
return hl.any(lambda r: (value >= r[0]) & (value <= r[1]), ranges)

def _format_results(self, ht):
    """Attach ``selected_transcript`` to ``ht`` and delegate to the parent formatter.

    :param ht: table to format.
    :return: whatever the parent ``_format_results`` returns for the annotated table.
    """
    selected = self._selected_main_transcript_expr(ht)
    annotated = ht.annotate(selected_transcript=selected)
    return super()._format_results(annotated)


QUERY_CLASS_MAP = {
VARIANT_DATASET: VariantHailTableQuery,
Expand Down
25 changes: 21 additions & 4 deletions hail_search/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
},
'transcripts': {},
'mainTranscriptId': None,
'selectedMainTranscriptId': None,
'_sort': [1000010146],
}

Expand All @@ -70,6 +71,8 @@
MULTI_FAMILY_VARIANT['familyGuids'] += FAMILY_3_VARIANT['familyGuids']
MULTI_FAMILY_VARIANT['genotypes'].update(FAMILY_3_VARIANT['genotypes'])

SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT = {**MULTI_FAMILY_VARIANT, 'selectedMainTranscriptId': 'ENST00000497611'}

PROJECT_2_VARIANT1 = deepcopy(VARIANT1)
PROJECT_2_VARIANT1['familyGuids'] = ['F000011_11']
PROJECT_2_VARIANT1['genotypes'] = {
Expand Down Expand Up @@ -193,7 +196,7 @@ async def test_location_search(self):
)

await self._assert_expected_search(
[MULTI_FAMILY_VARIANT], omit_sample_type='SV_WES',
[SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT], omit_sample_type='SV_WES',
intervals=LOCATION_SEARCH['intervals'][-1:], gene_ids=LOCATION_SEARCH['gene_ids'][:1]
)

Expand Down Expand Up @@ -249,16 +252,17 @@ async def test_annotations_filter(self):
await self._assert_expected_search([VARIANT2], pathogenicity={'hgmd': ['hgmd_other']}, omit_sample_type='SV_WES')

pathogenicity = {'clinvar': ['likely_pathogenic', 'vus_or_conflicting', 'benign']}
await self._assert_expected_search([VARIANT2], pathogenicity=pathogenicity, omit_sample_type='SV_WES')
await self._assert_expected_search([VARIANT1, VARIANT2], pathogenicity=pathogenicity, omit_sample_type='SV_WES')

pathogenicity['clinvar'] = pathogenicity['clinvar'][:1]
await self._assert_expected_search(
[VARIANT2, VARIANT4], pathogenicity=pathogenicity, annotations={'SCREEN': ['CTCF-only', 'DNase-only']},
[VARIANT1, VARIANT4], pathogenicity=pathogenicity, annotations={'SCREEN': ['CTCF-only', 'DNase-only']},
omit_sample_type='SV_WES',
)

annotations = {'missense': ['missense_variant'], 'in_frame': ['inframe_insertion', 'inframe_deletion'], 'frameshift': None}
await self._assert_expected_search(
[VARIANT2, VARIANT4], pathogenicity=pathogenicity, annotations=annotations, omit_sample_type='SV_WES',
[VARIANT1, VARIANT2, VARIANT4], pathogenicity=pathogenicity, annotations=annotations, omit_sample_type='SV_WES',
)

await self._assert_expected_search([VARIANT2, VARIANT4], annotations=annotations, omit_sample_type='SV_WES')
Expand All @@ -268,6 +272,19 @@ async def test_annotations_filter(self):
[VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], annotations=annotations, omit_sample_type='SV_WES',
)

selected_transcript_variant_2 = {**VARIANT2, 'selectedMainTranscriptId': 'ENST00000641759'}
selected_transcript_variant_3 = {**MULTI_FAMILY_VARIANT, 'selectedMainTranscriptId': 'ENST00000426137'}
annotations = {'other': ['non_coding_transcript_exon_variant']}
await self._assert_expected_search(
[VARIANT1, selected_transcript_variant_2, selected_transcript_variant_3],
pathogenicity=pathogenicity, annotations=annotations, omit_sample_type='SV_WES',
)

await self._assert_expected_search(
[selected_transcript_variant_2, SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT],
gene_ids=LOCATION_SEARCH['gene_ids'][:1], annotations=annotations, omit_sample_type='SV_WES',
)

async def test_in_silico_filter(self):
in_silico = {'eigen': '5.5', 'mut_taster': 'P'}
await self._assert_expected_search(
Expand Down
15 changes: 13 additions & 2 deletions hail_search/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,14 @@
},
},
'genotypeFilters': '',
'clinvar': None,
'clinvar': {
'alleleId': 19473,
'conflictingPathogenicities': None,
'goldStars': None,
'pathogenicity': 'Likely_pathogenic',
'assertions': None,
'version': '2023-07-10',
},
'hgmd': None,
'screenRegionType': None,
'populations': {
Expand Down Expand Up @@ -110,6 +117,7 @@
},
'transcripts': {},
'mainTranscriptId': None,
'selectedMainTranscriptId': None,
'_sort': [1000010439],
}
VARIANT2 = {
Expand Down Expand Up @@ -194,6 +202,7 @@
],
},
'mainTranscriptId': 'ENST00000376585',
'selectedMainTranscriptId': None,
'_sort': [1011794419],
}
VARIANT3 = {
Expand Down Expand Up @@ -254,13 +263,14 @@
'ENSG00000097046': [
{'aminoAcids': None, 'canonical': 1, 'codons': None, 'geneId': 'ENSG00000097046', 'hgvsc': 'ENST00000428239.5:c.115+890G>A', 'hgvsp': None, 'transcriptId': 'ENST00000428239', 'isLofNagnag': None, 'transcriptRank': 0, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'intron_variant'},
{'aminoAcids': None, 'canonical': None, 'codons': None, 'geneId': 'ENSG00000097046', 'hgvsc': 'ENST00000234626.10:c.115+890G>A', 'hgvsp': None, 'transcriptId': 'ENST00000234626', 'isLofNagnag': None, 'transcriptRank': 1, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'intron_variant'},
{'aminoAcids': None, 'canonical': None, 'codons': None, 'geneId': 'ENSG00000097046', 'hgvsc': 'ENST00000426137.1:c.115+890G>A', 'hgvsp': None, 'transcriptId': 'ENST00000426137', 'isLofNagnag': None, 'transcriptRank': 2, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'intron_variant'},
{'aminoAcids': None, 'canonical': None, 'codons': None, 'geneId': 'ENSG00000097046', 'hgvsc': 'ENST00000426137.1:c.115+890G>A', 'hgvsp': None, 'transcriptId': 'ENST00000426137', 'isLofNagnag': None, 'transcriptRank': 2, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'non_coding_transcript_exon_variant'},
],
'ENSG00000177000': [
{'aminoAcids': None, 'canonical': None, 'codons': None, 'geneId': 'ENSG00000177000', 'hgvsc': 'ENST00000497611.1:n.501+890G>A', 'hgvsp': None, 'transcriptId': 'ENST00000497611', 'isLofNagnag': None, 'transcriptRank': 3, 'biotype': 'processed_transcript', 'lofFilters': None, 'majorConsequence': 'intron_variant'},
],
},
'mainTranscriptId': 'ENST00000428239',
'selectedMainTranscriptId': None,
'_sort': [1091502721],
}
VARIANT4 = {
Expand Down Expand Up @@ -325,6 +335,7 @@
],
},
'mainTranscriptId': 'ENST00000428239',
'selectedMainTranscriptId': None,
'_sort': [1091511686],
}

Expand Down

0 comments on commit 0c77c97

Please sign in to comment.