diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.README.txt.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.README.txt.crc index 37305974fb..0410f9ffd4 100644 Binary files a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.README.txt.crc and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.README.txt.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.metadata.json.gz.crc index f6bc82dd1e..b9f66cdff8 100644 Binary files a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.metadata.json.gz.crc and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.metadata.json.gz.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/README.txt b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/README.txt index f0eed4785c..b006f802b1 100644 --- a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/README.txt +++ b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. Written with version 0.2.109-b71b065e4bb6 - Created at 2023/07/26 13:13:09 \ No newline at end of file + Created at 2023/08/04 12:47:37 \ No newline at end of file diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/.index.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/.index.crc deleted file mode 100644 index 2842a6945b..0000000000 Binary files a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/.index.crc and /dev/null differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/index b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/index deleted file mode 100644 index 34465e4594..0000000000 Binary files a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/index and /dev/null differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-bd787778-d7c2-4828-835f-ca93e860ce89.idx/.index.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-bd787778-d7c2-4828-835f-ca93e860ce89.idx/.index.crc new file mode 100644 index 0000000000..60183f50e1 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-bd787778-d7c2-4828-835f-ca93e860ce89.idx/.index.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-bd787778-d7c2-4828-835f-ca93e860ce89.idx/.metadata.json.gz.crc similarity index 100% rename from hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/.metadata.json.gz.crc rename to hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-bd787778-d7c2-4828-835f-ca93e860ce89.idx/.metadata.json.gz.crc diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-bd787778-d7c2-4828-835f-ca93e860ce89.idx/index b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-bd787778-d7c2-4828-835f-ca93e860ce89.idx/index new file mode 100644 index 0000000000..befb8d34e8 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-bd787778-d7c2-4828-835f-ca93e860ce89.idx/index differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/metadata.json.gz b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-bd787778-d7c2-4828-835f-ca93e860ce89.idx/metadata.json.gz similarity index 100% rename from hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/metadata.json.gz rename to hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-bd787778-d7c2-4828-835f-ca93e860ce89.idx/metadata.json.gz diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/metadata.json.gz b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/metadata.json.gz index 5ce75d799d..7014e2577e 100644 Binary files a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/metadata.json.gz and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/metadata.json.gz differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/.metadata.json.gz.crc index 5a38c8be75..4bfe301fc1 100644 Binary files a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/.metadata.json.gz.crc and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/.metadata.json.gz.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/metadata.json.gz b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/metadata.json.gz index b2dab47c9a..68f277de11 100644 Binary files a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/metadata.json.gz and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/metadata.json.gz differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/.part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/.part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.crc deleted file mode 100644 index ac1eed0fad..0000000000 Binary files a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/.part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.crc and /dev/null differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/.part-0-bd787778-d7c2-4828-835f-ca93e860ce89.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/.part-0-bd787778-d7c2-4828-835f-ca93e860ce89.crc new file mode 100644 index 0000000000..7f51fce1fc Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/.part-0-bd787778-d7c2-4828-835f-ca93e860ce89.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/part-0-bd787778-d7c2-4828-835f-ca93e860ce89 similarity index 52% rename from hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af rename to hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/part-0-bd787778-d7c2-4828-835f-ca93e860ce89 index f050aad17a..7384b9a0e8 100644 Binary files a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/part-0-bd787778-d7c2-4828-835f-ca93e860ce89 differ diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index 5aee829c0c..f96388931e 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -420,7 +420,11 @@ def _filter_annotated_table(self, gene_ids=None, rs_ids=None, frequencies=None, def _filter_by_gene_ids(self, gene_ids): gene_ids = hl.set(gene_ids) - self._ht = self._ht.filter(self._ht.sorted_transcript_consequences.any(lambda t: gene_ids.contains(t.gene_id))) + gene_id_filter = self._get_gene_id_filter(gene_ids) + self._ht = self._ht.filter(gene_id_filter) + + def _get_gene_id_filter(self, gene_ids): + raise NotImplementedError def _filter_rs_ids(self, rs_ids): rs_id_set = hl.set(rs_ids) @@ -602,6 +606,9 @@ class VariantHailTableQuery(BaseHailTableQuery): 'alt': lambda r: r.alleles[1], 'genotypeFilters': lambda r: hl.str(' ,').join(r.filters), 'mainTranscriptId': lambda r: r.sorted_transcript_consequences.first().transcript_id, + 'selectedMainTranscriptId': lambda r: hl.or_missing( + r.selected_transcript != r.sorted_transcript_consequences.first(), r.selected_transcript.transcript_id, + ), } BASE_ANNOTATION_FIELDS.update(BaseHailTableQuery.BASE_ANNOTATION_FIELDS) ENUM_ANNOTATION_FIELDS = { @@ -618,18 +625,43 @@ class VariantHailTableQuery(BaseHailTableQuery): }, } + @staticmethod + def _selected_main_transcript_expr(ht): + has_allowed_transcripts = 'allowed_transcripts' in ht.row + main_transcript = ht.sorted_transcript_consequences.first() + if 'gene_transcripts' in ht.row: + matched_transcript = ht.gene_transcripts.first() + if has_allowed_transcripts: + allowed_transcript_ids = hl.set(ht.allowed_transcripts.map(lambda t: t.transcript_id)) + matched_transcript = hl.or_else( + ht.gene_transcripts.find(lambda t: allowed_transcript_ids.contains(t.transcript_id)), + matched_transcript, + ) + elif has_allowed_transcripts: + matched_transcript = ht.allowed_transcripts.first() + else: + matched_transcript = main_transcript + + return hl.or_else(matched_transcript, main_transcript) + def import_filtered_table(self, *args, **kwargs): - super(VariantHailTableQuery, self).import_filtered_table(*args, **kwargs) + super().import_filtered_table(*args, **kwargs) self._ht = self._ht.key_by(**{VARIANT_KEY_FIELD: self._ht.variant_id}) def _format_transcript_args(self): - args = super(VariantHailTableQuery, self)._format_transcript_args() + args = super()._format_transcript_args() args.update({ 'annotate_value': lambda transcript, *args: {'major_consequence': transcript.consequence_terms.first()}, 'drop_fields': ['consequence_terms'], }) return args + def _get_gene_id_filter(self, gene_ids): + self._ht = self._ht.annotate( + gene_transcripts=self._ht.sorted_transcript_consequences.filter(lambda t: gene_ids.contains(t.gene_id)) + ) + return hl.is_defined(self._ht.gene_transcripts.first()) + def _annotate_allowed_consequences(self, annotations, annotation_filters): allowed_consequences = { ann for field, anns in annotations.items() @@ -686,6 +718,10 @@ def _has_terms_range_expr(self, terms, field, subfield, range_configs): value = self._ht[field][f'{subfield}_id'] return hl.any(lambda r: (value >= r[0]) & (value <= r[1]), ranges) + def _format_results(self, ht): + ht = ht.annotate(selected_transcript=self._selected_main_transcript_expr(ht)) + return super()._format_results(ht) + QUERY_CLASS_MAP = { VARIANT_DATASET: VariantHailTableQuery, diff --git a/hail_search/test_search.py b/hail_search/test_search.py index a68dfc9730..a7c54b1a8b 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -54,6 +54,7 @@ }, 'transcripts': {}, 'mainTranscriptId': None, + 'selectedMainTranscriptId': None, '_sort': [1000010146], } @@ -70,6 +71,8 @@ MULTI_FAMILY_VARIANT['familyGuids'] += FAMILY_3_VARIANT['familyGuids'] MULTI_FAMILY_VARIANT['genotypes'].update(FAMILY_3_VARIANT['genotypes']) +SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT = {**MULTI_FAMILY_VARIANT, 'selectedMainTranscriptId': 'ENST00000497611'} + PROJECT_2_VARIANT1 = deepcopy(VARIANT1) PROJECT_2_VARIANT1['familyGuids'] = ['F000011_11'] PROJECT_2_VARIANT1['genotypes'] = { @@ -193,7 +196,7 @@ async def test_location_search(self): ) await self._assert_expected_search( - [MULTI_FAMILY_VARIANT], omit_sample_type='SV_WES', + [SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT], omit_sample_type='SV_WES', intervals=LOCATION_SEARCH['intervals'][-1:], gene_ids=LOCATION_SEARCH['gene_ids'][:1] ) @@ -249,16 +252,17 @@ async def test_annotations_filter(self): await self._assert_expected_search([VARIANT2], pathogenicity={'hgmd': ['hgmd_other']}, omit_sample_type='SV_WES') pathogenicity = {'clinvar': ['likely_pathogenic', 'vus_or_conflicting', 'benign']} - await self._assert_expected_search([VARIANT2], pathogenicity=pathogenicity, omit_sample_type='SV_WES') + await self._assert_expected_search([VARIANT1, VARIANT2], pathogenicity=pathogenicity, omit_sample_type='SV_WES') + pathogenicity['clinvar'] = pathogenicity['clinvar'][:1] await self._assert_expected_search( - [VARIANT2, VARIANT4], pathogenicity=pathogenicity, annotations={'SCREEN': ['CTCF-only', 'DNase-only']}, + [VARIANT1, VARIANT4], pathogenicity=pathogenicity, annotations={'SCREEN': ['CTCF-only', 'DNase-only']}, omit_sample_type='SV_WES', ) annotations = {'missense': ['missense_variant'], 'in_frame': ['inframe_insertion', 'inframe_deletion'], 'frameshift': None} await self._assert_expected_search( - [VARIANT2, VARIANT4], pathogenicity=pathogenicity, annotations=annotations, omit_sample_type='SV_WES', + [VARIANT1, VARIANT2, VARIANT4], pathogenicity=pathogenicity, annotations=annotations, omit_sample_type='SV_WES', ) await self._assert_expected_search([VARIANT2, VARIANT4], annotations=annotations, omit_sample_type='SV_WES') @@ -268,6 +272,19 @@ async def test_annotations_filter(self): [VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], annotations=annotations, omit_sample_type='SV_WES', ) + selected_transcript_variant_2 = {**VARIANT2, 'selectedMainTranscriptId': 'ENST00000641759'} + selected_transcript_variant_3 = {**MULTI_FAMILY_VARIANT, 'selectedMainTranscriptId': 'ENST00000426137'} + annotations = {'other': ['non_coding_transcript_exon_variant']} + await self._assert_expected_search( + [VARIANT1, selected_transcript_variant_2, selected_transcript_variant_3], + pathogenicity=pathogenicity, annotations=annotations, omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [selected_transcript_variant_2, SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT], + gene_ids=LOCATION_SEARCH['gene_ids'][:1], annotations=annotations, omit_sample_type='SV_WES', + ) + async def test_in_silico_filter(self): in_silico = {'eigen': '5.5', 'mut_taster': 'P'} await self._assert_expected_search( diff --git a/hail_search/test_utils.py b/hail_search/test_utils.py index e45183cb46..83e1365c1f 100644 --- a/hail_search/test_utils.py +++ b/hail_search/test_utils.py @@ -82,7 +82,14 @@ }, }, 'genotypeFilters': '', - 'clinvar': None, + 'clinvar': { + 'alleleId': 19473, + 'conflictingPathogenicities': None, + 'goldStars': None, + 'pathogenicity': 'Likely_pathogenic', + 'assertions': None, + 'version': '2023-07-10', + }, 'hgmd': None, 'screenRegionType': None, 'populations': { @@ -110,6 +117,7 @@ }, 'transcripts': {}, 'mainTranscriptId': None, + 'selectedMainTranscriptId': None, '_sort': [1000010439], } VARIANT2 = { @@ -194,6 +202,7 @@ ], }, 'mainTranscriptId': 'ENST00000376585', + 'selectedMainTranscriptId': None, '_sort': [1011794419], } VARIANT3 = { @@ -254,13 +263,14 @@ 'ENSG00000097046': [ {'aminoAcids': None, 'canonical': 1, 'codons': None, 'geneId': 'ENSG00000097046', 'hgvsc': 'ENST00000428239.5:c.115+890G>A', 'hgvsp': None, 'transcriptId': 'ENST00000428239', 'isLofNagnag': None, 'transcriptRank': 0, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'intron_variant'}, {'aminoAcids': None, 'canonical': None, 'codons': None, 'geneId': 'ENSG00000097046', 'hgvsc': 'ENST00000234626.10:c.115+890G>A', 'hgvsp': None, 'transcriptId': 'ENST00000234626', 'isLofNagnag': None, 'transcriptRank': 1, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'intron_variant'}, - {'aminoAcids': None, 'canonical': None, 'codons': None, 'geneId': 'ENSG00000097046', 'hgvsc': 'ENST00000426137.1:c.115+890G>A', 'hgvsp': None, 'transcriptId': 'ENST00000426137', 'isLofNagnag': None, 'transcriptRank': 2, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'intron_variant'}, + {'aminoAcids': None, 'canonical': None, 'codons': None, 'geneId': 'ENSG00000097046', 'hgvsc': 'ENST00000426137.1:c.115+890G>A', 'hgvsp': None, 'transcriptId': 'ENST00000426137', 'isLofNagnag': None, 'transcriptRank': 2, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'non_coding_transcript_exon_variant'}, ], 'ENSG00000177000': [ {'aminoAcids': None, 'canonical': None, 'codons': None, 'geneId': 'ENSG00000177000', 'hgvsc': 'ENST00000497611.1:n.501+890G>A', 'hgvsp': None, 'transcriptId': 'ENST00000497611', 'isLofNagnag': None, 'transcriptRank': 3, 'biotype': 'processed_transcript', 'lofFilters': None, 'majorConsequence': 'intron_variant'}, ], }, 'mainTranscriptId': 'ENST00000428239', + 'selectedMainTranscriptId': None, '_sort': [1091502721], } VARIANT4 = { @@ -325,6 +335,7 @@ ], }, 'mainTranscriptId': 'ENST00000428239', + 'selectedMainTranscriptId': None, '_sort': [1091511686], }