Skip to content

Commit

Permalink
Merge pull request #3590 from broadinstitute/dev
Browse files: browse the repository at this point in the history
Dev
  • Loading branch information
hanars authored Sep 7, 2023
2 parents ee4afa0 + 48143e0 commit cd73180
Show file tree
Hide file tree
Showing 75 changed files with 797 additions and 281 deletions.
Binary file not shown.
Binary file not shown.
Binary file not shown.
3 changes: 3 additions & 0 deletions hail_search/fixtures/GRCh38/SV_WGS/annotations.ht/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.109-b71b065e4bb6
Created at 2023/08/23 14:11:40
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.109-b71b065e4bb6
Created at 2023/08/23 14:16:34
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.109-b71b065e4bb6
Created at 2023/08/23 14:16:26
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
407 changes: 269 additions & 138 deletions hail_search/hail_search_query.py

Large diffs are not rendered by default.

137 changes: 126 additions & 11 deletions hail_search/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@

from hail_search.test_utils import get_hail_search_body, FAMILY_2_VARIANT_SAMPLE_DATA, FAMILY_2_MISSING_SAMPLE_DATA, \
VARIANT1, VARIANT2, VARIANT3, VARIANT4, MULTI_PROJECT_SAMPLE_DATA, MULTI_PROJECT_MISSING_SAMPLE_DATA, \
LOCATION_SEARCH, EXCLUDE_LOCATION_SEARCH, VARIANT_ID_SEARCH, RSID_SEARCH, GENE_COUNTS
LOCATION_SEARCH, EXCLUDE_LOCATION_SEARCH, VARIANT_ID_SEARCH, RSID_SEARCH, GENE_COUNTS, SV_WGS_SAMPLE_DATA, \
SV_VARIANT1, SV_VARIANT2, SV_VARIANT3, SV_VARIANT4
from hail_search.web_app import init_web_app

PROJECT_2_VARIANT = {
Expand Down Expand Up @@ -95,7 +96,12 @@
}

# Ensures no variants are filtered out by the annotation/pathogenicity filters when searching for compound hets
COMP_HET_ALL_PASS_FILTERS = {'annotations': {'splice_ai': '0.0'}, 'pathogenicity': {'clinvar': ['likely_pathogenic']}}
COMP_HET_ALL_PASS_FILTERS = {
'annotations': {'splice_ai': '0.0'}, 'pathogenicity': {'clinvar': ['likely_pathogenic']},
'structural': ['DEL', 'CPX', 'INS'],
}

NEW_SV_FILTER = {'new_structural_variants': ['NEW']}


def _sorted(variant, sorts):
Expand Down Expand Up @@ -137,6 +143,10 @@ async def test_single_family_search(self):
}
)

await self._assert_expected_search(
[SV_VARIANT1, SV_VARIANT2, SV_VARIANT3, SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA,
)

async def test_single_project_search(self):
await self._assert_expected_search(
[VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], omit_sample_type='SV_WES', gene_counts={
Expand All @@ -152,51 +162,96 @@ async def test_multi_project_search(self):
)

async def test_inheritance_filter(self):
inheritance_mode = 'any_affected'
await self._assert_expected_search(
[VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], inheritance_mode='any_affected', omit_sample_type='SV_WES',
[VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES',
)

await self._assert_expected_search(
[VARIANT1, FAMILY_3_VARIANT, VARIANT4], inheritance_mode='de_novo', omit_sample_type='SV_WES',
[SV_VARIANT1, SV_VARIANT2, SV_VARIANT3, SV_VARIANT4], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA,
)

await self._assert_expected_search([], inheritance_mode='x_linked_recessive', omit_sample_type='SV_WES')
await self._assert_expected_search(
[SV_VARIANT2], inheritance_mode=inheritance_mode, annotations=NEW_SV_FILTER, sample_data=SV_WGS_SAMPLE_DATA,
)

inheritance_mode = 'de_novo'
await self._assert_expected_search(
[VARIANT1, FAMILY_3_VARIANT, VARIANT4], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES',
)

await self._assert_expected_search(
[SV_VARIANT1], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA,
)

inheritance_mode = 'x_linked_recessive'
await self._assert_expected_search([], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES')
await self._assert_expected_search([], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA)

inheritance_mode = 'homozygous_recessive'
await self._assert_expected_search(
[VARIANT2], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES',
)

await self._assert_expected_search(
[VARIANT2], inheritance_mode='homozygous_recessive', omit_sample_type='SV_WES',
[PROJECT_2_VARIANT1, VARIANT2], inheritance_mode=inheritance_mode, sample_data=MULTI_PROJECT_SAMPLE_DATA,
)

await self._assert_expected_search(
[PROJECT_2_VARIANT1, VARIANT2], inheritance_mode='homozygous_recessive', sample_data=MULTI_PROJECT_SAMPLE_DATA,
[SV_VARIANT4], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA,
)

gt_inheritance_filter = {'genotype': {'I000006_hg00733': 'has_alt', 'I000005_hg00732': 'ref_ref'}}
await self._assert_expected_search(
[VARIANT2, VARIANT3], inheritance_filter=gt_inheritance_filter, sample_data=FAMILY_2_VARIANT_SAMPLE_DATA)

inheritance_mode = 'compound_het'
await self._assert_expected_search(
[[VARIANT3, VARIANT4]], inheritance_mode='compound_het', sample_data=MULTI_PROJECT_SAMPLE_DATA, gene_counts={
[[VARIANT3, VARIANT4]], inheritance_mode=inheritance_mode, sample_data=MULTI_PROJECT_SAMPLE_DATA, gene_counts={
'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}},
'ENSG00000177000': {'total': 1, 'families': {'F000002_2': 1}},
}, **COMP_HET_ALL_PASS_FILTERS,
)

await self._assert_expected_search(
[PROJECT_2_VARIANT1, VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode='recessive', gene_counts={
[[SV_VARIANT1, SV_VARIANT2]], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA,
**COMP_HET_ALL_PASS_FILTERS,
)

inheritance_mode = 'recessive'
await self._assert_expected_search(
[PROJECT_2_VARIANT1, VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode=inheritance_mode, gene_counts={
'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}},
'ENSG00000177000': {'total': 2, 'families': {'F000002_2': 2}},
}, sample_data=MULTI_PROJECT_SAMPLE_DATA, **COMP_HET_ALL_PASS_FILTERS,
)

await self._assert_expected_search(
[[SV_VARIANT1, SV_VARIANT2], SV_VARIANT4], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA,
**COMP_HET_ALL_PASS_FILTERS,
)

async def test_quality_filter(self):
quality_filter = {'vcf_filter': 'pass'}
await self._assert_expected_search(
[VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT], quality_filter={'vcf_filter': 'pass'}, omit_sample_type='SV_WES',
[VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES',
)

await self._assert_expected_search([SV_VARIANT4], quality_filter=quality_filter, sample_data=SV_WGS_SAMPLE_DATA)

await self._assert_expected_search(
[VARIANT2, MULTI_FAMILY_VARIANT], quality_filter={'min_gq': 40}, omit_sample_type='SV_WES',
)

sv_quality_filter = {'min_gq_sv': 40}
await self._assert_expected_search(
[SV_VARIANT3, SV_VARIANT4], quality_filter=sv_quality_filter, sample_data=SV_WGS_SAMPLE_DATA,
)

await self._assert_expected_search(
[], annotations=NEW_SV_FILTER, quality_filter=sv_quality_filter, sample_data=SV_WGS_SAMPLE_DATA,
)

await self._assert_expected_search(
[VARIANT2, MULTI_FAMILY_VARIANT], quality_filter={'min_gq': 40, 'vcf_filter': 'pass'}, omit_sample_type='SV_WES',
)
Expand All @@ -206,6 +261,10 @@ async def test_quality_filter(self):
omit_sample_type='SV_WES',
)

await self._assert_expected_search(
[SV_VARIANT3, SV_VARIANT4], quality_filter={'min_gq_sv': 60, 'affected_only': True}, sample_data=SV_WGS_SAMPLE_DATA,
)

await self._assert_expected_search(
[VARIANT1, VARIANT2, FAMILY_3_VARIANT], quality_filter={'min_ab': 50}, omit_sample_type='SV_WES',
)
Expand Down Expand Up @@ -236,10 +295,19 @@ async def test_location_search(self):
[VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], omit_sample_type='SV_WES', **LOCATION_SEARCH,
)

sv_intervals = ['1:9310023-9380264']
await self._assert_expected_search(
[SV_VARIANT1, SV_VARIANT2], sample_data=SV_WGS_SAMPLE_DATA, intervals=sv_intervals, gene_ids=['ENSG00000171621'],
)

await self._assert_expected_search(
[VARIANT1], omit_sample_type='SV_WES', **EXCLUDE_LOCATION_SEARCH,
)

await self._assert_expected_search(
[SV_VARIANT3, SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, intervals=sv_intervals, exclude_intervals=True,
)

await self._assert_expected_search(
[SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT], omit_sample_type='SV_WES',
intervals=LOCATION_SEARCH['intervals'][-1:], gene_ids=LOCATION_SEARCH['gene_ids'][:1]
Expand All @@ -258,6 +326,10 @@ async def test_variant_id_search(self):
[], omit_sample_type='SV_WES', variant_ids=VARIANT_ID_SEARCH['variant_ids'][1:],
)

await self._assert_expected_search([SV_VARIANT2, SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, variant_keys=[
'cohort_2911.chr1.final_cleanup_INS_chr1_160', 'phase2_DEL_chr14_4640',
])

async def test_frequency_filter(self):
await self._assert_expected_search(
[VARIANT1, VARIANT4], frequencies={'seqr': {'af': 0.2}}, omit_sample_type='SV_WES',
Expand All @@ -275,6 +347,10 @@ async def test_frequency_filter(self):
[VARIANT4], frequencies={'seqr': {'ac': 4, 'hh': 0}}, omit_sample_type='SV_WES',
)

await self._assert_expected_search(
[SV_VARIANT1], frequencies={'sv_callset': {'af': 0.05}}, sample_data=SV_WGS_SAMPLE_DATA,
)

await self._assert_expected_search(
[VARIANT1, VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.05}}, omit_sample_type='SV_WES',
)
Expand All @@ -287,6 +363,10 @@ async def test_frequency_filter(self):
[VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.005}}, omit_sample_type='SV_WES',
)

await self._assert_expected_search(
[SV_VARIANT1, SV_VARIANT3, SV_VARIANT4], frequencies={'gnomad_svs': {'af': 0.001}}, sample_data=SV_WGS_SAMPLE_DATA,
)

await self._assert_expected_search(
[VARIANT4], frequencies={'seqr': {'af': 0.2}, 'gnomad_genomes': {'ac': 50}},
omit_sample_type='SV_WES',
Expand Down Expand Up @@ -320,18 +400,26 @@ async def test_annotations_filter(self):
omit_sample_type='SV_WES',
)

annotations = {'missense': ['missense_variant'], 'in_frame': ['inframe_insertion', 'inframe_deletion'], 'frameshift': None}
annotations = {
'missense': ['missense_variant'], 'in_frame': ['inframe_insertion', 'inframe_deletion'], 'frameshift': None,
'structural_consequence': ['INTRONIC'],
}
await self._assert_expected_search(
[VARIANT1, VARIANT2, VARIANT4], pathogenicity=pathogenicity, annotations=annotations, omit_sample_type='SV_WES',
)

await self._assert_expected_search([VARIANT2, VARIANT4], annotations=annotations, omit_sample_type='SV_WES')

await self._assert_expected_search([SV_VARIANT1], annotations=annotations, sample_data=SV_WGS_SAMPLE_DATA)

annotations['splice_ai'] = '0.005'
await self._assert_expected_search(
[VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], annotations=annotations, omit_sample_type='SV_WES',
)

annotations['structural'] = ['DEL']
await self._assert_expected_search([SV_VARIANT1, SV_VARIANT4], annotations=annotations, sample_data=SV_WGS_SAMPLE_DATA)

annotations = {'other': ['non_coding_transcript_exon_variant']}
await self._assert_expected_search(
[VARIANT1, SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, SELECTED_ANNOTATION_TRANSCRIPT_MULTI_FAMILY_VARIANT],
Expand Down Expand Up @@ -362,6 +450,19 @@ async def test_secondary_annotations_filter(self):
annotations=annotations_2, annotations_secondary=annotations_1,
)

sv_annotations_1 = {'structural': ['INS']}
sv_annotations_2 = {'structural': ['DEL'], 'structural_consequence': ['INTRONIC']}

await self._assert_expected_search(
[[SV_VARIANT1, SV_VARIANT2]], sample_data=SV_WGS_SAMPLE_DATA, inheritance_mode='compound_het',
annotations=sv_annotations_1, annotations_secondary=sv_annotations_2,
)

await self._assert_expected_search(
[[SV_VARIANT1, SV_VARIANT2], SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, inheritance_mode='recessive',
annotations=sv_annotations_2, annotations_secondary=sv_annotations_1,
)

pathogenicity = {'clinvar': ['likely_pathogenic', 'vus_or_conflicting']}
await self._assert_expected_search(
[VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode='recessive', omit_sample_type='SV_WES',
Expand Down Expand Up @@ -403,6 +504,10 @@ async def test_in_silico_filter(self):
[VARIANT2, VARIANT4], in_silico=in_silico, omit_sample_type='SV_WES',
)

await self._assert_expected_search(
[SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, in_silico={'strvctvre': 0.1, 'requireScore': True},
)

async def test_search_errors(self):
search_body = get_hail_search_body(sample_data=FAMILY_2_MISSING_SAMPLE_DATA)
async with self.client.request('POST', '/search', json=search_body) as resp:
Expand Down Expand Up @@ -430,6 +535,11 @@ async def test_sort(self):
_sorted(VARIANT1, [None, None])], omit_sample_type='SV_WES', sort='protein_consequence',
)

await self._assert_expected_search(
[_sorted(SV_VARIANT1, [11]), _sorted(SV_VARIANT2, [12]), _sorted(SV_VARIANT3, [12]), _sorted(SV_VARIANT4, [12])],
sample_data=SV_WGS_SAMPLE_DATA, sort='protein_consequence',
)

await self._assert_expected_search(
[_sorted(VARIANT4, [11, 11]), _sorted(SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, [11, 22]),
_sorted(SELECTED_ANNOTATION_TRANSCRIPT_MULTI_FAMILY_VARIANT, [22, 22])],
Expand Down Expand Up @@ -507,6 +617,11 @@ async def test_sort(self):
[VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], sort='size', omit_sample_type='SV_WES',
)

await self._assert_expected_search(
[_sorted(SV_VARIANT4, [-46343]), _sorted(SV_VARIANT1, [-104]), _sorted(SV_VARIANT2, [-50]),
_sorted(SV_VARIANT3, [-50])], sample_data=SV_WGS_SAMPLE_DATA, sort='size',
)

# sort applies to compound hets
await self._assert_expected_search(
[_sorted(VARIANT2, [11, 11]), [_sorted(VARIANT4, [11, 11]), _sorted(VARIANT3, [22, 24])]],
Expand Down
Loading

0 comments on commit cd73180

Please sign in to comment.