Skip to content

Commit

Permalink
Merge branch 'hail-backend-frequency-filter' of https://github.com/br…
Browse files Browse the repository at this point in the history
…oadinstitute/seqr into hail-backend-in-silico-filter
  • Loading branch information
hanars committed Aug 2, 2023
2 parents 693d04c + 7c7177f commit 5c3eea4
Show file tree
Hide file tree
Showing 9 changed files with 176 additions and 40 deletions.
30 changes: 30 additions & 0 deletions hail_search/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,40 @@
UNAFFECTED = 'N'
AFFECTED_ID = 0
UNAFFECTED_ID = 1
MALE = 'M'

VARIANT_DATASET = 'VARIANTS'

VARIANT_KEY_FIELD = 'variantId'
GNOMAD_GENOMES_FIELD = 'gnomad_genomes'

XPOS = 'xpos'

ALT_ALT = 'alt_alt'
REF_REF = 'ref_ref'
REF_ALT = 'ref_alt'
HAS_ALT = 'has_alt'
HAS_REF = 'has_ref'
COMP_HET_ALT = 'COMP_HET_ALT'

RECESSIVE = 'recessive'
X_LINKED_RECESSIVE = 'x_linked_recessive'
COMPOUND_HET = 'compound_het'
ANY_AFFECTED = 'any_affected'
RECESSIVE_FILTER = {
AFFECTED: ALT_ALT,
UNAFFECTED: HAS_REF,
}
INHERITANCE_FILTERS = {
RECESSIVE: RECESSIVE_FILTER,
X_LINKED_RECESSIVE: RECESSIVE_FILTER,
'homozygous_recessive': RECESSIVE_FILTER,
COMPOUND_HET: {
AFFECTED: COMP_HET_ALT,
UNAFFECTED: HAS_REF,
},
'de_novo': {
AFFECTED: HAS_ALT,
UNAFFECTED: REF_REF,
},
}
91 changes: 81 additions & 10 deletions hail_search/hail_search_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
import logging
import os

from hail_search.constants import AFFECTED, UNAFFECTED, AFFECTED_ID, UNAFFECTED_ID, VARIANT_DATASET, VARIANT_KEY_FIELD,\
GNOMAD_GENOMES_FIELD, XPOS, GENOME_VERSION_GRCh38_DISPLAY
from hail_search.constants import AFFECTED, UNAFFECTED, AFFECTED_ID, UNAFFECTED_ID, MALE, VARIANT_DATASET, \
VARIANT_KEY_FIELD, GNOMAD_GENOMES_FIELD, XPOS, GENOME_VERSION_GRCh38_DISPLAY, INHERITANCE_FILTERS, \
ANY_AFFECTED, X_LINKED_RECESSIVE, REF_REF, REF_ALT, COMP_HET_ALT, ALT_ALT, HAS_ALT, HAS_REF

DATASETS_DIR = os.environ.get('DATASETS_DIR', '/hail_datasets')

Expand All @@ -22,6 +23,15 @@ def _to_camel_case(snake_case_str):

class BaseHailTableQuery(object):

GENOTYPE_QUERY_MAP = {
REF_REF: lambda gt: gt.is_hom_ref(),
REF_ALT: lambda gt: gt.is_het(),
COMP_HET_ALT: lambda gt: gt.is_het(),
ALT_ALT: lambda gt: gt.is_hom_var(),
HAS_ALT: lambda gt: gt.is_non_ref(),
HAS_REF: lambda gt: gt.is_hom_ref() | gt.is_het_ref(),
}

GENOTYPE_FIELDS = {}
POPULATIONS = {}
POPULATION_FIELDS = {}
Expand Down Expand Up @@ -160,14 +170,14 @@ def import_filtered_table(self, data_type, sample_data, **kwargs):
if len(family_samples) == 1:
family_guid, family_sample_data = list(family_samples.items())[0]
family_ht = hl.read_table(f'{tables_path}/families/{family_guid}.ht')
families_ht = self._filter_entries_table(family_ht, family_sample_data)
families_ht = self._filter_entries_table(family_ht, family_sample_data, **kwargs)
else:
filtered_project_hts = []
exception_messages = set()
for project_guid, project_sample_data in project_samples.items():
project_ht = hl.read_table(f'{tables_path}/projects/{project_guid}.ht')
try:
filtered_project_hts.append(self._filter_entries_table(project_ht, project_sample_data))
filtered_project_hts.append(self._filter_entries_table(project_ht, project_sample_data, **kwargs))
except HTTPBadRequest as e:
exception_messages.add(e.text)

Expand Down Expand Up @@ -205,11 +215,11 @@ def import_filtered_table(self, data_type, sample_data, **kwargs):
)
self._filter_annotated_table(**kwargs)

def _filter_entries_table(self, ht, sample_data):
ht = self._add_entry_sample_families(ht, sample_data)
def _filter_entries_table(self, ht, sample_data, inheritance_mode=None, inheritance_filter=None, **kwargs):
ht, sample_id_family_index_map = self._add_entry_sample_families(ht, sample_data)

ht = ht.annotate(family_entries=ht.family_entries.map(
lambda entries: hl.or_missing(entries.any(lambda x: x.GT.is_non_ref()), entries))
ht = self._filter_inheritance(
ht, inheritance_mode, inheritance_filter, sample_data, sample_id_family_index_map,
)

return ht.select_globals()
Expand Down Expand Up @@ -237,11 +247,13 @@ def _add_entry_sample_families(cls, ht, sample_data):
sample_index_family_map = hl.dict({sample_id_index_map[k]: v for k, v in sample_id_family_map.items()})
family_index_map = {f: i for i, f in enumerate(sorted(set(sample_id_family_map.values())))}
family_sample_indices = [None] * len(family_index_map)
sample_id_family_index_map = {}
for sample_id, family_guid in sample_id_family_map.items():
sample_index = sample_id_index_map[sample_id]
family_index = family_index_map[family_guid]
if not family_sample_indices[family_index]:
family_sample_indices[family_index] = []
sample_id_family_index_map[sample_id] = (family_index, len(family_sample_indices[family_index]))
family_sample_indices[family_index].append(sample_index)
family_sample_indices = hl.array(family_sample_indices)

Expand All @@ -256,13 +268,72 @@ def _add_entry_sample_families(cls, ht, sample_data):
))
)

return ht
return ht, sample_id_family_index_map

@staticmethod
def _missing_entry(entry):
entry_type = dict(**entry.dtype)
return hl.struct(**{k: hl.missing(v) for k, v in entry_type.items()})

def _filter_inheritance(self, ht, inheritance_mode, inheritance_filter, sample_data, sample_id_family_index_map):
any_valid_entry = lambda x: self.GENOTYPE_QUERY_MAP[HAS_ALT](x.GT)

is_any_affected = inheritance_mode == ANY_AFFECTED
if is_any_affected:
prev_any_valid_entry = any_valid_entry
any_valid_entry = lambda x: prev_any_valid_entry(x) & (x.affected_id == AFFECTED_ID)

ht = ht.annotate(family_entries=ht.family_entries.map(
lambda entries: hl.or_missing(entries.any(any_valid_entry), entries))
)

if not (inheritance_filter or inheritance_mode):
return ht

if inheritance_mode == X_LINKED_RECESSIVE:
x_chrom_interval = hl.parse_locus_interval(
hl.get_reference(self._genome_version).x_contigs[0], reference_genome=self._genome_version)
ht = ht.filter(self.get_x_chrom_filter(ht, x_chrom_interval))

if not is_any_affected:
ht = self._filter_families_inheritance(
ht, inheritance_mode, inheritance_filter, sample_id_family_index_map, sample_data,
)

return ht.filter(ht.family_entries.any(hl.is_defined))

@classmethod
def _filter_families_inheritance(cls, ht, inheritance_mode, inheritance_filter, sample_id_family_index_map, sample_data):
individual_genotype_filter = (inheritance_filter or {}).get('genotype')

entry_indices_by_gt = defaultdict(lambda: defaultdict(list))
for s in sample_data:
genotype = individual_genotype_filter.get(s['individual_guid']) \
if individual_genotype_filter else INHERITANCE_FILTERS[inheritance_mode].get(s['affected'])
if inheritance_mode == X_LINKED_RECESSIVE and s['affected'] == UNAFFECTED and s['sex'] == MALE:
genotype = REF_REF
if genotype:
family_index, entry_index = sample_id_family_index_map[s['sample_id']]
entry_indices_by_gt[genotype][family_index].append(entry_index)

for genotype, entry_indices in entry_indices_by_gt.items():
entry_indices = hl.dict(entry_indices)
ht = ht.annotate(family_entries=hl.enumerate(ht.family_entries).map(
lambda x: cls._valid_genotype_family_entries(x[1], entry_indices.get(x[0]), genotype)))

return ht

@classmethod
def _valid_genotype_family_entries(cls, entries, gentoype_entry_indices, genotype):
is_valid = hl.is_missing(gentoype_entry_indices) | gentoype_entry_indices.all(
lambda i: cls.GENOTYPE_QUERY_MAP[genotype](entries[i].GT)
)
return hl.or_missing(is_valid, entries)

@staticmethod
def get_x_chrom_filter(ht, x_interval):
return x_interval.contains(ht.locus)

def _filter_annotated_table(self, frequencies=None, in_silico=None, **kwargs):
self._filter_by_frequency(frequencies)

Expand Down Expand Up @@ -293,7 +364,7 @@ def _filter_by_frequency(self, frequencies):
if hemi_field:
pop_filters.append(pop_expr[hemi_field] <= freqs['hh'])

if pop_filters is not None:
if pop_filters:
pop_filter = pop_filters[0]
for pf in pop_filters[1:]:
pop_filter &= pf
Expand Down
10 changes: 3 additions & 7 deletions hail_search/requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
# pip-compile hail_search/requirements-test.in
#
aiohttp==3.8.4
aiohttp==3.8.5
# via pytest-aiohttp
aiosignal==1.3.1
# via aiohttp
Expand All @@ -15,9 +15,7 @@ attrs==23.1.0
charset-normalizer==3.2.0
# via aiohttp
coverage==5.1
# via -r hail_search/requirements-test.in
exceptiongroup==1.1.2
# via pytest
# via -r requirements-test.in
frozenlist==1.3.3
# via
# aiohttp
Expand All @@ -39,10 +37,8 @@ pytest==7.4.0
# pytest-aiohttp
# pytest-asyncio
pytest-aiohttp==1.0.4
# via -r hail_search/requirements-test.in
# via -r requirements-test.in
pytest-asyncio==0.21.0
# via pytest-aiohttp
tomli==2.0.1
# via pytest
yarl==1.9.2
# via aiohttp
59 changes: 49 additions & 10 deletions hail_search/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,19 +56,30 @@
'_sort': [1000010146],
}

MULTI_FAMILY_VARIANT = deepcopy(VARIANT3)
MULTI_FAMILY_VARIANT['familyGuids'].append('F000003_3')
MULTI_FAMILY_VARIANT['genotypes']['I000007_na20870'] = {
'sampleId': 'NA20870', 'individualGuid': 'I000007_na20870', 'familyGuid': 'F000003_3',
'numAlt': 1, 'dp': 28, 'gq': 99, 'ab': 0.6785714285714286,
FAMILY_3_VARIANT = deepcopy(VARIANT3)
FAMILY_3_VARIANT['familyGuids'] = ['F000003_3']
FAMILY_3_VARIANT['genotypes'] = {
'I000007_na20870': {
'sampleId': 'NA20870', 'individualGuid': 'I000007_na20870', 'familyGuid': 'F000003_3',
'numAlt': 1, 'dp': 28, 'gq': 99, 'ab': 0.6785714285714286,
},
}

MULTI_PROJECT_VARIANT1 = deepcopy(VARIANT1)
MULTI_PROJECT_VARIANT1['familyGuids'].append('F000011_11')
MULTI_PROJECT_VARIANT1['genotypes']['I000015_na20885'] = {
'sampleId': 'NA20885', 'individualGuid': 'I000015_na20885', 'familyGuid': 'F000011_11',
'numAlt': 2, 'dp': 6, 'gq': 16, 'ab': 1.0,
MULTI_FAMILY_VARIANT = deepcopy(VARIANT3)
MULTI_FAMILY_VARIANT['familyGuids'] += FAMILY_3_VARIANT['familyGuids']
MULTI_FAMILY_VARIANT['genotypes'].update(FAMILY_3_VARIANT['genotypes'])

PROJECT_2_VARIANT1 = deepcopy(VARIANT1)
PROJECT_2_VARIANT1['familyGuids'] = ['F000011_11']
PROJECT_2_VARIANT1['genotypes'] = {
'I000015_na20885': {
'sampleId': 'NA20885', 'individualGuid': 'I000015_na20885', 'familyGuid': 'F000011_11',
'numAlt': 2, 'dp': 6, 'gq': 16, 'ab': 1.0,
},
}
MULTI_PROJECT_VARIANT1 = deepcopy(VARIANT1)
MULTI_PROJECT_VARIANT1['familyGuids'] += PROJECT_2_VARIANT1['familyGuids']
MULTI_PROJECT_VARIANT1['genotypes'].update(PROJECT_2_VARIANT1['genotypes'])
MULTI_PROJECT_VARIANT2 = deepcopy(VARIANT2)
MULTI_PROJECT_VARIANT2['familyGuids'].append('F000011_11')
MULTI_PROJECT_VARIANT2['genotypes']['I000015_na20885'] = {
Expand Down Expand Up @@ -117,6 +128,29 @@ async def test_multi_project_search(self):
sample_data=MULTI_PROJECT_SAMPLE_DATA,
)

async def test_inheritance_filter(self):
await self._assert_expected_search(
[VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], inheritance_mode='any_affected', omit_sample_type='SV_WES',
)

await self._assert_expected_search(
[VARIANT1, FAMILY_3_VARIANT, VARIANT4], inheritance_mode='de_novo', omit_sample_type='SV_WES',
)

await self._assert_expected_search([], inheritance_mode='x_linked_recessive', omit_sample_type='SV_WES')

await self._assert_expected_search(
[VARIANT2], inheritance_mode='homozygous_recessive', omit_sample_type='SV_WES',
)

await self._assert_expected_search(
[PROJECT_2_VARIANT1, VARIANT2], inheritance_mode='homozygous_recessive', sample_data=MULTI_PROJECT_SAMPLE_DATA,
)

gt_inheritance_filter = {'genotype': {'I000006_hg00733': 'has_alt', 'I000005_hg00732': 'ref_ref'}}
await self._assert_expected_search(
[VARIANT2, VARIANT3], inheritance_filter=gt_inheritance_filter, sample_data=FAMILY_2_VARIANT_SAMPLE_DATA)

async def test_frequency_filter(self):
await self._assert_expected_search(
[VARIANT1, VARIANT4], frequencies={'seqr': {'af': 0.2}}, omit_sample_type='SV_WES',
Expand Down Expand Up @@ -147,6 +181,11 @@ async def test_frequency_filter(self):
omit_sample_type='SV_WES',
)

await self._assert_expected_search(
[VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], frequencies={'seqr': {}, 'gnomad_genomes': {'af': None}},
omit_sample_type='SV_WES',
)

async def test_in_silico_filter(self):
in_silico = {'eigen': '5.5', 'mut_taster': 'P'}
await self._assert_expected_search(
Expand Down
2 changes: 1 addition & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ asgiref==3.6.0
# django
build==0.10.0
# via pip-tools
certifi==2022.12.7
certifi==2023.7.22
# via
# -c requirements.txt
# requests
Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@ async-timeout==4.0.2
# via redis
cachetools==5.3.0
# via google-auth
certifi==2022.12.7
certifi==2023.7.22
# via
# elasticsearch
# requests
cffi==1.15.1
# via cryptography
charset-normalizer==3.0.1
# via requests
cryptography==41.0.2
cryptography==41.0.3
# via social-auth-core
defusedxml==0.7.1
# via
Expand Down
6 changes: 3 additions & 3 deletions seqr/views/apis/variant_search_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,9 +192,9 @@ def _get_variant_main_transcript_field_val(parsed_variant):
{'header': 'pos'},
{'header': 'ref'},
{'header': 'alt'},
{'header': 'gene', 'value_path': '{transcripts: transcripts.*[].{value: geneSymbol, transcriptId: transcriptId}, mainTranscriptId: mainTranscriptId}', 'process': _get_variant_main_transcript_field_val},
{'header': 'gene', 'value_path': '{transcripts: transcripts.*[].{value: geneSymbol || geneId, transcriptId: transcriptId}, mainTranscriptId: mainTranscriptId}', 'process': _get_variant_main_transcript_field_val},
{'header': 'worst_consequence', 'value_path': '{transcripts: transcripts.*[].{value: majorConsequence, transcriptId: transcriptId}, mainTranscriptId: mainTranscriptId}', 'process': _get_variant_main_transcript_field_val},
{'header': 'callset_freq', 'value_path': 'populations.callset.af'},
{'header': 'callset_freq', 'value_path': 'populations.callset.af || populations.seqr.af'},
{'header': 'exac_freq', 'value_path': 'populations.exac.af'},
{'header': 'gnomad_genomes_freq', 'value_path': 'populations.gnomad_genomes.af'},
{'header': 'gnomad_exomes_freq', 'value_path': 'populations.gnomad_exomes.af'},
Expand All @@ -210,7 +210,7 @@ def _get_variant_main_transcript_field_val(parsed_variant):
{'header': 'rsid', 'value_path': 'rsid'},
{'header': 'hgvsc', 'value_path': '{transcripts: transcripts.*[].{value: hgvsc, transcriptId: transcriptId}, mainTranscriptId: mainTranscriptId}', 'process': _get_variant_main_transcript_field_val},
{'header': 'hgvsp', 'value_path': '{transcripts: transcripts.*[].{value: hgvsp, transcriptId: transcriptId}, mainTranscriptId: mainTranscriptId}', 'process': _get_variant_main_transcript_field_val},
{'header': 'clinvar_clinical_significance', 'value_path': 'clinvar.clinicalSignificance'},
{'header': 'clinvar_clinical_significance', 'value_path': 'clinvar.clinicalSignificance || clinvar.pathogenicity'},
{'header': 'clinvar_gold_stars', 'value_path': 'clinvar.goldStars'},
{'header': 'filter', 'value_path': 'genotypeFilters'},
]
Expand Down
12 changes: 6 additions & 6 deletions seqr/views/apis/variant_search_api_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,12 +384,12 @@ def test_query_variants(self, mock_get_variants, mock_get_gene_counts, mock_erro
['12', '48367227', 'TC', 'T', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '2', 'Known gene for phenotype (None)|Excluded (None)', 'a later note (None)|test n\xf8te (None)', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', ''],
['1', '11794419', 'T', 'G', '', 'missense_variant', '', '0.29499998688697815', '0.2633855640888214',
['1', '11794419', 'T', 'G', 'ENSG00000177000', 'missense_variant', '0.31111112236976624', '0.29499998688697815', '0.2633855640888214',
'0.28899794816970825', '0.24615199863910675', '20.899999618530273', '0.19699999690055847',
'2.000999927520752', '0.0', '', 'tolerated', '', 'damaging', 'rs1801131', 'ENST00000376585.6:c.1409A>C',
'ENSP00000365770.1:p.Glu470Ala', '', '1', '', '2', '', '', '', '', '', 'HG00731', '2', '99', '1.0',
'ENSP00000365770.1:p.Glu470Ala', 'Conflicting_interpretations_of_pathogenicity', '1', '', '2', '', '', '', '', '', 'HG00731', '2', '99', '1.0',
'HG00732', '0', '40', '0.0', 'HG00733', '1', '99', '0.625'],
['1', '91502721', 'G', 'A', '', 'intron_variant', '', '0.0', '0.38041073083877563', '0.0',
['1', '91502721', 'G', 'A', 'ENSG00000097046', 'intron_variant', '0.6666666865348816', '0.0', '0.38041073083877563', '0.0',
'0.36268100142478943', '2.753999948501587', '', '1.378000020980835', '0.009999999776482582', '', '', '',
'', 'rs13447464', 'ENST00000428239.5:c.115+890G>A', '', '', '', '', '2', '', '', '', '', '', 'HG00731',
'1', '99', '1.0', 'HG00732', '0', '99', '0.4594594594594595', 'HG00733', '1', '99', '0.4074074074074074'],
Expand Down Expand Up @@ -418,12 +418,12 @@ def test_query_variants(self, mock_get_variants, mock_get_gene_counts, mock_erro
['12', '48367227', 'TC', 'T', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '2', 'Known gene for phenotype (None)|Excluded (None)', 'a later note (None)|test n\xf8te (None)',
'', '', '', '', '', '', '', '', '', '', '', '',],
['1', '11794419', 'T', 'G', '', 'missense_variant', '', '0.29499998688697815', '0.2633855640888214',
['1', '11794419', 'T', 'G', 'ENSG00000177000', 'missense_variant', '0.31111112236976624', '0.29499998688697815', '0.2633855640888214',
'0.28899794816970825', '0.24615199863910675', '20.899999618530273', '0.19699999690055847',
'2.000999927520752', '0.0', '', 'tolerated', '', 'damaging', 'rs1801131', 'ENST00000376585.6:c.1409A>C',
'ENSP00000365770.1:p.Glu470Ala', '', '1', '', '2', '', '', 'HG00731', '2', '99', '1.0',
'ENSP00000365770.1:p.Glu470Ala', 'Conflicting_interpretations_of_pathogenicity', '1', '', '2', '', '', 'HG00731', '2', '99', '1.0',
'HG00732', '0', '40', '0.0', 'HG00733', '1', '99', '0.625'],
['1', '91502721', 'G', 'A', '', 'intron_variant', '', '0.0', '0.38041073083877563', '0.0',
['1', '91502721', 'G', 'A', 'ENSG00000097046', 'intron_variant', '0.6666666865348816', '0.0', '0.38041073083877563', '0.0',
'0.36268100142478943', '2.753999948501587', '', '1.378000020980835', '0.009999999776482582', '', '',
'', '', 'rs13447464', 'ENST00000428239.5:c.115+890G>A', '', '', '', '', '2', '', '', 'HG00731',
'1', '99', '1.0', 'HG00732', '0', '99', '0.4594594594594595', 'HG00733', '1', '99',
Expand Down
Loading

0 comments on commit 5c3eea4

Please sign in to comment.