diff --git a/hail_search/constants.py b/hail_search/constants.py index 3e46d40789..08ea627961 100644 --- a/hail_search/constants.py +++ b/hail_search/constants.py @@ -4,6 +4,7 @@ UNAFFECTED = 'N' AFFECTED_ID = 0 UNAFFECTED_ID = 1 +MALE = 'M' VARIANT_DATASET = 'VARIANTS' @@ -11,3 +12,32 @@ GNOMAD_GENOMES_FIELD = 'gnomad_genomes' XPOS = 'xpos' + +ALT_ALT = 'alt_alt' +REF_REF = 'ref_ref' +REF_ALT = 'ref_alt' +HAS_ALT = 'has_alt' +HAS_REF = 'has_ref' +COMP_HET_ALT = 'COMP_HET_ALT' + +RECESSIVE = 'recessive' +X_LINKED_RECESSIVE = 'x_linked_recessive' +COMPOUND_HET = 'compound_het' +ANY_AFFECTED = 'any_affected' +RECESSIVE_FILTER = { + AFFECTED: ALT_ALT, + UNAFFECTED: HAS_REF, +} +INHERITANCE_FILTERS = { + RECESSIVE: RECESSIVE_FILTER, + X_LINKED_RECESSIVE: RECESSIVE_FILTER, + 'homozygous_recessive': RECESSIVE_FILTER, + COMPOUND_HET: { + AFFECTED: COMP_HET_ALT, + UNAFFECTED: HAS_REF, + }, + 'de_novo': { + AFFECTED: HAS_ALT, + UNAFFECTED: REF_REF, + }, +} diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index dcf4756e11..fcec5e74cd 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -4,8 +4,9 @@ import logging import os -from hail_search.constants import AFFECTED, UNAFFECTED, AFFECTED_ID, UNAFFECTED_ID, VARIANT_DATASET, VARIANT_KEY_FIELD,\ - GNOMAD_GENOMES_FIELD, XPOS, GENOME_VERSION_GRCh38_DISPLAY +from hail_search.constants import AFFECTED, UNAFFECTED, AFFECTED_ID, UNAFFECTED_ID, MALE, VARIANT_DATASET, \ + VARIANT_KEY_FIELD, GNOMAD_GENOMES_FIELD, XPOS, GENOME_VERSION_GRCh38_DISPLAY, INHERITANCE_FILTERS, \ + ANY_AFFECTED, X_LINKED_RECESSIVE, REF_REF, REF_ALT, COMP_HET_ALT, ALT_ALT, HAS_ALT, HAS_REF DATASETS_DIR = os.environ.get('DATASETS_DIR', '/hail_datasets') @@ -22,6 +23,15 @@ def _to_camel_case(snake_case_str): class BaseHailTableQuery(object): + GENOTYPE_QUERY_MAP = { + REF_REF: lambda gt: gt.is_hom_ref(), + REF_ALT: lambda gt: gt.is_het(), + COMP_HET_ALT: lambda gt: gt.is_het(), + ALT_ALT: lambda gt: gt.is_hom_var(), + HAS_ALT: lambda gt: gt.is_non_ref(), + HAS_REF: lambda gt: gt.is_hom_ref() | gt.is_het_ref(), + } + GENOTYPE_FIELDS = {} POPULATIONS = {} POPULATION_FIELDS = {} @@ -160,14 +170,14 @@ def import_filtered_table(self, data_type, sample_data, **kwargs): if len(family_samples) == 1: family_guid, family_sample_data = list(family_samples.items())[0] family_ht = hl.read_table(f'{tables_path}/families/{family_guid}.ht') - families_ht = self._filter_entries_table(family_ht, family_sample_data) + families_ht = self._filter_entries_table(family_ht, family_sample_data, **kwargs) else: filtered_project_hts = [] exception_messages = set() for project_guid, project_sample_data in project_samples.items(): project_ht = hl.read_table(f'{tables_path}/projects/{project_guid}.ht') try: - filtered_project_hts.append(self._filter_entries_table(project_ht, project_sample_data)) + filtered_project_hts.append(self._filter_entries_table(project_ht, project_sample_data, **kwargs)) except HTTPBadRequest as e: exception_messages.add(e.text) @@ -205,11 +215,11 @@ def import_filtered_table(self, data_type, sample_data, **kwargs): ) self._filter_annotated_table(**kwargs) - def _filter_entries_table(self, ht, sample_data): - ht = self._add_entry_sample_families(ht, sample_data) + def _filter_entries_table(self, ht, sample_data, inheritance_mode=None, inheritance_filter=None, **kwargs): + ht, sample_id_family_index_map = self._add_entry_sample_families(ht, sample_data) - ht = ht.annotate(family_entries=ht.family_entries.map( - lambda entries: hl.or_missing(entries.any(lambda x: x.GT.is_non_ref()), entries)) + ht = self._filter_inheritance( + ht, inheritance_mode, inheritance_filter, sample_data, sample_id_family_index_map, ) return ht.select_globals() @@ -237,11 +247,13 @@ def _add_entry_sample_families(cls, ht, sample_data): sample_index_family_map = hl.dict({sample_id_index_map[k]: v for k, v in sample_id_family_map.items()}) family_index_map = {f: i for i, f in enumerate(sorted(set(sample_id_family_map.values())))} family_sample_indices = [None] * len(family_index_map) + sample_id_family_index_map = {} for sample_id, family_guid in sample_id_family_map.items(): sample_index = sample_id_index_map[sample_id] family_index = family_index_map[family_guid] if not family_sample_indices[family_index]: family_sample_indices[family_index] = [] + sample_id_family_index_map[sample_id] = (family_index, len(family_sample_indices[family_index])) family_sample_indices[family_index].append(sample_index) family_sample_indices = hl.array(family_sample_indices) @@ -256,13 +268,72 @@ def _add_entry_sample_families(cls, ht, sample_data): )) ) - return ht + return ht, sample_id_family_index_map @staticmethod def _missing_entry(entry): entry_type = dict(**entry.dtype) return hl.struct(**{k: hl.missing(v) for k, v in entry_type.items()}) + def _filter_inheritance(self, ht, inheritance_mode, inheritance_filter, sample_data, sample_id_family_index_map): + any_valid_entry = lambda x: self.GENOTYPE_QUERY_MAP[HAS_ALT](x.GT) + + is_any_affected = inheritance_mode == ANY_AFFECTED + if is_any_affected: + prev_any_valid_entry = any_valid_entry + any_valid_entry = lambda x: prev_any_valid_entry(x) & (x.affected_id == AFFECTED_ID) + + ht = ht.annotate(family_entries=ht.family_entries.map( + lambda entries: hl.or_missing(entries.any(any_valid_entry), entries)) + ) + + if not (inheritance_filter or inheritance_mode): + return ht + + if inheritance_mode == X_LINKED_RECESSIVE: + x_chrom_interval = hl.parse_locus_interval( + hl.get_reference(self._genome_version).x_contigs[0], reference_genome=self._genome_version) + ht = ht.filter(self.get_x_chrom_filter(ht, x_chrom_interval)) + + if not is_any_affected: + ht = self._filter_families_inheritance( + ht, inheritance_mode, inheritance_filter, sample_id_family_index_map, sample_data, + ) + + return ht.filter(ht.family_entries.any(hl.is_defined)) + + @classmethod + def _filter_families_inheritance(cls, ht, inheritance_mode, inheritance_filter, sample_id_family_index_map, sample_data): + individual_genotype_filter = (inheritance_filter or {}).get('genotype') + + entry_indices_by_gt = defaultdict(lambda: defaultdict(list)) + for s in sample_data: + genotype = individual_genotype_filter.get(s['individual_guid']) \ + if individual_genotype_filter else INHERITANCE_FILTERS[inheritance_mode].get(s['affected']) + if inheritance_mode == X_LINKED_RECESSIVE and s['affected'] == UNAFFECTED and s['sex'] == MALE: + genotype = REF_REF + if genotype: + family_index, entry_index = sample_id_family_index_map[s['sample_id']] + entry_indices_by_gt[genotype][family_index].append(entry_index) + + for genotype, entry_indices in entry_indices_by_gt.items(): + entry_indices = hl.dict(entry_indices) + ht = ht.annotate(family_entries=hl.enumerate(ht.family_entries).map( + lambda x: cls._valid_genotype_family_entries(x[1], entry_indices.get(x[0]), genotype))) + + return ht + + @classmethod + def _valid_genotype_family_entries(cls, entries, gentoype_entry_indices, genotype): + is_valid = hl.is_missing(gentoype_entry_indices) | gentoype_entry_indices.all( + lambda i: cls.GENOTYPE_QUERY_MAP[genotype](entries[i].GT) + ) + return hl.or_missing(is_valid, entries) + + @staticmethod + def get_x_chrom_filter(ht, x_interval): + return x_interval.contains(ht.locus) + def _filter_annotated_table(self, frequencies=None, in_silico=None, **kwargs): self._filter_by_frequency(frequencies) @@ -293,7 +364,7 @@ def _filter_by_frequency(self, frequencies): if hemi_field: pop_filters.append(pop_expr[hemi_field] <= freqs['hh']) - if pop_filters is not None: + if pop_filters: pop_filter = pop_filters[0] for pf in pop_filters[1:]: pop_filter &= pf diff --git a/hail_search/requirements-test.txt b/hail_search/requirements-test.txt index e15b9abad2..02396189c8 100644 --- a/hail_search/requirements-test.txt +++ b/hail_search/requirements-test.txt @@ -4,7 +4,7 @@ # # pip-compile hail_search/requirements-test.in # -aiohttp==3.8.4 +aiohttp==3.8.5 # via pytest-aiohttp aiosignal==1.3.1 # via aiohttp @@ -15,9 +15,7 @@ attrs==23.1.0 charset-normalizer==3.2.0 # via aiohttp coverage==5.1 - # via -r hail_search/requirements-test.in -exceptiongroup==1.1.2 - # via pytest + # via -r requirements-test.in frozenlist==1.3.3 # via # aiohttp @@ -39,10 +37,8 @@ pytest==7.4.0 # pytest-aiohttp # pytest-asyncio pytest-aiohttp==1.0.4 - # via -r hail_search/requirements-test.in + # via -r requirements-test.in pytest-asyncio==0.21.0 # via pytest-aiohttp -tomli==2.0.1 - # via pytest yarl==1.9.2 # via aiohttp diff --git a/hail_search/test_search.py b/hail_search/test_search.py index f34a7844f4..6ed943b83e 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -56,19 +56,30 @@ '_sort': [1000010146], } -MULTI_FAMILY_VARIANT = deepcopy(VARIANT3) -MULTI_FAMILY_VARIANT['familyGuids'].append('F000003_3') -MULTI_FAMILY_VARIANT['genotypes']['I000007_na20870'] = { - 'sampleId': 'NA20870', 'individualGuid': 'I000007_na20870', 'familyGuid': 'F000003_3', - 'numAlt': 1, 'dp': 28, 'gq': 99, 'ab': 0.6785714285714286, +FAMILY_3_VARIANT = deepcopy(VARIANT3) +FAMILY_3_VARIANT['familyGuids'] = ['F000003_3'] +FAMILY_3_VARIANT['genotypes'] = { + 'I000007_na20870': { + 'sampleId': 'NA20870', 'individualGuid': 'I000007_na20870', 'familyGuid': 'F000003_3', + 'numAlt': 1, 'dp': 28, 'gq': 99, 'ab': 0.6785714285714286, + }, } -MULTI_PROJECT_VARIANT1 = deepcopy(VARIANT1) -MULTI_PROJECT_VARIANT1['familyGuids'].append('F000011_11') -MULTI_PROJECT_VARIANT1['genotypes']['I000015_na20885'] = { - 'sampleId': 'NA20885', 'individualGuid': 'I000015_na20885', 'familyGuid': 'F000011_11', - 'numAlt': 2, 'dp': 6, 'gq': 16, 'ab': 1.0, +MULTI_FAMILY_VARIANT = deepcopy(VARIANT3) +MULTI_FAMILY_VARIANT['familyGuids'] += FAMILY_3_VARIANT['familyGuids'] +MULTI_FAMILY_VARIANT['genotypes'].update(FAMILY_3_VARIANT['genotypes']) + +PROJECT_2_VARIANT1 = deepcopy(VARIANT1) +PROJECT_2_VARIANT1['familyGuids'] = ['F000011_11'] +PROJECT_2_VARIANT1['genotypes'] = { + 'I000015_na20885': { + 'sampleId': 'NA20885', 'individualGuid': 'I000015_na20885', 'familyGuid': 'F000011_11', + 'numAlt': 2, 'dp': 6, 'gq': 16, 'ab': 1.0, + }, } +MULTI_PROJECT_VARIANT1 = deepcopy(VARIANT1) +MULTI_PROJECT_VARIANT1['familyGuids'] += PROJECT_2_VARIANT1['familyGuids'] +MULTI_PROJECT_VARIANT1['genotypes'].update(PROJECT_2_VARIANT1['genotypes']) MULTI_PROJECT_VARIANT2 = deepcopy(VARIANT2) MULTI_PROJECT_VARIANT2['familyGuids'].append('F000011_11') MULTI_PROJECT_VARIANT2['genotypes']['I000015_na20885'] = { @@ -117,6 +128,29 @@ async def test_multi_project_search(self): sample_data=MULTI_PROJECT_SAMPLE_DATA, ) + async def test_inheritance_filter(self): + await self._assert_expected_search( + [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], inheritance_mode='any_affected', omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [VARIANT1, FAMILY_3_VARIANT, VARIANT4], inheritance_mode='de_novo', omit_sample_type='SV_WES', + ) + + await self._assert_expected_search([], inheritance_mode='x_linked_recessive', omit_sample_type='SV_WES') + + await self._assert_expected_search( + [VARIANT2], inheritance_mode='homozygous_recessive', omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [PROJECT_2_VARIANT1, VARIANT2], inheritance_mode='homozygous_recessive', sample_data=MULTI_PROJECT_SAMPLE_DATA, + ) + + gt_inheritance_filter = {'genotype': {'I000006_hg00733': 'has_alt', 'I000005_hg00732': 'ref_ref'}} + await self._assert_expected_search( + [VARIANT2, VARIANT3], inheritance_filter=gt_inheritance_filter, sample_data=FAMILY_2_VARIANT_SAMPLE_DATA) + async def test_frequency_filter(self): await self._assert_expected_search( [VARIANT1, VARIANT4], frequencies={'seqr': {'af': 0.2}}, omit_sample_type='SV_WES', @@ -147,6 +181,11 @@ async def test_frequency_filter(self): omit_sample_type='SV_WES', ) + await self._assert_expected_search( + [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], frequencies={'seqr': {}, 'gnomad_genomes': {'af': None}}, + omit_sample_type='SV_WES', + ) + async def test_in_silico_filter(self): in_silico = {'eigen': '5.5', 'mut_taster': 'P'} await self._assert_expected_search( diff --git a/requirements-dev.txt b/requirements-dev.txt index dd569aae2a..65fe1d8557 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -10,7 +10,7 @@ asgiref==3.6.0 # django build==0.10.0 # via pip-tools -certifi==2022.12.7 +certifi==2023.7.22 # via # -c requirements.txt # requests diff --git a/requirements.txt b/requirements.txt index 70d4fed5d4..da497692d4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ async-timeout==4.0.2 # via redis cachetools==5.3.0 # via google-auth -certifi==2022.12.7 +certifi==2023.7.22 # via # elasticsearch # requests @@ -18,7 +18,7 @@ cffi==1.15.1 # via cryptography charset-normalizer==3.0.1 # via requests -cryptography==41.0.2 +cryptography==41.0.3 # via social-auth-core defusedxml==0.7.1 # via diff --git a/seqr/views/apis/variant_search_api.py b/seqr/views/apis/variant_search_api.py index da0f58c945..0ddf7a061b 100644 --- a/seqr/views/apis/variant_search_api.py +++ b/seqr/views/apis/variant_search_api.py @@ -192,9 +192,9 @@ def _get_variant_main_transcript_field_val(parsed_variant): {'header': 'pos'}, {'header': 'ref'}, {'header': 'alt'}, - {'header': 'gene', 'value_path': '{transcripts: transcripts.*[].{value: geneSymbol, transcriptId: transcriptId}, mainTranscriptId: mainTranscriptId}', 'process': _get_variant_main_transcript_field_val}, + {'header': 'gene', 'value_path': '{transcripts: transcripts.*[].{value: geneSymbol || geneId, transcriptId: transcriptId}, mainTranscriptId: mainTranscriptId}', 'process': _get_variant_main_transcript_field_val}, {'header': 'worst_consequence', 'value_path': '{transcripts: transcripts.*[].{value: majorConsequence, transcriptId: transcriptId}, mainTranscriptId: mainTranscriptId}', 'process': _get_variant_main_transcript_field_val}, - {'header': 'callset_freq', 'value_path': 'populations.callset.af'}, + {'header': 'callset_freq', 'value_path': 'populations.callset.af || populations.seqr.af'}, {'header': 'exac_freq', 'value_path': 'populations.exac.af'}, {'header': 'gnomad_genomes_freq', 'value_path': 'populations.gnomad_genomes.af'}, {'header': 'gnomad_exomes_freq', 'value_path': 'populations.gnomad_exomes.af'}, @@ -210,7 +210,7 @@ def _get_variant_main_transcript_field_val(parsed_variant): {'header': 'rsid', 'value_path': 'rsid'}, {'header': 'hgvsc', 'value_path': '{transcripts: transcripts.*[].{value: hgvsc, transcriptId: transcriptId}, mainTranscriptId: mainTranscriptId}', 'process': _get_variant_main_transcript_field_val}, {'header': 'hgvsp', 'value_path': '{transcripts: transcripts.*[].{value: hgvsp, transcriptId: transcriptId}, mainTranscriptId: mainTranscriptId}', 'process': _get_variant_main_transcript_field_val}, - {'header': 'clinvar_clinical_significance', 'value_path': 'clinvar.clinicalSignificance'}, + {'header': 'clinvar_clinical_significance', 'value_path': 'clinvar.clinicalSignificance || clinvar.pathogenicity'}, {'header': 'clinvar_gold_stars', 'value_path': 'clinvar.goldStars'}, {'header': 'filter', 'value_path': 'genotypeFilters'}, ] diff --git a/seqr/views/apis/variant_search_api_tests.py b/seqr/views/apis/variant_search_api_tests.py index feed023d44..32ec58ce66 100644 --- a/seqr/views/apis/variant_search_api_tests.py +++ b/seqr/views/apis/variant_search_api_tests.py @@ -384,12 +384,12 @@ def test_query_variants(self, mock_get_variants, mock_get_gene_counts, mock_erro ['12', '48367227', 'TC', 'T', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '2', 'Known gene for phenotype (None)|Excluded (None)', 'a later note (None)|test n\xf8te (None)', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], - ['1', '11794419', 'T', 'G', '', 'missense_variant', '', '0.29499998688697815', '0.2633855640888214', + ['1', '11794419', 'T', 'G', 'ENSG00000177000', 'missense_variant', '0.31111112236976624', '0.29499998688697815', '0.2633855640888214', '0.28899794816970825', '0.24615199863910675', '20.899999618530273', '0.19699999690055847', '2.000999927520752', '0.0', '', 'tolerated', '', 'damaging', 'rs1801131', 'ENST00000376585.6:c.1409A>C', - 'ENSP00000365770.1:p.Glu470Ala', '', '1', '', '2', '', '', '', '', '', 'HG00731', '2', '99', '1.0', + 'ENSP00000365770.1:p.Glu470Ala', 'Conflicting_interpretations_of_pathogenicity', '1', '', '2', '', '', '', '', '', 'HG00731', '2', '99', '1.0', 'HG00732', '0', '40', '0.0', 'HG00733', '1', '99', '0.625'], - ['1', '91502721', 'G', 'A', '', 'intron_variant', '', '0.0', '0.38041073083877563', '0.0', + ['1', '91502721', 'G', 'A', 'ENSG00000097046', 'intron_variant', '0.6666666865348816', '0.0', '0.38041073083877563', '0.0', '0.36268100142478943', '2.753999948501587', '', '1.378000020980835', '0.009999999776482582', '', '', '', '', 'rs13447464', 'ENST00000428239.5:c.115+890G>A', '', '', '', '', '2', '', '', '', '', '', 'HG00731', '1', '99', '1.0', 'HG00732', '0', '99', '0.4594594594594595', 'HG00733', '1', '99', '0.4074074074074074'], @@ -418,12 +418,12 @@ def test_query_variants(self, mock_get_variants, mock_get_gene_counts, mock_erro ['12', '48367227', 'TC', 'T', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '2', 'Known gene for phenotype (None)|Excluded (None)', 'a later note (None)|test n\xf8te (None)', '', '', '', '', '', '', '', '', '', '', '', '',], - ['1', '11794419', 'T', 'G', '', 'missense_variant', '', '0.29499998688697815', '0.2633855640888214', + ['1', '11794419', 'T', 'G', 'ENSG00000177000', 'missense_variant', '0.31111112236976624', '0.29499998688697815', '0.2633855640888214', '0.28899794816970825', '0.24615199863910675', '20.899999618530273', '0.19699999690055847', '2.000999927520752', '0.0', '', 'tolerated', '', 'damaging', 'rs1801131', 'ENST00000376585.6:c.1409A>C', - 'ENSP00000365770.1:p.Glu470Ala', '', '1', '', '2', '', '', 'HG00731', '2', '99', '1.0', + 'ENSP00000365770.1:p.Glu470Ala', 'Conflicting_interpretations_of_pathogenicity', '1', '', '2', '', '', 'HG00731', '2', '99', '1.0', 'HG00732', '0', '40', '0.0', 'HG00733', '1', '99', '0.625'], - ['1', '91502721', 'G', 'A', '', 'intron_variant', '', '0.0', '0.38041073083877563', '0.0', + ['1', '91502721', 'G', 'A', 'ENSG00000097046', 'intron_variant', '0.6666666865348816', '0.0', '0.38041073083877563', '0.0', '0.36268100142478943', '2.753999948501587', '', '1.378000020980835', '0.009999999776482582', '', '', '', '', 'rs13447464', 'ENST00000428239.5:c.115+890G>A', '', '', '', '', '2', '', '', 'HG00731', '1', '99', '1.0', 'HG00732', '0', '99', '0.4594594594594595', 'HG00733', '1', '99', diff --git a/ui/shared/components/panel/LoadWorkspaceDataForm.jsx b/ui/shared/components/panel/LoadWorkspaceDataForm.jsx index ae93ca5a7b..b134d57132 100644 --- a/ui/shared/components/panel/LoadWorkspaceDataForm.jsx +++ b/ui/shared/components/panel/LoadWorkspaceDataForm.jsx @@ -148,7 +148,7 @@ const onProjectCreateSuccess = (responseJson) => { const formatAddDataUrl = ({ projectGuid }) => (`/api/project/${projectGuid}/add_workspace_data`) -const onAddDataFromWorkspace = dispatch => (responseJson) => { +const onAddDataFromWorkspace = responseJson => (dispatch) => { dispatch({ type: RECEIVE_DATA, updatesById: responseJson }) }