From 26a3c91ee6c55aee1786814afb0853eb0fc8eb1d Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 31 Jul 2023 15:18:32 -0400 Subject: [PATCH 1/6] add in silico filtering --- hail_search/hail_search_query.py | 60 +++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index 78d6467821..219a00c8a3 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -51,10 +51,17 @@ def _format_population_config(cls, pop_config): base_pop_config.update(pop_config) return base_pop_config + @property + def ht_globals(self): + return {k: hl.eval(self._ht[k]) for k in self.GLOBALS if k != 'enums'} + + @property + def enums(self): + return hl.eval(self._ht.enums) + @property def annotation_fields(self): - ht_globals = {k: hl.eval(self._ht[k]) for k in self.GLOBALS} - enums = ht_globals.pop('enums') + enums = self.enums annotation_fields = { 'populations': lambda r: hl.struct(**{ @@ -73,7 +80,7 @@ def annotation_fields(self): } annotation_fields.update(self.BASE_ANNOTATION_FIELDS) - format_enum = lambda k, enum_config: lambda r: self._enum_field(r[k], enums[k], ht_globals=ht_globals, **enum_config) + format_enum = lambda k, enum_config: lambda r: self._enum_field(r[k], enums[k], ht_globals=self.ht_globals, **enum_config) annotation_fields.update({ enum_config.get('response_key', k): format_enum(k, enum_config) for k, enum_config in self.ENUM_ANNOTATION_FIELDS.items() @@ -96,6 +103,12 @@ def _format_transcript_args(self): 'format_value': lambda value: value.rename({k: _to_camel_case(k) for k in value.keys()}), } + def _get_enum_lookup(self, field, subfield): + enum_field = self.enums.get(field, {}).get(subfield) + if enum_field is None: + return None + return {v: i for i, v in enumerate(enum_field)} + @staticmethod def _enum_field(value, enum, ht_globals=None, annotate_value=None, format_value=None, drop_fields=None, **kwargs): annotations = {} @@ -250,9 +263,11 @@ def _missing_entry(entry): entry_type = dict(**entry.dtype) return hl.struct(**{k: hl.missing(v) for k, v in entry_type.items()}) - def _filter_annotated_table(self, frequencies=None, **kwargs): + def _filter_annotated_table(self, frequencies=None, in_silico=None, **kwargs): self._filter_by_frequency(frequencies) + self._filter_by_in_silico(in_silico) + def _filter_by_frequency(self, frequencies): frequencies = {k: v for k, v in (frequencies or {}).items() if k in self.POPULATIONS} if not frequencies: @@ -284,6 +299,43 @@ def _filter_by_frequency(self, frequencies): pop_filter &= pf self._ht = self._ht.filter(hl.is_missing(pop_expr) | pop_filter) + def _filter_by_in_silico(self, in_silico_filters): + require_score = in_silico_filters.get('requireScore', False) + in_silico_filters = { + k: v for k, v in (in_silico_filters or {}).items() + if k in self.PREDICTION_FIELDS_CONFIG and v is not None and len(v) != 0 + } + if not in_silico_filters: + return + + in_silico_qs = [] + missing_qs = [] + for in_silico, value in in_silico_filters.items(): + score_path = self.PREDICTION_FIELDS_CONFIG[in_silico] + enum_lookup = self._get_enum_lookup(*score_path) + if enum_lookup is not None: + ht_value = self._ht[score_path.source][f'{score_path.field}_id'] + score_filter = ht_value == enum_lookup[value] + else: + ht_value = self._ht[score_path.source][score_path.field] + score_filter = ht_value >= float(value) + + in_silico_qs.append(score_filter) + if not require_score: + missing_qs.append(hl.is_missing(ht_value)) + + if missing_qs: + missing_q = missing_qs[0] + for q in missing_qs[1:]: + missing_q &= q + in_silico_qs.append(missing_q) + + in_silico_q = in_silico_qs[0] + for q in in_silico_qs[1:]: + in_silico_q |= q + + self._ht = self._ht.filter(in_silico_q) + def _format_results(self, ht): annotations = {k: v(ht) for k, v in self.annotation_fields.items()} annotations.update({ From 693d04c4cef39de261893ef15d44e2f07a7464e4 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 31 Jul 2023 17:33:12 -0400 Subject: [PATCH 2/6] unit tests --- hail_search/hail_search_query.py | 3 ++- hail_search/test_search.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index 219a00c8a3..dcf4756e11 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -300,9 +300,10 @@ def _filter_by_frequency(self, frequencies): self._ht = self._ht.filter(hl.is_missing(pop_expr) | pop_filter) def _filter_by_in_silico(self, in_silico_filters): + in_silico_filters = in_silico_filters or {} require_score = in_silico_filters.get('requireScore', False) in_silico_filters = { - k: v for k, v in (in_silico_filters or {}).items() + k: v for k, v in in_silico_filters.items() if k in self.PREDICTION_FIELDS_CONFIG and v is not None and len(v) != 0 } if not in_silico_filters: diff --git a/hail_search/test_search.py b/hail_search/test_search.py index 019f6e58aa..f34a7844f4 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -147,6 +147,17 @@ async def test_frequency_filter(self): omit_sample_type='SV_WES', ) + async def test_in_silico_filter(self): + in_silico = {'eigen': '5.5', 'mut_taster': 'P'} + await self._assert_expected_search( + [VARIANT1, VARIANT2, VARIANT4], in_silico=in_silico, omit_sample_type='SV_WES', + ) + + in_silico['requireScore'] = True + await self._assert_expected_search( + [VARIANT2, VARIANT4], in_silico=in_silico, omit_sample_type='SV_WES', + ) + async def test_search_missing_data(self): search_body = get_hail_search_body(sample_data=FAMILY_2_MISSING_SAMPLE_DATA) async with self.client.request('POST', '/search', json=search_body) as resp: From 0b94b5868d0217608f98643094396224257d003f Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 2 Aug 2023 18:08:10 -0400 Subject: [PATCH 3/6] pr feedback --- hail_search/hail_search_query.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index c551cb9076..ff73fadcac 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -422,10 +422,7 @@ def _filter_by_frequency(self, frequencies): def _filter_by_in_silico(self, in_silico_filters): in_silico_filters = in_silico_filters or {} require_score = in_silico_filters.get('requireScore', False) - in_silico_filters = { - k: v for k, v in in_silico_filters.items() - if k in self.PREDICTION_FIELDS_CONFIG and v is not None and len(v) != 0 - } + in_silico_filters = {k: v for k, v in in_silico_filters.items() if k in self.PREDICTION_FIELDS_CONFIG and v} if not in_silico_filters: return From 794fa4a39bcc020da350f422d3e2359e9f614ad6 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 3 Aug 2023 16:13:05 -0400 Subject: [PATCH 4/6] fix merge --- hail_search/hail_search_query.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index e6220d449d..413161eeab 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -417,6 +417,8 @@ def _filter_annotated_table(self, gene_ids=None, rs_ids=None, frequencies=None, if rs_ids: self._filter_rs_ids(rs_ids) + self._filter_by_frequency(frequencies) + self._filter_by_in_silico(in_silico) def _filter_by_gene_ids(self, gene_ids): From df30e8e3d28b17fca686603d79c6c57d7e6b40f1 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Fri, 4 Aug 2023 16:02:49 -0400 Subject: [PATCH 5/6] use hl.any --- hail_search/hail_search_query.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index 413161eeab..15b055a1da 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -521,11 +521,7 @@ def _filter_by_in_silico(self, in_silico_filters): missing_q &= q in_silico_qs.append(missing_q) - in_silico_q = in_silico_qs[0] - for q in in_silico_qs[1:]: - in_silico_q |= q - - self._ht = self._ht.filter(in_silico_q) + self._ht = self._ht.filter(hl.any(in_silico_qs)) def _format_results(self, ht): annotations = {k: v(ht) for k, v in self.annotation_fields().items()} From f0aed20e80409a0461c27aa05b04336677cae310 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Fri, 4 Aug 2023 16:05:04 -0400 Subject: [PATCH 6/6] use hl.all --- hail_search/hail_search_query.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index 15b055a1da..def9bbf10e 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -516,10 +516,7 @@ def _filter_by_in_silico(self, in_silico_filters): missing_qs.append(hl.is_missing(ht_value)) if missing_qs: - missing_q = missing_qs[0] - for q in missing_qs[1:]: - missing_q &= q - in_silico_qs.append(missing_q) + in_silico_qs.append(hl.all(missing_qs)) self._ht = self._ht.filter(hl.any(in_silico_qs))