Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hail backend - In-silico filter #3530

Merged
merged 12 commits into from
Aug 4, 2023
38 changes: 37 additions & 1 deletion hail_search/hail_search_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,12 @@ def _format_transcript_args(self):
'format_value': lambda value: value.rename({k: _to_camel_case(k) for k in value.keys()}),
}

def _get_enum_lookup(self, field, subfield):
enum_field = self._enums.get(field, {}).get(subfield)
if enum_field is None:
return None
return {v: i for i, v in enumerate(enum_field)}

@staticmethod
def _enum_field(value, enum, ht_globals=None, annotate_value=None, format_value=None, drop_fields=None, **kwargs):
annotations = {}
Expand Down Expand Up @@ -404,7 +410,7 @@ def _filter_variant_ids(self, ht, variant_ids):
variant_id_q |= q
return ht.filter(variant_id_q)

def _filter_annotated_table(self, gene_ids=None, rs_ids=None, frequencies=None, **kwargs):
def _filter_annotated_table(self, gene_ids=None, rs_ids=None, frequencies=None, in_silico=None, **kwargs):
if gene_ids:
self._filter_by_gene_ids(gene_ids)

Expand All @@ -413,6 +419,8 @@ def _filter_annotated_table(self, gene_ids=None, rs_ids=None, frequencies=None,

self._filter_by_frequency(frequencies)

self._filter_by_in_silico(in_silico)

def _filter_by_gene_ids(self, gene_ids):
gene_ids = hl.set(gene_ids)
self._ht = self._ht.filter(self._ht.sorted_transcript_consequences.any(lambda t: gene_ids.contains(t.gene_id)))
Expand Down Expand Up @@ -484,6 +492,34 @@ def _filter_by_frequency(self, frequencies):
pop_filter &= pf
self._ht = self._ht.filter(hl.is_missing(pop_expr) | pop_filter)

def _filter_by_in_silico(self, in_silico_filters):
in_silico_filters = in_silico_filters or {}
require_score = in_silico_filters.get('requireScore', False)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using includes_missing_score for the requireScore is more understandable.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but this is passed from the ui so its not something we can change. I think its clearer to name the variable what its called in the UI code and what its passed in the json as rather than to rename it to something else that means the same thing

in_silico_filters = {k: v for k, v in in_silico_filters.items() if k in self.PREDICTION_FIELDS_CONFIG and v}
if not in_silico_filters:
return

in_silico_qs = []
missing_qs = []
for in_silico, value in in_silico_filters.items():
score_path = self.PREDICTION_FIELDS_CONFIG[in_silico]
enum_lookup = self._get_enum_lookup(*score_path)
if enum_lookup is not None:
ht_value = self._ht[score_path.source][f'{score_path.field}_id']
score_filter = ht_value == enum_lookup[value]
else:
ht_value = self._ht[score_path.source][score_path.field]
score_filter = ht_value >= float(value)

in_silico_qs.append(score_filter)
if not require_score:
missing_qs.append(hl.is_missing(ht_value))

if missing_qs:
in_silico_qs.append(hl.all(missing_qs))

self._ht = self._ht.filter(hl.any(in_silico_qs))

def _format_results(self, ht):
annotations = {k: v(ht) for k, v in self.annotation_fields().items()}
annotations.update({
Expand Down
11 changes: 11 additions & 0 deletions hail_search/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,17 @@ async def test_frequency_filter(self):
omit_sample_type='SV_WES',
)

async def test_in_silico_filter(self):
in_silico = {'eigen': '5.5', 'mut_taster': 'P'}
await self._assert_expected_search(
[VARIANT1, VARIANT2, VARIANT4], in_silico=in_silico, omit_sample_type='SV_WES',
)

in_silico['requireScore'] = True
await self._assert_expected_search(
[VARIANT2, VARIANT4], in_silico=in_silico, omit_sample_type='SV_WES',
)

async def test_search_errors(self):
search_body = get_hail_search_body(sample_data=FAMILY_2_MISSING_SAMPLE_DATA)
async with self.client.request('POST', '/search', json=search_body) as resp:
Expand Down
Loading