Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hail backend - In-silico filter #3530

Merged
merged 12 commits into from
Aug 4, 2023
48 changes: 47 additions & 1 deletion hail_search/hail_search_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,12 @@ def _format_transcript_args(self):
'format_value': lambda value: value.rename({k: _to_camel_case(k) for k in value.keys()}),
}

def _get_enum_lookup(self, field, subfield):
    """Build a value -> index lookup for the enum stored at ``field``/``subfield``.

    Reads ``self._enums[field][subfield]``. Returns ``None`` when ``field`` is
    not in ``self._enums``, when ``subfield`` is not under it, or when the
    stored entry is itself ``None``; otherwise returns a dict mapping each
    enum value to its position in the stored sequence.
    """
    field_enums = self._enums.get(field, {})
    enum_values = field_enums.get(subfield)
    if enum_values is not None:
        return {value: index for index, value in enumerate(enum_values)}
    return None

@staticmethod
def _enum_field(value, enum, globals=None, annotate_value=None, format_value=None, drop_fields=None, **kwargs):
annotations = {}
Expand Down Expand Up @@ -377,9 +383,11 @@ def _filter_vcf_filters(ht):
def get_x_chrom_filter(ht, x_interval):
    """Return the result of testing whether ``ht.locus`` is inside ``x_interval``.

    Delegates entirely to ``x_interval.contains``; the return value is
    whatever that call produces for ``ht.locus``.
    """
    contains = x_interval.contains
    return contains(ht.locus)

def _filter_annotated_table(self, frequencies=None, **kwargs):
def _filter_annotated_table(self, frequencies=None, in_silico=None, **kwargs):
self._filter_by_frequency(frequencies)

self._filter_by_in_silico(in_silico)

def _filter_by_frequency(self, frequencies):
frequencies = {k: v for k, v in (frequencies or {}).items() if k in self.POPULATIONS}
if not frequencies:
Expand Down Expand Up @@ -411,6 +419,44 @@ def _filter_by_frequency(self, frequencies):
pop_filter &= pf
self._ht = self._ht.filter(hl.is_missing(pop_expr) | pop_filter)

def _filter_by_in_silico(self, in_silico_filters):
in_silico_filters = in_silico_filters or {}
require_score = in_silico_filters.get('requireScore', False)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using `includes_missing_score` instead of `requireScore` would be more understandable.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But this is passed from the UI, so it's not something we can change. I think it's clearer to name the variable what it's called in the UI code and in the JSON payload, rather than rename it to something else that means the same thing.

in_silico_filters = {
k: v for k, v in in_silico_filters.items()
if k in self.PREDICTION_FIELDS_CONFIG and v is not None and len(v) != 0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is `v is not None and len(v) != 0` equivalent to just `v`?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, this was copy-pasted from the Elasticsearch code, but it does not seem to be necessary.

}
if not in_silico_filters:
return

in_silico_qs = []
missing_qs = []
for in_silico, value in in_silico_filters.items():
score_path = self.PREDICTION_FIELDS_CONFIG[in_silico]
enum_lookup = self._get_enum_lookup(*score_path)
if enum_lookup is not None:
ht_value = self._ht[score_path.source][f'{score_path.field}_id']
score_filter = ht_value == enum_lookup[value]
else:
ht_value = self._ht[score_path.source][score_path.field]
score_filter = ht_value >= float(value)

in_silico_qs.append(score_filter)
if not require_score:
missing_qs.append(hl.is_missing(ht_value))

if missing_qs:
missing_q = missing_qs[0]
for q in missing_qs[1:]:
missing_q &= q
in_silico_qs.append(missing_q)

in_silico_q = in_silico_qs[0]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this block be replaced by hl.any(in_silico_qs)?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Much cleaner 🎉!

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like I was frustrated that there was no easy way to do this in hail, and now I'm like "oh duh, of course there is"

for q in in_silico_qs[1:]:
in_silico_q |= q

self._ht = self._ht.filter(in_silico_q)

def _format_results(self, ht):
annotations = {k: v(ht) for k, v in self.annotation_fields.items()}
annotations.update({
Expand Down
11 changes: 11 additions & 0 deletions hail_search/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,17 @@ async def test_frequency_filter(self):
omit_sample_type='SV_WES',
)

async def test_in_silico_filter(self):
    """Search with in-silico filters, first without and then with ``requireScore``.

    With only the ``eigen`` and ``mut_taster`` filters the search is expected
    to return VARIANT1, VARIANT2 and VARIANT4. Once ``requireScore`` is set to
    True on the same filter dict, VARIANT1 is no longer expected.
    """
    score_filters = {'eigen': '5.5', 'mut_taster': 'P'}
    expected_without_required_score = [VARIANT1, VARIANT2, VARIANT4]
    await self._assert_expected_search(
        expected_without_required_score, in_silico=score_filters, omit_sample_type='SV_WES',
    )

    # Reuse the same dict so the second search differs only by the flag.
    # NOTE(review): VARIANT1 presumably drops out because it lacks one of the scores; confirm against fixtures.
    score_filters.update(requireScore=True)
    expected_with_required_score = [VARIANT2, VARIANT4]
    await self._assert_expected_search(
        expected_with_required_score, in_silico=score_filters, omit_sample_type='SV_WES',
    )

async def test_search_missing_data(self):
search_body = get_hail_search_body(sample_data=FAMILY_2_MISSING_SAMPLE_DATA)
async with self.client.request('POST', '/search', json=search_body) as resp:
Expand Down
Loading