Skip to content

Commit

Permalink
Merge pull request #3520 from broadinstitute/hail-backend-filter-quality
Browse files Browse the repository at this point in the history
Hail backend filter quality
  • Loading branch information
hanars authored Aug 2, 2023
2 parents bc7a21c + 480525d commit d34d6fc
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 1 deletion.
59 changes: 58 additions & 1 deletion hail_search/hail_search_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@


# Lightweight record pairing a `source` name with a `field` name.
PredictionPath = namedtuple('PredictionPath', ('source', 'field'))

# Optional per-field tweaks applied when building genotype quality filters:
#   scale    - when truthy, the requested threshold is divided by this number
#   override - when truthy, a callable on a genotype whose result is OR-ed into
#              the pass/fail decision for the field
# Both members default to None, so `QualityFilterFormat()` means "no tweaks".
QualityFilterFormat = namedtuple(
    'QualityFilterFormat',
    ('scale', 'override'),
    defaults=(None, None),
)


def _to_camel_case(snake_case_str):
Expand All @@ -33,6 +34,7 @@ class BaseHailTableQuery(object):
}

GENOTYPE_FIELDS = {}
QUALITY_FILTER_FORMAT = {}
POPULATIONS = {}
POPULATION_FIELDS = {}
POPULATION_KEYS = ['AF', 'AC', 'AN', 'Hom', 'Hemi', 'Het']
Expand Down Expand Up @@ -202,13 +204,25 @@ def import_filtered_table(self, data_type, sample_data, **kwargs):

return ht

def _filter_entries_table(self, ht, sample_data, inheritance_mode=None, inheritance_filter=None, **kwargs):
def _filter_entries_table(self, ht, sample_data, inheritance_mode=None, inheritance_filter=None, quality_filter=None,
                          **kwargs):
    """Apply the inheritance and quality filters to an entries table.

    Steps, in order:
      1. attach per-family sample entries via ``_add_entry_sample_families``;
      2. apply ``_filter_inheritance``;
      3. if ``quality_filter['vcf_filter']`` is truthy, apply ``_filter_vcf_filters``;
      4. if any genotype-level quality predicate was built, blank out (set to
         missing) every family whose entries do not all pass it, then drop rows
         in which no family remains defined.

    ``quality_filter`` may be ``None``/empty, in which case steps 3 and 4 are
    no-ops. Extra ``kwargs`` are accepted and ignored here.

    Returns the filtered table after ``select_globals()`` with no arguments.
    """
    ht, sample_id_family_index_map = self._add_entry_sample_families(ht, sample_data)
    ht = self._filter_inheritance(
        ht, inheritance_mode, inheritance_filter, sample_data, sample_id_family_index_map,
    )

    if not quality_filter:
        quality_filter = {}

    if quality_filter.get('vcf_filter'):
        ht = self._filter_vcf_filters(ht)

    genotype_passes = self._get_genotype_passes_quality_filter(quality_filter)
    if genotype_passes is None:
        # No genotype-level thresholds were requested.
        return ht.select_globals()

    def keep_family_if_all_pass(entries):
        # A family survives only when every one of its entries passes.
        return hl.or_missing(entries.all(genotype_passes), entries)

    ht = ht.annotate(family_entries=ht.family_entries.map(keep_family_if_all_pass))
    # Discard rows where every family was blanked out above.
    ht = ht.filter(ht.family_entries.any(hl.is_defined))
    return ht.select_globals()

@classmethod
Expand Down Expand Up @@ -317,6 +331,46 @@ def _valid_genotype_family_entries(cls, entries, gentoype_entry_indices, genotyp
)
return hl.or_missing(is_valid, entries)

@classmethod
def _get_genotype_passes_quality_filter(cls, quality_filter):
    """Build one predicate combining every requested genotype quality threshold.

    Each key of ``quality_filter`` has ``'min_'`` removed and is looked up in
    ``cls.GENOTYPE_FIELDS``; keys that do not resolve to a field, or whose
    value is falsy, are ignored. ``quality_filter['affected_only']`` is
    forwarded to every per-field predicate.

    Returns ``None`` when no per-field predicate applies; otherwise a callable
    taking a genotype and returning the ``&`` of all per-field results.
    """
    affected_only = quality_filter.get('affected_only')

    # Lazily resolve each filter key to its genotype field (or None).
    resolved = (
        (cls.GENOTYPE_FIELDS.get(key.replace('min_', '')), threshold)
        for key, threshold in quality_filter.items()
    )
    field_checks = [
        cls._get_genotype_passes_quality_field(genotype_field, threshold, affected_only)
        for genotype_field, threshold in resolved
        if genotype_field and threshold
    ]

    if not field_checks:
        return None

    def passes_quality(gt):
        first_check, *other_checks = field_checks
        combined = first_check(gt)
        for check in other_checks:
            combined &= check(gt)
        return combined

    return passes_quality

@classmethod
def _get_genotype_passes_quality_field(cls, field, value, affected_only):
    """Build the predicate for a single genotype quality field.

    ``value`` is the requested minimum. If ``cls.QUALITY_FILTER_FORMAT`` has a
    config for ``field`` with a truthy ``scale``, the minimum is divided by it
    before comparison.

    The returned callable takes a genotype ``gt`` and passes it when any of
    these hold:
      * ``gt[field]`` is at least the (scaled) minimum;
      * ``gt[field]`` is missing;
      * the field's configured ``override`` callable, if any, says so;
      * ``affected_only`` is truthy and ``gt.affected_id == UNAFFECTED_ID``.
    """
    field_config = cls.QUALITY_FILTER_FORMAT.get(field) or QualityFilterFormat()
    threshold = value / field_config.scale if field_config.scale else value

    def passes_quality_field(gt):
        passes = (gt[field] >= threshold) | hl.is_missing(gt[field])
        if field_config.override:
            passes |= field_config.override(gt)
        if affected_only:
            passes |= gt.affected_id == UNAFFECTED_ID
        return passes

    return passes_quality_field

@staticmethod
def _filter_vcf_filters(ht):
    """Keep only rows whose ``filters`` field is missing or has length below 1."""
    filters_missing = hl.is_missing(ht.filters)
    filters_empty = ht.filters.length() < 1
    return ht.filter(filters_missing | filters_empty)

@staticmethod
def get_x_chrom_filter(ht, x_interval):
    """Return the result of asking ``x_interval`` whether it contains ``ht.locus``."""
    locus = ht.locus
    return x_interval.contains(locus)
Expand Down Expand Up @@ -351,6 +405,9 @@ def _get_sort_expressions(self, ht, sort):
class VariantHailTableQuery(BaseHailTableQuery):

GENOTYPE_FIELDS = {f.lower(): f for f in ['DP', 'GQ', 'AB']}
QUALITY_FILTER_FORMAT = {
'AB': QualityFilterFormat(override=lambda gt: ~gt.GT.is_het(), scale=100),
}
POPULATIONS = {
'seqr': {'hom': 'hom', 'hemi': None, 'het': None},
'topmed': {'hemi': None},
Expand Down
31 changes: 31 additions & 0 deletions hail_search/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,37 @@ async def test_inheritance_filter(self):
await self._assert_expected_search(
[VARIANT2, VARIANT3], inheritance_filter=gt_inheritance_filter, sample_data=FAMILY_2_VARIANT_SAMPLE_DATA)

async def test_quality_filter(self):
    """Run a series of searches with different ``quality_filter`` payloads.

    Each case pairs the expected variants with the quality filter sent; every
    search is issued with ``omit_sample_type='SV_WES'``. Cases run sequentially
    in the order listed.
    """
    cases = [
        # VCF filter only
        ([VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT], {'vcf_filter': 'pass'}),
        # GQ threshold, alone and combined with the VCF filter
        ([VARIANT2, MULTI_FAMILY_VARIANT], {'min_gq': 40}),
        ([VARIANT2, MULTI_FAMILY_VARIANT], {'min_gq': 40, 'vcf_filter': 'pass'}),
        # GQ threshold with the affected_only flag set
        ([VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT], {'min_gq': 60, 'affected_only': True}),
        # AB threshold, alone and with the affected_only flag set
        ([VARIANT1, VARIANT2, FAMILY_3_VARIANT], {'min_ab': 50}),
        ([VARIANT2, VARIANT3], {'min_ab': 70, 'affected_only': True}),
        # GQ and AB thresholds together
        ([VARIANT2, FAMILY_3_VARIANT], {'min_gq': 40, 'min_ab': 50}),
    ]

    for expected_variants, quality_filter in cases:
        await self._assert_expected_search(
            expected_variants, quality_filter=quality_filter, omit_sample_type='SV_WES',
        )

async def test_search_missing_data(self):
search_body = get_hail_search_body(sample_data=FAMILY_2_MISSING_SAMPLE_DATA)
async with self.client.request('POST', '/search', json=search_body) as resp:
Expand Down

0 comments on commit d34d6fc

Please sign in to comment.