Skip to content

Commit

Permalink
Merge pull request #3552 from broadinstitute/hail-backend-af-prefilter
Browse files Browse the repository at this point in the history
Hail backend - AF prefilter
  • Loading branch information
hanars authored Aug 16, 2023
2 parents a57e166 + ac69ae8 commit 4ca9ccc
Show file tree
Hide file tree
Showing 21 changed files with 57 additions and 17 deletions.
1 change: 1 addition & 0 deletions hail_search/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
},
}

PREFILTER_FREQ_CUTOFF = 0.01
PATH_FREQ_OVERRIDE_CUTOFF = 0.05
CLINVAR_PATH_FILTER = 'pathogenic'
CLINVAR_LIKELY_PATH_FILTER = 'likely_pathogenic'
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.109-b71b065e4bb6
Created at 2023/08/11 14:19:35
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
58 changes: 45 additions & 13 deletions hail_search/hail_search_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
VARIANT_KEY_FIELD, GNOMAD_GENOMES_FIELD, XPOS, GENOME_VERSION_GRCh38_DISPLAY, INHERITANCE_FILTERS, \
ANY_AFFECTED, X_LINKED_RECESSIVE, REF_REF, REF_ALT, COMP_HET_ALT, ALT_ALT, HAS_ALT, HAS_REF, \
ANNOTATION_OVERRIDE_FIELDS, SCREEN_KEY, SPLICE_AI_FIELD, CLINVAR_KEY, HGMD_KEY, CLINVAR_PATH_SIGNIFICANCES, \
CLINVAR_PATH_FILTER, CLINVAR_LIKELY_PATH_FILTER, CLINVAR_PATH_RANGES, HGMD_PATH_RANGES, PATH_FREQ_OVERRIDE_CUTOFF
CLINVAR_PATH_FILTER, CLINVAR_LIKELY_PATH_FILTER, CLINVAR_PATH_RANGES, HGMD_PATH_RANGES, PATH_FREQ_OVERRIDE_CUTOFF, \
PREFILTER_FREQ_CUTOFF

DATASETS_DIR = os.environ.get('DATASETS_DIR', '/hail_datasets')

Expand Down Expand Up @@ -229,9 +230,10 @@ def _filter_entries_table(self, ht, sample_data, inheritance_mode=None, inherita
excluded_intervals=None, variant_ids=None, **kwargs):
if excluded_intervals:
ht = hl.filter_intervals(ht, excluded_intervals, keep=False)

if variant_ids:
elif variant_ids:
ht = self._filter_variant_ids(ht, variant_ids)
elif not self._load_table_kwargs['_intervals']:
ht = self._prefilter_entries_table(ht, **kwargs)

ht, sample_id_family_index_map = self._add_entry_sample_families(ht, sample_data)

Expand Down Expand Up @@ -409,6 +411,9 @@ def _filter_variant_ids(self, ht, variant_ids):
variant_id_q |= q
return ht.filter(variant_id_q)

# Default prefilter step for the entries table: returns ``ht`` unchanged and ignores ``kwargs``.
# NOTE(review): a second definition with this name appears later in this diff, so this is
# presumably the base-class version that subclasses override - confirm against the full file.
def _prefilter_entries_table(self, ht, **kwargs):
return ht

def _filter_annotated_table(self, gene_ids=None, rs_ids=None, frequencies=None, in_silico=None, pathogenicity=None, annotations=None, **kwargs):
if gene_ids:
self._filter_by_gene_ids(gene_ids)
Expand Down Expand Up @@ -677,29 +682,56 @@ def _format_transcript_args(self):

def _get_family_passes_quality_filter(self, quality_filter, ht=None, pathogenicity=None, **kwargs):
passes_quality = super(VariantHailTableQuery, self)._get_family_passes_quality_filter(quality_filter)
clinvar_path_ht = False if passes_quality is None else self._get_clinvar_filter_ht(pathogenicity)
clinvar_path_ht = False if passes_quality is None else self._get_loaded_filter_ht(
CLINVAR_KEY, 'clinvar_path_variants.ht', self._get_clinvar_prefilter, pathogenicity=pathogenicity)
if not clinvar_path_ht:
return passes_quality

return lambda entries: hl.is_defined(clinvar_path_ht[ht.key]) | passes_quality(entries)

def _get_clinvar_filter_ht(self, pathogenicity):
if self._filter_hts.get(CLINVAR_KEY) is not None:
return self._filter_hts[CLINVAR_KEY]
# Return the filter table stored under ``key`` in ``self._filter_hts``, building it on first use.
#
# ``get_filters(**kwargs)`` decides what gets built:
#   False       -> no table is read; False is stored and returned
#   True        -> the table at ``table_path`` is read and stored without filtering
#   other value -> treated as a field name; the table is read and filtered on ``ht[ht_filter]``
#
# NOTE(review): indentation is not visible in this rendering, so the nesting described above
# (the store of ``ht`` belonging to the ``else`` branch) is inferred - confirm against the file.
def _get_loaded_filter_ht(self, key, table_path, get_filters, **kwargs):
# Only a missing/None entry triggers a build; a stored False is reused without calling get_filters again
if self._filter_hts.get(key) is None:
ht_filter = get_filters(**kwargs)
if ht_filter is False:
self._filter_hts[key] = False
else:
ht = self._read_table(table_path)
# True means "use every row"; anything else names the field to filter on
if ht_filter is not True:
ht = ht.filter(ht[ht_filter])
self._filter_hts[key] = ht

return self._filter_hts[key]

def _get_clinvar_prefilter(self, pathogenicity=None):
clinvar_path_filters = self._get_clinvar_path_filters(pathogenicity)
if not clinvar_path_filters:
self._filter_hts[CLINVAR_KEY] = False
return False

clinvar_path_ht = self._read_table('clinvar_path_variants.ht')
if CLINVAR_LIKELY_PATH_FILTER not in clinvar_path_filters:
clinvar_path_ht = clinvar_path_ht.filter(clinvar_path_ht.is_pathogenic)
return 'is_pathogenic'
elif CLINVAR_PATH_FILTER not in clinvar_path_filters:
clinvar_path_ht = clinvar_path_ht.filter(clinvar_path_ht.is_likely_pathogenic)
self._filter_hts[CLINVAR_KEY] = clinvar_path_ht
return 'is_likely_pathogenic'
return True

# Remove rows whose key is present in the loaded 'high_af_variants.ht' filter table.
# ``kwargs`` are forwarded to ``self._get_gnomad_af_prefilter`` (via ``_get_loaded_filter_ht``),
# which decides whether that table is used at all and which of its rows apply.
def _prefilter_entries_table(self, ht, **kwargs):
# The loaded table is stored under GNOMAD_GENOMES_FIELD
af_ht = self._get_loaded_filter_ht(
GNOMAD_GENOMES_FIELD, 'high_af_variants.ht', self._get_gnomad_af_prefilter, **kwargs)
# A falsy result (no prefilter applies) leaves ``ht`` untouched
if af_ht:
# Keep only rows with no match in af_ht
ht = ht.filter(hl.is_missing(af_ht[ht.key]))
return ht

def _get_gnomad_af_prefilter(self, frequencies=None, pathogenicity=None, **kwargs):
    """Decide how the high-AF variants table may be used to prefilter a search.

    Args:
        frequencies: optional mapping of frequency filters; only the
            GNOMAD_GENOMES_FIELD entry is read, using its 'af' and 'ac' values.
        pathogenicity: optional pathogenicity filter, passed to
            ``self._get_clinvar_path_filters``.
        **kwargs: ignored; accepted so the full search kwargs can be passed.

    Returns:
        False when no prefilter should be applied, True when every row of the
        high-AF table can be excluded, or the string 'is_gt_10_percent' when
        only rows flagged by that field can be excluded.
    """
    gnomad_genomes_filter = (frequencies or {}).get(GNOMAD_GENOMES_FIELD, {})
    af_cutoff = gnomad_genomes_filter.get('af')
    # An AC filter without an explicit AF filter falls back to the base prefilter cutoff
    if af_cutoff is None and gnomad_genomes_filter.get('ac') is not None:
        af_cutoff = PREFILTER_FREQ_CUTOFF
    if af_cutoff is None:
        return False

    # When ClinVar pathogenicity filters are active the cutoff is never
    # allowed to be stricter than PATH_FREQ_OVERRIDE_CUTOFF
    if self._get_clinvar_path_filters(pathogenicity):
        af_cutoff = max(af_cutoff, PATH_FREQ_OVERRIDE_CUTOFF)

    # NOTE(review): assumes 'is_gt_10_percent' flags variants with gnomAD AF
    # above 10%, as its name indicates - confirm against the table build.
    # Under that assumption the flag cannot stand in for a cutoff above 10%:
    # using it would drop variants with AF between 10% and the requested
    # cutoff, so no prefilter is applied in that case.
    flagged_af_threshold = 0.1
    if af_cutoff > flagged_af_threshold:
        return False

    return 'is_gt_10_percent' if af_cutoff > PREFILTER_FREQ_CUTOFF else True

def _get_gene_id_filter(self, gene_ids):
self._ht = self._ht.annotate(
Expand Down
12 changes: 8 additions & 4 deletions hail_search/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,15 +243,19 @@ async def test_frequency_filter(self):
)

await self._assert_expected_search(
[VARIANT1, VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.41}}, omit_sample_type='SV_WES',
[VARIANT1, VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.05}}, omit_sample_type='SV_WES',
)

await self._assert_expected_search(
[VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.41, 'hh': 1}}, omit_sample_type='SV_WES',
[VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.05, 'hh': 1}}, omit_sample_type='SV_WES',
)

await self._assert_expected_search(
[VARIANT1, VARIANT4], frequencies={'seqr': {'af': 0.2}, 'gnomad_genomes': {'af': 0.41}},
[VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.005}}, omit_sample_type='SV_WES',
)

await self._assert_expected_search(
[VARIANT4], frequencies={'seqr': {'af': 0.2}, 'gnomad_genomes': {'ac': 50}},
omit_sample_type='SV_WES',
)

Expand All @@ -263,7 +267,7 @@ async def test_frequency_filter(self):
annotations = {'splice_ai': '0.0'} # Ensures no variants are filtered out by annotation/path filters
await self._assert_expected_search(
[VARIANT1, VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.01}}, omit_sample_type='SV_WES',
annotations=annotations, pathogenicity={'clinvar': ['likely_pathogenic', 'vus_or_conflicting']},
annotations=annotations, pathogenicity={'clinvar': ['pathogenic', 'likely_pathogenic', 'vus_or_conflicting']},
)

await self._assert_expected_search(
Expand Down

0 comments on commit 4ca9ccc

Please sign in to comment.