Merge pull request #3540 from broadinstitute/hail-backend-clinvar-overrides

Hail backend - clinvar overrides
hanars authored Aug 9, 2023
2 parents ee43165 + 3156430 commit a81c997
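
In brief (summarizing the diff below): this change adds ClinVar-based overrides to the Hail search backend. Variants whose ClinVar significance is pathogenic or likely_pathogenic may pass population-frequency filters up to the new PATH_FREQ_OVERRIDE_CUTOFF of 0.05 even when a stricter AF threshold was requested, and such variants (looked up in a clinvar_path_variants.ht table) also bypass the family genotype-quality filter. The sketch below is an illustrative plain-Python restatement of the frequency-override condition, not code from the diff; passes_af_filter and is_clinvar_path are made-up names.

PATH_FREQ_OVERRIDE_CUTOFF = 0.05  # mirrors hail_search/constants.py

def passes_af_filter(af, requested_af, is_clinvar_path):
    # A variant passes if its population AF is within the requested threshold...
    if af <= requested_af:
        return True
    # ...or, for ClinVar pathogenic/likely_pathogenic variants, if the requested
    # threshold is stricter than the override cutoff and the AF is within the cutoff.
    if is_clinvar_path and requested_af < PATH_FREQ_OVERRIDE_CUTOFF:
        return af <= PATH_FREQ_OVERRIDE_CUTOFF
    return False

# Example: a ClinVar-pathogenic variant at AF 0.03 passes a 0.01 AF filter,
# while a non-pathogenic variant at the same AF does not.
assert passes_af_filter(0.03, 0.01, is_clinvar_path=True)
assert not passes_af_filter(0.03, 0.01, is_clinvar_path=False)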
Showing 39 changed files with 126 additions and 40 deletions.
8 changes: 6 additions & 2 deletions hail_search/constants.py
@@ -52,9 +52,13 @@
},
}

PATH_FREQ_OVERRIDE_CUTOFF = 0.05
CLINVAR_PATH_FILTER = 'pathogenic'
CLINVAR_LIKELY_PATH_FILTER = 'likely_pathogenic'
CLINVAR_PATH_SIGNIFICANCES = {CLINVAR_PATH_FILTER, CLINVAR_LIKELY_PATH_FILTER}
CLINVAR_PATH_RANGES = [
('pathogenic', 'Pathogenic', 'Pathogenic/Likely_risk_allele'),
('likely_pathogenic', 'Pathogenic/Likely_pathogenic', 'Likely_risk_allele'),
(CLINVAR_PATH_FILTER, 'Pathogenic', 'Pathogenic/Likely_risk_allele'),
(CLINVAR_LIKELY_PATH_FILTER, 'Pathogenic/Likely_pathogenic', 'Likely_risk_allele'),
('vus_or_conflicting', 'Conflicting_interpretations_of_pathogenicity', 'No_pathogenic_assertion'),
('likely_benign', 'Likely_benign', 'Benign/Likely_benign'),
('benign', 'Benign/Likely_benign', 'Benign'),
Binary file not shown.
Binary file not shown.
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.109-b71b065e4bb6
Created at 2023/08/04 12:47:37
Created at 2023/08/07 11:46:24
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.109-b71b065e4bb6
Created at 2023/08/04 13:56:14
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
118 changes: 87 additions & 31 deletions hail_search/hail_search_query.py
@@ -7,7 +7,8 @@
from hail_search.constants import AFFECTED, UNAFFECTED, AFFECTED_ID, UNAFFECTED_ID, MALE, VARIANT_DATASET, \
VARIANT_KEY_FIELD, GNOMAD_GENOMES_FIELD, XPOS, GENOME_VERSION_GRCh38_DISPLAY, INHERITANCE_FILTERS, \
ANY_AFFECTED, X_LINKED_RECESSIVE, REF_REF, REF_ALT, COMP_HET_ALT, ALT_ALT, HAS_ALT, HAS_REF, \
ANNOTATION_OVERRIDE_FIELDS, SCREEN_KEY, SPLICE_AI_FIELD, CLINVAR_KEY, HGMD_KEY, CLINVAR_PATH_RANGES, HGMD_PATH_RANGES
ANNOTATION_OVERRIDE_FIELDS, SCREEN_KEY, SPLICE_AI_FIELD, CLINVAR_KEY, HGMD_KEY, CLINVAR_PATH_SIGNIFICANCES, \
CLINVAR_PATH_FILTER, CLINVAR_LIKELY_PATH_FILTER, CLINVAR_PATH_RANGES, HGMD_PATH_RANGES, PATH_FREQ_OVERRIDE_CUTOFF

DATASETS_DIR = os.environ.get('DATASETS_DIR', '/hail_datasets')

@@ -142,42 +143,46 @@ def __init__(self, data_type, sample_data, genome_version, sort=XPOS, num_result
self._genome_version = genome_version
self._sort = sort
self._num_results = num_results
self._data_type = data_type
self._ht = None
self._enums = None
self._globals = None

self._load_filtered_table(data_type, sample_data, **kwargs)
self._load_filtered_table(sample_data, **kwargs)

def _load_filtered_table(self, data_type, sample_data, intervals=None, exclude_intervals=False, variant_ids=None, **kwargs):
def _load_filtered_table(self, sample_data, intervals=None, exclude_intervals=False, variant_ids=None, **kwargs):
parsed_intervals, variant_ids = self._parse_intervals(intervals, variant_ids)
excluded_intervals = None
if exclude_intervals:
excluded_intervals = parsed_intervals
parsed_intervals = None
self._load_table_kwargs = {'_intervals': parsed_intervals, '_filter_intervals': bool(parsed_intervals)}
self.import_filtered_table(
data_type, sample_data, intervals=parsed_intervals, excluded_intervals=excluded_intervals,
variant_ids=variant_ids, **kwargs)
sample_data, excluded_intervals=excluded_intervals, variant_ids=variant_ids, **kwargs)

def import_filtered_table(self, data_type, sample_data, intervals=None, **kwargs):
tables_path = f'{DATASETS_DIR}/{self._genome_version}/{data_type}'
load_table_kwargs = {'_intervals': intervals, '_filter_intervals': bool(intervals)}
def _get_table_path(self, path):
return f'{DATASETS_DIR}/{self._genome_version}/{self._data_type}/{path}'

def _read_table(self, path):
return hl.read_table(self._get_table_path(path), **self._load_table_kwargs)

def import_filtered_table(self, sample_data, **kwargs):
family_samples = defaultdict(list)
project_samples = defaultdict(list)
for s in sample_data:
family_samples[s['family_guid']].append(s)
project_samples[s['project_guid']].append(s)

logger.info(f'Loading {data_type} data for {len(family_samples)} families in {len(project_samples)} projects')
logger.info(f'Loading {self._data_type} data for {len(family_samples)} families in {len(project_samples)} projects')
if len(family_samples) == 1:
family_guid, family_sample_data = list(family_samples.items())[0]
family_ht = hl.read_table(f'{tables_path}/families/{family_guid}.ht', **load_table_kwargs)
family_ht = self._read_table(f'families/{family_guid}.ht')
families_ht = self._filter_entries_table(family_ht, family_sample_data, **kwargs)
else:
filtered_project_hts = []
exception_messages = set()
for project_guid, project_sample_data in project_samples.items():
project_ht = hl.read_table(f'{tables_path}/projects/{project_guid}.ht', **load_table_kwargs)
project_ht = self._read_table(f'projects/{project_guid}.ht')
try:
filtered_project_hts.append(self._filter_entries_table(project_ht, project_sample_data, **kwargs))
except HTTPBadRequest as e:
@@ -199,7 +204,7 @@ def import_filtered_table(self, data_type, sample_data, intervals=None, **kwargs
),
)

annotations_ht_path = f'{tables_path}/annotations.ht'
annotations_ht_path = self._get_table_path('annotations.ht')
annotation_ht_query_result = hl.query_table(
annotations_ht_path, families_ht.key).first().drop(*families_ht.key)
ht = families_ht.annotate(**annotation_ht_query_result)
@@ -220,7 +225,7 @@ def import_filtered_table(self, data_type, sample_data, intervals=None, **kwargs
)
self._filter_annotated_table(**kwargs)

def _filter_entries_table(self, ht, sample_data, inheritance_mode=None, inheritance_filter=None, quality_filter=None,
def _filter_entries_table(self, ht, sample_data, inheritance_mode=None, inheritance_filter=None, quality_filter=None,
excluded_intervals=None, variant_ids=None, **kwargs):
if excluded_intervals:
ht = hl.filter_intervals(ht, excluded_intervals, keep=False)
@@ -238,10 +243,10 @@ def _filter_entries_table(self, ht, sample_data, inheritance_mode=None, inherit
if quality_filter.get('vcf_filter'):
ht = self._filter_vcf_filters(ht)

passes_quality_filter = self._get_genotype_passes_quality_filter(quality_filter)
passes_quality_filter = self._get_family_passes_quality_filter(quality_filter, ht=ht, **kwargs)
if passes_quality_filter is not None:
ht = ht.annotate(family_entries=ht.family_entries.map(
lambda entries: hl.or_missing(entries.all(passes_quality_filter), entries)
lambda entries: hl.or_missing(passes_quality_filter(entries), entries)
))
ht = ht.filter(ht.family_entries.any(hl.is_defined))

@@ -353,19 +358,18 @@ def _valid_genotype_family_entries(cls, entries, gentoype_entry_indices, genotyp
)
return hl.or_missing(is_valid, entries)

@classmethod
def _get_genotype_passes_quality_filter(cls, quality_filter):
def _get_family_passes_quality_filter(self, quality_filter, **kwargs):
affected_only = quality_filter.get('affected_only')
passes_quality_filters = []
for filter_k, value in quality_filter.items():
field = cls.GENOTYPE_FIELDS.get(filter_k.replace('min_', ''))
field = self.GENOTYPE_FIELDS.get(filter_k.replace('min_', ''))
if field and value:
passes_quality_filters.append(cls._get_genotype_passes_quality_field(field, value, affected_only))
passes_quality_filters.append(self._get_genotype_passes_quality_field(field, value, affected_only))

if not passes_quality_filters:
return None

return lambda gt: hl.all([f(gt) for f in passes_quality_filters])
return lambda entries: entries.all(lambda gt: hl.all([f(gt) for f in passes_quality_filters]))

@classmethod
def _get_genotype_passes_quality_field(cls, field, value, affected_only):
@@ -412,7 +416,7 @@ def _filter_annotated_table(self, gene_ids=None, rs_ids=None, frequencies=None,
if rs_ids:
self._filter_rs_ids(rs_ids)

self._filter_by_frequency(frequencies)
self._filter_by_frequency(frequencies, pathogenicity)

self._filter_by_in_silico(in_silico)

@@ -458,18 +462,23 @@ def _parse_intervals(self, intervals, variant_ids):

return parsed_intervals, variant_ids

def _filter_by_frequency(self, frequencies):
def _filter_by_frequency(self, frequencies, pathogenicity):
frequencies = {k: v for k, v in (frequencies or {}).items() if k in self.POPULATIONS}
if not frequencies:
return

path_override_filter = self._frequency_override_filter(pathogenicity)
filters = []
for pop, freqs in sorted(frequencies.items()):
pop_filters = []
pop_expr = self._ht[self.POPULATION_FIELDS.get(pop, pop)]
pop_config = self._format_population_config(self.POPULATIONS[pop])
if freqs.get('af') is not None:
af_field = pop_config.get('filter_af') or pop_config['af']
pop_filters.append(pop_expr[af_field] <= freqs['af'])
pop_filter = pop_expr[af_field] <= freqs['af']
if path_override_filter is not None and freqs['af'] < PATH_FREQ_OVERRIDE_CUTOFF:
pop_filter |= path_override_filter & (pop_expr[af_field] <= PATH_FREQ_OVERRIDE_CUTOFF)
pop_filters.append(pop_filter)
elif freqs.get('ac') is not None:
ac_field = pop_config['ac']
if ac_field:
@@ -484,7 +493,13 @@ def _filter_by_frequency(self, frequencies):
pop_filters.append(pop_expr[hemi_field] <= freqs['hh'])

if pop_filters:
self._ht = self._ht.filter(hl.is_missing(pop_expr) | hl.all(pop_filters))
filters.append(hl.is_missing(pop_expr) | hl.all(pop_filters))

if filters:
self._ht = self._ht.filter(hl.all(filters))

def _frequency_override_filter(self, pathogenicity):
return None

def _filter_by_in_silico(self, in_silico_filters):
in_silico_filters = in_silico_filters or {}
@@ -592,10 +607,10 @@ class VariantHailTableQuery(BaseHailTableQuery):
'revel': PredictionPath('dbnsfp', 'REVEL_score'),
'sift': PredictionPath('dbnsfp', 'SIFT_pred'),
}
PATHOGENICITY_FILTERS = [
(CLINVAR_KEY, 'pathogenicity', CLINVAR_PATH_RANGES),
(HGMD_KEY, 'class', HGMD_PATH_RANGES),
]
PATHOGENICITY_FILTERS = {
CLINVAR_KEY: ('pathogenicity', CLINVAR_PATH_RANGES),
HGMD_KEY: ('class', HGMD_PATH_RANGES),
}

GLOBALS = BaseHailTableQuery.GLOBALS + ['versions']
CORE_FIELDS = BaseHailTableQuery.CORE_FIELDS + ['rsid']
@@ -644,6 +659,10 @@ def _selected_main_transcript_expr(ht):

return hl.or_else(matched_transcript, main_transcript)

def __init__(self, *args, **kwargs):
self._filter_hts = {}
super(VariantHailTableQuery, self).__init__(*args, **kwargs)

def import_filtered_table(self, *args, **kwargs):
super().import_filtered_table(*args, **kwargs)
self._ht = self._ht.key_by(**{VARIANT_KEY_FIELD: self._ht.variant_id})
@@ -656,6 +675,32 @@ def _format_transcript_args(self):
})
return args

def _get_family_passes_quality_filter(self, quality_filter, ht=None, pathogenicity=None, **kwargs):
passes_quality = super(VariantHailTableQuery, self)._get_family_passes_quality_filter(quality_filter)
clinvar_path_ht = False if passes_quality is None else self._get_clinvar_filter_ht(pathogenicity)
if not clinvar_path_ht:
return passes_quality

return lambda entries: hl.is_defined(clinvar_path_ht[ht.key]) | passes_quality(entries)

def _get_clinvar_filter_ht(self, pathogenicity):
if self._filter_hts.get(CLINVAR_KEY) is not None:
return self._filter_hts[CLINVAR_KEY]

clinvar_path_filters = self._get_clinvar_path_filters(pathogenicity)
if not clinvar_path_filters:
self._filter_hts[CLINVAR_KEY] = False
return False

clinvar_path_ht = self._read_table('clinvar_path_variants.ht')
if CLINVAR_LIKELY_PATH_FILTER not in clinvar_path_filters:
clinvar_path_ht = clinvar_path_ht.filter(clinvar_path_ht.is_pathogenic)
elif CLINVAR_PATH_FILTER not in clinvar_path_filters:
clinvar_path_ht = clinvar_path_ht.filter(clinvar_path_ht.is_likely_pathogenic)
self._filter_hts[CLINVAR_KEY] = clinvar_path_ht

return clinvar_path_ht

def _get_gene_id_filter(self, gene_ids):
self._ht = self._ht.annotate(
gene_transcripts=self._ht.sorted_transcript_consequences.filter(lambda t: gene_ids.contains(t.gene_id))
@@ -688,10 +733,10 @@ def _annotate_allowed_consequences(self, annotations, annotation_filters):
def _get_annotation_override_filters(self, pathogenicity, annotations):
annotation_filters = []

for key, *args in self.PATHOGENICITY_FILTERS:
for key in self.PATHOGENICITY_FILTERS.keys():
path_terms = (pathogenicity or {}).get(key)
if path_terms:
annotation_filters.append(self._has_terms_range_expr(path_terms, key, *args))
annotation_filters.append(self._has_path_expr(path_terms, key))
if annotations.get(SCREEN_KEY):
screen_enum = self._get_enum_lookup(SCREEN_KEY.lower(), 'region_type')
allowed_consequences = hl.set({screen_enum[c] for c in annotations[SCREEN_KEY]})
@@ -702,7 +747,18 @@ def _get_annotation_override_filters(self, pathogenicity, annotations):

return annotation_filters

def _has_terms_range_expr(self, terms, field, subfield, range_configs):
def _frequency_override_filter(self, pathogenicity):
path_terms = self._get_clinvar_path_filters(pathogenicity)
return self._has_path_expr(path_terms, CLINVAR_KEY) if path_terms else None

@staticmethod
def _get_clinvar_path_filters(pathogenicity):
return {
f for f in (pathogenicity or {}).get(CLINVAR_KEY) or [] if f in CLINVAR_PATH_SIGNIFICANCES
}

def _has_path_expr(self, terms, field):
subfield, range_configs = self.PATHOGENICITY_FILTERS[field]
enum_lookup = self._get_enum_lookup(field, subfield)

ranges = [[None, None]]
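The other override added above: VariantHailTableQuery._get_family_passes_quality_filter now takes the requested pathogenicity into account, and rows present in the clinvar_path_variants.ht lookup bypass the genotype-quality check entirely. A rough plain-Python sketch of that decision, with illustrative names (quality_check, clinvar_path_keys) that do not appear in the diff:

def family_passes_quality(variant_key, genotype_entries, quality_check, clinvar_path_keys=None):
    # Keep a family's entries if the variant is in the ClinVar pathogenic lookup
    # (only built when a pathogenic/likely_pathogenic ClinVar filter was requested)...
    if clinvar_path_keys is not None and variant_key in clinvar_path_keys:
        return True
    # ...otherwise require every genotype to pass the configured quality checks.
    return all(quality_check(gt) for gt in genotype_entries)
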
27 changes: 25 additions & 2 deletions hail_search/test_search.py
@@ -182,8 +182,20 @@ async def test_quality_filter(self):
omit_sample_type='SV_WES',
)

quality_filter = {'min_gq': 40, 'min_ab': 50}
await self._assert_expected_search(
[VARIANT2, FAMILY_3_VARIANT], quality_filter={'min_gq': 40, 'min_ab': 50}, omit_sample_type='SV_WES',
[VARIANT2, FAMILY_3_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES',
)

annotations = {'splice_ai': '0.0'} # Ensures no variants are filtered out by annotation/path filters
await self._assert_expected_search(
[VARIANT1, VARIANT2, FAMILY_3_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES',
annotations=annotations, pathogenicity={'clinvar': ['likely_pathogenic', 'vus_or_conflicting']},
)

await self._assert_expected_search(
[VARIANT2, FAMILY_3_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES',
annotations=annotations, pathogenicity={'clinvar': ['pathogenic']},
)

async def test_location_search(self):
@@ -235,7 +247,7 @@ async def test_frequency_filter(self):
)

await self._assert_expected_search(
[VARIANT4], frequencies={'gnomad_genomes': {'af': 0.41, 'hh': 1}}, omit_sample_type='SV_WES',
[VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.41, 'hh': 1}}, omit_sample_type='SV_WES',
)

await self._assert_expected_search(
@@ -248,6 +260,17 @@
omit_sample_type='SV_WES',
)

annotations = {'splice_ai': '0.0'} # Ensures no variants are filtered out by annotation/path filters
await self._assert_expected_search(
[VARIANT1, VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.01}}, omit_sample_type='SV_WES',
annotations=annotations, pathogenicity={'clinvar': ['likely_pathogenic', 'vus_or_conflicting']},
)

await self._assert_expected_search(
[VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.01}}, omit_sample_type='SV_WES',
annotations=annotations, pathogenicity={'clinvar': ['pathogenic', 'vus_or_conflicting']},
)

async def test_annotations_filter(self):
await self._assert_expected_search([VARIANT2], pathogenicity={'hgmd': ['hgmd_other']}, omit_sample_type='SV_WES')

4 changes: 2 additions & 2 deletions hail_search/test_utils.py
@@ -97,7 +97,7 @@
'topmed': {'af': 0.0784199982881546, 'ac': 20757, 'an': 264690, 'hom': 0, 'het': 20757},
'exac': {'af': 0.0, 'ac': 0, 'an': 0, 'hom': 0, 'hemi': 0, 'het': 0, 'filter_af': 0.0},
'gnomad_exomes': {'af': 0.0, 'ac': 0, 'an': 0, 'hom': 0, 'hemi': 0, 'filter_af': 0.0},
'gnomad_genomes': {'af': 0.34449315071105957, 'ac': 9271, 'an': 26912, 'hom': 480, 'hemi': 0, 'filter_af': 0.40276646614074707},
'gnomad_genomes': {'af': 0.034449315071105957, 'ac': 927, 'an': 26912, 'hom': 48, 'hemi': 0, 'filter_af': 0.040276646614074707},
},
'predictions': {
'cadd': 4.668000221252441,
@@ -168,7 +168,7 @@
'topmed': {'af': 0.24615199863910675, 'ac': 65154, 'an': 264690, 'hom': 8775, 'het': 47604},
'exac': {'af': 0.29499998688697815, 'ac': 35805, 'an': 121372, 'hom': 5872, 'hemi': 0, 'het': 24061, 'filter_af': 0.4153035283088684},
'gnomad_exomes': {'af': 0.28899794816970825, 'ac': 72672, 'an': 251462, 'hom': 11567, 'hemi': 0, 'filter_af': 0.4116474986076355},
'gnomad_genomes': {'af': 0.2633855640888214, 'ac': 40003, 'an': 151880, 'hom': 5754, 'hemi': 0, 'filter_af': 0.4067690968513489},
'gnomad_genomes': {'af': 0, 'ac': 0, 'an': 0, 'hom': 0, 'hemi': 0, 'filter_af': 0},
},
'predictions': {
'cadd': 20.899999618530273,