diff --git a/hail_search/constants.py b/hail_search/constants.py index bd24b8ed48..4da596dc3b 100644 --- a/hail_search/constants.py +++ b/hail_search/constants.py @@ -52,9 +52,13 @@ }, } +PATH_FREQ_OVERRIDE_CUTOFF = 0.05 +CLINVAR_PATH_FILTER = 'pathogenic' +CLINVAR_LIKELY_PATH_FILTER = 'likely_pathogenic' +CLINVAR_PATH_SIGNIFICANCES = {CLINVAR_PATH_FILTER, CLINVAR_LIKELY_PATH_FILTER} CLINVAR_PATH_RANGES = [ - ('pathogenic', 'Pathogenic', 'Pathogenic/Likely_risk_allele'), - ('likely_pathogenic', 'Pathogenic/Likely_pathogenic', 'Likely_risk_allele'), + (CLINVAR_PATH_FILTER, 'Pathogenic', 'Pathogenic/Likely_risk_allele'), + (CLINVAR_LIKELY_PATH_FILTER, 'Pathogenic/Likely_pathogenic', 'Likely_risk_allele'), ('vus_or_conflicting', 'Conflicting_interpretations_of_pathogenicity', 'No_pathogenic_assertion'), ('likely_benign', 'Likely_benign', 'Benign/Likely_benign'), ('benign', 'Benign/Likely_benign', 'Benign'), diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.README.txt.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.README.txt.crc index 0410f9ffd4..7509d5e155 100644 Binary files a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.README.txt.crc and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.README.txt.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.metadata.json.gz.crc index b9f66cdff8..77826b30f8 100644 Binary files a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.metadata.json.gz.crc and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.metadata.json.gz.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/README.txt b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/README.txt index b006f802b1..3cd30e172a 100644 --- a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/README.txt +++ b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. Written with version 0.2.109-b71b065e4bb6 - Created at 2023/08/04 12:47:37 \ No newline at end of file + Created at 2023/08/07 11:46:24 \ No newline at end of file diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-0f6272b7-11d0-44b2-a095-6d84fd6f4a97.idx/.index.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-0f6272b7-11d0-44b2-a095-6d84fd6f4a97.idx/.index.crc new file mode 100644 index 0000000000..34bb158fa8 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-0f6272b7-11d0-44b2-a095-6d84fd6f4a97.idx/.index.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-bd787778-d7c2-4828-835f-ca93e860ce89.idx/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-0f6272b7-11d0-44b2-a095-6d84fd6f4a97.idx/.metadata.json.gz.crc similarity index 100% rename from hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-bd787778-d7c2-4828-835f-ca93e860ce89.idx/.metadata.json.gz.crc rename to hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-0f6272b7-11d0-44b2-a095-6d84fd6f4a97.idx/.metadata.json.gz.crc diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-0f6272b7-11d0-44b2-a095-6d84fd6f4a97.idx/index b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-0f6272b7-11d0-44b2-a095-6d84fd6f4a97.idx/index new file mode 100644 index 0000000000..22a0d98596 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-0f6272b7-11d0-44b2-a095-6d84fd6f4a97.idx/index differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-bd787778-d7c2-4828-835f-ca93e860ce89.idx/metadata.json.gz b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-0f6272b7-11d0-44b2-a095-6d84fd6f4a97.idx/metadata.json.gz similarity index 100% rename from hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-bd787778-d7c2-4828-835f-ca93e860ce89.idx/metadata.json.gz rename to hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-0f6272b7-11d0-44b2-a095-6d84fd6f4a97.idx/metadata.json.gz diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-bd787778-d7c2-4828-835f-ca93e860ce89.idx/.index.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-bd787778-d7c2-4828-835f-ca93e860ce89.idx/.index.crc deleted file mode 100644 index 60183f50e1..0000000000 Binary files a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-bd787778-d7c2-4828-835f-ca93e860ce89.idx/.index.crc and /dev/null differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-bd787778-d7c2-4828-835f-ca93e860ce89.idx/index b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-bd787778-d7c2-4828-835f-ca93e860ce89.idx/index deleted file mode 100644 index befb8d34e8..0000000000 Binary files a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-bd787778-d7c2-4828-835f-ca93e860ce89.idx/index and /dev/null differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/metadata.json.gz b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/metadata.json.gz index 7014e2577e..6816809fc8 100644 Binary files a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/metadata.json.gz and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/metadata.json.gz differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/.metadata.json.gz.crc index 4bfe301fc1..ad8bc43a54 100644 Binary files a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/.metadata.json.gz.crc and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/.metadata.json.gz.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/metadata.json.gz b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/metadata.json.gz index 68f277de11..1361095c3c 100644 Binary files a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/metadata.json.gz and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/metadata.json.gz differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/.part-0-0f6272b7-11d0-44b2-a095-6d84fd6f4a97.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/.part-0-0f6272b7-11d0-44b2-a095-6d84fd6f4a97.crc new file mode 100644 index 0000000000..07e11f0275 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/.part-0-0f6272b7-11d0-44b2-a095-6d84fd6f4a97.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/.part-0-bd787778-d7c2-4828-835f-ca93e860ce89.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/.part-0-bd787778-d7c2-4828-835f-ca93e860ce89.crc deleted file mode 100644 index 7f51fce1fc..0000000000 Binary files a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/.part-0-bd787778-d7c2-4828-835f-ca93e860ce89.crc and /dev/null differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/part-0-0f6272b7-11d0-44b2-a095-6d84fd6f4a97 b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/part-0-0f6272b7-11d0-44b2-a095-6d84fd6f4a97 new file mode 100644 index 0000000000..0462c2e17d Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/part-0-0f6272b7-11d0-44b2-a095-6d84fd6f4a97 differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/part-0-bd787778-d7c2-4828-835f-ca93e860ce89 b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/part-0-bd787778-d7c2-4828-835f-ca93e860ce89 deleted file mode 100644 index 7384b9a0e8..0000000000 Binary files a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/part-0-bd787778-d7c2-4828-835f-ca93e860ce89 and /dev/null differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/.README.txt.crc b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/.README.txt.crc new file mode 100644 index 0000000000..9e5f9ce902 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/.README.txt.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/._SUCCESS.crc b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/._SUCCESS.crc new file mode 100644 index 0000000000..3b7b044936 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/._SUCCESS.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/.metadata.json.gz.crc new file mode 100644 index 0000000000..72936da220 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/.metadata.json.gz.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/README.txt b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/README.txt new file mode 100644 index 0000000000..183eac112f --- /dev/null +++ b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.109-b71b065e4bb6 + Created at 2023/08/04 13:56:14 \ No newline at end of file diff --git a/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/_SUCCESS b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/_SUCCESS new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/globals/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/globals/.metadata.json.gz.crc new file mode 100644 index 0000000000..365dde7902 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/globals/.metadata.json.gz.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/globals/metadata.json.gz b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/globals/metadata.json.gz new file mode 100644 index 0000000000..6552f3d13d Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/globals/metadata.json.gz differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/globals/parts/.part-0.crc b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/globals/parts/.part-0.crc new file mode 100644 index 0000000000..55eea218ee Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/globals/parts/.part-0.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/globals/parts/part-0 b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/globals/parts/part-0 new file mode 100644 index 0000000000..9e0f935fd8 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/globals/parts/part-0 differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/index/part-0-23b3a09d-4799-4bcb-9e13-c2812c6915c5.idx/.index.crc b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/index/part-0-23b3a09d-4799-4bcb-9e13-c2812c6915c5.idx/.index.crc new file mode 100644 index 0000000000..b6b7333780 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/index/part-0-23b3a09d-4799-4bcb-9e13-c2812c6915c5.idx/.index.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/index/part-0-23b3a09d-4799-4bcb-9e13-c2812c6915c5.idx/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/index/part-0-23b3a09d-4799-4bcb-9e13-c2812c6915c5.idx/.metadata.json.gz.crc new file mode 100644 index 0000000000..134eb40272 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/index/part-0-23b3a09d-4799-4bcb-9e13-c2812c6915c5.idx/.metadata.json.gz.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/index/part-0-23b3a09d-4799-4bcb-9e13-c2812c6915c5.idx/index b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/index/part-0-23b3a09d-4799-4bcb-9e13-c2812c6915c5.idx/index new file mode 100644 index 0000000000..b0a36c8f7d Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/index/part-0-23b3a09d-4799-4bcb-9e13-c2812c6915c5.idx/index differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/index/part-0-23b3a09d-4799-4bcb-9e13-c2812c6915c5.idx/metadata.json.gz b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/index/part-0-23b3a09d-4799-4bcb-9e13-c2812c6915c5.idx/metadata.json.gz new file mode 100644 index 0000000000..d2a0a95f91 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/index/part-0-23b3a09d-4799-4bcb-9e13-c2812c6915c5.idx/metadata.json.gz differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/metadata.json.gz b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/metadata.json.gz new file mode 100644 index 0000000000..30206aa796 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/metadata.json.gz differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/rows/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/rows/.metadata.json.gz.crc new file mode 100644 index 0000000000..5e7c307ede Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/rows/.metadata.json.gz.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/rows/metadata.json.gz b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/rows/metadata.json.gz new file mode 100644 index 0000000000..74645cb7cd Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/rows/metadata.json.gz differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/rows/parts/.part-0-23b3a09d-4799-4bcb-9e13-c2812c6915c5.crc b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/rows/parts/.part-0-23b3a09d-4799-4bcb-9e13-c2812c6915c5.crc new file mode 100644 index 0000000000..16180a7a8d Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/rows/parts/.part-0-23b3a09d-4799-4bcb-9e13-c2812c6915c5.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/rows/parts/part-0-23b3a09d-4799-4bcb-9e13-c2812c6915c5 b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/rows/parts/part-0-23b3a09d-4799-4bcb-9e13-c2812c6915c5 new file mode 100644 index 0000000000..bd62c00d06 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/clinvar_path_variants.ht/rows/parts/part-0-23b3a09d-4799-4bcb-9e13-c2812c6915c5 differ diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index f96388931e..f2df77100b 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -7,7 +7,8 @@ from hail_search.constants import AFFECTED, UNAFFECTED, AFFECTED_ID, UNAFFECTED_ID, MALE, VARIANT_DATASET, \ VARIANT_KEY_FIELD, GNOMAD_GENOMES_FIELD, XPOS, GENOME_VERSION_GRCh38_DISPLAY, INHERITANCE_FILTERS, \ ANY_AFFECTED, X_LINKED_RECESSIVE, REF_REF, REF_ALT, COMP_HET_ALT, ALT_ALT, HAS_ALT, HAS_REF, \ - ANNOTATION_OVERRIDE_FIELDS, SCREEN_KEY, SPLICE_AI_FIELD, CLINVAR_KEY, HGMD_KEY, CLINVAR_PATH_RANGES, HGMD_PATH_RANGES + ANNOTATION_OVERRIDE_FIELDS, SCREEN_KEY, SPLICE_AI_FIELD, CLINVAR_KEY, HGMD_KEY, CLINVAR_PATH_SIGNIFICANCES, \ + CLINVAR_PATH_FILTER, CLINVAR_LIKELY_PATH_FILTER, CLINVAR_PATH_RANGES, HGMD_PATH_RANGES, PATH_FREQ_OVERRIDE_CUTOFF DATASETS_DIR = os.environ.get('DATASETS_DIR', '/hail_datasets') @@ -142,42 +143,46 @@ def __init__(self, data_type, sample_data, genome_version, sort=XPOS, num_result self._genome_version = genome_version self._sort = sort self._num_results = num_results + self._data_type = data_type self._ht = None self._enums = None self._globals = None - self._load_filtered_table(data_type, sample_data, **kwargs) + self._load_filtered_table(sample_data, **kwargs) - def _load_filtered_table(self, data_type, sample_data, intervals=None, exclude_intervals=False, variant_ids=None, **kwargs): + def _load_filtered_table(self, sample_data, intervals=None, exclude_intervals=False, variant_ids=None, **kwargs): parsed_intervals, variant_ids = self._parse_intervals(intervals, variant_ids) excluded_intervals = None if exclude_intervals: excluded_intervals = parsed_intervals parsed_intervals = None + self._load_table_kwargs = {'_intervals': parsed_intervals, '_filter_intervals': bool(parsed_intervals)} self.import_filtered_table( - data_type, sample_data, intervals=parsed_intervals, excluded_intervals=excluded_intervals, - variant_ids=variant_ids, **kwargs) + sample_data, excluded_intervals=excluded_intervals, variant_ids=variant_ids, **kwargs) - def import_filtered_table(self, data_type, sample_data, intervals=None, **kwargs): - tables_path = f'{DATASETS_DIR}/{self._genome_version}/{data_type}' - load_table_kwargs = {'_intervals': intervals, '_filter_intervals': bool(intervals)} + def _get_table_path(self, path): + return f'{DATASETS_DIR}/{self._genome_version}/{self._data_type}/{path}' + def _read_table(self, path): + return hl.read_table(self._get_table_path(path), **self._load_table_kwargs) + + def import_filtered_table(self, sample_data, **kwargs): family_samples = defaultdict(list) project_samples = defaultdict(list) for s in sample_data: family_samples[s['family_guid']].append(s) project_samples[s['project_guid']].append(s) - logger.info(f'Loading {data_type} data for {len(family_samples)} families in {len(project_samples)} projects') + logger.info(f'Loading {self._data_type} data for {len(family_samples)} families in {len(project_samples)} projects') if len(family_samples) == 1: family_guid, family_sample_data = list(family_samples.items())[0] - family_ht = hl.read_table(f'{tables_path}/families/{family_guid}.ht', **load_table_kwargs) + family_ht = self._read_table(f'families/{family_guid}.ht') families_ht = self._filter_entries_table(family_ht, family_sample_data, **kwargs) else: filtered_project_hts = [] exception_messages = set() for project_guid, project_sample_data in project_samples.items(): - project_ht = hl.read_table(f'{tables_path}/projects/{project_guid}.ht', **load_table_kwargs) + project_ht = self._read_table(f'projects/{project_guid}.ht') try: filtered_project_hts.append(self._filter_entries_table(project_ht, project_sample_data, **kwargs)) except HTTPBadRequest as e: @@ -199,7 +204,7 @@ def import_filtered_table(self, data_type, sample_data, intervals=None, **kwargs ), ) - annotations_ht_path = f'{tables_path}/annotations.ht' + annotations_ht_path = self._get_table_path('annotations.ht') annotation_ht_query_result = hl.query_table( annotations_ht_path, families_ht.key).first().drop(*families_ht.key) ht = families_ht.annotate(**annotation_ht_query_result) @@ -220,7 +225,7 @@ def import_filtered_table(self, data_type, sample_data, intervals=None, **kwargs ) self._filter_annotated_table(**kwargs) - def _filter_entries_table(self, ht, sample_data, inheritance_mode=None, inheritance_filter=None, quality_filter=None, + def _filter_entries_table(self, ht, sample_data, inheritance_mode=None, inheritance_filter=None, quality_filter=None, excluded_intervals=None, variant_ids=None, **kwargs): if excluded_intervals: ht = hl.filter_intervals(ht, excluded_intervals, keep=False) @@ -238,10 +243,10 @@ def _filter_entries_table(self, ht, sample_data, inheritance_mode=None, inherit if quality_filter.get('vcf_filter'): ht = self._filter_vcf_filters(ht) - passes_quality_filter = self._get_genotype_passes_quality_filter(quality_filter) + passes_quality_filter = self._get_family_passes_quality_filter(quality_filter, ht=ht, **kwargs) if passes_quality_filter is not None: ht = ht.annotate(family_entries=ht.family_entries.map( - lambda entries: hl.or_missing(entries.all(passes_quality_filter), entries) + lambda entries: hl.or_missing(passes_quality_filter(entries), entries) )) ht = ht.filter(ht.family_entries.any(hl.is_defined)) @@ -353,19 +358,18 @@ def _valid_genotype_family_entries(cls, entries, gentoype_entry_indices, genotyp ) return hl.or_missing(is_valid, entries) - @classmethod - def _get_genotype_passes_quality_filter(cls, quality_filter): + def _get_family_passes_quality_filter(self, quality_filter, **kwargs): affected_only = quality_filter.get('affected_only') passes_quality_filters = [] for filter_k, value in quality_filter.items(): - field = cls.GENOTYPE_FIELDS.get(filter_k.replace('min_', '')) + field = self.GENOTYPE_FIELDS.get(filter_k.replace('min_', '')) if field and value: - passes_quality_filters.append(cls._get_genotype_passes_quality_field(field, value, affected_only)) + passes_quality_filters.append(self._get_genotype_passes_quality_field(field, value, affected_only)) if not passes_quality_filters: return None - return lambda gt: hl.all([f(gt) for f in passes_quality_filters]) + return lambda entries: entries.all(lambda gt: hl.all([f(gt) for f in passes_quality_filters])) @classmethod def _get_genotype_passes_quality_field(cls, field, value, affected_only): @@ -412,7 +416,7 @@ def _filter_annotated_table(self, gene_ids=None, rs_ids=None, frequencies=None, if rs_ids: self._filter_rs_ids(rs_ids) - self._filter_by_frequency(frequencies) + self._filter_by_frequency(frequencies, pathogenicity) self._filter_by_in_silico(in_silico) @@ -458,18 +462,23 @@ def _parse_intervals(self, intervals, variant_ids): return parsed_intervals, variant_ids - def _filter_by_frequency(self, frequencies): + def _filter_by_frequency(self, frequencies, pathogenicity): frequencies = {k: v for k, v in (frequencies or {}).items() if k in self.POPULATIONS} if not frequencies: return + path_override_filter = self._frequency_override_filter(pathogenicity) + filters = [] for pop, freqs in sorted(frequencies.items()): pop_filters = [] pop_expr = self._ht[self.POPULATION_FIELDS.get(pop, pop)] pop_config = self._format_population_config(self.POPULATIONS[pop]) if freqs.get('af') is not None: af_field = pop_config.get('filter_af') or pop_config['af'] - pop_filters.append(pop_expr[af_field] <= freqs['af']) + pop_filter = pop_expr[af_field] <= freqs['af'] + if path_override_filter is not None and freqs['af'] < PATH_FREQ_OVERRIDE_CUTOFF: + pop_filter |= path_override_filter & (pop_expr[af_field] <= PATH_FREQ_OVERRIDE_CUTOFF) + pop_filters.append(pop_filter) elif freqs.get('ac') is not None: ac_field = pop_config['ac'] if ac_field: @@ -484,7 +493,13 @@ def _filter_by_frequency(self, frequencies): pop_filters.append(pop_expr[hemi_field] <= freqs['hh']) if pop_filters: - self._ht = self._ht.filter(hl.is_missing(pop_expr) | hl.all(pop_filters)) + filters.append(hl.is_missing(pop_expr) | hl.all(pop_filters)) + + if filters: + self._ht = self._ht.filter(hl.all(filters)) + + def _frequency_override_filter(self, pathogenicity): + return None def _filter_by_in_silico(self, in_silico_filters): in_silico_filters = in_silico_filters or {} @@ -592,10 +607,10 @@ class VariantHailTableQuery(BaseHailTableQuery): 'revel': PredictionPath('dbnsfp', 'REVEL_score'), 'sift': PredictionPath('dbnsfp', 'SIFT_pred'), } - PATHOGENICITY_FILTERS = [ - (CLINVAR_KEY, 'pathogenicity', CLINVAR_PATH_RANGES), - (HGMD_KEY, 'class', HGMD_PATH_RANGES), - ] + PATHOGENICITY_FILTERS = { + CLINVAR_KEY: ('pathogenicity', CLINVAR_PATH_RANGES), + HGMD_KEY: ('class', HGMD_PATH_RANGES), + } GLOBALS = BaseHailTableQuery.GLOBALS + ['versions'] CORE_FIELDS = BaseHailTableQuery.CORE_FIELDS + ['rsid'] @@ -644,6 +659,10 @@ def _selected_main_transcript_expr(ht): return hl.or_else(matched_transcript, main_transcript) + def __init__(self, *args, **kwargs): + self._filter_hts = {} + super(VariantHailTableQuery, self).__init__(*args, **kwargs) + def import_filtered_table(self, *args, **kwargs): super().import_filtered_table(*args, **kwargs) self._ht = self._ht.key_by(**{VARIANT_KEY_FIELD: self._ht.variant_id}) @@ -656,6 +675,32 @@ def _format_transcript_args(self): }) return args + def _get_family_passes_quality_filter(self, quality_filter, ht=None, pathogenicity=None, **kwargs): + passes_quality = super(VariantHailTableQuery, self)._get_family_passes_quality_filter(quality_filter) + clinvar_path_ht = False if passes_quality is None else self._get_clinvar_filter_ht(pathogenicity) + if not clinvar_path_ht: + return passes_quality + + return lambda entries: hl.is_defined(clinvar_path_ht[ht.key]) | passes_quality(entries) + + def _get_clinvar_filter_ht(self, pathogenicity): + if self._filter_hts.get(CLINVAR_KEY) is not None: + return self._filter_hts[CLINVAR_KEY] + + clinvar_path_filters = self._get_clinvar_path_filters(pathogenicity) + if not clinvar_path_filters: + self._filter_hts[CLINVAR_KEY] = False + return False + + clinvar_path_ht = self._read_table('clinvar_path_variants.ht') + if CLINVAR_LIKELY_PATH_FILTER not in clinvar_path_filters: + clinvar_path_ht = clinvar_path_ht.filter(clinvar_path_ht.is_pathogenic) + elif CLINVAR_PATH_FILTER not in clinvar_path_filters: + clinvar_path_ht = clinvar_path_ht.filter(clinvar_path_ht.is_likely_pathogenic) + self._filter_hts[CLINVAR_KEY] = clinvar_path_ht + + return clinvar_path_ht + def _get_gene_id_filter(self, gene_ids): self._ht = self._ht.annotate( gene_transcripts=self._ht.sorted_transcript_consequences.filter(lambda t: gene_ids.contains(t.gene_id)) @@ -688,10 +733,10 @@ def _annotate_allowed_consequences(self, annotations, annotation_filters): def _get_annotation_override_filters(self, pathogenicity, annotations): annotation_filters = [] - for key, *args in self.PATHOGENICITY_FILTERS: + for key in self.PATHOGENICITY_FILTERS.keys(): path_terms = (pathogenicity or {}).get(key) if path_terms: - annotation_filters.append(self._has_terms_range_expr(path_terms, key, *args)) + annotation_filters.append(self._has_path_expr(path_terms, key)) if annotations.get(SCREEN_KEY): screen_enum = self._get_enum_lookup(SCREEN_KEY.lower(), 'region_type') allowed_consequences = hl.set({screen_enum[c] for c in annotations[SCREEN_KEY]}) @@ -702,7 +747,18 @@ def _get_annotation_override_filters(self, pathogenicity, annotations): return annotation_filters - def _has_terms_range_expr(self, terms, field, subfield, range_configs): + def _frequency_override_filter(self, pathogenicity): + path_terms = self._get_clinvar_path_filters(pathogenicity) + return self._has_path_expr(path_terms, CLINVAR_KEY) if path_terms else None + + @staticmethod + def _get_clinvar_path_filters(pathogenicity): + return { + f for f in (pathogenicity or {}).get(CLINVAR_KEY) or [] if f in CLINVAR_PATH_SIGNIFICANCES + } + + def _has_path_expr(self, terms, field): + subfield, range_configs = self.PATHOGENICITY_FILTERS[field] enum_lookup = self._get_enum_lookup(field, subfield) ranges = [[None, None]] diff --git a/hail_search/test_search.py b/hail_search/test_search.py index a7c54b1a8b..6a66ad455d 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -182,8 +182,20 @@ async def test_quality_filter(self): omit_sample_type='SV_WES', ) + quality_filter = {'min_gq': 40, 'min_ab': 50} await self._assert_expected_search( - [VARIANT2, FAMILY_3_VARIANT], quality_filter={'min_gq': 40, 'min_ab': 50}, omit_sample_type='SV_WES', + [VARIANT2, FAMILY_3_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES', + ) + + annotations = {'splice_ai': '0.0'} # Ensures no variants are filtered out by annotation/path filters + await self._assert_expected_search( + [VARIANT1, VARIANT2, FAMILY_3_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES', + annotations=annotations, pathogenicity={'clinvar': ['likely_pathogenic', 'vus_or_conflicting']}, + ) + + await self._assert_expected_search( + [VARIANT2, FAMILY_3_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES', + annotations=annotations, pathogenicity={'clinvar': ['pathogenic']}, ) async def test_location_search(self): @@ -235,7 +247,7 @@ async def test_frequency_filter(self): ) await self._assert_expected_search( - [VARIANT4], frequencies={'gnomad_genomes': {'af': 0.41, 'hh': 1}}, omit_sample_type='SV_WES', + [VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.41, 'hh': 1}}, omit_sample_type='SV_WES', ) await self._assert_expected_search( @@ -248,6 +260,17 @@ async def test_frequency_filter(self): omit_sample_type='SV_WES', ) + annotations = {'splice_ai': '0.0'} # Ensures no variants are filtered out by annotation/path filters + await self._assert_expected_search( + [VARIANT1, VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.01}}, omit_sample_type='SV_WES', + annotations=annotations, pathogenicity={'clinvar': ['likely_pathogenic', 'vus_or_conflicting']}, + ) + + await self._assert_expected_search( + [VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.01}}, omit_sample_type='SV_WES', + annotations=annotations, pathogenicity={'clinvar': ['pathogenic', 'vus_or_conflicting']}, + ) + async def test_annotations_filter(self): await self._assert_expected_search([VARIANT2], pathogenicity={'hgmd': ['hgmd_other']}, omit_sample_type='SV_WES') diff --git a/hail_search/test_utils.py b/hail_search/test_utils.py index 83e1365c1f..27ef225dc8 100644 --- a/hail_search/test_utils.py +++ b/hail_search/test_utils.py @@ -97,7 +97,7 @@ 'topmed': {'af': 0.0784199982881546, 'ac': 20757, 'an': 264690, 'hom': 0, 'het': 20757}, 'exac': {'af': 0.0, 'ac': 0, 'an': 0, 'hom': 0, 'hemi': 0, 'het': 0, 'filter_af': 0.0}, 'gnomad_exomes': {'af': 0.0, 'ac': 0, 'an': 0, 'hom': 0, 'hemi': 0, 'filter_af': 0.0}, - 'gnomad_genomes': {'af': 0.34449315071105957, 'ac': 9271, 'an': 26912, 'hom': 480, 'hemi': 0, 'filter_af': 0.40276646614074707}, + 'gnomad_genomes': {'af': 0.034449315071105957, 'ac': 927, 'an': 26912, 'hom': 48, 'hemi': 0, 'filter_af': 0.040276646614074707}, }, 'predictions': { 'cadd': 4.668000221252441, @@ -168,7 +168,7 @@ 'topmed': {'af': 0.24615199863910675, 'ac': 65154, 'an': 264690, 'hom': 8775, 'het': 47604}, 'exac': {'af': 0.29499998688697815, 'ac': 35805, 'an': 121372, 'hom': 5872, 'hemi': 0, 'het': 24061, 'filter_af': 0.4153035283088684}, 'gnomad_exomes': {'af': 0.28899794816970825, 'ac': 72672, 'an': 251462, 'hom': 11567, 'hemi': 0, 'filter_af': 0.4116474986076355}, - 'gnomad_genomes': {'af': 0.2633855640888214, 'ac': 40003, 'an': 151880, 'hom': 5754, 'hemi': 0, 'filter_af': 0.4067690968513489}, + 'gnomad_genomes': {'af': 0, 'ac': 0, 'an': 0, 'hom': 0, 'hemi': 0, 'filter_af': 0}, }, 'predictions': { 'cadd': 20.899999618530273, diff --git a/seqr/views/apis/variant_search_api_tests.py b/seqr/views/apis/variant_search_api_tests.py index 78dfc8317d..37db1aeb93 100644 --- a/seqr/views/apis/variant_search_api_tests.py +++ b/seqr/views/apis/variant_search_api_tests.py @@ -387,7 +387,7 @@ def test_query_variants(self, mock_get_variants, mock_get_gene_counts, mock_erro ['12', '48367227', 'TC', 'T', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '2', 'Known gene for phenotype (None)|Excluded (None)', 'a later note (None)|test n\xf8te (None)', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], - ['1', '11794419', 'T', 'G', 'ENSG00000177000', 'missense_variant', '0.31111112236976624', '0.29499998688697815', '0.2633855640888214', + ['1', '11794419', 'T', 'G', 'ENSG00000177000', 'missense_variant', '0.31111112236976624', '0.29499998688697815', '0', '0.28899794816970825', '0.24615199863910675', '20.899999618530273', '0.19699999690055847', '2.000999927520752', '0.0', '', 'tolerated', '', 'damaging', 'rs1801131', 'ENST00000376585.6:c.1409A>C', 'ENSP00000365770.1:p.Glu470Ala', 'Conflicting_interpretations_of_pathogenicity', '1', '', '2', '', '', '', '', '', 'HG00731', '2', '99', '1.0', @@ -421,7 +421,7 @@ def test_query_variants(self, mock_get_variants, mock_get_gene_counts, mock_erro ['12', '48367227', 'TC', 'T', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '2', 'Known gene for phenotype (None)|Excluded (None)', 'a later note (None)|test n\xf8te (None)', '', '', '', '', '', '', '', '', '', '', '', '',], - ['1', '11794419', 'T', 'G', 'ENSG00000177000', 'missense_variant', '0.31111112236976624', '0.29499998688697815', '0.2633855640888214', + ['1', '11794419', 'T', 'G', 'ENSG00000177000', 'missense_variant', '0.31111112236976624', '0.29499998688697815', '0', '0.28899794816970825', '0.24615199863910675', '20.899999618530273', '0.19699999690055847', '2.000999927520752', '0.0', '', 'tolerated', '', 'damaging', 'rs1801131', 'ENST00000376585.6:c.1409A>C', 'ENSP00000365770.1:p.Glu470Ala', 'Conflicting_interpretations_of_pathogenicity', '1', '', '2', '', '', 'HG00731', '2', '99', '1.0',