diff --git a/hail_search/constants.py b/hail_search/constants.py index 9f364016e1..8c6bea220f 100644 --- a/hail_search/constants.py +++ b/hail_search/constants.py @@ -1,4 +1,4 @@ -GENOME_VERSION_GRCh38_DISPLAY = 'GRCh38' +GENOME_VERSION_GRCh38 = 'GRCh38' AFFECTED = 'A' UNAFFECTED = 'N' diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index 8d39674e9f..667d9cbae2 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -5,7 +5,7 @@ import os from hail_search.constants import AFFECTED, UNAFFECTED, AFFECTED_ID, UNAFFECTED_ID, MALE, VARIANT_DATASET, \ - VARIANT_KEY_FIELD, GNOMAD_GENOMES_FIELD, XPOS, GENOME_VERSION_GRCh38_DISPLAY, INHERITANCE_FILTERS, \ + VARIANT_KEY_FIELD, GNOMAD_GENOMES_FIELD, XPOS, GENOME_VERSION_GRCh38, INHERITANCE_FILTERS, \ ANY_AFFECTED, X_LINKED_RECESSIVE, REF_REF, REF_ALT, COMP_HET_ALT, ALT_ALT, HAS_ALT, HAS_REF, \ ANNOTATION_OVERRIDE_FIELDS, SCREEN_KEY, SPLICE_AI_FIELD, CLINVAR_KEY, HGMD_KEY, CLINVAR_PATH_SIGNIFICANCES, \ CLINVAR_PATH_FILTER, CLINVAR_LIKELY_PATH_FILTER, CLINVAR_PATH_RANGES, HGMD_PATH_RANGES, PATH_FREQ_OVERRIDE_CUTOFF, \ @@ -28,6 +28,9 @@ def _to_camel_case(snake_case_str): class BaseHailTableQuery(object): + DATA_TYPE = None + LOADED_GLOBALS = None + GENOTYPE_QUERY_MAP = { REF_REF: lambda gt: gt.is_hom_ref(), REF_ALT: lambda gt: gt.is_het(), @@ -44,6 +47,7 @@ class BaseHailTableQuery(object): POPULATION_KEYS = ['AF', 'AC', 'AN', 'Hom', 'Hemi', 'Het'] PREDICTION_FIELDS_CONFIG = {} + GENOME_VERSIONS = [GENOME_VERSION_GRCh38] GLOBALS = ['enums'] CORE_FIELDS = [XPOS] BASE_ANNOTATION_FIELDS = { @@ -60,6 +64,14 @@ class BaseHailTableQuery(object): XPOS: lambda r: [r.xpos], } + @classmethod + def load_globals(cls): + cls.LOADED_GLOBALS = {} + for genome_version in cls.GENOME_VERSIONS: + ht_path = cls._get_generic_table_path(genome_version, 'annotations.ht') + ht_globals = hl.eval(hl.read_table(ht_path).globals.select(*cls.GLOBALS)) + cls.LOADED_GLOBALS[genome_version] = {k: ht_globals[k] for k in cls.GLOBALS} + @classmethod def _format_population_config(cls, pop_config): base_pop_config = {field.lower(): field for field in cls.POPULATION_KEYS} @@ -98,7 +110,7 @@ def annotation_fields(self): for k, enum_config in self.ENUM_ANNOTATION_FIELDS.items() }) - if self._genome_version == GENOME_VERSION_GRCh38_DISPLAY: + if self._genome_version == GENOME_VERSION_GRCh38: annotation_fields.update(self.LIFTOVER_ANNOTATION_FIELDS) return annotation_fields @@ -148,16 +160,13 @@ def _enum_field(value, enum, ht_globals=None, annotate_value=None, format_value= return value - def __init__(self, data_type, sample_data, genome_version, sort=XPOS, sort_metadata=None, num_results=100, inheritance_mode=None, **kwargs): + def __init__(self, sample_data, genome_version, sort=XPOS, sort_metadata=None, num_results=100, inheritance_mode=None, **kwargs): self._genome_version = genome_version self._sort = sort self._sort_metadata = sort_metadata self._num_results = num_results - self._data_type = data_type self._ht = None self._comp_het_ht = None - self._enums = None - self._globals = None self._inheritance_mode = inheritance_mode self._load_filtered_table(sample_data, inheritance_mode=inheritance_mode, **kwargs) @@ -170,6 +179,14 @@ def _is_recessive_search(self): def _has_comp_het_search(self): return self._inheritance_mode in {RECESSIVE, COMPOUND_HET} + @property + def _globals(self): + return self.LOADED_GLOBALS[self._genome_version] + + @property + def _enums(self): + return self._globals['enums'] + def _load_filtered_table(self, sample_data, intervals=None, exclude_intervals=False, variant_ids=None, **kwargs): parsed_intervals, variant_ids = self._parse_intervals(intervals, variant_ids) excluded_intervals = None @@ -189,8 +206,12 @@ def _load_filtered_table(self, sample_data, intervals=None, exclude_intervals=Fa else: self._ht = None + @classmethod + def _get_generic_table_path(cls, genome_version, path): + return f'{DATASETS_DIR}/{genome_version}/{cls.DATA_TYPE}/{path}' + def _get_table_path(self, path): - return f'{DATASETS_DIR}/{self._genome_version}/{self._data_type}/{path}' + return self._get_generic_table_path(self._genome_version, path) def _read_table(self, path): return hl.read_table(self._get_table_path(path), **self._load_table_kwargs) @@ -202,7 +223,7 @@ def import_filtered_table(self, sample_data, intervals=None, **kwargs): family_samples[s['family_guid']].append(s) project_samples[s['project_guid']].append(s) - logger.info(f'Loading {self._data_type} data for {len(family_samples)} families in {len(project_samples)} projects') + logger.info(f'Loading {self.DATA_TYPE} data for {len(family_samples)} families in {len(project_samples)} projects') if len(family_samples) == 1: family_guid, family_sample_data = list(family_samples.items())[0] family_ht = self._read_table(f'families/{family_guid}.ht') @@ -247,11 +268,6 @@ def import_filtered_table(self, sample_data, intervals=None, **kwargs): annotations_ht_path, families_ht.key).first().drop(*families_ht.key) self._ht = families_ht.annotate(**annotation_ht_query_result) - # Get globals - annotation_globals_ht = hl.read_table(annotations_ht_path).head(0).select() - self._globals = {k: hl.eval(annotation_globals_ht[k]) for k in self.GLOBALS} - self._enums = self._globals.pop('enums') - self._filter_annotated_table(**kwargs) def _filter_entries_table(self, ht, sample_data, inheritance_mode=None, inheritance_filter=None, quality_filter=None, @@ -733,6 +749,8 @@ def _gene_rank_sort(cls, r, gene_ranks): class VariantHailTableQuery(BaseHailTableQuery): + DATA_TYPE = VARIANT_DATASET + GENOTYPE_FIELDS = {f.lower(): f for f in ['DP', 'GQ', 'AB']} QUALITY_FILTER_FORMAT = { 'AB': QualityFilterFormat(override=lambda gt: ~gt.GT.is_het(), scale=100), @@ -1001,6 +1019,4 @@ def _gene_rank_sort(cls, r, gene_ranks): ] -QUERY_CLASS_MAP = { - VARIANT_DATASET: VariantHailTableQuery, -} +QUERY_CLASS_MAP = {cls.DATA_TYPE: cls for cls in [VariantHailTableQuery]} diff --git a/hail_search/search.py b/hail_search/search.py index 716aae5e7b..eec334a80f 100644 --- a/hail_search/search.py +++ b/hail_search/search.py @@ -3,13 +3,18 @@ def search_hail_backend(request): sample_data = request.pop('sample_data', {}) + genome_version = request.pop('genome_version') data_types = list(sample_data.keys()) single_data_type = data_types[0] if len(data_types) == 1 else None sample_data = sample_data[single_data_type] - data_type = single_data_type query_cls = QUERY_CLASS_MAP[single_data_type] - query = query_cls(data_type, sample_data=sample_data, **request) + query = query_cls(sample_data, genome_version, **request) return query.search() + + +def load_globals(): + for cls in QUERY_CLASS_MAP.values(): + cls.load_globals() diff --git a/hail_search/web_app.py b/hail_search/web_app.py index cf538cf751..0dc5775a6b 100644 --- a/hail_search/web_app.py +++ b/hail_search/web_app.py @@ -2,7 +2,7 @@ import json import hail as hl -from hail_search.search import search_hail_backend +from hail_search.search import search_hail_backend, load_globals def _hl_json_default(o): @@ -29,4 +29,5 @@ def init_web_app(): web.get('/status', status), web.post('/search', search), ]) + load_globals() return app diff --git a/seqr/fixtures/1kg_project.json b/seqr/fixtures/1kg_project.json index 1c522b8bf8..7137b39277 100644 --- a/seqr/fixtures/1kg_project.json +++ b/seqr/fixtures/1kg_project.json @@ -695,7 +695,7 @@ "created_by": null, "last_modified_date": "2017-03-13T09:07:50.158Z", "family": 8, - "individual_id": "NA20877", + "individual_id": "NA20888", "mother_id": null, "father_id": null, "sex": "M", diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py index d5ae198e92..fa105833f3 100644 --- a/seqr/views/apis/report_api.py +++ b/seqr/views/apis/report_api.py @@ -205,9 +205,9 @@ def sample_metadata_export(request, project_guid): ) family_rows_by_id = {row['family_id']: row for row in family_rows} - rows_by_subject_id = {row['subject_id']: row for row in subject_rows} + rows_by_subject_family_id = {(row['subject_id'], row['family_guid']): row for row in subject_rows} for row in sample_rows: - rows_by_subject_id[row['subject_id']].update(row) + rows_by_subject_family_id[(row['subject_id'], row['family_guid'])].update(row) for rows in discovery_rows: for i, row in enumerate(rows): @@ -216,9 +216,9 @@ def sample_metadata_export(request, project_guid): parsed_row.update({ '{}-{}'.format(k, i + 1): row[k] for k in DISCOVERY_TABLE_METADATA_VARIANT_COLUMNS if row.get(k) }) - rows_by_subject_id[row['subject_id']].update(parsed_row) + rows_by_subject_family_id[(row['subject_id'], row['family_guid'])].update(parsed_row) - rows = list(rows_by_subject_id.values()) + rows = list(rows_by_subject_family_id.values()) all_features = set() for row in rows: row.update(family_rows_by_id[row['family_id']]) @@ -349,9 +349,10 @@ def _parse_anvil_metadata(individual_samples, user, include_collaborator=False, subject_rows.append(subject_row) sample_row = _get_sample_row(sample, has_dbgap_submission, airtable_metadata) + sample_row['family_guid'] = family_subject_row['family_guid'] sample_rows.append(sample_row) - discovery_row = _get_discovery_rows(sample, parsed_variants, male_individual_guids) + discovery_row = _get_discovery_rows(sample, parsed_variants, male_individual_guids, family_subject_row['family_guid']) discovery_rows.append(discovery_row) return subject_rows, sample_rows, family_rows, discovery_rows @@ -561,12 +562,13 @@ def _get_sample_row(sample, has_dbgap_submission, airtable_metadata): sample_row['dbgap_sample_id'] = airtable_metadata.get('dbgap_sample_id', '') return sample_row -def _get_discovery_rows(sample, parsed_variants, male_individual_guids): +def _get_discovery_rows(sample, parsed_variants, male_individual_guids, family_guid): individual = sample.individual discovery_row = { 'entity:discovery_id': individual.individual_id, 'subject_id': individual.individual_id, 'sample_id': sample.sample_id, + 'family_guid': family_guid, } discovery_rows = [] for genotypes, parsed_variant in parsed_variants: diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index 11fb5818e0..a99b5ce0c8 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -613,6 +613,7 @@ def test_sample_metadata_export(self, mock_google_authenticated): self.assertEqual(response.status_code, 200) response_json = response.json() self.assertListEqual(list(response_json.keys()), ['rows']) + self.assertEqual(len(response_json['rows']), 16 + len(self.ADDITIONAL_SAMPLES)) expected_samples.update({ 'NA19679', 'NA20870', 'HG00732', 'NA20876', 'NA20874', 'NA20875', 'NA19678', 'NA19675', 'HG00731', 'NA20872', 'NA20881', 'HG00733', @@ -621,6 +622,7 @@ def test_sample_metadata_export(self, mock_google_authenticated): self.assertSetEqual({r['sample_id'] for r in response_json['rows']}, expected_samples) test_row = next(r for r in response_json['rows'] if r['sample_id'] == 'NA20889') self.assertDictEqual(EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW, test_row) + self.assertEqual(len([r['subject_id'] for r in response_json['rows'] if r['subject_id'] == 'NA20888']), 2) self.check_no_analyst_no_access(url) @@ -687,13 +689,13 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat 'The following tables are required in the data model but absent from the reports: subject', 'The following columns are included in the "participant" table but are missing from the data model: age_at_last_observation, ancestry_detail, pmid_id, proband_relationship_detail, sex_detail, twin_id', 'The following columns are included in the "participant" data model but are missing in the report: ancestry_metadata', - 'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881', - 'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881', - 'The following entries are missing recommended "phenotype_description" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881', - 'The following entries are missing recommended "age_at_enrollment" in the "participant" table: Broad_HG00731, Broad_NA20870, Broad_NA20872, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881', + 'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', + 'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', + 'The following entries are missing recommended "phenotype_description" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', + 'The following entries are missing recommended "age_at_enrollment" in the "participant" table: Broad_HG00731, Broad_NA20870, Broad_NA20872, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', ] + skipped_file_validation_warnings[1:6] + skipped_file_validation_warnings[7:]) self.assertListEqual(response.json()['errors'], [ - 'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881', + 'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', 'The following entries have invalid values for "reported_race" in the "participant" table. Allowed values: Asian, White, Black. Invalid values: Broad_NA19675_1 (Middle Eastern or North African)', 'The following entries are missing required "aligned_dna_short_read_set_id" (from Airtable) in the "aligned_dna_short_read_set" table: NA19675_1', ]) @@ -813,15 +815,15 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat sample_filter = "OR({CollaboratorSampleID}='HG00731',{CollaboratorSampleID}='HG00732',{CollaboratorSampleID}='HG00733'," \ "{CollaboratorSampleID}='NA19675_1',{CollaboratorSampleID}='NA19678',{CollaboratorSampleID}='NA19679'," \ "{CollaboratorSampleID}='NA20870',{CollaboratorSampleID}='NA20872',{CollaboratorSampleID}='NA20874'," \ - "{CollaboratorSampleID}='NA20875',{CollaboratorSampleID}='NA20876',{CollaboratorSampleID}='NA20877'," \ - "{CollaboratorSampleID}='NA20881')" + "{CollaboratorSampleID}='NA20875',{CollaboratorSampleID}='NA20876',{CollaboratorSampleID}='NA20881'," \ + "{CollaboratorSampleID}='NA20888')" sample_fields = ['CollaboratorSampleID', 'SMID', 'CollaboratorSampleID', 'Recontactable'] self._assert_expected_airtable_call(0, sample_filter, sample_fields) secondary_sample_filter = "OR({SeqrCollaboratorSampleID}='HG00731',{SeqrCollaboratorSampleID}='HG00732'," \ "{SeqrCollaboratorSampleID}='HG00733',{SeqrCollaboratorSampleID}='NA19678'," \ "{SeqrCollaboratorSampleID}='NA19679',{SeqrCollaboratorSampleID}='NA20870',{SeqrCollaboratorSampleID}='NA20872'," \ "{SeqrCollaboratorSampleID}='NA20874',{SeqrCollaboratorSampleID}='NA20875',{SeqrCollaboratorSampleID}='NA20876'," \ - "{SeqrCollaboratorSampleID}='NA20877',{SeqrCollaboratorSampleID}='NA20881')" + "{SeqrCollaboratorSampleID}='NA20881',{SeqrCollaboratorSampleID}='NA20888')" sample_fields[0] = 'SeqrCollaboratorSampleID' self._assert_expected_airtable_call(1, secondary_sample_filter, sample_fields) metadata_fields = [