Skip to content

Commit

Permalink
Merge branch 'dev' of https://github.com/broadinstitute/seqr into hail-backend-sort
Browse files Browse the repository at this point in the history
  • Loading branch information
hanars committed Aug 17, 2023
2 parents a0f6372 + 47f8e12 commit 1bed8af
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 35 deletions.
2 changes: 1 addition & 1 deletion hail_search/constants.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
GENOME_VERSION_GRCh38_DISPLAY = 'GRCh38'
GENOME_VERSION_GRCh38 = 'GRCh38'

AFFECTED = 'A'
UNAFFECTED = 'N'
Expand Down
48 changes: 32 additions & 16 deletions hail_search/hail_search_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import os

from hail_search.constants import AFFECTED, UNAFFECTED, AFFECTED_ID, UNAFFECTED_ID, MALE, VARIANT_DATASET, \
VARIANT_KEY_FIELD, GNOMAD_GENOMES_FIELD, XPOS, GENOME_VERSION_GRCh38_DISPLAY, INHERITANCE_FILTERS, \
VARIANT_KEY_FIELD, GNOMAD_GENOMES_FIELD, XPOS, GENOME_VERSION_GRCh38, INHERITANCE_FILTERS, \
ANY_AFFECTED, X_LINKED_RECESSIVE, REF_REF, REF_ALT, COMP_HET_ALT, ALT_ALT, HAS_ALT, HAS_REF, \
ANNOTATION_OVERRIDE_FIELDS, SCREEN_KEY, SPLICE_AI_FIELD, CLINVAR_KEY, HGMD_KEY, CLINVAR_PATH_SIGNIFICANCES, \
CLINVAR_PATH_FILTER, CLINVAR_LIKELY_PATH_FILTER, CLINVAR_PATH_RANGES, HGMD_PATH_RANGES, PATH_FREQ_OVERRIDE_CUTOFF, \
Expand All @@ -28,6 +28,9 @@ def _to_camel_case(snake_case_str):

class BaseHailTableQuery(object):

DATA_TYPE = None
LOADED_GLOBALS = None

GENOTYPE_QUERY_MAP = {
REF_REF: lambda gt: gt.is_hom_ref(),
REF_ALT: lambda gt: gt.is_het(),
Expand All @@ -44,6 +47,7 @@ class BaseHailTableQuery(object):
POPULATION_KEYS = ['AF', 'AC', 'AN', 'Hom', 'Hemi', 'Het']
PREDICTION_FIELDS_CONFIG = {}

GENOME_VERSIONS = [GENOME_VERSION_GRCh38]
GLOBALS = ['enums']
CORE_FIELDS = [XPOS]
BASE_ANNOTATION_FIELDS = {
Expand All @@ -60,6 +64,14 @@ class BaseHailTableQuery(object):
XPOS: lambda r: [r.xpos],
}

@classmethod
def load_globals(cls):
    """Cache the annotation table globals for every supported genome version.

    For each entry in ``GENOME_VERSIONS`` the ``annotations.ht`` table for this
    class's data type is read, the fields named in ``GLOBALS`` are selected from
    its globals and evaluated, and the resulting values are stored in
    ``LOADED_GLOBALS`` keyed by genome version.
    """
    # Replace any previously loaded values with a fresh dict before filling it
    loaded = cls.LOADED_GLOBALS = {}
    for version in cls.GENOME_VERSIONS:
        annotations_path = cls._get_generic_table_path(version, 'annotations.ht')
        selected_globals = hl.read_table(annotations_path).globals.select(*cls.GLOBALS)
        evaluated = hl.eval(selected_globals)
        loaded[version] = {field: evaluated[field] for field in cls.GLOBALS}

@classmethod
def _format_population_config(cls, pop_config):
base_pop_config = {field.lower(): field for field in cls.POPULATION_KEYS}
Expand Down Expand Up @@ -98,7 +110,7 @@ def annotation_fields(self):
for k, enum_config in self.ENUM_ANNOTATION_FIELDS.items()
})

if self._genome_version == GENOME_VERSION_GRCh38_DISPLAY:
if self._genome_version == GENOME_VERSION_GRCh38:
annotation_fields.update(self.LIFTOVER_ANNOTATION_FIELDS)
return annotation_fields

Expand Down Expand Up @@ -148,16 +160,13 @@ def _enum_field(value, enum, ht_globals=None, annotate_value=None, format_value=

return value

def __init__(self, data_type, sample_data, genome_version, sort=XPOS, sort_metadata=None, num_results=100, inheritance_mode=None, **kwargs):
def __init__(self, sample_data, genome_version, sort=XPOS, sort_metadata=None, num_results=100, inheritance_mode=None, **kwargs):
self._genome_version = genome_version
self._sort = sort
self._sort_metadata = sort_metadata
self._num_results = num_results
self._data_type = data_type
self._ht = None
self._comp_het_ht = None
self._enums = None
self._globals = None
self._inheritance_mode = inheritance_mode

self._load_filtered_table(sample_data, inheritance_mode=inheritance_mode, **kwargs)
Expand All @@ -170,6 +179,14 @@ def _is_recessive_search(self):
def _has_comp_het_search(self):
return self._inheritance_mode in {RECESSIVE, COMPOUND_HET}

@property
def _globals(self):
    """Globals stored in ``LOADED_GLOBALS`` for this query's genome version."""
    loaded_by_version = self.LOADED_GLOBALS
    return loaded_by_version[self._genome_version]

@property
def _enums(self):
    """The ``'enums'`` entry of the globals for this query's genome version."""
    loaded_globals = self._globals
    return loaded_globals['enums']

def _load_filtered_table(self, sample_data, intervals=None, exclude_intervals=False, variant_ids=None, **kwargs):
parsed_intervals, variant_ids = self._parse_intervals(intervals, variant_ids)
excluded_intervals = None
Expand All @@ -189,8 +206,12 @@ def _load_filtered_table(self, sample_data, intervals=None, exclude_intervals=Fa
else:
self._ht = None

@classmethod
def _get_generic_table_path(cls, genome_version, path):
    """Build the path of a table for this class's data type.

    The result is ``DATASETS_DIR/<genome_version>/<DATA_TYPE>/<path>``; no
    instance state is needed, so it can be used before any query is created.
    """
    data_type_dir = f'{DATASETS_DIR}/{genome_version}/{cls.DATA_TYPE}'
    return f'{data_type_dir}/{path}'

def _get_table_path(self, path):
    """Return the location of ``path`` for this query's genome version.

    Delegates to ``_get_generic_table_path`` so the path layout is defined in
    exactly one place and is derived from the class-level ``DATA_TYPE``.
    """
    return self._get_generic_table_path(self._genome_version, path)

def _read_table(self, path):
    """Read the hail table at ``path`` (resolved through ``_get_table_path``).

    ``_load_table_kwargs`` is forwarded to ``hl.read_table`` as keyword arguments.
    """
    table_path = self._get_table_path(path)
    return hl.read_table(table_path, **self._load_table_kwargs)
Expand All @@ -202,7 +223,7 @@ def import_filtered_table(self, sample_data, intervals=None, **kwargs):
family_samples[s['family_guid']].append(s)
project_samples[s['project_guid']].append(s)

logger.info(f'Loading {self._data_type} data for {len(family_samples)} families in {len(project_samples)} projects')
logger.info(f'Loading {self.DATA_TYPE} data for {len(family_samples)} families in {len(project_samples)} projects')
if len(family_samples) == 1:
family_guid, family_sample_data = list(family_samples.items())[0]
family_ht = self._read_table(f'families/{family_guid}.ht')
Expand Down Expand Up @@ -247,11 +268,6 @@ def import_filtered_table(self, sample_data, intervals=None, **kwargs):
annotations_ht_path, families_ht.key).first().drop(*families_ht.key)
self._ht = families_ht.annotate(**annotation_ht_query_result)

# Get globals
annotation_globals_ht = hl.read_table(annotations_ht_path).head(0).select()
self._globals = {k: hl.eval(annotation_globals_ht[k]) for k in self.GLOBALS}
self._enums = self._globals.pop('enums')

self._filter_annotated_table(**kwargs)

def _filter_entries_table(self, ht, sample_data, inheritance_mode=None, inheritance_filter=None, quality_filter=None,
Expand Down Expand Up @@ -733,6 +749,8 @@ def _gene_rank_sort(cls, r, gene_ranks):

class VariantHailTableQuery(BaseHailTableQuery):

DATA_TYPE = VARIANT_DATASET

GENOTYPE_FIELDS = {f.lower(): f for f in ['DP', 'GQ', 'AB']}
QUALITY_FILTER_FORMAT = {
'AB': QualityFilterFormat(override=lambda gt: ~gt.GT.is_het(), scale=100),
Expand Down Expand Up @@ -1001,6 +1019,4 @@ def _gene_rank_sort(cls, r, gene_ranks):
]


# Maps each data type to the query class that handles it. The key is taken from
# the class's own DATA_TYPE so the mapping cannot drift from the class definition.
QUERY_CLASS_MAP = {cls.DATA_TYPE: cls for cls in [VariantHailTableQuery]}
9 changes: 7 additions & 2 deletions hail_search/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,18 @@

def search_hail_backend(request):
    """Run a hail search for the single data type present in the request.

    ``sample_data`` and ``genome_version`` are popped from ``request`` (the dict
    is mutated); every remaining entry is forwarded to the query class as a
    keyword argument.

    :param request: dict with a required ``genome_version`` key, an optional
        ``sample_data`` dict keyed by data type, and any other search options.
    :return: whatever the query's ``search()`` returns.
    :raises KeyError: if ``genome_version`` is absent. When ``sample_data`` does
        not hold exactly one data type the lookup key is ``None``, which also
        raises ``KeyError`` unless ``None`` is itself a key.
    """
    sample_data = request.pop('sample_data', {})
    genome_version = request.pop('genome_version')

    data_types = list(sample_data.keys())
    single_data_type = data_types[0] if len(data_types) == 1 else None

    sample_data = sample_data[single_data_type]
    query_cls = QUERY_CLASS_MAP[single_data_type]

    # The data type is carried by the query class itself (DATA_TYPE), so it is
    # not passed to the constructor.
    query = query_cls(sample_data, genome_version, **request)
    return query.search()


def load_globals():
    """Call ``load_globals()`` on every query class registered in ``QUERY_CLASS_MAP``."""
    for query_cls in QUERY_CLASS_MAP.values():
        query_cls.load_globals()
3 changes: 2 additions & 1 deletion hail_search/web_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import json
import hail as hl

from hail_search.search import search_hail_backend
from hail_search.search import search_hail_backend, load_globals


def _hl_json_default(o):
Expand All @@ -29,4 +29,5 @@ def init_web_app():
web.get('/status', status),
web.post('/search', search),
])
load_globals()
return app
2 changes: 1 addition & 1 deletion seqr/fixtures/1kg_project.json
Original file line number Diff line number Diff line change
Expand Up @@ -695,7 +695,7 @@
"created_by": null,
"last_modified_date": "2017-03-13T09:07:50.158Z",
"family": 8,
"individual_id": "NA20877",
"individual_id": "NA20888",
"mother_id": null,
"father_id": null,
"sex": "M",
Expand Down
14 changes: 8 additions & 6 deletions seqr/views/apis/report_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,9 +205,9 @@ def sample_metadata_export(request, project_guid):
)
family_rows_by_id = {row['family_id']: row for row in family_rows}

rows_by_subject_id = {row['subject_id']: row for row in subject_rows}
rows_by_subject_family_id = {(row['subject_id'], row['family_guid']): row for row in subject_rows}
for row in sample_rows:
rows_by_subject_id[row['subject_id']].update(row)
rows_by_subject_family_id[(row['subject_id'], row['family_guid'])].update(row)

for rows in discovery_rows:
for i, row in enumerate(rows):
Expand All @@ -216,9 +216,9 @@ def sample_metadata_export(request, project_guid):
parsed_row.update({
'{}-{}'.format(k, i + 1): row[k] for k in DISCOVERY_TABLE_METADATA_VARIANT_COLUMNS if row.get(k)
})
rows_by_subject_id[row['subject_id']].update(parsed_row)
rows_by_subject_family_id[(row['subject_id'], row['family_guid'])].update(parsed_row)

rows = list(rows_by_subject_id.values())
rows = list(rows_by_subject_family_id.values())
all_features = set()
for row in rows:
row.update(family_rows_by_id[row['family_id']])
Expand Down Expand Up @@ -349,9 +349,10 @@ def _parse_anvil_metadata(individual_samples, user, include_collaborator=False,
subject_rows.append(subject_row)

sample_row = _get_sample_row(sample, has_dbgap_submission, airtable_metadata)
sample_row['family_guid'] = family_subject_row['family_guid']
sample_rows.append(sample_row)

discovery_row = _get_discovery_rows(sample, parsed_variants, male_individual_guids)
discovery_row = _get_discovery_rows(sample, parsed_variants, male_individual_guids, family_subject_row['family_guid'])
discovery_rows.append(discovery_row)

return subject_rows, sample_rows, family_rows, discovery_rows
Expand Down Expand Up @@ -561,12 +562,13 @@ def _get_sample_row(sample, has_dbgap_submission, airtable_metadata):
sample_row['dbgap_sample_id'] = airtable_metadata.get('dbgap_sample_id', '')
return sample_row

def _get_discovery_rows(sample, parsed_variants, male_individual_guids):
def _get_discovery_rows(sample, parsed_variants, male_individual_guids, family_guid):
individual = sample.individual
discovery_row = {
'entity:discovery_id': individual.individual_id,
'subject_id': individual.individual_id,
'sample_id': sample.sample_id,
'family_guid': family_guid,
}
discovery_rows = []
for genotypes, parsed_variant in parsed_variants:
Expand Down
18 changes: 10 additions & 8 deletions seqr/views/apis/report_api_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,7 @@ def test_sample_metadata_export(self, mock_google_authenticated):
self.assertEqual(response.status_code, 200)
response_json = response.json()
self.assertListEqual(list(response_json.keys()), ['rows'])
self.assertEqual(len(response_json['rows']), 16 + len(self.ADDITIONAL_SAMPLES))
expected_samples.update({
'NA19679', 'NA20870', 'HG00732', 'NA20876', 'NA20874', 'NA20875', 'NA19678', 'NA19675', 'HG00731',
'NA20872', 'NA20881', 'HG00733',
Expand All @@ -621,6 +622,7 @@ def test_sample_metadata_export(self, mock_google_authenticated):
self.assertSetEqual({r['sample_id'] for r in response_json['rows']}, expected_samples)
test_row = next(r for r in response_json['rows'] if r['sample_id'] == 'NA20889')
self.assertDictEqual(EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW, test_row)
self.assertEqual(len([r['subject_id'] for r in response_json['rows'] if r['subject_id'] == 'NA20888']), 2)

self.check_no_analyst_no_access(url)

Expand Down Expand Up @@ -687,13 +689,13 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
'The following tables are required in the data model but absent from the reports: subject',
'The following columns are included in the "participant" table but are missing from the data model: age_at_last_observation, ancestry_detail, pmid_id, proband_relationship_detail, sex_detail, twin_id',
'The following columns are included in the "participant" data model but are missing in the report: ancestry_metadata',
'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881',
'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881',
'The following entries are missing recommended "phenotype_description" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881',
'The following entries are missing recommended "age_at_enrollment" in the "participant" table: Broad_HG00731, Broad_NA20870, Broad_NA20872, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881',
'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
'The following entries are missing recommended "phenotype_description" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
'The following entries are missing recommended "age_at_enrollment" in the "participant" table: Broad_HG00731, Broad_NA20870, Broad_NA20872, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
] + skipped_file_validation_warnings[1:6] + skipped_file_validation_warnings[7:])
self.assertListEqual(response.json()['errors'], [
'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881',
'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
'The following entries have invalid values for "reported_race" in the "participant" table. Allowed values: Asian, White, Black. Invalid values: Broad_NA19675_1 (Middle Eastern or North African)',
'The following entries are missing required "aligned_dna_short_read_set_id" (from Airtable) in the "aligned_dna_short_read_set" table: NA19675_1',
])
Expand Down Expand Up @@ -813,15 +815,15 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
sample_filter = "OR({CollaboratorSampleID}='HG00731',{CollaboratorSampleID}='HG00732',{CollaboratorSampleID}='HG00733'," \
"{CollaboratorSampleID}='NA19675_1',{CollaboratorSampleID}='NA19678',{CollaboratorSampleID}='NA19679'," \
"{CollaboratorSampleID}='NA20870',{CollaboratorSampleID}='NA20872',{CollaboratorSampleID}='NA20874'," \
"{CollaboratorSampleID}='NA20875',{CollaboratorSampleID}='NA20876',{CollaboratorSampleID}='NA20877'," \
"{CollaboratorSampleID}='NA20881')"
"{CollaboratorSampleID}='NA20875',{CollaboratorSampleID}='NA20876',{CollaboratorSampleID}='NA20881'," \
"{CollaboratorSampleID}='NA20888')"
sample_fields = ['CollaboratorSampleID', 'SMID', 'CollaboratorSampleID', 'Recontactable']
self._assert_expected_airtable_call(0, sample_filter, sample_fields)
secondary_sample_filter = "OR({SeqrCollaboratorSampleID}='HG00731',{SeqrCollaboratorSampleID}='HG00732'," \
"{SeqrCollaboratorSampleID}='HG00733',{SeqrCollaboratorSampleID}='NA19678'," \
"{SeqrCollaboratorSampleID}='NA19679',{SeqrCollaboratorSampleID}='NA20870',{SeqrCollaboratorSampleID}='NA20872'," \
"{SeqrCollaboratorSampleID}='NA20874',{SeqrCollaboratorSampleID}='NA20875',{SeqrCollaboratorSampleID}='NA20876'," \
"{SeqrCollaboratorSampleID}='NA20877',{SeqrCollaboratorSampleID}='NA20881')"
"{SeqrCollaboratorSampleID}='NA20881',{SeqrCollaboratorSampleID}='NA20888')"
sample_fields[0] = 'SeqrCollaboratorSampleID'
self._assert_expected_airtable_call(1, secondary_sample_filter, sample_fields)
metadata_fields = [
Expand Down

0 comments on commit 1bed8af

Please sign in to comment.