diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 3cb64f3f27..e4f24038f4 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -75,6 +75,7 @@ jobs: pip install -r hail_search/requirements-test.txt - name: Run coverage tests run: | + export DATASETS_DIR=./hail_search/fixtures coverage run --source="./hail_search" --omit="./hail_search/__main__.py","./hail_search/test_utils.py" -m pytest hail_search/ coverage report --fail-under=99 diff --git a/hail_search/__main__.py b/hail_search/__main__.py index 6b08495110..b860206e3e 100644 --- a/hail_search/__main__.py +++ b/hail_search/__main__.py @@ -1,9 +1,13 @@ from aiohttp import web +import hail as hl +import logging from hail_search.web_app import init_web_app def run(): + logging.basicConfig(level=logging.INFO) + hl.init() app = init_web_app() web.run_app( app, diff --git a/hail_search/constants.py b/hail_search/constants.py new file mode 100644 index 0000000000..3e46d40789 --- /dev/null +++ b/hail_search/constants.py @@ -0,0 +1,13 @@ +GENOME_VERSION_GRCh38_DISPLAY = 'GRCh38' + +AFFECTED = 'A' +UNAFFECTED = 'N' +AFFECTED_ID = 0 +UNAFFECTED_ID = 1 + +VARIANT_DATASET = 'VARIANTS' + +VARIANT_KEY_FIELD = 'variantId' +GNOMAD_GENOMES_FIELD = 'gnomad_genomes' + +XPOS = 'xpos' diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.README.txt.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.README.txt.crc new file mode 100644 index 0000000000..37305974fb Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.README.txt.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/._SUCCESS.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/._SUCCESS.crc new file mode 100644 index 0000000000..3b7b044936 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/._SUCCESS.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.metadata.json.gz.crc new file mode 100644 index 0000000000..f6bc82dd1e Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/.metadata.json.gz.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/README.txt b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/README.txt new file mode 100644 index 0000000000..f0eed4785c --- /dev/null +++ b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.109-b71b065e4bb6 + Created at 2023/07/26 13:13:09 \ No newline at end of file diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/_SUCCESS b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/_SUCCESS new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/globals/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/globals/.metadata.json.gz.crc new file mode 100644 index 0000000000..0813711bd3 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/globals/.metadata.json.gz.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/globals/metadata.json.gz b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/globals/metadata.json.gz new file mode 100644 index 0000000000..95c3a4b4e1 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/globals/metadata.json.gz differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/globals/parts/.part-0.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/globals/parts/.part-0.crc new file mode 100644 index 0000000000..739b0e6fcf Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/globals/parts/.part-0.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/globals/parts/part-0 b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/globals/parts/part-0 new file mode 100644 index 0000000000..a38cc3b309 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/globals/parts/part-0 differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/.index.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/.index.crc new file mode 100644 index 0000000000..2842a6945b Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/.index.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/.metadata.json.gz.crc new file mode 100644 index 0000000000..c74eb2d84a Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/.metadata.json.gz.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/index b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/index new file mode 100644 index 0000000000..34465e4594 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/index differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/metadata.json.gz b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/metadata.json.gz new file mode 100644 index 0000000000..aaf21fbc34 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/index/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.idx/metadata.json.gz differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/metadata.json.gz b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/metadata.json.gz new file mode 100644 index 0000000000..5ce75d799d Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/metadata.json.gz differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/.metadata.json.gz.crc new file mode 100644 index 0000000000..5a38c8be75 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/.metadata.json.gz.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/metadata.json.gz b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/metadata.json.gz new file mode 100644 index 0000000000..b2dab47c9a Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/metadata.json.gz differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/.part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.crc b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/.part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.crc new file mode 100644 index 0000000000..ac1eed0fad Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/.part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af new file mode 100644 index 0000000000..f050aad17a Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/annotations.ht/rows/parts/part-0-67-0-0-264cdfc4-7193-8607-b752-e11ee66166af differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/.README.txt.crc b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/.README.txt.crc new file mode 100644 index 0000000000..365d81da95 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/.README.txt.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/._SUCCESS.crc b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/._SUCCESS.crc new file mode 100644 index 0000000000..3b7b044936 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/._SUCCESS.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/.metadata.json.gz.crc new file mode 100644 index 0000000000..821761ad25 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/.metadata.json.gz.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/README.txt b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/README.txt new file mode 100644 index 0000000000..8400b300c2 --- /dev/null +++ b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.109-b71b065e4bb6 + Created at 2023/07/26 12:54:13 \ No newline at end of file diff --git a/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/_SUCCESS b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/_SUCCESS new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/globals/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/globals/.metadata.json.gz.crc new file mode 100644 index 0000000000..7089d13963 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/globals/.metadata.json.gz.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/globals/metadata.json.gz b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/globals/metadata.json.gz new file mode 100644 index 0000000000..fda5b36409 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/globals/metadata.json.gz differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/globals/parts/.part-0.crc b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/globals/parts/.part-0.crc new file mode 100644 index 0000000000..122d211e11 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/globals/parts/.part-0.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/globals/parts/part-0 b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/globals/parts/part-0 new file mode 100644 index 0000000000..31c6c7edb0 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/globals/parts/part-0 differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/index/part-0-cd2c14fe-2732-4c18-b1c4-bdcf5e6c460d.idx/.index.crc b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/index/part-0-cd2c14fe-2732-4c18-b1c4-bdcf5e6c460d.idx/.index.crc new file mode 100644 index 0000000000..49ce5d737d Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/index/part-0-cd2c14fe-2732-4c18-b1c4-bdcf5e6c460d.idx/.index.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/index/part-0-cd2c14fe-2732-4c18-b1c4-bdcf5e6c460d.idx/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/index/part-0-cd2c14fe-2732-4c18-b1c4-bdcf5e6c460d.idx/.metadata.json.gz.crc new file mode 100644 index 0000000000..be00f8beab Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/index/part-0-cd2c14fe-2732-4c18-b1c4-bdcf5e6c460d.idx/.metadata.json.gz.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/index/part-0-cd2c14fe-2732-4c18-b1c4-bdcf5e6c460d.idx/index b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/index/part-0-cd2c14fe-2732-4c18-b1c4-bdcf5e6c460d.idx/index new file mode 100644 index 0000000000..58a170c632 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/index/part-0-cd2c14fe-2732-4c18-b1c4-bdcf5e6c460d.idx/index differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/index/part-0-cd2c14fe-2732-4c18-b1c4-bdcf5e6c460d.idx/metadata.json.gz b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/index/part-0-cd2c14fe-2732-4c18-b1c4-bdcf5e6c460d.idx/metadata.json.gz new file mode 100644 index 0000000000..c51e268533 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/index/part-0-cd2c14fe-2732-4c18-b1c4-bdcf5e6c460d.idx/metadata.json.gz differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/metadata.json.gz b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/metadata.json.gz new file mode 100644 index 0000000000..e3e9216946 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/metadata.json.gz differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/rows/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/rows/.metadata.json.gz.crc new file mode 100644 index 0000000000..9a1c4c4c1d Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/rows/.metadata.json.gz.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/rows/metadata.json.gz b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/rows/metadata.json.gz new file mode 100644 index 0000000000..1d71448943 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/rows/metadata.json.gz differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/rows/parts/.part-0-cd2c14fe-2732-4c18-b1c4-bdcf5e6c460d.crc b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/rows/parts/.part-0-cd2c14fe-2732-4c18-b1c4-bdcf5e6c460d.crc new file mode 100644 index 0000000000..ccbcc14bd5 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/rows/parts/.part-0-cd2c14fe-2732-4c18-b1c4-bdcf5e6c460d.crc differ diff --git a/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/rows/parts/part-0-cd2c14fe-2732-4c18-b1c4-bdcf5e6c460d b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/rows/parts/part-0-cd2c14fe-2732-4c18-b1c4-bdcf5e6c460d new file mode 100644 index 0000000000..5c79ad05c9 Binary files /dev/null and b/hail_search/fixtures/GRCh38/VARIANTS/families/F000002_2.ht/rows/parts/part-0-cd2c14fe-2732-4c18-b1c4-bdcf5e6c460d differ diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py new file mode 100644 index 0000000000..4ba6b2ccbe --- /dev/null +++ b/hail_search/hail_search_query.py @@ -0,0 +1,318 @@ +from aiohttp.web import HTTPBadRequest +from collections import defaultdict, namedtuple +import hail as hl +import logging +import os + +from hail_search.constants import AFFECTED, UNAFFECTED, AFFECTED_ID, UNAFFECTED_ID, VARIANT_DATASET, VARIANT_KEY_FIELD,\ + GNOMAD_GENOMES_FIELD, XPOS, GENOME_VERSION_GRCh38_DISPLAY + +DATASETS_DIR = os.environ.get('DATASETS_DIR', '/hail_datasets') + +logger = logging.getLogger(__name__) + + +PredictionPath = namedtuple('PredictionPath', ['source', 'field']) + + +def _to_camel_case(snake_case_str): + converted = snake_case_str.replace('_', ' ').title().replace(' ', '') + return converted[0].lower() + converted[1:] + + +class BaseHailTableQuery(object): + + GENOTYPE_FIELDS = {} + POPULATIONS = {} + POPULATION_FIELDS = {} + POPULATION_KEYS = ['AF', 'AC', 'AN', 'Hom', 'Hemi', 'Het'] + PREDICTION_FIELDS_CONFIG = {} + + GLOBALS = ['enums'] + CORE_FIELDS = [XPOS] + BASE_ANNOTATION_FIELDS = { + 'familyGuids': lambda r: r.genotypes.group_by(lambda x: x.familyGuid).keys(), + 'genotypes': lambda r: r.genotypes.group_by(lambda x: x.individualGuid).map_values(lambda x: x[0]), + } + ENUM_ANNOTATION_FIELDS = {} + LIFTOVER_ANNOTATION_FIELDS = { + 'liftedOverGenomeVersion': lambda r: hl.or_missing(hl.is_defined(r.rg37_locus), '37'), + 'liftedOverChrom': lambda r: hl.or_missing(hl.is_defined(r.rg37_locus), r.rg37_locus.contig), + 'liftedOverPos': lambda r: hl.or_missing(hl.is_defined(r.rg37_locus), r.rg37_locus.position), + } + + SORTS = { + XPOS: lambda r: [r.xpos], + } + + @classmethod + def _format_population_config(cls, pop_config): + base_pop_config = {field.lower(): field for field in cls.POPULATION_KEYS} + base_pop_config.update(pop_config) + return base_pop_config + + @property + def annotation_fields(self): + ht_globals = {k: hl.eval(self._ht[k]) for k in self.GLOBALS} + enums = ht_globals.pop('enums') + + annotation_fields = { + 'populations': lambda r: hl.struct(**{ + population: self.population_expression(r, population) for population in self.POPULATIONS.keys() + }), + 'predictions': lambda r: hl.struct(**{ + prediction: hl.array(enums[path.source][path.field])[r[path.source][f'{path.field}_id']] + if enums.get(path.source, {}).get(path.field) else r[path.source][path.field] + for prediction, path in self.PREDICTION_FIELDS_CONFIG.items() + }), + 'transcripts': lambda r: hl.or_else( + r.sorted_transcript_consequences, hl.empty_array(r.sorted_transcript_consequences.dtype.element_type) + ).map( + lambda t: self._enum_field(t, enums['sorted_transcript_consequences'], **self._format_transcript_args()) + ).group_by(lambda t: t.geneId), + } + annotation_fields.update(self.BASE_ANNOTATION_FIELDS) + + format_enum = lambda k, enum_config: lambda r: self._enum_field(r[k], enums[k], ht_globals=ht_globals, **enum_config) + annotation_fields.update({ + enum_config.get('response_key', k): format_enum(k, enum_config) + for k, enum_config in self.ENUM_ANNOTATION_FIELDS.items() + }) + + if self._genome_version == GENOME_VERSION_GRCh38_DISPLAY: + annotation_fields.update(self.LIFTOVER_ANNOTATION_FIELDS) + return annotation_fields + + def population_expression(self, r, population): + pop_config = self._format_population_config(self.POPULATIONS[population]) + pop_field = self.POPULATION_FIELDS.get(population, population) + return hl.struct(**{ + response_key: hl.or_else(r[pop_field][field], '' if response_key == 'id' else 0) + for response_key, field in pop_config.items() if field is not None + }) + + def _format_transcript_args(self): + return { + 'format_value': lambda value: value.rename({k: _to_camel_case(k) for k in value.keys()}), + } + + @staticmethod + def _enum_field(value, enum, ht_globals=None, annotate_value=None, format_value=None, drop_fields=None, **kwargs): + annotations = {} + drop = [] + (drop_fields or []) + value_keys = value.keys() + for field, field_enum in enum.items(): + is_array = f'{field}_ids' in value_keys + value_field = f"{field}_id{'s' if is_array else ''}" + drop.append(value_field) + + enum_array = hl.array(field_enum) + if is_array: + annotations[f'{field}s'] = value[value_field].map(lambda v: enum_array[v]) + else: + annotations[field] = enum_array[value[value_field]] + + value = value.annotate(**annotations) + if annotate_value: + annotations = annotate_value(value, enum, ht_globals) + value = value.annotate(**annotations) + value = value.drop(*drop) + + if format_value: + value = format_value(value) + + return value + + def __init__(self, data_type, sample_data, genome_version, sort=XPOS, num_results=100, **kwargs): + self._genome_version = genome_version + self._sort = sort + self._num_results = num_results + + self._load_filtered_table(data_type, sample_data, **kwargs) + + def _load_filtered_table(self, data_type, sample_data, **kwargs): + self._ht = self.import_filtered_table(data_type, sample_data, **kwargs) + + def import_filtered_table(self, data_type, sample_data, **kwargs): + tables_path = f'{DATASETS_DIR}/{self._genome_version}/{data_type}' + + family_samples = defaultdict(list) + project_samples = defaultdict(list) + for s in sample_data: + family_samples[s['family_guid']].append(s) + project_samples[s['project_guid']].append(s) + + logger.info(f'Loading {data_type} data for {len(family_samples)} families in {len(project_samples)} projects') + if len(family_samples) == 1: + family_guid, family_sample_data = list(family_samples.items())[0] + family_ht = hl.read_table(f'{tables_path}/families/{family_guid}.ht') + families_ht = self._add_entry_sample_families(family_ht, family_sample_data) + families_ht = families_ht.select_globals() + + annotations_ht_path = f'{tables_path}/annotations.ht' + annotation_ht_query_result = hl.query_table( + annotations_ht_path, families_ht.key).first().drop(*families_ht.key) + ht = families_ht.annotate(**annotation_ht_query_result) + # Add globals + ht = ht.join(hl.read_table(annotations_ht_path).head(0).select().select_globals(*self.GLOBALS), how='left') + + ht = ht.transmute( + genotypes=ht.family_entries.flatmap(lambda x: x).filter( + lambda gt: hl.is_defined(gt.individualGuid) + ).map(lambda gt: gt.select( + 'sampleId', 'individualGuid', 'familyGuid', + numAlt=hl.if_else(hl.is_defined(gt.GT), gt.GT.n_alt_alleles(), -1), + **{k: gt[field] for k, field in self.GENOTYPE_FIELDS.items()} + )) + ) + + return ht + + @classmethod + def _add_entry_sample_families(cls, ht, sample_data): + sample_index_id_map = dict(enumerate(hl.eval(ht.sample_ids))) + sample_id_index_map = {v: k for k, v in sample_index_id_map.items()} + sample_index_id_map = hl.dict(sample_index_id_map) + sample_individual_map = {s['sample_id']: s['individual_guid'] for s in sample_data} + missing_samples = set(sample_individual_map.keys()) - set(sample_id_index_map.keys()) + if missing_samples: + raise HTTPBadRequest( + text=f'The following samples are available in seqr but missing the loaded data: {", ".join(sorted(missing_samples))}' + ) + + affected_id_map = {AFFECTED: AFFECTED_ID, UNAFFECTED: UNAFFECTED_ID} + sample_index_affected_status = hl.dict({ + sample_id_index_map[s['sample_id']]: affected_id_map.get(s['affected']) for s in sample_data + }) + sample_index_individual_map = hl.dict({ + sample_id_index_map[sample_id]: i_guid for sample_id, i_guid in sample_individual_map.items() + }) + sample_id_family_map = {s['sample_id']: s['family_guid'] for s in sample_data} + sample_index_family_map = hl.dict({sample_id_index_map[k]: v for k, v in sample_id_family_map.items()}) + family_index_map = {f: i for i, f in enumerate(sorted(set(sample_id_family_map.values())))} + family_sample_indices = [None] * len(family_index_map) + for sample_id, family_guid in sample_id_family_map.items(): + sample_index = sample_id_index_map[sample_id] + family_index = family_index_map[family_guid] + if not family_sample_indices[family_index]: + family_sample_indices[family_index] = [] + family_sample_indices[family_index].append(sample_index) + family_sample_indices = hl.array(family_sample_indices) + + ht = ht.transmute( + family_entries=family_sample_indices.map(lambda sample_indices: sample_indices.map( + lambda i: hl.or_else(ht.entries[i], cls._missing_entry(ht.entries[i])).annotate( + sampleId=sample_index_id_map.get(i), + individualGuid=sample_index_individual_map.get(i), + familyGuid=sample_index_family_map.get(i), + affected_id=sample_index_affected_status.get(i), + ) + )) + ) + + return ht + + @staticmethod + def _missing_entry(entry): + entry_type = dict(**entry.dtype) + return hl.struct(**{k: hl.missing(v) for k, v in entry_type.items()}) + + def _format_results(self, ht): + annotations = {k: v(ht) for k, v in self.annotation_fields.items()} + annotations.update({ + '_sort': self._sort_order(ht), + 'genomeVersion': self._genome_version.replace('GRCh', ''), + }) + results = ht.annotate(**annotations) + return results.select(*self.CORE_FIELDS, *list(annotations.keys())) + + def search(self): + ht = self._format_results(self._ht) + + (total_results, collected) = ht.aggregate((hl.agg.count(), hl.agg.take(ht.row, self._num_results, ordering=ht._sort))) + logger.info(f'Total hits: {total_results}. Fetched: {self._num_results}') + + return collected, total_results + + def _sort_order(self, ht): + sort_expressions = self._get_sort_expressions(ht, XPOS) + if self._sort != XPOS: + sort_expressions = self._get_sort_expressions(ht, self._sort) + sort_expressions + return sort_expressions + + def _get_sort_expressions(self, ht, sort): + return self.SORTS[sort](ht) + + +class VariantHailTableQuery(BaseHailTableQuery): + + GENOTYPE_FIELDS = {f.lower(): f for f in ['DP', 'GQ', 'AB']} + POPULATIONS = { + 'seqr': {'hom': 'hom', 'hemi': None, 'het': None}, + 'topmed': {'hemi': None}, + 'exac': { + 'filter_af': 'AF_POPMAX', 'ac': 'AC_Adj', 'an': 'AN_Adj', 'hom': 'AC_Hom', 'hemi': 'AC_Hemi', + 'het': 'AC_Het', + }, + 'gnomad_exomes': {'filter_af': 'AF_POPMAX_OR_GLOBAL', 'het': None}, + GNOMAD_GENOMES_FIELD: {'filter_af': 'AF_POPMAX_OR_GLOBAL', 'het': None}, + } + POPULATION_FIELDS = {'seqr': 'gt_stats'} + PREDICTION_FIELDS_CONFIG = { + 'cadd': PredictionPath('cadd', 'PHRED'), + 'eigen': PredictionPath('eigen', 'Eigen_phred'), + 'fathmm': PredictionPath('dbnsfp', 'fathmm_MKL_coding_pred'), + 'gnomad_noncoding': PredictionPath('gnomad_non_coding_constraint', 'z_score'), + 'mpc': PredictionPath('mpc', 'MPC'), + 'mut_pred': PredictionPath('dbnsfp', 'MutPred_score'), + 'primate_ai': PredictionPath('primate_ai', 'score'), + 'splice_ai': PredictionPath('splice_ai', 'delta_score'), + 'splice_ai_consequence': PredictionPath('splice_ai', 'splice_consequence'), + 'vest': PredictionPath('dbnsfp', 'VEST4_score'), + 'mut_taster': PredictionPath('dbnsfp', 'MutationTaster_pred'), + 'polyphen': PredictionPath('dbnsfp', 'Polyphen2_HVAR_pred'), + 'revel': PredictionPath('dbnsfp', 'REVEL_score'), + 'sift': PredictionPath('dbnsfp', 'SIFT_pred'), + } + + GLOBALS = BaseHailTableQuery.GLOBALS + ['versions'] + CORE_FIELDS = BaseHailTableQuery.CORE_FIELDS + ['rsid'] + BASE_ANNOTATION_FIELDS = { + 'chrom': lambda r: r.locus.contig.replace("^chr", ""), + 'pos': lambda r: r.locus.position, + 'ref': lambda r: r.alleles[0], + 'alt': lambda r: r.alleles[1], + 'genotypeFilters': lambda r: hl.str(' ,').join(r.filters), + 'mainTranscriptId': lambda r: r.sorted_transcript_consequences.first().transcript_id, + } + BASE_ANNOTATION_FIELDS.update(BaseHailTableQuery.BASE_ANNOTATION_FIELDS) + ENUM_ANNOTATION_FIELDS = { + 'clinvar': {'annotate_value': lambda value, enum, ht_globals: { + 'conflictingPathogenicities': value.conflictingPathogenicities.map( + lambda p: VariantHailTableQuery._enum_field(p, {k: enum[k] for k in ['pathogenicity']}) + ), + 'version': ht_globals['versions'].clinvar, + }}, + 'hgmd': {}, + 'screen': { + 'response_key': 'screenRegionType', + 'format_value': lambda value: value.region_types.first(), + }, + } + + def import_filtered_table(self, *args, **kwargs): + ht = super(VariantHailTableQuery, self).import_filtered_table(*args, **kwargs) + return ht.key_by(**{VARIANT_KEY_FIELD: ht.variant_id}) + + def _format_transcript_args(self): + args = super(VariantHailTableQuery, self)._format_transcript_args() + args.update({ + 'annotate_value': lambda transcript, *args: {'major_consequence': transcript.consequence_terms.first()}, + 'drop_fields': ['consequence_terms'], + }) + return args + + +QUERY_CLASS_MAP = { + VARIANT_DATASET: VariantHailTableQuery, +} diff --git a/hail_search/search.py b/hail_search/search.py new file mode 100644 index 0000000000..716aae5e7b --- /dev/null +++ b/hail_search/search.py @@ -0,0 +1,15 @@ +from hail_search.hail_search_query import QUERY_CLASS_MAP + + +def search_hail_backend(request): + sample_data = request.pop('sample_data', {}) + + data_types = list(sample_data.keys()) + single_data_type = data_types[0] if len(data_types) == 1 else None + + sample_data = sample_data[single_data_type] + data_type = single_data_type + query_cls = QUERY_CLASS_MAP[single_data_type] + + query = query_cls(data_type, sample_data=sample_data, **request) + return query.search() diff --git a/hail_search/test_search.py b/hail_search/test_search.py index 1daad2bd33..ab105820bb 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -1,5 +1,7 @@ from aiohttp.test_utils import AioHTTPTestCase +from hail_search.test_utils import get_hail_search_body, FAMILY_2_VARIANT_SAMPLE_DATA, FAMILY_2_MISSING_SAMPLE_DATA, \ + VARIANT1, VARIANT2, VARIANT3, VARIANT4 from hail_search.web_app import init_web_app @@ -13,3 +15,23 @@ async def test_status(self): self.assertEqual(resp.status, 200) resp_json = await resp.json() self.assertDictEqual(resp_json, {'success': True}) + + async def test_search(self): + search_body = get_hail_search_body(sample_data=FAMILY_2_VARIANT_SAMPLE_DATA) + async with self.client.request('POST', '/search', json=search_body) as resp: + self.assertEqual(resp.status, 200) + resp_json = await resp.json() + self.assertSetEqual(set(resp_json.keys()), {'results', 'total'}) + self.assertEqual(resp_json['total'], 4) + results = [VARIANT1, VARIANT2, VARIANT3, VARIANT4] + self.assertListEqual( + [v['variantId'] for v in resp_json['results']], [v['variantId'] for v in results] + ) + self.assertListEqual(resp_json['results'], results) + + async def test_search_missing_data(self): + search_body = get_hail_search_body(sample_data=FAMILY_2_MISSING_SAMPLE_DATA) + async with self.client.request('POST', '/search', json=search_body) as resp: + self.assertEqual(resp.status, 400) + text = await resp.text() + self.assertEqual(text, 'The following samples are available in seqr but missing the loaded data: NA19675, NA19678') diff --git a/hail_search/test_utils.py b/hail_search/test_utils.py index ed26cd7ed5..b1af16e493 100644 --- a/hail_search/test_utils.py +++ b/hail_search/test_utils.py @@ -5,18 +5,20 @@ 'sample_id': 'NA20870', 'individual_guid': 'I000007_na20870', 'family_guid': 'F000003_3', 'project_guid': 'R0001_1kg', 'affected': 'A', 'sex': 'M', } +FAMILY_2_VARIANT_SAMPLE_DATA = {'VARIANTS': [ + {'sample_id': 'HG00731', 'individual_guid': 'I000004_hg00731', 'family_guid': 'F000002_2', 'project_guid': 'R0001_1kg', 'affected': 'A', 'sex': 'F'}, + {'sample_id': 'HG00732', 'individual_guid': 'I000005_hg00732', 'family_guid': 'F000002_2', 'project_guid': 'R0001_1kg', 'affected': 'N', 'sex': 'M'}, + {'sample_id': 'HG00733', 'individual_guid': 'I000006_hg00733', 'family_guid': 'F000002_2', 'project_guid': 'R0001_1kg', 'affected': 'N', 'sex': 'F'}, +]} EXPECTED_SAMPLE_DATA = { - 'VARIANTS': [ - {'sample_id': 'HG00731', 'individual_guid': 'I000004_hg00731', 'family_guid': 'F000002_2', 'project_guid': 'R0001_1kg', 'affected': 'A', 'sex': 'F'}, - {'sample_id': 'HG00732', 'individual_guid': 'I000005_hg00732', 'family_guid': 'F000002_2', 'project_guid': 'R0001_1kg', 'affected': 'N', 'sex': 'M'}, - {'sample_id': 'HG00733', 'individual_guid': 'I000006_hg00733', 'family_guid': 'F000002_2', 'project_guid': 'R0001_1kg', 'affected': 'N', 'sex': 'F'}, - FAMILY_3_SAMPLE, - ], 'SV_WES': [ + 'SV_WES': [ {'sample_id': 'HG00731', 'individual_guid': 'I000004_hg00731', 'family_guid': 'F000002_2', 'project_guid': 'R0001_1kg', 'affected': 'A', 'sex': 'F'}, {'sample_id': 'HG00732', 'individual_guid': 'I000005_hg00732', 'family_guid': 'F000002_2', 'project_guid': 'R0001_1kg', 'affected': 'N', 'sex': 'M'}, {'sample_id': 'HG00733', 'individual_guid': 'I000006_hg00733', 'family_guid': 'F000002_2', 'project_guid': 'R0001_1kg', 'affected': 'N', 'sex': 'F'} ], } +EXPECTED_SAMPLE_DATA.update(deepcopy(FAMILY_2_VARIANT_SAMPLE_DATA)) +EXPECTED_SAMPLE_DATA['VARIANTS'].append(FAMILY_3_SAMPLE) CUSTOM_AFFECTED_SAMPLE_DATA = {'VARIANTS': deepcopy(EXPECTED_SAMPLE_DATA['VARIANTS'])} CUSTOM_AFFECTED_SAMPLE_DATA['VARIANTS'][0]['affected'] = 'N' CUSTOM_AFFECTED_SAMPLE_DATA['VARIANTS'][1]['affected'] = 'A' @@ -28,16 +30,312 @@ {'sample_id': 'NA19678', 'individual_guid': 'I000002_na19678', 'family_guid': 'F000001_1', 'project_guid': 'R0001_1kg', 'affected': 'N', 'sex': 'M'}, ], } +FAMILY_2_MISSING_SAMPLE_DATA = deepcopy(FAMILY_1_SAMPLE_DATA) +for s in FAMILY_2_MISSING_SAMPLE_DATA['VARIANTS']: + s['family_guid'] = 'F000002_2' -ALL_AFFECTED_SAMPLE_DATA = deepcopy(EXPECTED_SAMPLE_DATA) -ALL_AFFECTED_SAMPLE_DATA['MITO'] = [ +FAMILY_2_MITO_SAMPLE_DATA = {'MITO': [ {'sample_id': 'HG00733', 'individual_guid': 'I000006_hg00733', 'family_guid': 'F000002_2', 'project_guid': 'R0001_1kg', 'affected': 'N', 'sex': 'F'}, -] +]} +FAMILY_2_ALL_SAMPLE_DATA = deepcopy(FAMILY_2_VARIANT_SAMPLE_DATA) +FAMILY_2_ALL_SAMPLE_DATA.update(FAMILY_2_MITO_SAMPLE_DATA) + +ALL_AFFECTED_SAMPLE_DATA = deepcopy(EXPECTED_SAMPLE_DATA) +ALL_AFFECTED_SAMPLE_DATA.update(FAMILY_2_MITO_SAMPLE_DATA) FAMILY_5_SAMPLE = { 'sample_id': 'NA20874', 'individual_guid': 'I000009_na20874', 'family_guid': 'F000005_5', 'project_guid': 'R0001_1kg', 'affected': 'N', 'sex': 'M', } ALL_AFFECTED_SAMPLE_DATA['VARIANTS'].append(FAMILY_5_SAMPLE) +VARIANT1 = { + 'variantId': '1-10439-AC-A', + 'chrom': '1', + 'pos': 10439, + 'ref': 'AC', + 'alt': 'A', + 'genomeVersion': '38', + 'liftedOverGenomeVersion': '37', + 'liftedOverChrom': '1', + 'liftedOverPos': 10439, + 'xpos': 1000010439, + 'rsid': 'rs112766696', + 'familyGuids': ['F000002_2'], + 'genotypes': { + 'I000004_hg00731': { + 'sampleId': 'HG00731', 'individualGuid': 'I000004_hg00731', 'familyGuid': 'F000002_2', + 'numAlt': 1, 'dp': 10, 'gq': 99, 'ab': 0.5, + }, + 'I000005_hg00732': { + 'sampleId': 'HG00732', 'individualGuid': 'I000005_hg00732', 'familyGuid': 'F000002_2', + 'numAlt': 0, 'dp': 24, 'gq': 0, 'ab': 0.0, + }, + 'I000006_hg00733': { + 'sampleId': 'HG00733', 'individualGuid': 'I000006_hg00733', 'familyGuid': 'F000002_2', + 'numAlt': 0, 'dp': 60, 'gq': 20, 'ab': 0.0, + }, + }, + 'genotypeFilters': '', + 'clinvar': None, + 'hgmd': None, + 'screenRegionType': None, + 'populations': { + 'seqr': {'af': 0.10000000149011612, 'ac': 9, 'an': 90, 'hom': 2}, + 'topmed': {'af': 0.0784199982881546, 'ac': 20757, 'an': 264690, 'hom': 0, 'het': 20757}, + 'exac': {'af': 0.0, 'ac': 0, 'an': 0, 'hom': 0, 'hemi': 0, 'het': 0, 'filter_af': 0.0}, + 'gnomad_exomes': {'af': 0.0, 'ac': 0, 'an': 0, 'hom': 0, 'hemi': 0, 'filter_af': 0.0}, + 'gnomad_genomes': {'af': 0.34449315071105957, 'ac': 9271, 'an': 26912, 'hom': 480, 'hemi': 0, 'filter_af': 0.40276646614074707}, + }, + 'predictions': { + 'cadd': 4.668000221252441, + 'eigen': None, + 'fathmm': None, + 'gnomad_noncoding': None, + 'mpc': None, + 'mut_pred': None, + 'primate_ai': None, + 'splice_ai': None, + 'splice_ai_consequence': None, + 'vest': None, + 'mut_taster': None, + 'polyphen': None, + 'revel': None, + 'sift': None, + }, + 'transcripts': {}, + 'mainTranscriptId': None, + '_sort': [1000010439], +} +VARIANT2 = { + 'variantId': '1-11794419-T-G', + 'chrom': '1', + 'pos': 11794419, + 'ref': 'T', + 'alt': 'G', + 'genomeVersion': '38', + 'liftedOverGenomeVersion': '37', + 'liftedOverChrom': '1', + 'liftedOverPos': 11854476, + 'xpos': 1011794419, + 'rsid': 'rs1801131', + 'familyGuids': ['F000002_2'], + 'genotypes': { + 'I000004_hg00731': { + 'sampleId': 'HG00731', 'individualGuid': 'I000004_hg00731', 'familyGuid': 'F000002_2', + 'numAlt': 2, 'dp': 36, 'gq': 99, 'ab': 1.0, + }, + 'I000005_hg00732': { + 'sampleId': 'HG00732', 'individualGuid': 'I000005_hg00732', 'familyGuid': 'F000002_2', + 'numAlt': 0, 'dp': 33, 'gq': 40, 'ab': 0.0, + }, + 'I000006_hg00733': { + 'sampleId': 'HG00733', 'individualGuid': 'I000006_hg00733', 'familyGuid': 'F000002_2', + 'numAlt': 1, 'dp': 32, 'gq': 99, 'ab': 0.625, + }, + }, + 'genotypeFilters': '', + 'clinvar': { + 'alleleId': 18560, + 'conflictingPathogenicities': [ + {'count': 1, 'pathogenicity': 'Likely_pathogenic'}, + {'count': 1, 'pathogenicity': 'Uncertain_significance'}, + {'count': 1, 'pathogenicity': 'Likely_benign'}, + {'count': 5, 'pathogenicity': 'Benign'}, + ], + 'goldStars': 1, + 'pathogenicity': 'Conflicting_interpretations_of_pathogenicity', + 'assertions': ['other'], + 'version': '2023-07-10', + }, + 'hgmd': {'accession': 'CM981315', 'class': 'DFP'}, + 'screenRegionType': None, + 'populations': { + 'seqr': {'af': 0.31111112236976624, 'ac': 28, 'an': 90, 'hom': 4}, + 'topmed': {'af': 0.24615199863910675, 'ac': 65154, 'an': 264690, 'hom': 8775, 'het': 47604}, + 'exac': {'af': 0.29499998688697815, 'ac': 35805, 'an': 121372, 'hom': 5872, 'hemi': 0, 'het': 24061, 'filter_af': 0.4153035283088684}, + 'gnomad_exomes': {'af': 0.28899794816970825, 'ac': 72672, 'an': 251462, 'hom': 11567, 'hemi': 0, 'filter_af': 0.4116474986076355}, + 'gnomad_genomes': {'af': 0.2633855640888214, 'ac': 40003, 'an': 151880, 'hom': 5754, 'hemi': 0, 'filter_af': 0.4067690968513489}, + }, + 'predictions': { + 'cadd': 20.899999618530273, + 'eigen': 2.000999927520752, + 'fathmm': 'D', + 'gnomad_noncoding': 5.868505001068115, + 'mpc': 0.28205373883247375, + 'mut_pred': None, + 'primate_ai': 0.4655807614326477, + 'splice_ai': 0.0, + 'splice_ai_consequence': 'No consequence', + 'vest': 0.210999995470047, + 'mut_taster': 'P', + 'polyphen': 'B', + 'revel': 0.19699999690055847, + 'sift': 'T', + }, + 'transcripts': { + 'ENSG00000177000': [ + {'aminoAcids': 'E/A', 'canonical': 1, 'codons': 'gAa/gCa', 'geneId': 'ENSG00000177000', 'hgvsc': 'ENST00000376585.6:c.1409A>C', 'hgvsp': 'ENSP00000365770.1:p.Glu470Ala', 'transcriptId': 'ENST00000376585', 'isLofNagnag': None, 'transcriptRank': 0, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'missense_variant'}, + {'aminoAcids': 'E/A', 'canonical': None, 'codons': 'gAa/gCa', 'geneId': 'ENSG00000177000', 'hgvsc': 'ENST00000376583.7:c.1409A>C', 'hgvsp': 'ENSP00000365767.3:p.Glu470Ala', 'transcriptId': 'ENST00000376583', 'isLofNagnag': None, 'transcriptRank': 1, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'missense_variant'}, + {'aminoAcids': 'E/A', 'canonical': None, 'codons': 'gAa/gCa', 'geneId': 'ENSG00000177000', 'hgvsc': 'ENST00000376590.8:c.1286A>C', 'hgvsp': 'ENSP00000365775.3:p.Glu429Ala', 'transcriptId': 'ENST00000376590', 'isLofNagnag': None, 'transcriptRank': 2, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'missense_variant'}, + {'aminoAcids': 'E/A', 'canonical': None, 'codons': 'gAa/gCa', 'geneId': 'ENSG00000177000', 'hgvsc': 'ENST00000376592.6:c.1286A>C', 'hgvsp': 'ENSP00000365777.1:p.Glu429Ala', 'transcriptId': 'ENST00000376592', 'isLofNagnag': None, 'transcriptRank': 3, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'missense_variant'}, + {'aminoAcids': 'E/A', 'canonical': None, 'codons': 'gAa/gCa', 'geneId': 'ENSG00000177000', 'hgvsc': 'ENST00000423400.7:c.1406A>C', 'hgvsp': 'ENSP00000398908.3:p.Glu469Ala', 'transcriptId': 'ENST00000423400', 'isLofNagnag': None, 'transcriptRank': 4, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'missense_variant'}, + {'aminoAcids': 'E/A', 'canonical': None, 'codons': 'gAa/gCa', 'geneId': 'ENSG00000177000', 'hgvsc': 'ENST00000641407.1:c.1286A>C', 'hgvsp': 'ENSP00000493098.1:p.Glu429Ala', 'transcriptId': 'ENST00000641407', 'isLofNagnag': None, 'transcriptRank': 5, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'missense_variant'}, + {'aminoAcids': 'E/A', 'canonical': None, 'codons': 'gAa/gCa', 'geneId': 'ENSG00000177000', 'hgvsc': 'ENST00000641820.1:c.551A>C', 'hgvsp': 'ENSP00000492937.1:p.Glu184Ala', 'transcriptId': 'ENST00000641820', 'isLofNagnag': None, 'transcriptRank': 6, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'missense_variant'}, + {'aminoAcids': 'E/A', 'canonical': None, 'codons': 'gAa/gCa', 'geneId': 'ENSG00000177000', 'hgvsc': 'ENST00000641446.1:c.1286A>C', 'hgvsp': 'ENSP00000493262.1:p.Glu429Ala', 'transcriptId': 'ENST00000641446', 'isLofNagnag': None, 'transcriptRank': 7, 'biotype': 'nonsense_mediated_decay', 'lofFilters': None, 'majorConsequence': 'missense_variant'}, + {'aminoAcids': None, 'canonical': None, 'codons': None, 'geneId': 'ENSG00000177000', 'hgvsc': 'ENST00000641747.1:c.*798A>C', 'hgvsp': None, 'transcriptId': 'ENST00000641747', 'isLofNagnag': None, 'transcriptRank': 8, 'biotype': 'nonsense_mediated_decay', 'lofFilters': None, 'majorConsequence': '3_prime_UTR_variant'}, + {'aminoAcids': None, 'canonical': None, 'codons': None, 'geneId': 'ENSG00000177000', 'hgvsc': 'ENST00000641759.1:n.1655A>C', 'hgvsp': None, 'transcriptId': 'ENST00000641759', 'isLofNagnag': None, 'transcriptRank': 9, 'biotype': 'retained_intron', 'lofFilters': None, 'majorConsequence': 'non_coding_transcript_exon_variant'}, + {'aminoAcids': None, 'canonical': None, 'codons': None, 'geneId': 'ENSG00000177000', 'hgvsc': 'ENST00000641805.1:n.1803A>C', 'hgvsp': None, 'transcriptId': 'ENST00000641805', 'isLofNagnag': None, 'transcriptRank': 10, 'biotype': 'retained_intron', 'lofFilters': None, 'majorConsequence': 'non_coding_transcript_exon_variant'}, + ], + }, + 'mainTranscriptId': 'ENST00000376585', + '_sort': [1011794419], +} +VARIANT3 = { + 'variantId': '1-91502721-G-A', + 'chrom': '1', + 'pos': 91502721, + 'ref': 'G', + 'alt': 'A', + 'genomeVersion': '38', + 'liftedOverGenomeVersion': '37', + 'liftedOverChrom': '1', + 'liftedOverPos': 91968278, + 'xpos': 1091502721, + 'rsid': 'rs13447464', + 'familyGuids': ['F000002_2'], + 'genotypes': { + 'I000004_hg00731': { + 'sampleId': 'HG00731', 'individualGuid': 'I000004_hg00731', 'familyGuid': 'F000002_2', + 'numAlt': 1, 'dp': 40, 'gq': 99, 'ab': 1.0, + }, + 'I000005_hg00732': { + 'sampleId': 'HG00732', 'individualGuid': 'I000005_hg00732', 'familyGuid': 'F000002_2', + 'numAlt': 0, 'dp': 37, 'gq': 99, 'ab': 0.4594594594594595, + }, + 'I000006_hg00733': { + 'sampleId': 'HG00733', 'individualGuid': 'I000006_hg00733', 'familyGuid': 'F000002_2', + 'numAlt': 1, 'dp': 27, 'gq': 99, 'ab': 0.4074074074074074, + }, + }, + 'genotypeFilters': '', + 'clinvar': None, + 'hgmd': None, + 'screenRegionType': None, + 'populations': { + 'seqr': {'af': 0.6666666865348816, 'ac': 4, 'an': 6, 'hom': 1}, + 'topmed': {'af': 0.36268100142478943, 'ac': 95998, 'an': 264690, 'hom': 19369, 'het': 57260}, + 'exac': {'af': 0.0, 'ac': 0, 'an': 0, 'hom': 0, 'hemi': 0, 'het': 0, 'filter_af': 0.0}, + 'gnomad_exomes': {'af': 0.0, 'ac': 0, 'an': 0, 'hom': 0, 'hemi': 0, 'filter_af': 0.0}, + 'gnomad_genomes': {'af': 0.38041073083877563, 'ac': 57757, 'an': 151828, 'hom': 12204, 'hemi': 0, 'filter_af': 0.4797786474227905}, + }, + 'predictions': { + 'cadd': 2.753999948501587, + 'eigen': 1.378000020980835, + 'fathmm': None, + 'gnomad_noncoding': 0.7389647960662842, + 'mpc': None, + 'mut_pred': None, + 'primate_ai': None, + 'splice_ai': 0.009999999776482582, + 'splice_ai_consequence': 'Donor gain', + 'vest': None, + 'mut_taster': None, + 'polyphen': None, + 'revel': None, + 'sift': None, + }, + 'transcripts': { + 'ENSG00000097046': [ + {'aminoAcids': None, 'canonical': 1, 'codons': None, 'geneId': 'ENSG00000097046', 'hgvsc': 'ENST00000428239.5:c.115+890G>A', 'hgvsp': None, 'transcriptId': 'ENST00000428239', 'isLofNagnag': None, 'transcriptRank': 0, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'intron_variant'}, + {'aminoAcids': None, 'canonical': None, 'codons': None, 'geneId': 'ENSG00000097046', 'hgvsc': 'ENST00000234626.10:c.115+890G>A', 'hgvsp': None, 'transcriptId': 'ENST00000234626', 'isLofNagnag': None, 'transcriptRank': 1, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'intron_variant'}, + {'aminoAcids': None, 'canonical': None, 'codons': None, 'geneId': 'ENSG00000097046', 'hgvsc': 'ENST00000426137.1:c.115+890G>A', 'hgvsp': None, 'transcriptId': 'ENST00000426137', 'isLofNagnag': None, 'transcriptRank': 2, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'intron_variant'}, + ], + 'ENSG00000177000': [ + {'aminoAcids': None, 'canonical': None, 'codons': None, 'geneId': 'ENSG00000177000', 'hgvsc': 'ENST00000497611.1:n.501+890G>A', 'hgvsp': None, 'transcriptId': 'ENST00000497611', 'isLofNagnag': None, 'transcriptRank': 3, 'biotype': 'processed_transcript', 'lofFilters': None, 'majorConsequence': 'intron_variant'}, + ], + }, + 'mainTranscriptId': 'ENST00000428239', + '_sort': [1091502721], +} +VARIANT4 = { + 'variantId': '1-91511686-T-G', + 'chrom': '1', + 'pos': 91511686, + 'ref': 'T', + 'alt': 'G', + 'genomeVersion': '38', + 'liftedOverGenomeVersion': '37', + 'liftedOverChrom': '1', + 'liftedOverPos': 91977243, + 'xpos': 1091511686, + 'rsid': None, + 'familyGuids': ['F000002_2'], + 'genotypes': { + 'I000004_hg00731': { + 'sampleId': 'HG00731', 'individualGuid': 'I000004_hg00731', 'familyGuid': 'F000002_2', + 'numAlt': 1, 'dp': 29, 'gq': 58, 'ab': 0.1724137931034483, + }, + 'I000005_hg00732': { + 'sampleId': 'HG00732', 'individualGuid': 'I000005_hg00732', 'familyGuid': 'F000002_2', + 'numAlt': 0, 'dp': 24, 'gq': 0, 'ab': 0.0, + }, + 'I000006_hg00733': { + 'sampleId': 'HG00733', 'individualGuid': 'I000006_hg00733', 'familyGuid': 'F000002_2', + 'numAlt': 0, 'dp': 45, 'gq': 0, 'ab': 0.0, + }, + }, + 'genotypeFilters': 'VQSRTrancheSNP99.95to100.00', + 'clinvar': None, + 'hgmd': None, + 'screenRegionType': 'CTCF-only', + 'populations': { + 'seqr': {'af': 0.02222222276031971, 'ac': 2, 'an': 90, 'hom': 0}, + 'topmed': {'af': 0.0, 'ac': 0, 'an': 0, 'hom': 0, 'het': 0}, + 'exac': {'af': 0.0, 'ac': 0, 'an': 0, 'hom': 0, 'hemi': 0, 'het': 0, 'filter_af': 0.0}, + 'gnomad_exomes': {'af': 0.0, 'ac': 0, 'an': 0, 'hom': 0, 'hemi': 0, 'filter_af': 0.0}, + 'gnomad_genomes': {'af': 0.00026519427774474025, 'ac': 39, 'an': 147062, 'hom': 0, 'hemi': 0, 'filter_af': 0.0015030059730634093}, + }, + 'predictions': { + 'cadd': 29.899999618530273, + 'eigen': 9.491000175476074, + 'fathmm': 'D', + 'gnomad_noncoding': 0.2300506979227066, + 'mpc': 0.8326827883720398, + 'mut_pred': 0.6869999766349792, + 'primate_ai': 0.6995947360992432, + 'splice_ai': 0.0, + 'splice_ai_consequence': 'No consequence', + 'vest': 0.8579999804496765, + 'mut_taster': 'D', + 'polyphen': 'D', + 'revel': 0.5260000228881836, + 'sift': 'D', + }, + 'transcripts': { + 'ENSG00000097046': [ + {'aminoAcids': 'F/C', 'canonical': 1, 'codons': 'tTt/tGt', 'geneId': 'ENSG00000097046', 'hgvsc': 'ENST00000428239.5:c.425T>G', 'hgvsp': 'ENSP00000393139.1:p.Phe142Cys', 'transcriptId': 'ENST00000428239', 'isLofNagnag': None, 'transcriptRank': 0, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'missense_variant'}, + {'aminoAcids': 'F/C', 'canonical': None, 'codons': 'tTt/tGt', 'geneId': 'ENSG00000097046', 'hgvsc': 'ENST00000234626.10:c.425T>G', 'hgvsp': 'ENSP00000234626.6:p.Phe142Cys', 'transcriptId': 'ENST00000234626', 'isLofNagnag': None, 'transcriptRank': 1, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'missense_variant'}, + {'aminoAcids': 'F/C', 'canonical': None, 'codons': 'tTt/tGt', 'geneId': 'ENSG00000097046', 'hgvsc': 'ENST00000426137.1:c.425T>G', 'hgvsp': 'ENSP00000398077.1:p.Phe142Cys', 'transcriptId': 'ENST00000426137', 'isLofNagnag': None, 'transcriptRank': 2, 'biotype': 'protein_coding', 'lofFilters': None, 'majorConsequence': 'missense_variant'}, + ], + }, + 'mainTranscriptId': 'ENST00000428239', + '_sort': [1091511686], +} + +FAMILY_3_VARIANT = deepcopy(VARIANT3) +FAMILY_3_VARIANT['familyGuids'] = ['F000003_3'] +FAMILY_3_VARIANT['genotypes'] = { + 'I000007_na20870': { + 'sampleId': 'NA20870', 'individualGuid': 'I000007_na20870', 'familyGuid': 'F000003_3', + 'numAlt': 1, 'dp': 28, 'gq': 99, 'ab': 0.6785714285714286, + }, +} +MULTI_FAMILY_VARIANT = deepcopy(VARIANT3) +MULTI_FAMILY_VARIANT['familyGuids'] += FAMILY_3_VARIANT['familyGuids'] +MULTI_FAMILY_VARIANT['genotypes'].update(FAMILY_3_VARIANT['genotypes']) + +HAIL_BACKEND_VARIANTS = [VARIANT2, MULTI_FAMILY_VARIANT] +HAIL_BACKEND_SINGLE_FAMILY_VARIANTS = [VARIANT2, VARIANT3] + def get_hail_search_body(genome_version='GRCh38', num_results=100, sample_data=None, omit_sample_type=None, **search_body): sample_data = sample_data or EXPECTED_SAMPLE_DATA diff --git a/hail_search/web_app.py b/hail_search/web_app.py index 0415bd83a2..cf538cf751 100644 --- a/hail_search/web_app.py +++ b/hail_search/web_app.py @@ -1,6 +1,22 @@ from aiohttp import web +import json import hail as hl -import logging + +from hail_search.search import search_hail_backend + + +def _hl_json_default(o): + if isinstance(o, hl.Struct) or isinstance(o, hl.utils.frozendict): + return dict(o) + + +def hl_json_dumps(obj): + return json.dumps(obj, default=_hl_json_default) + + +async def search(request: web.Request) -> web.Response: + hail_results, total_results = search_hail_backend(await request.json()) + return web.json_response({'results': hail_results, 'total': total_results}, dumps=hl_json_dumps) async def status(request: web.Request) -> web.Response: @@ -8,10 +24,9 @@ async def status(request: web.Request) -> web.Response: def init_web_app(): - logging.basicConfig(level=logging.INFO) - hl.init() app = web.Application() app.add_routes([ web.get('/status', status), + web.post('/search', search), ]) return app diff --git a/seqr/utils/search/hail_search_utils_tests.py b/seqr/utils/search/hail_search_utils_tests.py index a3da7ea0f8..c37bbf2dff 100644 --- a/seqr/utils/search/hail_search_utils_tests.py +++ b/seqr/utils/search/hail_search_utils_tests.py @@ -9,9 +9,8 @@ from seqr.utils.search.utils import get_variant_query_gene_counts, query_variants, get_single_variant, \ get_variants_for_variant_ids, InvalidSearchException from seqr.utils.search.search_utils_tests import SearchTestHelper, MOCK_COUNTS -from seqr.views.utils.test_utils import PARSED_VARIANTS -from hail_search.test_utils import get_hail_search_body, EXPECTED_SAMPLE_DATA, FAMILY_1_SAMPLE_DATA, FAMILY_3_SAMPLE, \ - ALL_AFFECTED_SAMPLE_DATA, CUSTOM_AFFECTED_SAMPLE_DATA +from hail_search.test_utils import get_hail_search_body, EXPECTED_SAMPLE_DATA, FAMILY_1_SAMPLE_DATA, \ + FAMILY_2_ALL_SAMPLE_DATA, ALL_AFFECTED_SAMPLE_DATA, CUSTOM_AFFECTED_SAMPLE_DATA, HAIL_BACKEND_VARIANTS MOCK_HOST = 'http://test-hail-host' @@ -24,7 +23,7 @@ class HailSearchUtilsTests(SearchTestHelper, TestCase): def setUp(self): super(HailSearchUtilsTests, self).set_up() responses.add(responses.POST, f'{MOCK_HOST}:5000/search', status=200, json={ - 'results': PARSED_VARIANTS, 'total': 5, + 'results': HAIL_BACKEND_VARIANTS, 'total': 5, }) def _test_minimal_search_call(self, **kwargs): @@ -62,15 +61,15 @@ def _test_expected_search_call(self, search_fields=None, gene_ids=None, interval @responses.activate def test_query_variants(self): variants, total = query_variants(self.results_model, user=self.user) - self.assertListEqual(variants, PARSED_VARIANTS) + self.assertListEqual(variants, HAIL_BACKEND_VARIANTS) self.assertEqual(total, 5) - self.assert_cached_results({'all_results': PARSED_VARIANTS, 'total_results': 5}) + self.assert_cached_results({'all_results': HAIL_BACKEND_VARIANTS, 'total_results': 5}) self._test_expected_search_call() variants, _ = query_variants( self.results_model, user=self.user, sort='cadd', skip_genotype_filter=True, page=2, num_results=1, ) - self.assertListEqual(variants, PARSED_VARIANTS[1:]) + self.assertListEqual(variants, HAIL_BACKEND_VARIANTS[1:]) self._test_expected_search_call(sort='cadd', num_results=2) self.search_model.search['locus'] = {'rawVariantItems': '1-248367227-TC-T,2-103343353-GAGA-G'} @@ -172,7 +171,7 @@ def test_get_variant_query_gene_counts(self): @responses.activate def test_get_single_variant(self): variant = get_single_variant(self.families, '2-103343353-GAGA-G', user=self.user) - self.assertDictEqual(variant, PARSED_VARIANTS[0]) + self.assertDictEqual(variant, HAIL_BACKEND_VARIANTS[0]) self._test_minimal_search_call( variant_ids=[['2', 103343353, 'GAGA', 'G']], variant_keys=[], num_results=1, sample_data=ALL_AFFECTED_SAMPLE_DATA, omit_sample_type='SV_WES') @@ -183,16 +182,16 @@ def test_get_single_variant(self): num_results=1, sample_data=EXPECTED_SAMPLE_DATA, omit_sample_type='VARIANTS') with self.assertRaises(InvalidSearchException) as cm: - get_single_variant(self.families, '2-103343353-GAGA-G', user=self.user, return_all_queried_families=True) + get_single_variant(self.families, '1-91502721-G-A', user=self.user, return_all_queried_families=True) self.assertEqual( str(cm.exception), - 'Unable to return all families for the following variants: 1-248367227-TC-T (F000002_2; F000005_5), 2-103343353-GAGA-G (F000005_5)', + 'Unable to return all families for the following variants: 1-11794419-T-G (F000003_3; F000005_5), 1-91502721-G-A (F000005_5)', ) - get_single_variant(self.families.filter(guid='F000003_3'), '2-103343353-GAGA-G', user=self.user, return_all_queried_families=True) + get_single_variant(self.families.filter(guid='F000002_2'), '2-103343353-GAGA-G', user=self.user, return_all_queried_families=True) self._test_minimal_search_call( variant_ids=[['2', 103343353, 'GAGA', 'G']], variant_keys=[], - num_results=1, sample_data={'VARIANTS': [FAMILY_3_SAMPLE]}) + num_results=1, sample_data=FAMILY_2_ALL_SAMPLE_DATA) responses.add(responses.POST, f'{MOCK_HOST}:5000/search', status=200, json={'results': [], 'total': 0}) with self.assertRaises(InvalidSearchException) as cm: diff --git a/seqr/views/apis/variant_search_api_tests.py b/seqr/views/apis/variant_search_api_tests.py index 05b1c20314..feed023d44 100644 --- a/seqr/views/apis/variant_search_api_tests.py +++ b/seqr/views/apis/variant_search_api_tests.py @@ -6,6 +6,7 @@ from django.urls.base import reverse from elasticsearch.exceptions import ConnectionTimeout, TransportError +from hail_search.test_utils import HAIL_BACKEND_SINGLE_FAMILY_VARIANTS from seqr.models import VariantSearchResults, LocusList, Project, VariantSearch from seqr.utils.search.utils import InvalidSearchException from seqr.utils.search.elasticsearch.es_utils import InvalidIndexException @@ -23,7 +24,7 @@ SEARCH = {'filters': {}, 'inheritance': None} PROJECT_FAMILIES = [{'projectGuid': PROJECT_GUID, 'familyGuids': ['F000001_1', 'F000002_2']}] -VARIANTS_WITH_DISCOVERY_TAGS = deepcopy(VARIANTS) +VARIANTS_WITH_DISCOVERY_TAGS = deepcopy(VARIANTS + HAIL_BACKEND_SINGLE_FAMILY_VARIANTS) DISCOVERY_TAGS = [{ 'savedVariant': { 'variantGuid': 'SV0000006_1248367227_r0003_tes', @@ -74,7 +75,7 @@ ] EXPECTED_SEARCH_RESPONSE = { - 'searchedVariants': VARIANTS, + 'searchedVariants': VARIANTS + HAIL_BACKEND_SINGLE_FAMILY_VARIANTS, 'savedVariantsByGuid': { 'SV0000001_2103343353_r0390_100': expected_detail_saved_variant, 'SV0000002_1248367227_r0390_100': EXPECTED_SAVED_VARIANT, @@ -84,7 +85,7 @@ 'search': { 'search': SEARCH, 'projectFamilies': [{'projectGuid': PROJECT_GUID, 'familyGuids': mock.ANY}], - 'totalResults': 3, + 'totalResults': 5, }, 'variantTagsByGuid': { 'VT1708633_2103343353_r0390_100': EXPECTED_TAG, 'VT1726945_2103343353_r0390_100': EXPECTED_TAG, @@ -131,7 +132,7 @@ def _get_es_variants(results_model, **kwargs): results_model.save() - return deepcopy(VARIANTS), len(VARIANTS) + return deepcopy(VARIANTS + HAIL_BACKEND_SINGLE_FAMILY_VARIANTS), len(VARIANTS + HAIL_BACKEND_SINGLE_FAMILY_VARIANTS) def _get_empty_es_variants(results_model, **kwargs): @@ -373,16 +374,26 @@ def test_query_variants(self, mock_get_variants, mock_get_gene_counts, mock_erro 'gnomad_exomes_freq', 'topmed_freq', 'cadd', 'revel', 'eigen', 'splice_ai', 'polyphen', 'sift', 'muttaster', 'fathmm', 'rsid', 'hgvsc', 'hgvsp', 'clinvar_clinical_significance', 'clinvar_gold_stars', 'filter', 'family_id_1', 'tags_1', 'notes_1', 'family_id_2', 'tags_2', 'notes_2', 'sample_1', 'num_alt_alleles_1', 'gq_1', 'ab_1', - 'sample_2', 'num_alt_alleles_2', 'gq_2', 'ab_2'], + 'sample_2', 'num_alt_alleles_2', 'gq_2', 'ab_2', 'sample_3', 'num_alt_alleles_3', 'gq_3', 'ab_3'], ['21', '3343400', 'GAGA', 'G', 'WASH7P', 'missense_variant', '0.13', '', '0.007', '', '', '', '', '', '', '', '', '', '', '', 'ENST00000623083.3:c.1075G>A', 'ENSP00000485442.1:p.Gly359Ser', '', '', '', '1', 'Tier 1 - Novel gene and phenotype (None)|Review (None)', '', '2', '', '', 'NA19675', '1', '46.0', - '0.702127659574', 'NA19679', '0', '99.0', '0.0'], + '0.702127659574', 'NA19679', '0', '99.0', '0.0', '', '', '', ''], ['3', '835', 'AAAG', 'A', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', - '1', '', '', '', '', '', 'NA19679', '0', '99.0', '0.0', '', '', '', ''], + '1', '', '', '', '', '', 'NA19679', '0', '99.0', '0.0', '', '', '', '', '', '', '', ''], ['12', '48367227', 'TC', 'T', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '2', 'Known gene for phenotype (None)|Excluded (None)', 'a later note (None)|test n\xf8te (None)', '', '', '', '', '', '', - '', '', '', '', '']] + '', '', '', '', '', '', '', '', ''], + ['1', '11794419', 'T', 'G', '', 'missense_variant', '', '0.29499998688697815', '0.2633855640888214', + '0.28899794816970825', '0.24615199863910675', '20.899999618530273', '0.19699999690055847', + '2.000999927520752', '0.0', '', 'tolerated', '', 'damaging', 'rs1801131', 'ENST00000376585.6:c.1409A>C', + 'ENSP00000365770.1:p.Glu470Ala', '', '1', '', '2', '', '', '', '', '', 'HG00731', '2', '99', '1.0', + 'HG00732', '0', '40', '0.0', 'HG00733', '1', '99', '0.625'], + ['1', '91502721', 'G', 'A', '', 'intron_variant', '', '0.0', '0.38041073083877563', '0.0', + '0.36268100142478943', '2.753999948501587', '', '1.378000020980835', '0.009999999776482582', '', '', '', + '', 'rs13447464', 'ENST00000428239.5:c.115+890G>A', '', '', '', '', '2', '', '', '', '', '', 'HG00731', + '1', '99', '1.0', 'HG00732', '0', '99', '0.4594594594594595', 'HG00733', '1', '99', '0.4074074074074074'], + ] self.assertEqual(response.content, ('\n'.join(['\t'.join(line) for line in expected_content])+'\n').encode('utf-8')) # test export with max families @@ -393,17 +404,31 @@ def test_query_variants(self, mock_get_variants, mock_get_gene_counts, mock_erro ['chrom', 'pos', 'ref', 'alt', 'gene', 'worst_consequence', 'callset_freq', 'exac_freq', 'gnomad_genomes_freq', 'gnomad_exomes_freq', 'topmed_freq', 'cadd', 'revel', 'eigen', 'splice_ai', 'polyphen', 'sift', 'muttaster', 'fathmm', 'rsid', 'hgvsc', 'hgvsp', 'clinvar_clinical_significance', 'clinvar_gold_stars', 'filter', 'family_id_1', - 'tags_1', 'notes_1', 'sample_1', 'num_alt_alleles_1', 'gq_1', 'ab_1',], + 'tags_1', 'notes_1', 'sample_1', 'num_alt_alleles_1', 'gq_1', 'ab_1', 'sample_2', 'num_alt_alleles_2', + 'gq_2', 'ab_2', 'sample_3', 'num_alt_alleles_3', 'gq_3', 'ab_3'], ['21', '3343400', 'GAGA', 'G', 'WASH7P', 'missense_variant', '0.13', '', '0.007', '', '', '', '', '', '', '', '', '', '', '', 'ENST00000623083.3:c.1075G>A', 'ENSP00000485442.1:p.Gly359Ser', '', '', '', '1', - 'Tier 1 - Novel gene and phenotype (None)|Review (None)', '', 'NA19675', '1', '46.0', '0.702127659574',], + 'Tier 1 - Novel gene and phenotype (None)|Review (None)', '', 'NA19675', '1', '46.0', '0.702127659574', + '', '', '', '', '', '', '', '',], ['21', '3343400', 'GAGA', 'G', 'WASH7P', 'missense_variant', '0.13', '', '0.007', '', '', '', '', '', '', '', '', '', '', '', 'ENST00000623083.3:c.1075G>A', 'ENSP00000485442.1:p.Gly359Ser', '', '', '', '2', '', '', - 'NA19679', '0', '99.0', '0.0'], + 'NA19679', '0', '99.0', '0.0', '', '', '', '', '', '', '', '',], ['3', '835', 'AAAG', 'A', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', - '1', '', '', 'NA19679', '0', '99.0', '0.0',], + '1', '', '', 'NA19679', '0', '99.0', '0.0', '', '', '', '', '', '', '', '',], ['12', '48367227', 'TC', 'T', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', - '', '2', 'Known gene for phenotype (None)|Excluded (None)', 'a later note (None)|test n\xf8te (None)', '', '', '', '',]] + '', '2', 'Known gene for phenotype (None)|Excluded (None)', 'a later note (None)|test n\xf8te (None)', + '', '', '', '', '', '', '', '', '', '', '', '',], + ['1', '11794419', 'T', 'G', '', 'missense_variant', '', '0.29499998688697815', '0.2633855640888214', + '0.28899794816970825', '0.24615199863910675', '20.899999618530273', '0.19699999690055847', + '2.000999927520752', '0.0', '', 'tolerated', '', 'damaging', 'rs1801131', 'ENST00000376585.6:c.1409A>C', + 'ENSP00000365770.1:p.Glu470Ala', '', '1', '', '2', '', '', 'HG00731', '2', '99', '1.0', + 'HG00732', '0', '40', '0.0', 'HG00733', '1', '99', '0.625'], + ['1', '91502721', 'G', 'A', '', 'intron_variant', '', '0.0', '0.38041073083877563', '0.0', + '0.36268100142478943', '2.753999948501587', '', '1.378000020980835', '0.009999999776482582', '', '', + '', '', 'rs13447464', 'ENST00000428239.5:c.115+890G>A', '', '', '', '', '2', '', '', 'HG00731', + '1', '99', '1.0', 'HG00732', '0', '99', '0.4594594594594595', 'HG00733', '1', '99', + '0.4074074074074074'], + ] self.assertEqual(response.content, ('\n'.join(['\t'.join(line) for line in expected_content]) + '\n').encode('utf-8')) @@ -497,7 +522,7 @@ def _get_variants(results_model, **kwargs): results_model.save() self.assertSetEqual(expected_searched_families, {f.guid for f in results_model.families.all()}) matched_variants = [ - deepcopy(variant) for variant in VARIANTS + deepcopy(variant) for variant in VARIANTS + HAIL_BACKEND_SINGLE_FAMILY_VARIANTS if any(family_guid in expected_searched_families for family_guid in variant['familyGuids']) ] return matched_variants, len(matched_variants) @@ -573,7 +598,7 @@ def test_query_all_project_families_variants(self, mock_get_variants): self.assertDictEqual(response_json['search'], { 'search': SEARCH, 'projectFamilies': [{'projectGuid': 'R0003_test', 'familyGuids': mock.ANY}], - 'totalResults': 3, + 'totalResults': 5, }) self.assertSetEqual( {'F000011_11', 'F000012_12'}, set(response_json['search']['projectFamilies'][0]['familyGuids']))