diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index 32b4c657ee..1072def237 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -1061,12 +1061,23 @@ def gene_counts(self): ht.gene_ids, hl.struct(total=hl.agg.count(), families=hl.agg.counter(ht.families)) )) - def lookup_variant(self, variant_id, sample_data=None): - self._parse_intervals(intervals=None, variant_ids=[variant_id], variant_keys=[variant_id]) + def lookup_variants(self, variant_ids, annotation_fields=None): + self._parse_intervals(intervals=None, variant_ids=variant_ids, variant_keys=variant_ids) ht = self._read_table('annotations.ht', drop_globals=['paths', 'versions']) self._load_table_kwargs['_n_partitions'] = 1 ht = ht.filter(hl.is_defined(ht[XPOS])) + if not annotation_fields: + annotation_fields = { + k: v for k, v in self.annotation_fields(include_genotype_overrides=False).items() + if k not in {FAMILY_GUID_FIELD, GENOTYPES_FIELD, 'genotypeFilters'} + } + + formatted = self._format_results(ht.key_by(), annotation_fields=annotation_fields, include_genotype_overrides=False) + + return formatted.aggregate(hl.agg.take(formatted.row, len(variant_ids))) + + def lookup_variant(self, variant_id, sample_data=None): annotation_fields = self.annotation_fields(include_genotype_overrides=False) entry_annotations = {k: annotation_fields[k] for k in [FAMILY_GUID_FIELD, GENOTYPES_FIELD]} annotation_fields.update({ @@ -1075,9 +1086,7 @@ def lookup_variant(self, variant_id, sample_data=None): 'genotypeFilters': lambda ht: hl.str(''), }) - formatted = self._format_results(ht.key_by(), annotation_fields=annotation_fields, include_genotype_overrides=False) - - variants = formatted.aggregate(hl.agg.take(formatted.row, 1)) + variants = self.lookup_variants([variant_id], annotation_fields=annotation_fields) if not variants: raise HTTPNotFound() variant = dict(variants[0]) diff --git a/hail_search/search.py b/hail_search/search.py index 6af6d72606..8207ce7f09 100644 --- a/hail_search/search.py +++ b/hail_search/search.py @@ -27,6 +27,11 @@ def lookup_variant(request): return query.lookup_variant(request['variant_id'], sample_data=request.get('sample_data')) +def lookup_variants(request): + query = QUERY_CLASS_MAP[(request['data_type'], request['genome_version'])](sample_data=None) + return query.lookup_variants(request['variant_ids']) + + def load_globals(): for cls in QUERY_CLASS_MAP.values(): cls.load_globals() diff --git a/hail_search/test_search.py b/hail_search/test_search.py index 4c20cfc556..c0c9d25da8 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -155,6 +155,8 @@ 'numAlt': 1, 'dp': 28, 'gq': 99, 'ab': 0.5, } +NO_GENOTYPE_GCNV_VARIANT = {**GCNV_VARIANT4, 'numExon': 8, 'end': 38736268} + # Ensures no variants are filtered out by annotation/path filters for compound hets COMP_HET_ALL_PASS_FILTERS = { 'annotations': {'splice_ai': '0.0', 'structural': ['DEL', 'CPX', 'INS', 'gCNV_DEL', 'gCNV_DUP']}, @@ -694,19 +696,43 @@ async def test_variant_lookup(self): self.assertEqual(resp.status, 200) resp_json = await resp.json() self.assertDictEqual(resp_json, { - **GCNV_VARIANT4, 'numExon': 8, 'end': 38736268, 'familyGuids': [], 'genotypes': {}, 'genotypeFilters': '', + **NO_GENOTYPE_GCNV_VARIANT, 'familyGuids': [], 'genotypes': {}, 'genotypeFilters': '', }) async with self.client.request('POST', '/lookup', json={**body, 'sample_data': EXPECTED_SAMPLE_DATA['SV_WES']}) as resp: self.assertEqual(resp.status, 200) resp_json = await resp.json() self.assertDictEqual(resp_json, { - **GCNV_VARIANT4, 'numExon': 8, 'end': 38736268, 'genotypes': { + **NO_GENOTYPE_GCNV_VARIANT, 'genotypes': { individual: {k: v for k, v in genotype.items() if k not in {'start', 'end', 'numExon', 'geneIds'}} for individual, genotype in GCNV_VARIANT4['genotypes'].items() } }) + async def test_multi_variant_lookup(self): + await self._test_multi_lookup(VARIANT_ID_SEARCH['variant_ids'], 'SNV_INDEL', [VARIANT1]) + + await self._test_multi_lookup([['7', 143270172, 'A', 'G']], 'SNV_INDEL', [GRCH37_VARIANT], genome_version='GRCh37') + + await self._test_multi_lookup([['M', 4429, 'G', 'A'], ['M', 14783, 'T', 'C']], 'MITO', [MITO_VARIANT1, MITO_VARIANT3]) + + await self._test_multi_lookup( + ['cohort_2911.chr1.final_cleanup_INS_chr1_160', 'phase2_DEL_chr14_4640'], + 'SV_WGS', [SV_VARIANT2, SV_VARIANT4], + ) + + await self._test_multi_lookup(['suffix_140608_DUP'], 'SV_WES', [NO_GENOTYPE_GCNV_VARIANT]) + + async def _test_multi_lookup(self, variant_ids, data_type, results, genome_version='GRCh38'): + body = {'genome_version': genome_version, 'data_type': data_type, 'variant_ids': variant_ids} + async with self.client.request('POST', '/multi_lookup', json=body) as resp: + self.assertEqual(resp.status, 200) + resp_json = await resp.json() + self.assertDictEqual(resp_json, {'results': [ + {k: v for k, v in variant.items() if k not in {'familyGuids', 'genotypes', 'genotypeFilters'}} + for variant in results + ]}) + async def test_frequency_filter(self): sv_callset_filter = {'sv_callset': {'af': 0.05}} await self._assert_expected_search( diff --git a/hail_search/web_app.py b/hail_search/web_app.py index 8ccfe429a9..a4ef765c04 100644 --- a/hail_search/web_app.py +++ b/hail_search/web_app.py @@ -9,7 +9,7 @@ import traceback from typing import Callable -from hail_search.search import search_hail_backend, load_globals, lookup_variant +from hail_search.search import search_hail_backend, load_globals, lookup_variant, lookup_variants logger = logging.getLogger(__name__) @@ -64,6 +64,11 @@ async def lookup(request: web.Request) -> web.Response: return web.json_response(result, dumps=hl_json_dumps) +async def multi_lookup(request: web.Request) -> web.Response: + result = await sync_to_async_hail_query(request, lookup_variants) + return web.json_response({'results': result}, dumps=hl_json_dumps) + + async def status(request: web.Request) -> web.Response: return web.json_response({'success': True}) @@ -84,6 +89,7 @@ async def init_web_app(): web.post('/search', search), web.post('/gene_counts', gene_counts), web.post('/lookup', lookup), + web.post('/multi_lookup', multi_lookup), ]) # The idea here is to run the hail queries off the main thread so that the # event loop stays live and the /status check is responsive. We only diff --git a/matchmaker/views/matchmaker_api_tests.py b/matchmaker/views/matchmaker_api_tests.py index 7b898c3c7f..149d0cde77 100644 --- a/matchmaker/views/matchmaker_api_tests.py +++ b/matchmaker/views/matchmaker_api_tests.py @@ -306,7 +306,7 @@ def test_search_individual_mme_matches(self, mock_email, mock_logger, mock_slack 'start': 248367227, 'referenceBases': 'TC', 'alternateBases': 'T', - 'assembly': 'GRCh37', + 'assembly': 'GRCh38', }, 'zygosity': 1, }], @@ -332,7 +332,7 @@ def test_search_individual_mme_matches(self, mock_email, mock_logger, mock_slack 'ref': 'TC', 'alt': 'T', 'end': None, - 'genomeVersion': 'GRCh37', + 'genomeVersion': 'GRCh38', } } ], diff --git a/seqr/fixtures/1kg_project.json b/seqr/fixtures/1kg_project.json index 7032d4f55f..e3a01d8fa6 100644 --- a/seqr/fixtures/1kg_project.json +++ b/seqr/fixtures/1kg_project.json @@ -1884,7 +1884,6 @@ "genotypeFilters": "pass", "mainTranscriptId": "ENST00000505820", "populations": {"callset": {"ac": null, "an": null, "af": null}, "g1k": {"ac": null, "an": null, "af": 0.0}, "gnomad_genomes": {"hemi": null, "ac": null, "an": null, "hom": null, "af": 0.0}, "gnomad_exomes": {"hemi": null, "ac": null, "an": null, "hom": null, "af": 8.142526788913136e-06}, "exac": {"hemi": null, "ac": null, "an": null, "hom": null, "af": 0.0}, "topmed": {"ac": null, "an": null, "af": null}}, - "genomeVersion": "37", "pos": 1560662, "predictions": {"eigen": null, "revel": null, "sift": "damaging", "cadd": "31", "metasvm": "D", "mpc": null, "splice_ai": null, "phastcons_100_vert": null, "mut_taster": "disease_causing", "fathmm": "tolerated", "polyphen": "probably_damaging", "dann": null, "primate_ai": null, "gerp_rs": null}, "hgmd": {"accession": null, "class": null}, @@ -1970,7 +1969,7 @@ "hom": 10332 } }, - "genomeVersion": "37", + "genomeVersion": "38", "pos": 1562437, "hgmd": {"accession": null, "class": null}, "rsid": null, @@ -2085,7 +2084,7 @@ "alt": "T", "variant_id": "12-48367227-TC-T", "saved_variant_json": { - "liftedOverGenomeVersion": "38", "liftedOverPos": "", "genomeVersion": "37", "pos": 248367227, + "liftedOverGenomeVersion": "37", "liftedOverPos": "", "genomeVersion": "38", "pos": 248367227, "transcripts": {}, "chrom": "1", "genotypes": { "I000018_na21234": {"sampleId": "NA20885", "ab": 0.0, "gq": 99.0, "numAlt": 1}}}, "family": 14 diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py index ad87d71850..2a4c0d71a9 100644 --- a/seqr/management/commands/check_for_new_samples_from_pipeline.py +++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py @@ -1,20 +1,26 @@ from collections import defaultdict from django.contrib.postgres.aggregates import ArrayAgg from django.core.management.base import BaseCommand, CommandError +from django.db.models import Q +from django.db.models.functions import JSONObject import json import logging from reference_data.models import GENOME_VERSION_LOOKUP -from seqr.models import Family, Sample, Project +from seqr.models import Family, Sample, SavedVariant from seqr.utils.file_utils import file_iter, does_file_exist from seqr.utils.search.add_data_utils import notify_search_data_loaded +from seqr.utils.search.utils import parse_valid_variant_id +from seqr.utils.search.hail_search_utils import hail_variant_multi_lookup from seqr.views.utils.dataset_utils import match_and_update_search_samples -from seqr.views.utils.variant_utils import reset_cached_search_results, update_projects_saved_variant_json +from seqr.views.utils.variant_utils import reset_cached_search_results, update_projects_saved_variant_json, \ + saved_variants_dataset_type_filter logger = logging.getLogger(__name__) GS_PATH_TEMPLATE = 'gs://seqr-hail-search-data/v03/{path}/runs/{version}/' DATASET_TYPE_MAP = {'GCNV': Sample.DATASET_TYPE_SV_CALLS} +USER_EMAIL = 'manage_command' class Command(BaseCommand): @@ -83,20 +89,77 @@ def handle(self, *args, **options): reset_cached_search_results(project=None) # Send loading notifications + update_sample_data_by_project = { + s['individual__family__project']: s for s in updated_samples.values('individual__family__project').annotate( + samples=ArrayAgg(JSONObject(sample_id='sample_id', individual_id='individual_id')), + family_guids=ArrayAgg('individual__family__guid', distinct=True), + ) + } + updated_project_families = [] + updated_families = set() for project, sample_ids in samples_by_project.items(): - project_updated_samples = updated_samples.filter(individual__family__project=project) + project_sample_data = update_sample_data_by_project[project.id] notify_search_data_loaded( project, dataset_type, sample_type, inactivated_sample_guids, - updated_samples=project_updated_samples, num_samples=len(sample_ids), + updated_samples=project_sample_data['samples'], num_samples=len(sample_ids), ) + project_families = project_sample_data['family_guids'] + updated_families.update(project_families) + updated_project_families.append((project.id, project.name, project_families)) # Reload saved variant JSON - updated_annotation_samples = Sample.objects.filter(is_active=True, dataset_type=dataset_type) - if dataset_type == Sample.DATASET_TYPE_SV_CALLS: - updated_annotation_samples = updated_annotation_samples.filter(sample_type=sample_type) - projects = Project.objects.filter( - genome_version=genome_version.replace('GRCh', ''), family__individual__sample__in=updated_annotation_samples, - ).annotate(families=ArrayAgg('family__family_id', distinct=True)).values_list('id', 'name', 'families') - update_projects_saved_variant_json(projects, user_email='manage_command', dataset_type=dataset_type) + updated_variants_by_id = update_projects_saved_variant_json( + updated_project_families, user_email=USER_EMAIL, dataset_type=dataset_type) + self._reload_shared_variant_annotations( + updated_variants_by_id, updated_families, dataset_type, sample_type, genome_version) logger.info('DONE') + + @staticmethod + def _reload_shared_variant_annotations(updated_variants_by_id, updated_families, dataset_type, sample_type, genome_version): + data_type = dataset_type + is_sv = dataset_type == Sample.DATASET_TYPE_SV_CALLS + db_genome_version = genome_version.replace('GRCh', '') + updated_annotation_samples = Sample.objects.filter( + is_active=True, dataset_type=dataset_type, + individual__family__project__genome_version=db_genome_version, + ).exclude(individual__family__guid__in=updated_families) + if is_sv: + updated_annotation_samples = updated_annotation_samples.filter(sample_type=sample_type) + data_type = f'{dataset_type}_{sample_type}' + + variant_models = SavedVariant.objects.filter( + family_id__in=updated_annotation_samples.values_list('individual__family', flat=True).distinct(), + **saved_variants_dataset_type_filter(dataset_type), + ).filter(Q(saved_variant_json__genomeVersion__isnull=True) | Q(saved_variant_json__genomeVersion=db_genome_version)) + + if not variant_models: + logger.info('No additional saved variants to update') + return + + variants_by_id = defaultdict(list) + for v in variant_models: + variants_by_id[v.variant_id].append(v) + + logger.info(f'Reloading shared annotations for {len(variant_models)} saved variants ({len(variants_by_id)} unique)') + + updated_variants_by_id = { + variant_id: {k: v for k, v in variant.items() if k not in {'familyGuids', 'genotypes', 'genotypeFilters'}} + for variant_id, variant in updated_variants_by_id.items() + } + fetch_variant_ids = set(variants_by_id.keys()) - set(updated_variants_by_id.keys()) + if fetch_variant_ids: + if not is_sv: + fetch_variant_ids = [parse_valid_variant_id(variant_id) for variant_id in fetch_variant_ids] + updated_variants = hail_variant_multi_lookup(USER_EMAIL, sorted(fetch_variant_ids), data_type, genome_version) + logger.info(f'Fetched {len(updated_variants)} additional variants') + updated_variants_by_id.update({variant['variantId']: variant for variant in updated_variants}) + + updated_variant_models = [] + for variant_id, variant in updated_variants_by_id.items(): + for variant_model in variants_by_id[variant_id]: + variant_model.saved_variant_json.update(variant) + updated_variant_models.append(variant_model) + + SavedVariant.objects.bulk_update(updated_variant_models, ['saved_variant_json']) + logger.info(f'Updated {len(updated_variant_models)} saved variants') diff --git a/seqr/management/commands/lift_project_to_hg38.py b/seqr/management/commands/lift_project_to_hg38.py index 3f98c63616..ae182f42ca 100644 --- a/seqr/management/commands/lift_project_to_hg38.py +++ b/seqr/management/commands/lift_project_to_hg38.py @@ -77,7 +77,7 @@ def handle(self, *args, **options): len(es_variants), len(missing_variants) + len(lift_failed), missing_family_count)) def _get_variants_to_lift(saved_variants): - saved_variants_to_lift = [v for v in saved_variants if v['genomeVersion'] != GENOME_VERSION_GRCh38] + saved_variants_to_lift = [v for v in saved_variants if v.get('genomeVersion') != GENOME_VERSION_GRCh38] num_already_lifted = len(saved_variants) - len(saved_variants_to_lift) if num_already_lifted: if input('Found {} saved variants already on Hg38. Continue with liftover (y/n)? '.format( diff --git a/seqr/management/commands/reload_saved_variant_json.py b/seqr/management/commands/reload_saved_variant_json.py index e884079d2d..ccb8ff82d3 100644 --- a/seqr/management/commands/reload_saved_variant_json.py +++ b/seqr/management/commands/reload_saved_variant_json.py @@ -13,12 +13,12 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('projects', nargs="*", help='Project(s) to transfer. If not specified, defaults to all projects.') - parser.add_argument('--family-id', help='optional family to reload variants for') + parser.add_argument('--family-guid', help='optional family to reload variants for') def handle(self, *args, **options): """transfer project""" projects_to_process = options['projects'] - family_id = options['family_id'] + family_guid = options['family_guid'] if projects_to_process: projects = Project.objects.filter(Q(name__in=projects_to_process) | Q(guid__in=projects_to_process)) @@ -27,7 +27,7 @@ def handle(self, *args, **options): projects = Project.objects.all() logging.info("Processing all %s projects" % len(projects)) - family_ids = [family_id] if family_id else None + family_ids = [family_guid] if family_guid else None project_list = [(*project, family_ids) for project in projects.values_list('id', 'name')] update_projects_saved_variant_json(project_list, user_email='manage_command') logger.info("Done") diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index 2f68a63fb2..4cb3e15983 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -76,7 +76,7 @@ def setUp(self): self.addCleanup(patcher.stop) super().setUp() - def _test_success(self, path, metadata, dataset_type, sample_guids, reload_calls, additional_requests=0, num_projects=1): + def _test_success(self, path, metadata, dataset_type, sample_guids, reload_calls, reload_annotations_logs, has_additional_requests=False): self.mock_subprocess.return_value.stdout = [json.dumps(metadata).encode()] self.mock_subprocess.return_value.wait.return_value = 0 @@ -89,7 +89,8 @@ def _test_success(self, path, metadata, dataset_type, sample_guids, reload_calls self.mock_logger.info.assert_has_calls([ mock.call(f'Loading new samples from {path}: auto__2023-08-08'), - mock.call(f'Loading {len(sample_guids)} WES {dataset_type} samples in {num_projects} projects'), + mock.call(f'Loading {len(sample_guids)} WES {dataset_type} samples in 2 projects'), + ] + [mock.call(log) for log in reload_annotations_logs] + [ mock.call('DONE'), ]) self.mock_logger.warining.assert_not_called() @@ -101,9 +102,9 @@ def _test_success(self, path, metadata, dataset_type, sample_guids, reload_calls ]) # Test reload saved variants - self.assertEqual(len(responses.calls), len(reload_calls) + additional_requests) + self.assertEqual(len(responses.calls), len(reload_calls) + (2 if has_additional_requests else 0)) for i, call in enumerate(reload_calls): - resp = responses.calls[i+additional_requests] + resp = responses.calls[i+(1 if has_additional_requests else 0)] self.assertEqual(resp.request.url, f'{MOCK_HAIL_HOST}:5000/search') self.assertEqual(resp.request.headers.get('From'), 'manage_command') self.assertDictEqual(json.loads(resp.request.body), call) @@ -133,6 +134,9 @@ def test_command(self, mock_email, mock_airtable_utils): 'results': [{'variantId': '12-48367227-TC-T', 'familyGuids': ['F000014_14'], 'updated_field': 'updated_value'}], 'total': 1, }) + responses.add(responses.POST, f'{MOCK_HAIL_HOST}:5000/multi_lookup', status=200, json={ + 'results': [{'variantId': '1-46859832-G-A', 'updated_new_field': 'updated_value', 'rsid': 'rs123'}], + }) # Test errors with self.assertRaises(CommandError) as ce: @@ -170,9 +174,11 @@ def test_command(self, mock_email, mock_airtable_utils): str(ce.exception), 'Data has genome version GRCh38 but the following projects have conflicting versions: R0003_test (GRCh37)') - project = Project.objects.get(guid=PROJECT_GUID) - project.genome_version = 38 - project.save() + # Update fixture data to allow testing edge cases + Project.objects.filter(id__in=[1, 3]).update(genome_version=38) + sv = SavedVariant.objects.get(guid='SV0000002_1248367227_r0390_100') + sv.saved_variant_json['genomeVersion'] = '38' + sv.save() with self.assertRaises(ValueError) as ce: call_command('check_for_new_samples_from_pipeline', 'GRCh38/SNV_INDEL', 'auto__2023-08-08') @@ -186,9 +192,9 @@ def test_command(self, mock_email, mock_airtable_utils): search_body = { 'genome_version': 'GRCh38', 'num_results': 1, 'variant_ids': [['12', 48367227, 'TC', 'T']], 'variant_keys': [], } - self._test_success('GRCh38/SNV_INDEL', metadata, dataset_type='SNV_INDEL', num_projects=2, sample_guids={ + self._test_success('GRCh38/SNV_INDEL', metadata, dataset_type='SNV_INDEL', sample_guids={ EXISTING_SAMPLE_GUID, REPLACED_SAMPLE_GUID, NEW_SAMPLE_GUID_P3, NEW_SAMPLE_GUID_P4, - }, additional_requests=1, reload_calls=[ + }, has_additional_requests=True, reload_calls=[ {**search_body, 'sample_data': {'SNV_INDEL': [ {'individual_guid': 'I000017_na20889', 'family_guid': 'F000012_12', 'project_guid': 'R0003_test', 'affected': 'A', 'sample_id': 'NA20889'}, {'individual_guid': 'I000016_na20888', 'family_guid': 'F000012_12', 'project_guid': 'R0003_test', 'affected': 'A', 'sample_id': 'NA20888'}, @@ -196,6 +202,8 @@ def test_command(self, mock_email, mock_airtable_utils): {**search_body, 'sample_data': {'SNV_INDEL': [ {'individual_guid': 'I000018_na21234', 'family_guid': 'F000014_14', 'project_guid': 'R0004_non_analyst_project', 'affected': 'A', 'sample_id': 'NA21234'}, ]}}, + ], reload_annotations_logs=[ + 'Reloading shared annotations for 3 saved variants (3 unique)', 'Fetched 1 additional variants', 'Updated 2 saved variants', ]) old_data_sample_guid = 'S000143_na20885' @@ -238,9 +246,34 @@ def test_command(self, mock_email, mock_airtable_utils): self.assertEqual(Family.objects.get(guid='F000014_14').analysis_status, 'Rncc') # Test SavedVariant model updated + multi_lookup_request = responses.calls[3].request + self.assertEqual(multi_lookup_request.url, f'{MOCK_HAIL_HOST}:5000/multi_lookup') + self.assertEqual(multi_lookup_request.headers.get('From'), 'manage_command') + self.assertDictEqual(json.loads(multi_lookup_request.body), { + 'genome_version': 'GRCh38', + 'data_type': 'SNV_INDEL', + 'variant_ids': [['1', 1562437, 'G', 'C'], ['1', 46859832, 'G', 'A']], + }) + updated_variants = SavedVariant.objects.filter(saved_variant_json__updated_field='updated_value') - self.assertEqual(len(updated_variants), 1) - self.assertEqual(updated_variants.first().guid, 'SV0000006_1248367227_r0004_non') + self.assertEqual(len(updated_variants), 2) + self.assertSetEqual( + {v.guid for v in updated_variants}, + {'SV0000006_1248367227_r0004_non', 'SV0000002_1248367227_r0390_100'} + ) + reloaded_variant = next(v for v in updated_variants if v.guid == 'SV0000006_1248367227_r0004_non') + annotation_updated_variant = next(v for v in updated_variants if v.guid == 'SV0000002_1248367227_r0390_100') + self.assertEqual(len(reloaded_variant.saved_variant_json), 3) + self.assertListEqual(reloaded_variant.saved_variant_json['familyGuids'], ['F000014_14']) + self.assertEqual(len(annotation_updated_variant.saved_variant_json), 18) + self.assertListEqual(annotation_updated_variant.saved_variant_json['familyGuids'], ['F000001_1']) + + annotation_updated_json = SavedVariant.objects.get(guid='SV0059956_11560662_f019313_1').saved_variant_json + self.assertEqual(len(annotation_updated_json), 18) + self.assertEqual(annotation_updated_json['updated_new_field'], 'updated_value') + self.assertEqual(annotation_updated_json['rsid'], 'rs123') + self.assertEqual(annotation_updated_json['mainTranscriptId'], 'ENST00000505820') + self.assertEqual(len(annotation_updated_json['genotypes']), 3) self.mock_utils_logger.error.assert_not_called() self.mock_utils_logger.info.assert_has_calls([ @@ -307,16 +340,20 @@ def test_gcnv_command(self): metadata = { 'callsets': ['1kg.vcf.gz'], 'sample_type': 'WES', - 'family_samples': {'F000012_12': ['NA20889']}, + 'family_samples': {'F000004_4': ['NA20872'], 'F000012_12': ['NA20889']}, } - self._test_success('GRCh37/GCNV', metadata, dataset_type='SV', sample_guids={f'S{GUID_ID}_NA20889'}, reload_calls=[{ + self._test_success('GRCh37/GCNV', metadata, dataset_type='SV', sample_guids={f'S{GUID_ID}_NA20872', f'S{GUID_ID}_NA20889'}, reload_calls=[{ 'genome_version': 'GRCh37', 'num_results': 1, 'variant_ids': [], 'variant_keys': ['prefix_19107_DEL'], 'sample_data': {'SV_WES': [{'individual_guid': 'I000017_na20889', 'family_guid': 'F000012_12', 'project_guid': 'R0003_test', 'affected': 'A', 'sample_id': 'NA20889'}]}, - }]) + }], reload_annotations_logs=['No additional saved variants to update']) - self.mock_send_slack.assert_called_once_with('seqr-data-loading', - f'1 new WES SV samples are loaded in {SEQR_URL}project/{PROJECT_GUID}/project_page\n```NA20889```', - ) + self.mock_send_slack.assert_has_calls([ + mock.call( + 'seqr-data-loading', f'1 new WES SV samples are loaded in {SEQR_URL}project/R0001_1kg/project_page\n```NA20872```', + ), mock.call( + 'seqr-data-loading', f'1 new WES SV samples are loaded in {SEQR_URL}project/{PROJECT_GUID}/project_page\n```NA20889```', + ), + ]) self.mock_utils_logger.error.assert_called_with('Error in project Test Reprocessed Project: Bad Request') self.mock_utils_logger.info.assert_has_calls([ diff --git a/seqr/management/tests/lift_project_to_hg38_tests.py b/seqr/management/tests/lift_project_to_hg38_tests.py index 0342bd0fac..b3e1c1e833 100644 --- a/seqr/management/tests/lift_project_to_hg38_tests.py +++ b/seqr/management/tests/lift_project_to_hg38_tests.py @@ -59,11 +59,11 @@ def test_command(self, mock_liftover, mock_get_es_variants, mock_input, mock_get calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), mock.call('Validating es index test_new_index'), - mock.call('Lifting over 4 variants (skipping 0 that are already lifted)'), + mock.call('Lifting over 3 variants (skipping 1 that are already lifted)'), mock.call('Successfully lifted over 3 variants'), mock.call('Successfully updated 3 variants'), mock.call('---Done---'), - mock.call('Succesfully lifted over 3 variants. Skipped 3 failed variants. Family data not updated for 0 variants') + mock.call('Succesfully lifted over 3 variants. Skipped 2 failed variants. Family data not updated for 0 variants') ] mock_logger.info.assert_has_calls(calls) @@ -72,7 +72,6 @@ def test_command(self, mock_liftover, mock_get_es_variants, mock_input, mock_get calls = [ mock.call('chr21', 3343353), - mock.call('chr1', 1562437), mock.call('chr1', 248367227), mock.call('chr1', 1560662), ] @@ -80,7 +79,7 @@ def test_command(self, mock_liftover, mock_get_es_variants, mock_input, mock_get families = {family for family in Family.objects.filter(pk__in = [1, 2])} self.assertSetEqual(families, mock_get_es_variants.call_args.args[0]) - self.assertSetEqual({'1-1627057-G-C', '21-3343400-GAGA-G', '1-248203925-TC-T', '1-46394160-G-A'}, + self.assertSetEqual({'21-3343400-GAGA-G', '1-248203925-TC-T', '1-46394160-G-A'}, set(mock_get_es_variants.call_args.args[1])) # Test discontinue on lifted variants @@ -90,7 +89,7 @@ def test_command(self, mock_liftover, mock_get_es_variants, mock_input, mock_get call_command('lift_project_to_hg38', '--project={}'.format(PROJECT_NAME), '--es-index={}'.format(ELASTICSEARCH_INDEX)) - self.assertEqual(str(ce.exception), 'Error: found 1 saved variants already on Hg38') + self.assertEqual(str(ce.exception), 'Error: found 2 saved variants already on Hg38') calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), @@ -177,12 +176,19 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, call_command('lift_project_to_hg38', '--project={}'.format(PROJECT_NAME), '--es-index={}'.format(ELASTICSEARCH_INDEX)) - self.assertEqual(str(ce.exception), 'Error: unable to lift over 4 variants') + self.assertEqual(str(ce.exception), 'Error: found 1 saved variants already on Hg38') + + mock_input.side_effect = ['y', 'n'] + with self.assertRaises(CommandError) as ce: + call_command('lift_project_to_hg38', '--project={}'.format(PROJECT_NAME), + '--es-index={}'.format(ELASTICSEARCH_INDEX)) + + self.assertEqual(str(ce.exception), 'Error: unable to lift over 3 variants') calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), mock.call('Validating es index test_new_index'), - mock.call('Lifting over 4 variants (skipping 0 that are already lifted)') + mock.call('Lifting over 3 variants (skipping 1 that are already lifted)') ] mock_logger.info.assert_has_calls(calls) @@ -190,16 +196,17 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, mock_get_es_variants.return_value = VARIANTS mock_liftover_to_38.convert_coordinate.side_effect = mock_convert_coordinate mock_logger.reset_mock() + mock_input.side_effect = ['y', 'n', 'n'] with self.assertRaises(CommandError) as ce: call_command('lift_project_to_hg38', '--project={}'.format(PROJECT_NAME), '--es-index={}'.format(ELASTICSEARCH_INDEX)) - self.assertEqual(str(ce.exception), 'Error: unable to find 3 lifted-over variants') + self.assertEqual(str(ce.exception), 'Error: unable to find 2 lifted-over variants') calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), mock.call('Validating es index test_new_index'), - mock.call('Lifting over 4 variants (skipping 0 that are already lifted)') + mock.call('Lifting over 3 variants (skipping 1 that are already lifted)') ] mock_logger.info.assert_has_calls(calls) @@ -207,7 +214,7 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, self.assertSetEqual(mock_get_es_variants.call_args.args[0], families) self.assertSetEqual( set(mock_get_es_variants.call_args.args[1]), - {'1-1627057-G-C', '21-3343400-GAGA-G', '1-248203925-TC-T', '1-46394160-G-A'} + {'21-3343400-GAGA-G', '1-248203925-TC-T', '1-46394160-G-A'} ) # Test discontinue on missing family data while updating the saved variants @@ -215,7 +222,7 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, variants.append(SINGLE_VARIANT) mock_get_es_variants.return_value = variants mock_logger.reset_mock() - mock_input.side_effect = ['y', 'n'] + mock_input.side_effect = ['y', 'y', 'n'] with self.assertRaises(CommandError) as ce: call_command('lift_project_to_hg38', '--project={}'.format(PROJECT_NAME), '--es-index={}'.format(ELASTICSEARCH_INDEX)) @@ -225,7 +232,7 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), mock.call('Validating es index test_new_index'), - mock.call('Lifting over 4 variants (skipping 0 that are already lifted)'), + mock.call('Lifting over 3 variants (skipping 1 that are already lifted)'), mock.call('Successfully lifted over 4 variants') ] mock_logger.info.assert_has_calls(calls) @@ -240,11 +247,11 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), mock.call('Validating es index test_new_index'), - mock.call('Lifting over 3 variants (skipping 1 that are already lifted)'), + mock.call('Lifting over 2 variants (skipping 2 that are already lifted)'), mock.call('Successfully lifted over 4 variants'), mock.call('Successfully updated 4 variants'), mock.call('---Done---'), - mock.call('Succesfully lifted over 4 variants. Skipped 2 failed variants. Family data not updated for 1 variants') + mock.call('Succesfully lifted over 4 variants. Skipped 1 failed variants. Family data not updated for 1 variants') ] mock_logger.info.assert_has_calls(calls) diff --git a/seqr/management/tests/reload_saved_variant_json_tests.py b/seqr/management/tests/reload_saved_variant_json_tests.py index f540a9dc44..36b5b81cd2 100644 --- a/seqr/management/tests/reload_saved_variant_json_tests.py +++ b/seqr/management/tests/reload_saved_variant_json_tests.py @@ -7,7 +7,7 @@ PROJECT_NAME = '1kg project n\u00e5me with uni\u00e7\u00f8de' PROJECT_GUID = 'R0001_1kg' -FAMILY_ID = '1' +FAMILY_GUID = 'F000001_1' class ReloadSavedVariantJsonTest(TestCase): @@ -23,7 +23,7 @@ def test_with_param_command(self, mock_get_variants, mock_logger): # Test with a specific project and a family id. call_command('reload_saved_variant_json', PROJECT_NAME, - '--family-id={}'.format(FAMILY_ID)) + '--family-guid={}'.format(FAMILY_GUID)) family_1 = Family.objects.get(id=1) mock_get_variants.assert_called_with( @@ -70,7 +70,7 @@ def test_with_param_command(self, mock_get_variants, mock_logger): mock_get_variants.side_effect = Exception("Database error.") call_command('reload_saved_variant_json', PROJECT_GUID, - '--family-id={}'.format(FAMILY_ID)) + '--family-guid={}'.format(FAMILY_GUID)) mock_get_variants.assert_called_with([family_1], ['1-1562437-G-C', '1-46859832-G-A', '21-3343353-GAGA-G'], user=None, user_email='manage_command') diff --git a/seqr/utils/search/add_data_utils.py b/seqr/utils/search/add_data_utils.py index 654f4e5c6f..91366a6c74 100644 --- a/seqr/utils/search/add_data_utils.py +++ b/seqr/utils/search/add_data_utils.py @@ -43,7 +43,8 @@ def add_new_es_search_samples(request_json, project, user, notify=False, expecte if notify: num_samples = len(sample_ids) - num_skipped - notify_search_data_loaded(project, dataset_type, sample_type, inactivated_sample_guids, updated_samples, num_samples) + updated_sample_data = updated_samples.values('sample_id', 'individual_id') + notify_search_data_loaded(project, dataset_type, sample_type, inactivated_sample_guids, updated_sample_data, num_samples) return inactivated_sample_guids, updated_family_guids, updated_samples @@ -52,7 +53,7 @@ def notify_search_data_loaded(project, dataset_type, sample_type, inactivated_sa is_internal = not project_has_anvil(project) or is_internal_anvil_project(project) previous_loaded_individuals = set(Sample.objects.filter(guid__in=inactivated_sample_guids).values_list('individual_id', flat=True)) - new_sample_ids = [sample.sample_id for sample in updated_samples if sample.individual_id not in previous_loaded_individuals] + new_sample_ids = [sample['sample_id'] for sample in updated_samples if sample['individual_id'] not in previous_loaded_individuals] url = f'{BASE_URL}project/{project.guid}/project_page' msg_dataset_type = '' if dataset_type == Sample.DATASET_TYPE_VARIANT_CALLS else f' {dataset_type}' diff --git a/seqr/utils/search/hail_search_utils.py b/seqr/utils/search/hail_search_utils.py index 7eb1c53cda..b144c8e5b8 100644 --- a/seqr/utils/search/hail_search_utils.py +++ b/seqr/utils/search/hail_search_utils.py @@ -110,6 +110,12 @@ def hail_variant_lookup(user, variant_id, samples=None, dataset_type=Sample.DATA return variants +def hail_variant_multi_lookup(user_email, variant_ids, data_type, genome_version): + body = {'genome_version': genome_version, 'data_type': data_type, 'variant_ids': variant_ids} + response_json = _execute_search(body, user=None, user_email=user_email, path='multi_lookup') + return response_json['results'] + + def _format_search_body(samples, genome_version, num_results, search): search_body = { 'genome_version': GENOME_VERSION_LOOKUP[genome_version], diff --git a/seqr/utils/search/utils.py b/seqr/utils/search/utils.py index c1f0e6d4b4..69c302fd16 100644 --- a/seqr/utils/search/utils.py +++ b/seqr/utils/search/utils.py @@ -333,15 +333,19 @@ def _parse_variant_items(search_json): def _parse_variant_id(variant_id): try: - chrom, pos, ref, alt = variant_id.split('-') - chrom = format_chrom(chrom) - pos = int(pos) - get_xpos(chrom, pos) - return chrom, pos, ref, alt + return parse_valid_variant_id(variant_id) except (KeyError, ValueError): return None +def parse_valid_variant_id(variant_id): + chrom, pos, ref, alt = variant_id.split('-') + chrom = format_chrom(chrom) + pos = int(pos) + get_xpos(chrom, pos) + return chrom, pos, ref, alt + + def _validate_sort(sort, families): if sort == PRIORITIZED_GENE_SORT and len(families) > 1: raise InvalidSearchException('Phenotype sort is only supported for single-family search.') diff --git a/seqr/views/apis/summary_data_api_tests.py b/seqr/views/apis/summary_data_api_tests.py index 18c72f6fd3..6e55fc315c 100644 --- a/seqr/views/apis/summary_data_api_tests.py +++ b/seqr/views/apis/summary_data_api_tests.py @@ -151,7 +151,7 @@ 'end-1': None, 'ref-1': 'TC', 'zygosity-1': 'Heterozygous', - 'variant_reference_assembly-1': 'GRCh37', + 'variant_reference_assembly-1': 'GRCh38', 'allele_balance_or_heteroplasmy_percentage-1': None, 'gene-1': None, 'gene_id-1': None, diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index 02f17ff70a..3daba835ca 100644 --- a/seqr/views/utils/variant_utils.py +++ b/seqr/views/utils/variant_utils.py @@ -33,16 +33,18 @@ def update_projects_saved_variant_json(projects, user_email, **kwargs): success = {} skipped = {} error = {} + updated_variants_by_id = {} logger.info(f'Reloading saved variants in {len(projects)} projects') - for project_id, project_name, family_ids in tqdm(projects, unit=' project'): + for project_id, project_name, family_guids in tqdm(projects, unit=' project'): try: - updated_saved_variant_guids = update_project_saved_variant_json( - project_id, user_email=user_email, family_ids=family_ids, **kwargs) - if updated_saved_variant_guids is None: + updated_saved_variants = update_project_saved_variant_json( + project_id, user_email=user_email, family_guids=family_guids, **kwargs) + if updated_saved_variants is None: skipped[project_name] = True else: - success[project_name] = len(updated_saved_variant_guids) - logger.info(f'Updated {len(updated_saved_variant_guids)} variants for project {project_name}') + success[project_name] = len(updated_saved_variants) + logger.info(f'Updated {len(updated_saved_variants)} variants for project {project_name}') + updated_variants_by_id.update({v.variant_id: v.saved_variant_json for v in updated_saved_variants.values()}) except Exception as e: traceback_message = traceback.format_exc() logger.error(traceback_message) @@ -59,20 +61,16 @@ def update_projects_saved_variant_json(projects, user_email, **kwargs): logger.info(f'{len(error)} failed projects') for k, v in error.items(): logger.info(f' {k}: {v}') + return updated_variants_by_id -def update_project_saved_variant_json(project_id, family_ids=None, dataset_type=None, user=None, user_email=None): +def update_project_saved_variant_json(project_id, family_guids=None, dataset_type=None, user=None, user_email=None): saved_variants = SavedVariant.objects.filter(family__project_id=project_id).select_related('family') - if family_ids: - saved_variants = saved_variants.filter(family__family_id__in=family_ids) + if family_guids: + saved_variants = saved_variants.filter(family__guid__in=family_guids) if dataset_type: - xpos_filter_key = 'xpos__gte' if dataset_type == Sample.DATASET_TYPE_MITO_CALLS else 'xpos__lt' - dataset_filter = { - 'alt__isnull': dataset_type == Sample.DATASET_TYPE_SV_CALLS, - xpos_filter_key: get_xpos('M', 1), - } - saved_variants = saved_variants.filter(**dataset_filter) + saved_variants = saved_variants.filter(**saved_variants_dataset_type_filter(dataset_type)) if not saved_variants: return None @@ -91,15 +89,23 @@ def update_project_saved_variant_json(project_id, family_ids=None, dataset_type= for sub_var_ids in [variant_ids[i:i+MAX_VARIANTS_FETCH] for i in range(0, len(variant_ids), MAX_VARIANTS_FETCH)]: variants_json += get_variants_for_variant_ids(families, sub_var_ids, user=user, user_email=user_email) - updated_saved_variant_guids = [] + updated_saved_variants = {} for var in variants_json: for family_guid in var['familyGuids']: saved_variant = saved_variants_map.get((var['variantId'], family_guid)) if saved_variant: update_model_from_json(saved_variant, {'saved_variant_json': var}, user) - updated_saved_variant_guids.append(saved_variant.guid) + updated_saved_variants[saved_variant.guid] = saved_variant + + return updated_saved_variants + - return updated_saved_variant_guids +def saved_variants_dataset_type_filter(dataset_type): + xpos_filter_key = 'xpos__gte' if dataset_type == Sample.DATASET_TYPE_MITO_CALLS else 'xpos__lt' + return { + 'alt__isnull': dataset_type == Sample.DATASET_TYPE_SV_CALLS, + xpos_filter_key: get_xpos('M', 1), + } def parse_saved_variant_json(variant_json, family):