From fc9af3c8b253d8b71f1d0772c66c46571afb28d3 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 4 Mar 2024 17:30:31 -0500 Subject: [PATCH 01/12] add multi lookup endpoint --- hail_search/queries/base.py | 19 ++++++++++++++----- hail_search/search.py | 5 +++++ hail_search/test_search.py | 30 ++++++++++++++++++++++++++++-- hail_search/web_app.py | 8 +++++++- 4 files changed, 54 insertions(+), 8 deletions(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index 9fcfc79683..5b30214853 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -1045,11 +1045,22 @@ def gene_counts(self): ht.gene_ids, hl.struct(total=hl.agg.count(), families=hl.agg.counter(ht.families)) )) - def lookup_variant(self, variant_id, sample_data=None): - self._parse_intervals(intervals=None, variant_ids=[variant_id], variant_keys=[variant_id]) + def lookup_variants(self, variant_ids, annotation_fields=None): + self._parse_intervals(intervals=None, variant_ids=variant_ids, variant_keys=variant_ids) ht = self._read_table('annotations.ht', drop_globals=['paths', 'versions']) ht = ht.filter(hl.is_defined(ht[XPOS])) + if not annotation_fields: + annotation_fields = { + k: v for k, v in self.annotation_fields(include_genotype_overrides=False).items() + if k not in {FAMILY_GUID_FIELD, GENOTYPES_FIELD, 'genotypeFilters'} + } + + formatted = self._format_results(ht.key_by(), annotation_fields=annotation_fields, include_genotype_overrides=False) + + return formatted.aggregate(hl.agg.take(formatted.row, len(variant_ids))) + + def lookup_variant(self, variant_id, sample_data=None): annotation_fields = self.annotation_fields(include_genotype_overrides=False) entry_annotations = {k: annotation_fields[k] for k in [FAMILY_GUID_FIELD, GENOTYPES_FIELD]} annotation_fields.update({ @@ -1058,9 +1069,7 @@ def lookup_variant(self, variant_id, sample_data=None): 'genotypeFilters': lambda ht: hl.str(''), }) - formatted = self._format_results(ht.key_by(), annotation_fields=annotation_fields, include_genotype_overrides=False) - - variants = formatted.aggregate(hl.agg.take(formatted.row, 1)) + variants = self.lookup_variants([variant_id], annotation_fields=annotation_fields) if not variants: raise HTTPNotFound() variant = dict(variants[0]) diff --git a/hail_search/search.py b/hail_search/search.py index 6af6d72606..8207ce7f09 100644 --- a/hail_search/search.py +++ b/hail_search/search.py @@ -27,6 +27,11 @@ def lookup_variant(request): return query.lookup_variant(request['variant_id'], sample_data=request.get('sample_data')) +def lookup_variants(request): + query = QUERY_CLASS_MAP[(request['data_type'], request['genome_version'])](sample_data=None) + return query.lookup_variants(request['variant_ids']) + + def load_globals(): for cls in QUERY_CLASS_MAP.values(): cls.load_globals() diff --git a/hail_search/test_search.py b/hail_search/test_search.py index 4c20cfc556..c0c9d25da8 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -155,6 +155,8 @@ 'numAlt': 1, 'dp': 28, 'gq': 99, 'ab': 0.5, } +NO_GENOTYPE_GCNV_VARIANT = {**GCNV_VARIANT4, 'numExon': 8, 'end': 38736268} + # Ensures no variants are filtered out by annotation/path filters for compound hets COMP_HET_ALL_PASS_FILTERS = { 'annotations': {'splice_ai': '0.0', 'structural': ['DEL', 'CPX', 'INS', 'gCNV_DEL', 'gCNV_DUP']}, @@ -694,19 +696,43 @@ async def test_variant_lookup(self): self.assertEqual(resp.status, 200) resp_json = await resp.json() self.assertDictEqual(resp_json, { - **GCNV_VARIANT4, 'numExon': 8, 'end': 38736268, 
'familyGuids': [], 'genotypes': {}, 'genotypeFilters': '', + **NO_GENOTYPE_GCNV_VARIANT, 'familyGuids': [], 'genotypes': {}, 'genotypeFilters': '', }) async with self.client.request('POST', '/lookup', json={**body, 'sample_data': EXPECTED_SAMPLE_DATA['SV_WES']}) as resp: self.assertEqual(resp.status, 200) resp_json = await resp.json() self.assertDictEqual(resp_json, { - **GCNV_VARIANT4, 'numExon': 8, 'end': 38736268, 'genotypes': { + **NO_GENOTYPE_GCNV_VARIANT, 'genotypes': { individual: {k: v for k, v in genotype.items() if k not in {'start', 'end', 'numExon', 'geneIds'}} for individual, genotype in GCNV_VARIANT4['genotypes'].items() } }) + async def test_multi_variant_lookup(self): + await self._test_multi_lookup(VARIANT_ID_SEARCH['variant_ids'], 'SNV_INDEL', [VARIANT1]) + + await self._test_multi_lookup([['7', 143270172, 'A', 'G']], 'SNV_INDEL', [GRCH37_VARIANT], genome_version='GRCh37') + + await self._test_multi_lookup([['M', 4429, 'G', 'A'], ['M', 14783, 'T', 'C']], 'MITO', [MITO_VARIANT1, MITO_VARIANT3]) + + await self._test_multi_lookup( + ['cohort_2911.chr1.final_cleanup_INS_chr1_160', 'phase2_DEL_chr14_4640'], + 'SV_WGS', [SV_VARIANT2, SV_VARIANT4], + ) + + await self._test_multi_lookup(['suffix_140608_DUP'], 'SV_WES', [NO_GENOTYPE_GCNV_VARIANT]) + + async def _test_multi_lookup(self, variant_ids, data_type, results, genome_version='GRCh38'): + body = {'genome_version': genome_version, 'data_type': data_type, 'variant_ids': variant_ids} + async with self.client.request('POST', '/multi_lookup', json=body) as resp: + self.assertEqual(resp.status, 200) + resp_json = await resp.json() + self.assertDictEqual(resp_json, {'results': [ + {k: v for k, v in variant.items() if k not in {'familyGuids', 'genotypes', 'genotypeFilters'}} + for variant in results + ]}) + async def test_frequency_filter(self): sv_callset_filter = {'sv_callset': {'af': 0.05}} await self._assert_expected_search( diff --git a/hail_search/web_app.py b/hail_search/web_app.py index 8ccfe429a9..a4ef765c04 100644 --- a/hail_search/web_app.py +++ b/hail_search/web_app.py @@ -9,7 +9,7 @@ import traceback from typing import Callable -from hail_search.search import search_hail_backend, load_globals, lookup_variant +from hail_search.search import search_hail_backend, load_globals, lookup_variant, lookup_variants logger = logging.getLogger(__name__) @@ -64,6 +64,11 @@ async def lookup(request: web.Request) -> web.Response: return web.json_response(result, dumps=hl_json_dumps) +async def multi_lookup(request: web.Request) -> web.Response: + result = await sync_to_async_hail_query(request, lookup_variants) + return web.json_response({'results': result}, dumps=hl_json_dumps) + + async def status(request: web.Request) -> web.Response: return web.json_response({'success': True}) @@ -84,6 +89,7 @@ async def init_web_app(): web.post('/search', search), web.post('/gene_counts', gene_counts), web.post('/lookup', lookup), + web.post('/multi_lookup', multi_lookup), ]) # The idea here is to run the hail queries off the main thread so that the # event loop stays live and the /status check is responsive. 
We only
From 0085d75169210a5958e3f27c031b108381f71a6b Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Wed, 6 Mar 2024 11:37:35 -0500
Subject: [PATCH 02/12] add multi lookup helper func

---
 .../commands/check_for_new_samples_from_pipeline.py | 1 +
 seqr/utils/search/hail_search_utils.py              | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py
index ad87d71850..f0cd703ccf 100644
--- a/seqr/management/commands/check_for_new_samples_from_pipeline.py
+++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py
@@ -8,6 +8,7 @@ from seqr.models import Family, Sample, Project
 from seqr.utils.file_utils import file_iter, does_file_exist
 from seqr.utils.search.add_data_utils import notify_search_data_loaded
+from seqr.utils.search.hail_search_utils import hail_variant_multi_lookup
 from seqr.views.utils.dataset_utils import match_and_update_search_samples
 from seqr.views.utils.variant_utils import reset_cached_search_results, update_projects_saved_variant_json
diff --git a/seqr/utils/search/hail_search_utils.py b/seqr/utils/search/hail_search_utils.py
index 7eb1c53cda..b144c8e5b8 100644
--- a/seqr/utils/search/hail_search_utils.py
+++ b/seqr/utils/search/hail_search_utils.py
@@ -110,6 +110,12 @@ def hail_variant_lookup(user, variant_id, samples=None, dataset_type=Sample.DATA
     return variants
 
 
+def hail_variant_multi_lookup(user_email, variant_ids, data_type, genome_version):
+    body = {'genome_version': genome_version, 'data_type': data_type, 'variant_ids': variant_ids}
+    response_json = _execute_search(body, user=None, user_email=user_email, path='multi_lookup')
+    return response_json['results']
+
+
 def _format_search_body(samples, genome_version, num_results, search):
     search_body = {
         'genome_version': GENOME_VERSION_LOOKUP[genome_version],

From 0e9d4be8a48abe6e074206c58f0d1d62a510a751 Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Wed, 6 Mar 2024 12:30:57 -0500
Subject: [PATCH 03/12] update annotation for additional projects and families

---
 .../check_for_new_samples_from_pipeline.py | 63 ++++++++++++++---
 .../commands/reload_saved_variant_json.py  |  6 +-
 seqr/utils/search/add_data_utils.py        |  5 +-
 seqr/views/utils/variant_utils.py          | 25 ++++----
 4 files changed, 73 insertions(+), 26 deletions(-)

diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py
index f0cd703ccf..921b4dc7d7 100644
--- a/seqr/management/commands/check_for_new_samples_from_pipeline.py
+++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py
@@ -1,21 +1,24 @@
 from collections import defaultdict
 from django.contrib.postgres.aggregates import ArrayAgg
 from django.core.management.base import BaseCommand, CommandError
+from django.db.models.functions import JSONObject
 import json
 import logging
 
 from reference_data.models import GENOME_VERSION_LOOKUP
-from seqr.models import Family, Sample, Project
+from seqr.models import Family, Sample, SavedVariant
 from seqr.utils.file_utils import file_iter, does_file_exist
 from seqr.utils.search.add_data_utils import notify_search_data_loaded
 from seqr.utils.search.hail_search_utils import hail_variant_multi_lookup
 from seqr.views.utils.dataset_utils import match_and_update_search_samples
-from seqr.views.utils.variant_utils import reset_cached_search_results, update_projects_saved_variant_json
+from seqr.views.utils.variant_utils import reset_cached_search_results, 
update_projects_saved_variant_json, \ + saved_variants_dataset_type_filter logger = logging.getLogger(__name__) GS_PATH_TEMPLATE = 'gs://seqr-hail-search-data/v03/{path}/runs/{version}/' DATASET_TYPE_MAP = {'GCNV': Sample.DATASET_TYPE_SV_CALLS} +USER_EMAIL = 'manage_command' class Command(BaseCommand): @@ -84,20 +87,60 @@ def handle(self, *args, **options): reset_cached_search_results(project=None) # Send loading notifications + update_sample_data_by_project = { + s['individual__family__project']: s for s in updated_samples.values('individual__family__project').annotate( + samples=ArrayAgg(JSONObject(sample_id='sample_id', individual_id='individual_id')), + family_guids=ArrayAgg('individual__family__guid', distinct=True), + ) + } + updated_project_families = [] + updated_families = set() for project, sample_ids in samples_by_project.items(): - project_updated_samples = updated_samples.filter(individual__family__project=project) + project_sample_data = update_sample_data_by_project[project.id] notify_search_data_loaded( project, dataset_type, sample_type, inactivated_sample_guids, - updated_samples=project_updated_samples, num_samples=len(sample_ids), + updated_samples=project_sample_data['samples'], num_samples=len(sample_ids), ) + project_families = project_sample_data['family_guids'] + updated_families.update(project_families) + updated_project_families.append((project.id, project.name, project_families)) # Reload saved variant JSON - updated_annotation_samples = Sample.objects.filter(is_active=True, dataset_type=dataset_type) + update_projects_saved_variant_json(updated_project_families, user_email=USER_EMAIL, dataset_type=dataset_type) + self._reload_shared_variant_annotations(updated_families, dataset_type, sample_type, genome_version) + + logger.info('DONE') + + def _reload_shared_variant_annotations(self, updated_families, dataset_type, sample_type, genome_version): + data_type = dataset_type + updated_annotation_samples = Sample.objects.filter( + is_active=True, dataset_type=dataset_type, + individual__family__project__genome_version=genome_version.replace('GRCh', ''), + ).exclude(individual__family__guid__in=updated_families) if dataset_type == Sample.DATASET_TYPE_SV_CALLS: updated_annotation_samples = updated_annotation_samples.filter(sample_type=sample_type) - projects = Project.objects.filter( - genome_version=genome_version.replace('GRCh', ''), family__individual__sample__in=updated_annotation_samples, - ).annotate(families=ArrayAgg('family__family_id', distinct=True)).values_list('id', 'name', 'families') - update_projects_saved_variant_json(projects, user_email='manage_command', dataset_type=dataset_type) + data_type = f'{dataset_type}_{sample_type}' - logger.info('DONE') + variant_models = SavedVariant.objects.filter( + family_id__in=updated_annotation_samples.values_list('individual__family', flat=True).distinct(), + **saved_variants_dataset_type_filter(dataset_type), + ) + if not variant_models: + logger.info('No additional saved variants to update') + return + + variants_by_id = defaultdict(list) + for v in variant_models: + variants_by_id[v.variant_id].append(v) + + logger.info(f'Reloading shared annotations for {len(variant_models)} saved variants ({len(variants_by_id)} unique)') + updated_variants = hail_variant_multi_lookup(USER_EMAIL, variants_by_id.keys(), data_type, genome_version) + logger.info(f'Fetched {len(updated_variants)} variants') + updated_variant_models = [] + for variant in updated_variants: + for variant_model in variants_by_id[variant['variantId']]: 
+ variant_model.saved_variant_json.update(variant) + updated_variant_models.append(variant_model) + + SavedVariant.objects.bulk_update(updated_variant_models, ['saved_variant_json']) + logger.info(f'Updated {len(updated_variant_models)} saved variants') diff --git a/seqr/management/commands/reload_saved_variant_json.py b/seqr/management/commands/reload_saved_variant_json.py index e884079d2d..2659fd37b3 100644 --- a/seqr/management/commands/reload_saved_variant_json.py +++ b/seqr/management/commands/reload_saved_variant_json.py @@ -13,12 +13,12 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('projects', nargs="*", help='Project(s) to transfer. If not specified, defaults to all projects.') - parser.add_argument('--family-id', help='optional family to reload variants for') + parser.add_argument('--family-guid', help='optional family to reload variants for') def handle(self, *args, **options): """transfer project""" projects_to_process = options['projects'] - family_id = options['family_id'] + family_guid = options['family_guid'] if projects_to_process: projects = Project.objects.filter(Q(name__in=projects_to_process) | Q(guid__in=projects_to_process)) @@ -27,7 +27,7 @@ def handle(self, *args, **options): projects = Project.objects.all() logging.info("Processing all %s projects" % len(projects)) - family_ids = [family_id] if family_id else None + family_ids = [family_guid] if family_id else None project_list = [(*project, family_ids) for project in projects.values_list('id', 'name')] update_projects_saved_variant_json(project_list, user_email='manage_command') logger.info("Done") diff --git a/seqr/utils/search/add_data_utils.py b/seqr/utils/search/add_data_utils.py index 654f4e5c6f..91366a6c74 100644 --- a/seqr/utils/search/add_data_utils.py +++ b/seqr/utils/search/add_data_utils.py @@ -43,7 +43,8 @@ def add_new_es_search_samples(request_json, project, user, notify=False, expecte if notify: num_samples = len(sample_ids) - num_skipped - notify_search_data_loaded(project, dataset_type, sample_type, inactivated_sample_guids, updated_samples, num_samples) + updated_sample_data = updated_samples.values('sample_id', 'individual_id') + notify_search_data_loaded(project, dataset_type, sample_type, inactivated_sample_guids, updated_sample_data, num_samples) return inactivated_sample_guids, updated_family_guids, updated_samples @@ -52,7 +53,7 @@ def notify_search_data_loaded(project, dataset_type, sample_type, inactivated_sa is_internal = not project_has_anvil(project) or is_internal_anvil_project(project) previous_loaded_individuals = set(Sample.objects.filter(guid__in=inactivated_sample_guids).values_list('individual_id', flat=True)) - new_sample_ids = [sample.sample_id for sample in updated_samples if sample.individual_id not in previous_loaded_individuals] + new_sample_ids = [sample['sample_id'] for sample in updated_samples if sample['individual_id'] not in previous_loaded_individuals] url = f'{BASE_URL}project/{project.guid}/project_page' msg_dataset_type = '' if dataset_type == Sample.DATASET_TYPE_VARIANT_CALLS else f' {dataset_type}' diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index 02f17ff70a..ff955ae632 100644 --- a/seqr/views/utils/variant_utils.py +++ b/seqr/views/utils/variant_utils.py @@ -34,10 +34,10 @@ def update_projects_saved_variant_json(projects, user_email, **kwargs): skipped = {} error = {} logger.info(f'Reloading saved variants in {len(projects)} projects') - for project_id, project_name, family_ids in 
tqdm(projects, unit=' project'): + for project_id, project_name, family_guids in tqdm(projects, unit=' project'): try: updated_saved_variant_guids = update_project_saved_variant_json( - project_id, user_email=user_email, family_ids=family_ids, **kwargs) + project_id, user_email=user_email, family_guids=family_guids, **kwargs) if updated_saved_variant_guids is None: skipped[project_name] = True else: @@ -61,18 +61,13 @@ def update_projects_saved_variant_json(projects, user_email, **kwargs): logger.info(f' {k}: {v}') -def update_project_saved_variant_json(project_id, family_ids=None, dataset_type=None, user=None, user_email=None): +def update_project_saved_variant_json(project_id, family_guids=None, dataset_type=None, user=None, user_email=None): saved_variants = SavedVariant.objects.filter(family__project_id=project_id).select_related('family') - if family_ids: - saved_variants = saved_variants.filter(family__family_id__in=family_ids) + if family_guids: + saved_variants = saved_variants.filter(family__guid__in=family_guids) if dataset_type: - xpos_filter_key = 'xpos__gte' if dataset_type == Sample.DATASET_TYPE_MITO_CALLS else 'xpos__lt' - dataset_filter = { - 'alt__isnull': dataset_type == Sample.DATASET_TYPE_SV_CALLS, - xpos_filter_key: get_xpos('M', 1), - } - saved_variants = saved_variants.filter(**dataset_filter) + saved_variants = saved_variants.filter(**saved_variants_dataset_type_filter(dataset_type)) if not saved_variants: return None @@ -102,6 +97,14 @@ def update_project_saved_variant_json(project_id, family_ids=None, dataset_type= return updated_saved_variant_guids +def saved_variants_dataset_type_filter(dataset_type): + xpos_filter_key = 'xpos__gte' if dataset_type == Sample.DATASET_TYPE_MITO_CALLS else 'xpos__lt' + return { + 'alt__isnull': dataset_type == Sample.DATASET_TYPE_SV_CALLS, + xpos_filter_key: get_xpos('M', 1), + } + + def parse_saved_variant_json(variant_json, family): if 'xpos' not in variant_json: variant_json['xpos'] = get_xpos(variant_json['chrom'], variant_json['pos']) From 7362de2892ee3c3b2dd715c8f0b563df7a0906e2 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 12:35:42 -0500 Subject: [PATCH 04/12] update reload tests --- seqr/management/commands/reload_saved_variant_json.py | 2 +- seqr/management/tests/reload_saved_variant_json_tests.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/seqr/management/commands/reload_saved_variant_json.py b/seqr/management/commands/reload_saved_variant_json.py index 2659fd37b3..ccb8ff82d3 100644 --- a/seqr/management/commands/reload_saved_variant_json.py +++ b/seqr/management/commands/reload_saved_variant_json.py @@ -27,7 +27,7 @@ def handle(self, *args, **options): projects = Project.objects.all() logging.info("Processing all %s projects" % len(projects)) - family_ids = [family_guid] if family_id else None + family_ids = [family_guid] if family_guid else None project_list = [(*project, family_ids) for project in projects.values_list('id', 'name')] update_projects_saved_variant_json(project_list, user_email='manage_command') logger.info("Done") diff --git a/seqr/management/tests/reload_saved_variant_json_tests.py b/seqr/management/tests/reload_saved_variant_json_tests.py index f540a9dc44..36b5b81cd2 100644 --- a/seqr/management/tests/reload_saved_variant_json_tests.py +++ b/seqr/management/tests/reload_saved_variant_json_tests.py @@ -7,7 +7,7 @@ PROJECT_NAME = '1kg project n\u00e5me with uni\u00e7\u00f8de' PROJECT_GUID = 'R0001_1kg' -FAMILY_ID = '1' +FAMILY_GUID = 'F000001_1' class 
ReloadSavedVariantJsonTest(TestCase): @@ -23,7 +23,7 @@ def test_with_param_command(self, mock_get_variants, mock_logger): # Test with a specific project and a family id. call_command('reload_saved_variant_json', PROJECT_NAME, - '--family-id={}'.format(FAMILY_ID)) + '--family-guid={}'.format(FAMILY_GUID)) family_1 = Family.objects.get(id=1) mock_get_variants.assert_called_with( @@ -70,7 +70,7 @@ def test_with_param_command(self, mock_get_variants, mock_logger): mock_get_variants.side_effect = Exception("Database error.") call_command('reload_saved_variant_json', PROJECT_GUID, - '--family-id={}'.format(FAMILY_ID)) + '--family-guid={}'.format(FAMILY_GUID)) mock_get_variants.assert_called_with([family_1], ['1-1562437-G-C', '1-46859832-G-A', '21-3343353-GAGA-G'], user=None, user_email='manage_command') From dcd7452328135ee1b6c92d9f7e8c569be489c278 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 12:50:48 -0500 Subject: [PATCH 05/12] update gcnv test --- ...eck_for_new_samples_from_pipeline_tests.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index 2f68a63fb2..ed6005962a 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -76,7 +76,7 @@ def setUp(self): self.addCleanup(patcher.stop) super().setUp() - def _test_success(self, path, metadata, dataset_type, sample_guids, reload_calls, additional_requests=0, num_projects=1): + def _test_success(self, path, metadata, dataset_type, sample_guids, reload_calls, reload_annotations_logs, additional_requests=0): self.mock_subprocess.return_value.stdout = [json.dumps(metadata).encode()] self.mock_subprocess.return_value.wait.return_value = 0 @@ -89,7 +89,8 @@ def _test_success(self, path, metadata, dataset_type, sample_guids, reload_calls self.mock_logger.info.assert_has_calls([ mock.call(f'Loading new samples from {path}: auto__2023-08-08'), - mock.call(f'Loading {len(sample_guids)} WES {dataset_type} samples in {num_projects} projects'), + mock.call(f'Loading {len(sample_guids)} WES {dataset_type} samples in 2 projects'), + ] + [mock.call(log) for log in reload_annotations_logs] + [ mock.call('DONE'), ]) self.mock_logger.warining.assert_not_called() @@ -186,7 +187,7 @@ def test_command(self, mock_email, mock_airtable_utils): search_body = { 'genome_version': 'GRCh38', 'num_results': 1, 'variant_ids': [['12', 48367227, 'TC', 'T']], 'variant_keys': [], } - self._test_success('GRCh38/SNV_INDEL', metadata, dataset_type='SNV_INDEL', num_projects=2, sample_guids={ + self._test_success('GRCh38/SNV_INDEL', metadata, dataset_type='SNV_INDEL', sample_guids={ EXISTING_SAMPLE_GUID, REPLACED_SAMPLE_GUID, NEW_SAMPLE_GUID_P3, NEW_SAMPLE_GUID_P4, }, additional_requests=1, reload_calls=[ {**search_body, 'sample_data': {'SNV_INDEL': [ @@ -196,7 +197,7 @@ def test_command(self, mock_email, mock_airtable_utils): {**search_body, 'sample_data': {'SNV_INDEL': [ {'individual_guid': 'I000018_na21234', 'family_guid': 'F000014_14', 'project_guid': 'R0004_non_analyst_project', 'affected': 'A', 'sample_id': 'NA21234'}, ]}}, - ]) + ], reload_annotations_logs=[]) # TODO old_data_sample_guid = 'S000143_na20885' self.assertFalse(Sample.objects.get(guid=old_data_sample_guid).is_active) @@ -307,16 +308,20 @@ def test_gcnv_command(self): metadata = { 'callsets': ['1kg.vcf.gz'], 'sample_type': 
'WES', - 'family_samples': {'F000012_12': ['NA20889']}, + 'family_samples': {'F000004_4': ['NA20872'], 'F000012_12': ['NA20889']}, } - self._test_success('GRCh37/GCNV', metadata, dataset_type='SV', sample_guids={f'S{GUID_ID}_NA20889'}, reload_calls=[{ + self._test_success('GRCh37/GCNV', metadata, dataset_type='SV', sample_guids={f'S{GUID_ID}_NA20872', f'S{GUID_ID}_NA20889'}, reload_calls=[{ 'genome_version': 'GRCh37', 'num_results': 1, 'variant_ids': [], 'variant_keys': ['prefix_19107_DEL'], 'sample_data': {'SV_WES': [{'individual_guid': 'I000017_na20889', 'family_guid': 'F000012_12', 'project_guid': 'R0003_test', 'affected': 'A', 'sample_id': 'NA20889'}]}, - }]) + }], reload_annotations_logs=['No additional saved variants to update']) - self.mock_send_slack.assert_called_once_with('seqr-data-loading', - f'1 new WES SV samples are loaded in {SEQR_URL}project/{PROJECT_GUID}/project_page\n```NA20889```', - ) + self.mock_send_slack.assert_has_calls([ + mock.call( + 'seqr-data-loading', f'1 new WES SV samples are loaded in {SEQR_URL}project/R0001_1kg/project_page\n```NA20872```', + ), mock.call( + 'seqr-data-loading', f'1 new WES SV samples are loaded in {SEQR_URL}project/{PROJECT_GUID}/project_page\n```NA20889```', + ), + ]) self.mock_utils_logger.error.assert_called_with('Error in project Test Reprocessed Project: Bad Request') self.mock_utils_logger.info.assert_has_calls([ From 485507f3fec7f55369707d7d26f3250aebe7704c Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 12:52:00 -0500 Subject: [PATCH 06/12] update gcnv test --- .../tests/check_for_new_samples_from_pipeline_tests.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index ed6005962a..deaf39d5da 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -171,9 +171,7 @@ def test_command(self, mock_email, mock_airtable_utils): str(ce.exception), 'Data has genome version GRCh38 but the following projects have conflicting versions: R0003_test (GRCh37)') - project = Project.objects.get(guid=PROJECT_GUID) - project.genome_version = 38 - project.save() + Project.objects.filter(id__in=[3]).update(genome_version=38) with self.assertRaises(ValueError) as ce: call_command('check_for_new_samples_from_pipeline', 'GRCh38/SNV_INDEL', 'auto__2023-08-08') From 7742ab46b7c4a9259b7c26aa018a4e9db7cbe796 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 13:11:22 -0500 Subject: [PATCH 07/12] test multi lookup --- .../check_for_new_samples_from_pipeline.py | 2 +- ...eck_for_new_samples_from_pipeline_tests.py | 34 +++++++++++++++---- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py index 921b4dc7d7..9efc1bf04a 100644 --- a/seqr/management/commands/check_for_new_samples_from_pipeline.py +++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py @@ -134,7 +134,7 @@ def _reload_shared_variant_annotations(self, updated_families, dataset_type, sam variants_by_id[v.variant_id].append(v) logger.info(f'Reloading shared annotations for {len(variant_models)} saved variants ({len(variants_by_id)} unique)') - updated_variants = hail_variant_multi_lookup(USER_EMAIL, variants_by_id.keys(), data_type, genome_version) + updated_variants = 
hail_variant_multi_lookup(USER_EMAIL, sorted(variants_by_id.keys()), data_type, genome_version) logger.info(f'Fetched {len(updated_variants)} variants') updated_variant_models = [] for variant in updated_variants: diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index deaf39d5da..04b48f4d53 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -76,7 +76,7 @@ def setUp(self): self.addCleanup(patcher.stop) super().setUp() - def _test_success(self, path, metadata, dataset_type, sample_guids, reload_calls, reload_annotations_logs, additional_requests=0): + def _test_success(self, path, metadata, dataset_type, sample_guids, reload_calls, reload_annotations_logs, has_additional_requests=False): self.mock_subprocess.return_value.stdout = [json.dumps(metadata).encode()] self.mock_subprocess.return_value.wait.return_value = 0 @@ -102,9 +102,9 @@ def _test_success(self, path, metadata, dataset_type, sample_guids, reload_calls ]) # Test reload saved variants - self.assertEqual(len(responses.calls), len(reload_calls) + additional_requests) + self.assertEqual(len(responses.calls), len(reload_calls) + (2 if has_additional_requests else 0)) for i, call in enumerate(reload_calls): - resp = responses.calls[i+additional_requests] + resp = responses.calls[i+(1 if has_additional_requests else 0)] self.assertEqual(resp.request.url, f'{MOCK_HAIL_HOST}:5000/search') self.assertEqual(resp.request.headers.get('From'), 'manage_command') self.assertDictEqual(json.loads(resp.request.body), call) @@ -134,6 +134,9 @@ def test_command(self, mock_email, mock_airtable_utils): 'results': [{'variantId': '12-48367227-TC-T', 'familyGuids': ['F000014_14'], 'updated_field': 'updated_value'}], 'total': 1, }) + responses.add(responses.POST, f'{MOCK_HAIL_HOST}:5000/multi_lookup', status=200, json={ + 'results': [{'variantId': '21-3343353-GAGA-G', 'updated_new_field': 'updated_value', 'rsid': 'rs123'}], + }) # Test errors with self.assertRaises(CommandError) as ce: @@ -171,7 +174,7 @@ def test_command(self, mock_email, mock_airtable_utils): str(ce.exception), 'Data has genome version GRCh38 but the following projects have conflicting versions: R0003_test (GRCh37)') - Project.objects.filter(id__in=[3]).update(genome_version=38) + Project.objects.filter(id__in=[1, 3]).update(genome_version=38) with self.assertRaises(ValueError) as ce: call_command('check_for_new_samples_from_pipeline', 'GRCh38/SNV_INDEL', 'auto__2023-08-08') @@ -187,7 +190,7 @@ def test_command(self, mock_email, mock_airtable_utils): } self._test_success('GRCh38/SNV_INDEL', metadata, dataset_type='SNV_INDEL', sample_guids={ EXISTING_SAMPLE_GUID, REPLACED_SAMPLE_GUID, NEW_SAMPLE_GUID_P3, NEW_SAMPLE_GUID_P4, - }, additional_requests=1, reload_calls=[ + }, has_additional_requests=True, reload_calls=[ {**search_body, 'sample_data': {'SNV_INDEL': [ {'individual_guid': 'I000017_na20889', 'family_guid': 'F000012_12', 'project_guid': 'R0003_test', 'affected': 'A', 'sample_id': 'NA20889'}, {'individual_guid': 'I000016_na20888', 'family_guid': 'F000012_12', 'project_guid': 'R0003_test', 'affected': 'A', 'sample_id': 'NA20888'}, @@ -195,7 +198,9 @@ def test_command(self, mock_email, mock_airtable_utils): {**search_body, 'sample_data': {'SNV_INDEL': [ {'individual_guid': 'I000018_na21234', 'family_guid': 'F000014_14', 'project_guid': 'R0004_non_analyst_project', 'affected': 'A', 
'sample_id': 'NA21234'}, ]}}, - ], reload_annotations_logs=[]) # TODO + ], reload_annotations_logs=[ + 'Reloading shared annotations for 4 saved variants (4 unique)', 'Fetched 1 variants', 'Updated 1 saved variants', + ]) old_data_sample_guid = 'S000143_na20885' self.assertFalse(Sample.objects.get(guid=old_data_sample_guid).is_active) @@ -237,10 +242,27 @@ def test_command(self, mock_email, mock_airtable_utils): self.assertEqual(Family.objects.get(guid='F000014_14').analysis_status, 'Rncc') # Test SavedVariant model updated + multi_lookup_request = responses.calls[3].request + self.assertEqual(multi_lookup_request.url, f'{MOCK_HAIL_HOST}:5000/multi_lookup') + self.assertEqual(multi_lookup_request.headers.get('From'), 'manage_command') + # TODO reuse family specific variant where needed (12-48367227-TC-T) + self.assertDictEqual(json.loads(multi_lookup_request.body), { + 'genome_version': 'GRCh38', + 'data_type': 'SNV_INDEL', + 'variant_ids': ['1-1562437-G-C', '1-46859832-G-A', '12-48367227-TC-T', '21-3343353-GAGA-G'], + }) + updated_variants = SavedVariant.objects.filter(saved_variant_json__updated_field='updated_value') self.assertEqual(len(updated_variants), 1) self.assertEqual(updated_variants.first().guid, 'SV0000006_1248367227_r0004_non') + annotation_updated_json = SavedVariant.objects.get(guid='SV0000001_2103343353_r0390_100').saved_variant_json + self.assertEqual(len(annotation_updated_json), 20) + self.assertEqual(annotation_updated_json['updated_new_field'], 'updated_value') + self.assertEqual(annotation_updated_json['rsid'], 'rs123') + self.assertEqual(annotation_updated_json['mainTranscriptId'], 'ENST00000258436') + self.assertEqual(len(annotation_updated_json['genotypes']), 3) + self.mock_utils_logger.error.assert_not_called() self.mock_utils_logger.info.assert_has_calls([ mock.call('Updated 0 variants for project Test Reprocessed Project'), From 62e06f6ffae1344c6f70ea734e7b8a252e8a7def Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 15:40:01 -0500 Subject: [PATCH 08/12] do not refetch updated variants --- .../check_for_new_samples_from_pipeline.py | 26 ++++++++++++++----- ...eck_for_new_samples_from_pipeline_tests.py | 18 +++++++++---- seqr/views/utils/variant_utils.py | 17 +++++++----- 3 files changed, 42 insertions(+), 19 deletions(-) diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py index 9efc1bf04a..0ee5626438 100644 --- a/seqr/management/commands/check_for_new_samples_from_pipeline.py +++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py @@ -106,12 +106,15 @@ def handle(self, *args, **options): updated_project_families.append((project.id, project.name, project_families)) # Reload saved variant JSON - update_projects_saved_variant_json(updated_project_families, user_email=USER_EMAIL, dataset_type=dataset_type) - self._reload_shared_variant_annotations(updated_families, dataset_type, sample_type, genome_version) + updated_variants_by_id = update_projects_saved_variant_json( + updated_project_families, user_email=USER_EMAIL, dataset_type=dataset_type) + self._reload_shared_variant_annotations( + updated_variants_by_id, updated_families, dataset_type, sample_type, genome_version) logger.info('DONE') - def _reload_shared_variant_annotations(self, updated_families, dataset_type, sample_type, genome_version): + @staticmethod + def _reload_shared_variant_annotations(updated_variants_by_id, updated_families, dataset_type, sample_type, genome_version): 
data_type = dataset_type updated_annotation_samples = Sample.objects.filter( is_active=True, dataset_type=dataset_type, @@ -134,11 +137,20 @@ def _reload_shared_variant_annotations(self, updated_families, dataset_type, sam variants_by_id[v.variant_id].append(v) logger.info(f'Reloading shared annotations for {len(variant_models)} saved variants ({len(variants_by_id)} unique)') - updated_variants = hail_variant_multi_lookup(USER_EMAIL, sorted(variants_by_id.keys()), data_type, genome_version) - logger.info(f'Fetched {len(updated_variants)} variants') + + updated_variants_by_id = { + variant_id: {k: v for k, v in variant.items() if k not in {'familyGuids', 'genotypes', 'genotypeFilters'}} + for variant_id, variant in updated_variants_by_id.items() + } + fetch_variant_ids = set(variants_by_id.keys()) - set(updated_variants_by_id.keys()) + if fetch_variant_ids: + updated_variants = hail_variant_multi_lookup(USER_EMAIL, sorted(fetch_variant_ids), data_type, genome_version) + logger.info(f'Fetched {len(updated_variants)} additional variants') + updated_variants_by_id.update({variant['variantId']: variant for variant in updated_variants}) + updated_variant_models = [] - for variant in updated_variants: - for variant_model in variants_by_id[variant['variantId']]: + for variant_id, variant in updated_variants_by_id.items(): + for variant_model in variants_by_id[variant_id]: variant_model.saved_variant_json.update(variant) updated_variant_models.append(variant_model) diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index 04b48f4d53..89c4679502 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -199,7 +199,7 @@ def test_command(self, mock_email, mock_airtable_utils): {'individual_guid': 'I000018_na21234', 'family_guid': 'F000014_14', 'project_guid': 'R0004_non_analyst_project', 'affected': 'A', 'sample_id': 'NA21234'}, ]}}, ], reload_annotations_logs=[ - 'Reloading shared annotations for 4 saved variants (4 unique)', 'Fetched 1 variants', 'Updated 1 saved variants', + 'Reloading shared annotations for 4 saved variants (4 unique)', 'Fetched 1 additional variants', 'Updated 2 saved variants', ]) old_data_sample_guid = 'S000143_na20885' @@ -245,16 +245,24 @@ def test_command(self, mock_email, mock_airtable_utils): multi_lookup_request = responses.calls[3].request self.assertEqual(multi_lookup_request.url, f'{MOCK_HAIL_HOST}:5000/multi_lookup') self.assertEqual(multi_lookup_request.headers.get('From'), 'manage_command') - # TODO reuse family specific variant where needed (12-48367227-TC-T) self.assertDictEqual(json.loads(multi_lookup_request.body), { 'genome_version': 'GRCh38', 'data_type': 'SNV_INDEL', - 'variant_ids': ['1-1562437-G-C', '1-46859832-G-A', '12-48367227-TC-T', '21-3343353-GAGA-G'], + 'variant_ids': ['1-1562437-G-C', '1-46859832-G-A', '21-3343353-GAGA-G'], }) updated_variants = SavedVariant.objects.filter(saved_variant_json__updated_field='updated_value') - self.assertEqual(len(updated_variants), 1) - self.assertEqual(updated_variants.first().guid, 'SV0000006_1248367227_r0004_non') + self.assertEqual(len(updated_variants), 2) + self.assertSetEqual( + {v.guid for v in updated_variants}, + {'SV0000006_1248367227_r0004_non', 'SV0000002_1248367227_r0390_100'} + ) + reloaded_variant = next(v for v in updated_variants if v.guid == 'SV0000006_1248367227_r0004_non') + annotation_updated_variant = 
next(v for v in updated_variants if v.guid == 'SV0000002_1248367227_r0390_100') + self.assertEqual(len(reloaded_variant.saved_variant_json), 3) + self.assertListEqual(reloaded_variant.saved_variant_json['familyGuids'], ['F000014_14']) + self.assertEqual(len(annotation_updated_variant.saved_variant_json), 18) + self.assertListEqual(annotation_updated_variant.saved_variant_json['familyGuids'], ['F000001_1']) annotation_updated_json = SavedVariant.objects.get(guid='SV0000001_2103343353_r0390_100').saved_variant_json self.assertEqual(len(annotation_updated_json), 20) diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index ff955ae632..3daba835ca 100644 --- a/seqr/views/utils/variant_utils.py +++ b/seqr/views/utils/variant_utils.py @@ -33,16 +33,18 @@ def update_projects_saved_variant_json(projects, user_email, **kwargs): success = {} skipped = {} error = {} + updated_variants_by_id = {} logger.info(f'Reloading saved variants in {len(projects)} projects') for project_id, project_name, family_guids in tqdm(projects, unit=' project'): try: - updated_saved_variant_guids = update_project_saved_variant_json( + updated_saved_variants = update_project_saved_variant_json( project_id, user_email=user_email, family_guids=family_guids, **kwargs) - if updated_saved_variant_guids is None: + if updated_saved_variants is None: skipped[project_name] = True else: - success[project_name] = len(updated_saved_variant_guids) - logger.info(f'Updated {len(updated_saved_variant_guids)} variants for project {project_name}') + success[project_name] = len(updated_saved_variants) + logger.info(f'Updated {len(updated_saved_variants)} variants for project {project_name}') + updated_variants_by_id.update({v.variant_id: v.saved_variant_json for v in updated_saved_variants.values()}) except Exception as e: traceback_message = traceback.format_exc() logger.error(traceback_message) @@ -59,6 +61,7 @@ def update_projects_saved_variant_json(projects, user_email, **kwargs): logger.info(f'{len(error)} failed projects') for k, v in error.items(): logger.info(f' {k}: {v}') + return updated_variants_by_id def update_project_saved_variant_json(project_id, family_guids=None, dataset_type=None, user=None, user_email=None): @@ -86,15 +89,15 @@ def update_project_saved_variant_json(project_id, family_guids=None, dataset_typ for sub_var_ids in [variant_ids[i:i+MAX_VARIANTS_FETCH] for i in range(0, len(variant_ids), MAX_VARIANTS_FETCH)]: variants_json += get_variants_for_variant_ids(families, sub_var_ids, user=user, user_email=user_email) - updated_saved_variant_guids = [] + updated_saved_variants = {} for var in variants_json: for family_guid in var['familyGuids']: saved_variant = saved_variants_map.get((var['variantId'], family_guid)) if saved_variant: update_model_from_json(saved_variant, {'saved_variant_json': var}, user) - updated_saved_variant_guids.append(saved_variant.guid) + updated_saved_variants[saved_variant.guid] = saved_variant - return updated_saved_variant_guids + return updated_saved_variants def saved_variants_dataset_type_filter(dataset_type): From 4e18bdc5a495b31b4c4c386bd3717a464e918936 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 11 Mar 2024 13:54:53 -0400 Subject: [PATCH 09/12] update fixture data to test ulifted variants --- seqr/fixtures/1kg_project.json | 3 +- .../commands/lift_project_to_hg38.py | 2 +- ...eck_for_new_samples_from_pipeline_tests.py | 8 ++--- .../tests/lift_project_to_hg38_tests.py | 35 +++++++++++-------- 4 files changed, 27 insertions(+), 21 deletions(-) diff 
--git a/seqr/fixtures/1kg_project.json b/seqr/fixtures/1kg_project.json index 7032d4f55f..ca050a560f 100644 --- a/seqr/fixtures/1kg_project.json +++ b/seqr/fixtures/1kg_project.json @@ -1884,7 +1884,6 @@ "genotypeFilters": "pass", "mainTranscriptId": "ENST00000505820", "populations": {"callset": {"ac": null, "an": null, "af": null}, "g1k": {"ac": null, "an": null, "af": 0.0}, "gnomad_genomes": {"hemi": null, "ac": null, "an": null, "hom": null, "af": 0.0}, "gnomad_exomes": {"hemi": null, "ac": null, "an": null, "hom": null, "af": 8.142526788913136e-06}, "exac": {"hemi": null, "ac": null, "an": null, "hom": null, "af": 0.0}, "topmed": {"ac": null, "an": null, "af": null}}, - "genomeVersion": "37", "pos": 1560662, "predictions": {"eigen": null, "revel": null, "sift": "damaging", "cadd": "31", "metasvm": "D", "mpc": null, "splice_ai": null, "phastcons_100_vert": null, "mut_taster": "disease_causing", "fathmm": "tolerated", "polyphen": "probably_damaging", "dann": null, "primate_ai": null, "gerp_rs": null}, "hgmd": {"accession": null, "class": null}, @@ -1970,7 +1969,7 @@ "hom": 10332 } }, - "genomeVersion": "37", + "genomeVersion": "38", "pos": 1562437, "hgmd": {"accession": null, "class": null}, "rsid": null, diff --git a/seqr/management/commands/lift_project_to_hg38.py b/seqr/management/commands/lift_project_to_hg38.py index 3f98c63616..ae182f42ca 100644 --- a/seqr/management/commands/lift_project_to_hg38.py +++ b/seqr/management/commands/lift_project_to_hg38.py @@ -77,7 +77,7 @@ def handle(self, *args, **options): len(es_variants), len(missing_variants) + len(lift_failed), missing_family_count)) def _get_variants_to_lift(saved_variants): - saved_variants_to_lift = [v for v in saved_variants if v['genomeVersion'] != GENOME_VERSION_GRCh38] + saved_variants_to_lift = [v for v in saved_variants if v.get('genomeVersion') != GENOME_VERSION_GRCh38] num_already_lifted = len(saved_variants) - len(saved_variants_to_lift) if num_already_lifted: if input('Found {} saved variants already on Hg38. Continue with liftover (y/n)? 
'.format( diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index 89c4679502..de2f1efd8a 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -135,7 +135,7 @@ def test_command(self, mock_email, mock_airtable_utils): 'total': 1, }) responses.add(responses.POST, f'{MOCK_HAIL_HOST}:5000/multi_lookup', status=200, json={ - 'results': [{'variantId': '21-3343353-GAGA-G', 'updated_new_field': 'updated_value', 'rsid': 'rs123'}], + 'results': [{'variantId': '1-46859832-G-A', 'updated_new_field': 'updated_value', 'rsid': 'rs123'}], }) # Test errors @@ -264,11 +264,11 @@ def test_command(self, mock_email, mock_airtable_utils): self.assertEqual(len(annotation_updated_variant.saved_variant_json), 18) self.assertListEqual(annotation_updated_variant.saved_variant_json['familyGuids'], ['F000001_1']) - annotation_updated_json = SavedVariant.objects.get(guid='SV0000001_2103343353_r0390_100').saved_variant_json - self.assertEqual(len(annotation_updated_json), 20) + annotation_updated_json = SavedVariant.objects.get(guid='SV0059956_11560662_f019313_1').saved_variant_json + self.assertEqual(len(annotation_updated_json), 18) self.assertEqual(annotation_updated_json['updated_new_field'], 'updated_value') self.assertEqual(annotation_updated_json['rsid'], 'rs123') - self.assertEqual(annotation_updated_json['mainTranscriptId'], 'ENST00000258436') + self.assertEqual(annotation_updated_json['mainTranscriptId'], 'ENST00000505820') self.assertEqual(len(annotation_updated_json['genotypes']), 3) self.mock_utils_logger.error.assert_not_called() diff --git a/seqr/management/tests/lift_project_to_hg38_tests.py b/seqr/management/tests/lift_project_to_hg38_tests.py index 0342bd0fac..b3e1c1e833 100644 --- a/seqr/management/tests/lift_project_to_hg38_tests.py +++ b/seqr/management/tests/lift_project_to_hg38_tests.py @@ -59,11 +59,11 @@ def test_command(self, mock_liftover, mock_get_es_variants, mock_input, mock_get calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), mock.call('Validating es index test_new_index'), - mock.call('Lifting over 4 variants (skipping 0 that are already lifted)'), + mock.call('Lifting over 3 variants (skipping 1 that are already lifted)'), mock.call('Successfully lifted over 3 variants'), mock.call('Successfully updated 3 variants'), mock.call('---Done---'), - mock.call('Succesfully lifted over 3 variants. Skipped 3 failed variants. Family data not updated for 0 variants') + mock.call('Succesfully lifted over 3 variants. Skipped 2 failed variants. 
Family data not updated for 0 variants') ] mock_logger.info.assert_has_calls(calls) @@ -72,7 +72,6 @@ def test_command(self, mock_liftover, mock_get_es_variants, mock_input, mock_get calls = [ mock.call('chr21', 3343353), - mock.call('chr1', 1562437), mock.call('chr1', 248367227), mock.call('chr1', 1560662), ] @@ -80,7 +79,7 @@ def test_command(self, mock_liftover, mock_get_es_variants, mock_input, mock_get families = {family for family in Family.objects.filter(pk__in = [1, 2])} self.assertSetEqual(families, mock_get_es_variants.call_args.args[0]) - self.assertSetEqual({'1-1627057-G-C', '21-3343400-GAGA-G', '1-248203925-TC-T', '1-46394160-G-A'}, + self.assertSetEqual({'21-3343400-GAGA-G', '1-248203925-TC-T', '1-46394160-G-A'}, set(mock_get_es_variants.call_args.args[1])) # Test discontinue on lifted variants @@ -90,7 +89,7 @@ def test_command(self, mock_liftover, mock_get_es_variants, mock_input, mock_get call_command('lift_project_to_hg38', '--project={}'.format(PROJECT_NAME), '--es-index={}'.format(ELASTICSEARCH_INDEX)) - self.assertEqual(str(ce.exception), 'Error: found 1 saved variants already on Hg38') + self.assertEqual(str(ce.exception), 'Error: found 2 saved variants already on Hg38') calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), @@ -177,12 +176,19 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, call_command('lift_project_to_hg38', '--project={}'.format(PROJECT_NAME), '--es-index={}'.format(ELASTICSEARCH_INDEX)) - self.assertEqual(str(ce.exception), 'Error: unable to lift over 4 variants') + self.assertEqual(str(ce.exception), 'Error: found 1 saved variants already on Hg38') + + mock_input.side_effect = ['y', 'n'] + with self.assertRaises(CommandError) as ce: + call_command('lift_project_to_hg38', '--project={}'.format(PROJECT_NAME), + '--es-index={}'.format(ELASTICSEARCH_INDEX)) + + self.assertEqual(str(ce.exception), 'Error: unable to lift over 3 variants') calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), mock.call('Validating es index test_new_index'), - mock.call('Lifting over 4 variants (skipping 0 that are already lifted)') + mock.call('Lifting over 3 variants (skipping 1 that are already lifted)') ] mock_logger.info.assert_has_calls(calls) @@ -190,16 +196,17 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, mock_get_es_variants.return_value = VARIANTS mock_liftover_to_38.convert_coordinate.side_effect = mock_convert_coordinate mock_logger.reset_mock() + mock_input.side_effect = ['y', 'n', 'n'] with self.assertRaises(CommandError) as ce: call_command('lift_project_to_hg38', '--project={}'.format(PROJECT_NAME), '--es-index={}'.format(ELASTICSEARCH_INDEX)) - self.assertEqual(str(ce.exception), 'Error: unable to find 3 lifted-over variants') + self.assertEqual(str(ce.exception), 'Error: unable to find 2 lifted-over variants') calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), mock.call('Validating es index test_new_index'), - mock.call('Lifting over 4 variants (skipping 0 that are already lifted)') + mock.call('Lifting over 3 variants (skipping 1 that are already lifted)') ] mock_logger.info.assert_has_calls(calls) @@ -207,7 +214,7 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, self.assertSetEqual(mock_get_es_variants.call_args.args[0], families) self.assertSetEqual( set(mock_get_es_variants.call_args.args[1]), - {'1-1627057-G-C', '21-3343400-GAGA-G', '1-248203925-TC-T', 
'1-46394160-G-A'} + {'21-3343400-GAGA-G', '1-248203925-TC-T', '1-46394160-G-A'} ) # Test discontinue on missing family data while updating the saved variants @@ -215,7 +222,7 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, variants.append(SINGLE_VARIANT) mock_get_es_variants.return_value = variants mock_logger.reset_mock() - mock_input.side_effect = ['y', 'n'] + mock_input.side_effect = ['y', 'y', 'n'] with self.assertRaises(CommandError) as ce: call_command('lift_project_to_hg38', '--project={}'.format(PROJECT_NAME), '--es-index={}'.format(ELASTICSEARCH_INDEX)) @@ -225,7 +232,7 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), mock.call('Validating es index test_new_index'), - mock.call('Lifting over 4 variants (skipping 0 that are already lifted)'), + mock.call('Lifting over 3 variants (skipping 1 that are already lifted)'), mock.call('Successfully lifted over 4 variants') ] mock_logger.info.assert_has_calls(calls) @@ -240,11 +247,11 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), mock.call('Validating es index test_new_index'), - mock.call('Lifting over 3 variants (skipping 1 that are already lifted)'), + mock.call('Lifting over 2 variants (skipping 2 that are already lifted)'), mock.call('Successfully lifted over 4 variants'), mock.call('Successfully updated 4 variants'), mock.call('---Done---'), - mock.call('Succesfully lifted over 4 variants. Skipped 2 failed variants. Family data not updated for 1 variants') + mock.call('Succesfully lifted over 4 variants. Skipped 1 failed variants. 
Family data not updated for 1 variants') ] mock_logger.info.assert_has_calls(calls) From 868b726ea6f251664e312ea91e503aeb5ba90531 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 11 Mar 2024 14:15:28 -0400 Subject: [PATCH 10/12] only reload variants on correct genome build --- seqr/fixtures/1kg_project.json | 2 +- .../commands/check_for_new_samples_from_pipeline.py | 7 +++++-- .../tests/check_for_new_samples_from_pipeline_tests.py | 8 ++++++-- seqr/views/apis/summary_data_api_tests.py | 2 +- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/seqr/fixtures/1kg_project.json b/seqr/fixtures/1kg_project.json index ca050a560f..e3a01d8fa6 100644 --- a/seqr/fixtures/1kg_project.json +++ b/seqr/fixtures/1kg_project.json @@ -2084,7 +2084,7 @@ "alt": "T", "variant_id": "12-48367227-TC-T", "saved_variant_json": { - "liftedOverGenomeVersion": "38", "liftedOverPos": "", "genomeVersion": "37", "pos": 248367227, + "liftedOverGenomeVersion": "37", "liftedOverPos": "", "genomeVersion": "38", "pos": 248367227, "transcripts": {}, "chrom": "1", "genotypes": { "I000018_na21234": {"sampleId": "NA20885", "ab": 0.0, "gq": 99.0, "numAlt": 1}}}, "family": 14 diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py index 0ee5626438..f61d592596 100644 --- a/seqr/management/commands/check_for_new_samples_from_pipeline.py +++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py @@ -1,6 +1,7 @@ from collections import defaultdict from django.contrib.postgres.aggregates import ArrayAgg from django.core.management.base import BaseCommand, CommandError +from django.db.models import Q from django.db.models.functions import JSONObject import json import logging @@ -116,9 +117,10 @@ def handle(self, *args, **options): @staticmethod def _reload_shared_variant_annotations(updated_variants_by_id, updated_families, dataset_type, sample_type, genome_version): data_type = dataset_type + db_genome_version = genome_version.replace('GRCh', '') updated_annotation_samples = Sample.objects.filter( is_active=True, dataset_type=dataset_type, - individual__family__project__genome_version=genome_version.replace('GRCh', ''), + individual__family__project__genome_version=db_genome_version, ).exclude(individual__family__guid__in=updated_families) if dataset_type == Sample.DATASET_TYPE_SV_CALLS: updated_annotation_samples = updated_annotation_samples.filter(sample_type=sample_type) @@ -127,7 +129,8 @@ def _reload_shared_variant_annotations(updated_variants_by_id, updated_families, variant_models = SavedVariant.objects.filter( family_id__in=updated_annotation_samples.values_list('individual__family', flat=True).distinct(), **saved_variants_dataset_type_filter(dataset_type), - ) + ).filter(Q(saved_variant_json__genomeVersion__isnull=True) | Q(saved_variant_json__genomeVersion=db_genome_version)) + if not variant_models: logger.info('No additional saved variants to update') return diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index de2f1efd8a..5662885fa6 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -174,7 +174,11 @@ def test_command(self, mock_email, mock_airtable_utils): str(ce.exception), 'Data has genome version GRCh38 but the following projects have conflicting versions: R0003_test (GRCh37)') + # Update fixture data to 
allow testing edge cases Project.objects.filter(id__in=[1, 3]).update(genome_version=38) + sv = SavedVariant.objects.get(guid='SV0000002_1248367227_r0390_100') + sv.saved_variant_json['genomeVersion'] = '38' + sv.save() with self.assertRaises(ValueError) as ce: call_command('check_for_new_samples_from_pipeline', 'GRCh38/SNV_INDEL', 'auto__2023-08-08') @@ -199,7 +203,7 @@ def test_command(self, mock_email, mock_airtable_utils): {'individual_guid': 'I000018_na21234', 'family_guid': 'F000014_14', 'project_guid': 'R0004_non_analyst_project', 'affected': 'A', 'sample_id': 'NA21234'}, ]}}, ], reload_annotations_logs=[ - 'Reloading shared annotations for 4 saved variants (4 unique)', 'Fetched 1 additional variants', 'Updated 2 saved variants', + 'Reloading shared annotations for 3 saved variants (3 unique)', 'Fetched 1 additional variants', 'Updated 2 saved variants', ]) old_data_sample_guid = 'S000143_na20885' @@ -248,7 +252,7 @@ def test_command(self, mock_email, mock_airtable_utils): self.assertDictEqual(json.loads(multi_lookup_request.body), { 'genome_version': 'GRCh38', 'data_type': 'SNV_INDEL', - 'variant_ids': ['1-1562437-G-C', '1-46859832-G-A', '21-3343353-GAGA-G'], + 'variant_ids': ['1-1562437-G-C', '1-46859832-G-A'], }) updated_variants = SavedVariant.objects.filter(saved_variant_json__updated_field='updated_value') diff --git a/seqr/views/apis/summary_data_api_tests.py b/seqr/views/apis/summary_data_api_tests.py index 18c72f6fd3..6e55fc315c 100644 --- a/seqr/views/apis/summary_data_api_tests.py +++ b/seqr/views/apis/summary_data_api_tests.py @@ -151,7 +151,7 @@ 'end-1': None, 'ref-1': 'TC', 'zygosity-1': 'Heterozygous', - 'variant_reference_assembly-1': 'GRCh37', + 'variant_reference_assembly-1': 'GRCh38', 'allele_balance_or_heteroplasmy_percentage-1': None, 'gene-1': None, 'gene_id-1': None, From 14f03d07bdc37d551bb576d61b3aeb05e81a2d4c Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 11 Mar 2024 14:23:02 -0400 Subject: [PATCH 11/12] fix test --- matchmaker/views/matchmaker_api_tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/matchmaker/views/matchmaker_api_tests.py b/matchmaker/views/matchmaker_api_tests.py index 7b898c3c7f..149d0cde77 100644 --- a/matchmaker/views/matchmaker_api_tests.py +++ b/matchmaker/views/matchmaker_api_tests.py @@ -306,7 +306,7 @@ def test_search_individual_mme_matches(self, mock_email, mock_logger, mock_slack 'start': 248367227, 'referenceBases': 'TC', 'alternateBases': 'T', - 'assembly': 'GRCh37', + 'assembly': 'GRCh38', }, 'zygosity': 1, }], @@ -332,7 +332,7 @@ def test_search_individual_mme_matches(self, mock_email, mock_logger, mock_slack 'ref': 'TC', 'alt': 'T', 'end': None, - 'genomeVersion': 'GRCh37', + 'genomeVersion': 'GRCh38', } } ], From 464cb3b6f13dd33b636b17d2da46f2b7088a7f1d Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 11 Mar 2024 14:33:29 -0400 Subject: [PATCH 12/12] properly parse variant ids --- .../check_for_new_samples_from_pipeline.py | 6 +++++- .../check_for_new_samples_from_pipeline_tests.py | 2 +- seqr/utils/search/utils.py | 14 +++++++++----- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py index f61d592596..2a4c0d71a9 100644 --- a/seqr/management/commands/check_for_new_samples_from_pipeline.py +++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py @@ -10,6 +10,7 @@ from seqr.models import Family, Sample, SavedVariant from 
seqr.utils.file_utils import file_iter, does_file_exist from seqr.utils.search.add_data_utils import notify_search_data_loaded +from seqr.utils.search.utils import parse_valid_variant_id from seqr.utils.search.hail_search_utils import hail_variant_multi_lookup from seqr.views.utils.dataset_utils import match_and_update_search_samples from seqr.views.utils.variant_utils import reset_cached_search_results, update_projects_saved_variant_json, \ @@ -117,12 +118,13 @@ def handle(self, *args, **options): @staticmethod def _reload_shared_variant_annotations(updated_variants_by_id, updated_families, dataset_type, sample_type, genome_version): data_type = dataset_type + is_sv = dataset_type == Sample.DATASET_TYPE_SV_CALLS db_genome_version = genome_version.replace('GRCh', '') updated_annotation_samples = Sample.objects.filter( is_active=True, dataset_type=dataset_type, individual__family__project__genome_version=db_genome_version, ).exclude(individual__family__guid__in=updated_families) - if dataset_type == Sample.DATASET_TYPE_SV_CALLS: + if is_sv: updated_annotation_samples = updated_annotation_samples.filter(sample_type=sample_type) data_type = f'{dataset_type}_{sample_type}' @@ -147,6 +149,8 @@ def _reload_shared_variant_annotations(updated_variants_by_id, updated_families, } fetch_variant_ids = set(variants_by_id.keys()) - set(updated_variants_by_id.keys()) if fetch_variant_ids: + if not is_sv: + fetch_variant_ids = [parse_valid_variant_id(variant_id) for variant_id in fetch_variant_ids] updated_variants = hail_variant_multi_lookup(USER_EMAIL, sorted(fetch_variant_ids), data_type, genome_version) logger.info(f'Fetched {len(updated_variants)} additional variants') updated_variants_by_id.update({variant['variantId']: variant for variant in updated_variants}) diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index 5662885fa6..4cb3e15983 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -252,7 +252,7 @@ def test_command(self, mock_email, mock_airtable_utils): self.assertDictEqual(json.loads(multi_lookup_request.body), { 'genome_version': 'GRCh38', 'data_type': 'SNV_INDEL', - 'variant_ids': ['1-1562437-G-C', '1-46859832-G-A'], + 'variant_ids': [['1', 1562437, 'G', 'C'], ['1', 46859832, 'G', 'A']], }) updated_variants = SavedVariant.objects.filter(saved_variant_json__updated_field='updated_value') diff --git a/seqr/utils/search/utils.py b/seqr/utils/search/utils.py index c1f0e6d4b4..69c302fd16 100644 --- a/seqr/utils/search/utils.py +++ b/seqr/utils/search/utils.py @@ -333,15 +333,19 @@ def _parse_variant_items(search_json): def _parse_variant_id(variant_id): try: - chrom, pos, ref, alt = variant_id.split('-') - chrom = format_chrom(chrom) - pos = int(pos) - get_xpos(chrom, pos) - return chrom, pos, ref, alt + return parse_valid_variant_id(variant_id) except (KeyError, ValueError): return None +def parse_valid_variant_id(variant_id): + chrom, pos, ref, alt = variant_id.split('-') + chrom = format_chrom(chrom) + pos = int(pos) + get_xpos(chrom, pos) + return chrom, pos, ref, alt + + def _validate_sort(sort, families): if sort == PRIORITIZED_GENE_SORT and len(families) > 1: raise InvalidSearchException('Phenotype sort is only supported for single-family search.')
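Taken together, these patches add a bulk variant lookup path: the hail search backend exposes a /multi_lookup endpoint, hail_variant_multi_lookup wraps it on the seqr side, and check_for_new_samples_from_pipeline uses it to refresh shared annotations on saved variants without refetching per-family genotype data. Below is a minimal usage sketch of the helper; it assumes a configured seqr environment with a reachable hail search backend, and the variant ids and user email shown are hypothetical examples.

from seqr.utils.search.hail_search_utils import hail_variant_multi_lookup

# SNV/indel variant ids are passed as parsed (chrom, pos, ref, alt) tuples
# (see parse_valid_variant_id in PATCH 12/12); SV and gCNV ids stay as strings.
variant_ids = [('1', 1562437, 'G', 'C'), ('1', 46859832, 'G', 'A')]

# Returns a list of annotation-only variant dicts (no familyGuids, genotypes,
# or genotypeFilters), one per variant id found for the requested data type.
updated_variants = hail_variant_multi_lookup('manage_command', variant_ids, 'SNV_INDEL', 'GRCh38')
for variant in updated_variants:
    print(variant['variantId'])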