From fc9af3c8b253d8b71f1d0772c66c46571afb28d3 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 4 Mar 2024 17:30:31 -0500 Subject: [PATCH 01/28] add multi lookup endpoint --- hail_search/queries/base.py | 19 ++++++++++++++----- hail_search/search.py | 5 +++++ hail_search/test_search.py | 30 ++++++++++++++++++++++++++++-- hail_search/web_app.py | 8 +++++++- 4 files changed, 54 insertions(+), 8 deletions(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index 9fcfc79683..5b30214853 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -1045,11 +1045,22 @@ def gene_counts(self): ht.gene_ids, hl.struct(total=hl.agg.count(), families=hl.agg.counter(ht.families)) )) - def lookup_variant(self, variant_id, sample_data=None): - self._parse_intervals(intervals=None, variant_ids=[variant_id], variant_keys=[variant_id]) + def lookup_variants(self, variant_ids, annotation_fields=None): + self._parse_intervals(intervals=None, variant_ids=variant_ids, variant_keys=variant_ids) ht = self._read_table('annotations.ht', drop_globals=['paths', 'versions']) ht = ht.filter(hl.is_defined(ht[XPOS])) + if not annotation_fields: + annotation_fields = { + k: v for k, v in self.annotation_fields(include_genotype_overrides=False).items() + if k not in {FAMILY_GUID_FIELD, GENOTYPES_FIELD, 'genotypeFilters'} + } + + formatted = self._format_results(ht.key_by(), annotation_fields=annotation_fields, include_genotype_overrides=False) + + return formatted.aggregate(hl.agg.take(formatted.row, len(variant_ids))) + + def lookup_variant(self, variant_id, sample_data=None): annotation_fields = self.annotation_fields(include_genotype_overrides=False) entry_annotations = {k: annotation_fields[k] for k in [FAMILY_GUID_FIELD, GENOTYPES_FIELD]} annotation_fields.update({ @@ -1058,9 +1069,7 @@ def lookup_variant(self, variant_id, sample_data=None): 'genotypeFilters': lambda ht: hl.str(''), }) - formatted = self._format_results(ht.key_by(), 
annotation_fields=annotation_fields, include_genotype_overrides=False) - - variants = formatted.aggregate(hl.agg.take(formatted.row, 1)) + variants = self.lookup_variants([variant_id], annotation_fields=annotation_fields) if not variants: raise HTTPNotFound() variant = dict(variants[0]) diff --git a/hail_search/search.py b/hail_search/search.py index 6af6d72606..8207ce7f09 100644 --- a/hail_search/search.py +++ b/hail_search/search.py @@ -27,6 +27,11 @@ def lookup_variant(request): return query.lookup_variant(request['variant_id'], sample_data=request.get('sample_data')) +def lookup_variants(request): + query = QUERY_CLASS_MAP[(request['data_type'], request['genome_version'])](sample_data=None) + return query.lookup_variants(request['variant_ids']) + + def load_globals(): for cls in QUERY_CLASS_MAP.values(): cls.load_globals() diff --git a/hail_search/test_search.py b/hail_search/test_search.py index 4c20cfc556..c0c9d25da8 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -155,6 +155,8 @@ 'numAlt': 1, 'dp': 28, 'gq': 99, 'ab': 0.5, } +NO_GENOTYPE_GCNV_VARIANT = {**GCNV_VARIANT4, 'numExon': 8, 'end': 38736268} + # Ensures no variants are filtered out by annotation/path filters for compound hets COMP_HET_ALL_PASS_FILTERS = { 'annotations': {'splice_ai': '0.0', 'structural': ['DEL', 'CPX', 'INS', 'gCNV_DEL', 'gCNV_DUP']}, @@ -694,19 +696,43 @@ async def test_variant_lookup(self): self.assertEqual(resp.status, 200) resp_json = await resp.json() self.assertDictEqual(resp_json, { - **GCNV_VARIANT4, 'numExon': 8, 'end': 38736268, 'familyGuids': [], 'genotypes': {}, 'genotypeFilters': '', + **NO_GENOTYPE_GCNV_VARIANT, 'familyGuids': [], 'genotypes': {}, 'genotypeFilters': '', }) async with self.client.request('POST', '/lookup', json={**body, 'sample_data': EXPECTED_SAMPLE_DATA['SV_WES']}) as resp: self.assertEqual(resp.status, 200) resp_json = await resp.json() self.assertDictEqual(resp_json, { - **GCNV_VARIANT4, 'numExon': 8, 'end': 38736268, 
'genotypes': { + **NO_GENOTYPE_GCNV_VARIANT, 'genotypes': { individual: {k: v for k, v in genotype.items() if k not in {'start', 'end', 'numExon', 'geneIds'}} for individual, genotype in GCNV_VARIANT4['genotypes'].items() } }) + async def test_multi_variant_lookup(self): + await self._test_multi_lookup(VARIANT_ID_SEARCH['variant_ids'], 'SNV_INDEL', [VARIANT1]) + + await self._test_multi_lookup([['7', 143270172, 'A', 'G']], 'SNV_INDEL', [GRCH37_VARIANT], genome_version='GRCh37') + + await self._test_multi_lookup([['M', 4429, 'G', 'A'], ['M', 14783, 'T', 'C']], 'MITO', [MITO_VARIANT1, MITO_VARIANT3]) + + await self._test_multi_lookup( + ['cohort_2911.chr1.final_cleanup_INS_chr1_160', 'phase2_DEL_chr14_4640'], + 'SV_WGS', [SV_VARIANT2, SV_VARIANT4], + ) + + await self._test_multi_lookup(['suffix_140608_DUP'], 'SV_WES', [NO_GENOTYPE_GCNV_VARIANT]) + + async def _test_multi_lookup(self, variant_ids, data_type, results, genome_version='GRCh38'): + body = {'genome_version': genome_version, 'data_type': data_type, 'variant_ids': variant_ids} + async with self.client.request('POST', '/multi_lookup', json=body) as resp: + self.assertEqual(resp.status, 200) + resp_json = await resp.json() + self.assertDictEqual(resp_json, {'results': [ + {k: v for k, v in variant.items() if k not in {'familyGuids', 'genotypes', 'genotypeFilters'}} + for variant in results + ]}) + async def test_frequency_filter(self): sv_callset_filter = {'sv_callset': {'af': 0.05}} await self._assert_expected_search( diff --git a/hail_search/web_app.py b/hail_search/web_app.py index 8ccfe429a9..a4ef765c04 100644 --- a/hail_search/web_app.py +++ b/hail_search/web_app.py @@ -9,7 +9,7 @@ import traceback from typing import Callable -from hail_search.search import search_hail_backend, load_globals, lookup_variant +from hail_search.search import search_hail_backend, load_globals, lookup_variant, lookup_variants logger = logging.getLogger(__name__) @@ -64,6 +64,11 @@ async def lookup(request: web.Request) -> 
web.Response: return web.json_response(result, dumps=hl_json_dumps) +async def multi_lookup(request: web.Request) -> web.Response: + result = await sync_to_async_hail_query(request, lookup_variants) + return web.json_response({'results': result}, dumps=hl_json_dumps) + + async def status(request: web.Request) -> web.Response: return web.json_response({'success': True}) @@ -84,6 +89,7 @@ async def init_web_app(): web.post('/search', search), web.post('/gene_counts', gene_counts), web.post('/lookup', lookup), + web.post('/multi_lookup', multi_lookup), ]) # The idea here is to run the hail queries off the main thread so that the # event loop stays live and the /status check is responsive. We only From 0085d75169210a5958e3f27c031b108381f71a6b Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 11:37:35 -0500 Subject: [PATCH 02/28] add multi lookup helper func --- .../commands/check_for_new_samples_from_pipeline.py | 1 + seqr/utils/search/hail_search_utils.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py index ad87d71850..f0cd703ccf 100644 --- a/seqr/management/commands/check_for_new_samples_from_pipeline.py +++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py @@ -8,6 +8,7 @@ from seqr.models import Family, Sample, Project from seqr.utils.file_utils import file_iter, does_file_exist from seqr.utils.search.add_data_utils import notify_search_data_loaded +from seqr.utils.search.hail_search_utils import hail_variant_multi_lookup from seqr.views.utils.dataset_utils import match_and_update_search_samples from seqr.views.utils.variant_utils import reset_cached_search_results, update_projects_saved_variant_json diff --git a/seqr/utils/search/hail_search_utils.py b/seqr/utils/search/hail_search_utils.py index 7eb1c53cda..b144c8e5b8 100644 --- a/seqr/utils/search/hail_search_utils.py +++ 
b/seqr/utils/search/hail_search_utils.py @@ -110,6 +110,12 @@ def hail_variant_lookup(user, variant_id, samples=None, dataset_type=Sample.DATA return variants +def hail_variant_multi_lookup(user_email, variant_ids, data_type, genome_version): + body = {'genome_version': genome_version, 'data_type': data_type, 'variant_ids': variant_ids} + response_json = _execute_search(body, user=None, user_email=user_email, path='multi_lookup') + return response_json['results'] + + def _format_search_body(samples, genome_version, num_results, search): search_body = { 'genome_version': GENOME_VERSION_LOOKUP[genome_version], From 0e9d4be8a48abe6e074206c58f0d1d62a510a751 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 12:30:57 -0500 Subject: [PATCH 03/28] update annotation for additional projects and families --- .../check_for_new_samples_from_pipeline.py | 63 ++++++++++++++++--- .../commands/reload_saved_variant_json.py | 6 +- seqr/utils/search/add_data_utils.py | 5 +- seqr/views/utils/variant_utils.py | 25 ++++---- 4 files changed, 73 insertions(+), 26 deletions(-) diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py index f0cd703ccf..921b4dc7d7 100644 --- a/seqr/management/commands/check_for_new_samples_from_pipeline.py +++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py @@ -1,21 +1,24 @@ from collections import defaultdict from django.contrib.postgres.aggregates import ArrayAgg from django.core.management.base import BaseCommand, CommandError +from django.db.models.functions import JSONObject import json import logging from reference_data.models import GENOME_VERSION_LOOKUP -from seqr.models import Family, Sample, Project +from seqr.models import Family, Sample, SavedVariant from seqr.utils.file_utils import file_iter, does_file_exist from seqr.utils.search.add_data_utils import notify_search_data_loaded from seqr.utils.search.hail_search_utils import 
hail_variant_multi_lookup from seqr.views.utils.dataset_utils import match_and_update_search_samples -from seqr.views.utils.variant_utils import reset_cached_search_results, update_projects_saved_variant_json +from seqr.views.utils.variant_utils import reset_cached_search_results, update_projects_saved_variant_json, \ + saved_variants_dataset_type_filter logger = logging.getLogger(__name__) GS_PATH_TEMPLATE = 'gs://seqr-hail-search-data/v03/{path}/runs/{version}/' DATASET_TYPE_MAP = {'GCNV': Sample.DATASET_TYPE_SV_CALLS} +USER_EMAIL = 'manage_command' class Command(BaseCommand): @@ -84,20 +87,60 @@ def handle(self, *args, **options): reset_cached_search_results(project=None) # Send loading notifications + update_sample_data_by_project = { + s['individual__family__project']: s for s in updated_samples.values('individual__family__project').annotate( + samples=ArrayAgg(JSONObject(sample_id='sample_id', individual_id='individual_id')), + family_guids=ArrayAgg('individual__family__guid', distinct=True), + ) + } + updated_project_families = [] + updated_families = set() for project, sample_ids in samples_by_project.items(): - project_updated_samples = updated_samples.filter(individual__family__project=project) + project_sample_data = update_sample_data_by_project[project.id] notify_search_data_loaded( project, dataset_type, sample_type, inactivated_sample_guids, - updated_samples=project_updated_samples, num_samples=len(sample_ids), + updated_samples=project_sample_data['samples'], num_samples=len(sample_ids), ) + project_families = project_sample_data['family_guids'] + updated_families.update(project_families) + updated_project_families.append((project.id, project.name, project_families)) # Reload saved variant JSON - updated_annotation_samples = Sample.objects.filter(is_active=True, dataset_type=dataset_type) + update_projects_saved_variant_json(updated_project_families, user_email=USER_EMAIL, dataset_type=dataset_type) + 
self._reload_shared_variant_annotations(updated_families, dataset_type, sample_type, genome_version) + + logger.info('DONE') + + def _reload_shared_variant_annotations(self, updated_families, dataset_type, sample_type, genome_version): + data_type = dataset_type + updated_annotation_samples = Sample.objects.filter( + is_active=True, dataset_type=dataset_type, + individual__family__project__genome_version=genome_version.replace('GRCh', ''), + ).exclude(individual__family__guid__in=updated_families) if dataset_type == Sample.DATASET_TYPE_SV_CALLS: updated_annotation_samples = updated_annotation_samples.filter(sample_type=sample_type) - projects = Project.objects.filter( - genome_version=genome_version.replace('GRCh', ''), family__individual__sample__in=updated_annotation_samples, - ).annotate(families=ArrayAgg('family__family_id', distinct=True)).values_list('id', 'name', 'families') - update_projects_saved_variant_json(projects, user_email='manage_command', dataset_type=dataset_type) + data_type = f'{dataset_type}_{sample_type}' - logger.info('DONE') + variant_models = SavedVariant.objects.filter( + family_id__in=updated_annotation_samples.values_list('individual__family', flat=True).distinct(), + **saved_variants_dataset_type_filter(dataset_type), + ) + if not variant_models: + logger.info('No additional saved variants to update') + return + + variants_by_id = defaultdict(list) + for v in variant_models: + variants_by_id[v.variant_id].append(v) + + logger.info(f'Reloading shared annotations for {len(variant_models)} saved variants ({len(variants_by_id)} unique)') + updated_variants = hail_variant_multi_lookup(USER_EMAIL, variants_by_id.keys(), data_type, genome_version) + logger.info(f'Fetched {len(updated_variants)} variants') + updated_variant_models = [] + for variant in updated_variants: + for variant_model in variants_by_id[variant['variantId']]: + variant_model.saved_variant_json.update(variant) + updated_variant_models.append(variant_model) + + 
SavedVariant.objects.bulk_update(updated_variant_models, ['saved_variant_json']) + logger.info(f'Updated {len(updated_variant_models)} saved variants') diff --git a/seqr/management/commands/reload_saved_variant_json.py b/seqr/management/commands/reload_saved_variant_json.py index e884079d2d..2659fd37b3 100644 --- a/seqr/management/commands/reload_saved_variant_json.py +++ b/seqr/management/commands/reload_saved_variant_json.py @@ -13,12 +13,12 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('projects', nargs="*", help='Project(s) to transfer. If not specified, defaults to all projects.') - parser.add_argument('--family-id', help='optional family to reload variants for') + parser.add_argument('--family-guid', help='optional family to reload variants for') def handle(self, *args, **options): """transfer project""" projects_to_process = options['projects'] - family_id = options['family_id'] + family_guid = options['family_guid'] if projects_to_process: projects = Project.objects.filter(Q(name__in=projects_to_process) | Q(guid__in=projects_to_process)) @@ -27,7 +27,7 @@ def handle(self, *args, **options): projects = Project.objects.all() logging.info("Processing all %s projects" % len(projects)) - family_ids = [family_id] if family_id else None + family_ids = [family_guid] if family_id else None project_list = [(*project, family_ids) for project in projects.values_list('id', 'name')] update_projects_saved_variant_json(project_list, user_email='manage_command') logger.info("Done") diff --git a/seqr/utils/search/add_data_utils.py b/seqr/utils/search/add_data_utils.py index 654f4e5c6f..91366a6c74 100644 --- a/seqr/utils/search/add_data_utils.py +++ b/seqr/utils/search/add_data_utils.py @@ -43,7 +43,8 @@ def add_new_es_search_samples(request_json, project, user, notify=False, expecte if notify: num_samples = len(sample_ids) - num_skipped - notify_search_data_loaded(project, dataset_type, sample_type, inactivated_sample_guids, 
updated_samples, num_samples) + updated_sample_data = updated_samples.values('sample_id', 'individual_id') + notify_search_data_loaded(project, dataset_type, sample_type, inactivated_sample_guids, updated_sample_data, num_samples) return inactivated_sample_guids, updated_family_guids, updated_samples @@ -52,7 +53,7 @@ def notify_search_data_loaded(project, dataset_type, sample_type, inactivated_sa is_internal = not project_has_anvil(project) or is_internal_anvil_project(project) previous_loaded_individuals = set(Sample.objects.filter(guid__in=inactivated_sample_guids).values_list('individual_id', flat=True)) - new_sample_ids = [sample.sample_id for sample in updated_samples if sample.individual_id not in previous_loaded_individuals] + new_sample_ids = [sample['sample_id'] for sample in updated_samples if sample['individual_id'] not in previous_loaded_individuals] url = f'{BASE_URL}project/{project.guid}/project_page' msg_dataset_type = '' if dataset_type == Sample.DATASET_TYPE_VARIANT_CALLS else f' {dataset_type}' diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index 02f17ff70a..ff955ae632 100644 --- a/seqr/views/utils/variant_utils.py +++ b/seqr/views/utils/variant_utils.py @@ -34,10 +34,10 @@ def update_projects_saved_variant_json(projects, user_email, **kwargs): skipped = {} error = {} logger.info(f'Reloading saved variants in {len(projects)} projects') - for project_id, project_name, family_ids in tqdm(projects, unit=' project'): + for project_id, project_name, family_guids in tqdm(projects, unit=' project'): try: updated_saved_variant_guids = update_project_saved_variant_json( - project_id, user_email=user_email, family_ids=family_ids, **kwargs) + project_id, user_email=user_email, family_guids=family_guids, **kwargs) if updated_saved_variant_guids is None: skipped[project_name] = True else: @@ -61,18 +61,13 @@ def update_projects_saved_variant_json(projects, user_email, **kwargs): logger.info(f' {k}: {v}') -def 
update_project_saved_variant_json(project_id, family_ids=None, dataset_type=None, user=None, user_email=None): +def update_project_saved_variant_json(project_id, family_guids=None, dataset_type=None, user=None, user_email=None): saved_variants = SavedVariant.objects.filter(family__project_id=project_id).select_related('family') - if family_ids: - saved_variants = saved_variants.filter(family__family_id__in=family_ids) + if family_guids: + saved_variants = saved_variants.filter(family__guid__in=family_guids) if dataset_type: - xpos_filter_key = 'xpos__gte' if dataset_type == Sample.DATASET_TYPE_MITO_CALLS else 'xpos__lt' - dataset_filter = { - 'alt__isnull': dataset_type == Sample.DATASET_TYPE_SV_CALLS, - xpos_filter_key: get_xpos('M', 1), - } - saved_variants = saved_variants.filter(**dataset_filter) + saved_variants = saved_variants.filter(**saved_variants_dataset_type_filter(dataset_type)) if not saved_variants: return None @@ -102,6 +97,14 @@ def update_project_saved_variant_json(project_id, family_ids=None, dataset_type= return updated_saved_variant_guids +def saved_variants_dataset_type_filter(dataset_type): + xpos_filter_key = 'xpos__gte' if dataset_type == Sample.DATASET_TYPE_MITO_CALLS else 'xpos__lt' + return { + 'alt__isnull': dataset_type == Sample.DATASET_TYPE_SV_CALLS, + xpos_filter_key: get_xpos('M', 1), + } + + def parse_saved_variant_json(variant_json, family): if 'xpos' not in variant_json: variant_json['xpos'] = get_xpos(variant_json['chrom'], variant_json['pos']) From 7362de2892ee3c3b2dd715c8f0b563df7a0906e2 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 12:35:42 -0500 Subject: [PATCH 04/28] update reload tests --- seqr/management/commands/reload_saved_variant_json.py | 2 +- seqr/management/tests/reload_saved_variant_json_tests.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/seqr/management/commands/reload_saved_variant_json.py b/seqr/management/commands/reload_saved_variant_json.py index 
2659fd37b3..ccb8ff82d3 100644 --- a/seqr/management/commands/reload_saved_variant_json.py +++ b/seqr/management/commands/reload_saved_variant_json.py @@ -27,7 +27,7 @@ def handle(self, *args, **options): projects = Project.objects.all() logging.info("Processing all %s projects" % len(projects)) - family_ids = [family_guid] if family_id else None + family_ids = [family_guid] if family_guid else None project_list = [(*project, family_ids) for project in projects.values_list('id', 'name')] update_projects_saved_variant_json(project_list, user_email='manage_command') logger.info("Done") diff --git a/seqr/management/tests/reload_saved_variant_json_tests.py b/seqr/management/tests/reload_saved_variant_json_tests.py index f540a9dc44..36b5b81cd2 100644 --- a/seqr/management/tests/reload_saved_variant_json_tests.py +++ b/seqr/management/tests/reload_saved_variant_json_tests.py @@ -7,7 +7,7 @@ PROJECT_NAME = '1kg project n\u00e5me with uni\u00e7\u00f8de' PROJECT_GUID = 'R0001_1kg' -FAMILY_ID = '1' +FAMILY_GUID = 'F000001_1' class ReloadSavedVariantJsonTest(TestCase): @@ -23,7 +23,7 @@ def test_with_param_command(self, mock_get_variants, mock_logger): # Test with a specific project and a family id. 
call_command('reload_saved_variant_json', PROJECT_NAME, - '--family-id={}'.format(FAMILY_ID)) + '--family-guid={}'.format(FAMILY_GUID)) family_1 = Family.objects.get(id=1) mock_get_variants.assert_called_with( @@ -70,7 +70,7 @@ def test_with_param_command(self, mock_get_variants, mock_logger): mock_get_variants.side_effect = Exception("Database error.") call_command('reload_saved_variant_json', PROJECT_GUID, - '--family-id={}'.format(FAMILY_ID)) + '--family-guid={}'.format(FAMILY_GUID)) mock_get_variants.assert_called_with([family_1], ['1-1562437-G-C', '1-46859832-G-A', '21-3343353-GAGA-G'], user=None, user_email='manage_command') From dcd7452328135ee1b6c92d9f7e8c569be489c278 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 12:50:48 -0500 Subject: [PATCH 05/28] update gcnv test --- ...eck_for_new_samples_from_pipeline_tests.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index 2f68a63fb2..ed6005962a 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -76,7 +76,7 @@ def setUp(self): self.addCleanup(patcher.stop) super().setUp() - def _test_success(self, path, metadata, dataset_type, sample_guids, reload_calls, additional_requests=0, num_projects=1): + def _test_success(self, path, metadata, dataset_type, sample_guids, reload_calls, reload_annotations_logs, additional_requests=0): self.mock_subprocess.return_value.stdout = [json.dumps(metadata).encode()] self.mock_subprocess.return_value.wait.return_value = 0 @@ -89,7 +89,8 @@ def _test_success(self, path, metadata, dataset_type, sample_guids, reload_calls self.mock_logger.info.assert_has_calls([ mock.call(f'Loading new samples from {path}: auto__2023-08-08'), - mock.call(f'Loading {len(sample_guids)} WES {dataset_type} 
samples in {num_projects} projects'), + mock.call(f'Loading {len(sample_guids)} WES {dataset_type} samples in 2 projects'), + ] + [mock.call(log) for log in reload_annotations_logs] + [ mock.call('DONE'), ]) self.mock_logger.warining.assert_not_called() @@ -186,7 +187,7 @@ def test_command(self, mock_email, mock_airtable_utils): search_body = { 'genome_version': 'GRCh38', 'num_results': 1, 'variant_ids': [['12', 48367227, 'TC', 'T']], 'variant_keys': [], } - self._test_success('GRCh38/SNV_INDEL', metadata, dataset_type='SNV_INDEL', num_projects=2, sample_guids={ + self._test_success('GRCh38/SNV_INDEL', metadata, dataset_type='SNV_INDEL', sample_guids={ EXISTING_SAMPLE_GUID, REPLACED_SAMPLE_GUID, NEW_SAMPLE_GUID_P3, NEW_SAMPLE_GUID_P4, }, additional_requests=1, reload_calls=[ {**search_body, 'sample_data': {'SNV_INDEL': [ @@ -196,7 +197,7 @@ def test_command(self, mock_email, mock_airtable_utils): {**search_body, 'sample_data': {'SNV_INDEL': [ {'individual_guid': 'I000018_na21234', 'family_guid': 'F000014_14', 'project_guid': 'R0004_non_analyst_project', 'affected': 'A', 'sample_id': 'NA21234'}, ]}}, - ]) + ], reload_annotations_logs=[]) # TODO old_data_sample_guid = 'S000143_na20885' self.assertFalse(Sample.objects.get(guid=old_data_sample_guid).is_active) @@ -307,16 +308,20 @@ def test_gcnv_command(self): metadata = { 'callsets': ['1kg.vcf.gz'], 'sample_type': 'WES', - 'family_samples': {'F000012_12': ['NA20889']}, + 'family_samples': {'F000004_4': ['NA20872'], 'F000012_12': ['NA20889']}, } - self._test_success('GRCh37/GCNV', metadata, dataset_type='SV', sample_guids={f'S{GUID_ID}_NA20889'}, reload_calls=[{ + self._test_success('GRCh37/GCNV', metadata, dataset_type='SV', sample_guids={f'S{GUID_ID}_NA20872', f'S{GUID_ID}_NA20889'}, reload_calls=[{ 'genome_version': 'GRCh37', 'num_results': 1, 'variant_ids': [], 'variant_keys': ['prefix_19107_DEL'], 'sample_data': {'SV_WES': [{'individual_guid': 'I000017_na20889', 'family_guid': 'F000012_12', 'project_guid': 
'R0003_test', 'affected': 'A', 'sample_id': 'NA20889'}]}, - }]) + }], reload_annotations_logs=['No additional saved variants to update']) - self.mock_send_slack.assert_called_once_with('seqr-data-loading', - f'1 new WES SV samples are loaded in {SEQR_URL}project/{PROJECT_GUID}/project_page\n```NA20889```', - ) + self.mock_send_slack.assert_has_calls([ + mock.call( + 'seqr-data-loading', f'1 new WES SV samples are loaded in {SEQR_URL}project/R0001_1kg/project_page\n```NA20872```', + ), mock.call( + 'seqr-data-loading', f'1 new WES SV samples are loaded in {SEQR_URL}project/{PROJECT_GUID}/project_page\n```NA20889```', + ), + ]) self.mock_utils_logger.error.assert_called_with('Error in project Test Reprocessed Project: Bad Request') self.mock_utils_logger.info.assert_has_calls([ From 485507f3fec7f55369707d7d26f3250aebe7704c Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 12:52:00 -0500 Subject: [PATCH 06/28] update gcnv test --- .../tests/check_for_new_samples_from_pipeline_tests.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index ed6005962a..deaf39d5da 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -171,9 +171,7 @@ def test_command(self, mock_email, mock_airtable_utils): str(ce.exception), 'Data has genome version GRCh38 but the following projects have conflicting versions: R0003_test (GRCh37)') - project = Project.objects.get(guid=PROJECT_GUID) - project.genome_version = 38 - project.save() + Project.objects.filter(id__in=[3]).update(genome_version=38) with self.assertRaises(ValueError) as ce: call_command('check_for_new_samples_from_pipeline', 'GRCh38/SNV_INDEL', 'auto__2023-08-08') From 7742ab46b7c4a9259b7c26aa018a4e9db7cbe796 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 
2024 13:11:22 -0500 Subject: [PATCH 07/28] test multi lookup --- .../check_for_new_samples_from_pipeline.py | 2 +- ...eck_for_new_samples_from_pipeline_tests.py | 34 +++++++++++++++---- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py index 921b4dc7d7..9efc1bf04a 100644 --- a/seqr/management/commands/check_for_new_samples_from_pipeline.py +++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py @@ -134,7 +134,7 @@ def _reload_shared_variant_annotations(self, updated_families, dataset_type, sam variants_by_id[v.variant_id].append(v) logger.info(f'Reloading shared annotations for {len(variant_models)} saved variants ({len(variants_by_id)} unique)') - updated_variants = hail_variant_multi_lookup(USER_EMAIL, variants_by_id.keys(), data_type, genome_version) + updated_variants = hail_variant_multi_lookup(USER_EMAIL, sorted(variants_by_id.keys()), data_type, genome_version) logger.info(f'Fetched {len(updated_variants)} variants') updated_variant_models = [] for variant in updated_variants: diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index deaf39d5da..04b48f4d53 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -76,7 +76,7 @@ def setUp(self): self.addCleanup(patcher.stop) super().setUp() - def _test_success(self, path, metadata, dataset_type, sample_guids, reload_calls, reload_annotations_logs, additional_requests=0): + def _test_success(self, path, metadata, dataset_type, sample_guids, reload_calls, reload_annotations_logs, has_additional_requests=False): self.mock_subprocess.return_value.stdout = [json.dumps(metadata).encode()] self.mock_subprocess.return_value.wait.return_value = 0 @@ -102,9 +102,9 @@ def 
_test_success(self, path, metadata, dataset_type, sample_guids, reload_calls ]) # Test reload saved variants - self.assertEqual(len(responses.calls), len(reload_calls) + additional_requests) + self.assertEqual(len(responses.calls), len(reload_calls) + (2 if has_additional_requests else 0)) for i, call in enumerate(reload_calls): - resp = responses.calls[i+additional_requests] + resp = responses.calls[i+(1 if has_additional_requests else 0)] self.assertEqual(resp.request.url, f'{MOCK_HAIL_HOST}:5000/search') self.assertEqual(resp.request.headers.get('From'), 'manage_command') self.assertDictEqual(json.loads(resp.request.body), call) @@ -134,6 +134,9 @@ def test_command(self, mock_email, mock_airtable_utils): 'results': [{'variantId': '12-48367227-TC-T', 'familyGuids': ['F000014_14'], 'updated_field': 'updated_value'}], 'total': 1, }) + responses.add(responses.POST, f'{MOCK_HAIL_HOST}:5000/multi_lookup', status=200, json={ + 'results': [{'variantId': '21-3343353-GAGA-G', 'updated_new_field': 'updated_value', 'rsid': 'rs123'}], + }) # Test errors with self.assertRaises(CommandError) as ce: @@ -171,7 +174,7 @@ def test_command(self, mock_email, mock_airtable_utils): str(ce.exception), 'Data has genome version GRCh38 but the following projects have conflicting versions: R0003_test (GRCh37)') - Project.objects.filter(id__in=[3]).update(genome_version=38) + Project.objects.filter(id__in=[1, 3]).update(genome_version=38) with self.assertRaises(ValueError) as ce: call_command('check_for_new_samples_from_pipeline', 'GRCh38/SNV_INDEL', 'auto__2023-08-08') @@ -187,7 +190,7 @@ def test_command(self, mock_email, mock_airtable_utils): } self._test_success('GRCh38/SNV_INDEL', metadata, dataset_type='SNV_INDEL', sample_guids={ EXISTING_SAMPLE_GUID, REPLACED_SAMPLE_GUID, NEW_SAMPLE_GUID_P3, NEW_SAMPLE_GUID_P4, - }, additional_requests=1, reload_calls=[ + }, has_additional_requests=True, reload_calls=[ {**search_body, 'sample_data': {'SNV_INDEL': [ {'individual_guid': 
'I000017_na20889', 'family_guid': 'F000012_12', 'project_guid': 'R0003_test', 'affected': 'A', 'sample_id': 'NA20889'}, {'individual_guid': 'I000016_na20888', 'family_guid': 'F000012_12', 'project_guid': 'R0003_test', 'affected': 'A', 'sample_id': 'NA20888'}, @@ -195,7 +198,9 @@ def test_command(self, mock_email, mock_airtable_utils): {**search_body, 'sample_data': {'SNV_INDEL': [ {'individual_guid': 'I000018_na21234', 'family_guid': 'F000014_14', 'project_guid': 'R0004_non_analyst_project', 'affected': 'A', 'sample_id': 'NA21234'}, ]}}, - ], reload_annotations_logs=[]) # TODO + ], reload_annotations_logs=[ + 'Reloading shared annotations for 4 saved variants (4 unique)', 'Fetched 1 variants', 'Updated 1 saved variants', + ]) old_data_sample_guid = 'S000143_na20885' self.assertFalse(Sample.objects.get(guid=old_data_sample_guid).is_active) @@ -237,10 +242,27 @@ def test_command(self, mock_email, mock_airtable_utils): self.assertEqual(Family.objects.get(guid='F000014_14').analysis_status, 'Rncc') # Test SavedVariant model updated + multi_lookup_request = responses.calls[3].request + self.assertEqual(multi_lookup_request.url, f'{MOCK_HAIL_HOST}:5000/multi_lookup') + self.assertEqual(multi_lookup_request.headers.get('From'), 'manage_command') + # TODO reuse family specific variant where needed (12-48367227-TC-T) + self.assertDictEqual(json.loads(multi_lookup_request.body), { + 'genome_version': 'GRCh38', + 'data_type': 'SNV_INDEL', + 'variant_ids': ['1-1562437-G-C', '1-46859832-G-A', '12-48367227-TC-T', '21-3343353-GAGA-G'], + }) + updated_variants = SavedVariant.objects.filter(saved_variant_json__updated_field='updated_value') self.assertEqual(len(updated_variants), 1) self.assertEqual(updated_variants.first().guid, 'SV0000006_1248367227_r0004_non') + annotation_updated_json = SavedVariant.objects.get(guid='SV0000001_2103343353_r0390_100').saved_variant_json + self.assertEqual(len(annotation_updated_json), 20) + 
self.assertEqual(annotation_updated_json['updated_new_field'], 'updated_value') + self.assertEqual(annotation_updated_json['rsid'], 'rs123') + self.assertEqual(annotation_updated_json['mainTranscriptId'], 'ENST00000258436') + self.assertEqual(len(annotation_updated_json['genotypes']), 3) + self.mock_utils_logger.error.assert_not_called() self.mock_utils_logger.info.assert_has_calls([ mock.call('Updated 0 variants for project Test Reprocessed Project'), From 62e06f6ffae1344c6f70ea734e7b8a252e8a7def Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 15:40:01 -0500 Subject: [PATCH 08/28] do not refetch updated variants --- .../check_for_new_samples_from_pipeline.py | 26 ++++++++++++++----- ...eck_for_new_samples_from_pipeline_tests.py | 18 +++++++++---- seqr/views/utils/variant_utils.py | 17 +++++++----- 3 files changed, 42 insertions(+), 19 deletions(-) diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py index 9efc1bf04a..0ee5626438 100644 --- a/seqr/management/commands/check_for_new_samples_from_pipeline.py +++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py @@ -106,12 +106,15 @@ def handle(self, *args, **options): updated_project_families.append((project.id, project.name, project_families)) # Reload saved variant JSON - update_projects_saved_variant_json(updated_project_families, user_email=USER_EMAIL, dataset_type=dataset_type) - self._reload_shared_variant_annotations(updated_families, dataset_type, sample_type, genome_version) + updated_variants_by_id = update_projects_saved_variant_json( + updated_project_families, user_email=USER_EMAIL, dataset_type=dataset_type) + self._reload_shared_variant_annotations( + updated_variants_by_id, updated_families, dataset_type, sample_type, genome_version) logger.info('DONE') - def _reload_shared_variant_annotations(self, updated_families, dataset_type, sample_type, genome_version): + @staticmethod + def 
_reload_shared_variant_annotations(updated_variants_by_id, updated_families, dataset_type, sample_type, genome_version): data_type = dataset_type updated_annotation_samples = Sample.objects.filter( is_active=True, dataset_type=dataset_type, @@ -134,11 +137,20 @@ def _reload_shared_variant_annotations(self, updated_families, dataset_type, sam variants_by_id[v.variant_id].append(v) logger.info(f'Reloading shared annotations for {len(variant_models)} saved variants ({len(variants_by_id)} unique)') - updated_variants = hail_variant_multi_lookup(USER_EMAIL, sorted(variants_by_id.keys()), data_type, genome_version) - logger.info(f'Fetched {len(updated_variants)} variants') + + updated_variants_by_id = { + variant_id: {k: v for k, v in variant.items() if k not in {'familyGuids', 'genotypes', 'genotypeFilters'}} + for variant_id, variant in updated_variants_by_id.items() + } + fetch_variant_ids = set(variants_by_id.keys()) - set(updated_variants_by_id.keys()) + if fetch_variant_ids: + updated_variants = hail_variant_multi_lookup(USER_EMAIL, sorted(fetch_variant_ids), data_type, genome_version) + logger.info(f'Fetched {len(updated_variants)} additional variants') + updated_variants_by_id.update({variant['variantId']: variant for variant in updated_variants}) + updated_variant_models = [] - for variant in updated_variants: - for variant_model in variants_by_id[variant['variantId']]: + for variant_id, variant in updated_variants_by_id.items(): + for variant_model in variants_by_id[variant_id]: variant_model.saved_variant_json.update(variant) updated_variant_models.append(variant_model) diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index 04b48f4d53..89c4679502 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -199,7 +199,7 @@ def test_command(self, mock_email, 
mock_airtable_utils): {'individual_guid': 'I000018_na21234', 'family_guid': 'F000014_14', 'project_guid': 'R0004_non_analyst_project', 'affected': 'A', 'sample_id': 'NA21234'}, ]}}, ], reload_annotations_logs=[ - 'Reloading shared annotations for 4 saved variants (4 unique)', 'Fetched 1 variants', 'Updated 1 saved variants', + 'Reloading shared annotations for 4 saved variants (4 unique)', 'Fetched 1 additional variants', 'Updated 2 saved variants', ]) old_data_sample_guid = 'S000143_na20885' @@ -245,16 +245,24 @@ def test_command(self, mock_email, mock_airtable_utils): multi_lookup_request = responses.calls[3].request self.assertEqual(multi_lookup_request.url, f'{MOCK_HAIL_HOST}:5000/multi_lookup') self.assertEqual(multi_lookup_request.headers.get('From'), 'manage_command') - # TODO reuse family specific variant where needed (12-48367227-TC-T) self.assertDictEqual(json.loads(multi_lookup_request.body), { 'genome_version': 'GRCh38', 'data_type': 'SNV_INDEL', - 'variant_ids': ['1-1562437-G-C', '1-46859832-G-A', '12-48367227-TC-T', '21-3343353-GAGA-G'], + 'variant_ids': ['1-1562437-G-C', '1-46859832-G-A', '21-3343353-GAGA-G'], }) updated_variants = SavedVariant.objects.filter(saved_variant_json__updated_field='updated_value') - self.assertEqual(len(updated_variants), 1) - self.assertEqual(updated_variants.first().guid, 'SV0000006_1248367227_r0004_non') + self.assertEqual(len(updated_variants), 2) + self.assertSetEqual( + {v.guid for v in updated_variants}, + {'SV0000006_1248367227_r0004_non', 'SV0000002_1248367227_r0390_100'} + ) + reloaded_variant = next(v for v in updated_variants if v.guid == 'SV0000006_1248367227_r0004_non') + annotation_updated_variant = next(v for v in updated_variants if v.guid == 'SV0000002_1248367227_r0390_100') + self.assertEqual(len(reloaded_variant.saved_variant_json), 3) + self.assertListEqual(reloaded_variant.saved_variant_json['familyGuids'], ['F000014_14']) + self.assertEqual(len(annotation_updated_variant.saved_variant_json), 18) + 
self.assertListEqual(annotation_updated_variant.saved_variant_json['familyGuids'], ['F000001_1']) annotation_updated_json = SavedVariant.objects.get(guid='SV0000001_2103343353_r0390_100').saved_variant_json self.assertEqual(len(annotation_updated_json), 20) diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index ff955ae632..3daba835ca 100644 --- a/seqr/views/utils/variant_utils.py +++ b/seqr/views/utils/variant_utils.py @@ -33,16 +33,18 @@ def update_projects_saved_variant_json(projects, user_email, **kwargs): success = {} skipped = {} error = {} + updated_variants_by_id = {} logger.info(f'Reloading saved variants in {len(projects)} projects') for project_id, project_name, family_guids in tqdm(projects, unit=' project'): try: - updated_saved_variant_guids = update_project_saved_variant_json( + updated_saved_variants = update_project_saved_variant_json( project_id, user_email=user_email, family_guids=family_guids, **kwargs) - if updated_saved_variant_guids is None: + if updated_saved_variants is None: skipped[project_name] = True else: - success[project_name] = len(updated_saved_variant_guids) - logger.info(f'Updated {len(updated_saved_variant_guids)} variants for project {project_name}') + success[project_name] = len(updated_saved_variants) + logger.info(f'Updated {len(updated_saved_variants)} variants for project {project_name}') + updated_variants_by_id.update({v.variant_id: v.saved_variant_json for v in updated_saved_variants.values()}) except Exception as e: traceback_message = traceback.format_exc() logger.error(traceback_message) @@ -59,6 +61,7 @@ def update_projects_saved_variant_json(projects, user_email, **kwargs): logger.info(f'{len(error)} failed projects') for k, v in error.items(): logger.info(f' {k}: {v}') + return updated_variants_by_id def update_project_saved_variant_json(project_id, family_guids=None, dataset_type=None, user=None, user_email=None): @@ -86,15 +89,15 @@ def 
update_project_saved_variant_json(project_id, family_guids=None, dataset_typ for sub_var_ids in [variant_ids[i:i+MAX_VARIANTS_FETCH] for i in range(0, len(variant_ids), MAX_VARIANTS_FETCH)]: variants_json += get_variants_for_variant_ids(families, sub_var_ids, user=user, user_email=user_email) - updated_saved_variant_guids = [] + updated_saved_variants = {} for var in variants_json: for family_guid in var['familyGuids']: saved_variant = saved_variants_map.get((var['variantId'], family_guid)) if saved_variant: update_model_from_json(saved_variant, {'saved_variant_json': var}, user) - updated_saved_variant_guids.append(saved_variant.guid) + updated_saved_variants[saved_variant.guid] = saved_variant - return updated_saved_variant_guids + return updated_saved_variants def saved_variants_dataset_type_filter(dataset_type): From 7ae6c4bd0d196561b5d91d61c858c88f025411b9 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 16:53:10 -0500 Subject: [PATCH 09/28] Revert "Revert "Multi project gene search redesign"" This reverts commit 226b348f2f04b89ad3e9be6c6edaef50c24e9dbf. 
--- hail_search/queries/base.py | 12 ++++++++++-- hail_search/queries/snv_indel.py | 4 ++-- hail_search/queries/sv.py | 11 ----------- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index 5dc9f6839d..0b395db517 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -281,7 +281,6 @@ def _parse_sample_data(self, sample_data): def _load_filtered_project_hts(self, project_samples, skip_all_missing=False, **kwargs): filtered_project_hts = [] exception_messages = set() - num_projects = len(project_samples) for i, (project_guid, project_sample_data) in enumerate(project_samples.items()): project_ht = self._read_table( f'projects/{project_guid}.ht', @@ -292,7 +291,7 @@ def _load_filtered_project_hts(self, project_samples, skip_all_missing=False, ** continue try: filtered_project_hts.append( - (*self._filter_entries_table(project_ht, project_sample_data, num_projects=num_projects, **kwargs), len(project_sample_data)) + (*self._filter_entries_table(project_ht, project_sample_data, **kwargs), len(project_sample_data)) ) except HTTPBadRequest as e: exception_messages.add(e.reason) @@ -305,6 +304,15 @@ def _load_filtered_project_hts(self, project_samples, skip_all_missing=False, ** return filtered_project_hts def import_filtered_table(self, project_samples, num_families, intervals=None, **kwargs): + if len(project_samples) > 1 and (kwargs.get('parsed_intervals') or kwargs.get('padded_interval')): + # For multi-project interval search, faster to first read and filter the annotation table and then add entries + ht = self._read_table('annotations.ht') + ht = self._filter_annotated_table( + ht, parsed_intervals=kwargs.get('parsed_intervals'), padded_interval=kwargs.get('padded_interval'), + ) + self._load_table_kwargs['variant_ht'] = ht.select() + kwargs.update({'parsed_intervals': None, 'padded_interval': None}) + if num_families == 1: family_sample_data = 
list(project_samples.values())[0] family_guid = list(family_sample_data.keys())[0] diff --git a/hail_search/queries/snv_indel.py b/hail_search/queries/snv_indel.py index c41cdd4c99..c72812de80 100644 --- a/hail_search/queries/snv_indel.py +++ b/hail_search/queries/snv_indel.py @@ -81,9 +81,9 @@ class SnvIndelHailTableQuery(MitoHailTableQuery): ('is_gt_10_percent', 0.1), ]) - def _prefilter_entries_table(self, ht, *args, num_projects=1, **kwargs): + def _prefilter_entries_table(self, ht, *args, **kwargs): ht = super()._prefilter_entries_table(ht, *args, **kwargs) - if num_projects > 1 or not self._load_table_kwargs.get('_filter_intervals'): + if 'variant_ht' not in self._load_table_kwargs and not self._load_table_kwargs.get('_filter_intervals'): af_ht = self._get_loaded_filter_ht( GNOMAD_GENOMES_FIELD, 'high_af_variants.ht', self._get_gnomad_af_prefilter, **kwargs) if af_ht: diff --git a/hail_search/queries/sv.py b/hail_search/queries/sv.py index 0d50a7eb16..db39798174 100644 --- a/hail_search/queries/sv.py +++ b/hail_search/queries/sv.py @@ -56,17 +56,6 @@ class SvHailTableQuery(BaseHailTableQuery): def _get_sample_type(cls, *args): return cls.DATA_TYPE.split('_')[-1] - def import_filtered_table(self, project_samples, *args, parsed_intervals=None, padded_interval=None, **kwargs): - if len(project_samples) > 1 and (parsed_intervals or padded_interval): - # For multi-project interval search, faster to first read and filter the annotation table and then add entries - ht = self._read_table('annotations.ht') - ht = self._filter_annotated_table(ht, parsed_intervals=parsed_intervals, padded_interval=padded_interval) - self._load_table_kwargs['variant_ht'] = ht.select() - parsed_intervals = None - padded_interval = None - - return super().import_filtered_table(project_samples, *args, parsed_intervals=parsed_intervals, padded_interval=padded_interval, **kwargs) - def _filter_annotated_table(self, ht, *args, parsed_intervals=None, exclude_intervals=False, 
padded_interval=None, **kwargs): if parsed_intervals: interval_filter = hl.array(parsed_intervals).any(lambda interval: hl.if_else( From b79225550ad42b18a6b6e9ba98358591be6118ef Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 17:06:20 -0500 Subject: [PATCH 10/28] apply alll annotation filtering before join --- hail_search/queries/base.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index 0b395db517..6329e26a96 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -304,14 +304,12 @@ def _load_filtered_project_hts(self, project_samples, skip_all_missing=False, ** return filtered_project_hts def import_filtered_table(self, project_samples, num_families, intervals=None, **kwargs): - if len(project_samples) > 1 and (kwargs.get('parsed_intervals') or kwargs.get('padded_interval')): + use_annotations_ht_first = len(project_samples) > 1 and (kwargs.get('parsed_intervals') or kwargs.get('padded_interval')) + if use_annotations_ht_first: # For multi-project interval search, faster to first read and filter the annotation table and then add entries ht = self._read_table('annotations.ht') - ht = self._filter_annotated_table( - ht, parsed_intervals=kwargs.get('parsed_intervals'), padded_interval=kwargs.get('padded_interval'), - ) + ht = self._filter_annotated_table(ht, **kwargs, is_comp_het=self._has_comp_het_search) self._load_table_kwargs['variant_ht'] = ht.select() - kwargs.update({'parsed_intervals': None, 'padded_interval': None}) if num_families == 1: family_sample_data = list(project_samples.values())[0] @@ -346,13 +344,18 @@ def import_filtered_table(self, project_samples, num_families, intervals=None, * logger.info(f'Found {num_projects_added} {self.DATA_TYPE} projects with matched entries') if comp_het_families_ht is not None: - comp_het_ht = self._query_table_annotations(comp_het_families_ht, 
self._get_table_path('annotations.ht')) - self._comp_het_ht = self._filter_annotated_table(comp_het_ht, is_comp_het=True, **kwargs) + self._comp_het_ht = self._query_table_annotations(comp_het_families_ht, self._get_table_path('annotations.ht')) + if not use_annotations_ht_first: + self._comp_het_ht = self._filter_annotated_table(self._comp_het_ht, is_comp_het=True, **kwargs) self._comp_het_ht = self._filter_compound_hets() if families_ht is not None: - ht = self._query_table_annotations(families_ht, self._get_table_path('annotations.ht')) - self._ht = self._filter_annotated_table(ht, **kwargs) + self._ht = self._query_table_annotations(families_ht, self._get_table_path('annotations.ht')) + if not use_annotations_ht_first: + self._ht = self._filter_annotated_table(self._ht, **kwargs) + else: + # TODO handle self._has_comp_het_search filter annotations + pass def _add_project_ht(self, families_ht, project_ht, default, default_1=None): if default_1 is None: From 8578aa6de83a37857f4a0a7e048c86bac4461ddd Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 17:12:17 -0500 Subject: [PATCH 11/28] filter annotations --- hail_search/queries/base.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index 6329e26a96..0dfc659a80 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -353,9 +353,10 @@ def import_filtered_table(self, project_samples, num_families, intervals=None, * self._ht = self._query_table_annotations(families_ht, self._get_table_path('annotations.ht')) if not use_annotations_ht_first: self._ht = self._filter_annotated_table(self._ht, **kwargs) - else: - # TODO handle self._has_comp_het_search filter annotations - pass + elif self._has_comp_het_search: + primary_annotation_filters = self._get_annotation_filters(self._ht) + if primary_annotation_filters: + self._ht = self._ht.filter(hl.any(primary_annotation_filters)) def _add_project_ht(self, 
families_ht, project_ht, default, default_1=None): if default_1 is None: From f281e255d4f405a27484f3479bf2f918aa1843e3 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 17:20:53 -0500 Subject: [PATCH 12/28] fix recessive filtering --- hail_search/queries/base.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index 0dfc659a80..d1badc1c60 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -354,9 +354,7 @@ def import_filtered_table(self, project_samples, num_families, intervals=None, * if not use_annotations_ht_first: self._ht = self._filter_annotated_table(self._ht, **kwargs) elif self._has_comp_het_search: - primary_annotation_filters = self._get_annotation_filters(self._ht) - if primary_annotation_filters: - self._ht = self._ht.filter(hl.any(primary_annotation_filters)) + self._ht = self._filter_by_annotations(self._ht, **(kwargs.get(parsed_annotations) or {})) def _add_project_ht(self, families_ht, project_ht, default, default_1=None): if default_1 is None: @@ -581,7 +579,7 @@ def _filter_annotated_table(self, ht, gene_ids=None, rs_ids=None, frequencies=No ht = self._filter_by_in_silico(ht, in_silico) - return self._filter_by_annotations(ht, is_comp_het, **(parsed_annotations or {})) + return self._filter_by_annotations(ht, is_comp_het=is_comp_het, **(parsed_annotations or {})) def _filter_by_gene_ids(self, ht, gene_ids): gene_ids = hl.set(gene_ids) @@ -742,7 +740,7 @@ def _parse_annotations(self, annotations, annotations_secondary, **kwargs): }) return parsed_annotations - def _filter_by_annotations(self, ht, is_comp_het, consequence_ids=None, annotation_overrides=None, + def _filter_by_annotations(self, ht, is_comp_het=False, consequence_ids=None, annotation_overrides=None, secondary_consequence_ids=None, secondary_annotation_overrides=None, **kwargs): annotation_exprs = {} From a1a93ad64c273179313fadd4388fafc80a1511f3 Mon Sep 17 00:00:00 
2001 From: Hana Snow Date: Wed, 6 Mar 2024 19:33:44 -0500 Subject: [PATCH 13/28] fix typo --- hail_search/queries/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index d1badc1c60..32b4c657ee 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -354,7 +354,7 @@ def import_filtered_table(self, project_samples, num_families, intervals=None, * if not use_annotations_ht_first: self._ht = self._filter_annotated_table(self._ht, **kwargs) elif self._has_comp_het_search: - self._ht = self._filter_by_annotations(self._ht, **(kwargs.get(parsed_annotations) or {})) + self._ht = self._filter_by_annotations(self._ht, **(kwargs.get('parsed_annotations') or {})) def _add_project_ht(self, families_ht, project_ht, default, default_1=None): if default_1 is None: From c5204da3378463392a8e466be4e172e6b93ac8b0 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 7 Mar 2024 14:14:02 -0500 Subject: [PATCH 14/28] add probably solved --- .../0060_alter_family_analysis_status.py | 18 ++++++++++++++++++ seqr/models.py | 2 ++ 2 files changed, 20 insertions(+) create mode 100644 seqr/migrations/0060_alter_family_analysis_status.py diff --git a/seqr/migrations/0060_alter_family_analysis_status.py b/seqr/migrations/0060_alter_family_analysis_status.py new file mode 100644 index 0000000000..1e8a232e5f --- /dev/null +++ b/seqr/migrations/0060_alter_family_analysis_status.py @@ -0,0 +1,18 @@ +# Generated by Django 3.2.23 on 2024-03-07 19:13 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('seqr', '0059_project_subscribers'), + ] + + operations = [ + migrations.AlterField( + model_name='family', + name='analysis_status', + field=models.CharField(choices=[('S', 'S'), ('S_kgfp', 'S'), ('S_kgdp', 'S'), ('S_ng', 'S'), ('ES', 'E'), ('Sc_kgfp', 'S'), ('Sc_kgdp', 'S'), ('Sc_ng', 'S'), ('Rcpc', 'R'), ('Rncc', 'R'), ('C', 'C'), ('PB', 'P'), 
('P', 'P'), ('I', 'A'), ('Q', 'W'), ('N', 'N')], default='Q', max_length=10), + ), + ] diff --git a/seqr/models.py b/seqr/models.py index d31a0a9dfa..e6d5e5bd74 100644 --- a/seqr/models.py +++ b/seqr/models.py @@ -278,6 +278,7 @@ def _compute_guid(self): class Family(ModelWithGUID): ANALYSIS_STATUS_ANALYSIS_IN_PROGRESS='I' ANALYSIS_STATUS_PARTIAL_SOLVE = 'P' + ANALYSIS_STATUS_PROBABLE_SOLVE = 'PB' ANALYSIS_STATUS_WAITING_FOR_DATA='Q' SOLVED_ANALYSIS_STATUS_CHOICES = ( ('S', 'Solved'), @@ -297,6 +298,7 @@ class Family(ModelWithGUID): ('Rcpc', 'Reviewed, currently pursuing candidates'), ('Rncc', 'Reviewed, no clear candidate'), ('C', 'Closed, no longer under analysis'), + (ANALYSIS_STATUS_PROBABLE_SOLVE, 'Probably Solved'), (ANALYSIS_STATUS_PARTIAL_SOLVE, 'Partial Solve - Analysis in Progress'), (ANALYSIS_STATUS_ANALYSIS_IN_PROGRESS, 'Analysis in Progress'), (ANALYSIS_STATUS_WAITING_FOR_DATA, 'Waiting for data'), From aab7868f2892b4f649150ba96f48a35a460afa40 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 7 Mar 2024 14:17:05 -0500 Subject: [PATCH 15/28] add probably solved to ui --- CHANGELOG.md | 1 + ui/shared/utils/constants.js | 2 ++ 2 files changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ceeedaf98..80818ba807 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # _seqr_ Changes ## dev +* Add "Probably Solved" analysis status (REQUIRES DB MIGRATION) ## 3/1/24 * Add subscribable project notifications (REQUIRES DB MIGRATION) diff --git a/ui/shared/utils/constants.js b/ui/shared/utils/constants.js index 718c722ad1..c8d0671dc7 100644 --- a/ui/shared/utils/constants.js +++ b/ui/shared/utils/constants.js @@ -145,6 +145,7 @@ const FAMILY_STATUS_SOLVED_KNOWN_GENE_KNOWN_PHENOTYPE = 'S_kgfp' const FAMILY_STATUS_SOLVED_KNOWN_GENE_DIFFERENT_PHENOTYPE = 'S_kgdp' const FAMILY_STATUS_SOLVED_NOVEL_GENE = 'S_ng' const FAMILY_STATUS_EXTERNAL_SOLVE = 'ES' +const FAMILY_STATUS_PROBABLE_SOLVE = 'PB' const 
FAMILY_STATUS_STRONG_CANDIDATE_KNOWN_GENE_KNOWN_PHENOTYPE = 'Sc_kgfp' const FAMILY_STATUS_STRONG_CANDIDATE_KNOWN_GENE_DIFFERENT_PHENOTYPE = 'Sc_kgdp' const FAMILY_STATUS_STRONG_CANDIDATE_NOVEL_GENE = 'Sc_ng' @@ -164,6 +165,7 @@ export const SELECTABLE_FAMILY_ANALYSIS_STATUS_OPTIONS = [ { value: FAMILY_STATUS_SOLVED_KNOWN_GENE_DIFFERENT_PHENOTYPE, color: '#4CAF50', name: 'Solved - gene linked to different phenotype' }, { value: FAMILY_STATUS_SOLVED_NOVEL_GENE, color: '#4CAF50', name: 'Solved - novel gene' }, { value: FAMILY_STATUS_EXTERNAL_SOLVE, color: '#146917', name: 'External Solve' }, + { value: FAMILY_STATUS_PROBABLE_SOLVE, color: '#ACD657', name: 'Probably Solved' }, { value: FAMILY_STATUS_STRONG_CANDIDATE_KNOWN_GENE_KNOWN_PHENOTYPE, color: '#CDDC39', name: 'Strong candidate - known gene for phenotype' }, { value: FAMILY_STATUS_STRONG_CANDIDATE_KNOWN_GENE_DIFFERENT_PHENOTYPE, color: '#CDDC39', name: 'Strong candidate - gene linked to different phenotype' }, { value: FAMILY_STATUS_STRONG_CANDIDATE_NOVEL_GENE, color: '#CDDC39', name: 'Strong candidate - novel gene' }, From 98d4c01e90c22bbe0e1558853e94f336f5338c4d Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 7 Mar 2024 14:44:37 -0500 Subject: [PATCH 16/28] update solve_status reporting --- seqr/views/apis/report_api_tests.py | 18 +++++++++--------- seqr/views/apis/summary_data_api_tests.py | 4 ++-- seqr/views/utils/anvil_metadata_utils.py | 10 ++++++---- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index b6ce200b4b..647ff3d730 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -250,7 +250,7 @@ {'column': 'affected_status', 'required': True, 'data_type': 'enumeration', 'enumerations': ['Affected', 'Unaffected', 'Unknown']}, {'column': 'phenotype_description'}, {'column': 'age_at_enrollment'}, - {'column': 'solve_status', 'required': True, 'data_type': 'enumeration', 
'enumerations': ['Yes', 'Likely', 'No', 'Partial']}, + {'column': 'solve_status', 'required': True, 'data_type': 'enumeration', 'enumerations': ['Solved', 'Partially solved', 'Probably solved', 'Unsolved', 'Unaffected']}, {'column': 'missing_variant_case', 'required': True, 'data_type': 'enumeration', 'enumerations': ['Yes', 'No']}, ], }, @@ -505,7 +505,7 @@ 'phenotype_description': None, 'pmid_id': None, 'seqr_chosen_consequence': None, - 'solve_status': 'No', + 'solve_status': 'Unsolved', 'svName': None, 'svType': None, 'sv_name': None, @@ -589,7 +589,7 @@ def test_anvil_export(self, mock_google_authenticated, mock_zip): 'dbgap_subject_id_1', 'No', '1', 'NA19678', 'NA19679', '-', 'Self', 'Male', 'Middle Eastern or North African', '-', '-', '-', 'OMIM:615120', 'Myasthenic syndrome, congenital, 8, with pre- and postsynaptic defects', 'Affected', 'Adult onset', '-', 'HP:0001631|HP:0002011|HP:0001636', 'HP:0011675|HP:0001674|HP:0001508', - 'myopathy', 'No'], subject_file) + 'myopathy', 'Unsolved'], subject_file) self.assertEqual(sample_file[0], [ 'entity:sample_id', '01-subject_id', '02-sample_id', '03-dbgap_sample_id', '04-sequencing_center', @@ -809,23 +809,23 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False): self.assertListEqual([ 'Broad_NA19675_1', 'Broad_1kg project nme with unide', 'BROAD', 'HMB', 'Yes', 'IKBKAP|CCDC102B|CMA - normal', '34415322', 'Broad_1', 'Broad_NA19678', 'Broad_NA19679', '', 'Self', '', 'Male', '', - 'Middle Eastern or North African', '', '', '21', 'Affected', 'myopathy', '18', 'No', 'No', + 'Middle Eastern or North African', '', '', '21', 'Affected', 'myopathy', '18', 'Unsolved', 'No', ], row) hispanic_row = next(r for r in participant_file if r[0] == 'Broad_HG00731') self.assertListEqual([ 'Broad_HG00731', 'Broad_1kg project nme with unide', 'BROAD', 'HMB', '', '', '', 'Broad_2', 'Broad_HG00732', - 'Broad_HG00733', '', 'Self', '', 'Female', '', '', 'Hispanic or Latino', 'Other', '', 'Affected', '', '', 'No', 
'No', + 'Broad_HG00733', '', 'Self', '', 'Female', '', '', 'Hispanic or Latino', 'Other', '', 'Affected', '', '', 'Unsolved', 'No', ], hispanic_row) solved_row = next(r for r in participant_file if r[0] == 'Broad_NA20876') self.assertListEqual([ 'Broad_NA20876', 'Broad_1kg project nme with unide', 'BROAD', 'HMB', '', '', '', 'Broad_7', '0', - '0', '', '', '', 'Male', '', '', '', '', '', 'Affected', '', '', 'Yes', 'No', + '0', '', '', '', 'Male', '', '', '', '', '', 'Affected', '', '', 'Solved', 'No', ], solved_row) multi_data_type_row = next(r for r in participant_file if r[0] == 'Broad_NA20888') self.assertListEqual([ 'Broad_NA20888', 'Broad_Test Reprocessed Project' if has_second_project else 'Broad_1kg project nme with unide', 'BROAD', 'HMB', 'No', '', '', 'Broad_12' if has_second_project else 'Broad_8', '0', '0', '', '', '', - 'Male' if has_second_project else 'Female', '', '', '', '', '', 'Affected', '', '', 'No', 'No', + 'Male' if has_second_project else 'Female', '', '', '', '', '', 'Affected', '', '', 'Unsolved', 'No', ], multi_data_type_row) self.assertEqual(len(family_file), 11 if has_second_project else 10) @@ -1076,7 +1076,7 @@ def test_family_metadata(self): 'familyGuid': 'F000012_12', 'family_id': '12', 'displayName': '12', - 'solve_status': 'No', + 'solve_status': 'Unsolved', 'actual_inheritance': 'unknown', 'date_data_generation': '2017-02-05', 'data_type': 'WES', @@ -1111,7 +1111,7 @@ def test_family_metadata(self): 'familyGuid': 'F000003_3', 'family_id': '3', 'displayName': '3', - 'solve_status': 'No', + 'solve_status': 'Unsolved', 'actual_inheritance': '', 'date_data_generation': '2017-02-05', 'data_type': 'WES', diff --git a/seqr/views/apis/summary_data_api_tests.py b/seqr/views/apis/summary_data_api_tests.py index 9b1af12329..18c72f6fd3 100644 --- a/seqr/views/apis/summary_data_api_tests.py +++ b/seqr/views/apis/summary_data_api_tests.py @@ -35,7 +35,7 @@ EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW = { "projectGuid": "R0003_test", 
"num_saved_variants": 2, - "solve_status": "No", + "solve_status": "Unsolved", "sample_id": "NA20889", "gene_known_for_phenotype-1": "Known", "gene_known_for_phenotype-2": "Known", @@ -143,7 +143,7 @@ 'pmid_id': None, 'proband_relationship': 'Self', 'sex': 'Female', - 'solve_status': 'No', + 'solve_status': 'Unsolved', 'alt-1': 'T', 'chrom-1': '1', 'gene_known_for_phenotype-1': 'Candidate', diff --git a/seqr/views/utils/anvil_metadata_utils.py b/seqr/views/utils/anvil_metadata_utils.py index 07c576c773..800887229e 100644 --- a/seqr/views/utils/anvil_metadata_utils.py +++ b/seqr/views/utils/anvil_metadata_utils.py @@ -51,9 +51,9 @@ } SOLVE_STATUS_LOOKUP = { - **{s: 'Yes' for s in Family.SOLVED_ANALYSIS_STATUSES}, - **{s: 'Likely' for s in Family.STRONG_CANDIDATE_ANALYSIS_STATUSES}, - Family.ANALYSIS_STATUS_PARTIAL_SOLVE: 'Partial', + **{s: 'Solved' for s in Family.SOLVED_ANALYSIS_STATUSES}, + Family.ANALYSIS_STATUS_PARTIAL_SOLVE: 'Partially solved', + Family.ANALYSIS_STATUS_PROBABLE_SOLVE: 'Probably solved', } MIM_INHERITANCE_MAP = { @@ -103,7 +103,7 @@ def _get_family_metadata(family_filter, family_fields, include_metadata, include for f in family_data: family_id = f.pop('id') f.update({ - 'solve_status': SOLVE_STATUS_LOOKUP.get(f['analysisStatus'], 'No'), + 'solve_status': SOLVE_STATUS_LOOKUP.get(f['analysisStatus'], 'Unsolved'), **{k: v['format'](f) for k, v in (family_fields or {}).items()}, }) if format_id: @@ -201,6 +201,8 @@ def parse_anvil_metadata( if individual.id in matchmaker_individuals: subject_row['MME'] = matchmaker_individuals[individual.id] if mme_values else 'Yes' subject_row.update(family_subject_row) + if individual.affected != Individual.AFFECTED_STATUS_AFFECTED: + subject_row['solve_status'] = 'Unaffected' add_row(subject_row, family_id, SUBJECT_ROW_TYPE) participant_id = subject_row['participant_id'] From 7a1f7677f451f37b0f0f9ee0339d034f0208eab5 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 7 Mar 2024 15:38:07 -0500 Subject: [PATCH 
17/28] do not show delete button for non deletable projects --- seqr/views/apis/dashboard_api.py | 4 ++++ seqr/views/utils/individual_utils.py | 15 ++++++++++----- .../Dashboard/components/ProjectEllipsisMenu.jsx | 2 +- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/seqr/views/apis/dashboard_api.py b/seqr/views/apis/dashboard_api.py index b7aa07c413..d5ccc20766 100644 --- a/seqr/views/apis/dashboard_api.py +++ b/seqr/views/apis/dashboard_api.py @@ -4,6 +4,7 @@ from django.db import models from seqr.models import ProjectCategory, Sample, Family, Project +from seqr.views.utils.individual_utils import check_project_individuals_deletable from seqr.views.utils.json_utils import create_json_response from seqr.views.utils.orm_to_json_utils import get_json_for_projects from seqr.views.utils.permissions_utils import get_project_guids_user_can_view, login_and_policies_required @@ -46,6 +47,9 @@ def _get_projects_json(user): projects_by_guid[project.guid]['numFamilies'] = project.family__count projects_by_guid[project.guid]['numIndividuals'] = project.family__individual__count projects_by_guid[project.guid]['numVariantTags'] = project.family__savedvariant__count + if projects_by_guid[project.guid]['userIsCreator']: + errors, _ = check_project_individuals_deletable(project) + projects_by_guid[project.guid]['userCanDelete'] = not errors analysis_status_counts = Family.objects.filter(project__in=projects).values( 'project__guid', 'analysis_status').annotate(count=models.Count('*')) diff --git a/seqr/views/utils/individual_utils.py b/seqr/views/utils/individual_utils.py index 8ef121eb6c..3a35d34139 100644 --- a/seqr/views/utils/individual_utils.py +++ b/seqr/views/utils/individual_utils.py @@ -173,11 +173,7 @@ def delete_individuals(project, individual_guids, user): Returns: list: Family objects for families with deleted individuals """ - - individuals_to_delete = Individual.objects.filter( - family__project=project, guid__in=individual_guids) - - errors = 
backend_specific_call(_validate_no_submissions, _validate_no_sumissions_no_search_samples)(individuals_to_delete) + errors, individuals_to_delete = check_project_individuals_deletable(project.guid, individual_guids=individual_guids) if errors: raise ErrorsWarningsException(errors) @@ -196,6 +192,15 @@ def delete_individuals(project, individual_guids, user): return families_with_deleted_individuals +def check_project_individuals_deletable(project_guid, individual_guids=None): + individuals_to_delete = Individual.objects.filter(family__project__guid=project_guid) + if individual_guids is not None: + individuals_to_delete = individuals_to_delete.filter(guid__in=individual_guids) + + errors = backend_specific_call(_validate_no_submissions, _validate_no_sumissions_no_search_samples)(individuals_to_delete) + return errors, individuals_to_delete + + def _validate_delete_individuals(individuals_to_delete, error_type, query): errors = [] invalid_individuals = individuals_to_delete.filter(**query).distinct().values_list('individual_id', flat=True) diff --git a/ui/pages/Dashboard/components/ProjectEllipsisMenu.jsx b/ui/pages/Dashboard/components/ProjectEllipsisMenu.jsx index e794520ff9..0a2055cf20 100644 --- a/ui/pages/Dashboard/components/ProjectEllipsisMenu.jsx +++ b/ui/pages/Dashboard/components/ProjectEllipsisMenu.jsx @@ -55,7 +55,7 @@ const ProjectEllipsisMenu = React.memo((props) => { , ) } - if (props.project.userIsCreator) { + if (props.project.userCanDelete) { menuItems.push( , Date: Thu, 7 Mar 2024 15:54:30 -0500 Subject: [PATCH 18/28] fix typos and update tests --- seqr/fixtures/1kg_project.json | 6 +++--- seqr/views/apis/dashboard_api_tests.py | 6 ++++++ seqr/views/utils/individual_utils.py | 6 +++--- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/seqr/fixtures/1kg_project.json b/seqr/fixtures/1kg_project.json index f9f87c1315..7032d4f55f 100644 --- a/seqr/fixtures/1kg_project.json +++ b/seqr/fixtures/1kg_project.json @@ -5,7 +5,7 @@ "fields": { 
"guid": "R0001_1kg", "created_date": "2017-03-12T19:27:08.156Z", - "created_by": null, + "created_by": 10, "last_modified_date": "2017-03-13T09:07:49.582Z", "name": "1kg project n\u00e5me with uni\u00e7\u00f8de", "description": "1000 genomes project description with uni\u00e7\u00f8de", @@ -29,7 +29,7 @@ "fields": { "guid": "R0002_empty", "created_date": "2017-03-12T19:27:08.156Z", - "created_by": null, + "created_by": 10, "last_modified_date": "2017-03-13T09:07:49.582Z", "name": "Empty Project", "description": "", @@ -50,7 +50,7 @@ "fields": { "guid": "R0003_test", "created_date": "2017-03-12T19:27:08.156Z", - "created_by": null, + "created_by": 10, "last_modified_date": "2017-03-13T09:07:49.582Z", "name": "Test Reprocessed Project", "description": "", diff --git a/seqr/views/apis/dashboard_api_tests.py b/seqr/views/apis/dashboard_api_tests.py index bc3a124263..ec0a5ac3f1 100644 --- a/seqr/views/apis/dashboard_api_tests.py +++ b/seqr/views/apis/dashboard_api_tests.py @@ -40,6 +40,8 @@ def test_dashboard_page_data(self, mock_set_redis, mock_get_redis): self.assertSetEqual( set(next(iter(response_json['projectsByGuid'].values())).keys()), DASHBOARD_PROJECT_FIELDS ) + self.assertSetEqual({p['userIsCreator'] for p in response_json['projectsByGuid'].values()}, {False}) + self.assertFalse(any('userCanDelete' in p for p in response_json['projectsByGuid'].values())) mock_get_redis.assert_called_with('projects__test_user_collaborator') mock_set_redis.assert_called_with( 'projects__test_user_collaborator', list(response_json['projectsByGuid'].keys()), expire=300) @@ -49,6 +51,10 @@ def test_dashboard_page_data(self, mock_set_redis, mock_get_redis): self.assertEqual(response.status_code, 200) response_json = response.json() self.assertEqual(len(response_json['projectsByGuid']), 3) + self.assertSetEqual({p['userIsCreator'] for p in response_json['projectsByGuid'].values()}, {True}) + self.assertTrue(response_json['projectsByGuid']['R0002_empty']['userCanDelete']) + 
self.assertFalse(response_json['projectsByGuid']['R0001_1kg']['userCanDelete']) + self.assertFalse(response_json['projectsByGuid']['R0003_test']['userCanDelete']) mock_get_redis.assert_called_with('projects__test_user') mock_set_redis.assert_called_with('projects__test_user', list(response_json['projectsByGuid'].keys()), expire=300) diff --git a/seqr/views/utils/individual_utils.py b/seqr/views/utils/individual_utils.py index 3a35d34139..40ae1196d2 100644 --- a/seqr/views/utils/individual_utils.py +++ b/seqr/views/utils/individual_utils.py @@ -173,7 +173,7 @@ def delete_individuals(project, individual_guids, user): Returns: list: Family objects for families with deleted individuals """ - errors, individuals_to_delete = check_project_individuals_deletable(project.guid, individual_guids=individual_guids) + errors, individuals_to_delete = check_project_individuals_deletable(project, individual_guids=individual_guids) if errors: raise ErrorsWarningsException(errors) @@ -192,8 +192,8 @@ def delete_individuals(project, individual_guids, user): return families_with_deleted_individuals -def check_project_individuals_deletable(project_guid, individual_guids=None): - individuals_to_delete = Individual.objects.filter(family__project__guid=project_guid) +def check_project_individuals_deletable(project, individual_guids=None): + individuals_to_delete = Individual.objects.filter(family__project=project) if individual_guids is not None: individuals_to_delete = individuals_to_delete.filter(guid__in=individual_guids) From 0ba46cbd4148eda478b84bb06dd262ea6a796619 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 11 Mar 2024 12:17:02 -0400 Subject: [PATCH 19/28] properly format mito chrom for variant id query --- seqr/utils/search/search_utils_tests.py | 6 +++--- seqr/utils/search/utils.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/seqr/utils/search/search_utils_tests.py b/seqr/utils/search/search_utils_tests.py index aec4a37952..7e2fcb6871 100644 --- 
a/seqr/utils/search/search_utils_tests.py +++ b/seqr/utils/search/search_utils_tests.py @@ -108,12 +108,12 @@ def test_get_single_variant(self, mock_get_variants_for_ids): self.assertEqual(str(cm.exception), 'Variant 10-10334333-A-G not found') def test_get_variants_for_variant_ids(self, mock_get_variants_for_ids): - variant_ids = ['2-103343353-GAGA-G', '1-248367227-TC-T', 'prefix-938_DEL', 'M-10195-C-A'] + variant_ids = ['2-103343353-GAGA-G', '1-248367227-TC-T', 'prefix-938_DEL', 'MT-10195-C-A'] get_variants_for_variant_ids(self.families, variant_ids, user=self.user) mock_get_variants_for_ids.assert_called_with(mock.ANY, '37', { '2-103343353-GAGA-G': ('2', 103343353, 'GAGA', 'G'), '1-248367227-TC-T': ('1', 248367227, 'TC', 'T'), - 'M-10195-C-A': ('M', 10195, 'C', 'A'), + 'MT-10195-C-A': ('M', 10195, 'C', 'A'), 'prefix-938_DEL': None, }, self.user, user_email=None) self.assertSetEqual(set(mock_get_variants_for_ids.call_args.args[0]), set(self.search_samples)) @@ -123,7 +123,7 @@ def test_get_variants_for_variant_ids(self, mock_get_variants_for_ids): mock_get_variants_for_ids.assert_called_with(mock.ANY, '37', { '2-103343353-GAGA-G': ('2', 103343353, 'GAGA', 'G'), '1-248367227-TC-T': ('1', 248367227, 'TC', 'T'), - 'M-10195-C-A': ('M', 10195, 'C', 'A'), + 'MT-10195-C-A': ('M', 10195, 'C', 'A'), }, self.user, user_email=None) skipped_samples = ['S000145_hg00731', 'S000146_hg00732', 'S000148_hg00733'] expected_samples = {s for s in self.search_samples if s.guid not in skipped_samples} diff --git a/seqr/utils/search/utils.py b/seqr/utils/search/utils.py index a94e40ddf0..c1f0e6d4b4 100644 --- a/seqr/utils/search/utils.py +++ b/seqr/utils/search/utils.py @@ -14,7 +14,7 @@ from seqr.utils.search.hail_search_utils import get_hail_variants, get_hail_variants_for_variant_ids, ping_hail_backend, \ hail_variant_lookup, validate_hail_backend_no_location_search from seqr.utils.gene_utils import parse_locus_list_items -from seqr.utils.xpos_utils import get_xpos +from 
seqr.utils.xpos_utils import get_xpos, format_chrom class InvalidSearchException(Exception): @@ -334,6 +334,7 @@ def _parse_variant_items(search_json): def _parse_variant_id(variant_id): try: chrom, pos, ref, alt = variant_id.split('-') + chrom = format_chrom(chrom) pos = int(pos) get_xpos(chrom, pos) return chrom, pos, ref, alt From 4e18bdc5a495b31b4c4c386bd3717a464e918936 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 11 Mar 2024 13:54:53 -0400 Subject: [PATCH 20/28] update fixture data to test ulifted variants --- seqr/fixtures/1kg_project.json | 3 +- .../commands/lift_project_to_hg38.py | 2 +- ...eck_for_new_samples_from_pipeline_tests.py | 8 ++--- .../tests/lift_project_to_hg38_tests.py | 35 +++++++++++-------- 4 files changed, 27 insertions(+), 21 deletions(-) diff --git a/seqr/fixtures/1kg_project.json b/seqr/fixtures/1kg_project.json index 7032d4f55f..ca050a560f 100644 --- a/seqr/fixtures/1kg_project.json +++ b/seqr/fixtures/1kg_project.json @@ -1884,7 +1884,6 @@ "genotypeFilters": "pass", "mainTranscriptId": "ENST00000505820", "populations": {"callset": {"ac": null, "an": null, "af": null}, "g1k": {"ac": null, "an": null, "af": 0.0}, "gnomad_genomes": {"hemi": null, "ac": null, "an": null, "hom": null, "af": 0.0}, "gnomad_exomes": {"hemi": null, "ac": null, "an": null, "hom": null, "af": 8.142526788913136e-06}, "exac": {"hemi": null, "ac": null, "an": null, "hom": null, "af": 0.0}, "topmed": {"ac": null, "an": null, "af": null}}, - "genomeVersion": "37", "pos": 1560662, "predictions": {"eigen": null, "revel": null, "sift": "damaging", "cadd": "31", "metasvm": "D", "mpc": null, "splice_ai": null, "phastcons_100_vert": null, "mut_taster": "disease_causing", "fathmm": "tolerated", "polyphen": "probably_damaging", "dann": null, "primate_ai": null, "gerp_rs": null}, "hgmd": {"accession": null, "class": null}, @@ -1970,7 +1969,7 @@ "hom": 10332 } }, - "genomeVersion": "37", + "genomeVersion": "38", "pos": 1562437, "hgmd": {"accession": null, "class": null}, 
"rsid": null, diff --git a/seqr/management/commands/lift_project_to_hg38.py b/seqr/management/commands/lift_project_to_hg38.py index 3f98c63616..ae182f42ca 100644 --- a/seqr/management/commands/lift_project_to_hg38.py +++ b/seqr/management/commands/lift_project_to_hg38.py @@ -77,7 +77,7 @@ def handle(self, *args, **options): len(es_variants), len(missing_variants) + len(lift_failed), missing_family_count)) def _get_variants_to_lift(saved_variants): - saved_variants_to_lift = [v for v in saved_variants if v['genomeVersion'] != GENOME_VERSION_GRCh38] + saved_variants_to_lift = [v for v in saved_variants if v.get('genomeVersion') != GENOME_VERSION_GRCh38] num_already_lifted = len(saved_variants) - len(saved_variants_to_lift) if num_already_lifted: if input('Found {} saved variants already on Hg38. Continue with liftover (y/n)? '.format( diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index 89c4679502..de2f1efd8a 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -135,7 +135,7 @@ def test_command(self, mock_email, mock_airtable_utils): 'total': 1, }) responses.add(responses.POST, f'{MOCK_HAIL_HOST}:5000/multi_lookup', status=200, json={ - 'results': [{'variantId': '21-3343353-GAGA-G', 'updated_new_field': 'updated_value', 'rsid': 'rs123'}], + 'results': [{'variantId': '1-46859832-G-A', 'updated_new_field': 'updated_value', 'rsid': 'rs123'}], }) # Test errors @@ -264,11 +264,11 @@ def test_command(self, mock_email, mock_airtable_utils): self.assertEqual(len(annotation_updated_variant.saved_variant_json), 18) self.assertListEqual(annotation_updated_variant.saved_variant_json['familyGuids'], ['F000001_1']) - annotation_updated_json = SavedVariant.objects.get(guid='SV0000001_2103343353_r0390_100').saved_variant_json - self.assertEqual(len(annotation_updated_json), 20) + 
annotation_updated_json = SavedVariant.objects.get(guid='SV0059956_11560662_f019313_1').saved_variant_json + self.assertEqual(len(annotation_updated_json), 18) self.assertEqual(annotation_updated_json['updated_new_field'], 'updated_value') self.assertEqual(annotation_updated_json['rsid'], 'rs123') - self.assertEqual(annotation_updated_json['mainTranscriptId'], 'ENST00000258436') + self.assertEqual(annotation_updated_json['mainTranscriptId'], 'ENST00000505820') self.assertEqual(len(annotation_updated_json['genotypes']), 3) self.mock_utils_logger.error.assert_not_called() diff --git a/seqr/management/tests/lift_project_to_hg38_tests.py b/seqr/management/tests/lift_project_to_hg38_tests.py index 0342bd0fac..b3e1c1e833 100644 --- a/seqr/management/tests/lift_project_to_hg38_tests.py +++ b/seqr/management/tests/lift_project_to_hg38_tests.py @@ -59,11 +59,11 @@ def test_command(self, mock_liftover, mock_get_es_variants, mock_input, mock_get calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), mock.call('Validating es index test_new_index'), - mock.call('Lifting over 4 variants (skipping 0 that are already lifted)'), + mock.call('Lifting over 3 variants (skipping 1 that are already lifted)'), mock.call('Successfully lifted over 3 variants'), mock.call('Successfully updated 3 variants'), mock.call('---Done---'), - mock.call('Succesfully lifted over 3 variants. Skipped 3 failed variants. Family data not updated for 0 variants') + mock.call('Succesfully lifted over 3 variants. Skipped 2 failed variants. 
Family data not updated for 0 variants') ] mock_logger.info.assert_has_calls(calls) @@ -72,7 +72,6 @@ def test_command(self, mock_liftover, mock_get_es_variants, mock_input, mock_get calls = [ mock.call('chr21', 3343353), - mock.call('chr1', 1562437), mock.call('chr1', 248367227), mock.call('chr1', 1560662), ] @@ -80,7 +79,7 @@ def test_command(self, mock_liftover, mock_get_es_variants, mock_input, mock_get families = {family for family in Family.objects.filter(pk__in = [1, 2])} self.assertSetEqual(families, mock_get_es_variants.call_args.args[0]) - self.assertSetEqual({'1-1627057-G-C', '21-3343400-GAGA-G', '1-248203925-TC-T', '1-46394160-G-A'}, + self.assertSetEqual({'21-3343400-GAGA-G', '1-248203925-TC-T', '1-46394160-G-A'}, set(mock_get_es_variants.call_args.args[1])) # Test discontinue on lifted variants @@ -90,7 +89,7 @@ def test_command(self, mock_liftover, mock_get_es_variants, mock_input, mock_get call_command('lift_project_to_hg38', '--project={}'.format(PROJECT_NAME), '--es-index={}'.format(ELASTICSEARCH_INDEX)) - self.assertEqual(str(ce.exception), 'Error: found 1 saved variants already on Hg38') + self.assertEqual(str(ce.exception), 'Error: found 2 saved variants already on Hg38') calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), @@ -177,12 +176,19 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, call_command('lift_project_to_hg38', '--project={}'.format(PROJECT_NAME), '--es-index={}'.format(ELASTICSEARCH_INDEX)) - self.assertEqual(str(ce.exception), 'Error: unable to lift over 4 variants') + self.assertEqual(str(ce.exception), 'Error: found 1 saved variants already on Hg38') + + mock_input.side_effect = ['y', 'n'] + with self.assertRaises(CommandError) as ce: + call_command('lift_project_to_hg38', '--project={}'.format(PROJECT_NAME), + '--es-index={}'.format(ELASTICSEARCH_INDEX)) + + self.assertEqual(str(ce.exception), 'Error: unable to lift over 3 variants') calls = [ 
mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), mock.call('Validating es index test_new_index'), - mock.call('Lifting over 4 variants (skipping 0 that are already lifted)') + mock.call('Lifting over 3 variants (skipping 1 that are already lifted)') ] mock_logger.info.assert_has_calls(calls) @@ -190,16 +196,17 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, mock_get_es_variants.return_value = VARIANTS mock_liftover_to_38.convert_coordinate.side_effect = mock_convert_coordinate mock_logger.reset_mock() + mock_input.side_effect = ['y', 'n', 'n'] with self.assertRaises(CommandError) as ce: call_command('lift_project_to_hg38', '--project={}'.format(PROJECT_NAME), '--es-index={}'.format(ELASTICSEARCH_INDEX)) - self.assertEqual(str(ce.exception), 'Error: unable to find 3 lifted-over variants') + self.assertEqual(str(ce.exception), 'Error: unable to find 2 lifted-over variants') calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), mock.call('Validating es index test_new_index'), - mock.call('Lifting over 4 variants (skipping 0 that are already lifted)') + mock.call('Lifting over 3 variants (skipping 1 that are already lifted)') ] mock_logger.info.assert_has_calls(calls) @@ -207,7 +214,7 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, self.assertSetEqual(mock_get_es_variants.call_args.args[0], families) self.assertSetEqual( set(mock_get_es_variants.call_args.args[1]), - {'1-1627057-G-C', '21-3343400-GAGA-G', '1-248203925-TC-T', '1-46394160-G-A'} + {'21-3343400-GAGA-G', '1-248203925-TC-T', '1-46394160-G-A'} ) # Test discontinue on missing family data while updating the saved variants @@ -215,7 +222,7 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, variants.append(SINGLE_VARIANT) mock_get_es_variants.return_value = variants mock_logger.reset_mock() - mock_input.side_effect = ['y', 'n'] + mock_input.side_effect = 
['y', 'y', 'n'] with self.assertRaises(CommandError) as ce: call_command('lift_project_to_hg38', '--project={}'.format(PROJECT_NAME), '--es-index={}'.format(ELASTICSEARCH_INDEX)) @@ -225,7 +232,7 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), mock.call('Validating es index test_new_index'), - mock.call('Lifting over 4 variants (skipping 0 that are already lifted)'), + mock.call('Lifting over 3 variants (skipping 1 that are already lifted)'), mock.call('Successfully lifted over 4 variants') ] mock_logger.info.assert_has_calls(calls) @@ -240,11 +247,11 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), mock.call('Validating es index test_new_index'), - mock.call('Lifting over 3 variants (skipping 1 that are already lifted)'), + mock.call('Lifting over 2 variants (skipping 2 that are already lifted)'), mock.call('Successfully lifted over 4 variants'), mock.call('Successfully updated 4 variants'), mock.call('---Done---'), - mock.call('Succesfully lifted over 4 variants. Skipped 2 failed variants. Family data not updated for 1 variants') + mock.call('Succesfully lifted over 4 variants. Skipped 1 failed variants. 
Family data not updated for 1 variants') ] mock_logger.info.assert_has_calls(calls) From 868b726ea6f251664e312ea91e503aeb5ba90531 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 11 Mar 2024 14:15:28 -0400 Subject: [PATCH 21/28] only reload variants on correct genome build --- seqr/fixtures/1kg_project.json | 2 +- .../commands/check_for_new_samples_from_pipeline.py | 7 +++++-- .../tests/check_for_new_samples_from_pipeline_tests.py | 8 ++++++-- seqr/views/apis/summary_data_api_tests.py | 2 +- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/seqr/fixtures/1kg_project.json b/seqr/fixtures/1kg_project.json index ca050a560f..e3a01d8fa6 100644 --- a/seqr/fixtures/1kg_project.json +++ b/seqr/fixtures/1kg_project.json @@ -2084,7 +2084,7 @@ "alt": "T", "variant_id": "12-48367227-TC-T", "saved_variant_json": { - "liftedOverGenomeVersion": "38", "liftedOverPos": "", "genomeVersion": "37", "pos": 248367227, + "liftedOverGenomeVersion": "37", "liftedOverPos": "", "genomeVersion": "38", "pos": 248367227, "transcripts": {}, "chrom": "1", "genotypes": { "I000018_na21234": {"sampleId": "NA20885", "ab": 0.0, "gq": 99.0, "numAlt": 1}}}, "family": 14 diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py index 0ee5626438..f61d592596 100644 --- a/seqr/management/commands/check_for_new_samples_from_pipeline.py +++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py @@ -1,6 +1,7 @@ from collections import defaultdict from django.contrib.postgres.aggregates import ArrayAgg from django.core.management.base import BaseCommand, CommandError +from django.db.models import Q from django.db.models.functions import JSONObject import json import logging @@ -116,9 +117,10 @@ def handle(self, *args, **options): @staticmethod def _reload_shared_variant_annotations(updated_variants_by_id, updated_families, dataset_type, sample_type, genome_version): data_type = dataset_type + 
db_genome_version = genome_version.replace('GRCh', '') updated_annotation_samples = Sample.objects.filter( is_active=True, dataset_type=dataset_type, - individual__family__project__genome_version=genome_version.replace('GRCh', ''), + individual__family__project__genome_version=db_genome_version, ).exclude(individual__family__guid__in=updated_families) if dataset_type == Sample.DATASET_TYPE_SV_CALLS: updated_annotation_samples = updated_annotation_samples.filter(sample_type=sample_type) @@ -127,7 +129,8 @@ def _reload_shared_variant_annotations(updated_variants_by_id, updated_families, variant_models = SavedVariant.objects.filter( family_id__in=updated_annotation_samples.values_list('individual__family', flat=True).distinct(), **saved_variants_dataset_type_filter(dataset_type), - ) + ).filter(Q(saved_variant_json__genomeVersion__isnull=True) | Q(saved_variant_json__genomeVersion=db_genome_version)) + if not variant_models: logger.info('No additional saved variants to update') return diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index de2f1efd8a..5662885fa6 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -174,7 +174,11 @@ def test_command(self, mock_email, mock_airtable_utils): str(ce.exception), 'Data has genome version GRCh38 but the following projects have conflicting versions: R0003_test (GRCh37)') + # Update fixture data to allow testing edge cases Project.objects.filter(id__in=[1, 3]).update(genome_version=38) + sv = SavedVariant.objects.get(guid='SV0000002_1248367227_r0390_100') + sv.saved_variant_json['genomeVersion'] = '38' + sv.save() with self.assertRaises(ValueError) as ce: call_command('check_for_new_samples_from_pipeline', 'GRCh38/SNV_INDEL', 'auto__2023-08-08') @@ -199,7 +203,7 @@ def test_command(self, mock_email, mock_airtable_utils): 
{'individual_guid': 'I000018_na21234', 'family_guid': 'F000014_14', 'project_guid': 'R0004_non_analyst_project', 'affected': 'A', 'sample_id': 'NA21234'}, ]}}, ], reload_annotations_logs=[ - 'Reloading shared annotations for 4 saved variants (4 unique)', 'Fetched 1 additional variants', 'Updated 2 saved variants', + 'Reloading shared annotations for 3 saved variants (3 unique)', 'Fetched 1 additional variants', 'Updated 2 saved variants', ]) old_data_sample_guid = 'S000143_na20885' @@ -248,7 +252,7 @@ def test_command(self, mock_email, mock_airtable_utils): self.assertDictEqual(json.loads(multi_lookup_request.body), { 'genome_version': 'GRCh38', 'data_type': 'SNV_INDEL', - 'variant_ids': ['1-1562437-G-C', '1-46859832-G-A', '21-3343353-GAGA-G'], + 'variant_ids': ['1-1562437-G-C', '1-46859832-G-A'], }) updated_variants = SavedVariant.objects.filter(saved_variant_json__updated_field='updated_value') diff --git a/seqr/views/apis/summary_data_api_tests.py b/seqr/views/apis/summary_data_api_tests.py index 18c72f6fd3..6e55fc315c 100644 --- a/seqr/views/apis/summary_data_api_tests.py +++ b/seqr/views/apis/summary_data_api_tests.py @@ -151,7 +151,7 @@ 'end-1': None, 'ref-1': 'TC', 'zygosity-1': 'Heterozygous', - 'variant_reference_assembly-1': 'GRCh37', + 'variant_reference_assembly-1': 'GRCh38', 'allele_balance_or_heteroplasmy_percentage-1': None, 'gene-1': None, 'gene_id-1': None, From 14f03d07bdc37d551bb576d61b3aeb05e81a2d4c Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 11 Mar 2024 14:23:02 -0400 Subject: [PATCH 22/28] fix test --- matchmaker/views/matchmaker_api_tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/matchmaker/views/matchmaker_api_tests.py b/matchmaker/views/matchmaker_api_tests.py index 7b898c3c7f..149d0cde77 100644 --- a/matchmaker/views/matchmaker_api_tests.py +++ b/matchmaker/views/matchmaker_api_tests.py @@ -306,7 +306,7 @@ def test_search_individual_mme_matches(self, mock_email, mock_logger, mock_slack 'start': 
248367227, 'referenceBases': 'TC', 'alternateBases': 'T', - 'assembly': 'GRCh37', + 'assembly': 'GRCh38', }, 'zygosity': 1, }], @@ -332,7 +332,7 @@ def test_search_individual_mme_matches(self, mock_email, mock_logger, mock_slack 'ref': 'TC', 'alt': 'T', 'end': None, - 'genomeVersion': 'GRCh37', + 'genomeVersion': 'GRCh38', } } ], From 464cb3b6f13dd33b636b17d2da46f2b7088a7f1d Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 11 Mar 2024 14:33:29 -0400 Subject: [PATCH 23/28] properly parse variant ids --- .../check_for_new_samples_from_pipeline.py | 6 +++++- .../check_for_new_samples_from_pipeline_tests.py | 2 +- seqr/utils/search/utils.py | 14 +++++++++----- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py index f61d592596..2a4c0d71a9 100644 --- a/seqr/management/commands/check_for_new_samples_from_pipeline.py +++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py @@ -10,6 +10,7 @@ from seqr.models import Family, Sample, SavedVariant from seqr.utils.file_utils import file_iter, does_file_exist from seqr.utils.search.add_data_utils import notify_search_data_loaded +from seqr.utils.search.utils import parse_valid_variant_id from seqr.utils.search.hail_search_utils import hail_variant_multi_lookup from seqr.views.utils.dataset_utils import match_and_update_search_samples from seqr.views.utils.variant_utils import reset_cached_search_results, update_projects_saved_variant_json, \ @@ -117,12 +118,13 @@ def handle(self, *args, **options): @staticmethod def _reload_shared_variant_annotations(updated_variants_by_id, updated_families, dataset_type, sample_type, genome_version): data_type = dataset_type + is_sv = dataset_type == Sample.DATASET_TYPE_SV_CALLS db_genome_version = genome_version.replace('GRCh', '') updated_annotation_samples = Sample.objects.filter( is_active=True, dataset_type=dataset_type, 
individual__family__project__genome_version=db_genome_version, ).exclude(individual__family__guid__in=updated_families) - if dataset_type == Sample.DATASET_TYPE_SV_CALLS: + if is_sv: updated_annotation_samples = updated_annotation_samples.filter(sample_type=sample_type) data_type = f'{dataset_type}_{sample_type}' @@ -147,6 +149,8 @@ def _reload_shared_variant_annotations(updated_variants_by_id, updated_families, } fetch_variant_ids = set(variants_by_id.keys()) - set(updated_variants_by_id.keys()) if fetch_variant_ids: + if not is_sv: + fetch_variant_ids = [parse_valid_variant_id(variant_id) for variant_id in fetch_variant_ids] updated_variants = hail_variant_multi_lookup(USER_EMAIL, sorted(fetch_variant_ids), data_type, genome_version) logger.info(f'Fetched {len(updated_variants)} additional variants') updated_variants_by_id.update({variant['variantId']: variant for variant in updated_variants}) diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index 5662885fa6..4cb3e15983 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -252,7 +252,7 @@ def test_command(self, mock_email, mock_airtable_utils): self.assertDictEqual(json.loads(multi_lookup_request.body), { 'genome_version': 'GRCh38', 'data_type': 'SNV_INDEL', - 'variant_ids': ['1-1562437-G-C', '1-46859832-G-A'], + 'variant_ids': [['1', 1562437, 'G', 'C'], ['1', 46859832, 'G', 'A']], }) updated_variants = SavedVariant.objects.filter(saved_variant_json__updated_field='updated_value') diff --git a/seqr/utils/search/utils.py b/seqr/utils/search/utils.py index c1f0e6d4b4..69c302fd16 100644 --- a/seqr/utils/search/utils.py +++ b/seqr/utils/search/utils.py @@ -333,15 +333,19 @@ def _parse_variant_items(search_json): def _parse_variant_id(variant_id): try: - chrom, pos, ref, alt = variant_id.split('-') - chrom = 
format_chrom(chrom) - pos = int(pos) - get_xpos(chrom, pos) - return chrom, pos, ref, alt + return parse_valid_variant_id(variant_id) except (KeyError, ValueError): return None +def parse_valid_variant_id(variant_id): + chrom, pos, ref, alt = variant_id.split('-') + chrom = format_chrom(chrom) + pos = int(pos) + get_xpos(chrom, pos) + return chrom, pos, ref, alt + + def _validate_sort(sort, families): if sort == PRIORITIZED_GENE_SORT and len(families) > 1: raise InvalidSearchException('Phenotype sort is only supported for single-family search.') From bc2ed7e66927399ab78f829e082cc296e0c8c367 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 11 Mar 2024 16:59:06 -0400 Subject: [PATCH 24/28] specify which notification failed when reporting notification error --- matchmaker/views/external_api.py | 11 ++++------- matchmaker/views/external_api_tests.py | 4 ++-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/matchmaker/views/external_api.py b/matchmaker/views/external_api.py index 007278a886..f2cd6320e7 100644 --- a/matchmaker/views/external_api.py +++ b/matchmaker/views/external_api.py @@ -151,7 +151,7 @@ def _generate_notification_for_incoming_match(results, incoming_query, incoming_ emails = [email.strip() for email in submission.contact_href.replace('mailto:', '').split(',')] send_emails = emails if len(emails) < 2 else [email for email in emails if email!= MME_DEFAULT_CONTACT_EMAIL] all_emails.update(send_emails) - match_results.append((result_text, send_emails)) + match_results.append((result_text, send_emails, submission.submission_id)) match_results = sorted(match_results, key=lambda result_tuple: result_tuple[0]) base_message = """Dear collaborators, @@ -180,12 +180,11 @@ def _generate_notification_for_incoming_match(results, incoming_query, incoming_ We sent this email alert to: {email_addresses_alert_sent_to}\n{footer}.""" safe_post_to_slack(MME_SLACK_MATCH_NOTIFICATION_CHANNEL, message_template.format( - base_message=base_message, 
match_results='\n'.join([text for text, _ in match_results]), + base_message=base_message, match_results='\n'.join([result[0] for result in match_results]), email_addresses_alert_sent_to=', '.join(sorted(all_emails)), footer=MME_EMAIL_FOOTER )) - email_errors = [] - for result_text, emails in match_results: + for result_text, emails, submission_id in match_results: try: email_message = EmailMessage( subject='Received new MME match', @@ -198,9 +197,7 @@ def _generate_notification_for_incoming_match(results, incoming_query, incoming_ ) email_message.send() except Exception as e: - email_errors.append(str(e)) - if email_errors: - raise ValueError('\n'.join(email_errors)) + logger.error(f'Unable to send notification email for incoming MME match with {submission_id}: {e}') MME_EMAIL_FOOTER = """ diff --git a/matchmaker/views/external_api_tests.py b/matchmaker/views/external_api_tests.py index 7ec0e164ad..0956b5ee8d 100644 --- a/matchmaker/views/external_api_tests.py +++ b/matchmaker/views/external_api_tests.py @@ -284,8 +284,8 @@ def test_mme_match_proxy(self, mock_post_to_slack, mock_email, mock_logger): mock.call().send(), ]) - mock_logger.error.assert_called_with( - 'Unable to create notification for incoming MME match request for 3 matches (NA19675_1_01, P0004515, P0004517): Email error') + mock_logger.error.assert_called_once_with( + 'Unable to send notification email for incoming MME match with NA19675_1_01: Email error') # Test receive same request again mock_post_to_slack.reset_mock() From 4d15bacb3f3cc5992b5ba8fad965ade4a8dc888b Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 12 Mar 2024 16:19:17 -0400 Subject: [PATCH 25/28] remove unused error hndler --- matchmaker/views/external_api.py | 11 ++--------- matchmaker/views/external_api_tests.py | 13 +++++++------ 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/matchmaker/views/external_api.py b/matchmaker/views/external_api.py index f2cd6320e7..629ca4ba88 100644 --- 
a/matchmaker/views/external_api.py +++ b/matchmaker/views/external_api.py @@ -79,14 +79,7 @@ def mme_match_proxy(request, originating_node_name): patient_data=query_patient_data, origin_request_host=originating_node_name, ) - try: - _generate_notification_for_incoming_match(results, incoming_query, originating_node_name, query_patient_data) - except Exception as e: - logger.error('Unable to create notification for incoming MME match request for {} matches{}: {}'.format( - len(results), - ' ({})'.format(', '.join(sorted([result.get('patient', {}).get('id') for result in results]))) if results else '', - str(e)), - ) + _safe_generate_notification_for_incoming_match(results, incoming_query, originating_node_name, query_patient_data) return create_json_response({ 'results': sorted(results, key=lambda result: result['score']['patient'], reverse=True), @@ -94,7 +87,7 @@ def mme_match_proxy(request, originating_node_name): }) -def _generate_notification_for_incoming_match(results, incoming_query, incoming_request_node, incoming_patient): +def _safe_generate_notification_for_incoming_match(results, incoming_query, incoming_request_node, incoming_patient): """ Generate a SLACK notifcation to say that a VALID match request came in and the following results were sent back. If Slack is not supported, a message is not sent, but details persisted. 
diff --git a/matchmaker/views/external_api_tests.py b/matchmaker/views/external_api_tests.py index 0956b5ee8d..491cb4458f 100644 --- a/matchmaker/views/external_api_tests.py +++ b/matchmaker/views/external_api_tests.py @@ -399,9 +399,9 @@ def test_mme_match_proxy_phenotype_only(self, mock_post_to_slack, mock_email): mock.call().send(), ]) - @mock.patch('matchmaker.views.external_api.logger') + @mock.patch('seqr.utils.communication_utils.logger') @mock.patch('matchmaker.views.external_api.EmailMessage') - @mock.patch('matchmaker.views.external_api.safe_post_to_slack') + @mock.patch('seqr.utils.communication_utils._post_to_slack') def test_mme_match_proxy_no_results(self, mock_post_to_slack, mock_email, mock_logger): url = '/api/matchmaker/v1/match' request_body = { @@ -424,11 +424,12 @@ def test_mme_match_proxy_no_results(self, mock_post_to_slack, mock_email, mock_l self.assertEqual(incoming_query_q.count(), 1) self.assertIsNone(incoming_query_q.first().patient_id) - mock_post_to_slack.assert_called_with( - 'matchmaker_matches', - """A match request for 12345 came in from Test Institute today. + slack_message = """A match request for 12345 came in from Test Institute today. The contact information given was: test@test.com. 
We didn't find any individuals in matchbox that matched that query well, *so no results were sent back*.""" + mock_post_to_slack.assert_called_with( + 'matchmaker_matches', + slack_message ) mock_email.assert_not_called() @@ -440,6 +441,6 @@ def test_mme_match_proxy_no_results(self, mock_post_to_slack, mock_email, mock_l self.assertListEqual(response.json()['results'], []) self.assertEqual(MatchmakerIncomingQuery.objects.filter(institution='Test Institute').count(), 2) mock_logger.error.assert_called_with( - 'Unable to create notification for incoming MME match request for 0 matches: Slack connection error') + f'Slack error: Slack connection error: Original message in channel (matchmaker_matches) - {slack_message}') From 3e59c324ca7866936c92b3403e7e13550cd5a2a8 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 12 Mar 2024 17:33:36 -0400 Subject: [PATCH 26/28] include invalid sample id once per error message --- seqr/management/tests/load_rna_seq_tests.py | 3 ++- seqr/views/utils/dataset_utils.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/seqr/management/tests/load_rna_seq_tests.py b/seqr/management/tests/load_rna_seq_tests.py index 7c1cd0db02..6353809f5a 100644 --- a/seqr/management/tests/load_rna_seq_tests.py +++ b/seqr/management/tests/load_rna_seq_tests.py @@ -66,6 +66,7 @@ def test_tpm(self, mock_utils_logger): file_data=[ 'sample_id\tproject\tindividual_id\tgene_id\tTPM\ttissue\n', 'NA19675_D2\t1kg project nåme with uniçøde\t\tENSG00000240361\t12.6\t\n', + 'NA19675_D2\t1kg project nåme with uniçøde\t\tENSG00000233750\t1.26\t\n', 'NA19678_D1\t1kg project nåme with uniçøde\t\tENSG00000233750\t 6.04\twhole_blood\n', 'GTEX-001\t1kg project nåme with uniçøde\t\tENSG00000240361\t3.1\tinvalid\n', 'NA19677\t1kg project nåme with uniçøde\t\tENSG00000233750\t5.31\tmuscle\n', @@ -79,7 +80,7 @@ def test_tpm(self, mock_utils_logger): self.mock_gzip_file_iter.return_value = [ self.mock_gzip_file_iter.return_value[0], 'NA19678_D1\t1kg project 
nåme with uniçøde\tNA19678\tENSG00000233750\t 6.04\twhole_blood\n', - ] + self.mock_gzip_file_iter.return_value[2:] + ] + self.mock_gzip_file_iter.return_value[3:] call_command('load_rna_seq', 'tpm', RNA_FILE_ID, '--ignore-extra-samples') # Existing outlier data should be unchanged diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index b4e86c14ae..8824860637 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -347,7 +347,7 @@ def _load_rna_seq_file( loaded_samples = set() unmatched_samples = set() - missing_required_fields = defaultdict(list) + missing_required_fields = defaultdict(set) gene_ids = set() current_sample = None for line in tqdm(parsed_f, unit=' rows'): @@ -365,7 +365,7 @@ def _load_rna_seq_file( sample_id = row_dict.pop(SAMPLE_ID_COL) if SAMPLE_ID_COL in row_dict else row[SAMPLE_ID_COL] if missing_cols: for col in missing_cols: - missing_required_fields[col].append(sample_id) + missing_required_fields[col].add(sample_id) if missing_cols: continue From 3a89b87f1d391d9ef490205b741066f1fbcb2912 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 13 Mar 2024 12:26:29 -0400 Subject: [PATCH 27/28] add project permission check for bulk update analysis status --- seqr/views/apis/summary_data_api.py | 6 ++++-- seqr/views/apis/summary_data_api_tests.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/seqr/views/apis/summary_data_api.py b/seqr/views/apis/summary_data_api.py index 23f05f0393..33c4d9e8bb 100644 --- a/seqr/views/apis/summary_data_api.py +++ b/seqr/views/apis/summary_data_api.py @@ -158,8 +158,10 @@ def bulk_update_family_external_analysis(request): family_db_id_lookup = { (f['project__name'], f['family_id']): f['id'] for f in Family.objects.annotate( project_family=Concat('project__name', 'family_id', output_field=CharField()) - ).filter(project_family__in=[f'{project}{family}' for project, family in requested_families]) - .values('id', 'family_id', 
'project__name') + ).filter( + project_family__in=[f'{project}{family}' for project, family in requested_families], + project__guid__in=get_project_guids_user_can_view(request.user), + ).values('id', 'family_id', 'project__name') } warnings = [] diff --git a/seqr/views/apis/summary_data_api_tests.py b/seqr/views/apis/summary_data_api_tests.py index 6e55fc315c..62b682bc93 100644 --- a/seqr/views/apis/summary_data_api_tests.py +++ b/seqr/views/apis/summary_data_api_tests.py @@ -447,11 +447,12 @@ def test_bulk_update_family_external_analysis(self, mock_load_uploaded_file, moc ['Test Reprocessed Project', '12'], ['Test Reprocessed Project', 'not_a_family'], ['not_a_project', '2'], + ['Non-Analyst Project', '14'], ] response = self.client.post(url, content_type='application/json', data=json.dumps(body)) self.assertDictEqual(response.json(), { 'warnings': [ - 'No match found for the following families: not_a_family (Test Reprocessed Project), 2 (not_a_project)' + 'No match found for the following families: 14 (Non-Analyst Project), not_a_family (Test Reprocessed Project), 2 (not_a_project)' ], 'info': ['Updated "analysed by" for 2 families'], }) From a9c80e6485906ca22c8f9de6cedc4c1ddf57a474 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 13 Mar 2024 14:44:21 -0400 Subject: [PATCH 28/28] bump changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 80818ba807..ce09fe0a57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,8 @@ # _seqr_ Changes ## dev + +## 3/13/24 * Add "Probably Solved" analysis status (REQUIRES DB MIGRATION) ## 3/1/24