From fc9af3c8b253d8b71f1d0772c66c46571afb28d3 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 4 Mar 2024 17:30:31 -0500 Subject: [PATCH 01/28] add multi lookup endpoint --- hail_search/queries/base.py | 19 ++++++++++++++----- hail_search/search.py | 5 +++++ hail_search/test_search.py | 30 ++++++++++++++++++++++++++++-- hail_search/web_app.py | 8 +++++++- 4 files changed, 54 insertions(+), 8 deletions(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index 9fcfc79683..5b30214853 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -1045,11 +1045,22 @@ def gene_counts(self): ht.gene_ids, hl.struct(total=hl.agg.count(), families=hl.agg.counter(ht.families)) )) - def lookup_variant(self, variant_id, sample_data=None): - self._parse_intervals(intervals=None, variant_ids=[variant_id], variant_keys=[variant_id]) + def lookup_variants(self, variant_ids, annotation_fields=None): + self._parse_intervals(intervals=None, variant_ids=variant_ids, variant_keys=variant_ids) ht = self._read_table('annotations.ht', drop_globals=['paths', 'versions']) ht = ht.filter(hl.is_defined(ht[XPOS])) + if not annotation_fields: + annotation_fields = { + k: v for k, v in self.annotation_fields(include_genotype_overrides=False).items() + if k not in {FAMILY_GUID_FIELD, GENOTYPES_FIELD, 'genotypeFilters'} + } + + formatted = self._format_results(ht.key_by(), annotation_fields=annotation_fields, include_genotype_overrides=False) + + return formatted.aggregate(hl.agg.take(formatted.row, len(variant_ids))) + + def lookup_variant(self, variant_id, sample_data=None): annotation_fields = self.annotation_fields(include_genotype_overrides=False) entry_annotations = {k: annotation_fields[k] for k in [FAMILY_GUID_FIELD, GENOTYPES_FIELD]} annotation_fields.update({ @@ -1058,9 +1069,7 @@ def lookup_variant(self, variant_id, sample_data=None): 'genotypeFilters': lambda ht: hl.str(''), }) - formatted = self._format_results(ht.key_by(), 
annotation_fields=annotation_fields, include_genotype_overrides=False) - - variants = formatted.aggregate(hl.agg.take(formatted.row, 1)) + variants = self.lookup_variants([variant_id], annotation_fields=annotation_fields) if not variants: raise HTTPNotFound() variant = dict(variants[0]) diff --git a/hail_search/search.py b/hail_search/search.py index 6af6d72606..8207ce7f09 100644 --- a/hail_search/search.py +++ b/hail_search/search.py @@ -27,6 +27,11 @@ def lookup_variant(request): return query.lookup_variant(request['variant_id'], sample_data=request.get('sample_data')) +def lookup_variants(request): + query = QUERY_CLASS_MAP[(request['data_type'], request['genome_version'])](sample_data=None) + return query.lookup_variants(request['variant_ids']) + + def load_globals(): for cls in QUERY_CLASS_MAP.values(): cls.load_globals() diff --git a/hail_search/test_search.py b/hail_search/test_search.py index 4c20cfc556..c0c9d25da8 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -155,6 +155,8 @@ 'numAlt': 1, 'dp': 28, 'gq': 99, 'ab': 0.5, } +NO_GENOTYPE_GCNV_VARIANT = {**GCNV_VARIANT4, 'numExon': 8, 'end': 38736268} + # Ensures no variants are filtered out by annotation/path filters for compound hets COMP_HET_ALL_PASS_FILTERS = { 'annotations': {'splice_ai': '0.0', 'structural': ['DEL', 'CPX', 'INS', 'gCNV_DEL', 'gCNV_DUP']}, @@ -694,19 +696,43 @@ async def test_variant_lookup(self): self.assertEqual(resp.status, 200) resp_json = await resp.json() self.assertDictEqual(resp_json, { - **GCNV_VARIANT4, 'numExon': 8, 'end': 38736268, 'familyGuids': [], 'genotypes': {}, 'genotypeFilters': '', + **NO_GENOTYPE_GCNV_VARIANT, 'familyGuids': [], 'genotypes': {}, 'genotypeFilters': '', }) async with self.client.request('POST', '/lookup', json={**body, 'sample_data': EXPECTED_SAMPLE_DATA['SV_WES']}) as resp: self.assertEqual(resp.status, 200) resp_json = await resp.json() self.assertDictEqual(resp_json, { - **GCNV_VARIANT4, 'numExon': 8, 'end': 38736268, 
'genotypes': { + **NO_GENOTYPE_GCNV_VARIANT, 'genotypes': { individual: {k: v for k, v in genotype.items() if k not in {'start', 'end', 'numExon', 'geneIds'}} for individual, genotype in GCNV_VARIANT4['genotypes'].items() } }) + async def test_multi_variant_lookup(self): + await self._test_multi_lookup(VARIANT_ID_SEARCH['variant_ids'], 'SNV_INDEL', [VARIANT1]) + + await self._test_multi_lookup([['7', 143270172, 'A', 'G']], 'SNV_INDEL', [GRCH37_VARIANT], genome_version='GRCh37') + + await self._test_multi_lookup([['M', 4429, 'G', 'A'], ['M', 14783, 'T', 'C']], 'MITO', [MITO_VARIANT1, MITO_VARIANT3]) + + await self._test_multi_lookup( + ['cohort_2911.chr1.final_cleanup_INS_chr1_160', 'phase2_DEL_chr14_4640'], + 'SV_WGS', [SV_VARIANT2, SV_VARIANT4], + ) + + await self._test_multi_lookup(['suffix_140608_DUP'], 'SV_WES', [NO_GENOTYPE_GCNV_VARIANT]) + + async def _test_multi_lookup(self, variant_ids, data_type, results, genome_version='GRCh38'): + body = {'genome_version': genome_version, 'data_type': data_type, 'variant_ids': variant_ids} + async with self.client.request('POST', '/multi_lookup', json=body) as resp: + self.assertEqual(resp.status, 200) + resp_json = await resp.json() + self.assertDictEqual(resp_json, {'results': [ + {k: v for k, v in variant.items() if k not in {'familyGuids', 'genotypes', 'genotypeFilters'}} + for variant in results + ]}) + async def test_frequency_filter(self): sv_callset_filter = {'sv_callset': {'af': 0.05}} await self._assert_expected_search( diff --git a/hail_search/web_app.py b/hail_search/web_app.py index 8ccfe429a9..a4ef765c04 100644 --- a/hail_search/web_app.py +++ b/hail_search/web_app.py @@ -9,7 +9,7 @@ import traceback from typing import Callable -from hail_search.search import search_hail_backend, load_globals, lookup_variant +from hail_search.search import search_hail_backend, load_globals, lookup_variant, lookup_variants logger = logging.getLogger(__name__) @@ -64,6 +64,11 @@ async def lookup(request: web.Request) -> 
web.Response: return web.json_response(result, dumps=hl_json_dumps) +async def multi_lookup(request: web.Request) -> web.Response: + result = await sync_to_async_hail_query(request, lookup_variants) + return web.json_response({'results': result}, dumps=hl_json_dumps) + + async def status(request: web.Request) -> web.Response: return web.json_response({'success': True}) @@ -84,6 +89,7 @@ async def init_web_app(): web.post('/search', search), web.post('/gene_counts', gene_counts), web.post('/lookup', lookup), + web.post('/multi_lookup', multi_lookup), ]) # The idea here is to run the hail queries off the main thread so that the # event loop stays live and the /status check is responsive. We only From 0085d75169210a5958e3f27c031b108381f71a6b Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 11:37:35 -0500 Subject: [PATCH 02/28] add multi lookup helper func --- .../commands/check_for_new_samples_from_pipeline.py | 1 + seqr/utils/search/hail_search_utils.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py index ad87d71850..f0cd703ccf 100644 --- a/seqr/management/commands/check_for_new_samples_from_pipeline.py +++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py @@ -8,6 +8,7 @@ from seqr.models import Family, Sample, Project from seqr.utils.file_utils import file_iter, does_file_exist from seqr.utils.search.add_data_utils import notify_search_data_loaded +from seqr.utils.search.hail_search_utils import hail_variant_multi_lookup from seqr.views.utils.dataset_utils import match_and_update_search_samples from seqr.views.utils.variant_utils import reset_cached_search_results, update_projects_saved_variant_json diff --git a/seqr/utils/search/hail_search_utils.py b/seqr/utils/search/hail_search_utils.py index 7eb1c53cda..b144c8e5b8 100644 --- a/seqr/utils/search/hail_search_utils.py +++ 
b/seqr/utils/search/hail_search_utils.py @@ -110,6 +110,12 @@ def hail_variant_lookup(user, variant_id, samples=None, dataset_type=Sample.DATA return variants +def hail_variant_multi_lookup(user_email, variant_ids, data_type, genome_version): + body = {'genome_version': genome_version, 'data_type': data_type, 'variant_ids': variant_ids} + response_json = _execute_search(body, user=None, user_email=user_email, path='multi_lookup') + return response_json['results'] + + def _format_search_body(samples, genome_version, num_results, search): search_body = { 'genome_version': GENOME_VERSION_LOOKUP[genome_version], From 0e9d4be8a48abe6e074206c58f0d1d62a510a751 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 12:30:57 -0500 Subject: [PATCH 03/28] update annotation for additional projects and families --- .../check_for_new_samples_from_pipeline.py | 63 ++++++++++++++++--- .../commands/reload_saved_variant_json.py | 6 +- seqr/utils/search/add_data_utils.py | 5 +- seqr/views/utils/variant_utils.py | 25 ++++---- 4 files changed, 73 insertions(+), 26 deletions(-) diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py index f0cd703ccf..921b4dc7d7 100644 --- a/seqr/management/commands/check_for_new_samples_from_pipeline.py +++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py @@ -1,21 +1,24 @@ from collections import defaultdict from django.contrib.postgres.aggregates import ArrayAgg from django.core.management.base import BaseCommand, CommandError +from django.db.models.functions import JSONObject import json import logging from reference_data.models import GENOME_VERSION_LOOKUP -from seqr.models import Family, Sample, Project +from seqr.models import Family, Sample, SavedVariant from seqr.utils.file_utils import file_iter, does_file_exist from seqr.utils.search.add_data_utils import notify_search_data_loaded from seqr.utils.search.hail_search_utils import 
hail_variant_multi_lookup from seqr.views.utils.dataset_utils import match_and_update_search_samples -from seqr.views.utils.variant_utils import reset_cached_search_results, update_projects_saved_variant_json +from seqr.views.utils.variant_utils import reset_cached_search_results, update_projects_saved_variant_json, \ + saved_variants_dataset_type_filter logger = logging.getLogger(__name__) GS_PATH_TEMPLATE = 'gs://seqr-hail-search-data/v03/{path}/runs/{version}/' DATASET_TYPE_MAP = {'GCNV': Sample.DATASET_TYPE_SV_CALLS} +USER_EMAIL = 'manage_command' class Command(BaseCommand): @@ -84,20 +87,60 @@ def handle(self, *args, **options): reset_cached_search_results(project=None) # Send loading notifications + update_sample_data_by_project = { + s['individual__family__project']: s for s in updated_samples.values('individual__family__project').annotate( + samples=ArrayAgg(JSONObject(sample_id='sample_id', individual_id='individual_id')), + family_guids=ArrayAgg('individual__family__guid', distinct=True), + ) + } + updated_project_families = [] + updated_families = set() for project, sample_ids in samples_by_project.items(): - project_updated_samples = updated_samples.filter(individual__family__project=project) + project_sample_data = update_sample_data_by_project[project.id] notify_search_data_loaded( project, dataset_type, sample_type, inactivated_sample_guids, - updated_samples=project_updated_samples, num_samples=len(sample_ids), + updated_samples=project_sample_data['samples'], num_samples=len(sample_ids), ) + project_families = project_sample_data['family_guids'] + updated_families.update(project_families) + updated_project_families.append((project.id, project.name, project_families)) # Reload saved variant JSON - updated_annotation_samples = Sample.objects.filter(is_active=True, dataset_type=dataset_type) + update_projects_saved_variant_json(updated_project_families, user_email=USER_EMAIL, dataset_type=dataset_type) + 
self._reload_shared_variant_annotations(updated_families, dataset_type, sample_type, genome_version) + + logger.info('DONE') + + def _reload_shared_variant_annotations(self, updated_families, dataset_type, sample_type, genome_version): + data_type = dataset_type + updated_annotation_samples = Sample.objects.filter( + is_active=True, dataset_type=dataset_type, + individual__family__project__genome_version=genome_version.replace('GRCh', ''), + ).exclude(individual__family__guid__in=updated_families) if dataset_type == Sample.DATASET_TYPE_SV_CALLS: updated_annotation_samples = updated_annotation_samples.filter(sample_type=sample_type) - projects = Project.objects.filter( - genome_version=genome_version.replace('GRCh', ''), family__individual__sample__in=updated_annotation_samples, - ).annotate(families=ArrayAgg('family__family_id', distinct=True)).values_list('id', 'name', 'families') - update_projects_saved_variant_json(projects, user_email='manage_command', dataset_type=dataset_type) + data_type = f'{dataset_type}_{sample_type}' - logger.info('DONE') + variant_models = SavedVariant.objects.filter( + family_id__in=updated_annotation_samples.values_list('individual__family', flat=True).distinct(), + **saved_variants_dataset_type_filter(dataset_type), + ) + if not variant_models: + logger.info('No additional saved variants to update') + return + + variants_by_id = defaultdict(list) + for v in variant_models: + variants_by_id[v.variant_id].append(v) + + logger.info(f'Reloading shared annotations for {len(variant_models)} saved variants ({len(variants_by_id)} unique)') + updated_variants = hail_variant_multi_lookup(USER_EMAIL, variants_by_id.keys(), data_type, genome_version) + logger.info(f'Fetched {len(updated_variants)} variants') + updated_variant_models = [] + for variant in updated_variants: + for variant_model in variants_by_id[variant['variantId']]: + variant_model.saved_variant_json.update(variant) + updated_variant_models.append(variant_model) + + 
SavedVariant.objects.bulk_update(updated_variant_models, ['saved_variant_json']) + logger.info(f'Updated {len(updated_variant_models)} saved variants') diff --git a/seqr/management/commands/reload_saved_variant_json.py b/seqr/management/commands/reload_saved_variant_json.py index e884079d2d..2659fd37b3 100644 --- a/seqr/management/commands/reload_saved_variant_json.py +++ b/seqr/management/commands/reload_saved_variant_json.py @@ -13,12 +13,12 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('projects', nargs="*", help='Project(s) to transfer. If not specified, defaults to all projects.') - parser.add_argument('--family-id', help='optional family to reload variants for') + parser.add_argument('--family-guid', help='optional family to reload variants for') def handle(self, *args, **options): """transfer project""" projects_to_process = options['projects'] - family_id = options['family_id'] + family_guid = options['family_guid'] if projects_to_process: projects = Project.objects.filter(Q(name__in=projects_to_process) | Q(guid__in=projects_to_process)) @@ -27,7 +27,7 @@ def handle(self, *args, **options): projects = Project.objects.all() logging.info("Processing all %s projects" % len(projects)) - family_ids = [family_id] if family_id else None + family_ids = [family_guid] if family_id else None project_list = [(*project, family_ids) for project in projects.values_list('id', 'name')] update_projects_saved_variant_json(project_list, user_email='manage_command') logger.info("Done") diff --git a/seqr/utils/search/add_data_utils.py b/seqr/utils/search/add_data_utils.py index 654f4e5c6f..91366a6c74 100644 --- a/seqr/utils/search/add_data_utils.py +++ b/seqr/utils/search/add_data_utils.py @@ -43,7 +43,8 @@ def add_new_es_search_samples(request_json, project, user, notify=False, expecte if notify: num_samples = len(sample_ids) - num_skipped - notify_search_data_loaded(project, dataset_type, sample_type, inactivated_sample_guids, 
updated_samples, num_samples) + updated_sample_data = updated_samples.values('sample_id', 'individual_id') + notify_search_data_loaded(project, dataset_type, sample_type, inactivated_sample_guids, updated_sample_data, num_samples) return inactivated_sample_guids, updated_family_guids, updated_samples @@ -52,7 +53,7 @@ def notify_search_data_loaded(project, dataset_type, sample_type, inactivated_sa is_internal = not project_has_anvil(project) or is_internal_anvil_project(project) previous_loaded_individuals = set(Sample.objects.filter(guid__in=inactivated_sample_guids).values_list('individual_id', flat=True)) - new_sample_ids = [sample.sample_id for sample in updated_samples if sample.individual_id not in previous_loaded_individuals] + new_sample_ids = [sample['sample_id'] for sample in updated_samples if sample['individual_id'] not in previous_loaded_individuals] url = f'{BASE_URL}project/{project.guid}/project_page' msg_dataset_type = '' if dataset_type == Sample.DATASET_TYPE_VARIANT_CALLS else f' {dataset_type}' diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index 02f17ff70a..ff955ae632 100644 --- a/seqr/views/utils/variant_utils.py +++ b/seqr/views/utils/variant_utils.py @@ -34,10 +34,10 @@ def update_projects_saved_variant_json(projects, user_email, **kwargs): skipped = {} error = {} logger.info(f'Reloading saved variants in {len(projects)} projects') - for project_id, project_name, family_ids in tqdm(projects, unit=' project'): + for project_id, project_name, family_guids in tqdm(projects, unit=' project'): try: updated_saved_variant_guids = update_project_saved_variant_json( - project_id, user_email=user_email, family_ids=family_ids, **kwargs) + project_id, user_email=user_email, family_guids=family_guids, **kwargs) if updated_saved_variant_guids is None: skipped[project_name] = True else: @@ -61,18 +61,13 @@ def update_projects_saved_variant_json(projects, user_email, **kwargs): logger.info(f' {k}: {v}') -def 
update_project_saved_variant_json(project_id, family_ids=None, dataset_type=None, user=None, user_email=None): +def update_project_saved_variant_json(project_id, family_guids=None, dataset_type=None, user=None, user_email=None): saved_variants = SavedVariant.objects.filter(family__project_id=project_id).select_related('family') - if family_ids: - saved_variants = saved_variants.filter(family__family_id__in=family_ids) + if family_guids: + saved_variants = saved_variants.filter(family__guid__in=family_guids) if dataset_type: - xpos_filter_key = 'xpos__gte' if dataset_type == Sample.DATASET_TYPE_MITO_CALLS else 'xpos__lt' - dataset_filter = { - 'alt__isnull': dataset_type == Sample.DATASET_TYPE_SV_CALLS, - xpos_filter_key: get_xpos('M', 1), - } - saved_variants = saved_variants.filter(**dataset_filter) + saved_variants = saved_variants.filter(**saved_variants_dataset_type_filter(dataset_type)) if not saved_variants: return None @@ -102,6 +97,14 @@ def update_project_saved_variant_json(project_id, family_ids=None, dataset_type= return updated_saved_variant_guids +def saved_variants_dataset_type_filter(dataset_type): + xpos_filter_key = 'xpos__gte' if dataset_type == Sample.DATASET_TYPE_MITO_CALLS else 'xpos__lt' + return { + 'alt__isnull': dataset_type == Sample.DATASET_TYPE_SV_CALLS, + xpos_filter_key: get_xpos('M', 1), + } + + def parse_saved_variant_json(variant_json, family): if 'xpos' not in variant_json: variant_json['xpos'] = get_xpos(variant_json['chrom'], variant_json['pos']) From 7362de2892ee3c3b2dd715c8f0b563df7a0906e2 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 12:35:42 -0500 Subject: [PATCH 04/28] update reload tests --- seqr/management/commands/reload_saved_variant_json.py | 2 +- seqr/management/tests/reload_saved_variant_json_tests.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/seqr/management/commands/reload_saved_variant_json.py b/seqr/management/commands/reload_saved_variant_json.py index 
2659fd37b3..ccb8ff82d3 100644 --- a/seqr/management/commands/reload_saved_variant_json.py +++ b/seqr/management/commands/reload_saved_variant_json.py @@ -27,7 +27,7 @@ def handle(self, *args, **options): projects = Project.objects.all() logging.info("Processing all %s projects" % len(projects)) - family_ids = [family_guid] if family_id else None + family_ids = [family_guid] if family_guid else None project_list = [(*project, family_ids) for project in projects.values_list('id', 'name')] update_projects_saved_variant_json(project_list, user_email='manage_command') logger.info("Done") diff --git a/seqr/management/tests/reload_saved_variant_json_tests.py b/seqr/management/tests/reload_saved_variant_json_tests.py index f540a9dc44..36b5b81cd2 100644 --- a/seqr/management/tests/reload_saved_variant_json_tests.py +++ b/seqr/management/tests/reload_saved_variant_json_tests.py @@ -7,7 +7,7 @@ PROJECT_NAME = '1kg project n\u00e5me with uni\u00e7\u00f8de' PROJECT_GUID = 'R0001_1kg' -FAMILY_ID = '1' +FAMILY_GUID = 'F000001_1' class ReloadSavedVariantJsonTest(TestCase): @@ -23,7 +23,7 @@ def test_with_param_command(self, mock_get_variants, mock_logger): # Test with a specific project and a family id. 
call_command('reload_saved_variant_json', PROJECT_NAME, - '--family-id={}'.format(FAMILY_ID)) + '--family-guid={}'.format(FAMILY_GUID)) family_1 = Family.objects.get(id=1) mock_get_variants.assert_called_with( @@ -70,7 +70,7 @@ def test_with_param_command(self, mock_get_variants, mock_logger): mock_get_variants.side_effect = Exception("Database error.") call_command('reload_saved_variant_json', PROJECT_GUID, - '--family-id={}'.format(FAMILY_ID)) + '--family-guid={}'.format(FAMILY_GUID)) mock_get_variants.assert_called_with([family_1], ['1-1562437-G-C', '1-46859832-G-A', '21-3343353-GAGA-G'], user=None, user_email='manage_command') From dcd7452328135ee1b6c92d9f7e8c569be489c278 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 12:50:48 -0500 Subject: [PATCH 05/28] update gcnv test --- ...eck_for_new_samples_from_pipeline_tests.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index 2f68a63fb2..ed6005962a 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -76,7 +76,7 @@ def setUp(self): self.addCleanup(patcher.stop) super().setUp() - def _test_success(self, path, metadata, dataset_type, sample_guids, reload_calls, additional_requests=0, num_projects=1): + def _test_success(self, path, metadata, dataset_type, sample_guids, reload_calls, reload_annotations_logs, additional_requests=0): self.mock_subprocess.return_value.stdout = [json.dumps(metadata).encode()] self.mock_subprocess.return_value.wait.return_value = 0 @@ -89,7 +89,8 @@ def _test_success(self, path, metadata, dataset_type, sample_guids, reload_calls self.mock_logger.info.assert_has_calls([ mock.call(f'Loading new samples from {path}: auto__2023-08-08'), - mock.call(f'Loading {len(sample_guids)} WES {dataset_type} 
samples in {num_projects} projects'), + mock.call(f'Loading {len(sample_guids)} WES {dataset_type} samples in 2 projects'), + ] + [mock.call(log) for log in reload_annotations_logs] + [ mock.call('DONE'), ]) self.mock_logger.warining.assert_not_called() @@ -186,7 +187,7 @@ def test_command(self, mock_email, mock_airtable_utils): search_body = { 'genome_version': 'GRCh38', 'num_results': 1, 'variant_ids': [['12', 48367227, 'TC', 'T']], 'variant_keys': [], } - self._test_success('GRCh38/SNV_INDEL', metadata, dataset_type='SNV_INDEL', num_projects=2, sample_guids={ + self._test_success('GRCh38/SNV_INDEL', metadata, dataset_type='SNV_INDEL', sample_guids={ EXISTING_SAMPLE_GUID, REPLACED_SAMPLE_GUID, NEW_SAMPLE_GUID_P3, NEW_SAMPLE_GUID_P4, }, additional_requests=1, reload_calls=[ {**search_body, 'sample_data': {'SNV_INDEL': [ @@ -196,7 +197,7 @@ def test_command(self, mock_email, mock_airtable_utils): {**search_body, 'sample_data': {'SNV_INDEL': [ {'individual_guid': 'I000018_na21234', 'family_guid': 'F000014_14', 'project_guid': 'R0004_non_analyst_project', 'affected': 'A', 'sample_id': 'NA21234'}, ]}}, - ]) + ], reload_annotations_logs=[]) # TODO old_data_sample_guid = 'S000143_na20885' self.assertFalse(Sample.objects.get(guid=old_data_sample_guid).is_active) @@ -307,16 +308,20 @@ def test_gcnv_command(self): metadata = { 'callsets': ['1kg.vcf.gz'], 'sample_type': 'WES', - 'family_samples': {'F000012_12': ['NA20889']}, + 'family_samples': {'F000004_4': ['NA20872'], 'F000012_12': ['NA20889']}, } - self._test_success('GRCh37/GCNV', metadata, dataset_type='SV', sample_guids={f'S{GUID_ID}_NA20889'}, reload_calls=[{ + self._test_success('GRCh37/GCNV', metadata, dataset_type='SV', sample_guids={f'S{GUID_ID}_NA20872', f'S{GUID_ID}_NA20889'}, reload_calls=[{ 'genome_version': 'GRCh37', 'num_results': 1, 'variant_ids': [], 'variant_keys': ['prefix_19107_DEL'], 'sample_data': {'SV_WES': [{'individual_guid': 'I000017_na20889', 'family_guid': 'F000012_12', 'project_guid': 
'R0003_test', 'affected': 'A', 'sample_id': 'NA20889'}]}, - }]) + }], reload_annotations_logs=['No additional saved variants to update']) - self.mock_send_slack.assert_called_once_with('seqr-data-loading', - f'1 new WES SV samples are loaded in {SEQR_URL}project/{PROJECT_GUID}/project_page\n```NA20889```', - ) + self.mock_send_slack.assert_has_calls([ + mock.call( + 'seqr-data-loading', f'1 new WES SV samples are loaded in {SEQR_URL}project/R0001_1kg/project_page\n```NA20872```', + ), mock.call( + 'seqr-data-loading', f'1 new WES SV samples are loaded in {SEQR_URL}project/{PROJECT_GUID}/project_page\n```NA20889```', + ), + ]) self.mock_utils_logger.error.assert_called_with('Error in project Test Reprocessed Project: Bad Request') self.mock_utils_logger.info.assert_has_calls([ From 485507f3fec7f55369707d7d26f3250aebe7704c Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 12:52:00 -0500 Subject: [PATCH 06/28] update gcnv test --- .../tests/check_for_new_samples_from_pipeline_tests.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index ed6005962a..deaf39d5da 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -171,9 +171,7 @@ def test_command(self, mock_email, mock_airtable_utils): str(ce.exception), 'Data has genome version GRCh38 but the following projects have conflicting versions: R0003_test (GRCh37)') - project = Project.objects.get(guid=PROJECT_GUID) - project.genome_version = 38 - project.save() + Project.objects.filter(id__in=[3]).update(genome_version=38) with self.assertRaises(ValueError) as ce: call_command('check_for_new_samples_from_pipeline', 'GRCh38/SNV_INDEL', 'auto__2023-08-08') From 7742ab46b7c4a9259b7c26aa018a4e9db7cbe796 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 
2024 13:11:22 -0500 Subject: [PATCH 07/28] test multi lookup --- .../check_for_new_samples_from_pipeline.py | 2 +- ...eck_for_new_samples_from_pipeline_tests.py | 34 +++++++++++++++---- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py index 921b4dc7d7..9efc1bf04a 100644 --- a/seqr/management/commands/check_for_new_samples_from_pipeline.py +++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py @@ -134,7 +134,7 @@ def _reload_shared_variant_annotations(self, updated_families, dataset_type, sam variants_by_id[v.variant_id].append(v) logger.info(f'Reloading shared annotations for {len(variant_models)} saved variants ({len(variants_by_id)} unique)') - updated_variants = hail_variant_multi_lookup(USER_EMAIL, variants_by_id.keys(), data_type, genome_version) + updated_variants = hail_variant_multi_lookup(USER_EMAIL, sorted(variants_by_id.keys()), data_type, genome_version) logger.info(f'Fetched {len(updated_variants)} variants') updated_variant_models = [] for variant in updated_variants: diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index deaf39d5da..04b48f4d53 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -76,7 +76,7 @@ def setUp(self): self.addCleanup(patcher.stop) super().setUp() - def _test_success(self, path, metadata, dataset_type, sample_guids, reload_calls, reload_annotations_logs, additional_requests=0): + def _test_success(self, path, metadata, dataset_type, sample_guids, reload_calls, reload_annotations_logs, has_additional_requests=False): self.mock_subprocess.return_value.stdout = [json.dumps(metadata).encode()] self.mock_subprocess.return_value.wait.return_value = 0 @@ -102,9 +102,9 @@ def 
_test_success(self, path, metadata, dataset_type, sample_guids, reload_calls ]) # Test reload saved variants - self.assertEqual(len(responses.calls), len(reload_calls) + additional_requests) + self.assertEqual(len(responses.calls), len(reload_calls) + (2 if has_additional_requests else 0)) for i, call in enumerate(reload_calls): - resp = responses.calls[i+additional_requests] + resp = responses.calls[i+(1 if has_additional_requests else 0)] self.assertEqual(resp.request.url, f'{MOCK_HAIL_HOST}:5000/search') self.assertEqual(resp.request.headers.get('From'), 'manage_command') self.assertDictEqual(json.loads(resp.request.body), call) @@ -134,6 +134,9 @@ def test_command(self, mock_email, mock_airtable_utils): 'results': [{'variantId': '12-48367227-TC-T', 'familyGuids': ['F000014_14'], 'updated_field': 'updated_value'}], 'total': 1, }) + responses.add(responses.POST, f'{MOCK_HAIL_HOST}:5000/multi_lookup', status=200, json={ + 'results': [{'variantId': '21-3343353-GAGA-G', 'updated_new_field': 'updated_value', 'rsid': 'rs123'}], + }) # Test errors with self.assertRaises(CommandError) as ce: @@ -171,7 +174,7 @@ def test_command(self, mock_email, mock_airtable_utils): str(ce.exception), 'Data has genome version GRCh38 but the following projects have conflicting versions: R0003_test (GRCh37)') - Project.objects.filter(id__in=[3]).update(genome_version=38) + Project.objects.filter(id__in=[1, 3]).update(genome_version=38) with self.assertRaises(ValueError) as ce: call_command('check_for_new_samples_from_pipeline', 'GRCh38/SNV_INDEL', 'auto__2023-08-08') @@ -187,7 +190,7 @@ def test_command(self, mock_email, mock_airtable_utils): } self._test_success('GRCh38/SNV_INDEL', metadata, dataset_type='SNV_INDEL', sample_guids={ EXISTING_SAMPLE_GUID, REPLACED_SAMPLE_GUID, NEW_SAMPLE_GUID_P3, NEW_SAMPLE_GUID_P4, - }, additional_requests=1, reload_calls=[ + }, has_additional_requests=True, reload_calls=[ {**search_body, 'sample_data': {'SNV_INDEL': [ {'individual_guid': 
'I000017_na20889', 'family_guid': 'F000012_12', 'project_guid': 'R0003_test', 'affected': 'A', 'sample_id': 'NA20889'}, {'individual_guid': 'I000016_na20888', 'family_guid': 'F000012_12', 'project_guid': 'R0003_test', 'affected': 'A', 'sample_id': 'NA20888'}, @@ -195,7 +198,9 @@ def test_command(self, mock_email, mock_airtable_utils): {**search_body, 'sample_data': {'SNV_INDEL': [ {'individual_guid': 'I000018_na21234', 'family_guid': 'F000014_14', 'project_guid': 'R0004_non_analyst_project', 'affected': 'A', 'sample_id': 'NA21234'}, ]}}, - ], reload_annotations_logs=[]) # TODO + ], reload_annotations_logs=[ + 'Reloading shared annotations for 4 saved variants (4 unique)', 'Fetched 1 variants', 'Updated 1 saved variants', + ]) old_data_sample_guid = 'S000143_na20885' self.assertFalse(Sample.objects.get(guid=old_data_sample_guid).is_active) @@ -237,10 +242,27 @@ def test_command(self, mock_email, mock_airtable_utils): self.assertEqual(Family.objects.get(guid='F000014_14').analysis_status, 'Rncc') # Test SavedVariant model updated + multi_lookup_request = responses.calls[3].request + self.assertEqual(multi_lookup_request.url, f'{MOCK_HAIL_HOST}:5000/multi_lookup') + self.assertEqual(multi_lookup_request.headers.get('From'), 'manage_command') + # TODO reuse family specific variant where needed (12-48367227-TC-T) + self.assertDictEqual(json.loads(multi_lookup_request.body), { + 'genome_version': 'GRCh38', + 'data_type': 'SNV_INDEL', + 'variant_ids': ['1-1562437-G-C', '1-46859832-G-A', '12-48367227-TC-T', '21-3343353-GAGA-G'], + }) + updated_variants = SavedVariant.objects.filter(saved_variant_json__updated_field='updated_value') self.assertEqual(len(updated_variants), 1) self.assertEqual(updated_variants.first().guid, 'SV0000006_1248367227_r0004_non') + annotation_updated_json = SavedVariant.objects.get(guid='SV0000001_2103343353_r0390_100').saved_variant_json + self.assertEqual(len(annotation_updated_json), 20) + 
self.assertEqual(annotation_updated_json['updated_new_field'], 'updated_value') + self.assertEqual(annotation_updated_json['rsid'], 'rs123') + self.assertEqual(annotation_updated_json['mainTranscriptId'], 'ENST00000258436') + self.assertEqual(len(annotation_updated_json['genotypes']), 3) + self.mock_utils_logger.error.assert_not_called() self.mock_utils_logger.info.assert_has_calls([ mock.call('Updated 0 variants for project Test Reprocessed Project'), From 62e06f6ffae1344c6f70ea734e7b8a252e8a7def Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 15:40:01 -0500 Subject: [PATCH 08/28] do not refetch updated variants --- .../check_for_new_samples_from_pipeline.py | 26 ++++++++++++++----- ...eck_for_new_samples_from_pipeline_tests.py | 18 +++++++++---- seqr/views/utils/variant_utils.py | 17 +++++++----- 3 files changed, 42 insertions(+), 19 deletions(-) diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py index 9efc1bf04a..0ee5626438 100644 --- a/seqr/management/commands/check_for_new_samples_from_pipeline.py +++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py @@ -106,12 +106,15 @@ def handle(self, *args, **options): updated_project_families.append((project.id, project.name, project_families)) # Reload saved variant JSON - update_projects_saved_variant_json(updated_project_families, user_email=USER_EMAIL, dataset_type=dataset_type) - self._reload_shared_variant_annotations(updated_families, dataset_type, sample_type, genome_version) + updated_variants_by_id = update_projects_saved_variant_json( + updated_project_families, user_email=USER_EMAIL, dataset_type=dataset_type) + self._reload_shared_variant_annotations( + updated_variants_by_id, updated_families, dataset_type, sample_type, genome_version) logger.info('DONE') - def _reload_shared_variant_annotations(self, updated_families, dataset_type, sample_type, genome_version): + @staticmethod + def 
_reload_shared_variant_annotations(updated_variants_by_id, updated_families, dataset_type, sample_type, genome_version): data_type = dataset_type updated_annotation_samples = Sample.objects.filter( is_active=True, dataset_type=dataset_type, @@ -134,11 +137,20 @@ def _reload_shared_variant_annotations(self, updated_families, dataset_type, sam variants_by_id[v.variant_id].append(v) logger.info(f'Reloading shared annotations for {len(variant_models)} saved variants ({len(variants_by_id)} unique)') - updated_variants = hail_variant_multi_lookup(USER_EMAIL, sorted(variants_by_id.keys()), data_type, genome_version) - logger.info(f'Fetched {len(updated_variants)} variants') + + updated_variants_by_id = { + variant_id: {k: v for k, v in variant.items() if k not in {'familyGuids', 'genotypes', 'genotypeFilters'}} + for variant_id, variant in updated_variants_by_id.items() + } + fetch_variant_ids = set(variants_by_id.keys()) - set(updated_variants_by_id.keys()) + if fetch_variant_ids: + updated_variants = hail_variant_multi_lookup(USER_EMAIL, sorted(fetch_variant_ids), data_type, genome_version) + logger.info(f'Fetched {len(updated_variants)} additional variants') + updated_variants_by_id.update({variant['variantId']: variant for variant in updated_variants}) + updated_variant_models = [] - for variant in updated_variants: - for variant_model in variants_by_id[variant['variantId']]: + for variant_id, variant in updated_variants_by_id.items(): + for variant_model in variants_by_id[variant_id]: variant_model.saved_variant_json.update(variant) updated_variant_models.append(variant_model) diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index 04b48f4d53..89c4679502 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -199,7 +199,7 @@ def test_command(self, mock_email, 
mock_airtable_utils): {'individual_guid': 'I000018_na21234', 'family_guid': 'F000014_14', 'project_guid': 'R0004_non_analyst_project', 'affected': 'A', 'sample_id': 'NA21234'}, ]}}, ], reload_annotations_logs=[ - 'Reloading shared annotations for 4 saved variants (4 unique)', 'Fetched 1 variants', 'Updated 1 saved variants', + 'Reloading shared annotations for 4 saved variants (4 unique)', 'Fetched 1 additional variants', 'Updated 2 saved variants', ]) old_data_sample_guid = 'S000143_na20885' @@ -245,16 +245,24 @@ def test_command(self, mock_email, mock_airtable_utils): multi_lookup_request = responses.calls[3].request self.assertEqual(multi_lookup_request.url, f'{MOCK_HAIL_HOST}:5000/multi_lookup') self.assertEqual(multi_lookup_request.headers.get('From'), 'manage_command') - # TODO reuse family specific variant where needed (12-48367227-TC-T) self.assertDictEqual(json.loads(multi_lookup_request.body), { 'genome_version': 'GRCh38', 'data_type': 'SNV_INDEL', - 'variant_ids': ['1-1562437-G-C', '1-46859832-G-A', '12-48367227-TC-T', '21-3343353-GAGA-G'], + 'variant_ids': ['1-1562437-G-C', '1-46859832-G-A', '21-3343353-GAGA-G'], }) updated_variants = SavedVariant.objects.filter(saved_variant_json__updated_field='updated_value') - self.assertEqual(len(updated_variants), 1) - self.assertEqual(updated_variants.first().guid, 'SV0000006_1248367227_r0004_non') + self.assertEqual(len(updated_variants), 2) + self.assertSetEqual( + {v.guid for v in updated_variants}, + {'SV0000006_1248367227_r0004_non', 'SV0000002_1248367227_r0390_100'} + ) + reloaded_variant = next(v for v in updated_variants if v.guid == 'SV0000006_1248367227_r0004_non') + annotation_updated_variant = next(v for v in updated_variants if v.guid == 'SV0000002_1248367227_r0390_100') + self.assertEqual(len(reloaded_variant.saved_variant_json), 3) + self.assertListEqual(reloaded_variant.saved_variant_json['familyGuids'], ['F000014_14']) + self.assertEqual(len(annotation_updated_variant.saved_variant_json), 18) + 
self.assertListEqual(annotation_updated_variant.saved_variant_json['familyGuids'], ['F000001_1']) annotation_updated_json = SavedVariant.objects.get(guid='SV0000001_2103343353_r0390_100').saved_variant_json self.assertEqual(len(annotation_updated_json), 20) diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index ff955ae632..3daba835ca 100644 --- a/seqr/views/utils/variant_utils.py +++ b/seqr/views/utils/variant_utils.py @@ -33,16 +33,18 @@ def update_projects_saved_variant_json(projects, user_email, **kwargs): success = {} skipped = {} error = {} + updated_variants_by_id = {} logger.info(f'Reloading saved variants in {len(projects)} projects') for project_id, project_name, family_guids in tqdm(projects, unit=' project'): try: - updated_saved_variant_guids = update_project_saved_variant_json( + updated_saved_variants = update_project_saved_variant_json( project_id, user_email=user_email, family_guids=family_guids, **kwargs) - if updated_saved_variant_guids is None: + if updated_saved_variants is None: skipped[project_name] = True else: - success[project_name] = len(updated_saved_variant_guids) - logger.info(f'Updated {len(updated_saved_variant_guids)} variants for project {project_name}') + success[project_name] = len(updated_saved_variants) + logger.info(f'Updated {len(updated_saved_variants)} variants for project {project_name}') + updated_variants_by_id.update({v.variant_id: v.saved_variant_json for v in updated_saved_variants.values()}) except Exception as e: traceback_message = traceback.format_exc() logger.error(traceback_message) @@ -59,6 +61,7 @@ def update_projects_saved_variant_json(projects, user_email, **kwargs): logger.info(f'{len(error)} failed projects') for k, v in error.items(): logger.info(f' {k}: {v}') + return updated_variants_by_id def update_project_saved_variant_json(project_id, family_guids=None, dataset_type=None, user=None, user_email=None): @@ -86,15 +89,15 @@ def 
update_project_saved_variant_json(project_id, family_guids=None, dataset_typ for sub_var_ids in [variant_ids[i:i+MAX_VARIANTS_FETCH] for i in range(0, len(variant_ids), MAX_VARIANTS_FETCH)]: variants_json += get_variants_for_variant_ids(families, sub_var_ids, user=user, user_email=user_email) - updated_saved_variant_guids = [] + updated_saved_variants = {} for var in variants_json: for family_guid in var['familyGuids']: saved_variant = saved_variants_map.get((var['variantId'], family_guid)) if saved_variant: update_model_from_json(saved_variant, {'saved_variant_json': var}, user) - updated_saved_variant_guids.append(saved_variant.guid) + updated_saved_variants[saved_variant.guid] = saved_variant - return updated_saved_variant_guids + return updated_saved_variants def saved_variants_dataset_type_filter(dataset_type): From 7ae6c4bd0d196561b5d91d61c858c88f025411b9 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 16:53:10 -0500 Subject: [PATCH 09/28] Revert "Revert "Multi project gene search redesign"" This reverts commit 226b348f2f04b89ad3e9be6c6edaef50c24e9dbf. 
--- hail_search/queries/base.py | 12 ++++++++++-- hail_search/queries/snv_indel.py | 4 ++-- hail_search/queries/sv.py | 11 ----------- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index 5dc9f6839d..0b395db517 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -281,7 +281,6 @@ def _parse_sample_data(self, sample_data): def _load_filtered_project_hts(self, project_samples, skip_all_missing=False, **kwargs): filtered_project_hts = [] exception_messages = set() - num_projects = len(project_samples) for i, (project_guid, project_sample_data) in enumerate(project_samples.items()): project_ht = self._read_table( f'projects/{project_guid}.ht', @@ -292,7 +291,7 @@ def _load_filtered_project_hts(self, project_samples, skip_all_missing=False, ** continue try: filtered_project_hts.append( - (*self._filter_entries_table(project_ht, project_sample_data, num_projects=num_projects, **kwargs), len(project_sample_data)) + (*self._filter_entries_table(project_ht, project_sample_data, **kwargs), len(project_sample_data)) ) except HTTPBadRequest as e: exception_messages.add(e.reason) @@ -305,6 +304,15 @@ def _load_filtered_project_hts(self, project_samples, skip_all_missing=False, ** return filtered_project_hts def import_filtered_table(self, project_samples, num_families, intervals=None, **kwargs): + if len(project_samples) > 1 and (kwargs.get('parsed_intervals') or kwargs.get('padded_interval')): + # For multi-project interval search, faster to first read and filter the annotation table and then add entries + ht = self._read_table('annotations.ht') + ht = self._filter_annotated_table( + ht, parsed_intervals=kwargs.get('parsed_intervals'), padded_interval=kwargs.get('padded_interval'), + ) + self._load_table_kwargs['variant_ht'] = ht.select() + kwargs.update({'parsed_intervals': None, 'padded_interval': None}) + if num_families == 1: family_sample_data = 
list(project_samples.values())[0] family_guid = list(family_sample_data.keys())[0] diff --git a/hail_search/queries/snv_indel.py b/hail_search/queries/snv_indel.py index c41cdd4c99..c72812de80 100644 --- a/hail_search/queries/snv_indel.py +++ b/hail_search/queries/snv_indel.py @@ -81,9 +81,9 @@ class SnvIndelHailTableQuery(MitoHailTableQuery): ('is_gt_10_percent', 0.1), ]) - def _prefilter_entries_table(self, ht, *args, num_projects=1, **kwargs): + def _prefilter_entries_table(self, ht, *args, **kwargs): ht = super()._prefilter_entries_table(ht, *args, **kwargs) - if num_projects > 1 or not self._load_table_kwargs.get('_filter_intervals'): + if 'variant_ht' not in self._load_table_kwargs and not self._load_table_kwargs.get('_filter_intervals'): af_ht = self._get_loaded_filter_ht( GNOMAD_GENOMES_FIELD, 'high_af_variants.ht', self._get_gnomad_af_prefilter, **kwargs) if af_ht: diff --git a/hail_search/queries/sv.py b/hail_search/queries/sv.py index 0d50a7eb16..db39798174 100644 --- a/hail_search/queries/sv.py +++ b/hail_search/queries/sv.py @@ -56,17 +56,6 @@ class SvHailTableQuery(BaseHailTableQuery): def _get_sample_type(cls, *args): return cls.DATA_TYPE.split('_')[-1] - def import_filtered_table(self, project_samples, *args, parsed_intervals=None, padded_interval=None, **kwargs): - if len(project_samples) > 1 and (parsed_intervals or padded_interval): - # For multi-project interval search, faster to first read and filter the annotation table and then add entries - ht = self._read_table('annotations.ht') - ht = self._filter_annotated_table(ht, parsed_intervals=parsed_intervals, padded_interval=padded_interval) - self._load_table_kwargs['variant_ht'] = ht.select() - parsed_intervals = None - padded_interval = None - - return super().import_filtered_table(project_samples, *args, parsed_intervals=parsed_intervals, padded_interval=padded_interval, **kwargs) - def _filter_annotated_table(self, ht, *args, parsed_intervals=None, exclude_intervals=False, 
padded_interval=None, **kwargs): if parsed_intervals: interval_filter = hl.array(parsed_intervals).any(lambda interval: hl.if_else( From b79225550ad42b18a6b6e9ba98358591be6118ef Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 17:06:20 -0500 Subject: [PATCH 10/28] apply alll annotation filtering before join --- hail_search/queries/base.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index 0b395db517..6329e26a96 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -304,14 +304,12 @@ def _load_filtered_project_hts(self, project_samples, skip_all_missing=False, ** return filtered_project_hts def import_filtered_table(self, project_samples, num_families, intervals=None, **kwargs): - if len(project_samples) > 1 and (kwargs.get('parsed_intervals') or kwargs.get('padded_interval')): + use_annotations_ht_first = len(project_samples) > 1 and (kwargs.get('parsed_intervals') or kwargs.get('padded_interval')) + if use_annotations_ht_first: # For multi-project interval search, faster to first read and filter the annotation table and then add entries ht = self._read_table('annotations.ht') - ht = self._filter_annotated_table( - ht, parsed_intervals=kwargs.get('parsed_intervals'), padded_interval=kwargs.get('padded_interval'), - ) + ht = self._filter_annotated_table(ht, **kwargs, is_comp_het=self._has_comp_het_search) self._load_table_kwargs['variant_ht'] = ht.select() - kwargs.update({'parsed_intervals': None, 'padded_interval': None}) if num_families == 1: family_sample_data = list(project_samples.values())[0] @@ -346,13 +344,18 @@ def import_filtered_table(self, project_samples, num_families, intervals=None, * logger.info(f'Found {num_projects_added} {self.DATA_TYPE} projects with matched entries') if comp_het_families_ht is not None: - comp_het_ht = self._query_table_annotations(comp_het_families_ht, 
self._get_table_path('annotations.ht')) - self._comp_het_ht = self._filter_annotated_table(comp_het_ht, is_comp_het=True, **kwargs) + self._comp_het_ht = self._query_table_annotations(comp_het_families_ht, self._get_table_path('annotations.ht')) + if not use_annotations_ht_first: + self._comp_het_ht = self._filter_annotated_table(self._comp_het_ht, is_comp_het=True, **kwargs) self._comp_het_ht = self._filter_compound_hets() if families_ht is not None: - ht = self._query_table_annotations(families_ht, self._get_table_path('annotations.ht')) - self._ht = self._filter_annotated_table(ht, **kwargs) + self._ht = self._query_table_annotations(families_ht, self._get_table_path('annotations.ht')) + if not use_annotations_ht_first: + self._ht = self._filter_annotated_table(self._ht, **kwargs) + else: + # TODO handle self._has_comp_het_search filter annotations + pass def _add_project_ht(self, families_ht, project_ht, default, default_1=None): if default_1 is None: From 8578aa6de83a37857f4a0a7e048c86bac4461ddd Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 17:12:17 -0500 Subject: [PATCH 11/28] filter annotations --- hail_search/queries/base.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index 6329e26a96..0dfc659a80 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -353,9 +353,10 @@ def import_filtered_table(self, project_samples, num_families, intervals=None, * self._ht = self._query_table_annotations(families_ht, self._get_table_path('annotations.ht')) if not use_annotations_ht_first: self._ht = self._filter_annotated_table(self._ht, **kwargs) - else: - # TODO handle self._has_comp_het_search filter annotations - pass + elif self._has_comp_het_search: + primary_annotation_filters = self._get_annotation_filters(self._ht) + if primary_annotation_filters: + self._ht = self._ht.filter(hl.any(primary_annotation_filters)) def _add_project_ht(self, 
families_ht, project_ht, default, default_1=None): if default_1 is None: From f281e255d4f405a27484f3479bf2f918aa1843e3 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 6 Mar 2024 17:20:53 -0500 Subject: [PATCH 12/28] fix recessive filtering --- hail_search/queries/base.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index 0dfc659a80..d1badc1c60 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -354,9 +354,7 @@ def import_filtered_table(self, project_samples, num_families, intervals=None, * if not use_annotations_ht_first: self._ht = self._filter_annotated_table(self._ht, **kwargs) elif self._has_comp_het_search: - primary_annotation_filters = self._get_annotation_filters(self._ht) - if primary_annotation_filters: - self._ht = self._ht.filter(hl.any(primary_annotation_filters)) + self._ht = self._filter_by_annotations(self._ht, **(kwargs.get(parsed_annotations) or {})) def _add_project_ht(self, families_ht, project_ht, default, default_1=None): if default_1 is None: @@ -581,7 +579,7 @@ def _filter_annotated_table(self, ht, gene_ids=None, rs_ids=None, frequencies=No ht = self._filter_by_in_silico(ht, in_silico) - return self._filter_by_annotations(ht, is_comp_het, **(parsed_annotations or {})) + return self._filter_by_annotations(ht, is_comp_het=is_comp_het, **(parsed_annotations or {})) def _filter_by_gene_ids(self, ht, gene_ids): gene_ids = hl.set(gene_ids) @@ -742,7 +740,7 @@ def _parse_annotations(self, annotations, annotations_secondary, **kwargs): }) return parsed_annotations - def _filter_by_annotations(self, ht, is_comp_het, consequence_ids=None, annotation_overrides=None, + def _filter_by_annotations(self, ht, is_comp_het=False, consequence_ids=None, annotation_overrides=None, secondary_consequence_ids=None, secondary_annotation_overrides=None, **kwargs): annotation_exprs = {} From a1a93ad64c273179313fadd4388fafc80a1511f3 Mon Sep 17 00:00:00 
2001 From: Hana Snow Date: Wed, 6 Mar 2024 19:33:44 -0500 Subject: [PATCH 13/28] fix typo --- hail_search/queries/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index d1badc1c60..32b4c657ee 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -354,7 +354,7 @@ def import_filtered_table(self, project_samples, num_families, intervals=None, * if not use_annotations_ht_first: self._ht = self._filter_annotated_table(self._ht, **kwargs) elif self._has_comp_het_search: - self._ht = self._filter_by_annotations(self._ht, **(kwargs.get(parsed_annotations) or {})) + self._ht = self._filter_by_annotations(self._ht, **(kwargs.get('parsed_annotations') or {})) def _add_project_ht(self, families_ht, project_ht, default, default_1=None): if default_1 is None: From c5204da3378463392a8e466be4e172e6b93ac8b0 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 7 Mar 2024 14:14:02 -0500 Subject: [PATCH 14/28] add probably solved --- .../0060_alter_family_analysis_status.py | 18 ++++++++++++++++++ seqr/models.py | 2 ++ 2 files changed, 20 insertions(+) create mode 100644 seqr/migrations/0060_alter_family_analysis_status.py diff --git a/seqr/migrations/0060_alter_family_analysis_status.py b/seqr/migrations/0060_alter_family_analysis_status.py new file mode 100644 index 0000000000..1e8a232e5f --- /dev/null +++ b/seqr/migrations/0060_alter_family_analysis_status.py @@ -0,0 +1,18 @@ +# Generated by Django 3.2.23 on 2024-03-07 19:13 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('seqr', '0059_project_subscribers'), + ] + + operations = [ + migrations.AlterField( + model_name='family', + name='analysis_status', + field=models.CharField(choices=[('S', 'S'), ('S_kgfp', 'S'), ('S_kgdp', 'S'), ('S_ng', 'S'), ('ES', 'E'), ('Sc_kgfp', 'S'), ('Sc_kgdp', 'S'), ('Sc_ng', 'S'), ('Rcpc', 'R'), ('Rncc', 'R'), ('C', 'C'), ('PB', 'P'), 
('P', 'P'), ('I', 'A'), ('Q', 'W'), ('N', 'N')], default='Q', max_length=10), + ), + ] diff --git a/seqr/models.py b/seqr/models.py index d31a0a9dfa..e6d5e5bd74 100644 --- a/seqr/models.py +++ b/seqr/models.py @@ -278,6 +278,7 @@ def _compute_guid(self): class Family(ModelWithGUID): ANALYSIS_STATUS_ANALYSIS_IN_PROGRESS='I' ANALYSIS_STATUS_PARTIAL_SOLVE = 'P' + ANALYSIS_STATUS_PROBABLE_SOLVE = 'PB' ANALYSIS_STATUS_WAITING_FOR_DATA='Q' SOLVED_ANALYSIS_STATUS_CHOICES = ( ('S', 'Solved'), @@ -297,6 +298,7 @@ class Family(ModelWithGUID): ('Rcpc', 'Reviewed, currently pursuing candidates'), ('Rncc', 'Reviewed, no clear candidate'), ('C', 'Closed, no longer under analysis'), + (ANALYSIS_STATUS_PROBABLE_SOLVE, 'Probably Solved'), (ANALYSIS_STATUS_PARTIAL_SOLVE, 'Partial Solve - Analysis in Progress'), (ANALYSIS_STATUS_ANALYSIS_IN_PROGRESS, 'Analysis in Progress'), (ANALYSIS_STATUS_WAITING_FOR_DATA, 'Waiting for data'), From aab7868f2892b4f649150ba96f48a35a460afa40 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 7 Mar 2024 14:17:05 -0500 Subject: [PATCH 15/28] add probably solved to ui --- CHANGELOG.md | 1 + ui/shared/utils/constants.js | 2 ++ 2 files changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ceeedaf98..80818ba807 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # _seqr_ Changes ## dev +* Add "Probably Solved" analysis status (REQUIRES DB MIGRATION) ## 3/1/24 * Add subscribable project notifications (REQUIRES DB MIGRATION) diff --git a/ui/shared/utils/constants.js b/ui/shared/utils/constants.js index 718c722ad1..c8d0671dc7 100644 --- a/ui/shared/utils/constants.js +++ b/ui/shared/utils/constants.js @@ -145,6 +145,7 @@ const FAMILY_STATUS_SOLVED_KNOWN_GENE_KNOWN_PHENOTYPE = 'S_kgfp' const FAMILY_STATUS_SOLVED_KNOWN_GENE_DIFFERENT_PHENOTYPE = 'S_kgdp' const FAMILY_STATUS_SOLVED_NOVEL_GENE = 'S_ng' const FAMILY_STATUS_EXTERNAL_SOLVE = 'ES' +const FAMILY_STATUS_PROBABLE_SOLVE = 'PB' const 
FAMILY_STATUS_STRONG_CANDIDATE_KNOWN_GENE_KNOWN_PHENOTYPE = 'Sc_kgfp' const FAMILY_STATUS_STRONG_CANDIDATE_KNOWN_GENE_DIFFERENT_PHENOTYPE = 'Sc_kgdp' const FAMILY_STATUS_STRONG_CANDIDATE_NOVEL_GENE = 'Sc_ng' @@ -164,6 +165,7 @@ export const SELECTABLE_FAMILY_ANALYSIS_STATUS_OPTIONS = [ { value: FAMILY_STATUS_SOLVED_KNOWN_GENE_DIFFERENT_PHENOTYPE, color: '#4CAF50', name: 'Solved - gene linked to different phenotype' }, { value: FAMILY_STATUS_SOLVED_NOVEL_GENE, color: '#4CAF50', name: 'Solved - novel gene' }, { value: FAMILY_STATUS_EXTERNAL_SOLVE, color: '#146917', name: 'External Solve' }, + { value: FAMILY_STATUS_PROBABLE_SOLVE, color: '#ACD657', name: 'Probably Solved' }, { value: FAMILY_STATUS_STRONG_CANDIDATE_KNOWN_GENE_KNOWN_PHENOTYPE, color: '#CDDC39', name: 'Strong candidate - known gene for phenotype' }, { value: FAMILY_STATUS_STRONG_CANDIDATE_KNOWN_GENE_DIFFERENT_PHENOTYPE, color: '#CDDC39', name: 'Strong candidate - gene linked to different phenotype' }, { value: FAMILY_STATUS_STRONG_CANDIDATE_NOVEL_GENE, color: '#CDDC39', name: 'Strong candidate - novel gene' }, From 98d4c01e90c22bbe0e1558853e94f336f5338c4d Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 7 Mar 2024 14:44:37 -0500 Subject: [PATCH 16/28] update solve_status reporting --- seqr/views/apis/report_api_tests.py | 18 +++++++++--------- seqr/views/apis/summary_data_api_tests.py | 4 ++-- seqr/views/utils/anvil_metadata_utils.py | 10 ++++++---- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index b6ce200b4b..647ff3d730 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -250,7 +250,7 @@ {'column': 'affected_status', 'required': True, 'data_type': 'enumeration', 'enumerations': ['Affected', 'Unaffected', 'Unknown']}, {'column': 'phenotype_description'}, {'column': 'age_at_enrollment'}, - {'column': 'solve_status', 'required': True, 'data_type': 'enumeration', 
'enumerations': ['Yes', 'Likely', 'No', 'Partial']}, + {'column': 'solve_status', 'required': True, 'data_type': 'enumeration', 'enumerations': ['Solved', 'Partially solved', 'Probably solved', 'Unsolved', 'Unaffected']}, {'column': 'missing_variant_case', 'required': True, 'data_type': 'enumeration', 'enumerations': ['Yes', 'No']}, ], }, @@ -505,7 +505,7 @@ 'phenotype_description': None, 'pmid_id': None, 'seqr_chosen_consequence': None, - 'solve_status': 'No', + 'solve_status': 'Unsolved', 'svName': None, 'svType': None, 'sv_name': None, @@ -589,7 +589,7 @@ def test_anvil_export(self, mock_google_authenticated, mock_zip): 'dbgap_subject_id_1', 'No', '1', 'NA19678', 'NA19679', '-', 'Self', 'Male', 'Middle Eastern or North African', '-', '-', '-', 'OMIM:615120', 'Myasthenic syndrome, congenital, 8, with pre- and postsynaptic defects', 'Affected', 'Adult onset', '-', 'HP:0001631|HP:0002011|HP:0001636', 'HP:0011675|HP:0001674|HP:0001508', - 'myopathy', 'No'], subject_file) + 'myopathy', 'Unsolved'], subject_file) self.assertEqual(sample_file[0], [ 'entity:sample_id', '01-subject_id', '02-sample_id', '03-dbgap_sample_id', '04-sequencing_center', @@ -809,23 +809,23 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False): self.assertListEqual([ 'Broad_NA19675_1', 'Broad_1kg project nme with unide', 'BROAD', 'HMB', 'Yes', 'IKBKAP|CCDC102B|CMA - normal', '34415322', 'Broad_1', 'Broad_NA19678', 'Broad_NA19679', '', 'Self', '', 'Male', '', - 'Middle Eastern or North African', '', '', '21', 'Affected', 'myopathy', '18', 'No', 'No', + 'Middle Eastern or North African', '', '', '21', 'Affected', 'myopathy', '18', 'Unsolved', 'No', ], row) hispanic_row = next(r for r in participant_file if r[0] == 'Broad_HG00731') self.assertListEqual([ 'Broad_HG00731', 'Broad_1kg project nme with unide', 'BROAD', 'HMB', '', '', '', 'Broad_2', 'Broad_HG00732', - 'Broad_HG00733', '', 'Self', '', 'Female', '', '', 'Hispanic or Latino', 'Other', '', 'Affected', '', '', 'No', 
'No', + 'Broad_HG00733', '', 'Self', '', 'Female', '', '', 'Hispanic or Latino', 'Other', '', 'Affected', '', '', 'Unsolved', 'No', ], hispanic_row) solved_row = next(r for r in participant_file if r[0] == 'Broad_NA20876') self.assertListEqual([ 'Broad_NA20876', 'Broad_1kg project nme with unide', 'BROAD', 'HMB', '', '', '', 'Broad_7', '0', - '0', '', '', '', 'Male', '', '', '', '', '', 'Affected', '', '', 'Yes', 'No', + '0', '', '', '', 'Male', '', '', '', '', '', 'Affected', '', '', 'Solved', 'No', ], solved_row) multi_data_type_row = next(r for r in participant_file if r[0] == 'Broad_NA20888') self.assertListEqual([ 'Broad_NA20888', 'Broad_Test Reprocessed Project' if has_second_project else 'Broad_1kg project nme with unide', 'BROAD', 'HMB', 'No', '', '', 'Broad_12' if has_second_project else 'Broad_8', '0', '0', '', '', '', - 'Male' if has_second_project else 'Female', '', '', '', '', '', 'Affected', '', '', 'No', 'No', + 'Male' if has_second_project else 'Female', '', '', '', '', '', 'Affected', '', '', 'Unsolved', 'No', ], multi_data_type_row) self.assertEqual(len(family_file), 11 if has_second_project else 10) @@ -1076,7 +1076,7 @@ def test_family_metadata(self): 'familyGuid': 'F000012_12', 'family_id': '12', 'displayName': '12', - 'solve_status': 'No', + 'solve_status': 'Unsolved', 'actual_inheritance': 'unknown', 'date_data_generation': '2017-02-05', 'data_type': 'WES', @@ -1111,7 +1111,7 @@ def test_family_metadata(self): 'familyGuid': 'F000003_3', 'family_id': '3', 'displayName': '3', - 'solve_status': 'No', + 'solve_status': 'Unsolved', 'actual_inheritance': '', 'date_data_generation': '2017-02-05', 'data_type': 'WES', diff --git a/seqr/views/apis/summary_data_api_tests.py b/seqr/views/apis/summary_data_api_tests.py index 9b1af12329..18c72f6fd3 100644 --- a/seqr/views/apis/summary_data_api_tests.py +++ b/seqr/views/apis/summary_data_api_tests.py @@ -35,7 +35,7 @@ EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW = { "projectGuid": "R0003_test", 
"num_saved_variants": 2, - "solve_status": "No", + "solve_status": "Unsolved", "sample_id": "NA20889", "gene_known_for_phenotype-1": "Known", "gene_known_for_phenotype-2": "Known", @@ -143,7 +143,7 @@ 'pmid_id': None, 'proband_relationship': 'Self', 'sex': 'Female', - 'solve_status': 'No', + 'solve_status': 'Unsolved', 'alt-1': 'T', 'chrom-1': '1', 'gene_known_for_phenotype-1': 'Candidate', diff --git a/seqr/views/utils/anvil_metadata_utils.py b/seqr/views/utils/anvil_metadata_utils.py index 07c576c773..800887229e 100644 --- a/seqr/views/utils/anvil_metadata_utils.py +++ b/seqr/views/utils/anvil_metadata_utils.py @@ -51,9 +51,9 @@ } SOLVE_STATUS_LOOKUP = { - **{s: 'Yes' for s in Family.SOLVED_ANALYSIS_STATUSES}, - **{s: 'Likely' for s in Family.STRONG_CANDIDATE_ANALYSIS_STATUSES}, - Family.ANALYSIS_STATUS_PARTIAL_SOLVE: 'Partial', + **{s: 'Solved' for s in Family.SOLVED_ANALYSIS_STATUSES}, + Family.ANALYSIS_STATUS_PARTIAL_SOLVE: 'Partially solved', + Family.ANALYSIS_STATUS_PROBABLE_SOLVE: 'Probably solved', } MIM_INHERITANCE_MAP = { @@ -103,7 +103,7 @@ def _get_family_metadata(family_filter, family_fields, include_metadata, include for f in family_data: family_id = f.pop('id') f.update({ - 'solve_status': SOLVE_STATUS_LOOKUP.get(f['analysisStatus'], 'No'), + 'solve_status': SOLVE_STATUS_LOOKUP.get(f['analysisStatus'], 'Unsolved'), **{k: v['format'](f) for k, v in (family_fields or {}).items()}, }) if format_id: @@ -201,6 +201,8 @@ def parse_anvil_metadata( if individual.id in matchmaker_individuals: subject_row['MME'] = matchmaker_individuals[individual.id] if mme_values else 'Yes' subject_row.update(family_subject_row) + if individual.affected != Individual.AFFECTED_STATUS_AFFECTED: + subject_row['solve_status'] = 'Unaffected' add_row(subject_row, family_id, SUBJECT_ROW_TYPE) participant_id = subject_row['participant_id'] From 7a1f7677f451f37b0f0f9ee0339d034f0208eab5 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 7 Mar 2024 15:38:07 -0500 Subject: [PATCH 
17/28] do not show delete button for non deletable projects --- seqr/views/apis/dashboard_api.py | 4 ++++ seqr/views/utils/individual_utils.py | 15 ++++++++++----- .../Dashboard/components/ProjectEllipsisMenu.jsx | 2 +- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/seqr/views/apis/dashboard_api.py b/seqr/views/apis/dashboard_api.py index b7aa07c413..d5ccc20766 100644 --- a/seqr/views/apis/dashboard_api.py +++ b/seqr/views/apis/dashboard_api.py @@ -4,6 +4,7 @@ from django.db import models from seqr.models import ProjectCategory, Sample, Family, Project +from seqr.views.utils.individual_utils import check_project_individuals_deletable from seqr.views.utils.json_utils import create_json_response from seqr.views.utils.orm_to_json_utils import get_json_for_projects from seqr.views.utils.permissions_utils import get_project_guids_user_can_view, login_and_policies_required @@ -46,6 +47,9 @@ def _get_projects_json(user): projects_by_guid[project.guid]['numFamilies'] = project.family__count projects_by_guid[project.guid]['numIndividuals'] = project.family__individual__count projects_by_guid[project.guid]['numVariantTags'] = project.family__savedvariant__count + if projects_by_guid[project.guid]['userIsCreator']: + errors, _ = check_project_individuals_deletable(project) + projects_by_guid[project.guid]['userCanDelete'] = not errors analysis_status_counts = Family.objects.filter(project__in=projects).values( 'project__guid', 'analysis_status').annotate(count=models.Count('*')) diff --git a/seqr/views/utils/individual_utils.py b/seqr/views/utils/individual_utils.py index 8ef121eb6c..3a35d34139 100644 --- a/seqr/views/utils/individual_utils.py +++ b/seqr/views/utils/individual_utils.py @@ -173,11 +173,7 @@ def delete_individuals(project, individual_guids, user): Returns: list: Family objects for families with deleted individuals """ - - individuals_to_delete = Individual.objects.filter( - family__project=project, guid__in=individual_guids) - - errors = 
backend_specific_call(_validate_no_submissions, _validate_no_sumissions_no_search_samples)(individuals_to_delete) + errors, individuals_to_delete = check_project_individuals_deletable(project.guid, individual_guids=individual_guids) if errors: raise ErrorsWarningsException(errors) @@ -196,6 +192,15 @@ def delete_individuals(project, individual_guids, user): return families_with_deleted_individuals +def check_project_individuals_deletable(project_guid, individual_guids=None): + individuals_to_delete = Individual.objects.filter(family__project__guid=project_guid) + if individual_guids is not None: + individuals_to_delete = individuals_to_delete.filter(guid__in=individual_guids) + + errors = backend_specific_call(_validate_no_submissions, _validate_no_sumissions_no_search_samples)(individuals_to_delete) + return errors, individuals_to_delete + + def _validate_delete_individuals(individuals_to_delete, error_type, query): errors = [] invalid_individuals = individuals_to_delete.filter(**query).distinct().values_list('individual_id', flat=True) diff --git a/ui/pages/Dashboard/components/ProjectEllipsisMenu.jsx b/ui/pages/Dashboard/components/ProjectEllipsisMenu.jsx index e794520ff9..0a2055cf20 100644 --- a/ui/pages/Dashboard/components/ProjectEllipsisMenu.jsx +++ b/ui/pages/Dashboard/components/ProjectEllipsisMenu.jsx @@ -55,7 +55,7 @@ const ProjectEllipsisMenu = React.memo((props) => { , ) } - if (props.project.userIsCreator) { + if (props.project.userCanDelete) { menuItems.push( , Date: Thu, 7 Mar 2024 15:54:30 -0500 Subject: [PATCH 18/28] fix typos and update tests --- seqr/fixtures/1kg_project.json | 6 +++--- seqr/views/apis/dashboard_api_tests.py | 6 ++++++ seqr/views/utils/individual_utils.py | 6 +++--- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/seqr/fixtures/1kg_project.json b/seqr/fixtures/1kg_project.json index f9f87c1315..7032d4f55f 100644 --- a/seqr/fixtures/1kg_project.json +++ b/seqr/fixtures/1kg_project.json @@ -5,7 +5,7 @@ "fields": { 
"guid": "R0001_1kg", "created_date": "2017-03-12T19:27:08.156Z", - "created_by": null, + "created_by": 10, "last_modified_date": "2017-03-13T09:07:49.582Z", "name": "1kg project n\u00e5me with uni\u00e7\u00f8de", "description": "1000 genomes project description with uni\u00e7\u00f8de", @@ -29,7 +29,7 @@ "fields": { "guid": "R0002_empty", "created_date": "2017-03-12T19:27:08.156Z", - "created_by": null, + "created_by": 10, "last_modified_date": "2017-03-13T09:07:49.582Z", "name": "Empty Project", "description": "", @@ -50,7 +50,7 @@ "fields": { "guid": "R0003_test", "created_date": "2017-03-12T19:27:08.156Z", - "created_by": null, + "created_by": 10, "last_modified_date": "2017-03-13T09:07:49.582Z", "name": "Test Reprocessed Project", "description": "", diff --git a/seqr/views/apis/dashboard_api_tests.py b/seqr/views/apis/dashboard_api_tests.py index bc3a124263..ec0a5ac3f1 100644 --- a/seqr/views/apis/dashboard_api_tests.py +++ b/seqr/views/apis/dashboard_api_tests.py @@ -40,6 +40,8 @@ def test_dashboard_page_data(self, mock_set_redis, mock_get_redis): self.assertSetEqual( set(next(iter(response_json['projectsByGuid'].values())).keys()), DASHBOARD_PROJECT_FIELDS ) + self.assertSetEqual({p['userIsCreator'] for p in response_json['projectsByGuid'].values()}, {False}) + self.assertFalse(any('userCanDelete' in p for p in response_json['projectsByGuid'].values())) mock_get_redis.assert_called_with('projects__test_user_collaborator') mock_set_redis.assert_called_with( 'projects__test_user_collaborator', list(response_json['projectsByGuid'].keys()), expire=300) @@ -49,6 +51,10 @@ def test_dashboard_page_data(self, mock_set_redis, mock_get_redis): self.assertEqual(response.status_code, 200) response_json = response.json() self.assertEqual(len(response_json['projectsByGuid']), 3) + self.assertSetEqual({p['userIsCreator'] for p in response_json['projectsByGuid'].values()}, {True}) + self.assertTrue(response_json['projectsByGuid']['R0002_empty']['userCanDelete']) + 
self.assertFalse(response_json['projectsByGuid']['R0001_1kg']['userCanDelete']) + self.assertFalse(response_json['projectsByGuid']['R0003_test']['userCanDelete']) mock_get_redis.assert_called_with('projects__test_user') mock_set_redis.assert_called_with('projects__test_user', list(response_json['projectsByGuid'].keys()), expire=300) diff --git a/seqr/views/utils/individual_utils.py b/seqr/views/utils/individual_utils.py index 3a35d34139..40ae1196d2 100644 --- a/seqr/views/utils/individual_utils.py +++ b/seqr/views/utils/individual_utils.py @@ -173,7 +173,7 @@ def delete_individuals(project, individual_guids, user): Returns: list: Family objects for families with deleted individuals """ - errors, individuals_to_delete = check_project_individuals_deletable(project.guid, individual_guids=individual_guids) + errors, individuals_to_delete = check_project_individuals_deletable(project, individual_guids=individual_guids) if errors: raise ErrorsWarningsException(errors) @@ -192,8 +192,8 @@ def delete_individuals(project, individual_guids, user): return families_with_deleted_individuals -def check_project_individuals_deletable(project_guid, individual_guids=None): - individuals_to_delete = Individual.objects.filter(family__project__guid=project_guid) +def check_project_individuals_deletable(project, individual_guids=None): + individuals_to_delete = Individual.objects.filter(family__project=project) if individual_guids is not None: individuals_to_delete = individuals_to_delete.filter(guid__in=individual_guids) From 0ba46cbd4148eda478b84bb06dd262ea6a796619 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 11 Mar 2024 12:17:02 -0400 Subject: [PATCH 19/28] properly format mito chrom for variant id query --- seqr/utils/search/search_utils_tests.py | 6 +++--- seqr/utils/search/utils.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/seqr/utils/search/search_utils_tests.py b/seqr/utils/search/search_utils_tests.py index aec4a37952..7e2fcb6871 100644 --- 
a/seqr/utils/search/search_utils_tests.py +++ b/seqr/utils/search/search_utils_tests.py @@ -108,12 +108,12 @@ def test_get_single_variant(self, mock_get_variants_for_ids): self.assertEqual(str(cm.exception), 'Variant 10-10334333-A-G not found') def test_get_variants_for_variant_ids(self, mock_get_variants_for_ids): - variant_ids = ['2-103343353-GAGA-G', '1-248367227-TC-T', 'prefix-938_DEL', 'M-10195-C-A'] + variant_ids = ['2-103343353-GAGA-G', '1-248367227-TC-T', 'prefix-938_DEL', 'MT-10195-C-A'] get_variants_for_variant_ids(self.families, variant_ids, user=self.user) mock_get_variants_for_ids.assert_called_with(mock.ANY, '37', { '2-103343353-GAGA-G': ('2', 103343353, 'GAGA', 'G'), '1-248367227-TC-T': ('1', 248367227, 'TC', 'T'), - 'M-10195-C-A': ('M', 10195, 'C', 'A'), + 'MT-10195-C-A': ('M', 10195, 'C', 'A'), 'prefix-938_DEL': None, }, self.user, user_email=None) self.assertSetEqual(set(mock_get_variants_for_ids.call_args.args[0]), set(self.search_samples)) @@ -123,7 +123,7 @@ def test_get_variants_for_variant_ids(self, mock_get_variants_for_ids): mock_get_variants_for_ids.assert_called_with(mock.ANY, '37', { '2-103343353-GAGA-G': ('2', 103343353, 'GAGA', 'G'), '1-248367227-TC-T': ('1', 248367227, 'TC', 'T'), - 'M-10195-C-A': ('M', 10195, 'C', 'A'), + 'MT-10195-C-A': ('M', 10195, 'C', 'A'), }, self.user, user_email=None) skipped_samples = ['S000145_hg00731', 'S000146_hg00732', 'S000148_hg00733'] expected_samples = {s for s in self.search_samples if s.guid not in skipped_samples} diff --git a/seqr/utils/search/utils.py b/seqr/utils/search/utils.py index a94e40ddf0..c1f0e6d4b4 100644 --- a/seqr/utils/search/utils.py +++ b/seqr/utils/search/utils.py @@ -14,7 +14,7 @@ from seqr.utils.search.hail_search_utils import get_hail_variants, get_hail_variants_for_variant_ids, ping_hail_backend, \ hail_variant_lookup, validate_hail_backend_no_location_search from seqr.utils.gene_utils import parse_locus_list_items -from seqr.utils.xpos_utils import get_xpos +from 
seqr.utils.xpos_utils import get_xpos, format_chrom class InvalidSearchException(Exception): @@ -334,6 +334,7 @@ def _parse_variant_items(search_json): def _parse_variant_id(variant_id): try: chrom, pos, ref, alt = variant_id.split('-') + chrom = format_chrom(chrom) pos = int(pos) get_xpos(chrom, pos) return chrom, pos, ref, alt From 4e18bdc5a495b31b4c4c386bd3717a464e918936 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 11 Mar 2024 13:54:53 -0400 Subject: [PATCH 20/28] update fixture data to test ulifted variants --- seqr/fixtures/1kg_project.json | 3 +- .../commands/lift_project_to_hg38.py | 2 +- ...eck_for_new_samples_from_pipeline_tests.py | 8 ++--- .../tests/lift_project_to_hg38_tests.py | 35 +++++++++++-------- 4 files changed, 27 insertions(+), 21 deletions(-) diff --git a/seqr/fixtures/1kg_project.json b/seqr/fixtures/1kg_project.json index 7032d4f55f..ca050a560f 100644 --- a/seqr/fixtures/1kg_project.json +++ b/seqr/fixtures/1kg_project.json @@ -1884,7 +1884,6 @@ "genotypeFilters": "pass", "mainTranscriptId": "ENST00000505820", "populations": {"callset": {"ac": null, "an": null, "af": null}, "g1k": {"ac": null, "an": null, "af": 0.0}, "gnomad_genomes": {"hemi": null, "ac": null, "an": null, "hom": null, "af": 0.0}, "gnomad_exomes": {"hemi": null, "ac": null, "an": null, "hom": null, "af": 8.142526788913136e-06}, "exac": {"hemi": null, "ac": null, "an": null, "hom": null, "af": 0.0}, "topmed": {"ac": null, "an": null, "af": null}}, - "genomeVersion": "37", "pos": 1560662, "predictions": {"eigen": null, "revel": null, "sift": "damaging", "cadd": "31", "metasvm": "D", "mpc": null, "splice_ai": null, "phastcons_100_vert": null, "mut_taster": "disease_causing", "fathmm": "tolerated", "polyphen": "probably_damaging", "dann": null, "primate_ai": null, "gerp_rs": null}, "hgmd": {"accession": null, "class": null}, @@ -1970,7 +1969,7 @@ "hom": 10332 } }, - "genomeVersion": "37", + "genomeVersion": "38", "pos": 1562437, "hgmd": {"accession": null, "class": null}, 
"rsid": null, diff --git a/seqr/management/commands/lift_project_to_hg38.py b/seqr/management/commands/lift_project_to_hg38.py index 3f98c63616..ae182f42ca 100644 --- a/seqr/management/commands/lift_project_to_hg38.py +++ b/seqr/management/commands/lift_project_to_hg38.py @@ -77,7 +77,7 @@ def handle(self, *args, **options): len(es_variants), len(missing_variants) + len(lift_failed), missing_family_count)) def _get_variants_to_lift(saved_variants): - saved_variants_to_lift = [v for v in saved_variants if v['genomeVersion'] != GENOME_VERSION_GRCh38] + saved_variants_to_lift = [v for v in saved_variants if v.get('genomeVersion') != GENOME_VERSION_GRCh38] num_already_lifted = len(saved_variants) - len(saved_variants_to_lift) if num_already_lifted: if input('Found {} saved variants already on Hg38. Continue with liftover (y/n)? '.format( diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index 89c4679502..de2f1efd8a 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -135,7 +135,7 @@ def test_command(self, mock_email, mock_airtable_utils): 'total': 1, }) responses.add(responses.POST, f'{MOCK_HAIL_HOST}:5000/multi_lookup', status=200, json={ - 'results': [{'variantId': '21-3343353-GAGA-G', 'updated_new_field': 'updated_value', 'rsid': 'rs123'}], + 'results': [{'variantId': '1-46859832-G-A', 'updated_new_field': 'updated_value', 'rsid': 'rs123'}], }) # Test errors @@ -264,11 +264,11 @@ def test_command(self, mock_email, mock_airtable_utils): self.assertEqual(len(annotation_updated_variant.saved_variant_json), 18) self.assertListEqual(annotation_updated_variant.saved_variant_json['familyGuids'], ['F000001_1']) - annotation_updated_json = SavedVariant.objects.get(guid='SV0000001_2103343353_r0390_100').saved_variant_json - self.assertEqual(len(annotation_updated_json), 20) + 
annotation_updated_json = SavedVariant.objects.get(guid='SV0059956_11560662_f019313_1').saved_variant_json + self.assertEqual(len(annotation_updated_json), 18) self.assertEqual(annotation_updated_json['updated_new_field'], 'updated_value') self.assertEqual(annotation_updated_json['rsid'], 'rs123') - self.assertEqual(annotation_updated_json['mainTranscriptId'], 'ENST00000258436') + self.assertEqual(annotation_updated_json['mainTranscriptId'], 'ENST00000505820') self.assertEqual(len(annotation_updated_json['genotypes']), 3) self.mock_utils_logger.error.assert_not_called() diff --git a/seqr/management/tests/lift_project_to_hg38_tests.py b/seqr/management/tests/lift_project_to_hg38_tests.py index 0342bd0fac..b3e1c1e833 100644 --- a/seqr/management/tests/lift_project_to_hg38_tests.py +++ b/seqr/management/tests/lift_project_to_hg38_tests.py @@ -59,11 +59,11 @@ def test_command(self, mock_liftover, mock_get_es_variants, mock_input, mock_get calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), mock.call('Validating es index test_new_index'), - mock.call('Lifting over 4 variants (skipping 0 that are already lifted)'), + mock.call('Lifting over 3 variants (skipping 1 that are already lifted)'), mock.call('Successfully lifted over 3 variants'), mock.call('Successfully updated 3 variants'), mock.call('---Done---'), - mock.call('Succesfully lifted over 3 variants. Skipped 3 failed variants. Family data not updated for 0 variants') + mock.call('Succesfully lifted over 3 variants. Skipped 2 failed variants. 
Family data not updated for 0 variants') ] mock_logger.info.assert_has_calls(calls) @@ -72,7 +72,6 @@ def test_command(self, mock_liftover, mock_get_es_variants, mock_input, mock_get calls = [ mock.call('chr21', 3343353), - mock.call('chr1', 1562437), mock.call('chr1', 248367227), mock.call('chr1', 1560662), ] @@ -80,7 +79,7 @@ def test_command(self, mock_liftover, mock_get_es_variants, mock_input, mock_get families = {family for family in Family.objects.filter(pk__in = [1, 2])} self.assertSetEqual(families, mock_get_es_variants.call_args.args[0]) - self.assertSetEqual({'1-1627057-G-C', '21-3343400-GAGA-G', '1-248203925-TC-T', '1-46394160-G-A'}, + self.assertSetEqual({'21-3343400-GAGA-G', '1-248203925-TC-T', '1-46394160-G-A'}, set(mock_get_es_variants.call_args.args[1])) # Test discontinue on lifted variants @@ -90,7 +89,7 @@ def test_command(self, mock_liftover, mock_get_es_variants, mock_input, mock_get call_command('lift_project_to_hg38', '--project={}'.format(PROJECT_NAME), '--es-index={}'.format(ELASTICSEARCH_INDEX)) - self.assertEqual(str(ce.exception), 'Error: found 1 saved variants already on Hg38') + self.assertEqual(str(ce.exception), 'Error: found 2 saved variants already on Hg38') calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), @@ -177,12 +176,19 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, call_command('lift_project_to_hg38', '--project={}'.format(PROJECT_NAME), '--es-index={}'.format(ELASTICSEARCH_INDEX)) - self.assertEqual(str(ce.exception), 'Error: unable to lift over 4 variants') + self.assertEqual(str(ce.exception), 'Error: found 1 saved variants already on Hg38') + + mock_input.side_effect = ['y', 'n'] + with self.assertRaises(CommandError) as ce: + call_command('lift_project_to_hg38', '--project={}'.format(PROJECT_NAME), + '--es-index={}'.format(ELASTICSEARCH_INDEX)) + + self.assertEqual(str(ce.exception), 'Error: unable to lift over 3 variants') calls = [ 
mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), mock.call('Validating es index test_new_index'), - mock.call('Lifting over 4 variants (skipping 0 that are already lifted)') + mock.call('Lifting over 3 variants (skipping 1 that are already lifted)') ] mock_logger.info.assert_has_calls(calls) @@ -190,16 +196,17 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, mock_get_es_variants.return_value = VARIANTS mock_liftover_to_38.convert_coordinate.side_effect = mock_convert_coordinate mock_logger.reset_mock() + mock_input.side_effect = ['y', 'n', 'n'] with self.assertRaises(CommandError) as ce: call_command('lift_project_to_hg38', '--project={}'.format(PROJECT_NAME), '--es-index={}'.format(ELASTICSEARCH_INDEX)) - self.assertEqual(str(ce.exception), 'Error: unable to find 3 lifted-over variants') + self.assertEqual(str(ce.exception), 'Error: unable to find 2 lifted-over variants') calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), mock.call('Validating es index test_new_index'), - mock.call('Lifting over 4 variants (skipping 0 that are already lifted)') + mock.call('Lifting over 3 variants (skipping 1 that are already lifted)') ] mock_logger.info.assert_has_calls(calls) @@ -207,7 +214,7 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, self.assertSetEqual(mock_get_es_variants.call_args.args[0], families) self.assertSetEqual( set(mock_get_es_variants.call_args.args[1]), - {'1-1627057-G-C', '21-3343400-GAGA-G', '1-248203925-TC-T', '1-46394160-G-A'} + {'21-3343400-GAGA-G', '1-248203925-TC-T', '1-46394160-G-A'} ) # Test discontinue on missing family data while updating the saved variants @@ -215,7 +222,7 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, variants.append(SINGLE_VARIANT) mock_get_es_variants.return_value = variants mock_logger.reset_mock() - mock_input.side_effect = ['y', 'n'] + mock_input.side_effect = 
['y', 'y', 'n'] with self.assertRaises(CommandError) as ce: call_command('lift_project_to_hg38', '--project={}'.format(PROJECT_NAME), '--es-index={}'.format(ELASTICSEARCH_INDEX)) @@ -225,7 +232,7 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), mock.call('Validating es index test_new_index'), - mock.call('Lifting over 4 variants (skipping 0 that are already lifted)'), + mock.call('Lifting over 3 variants (skipping 1 that are already lifted)'), mock.call('Successfully lifted over 4 variants') ] mock_logger.info.assert_has_calls(calls) @@ -240,11 +247,11 @@ def test_command_other_exceptions(self, mock_liftover, mock_single_es_variants, calls = [ mock.call('Updating project genome version for {}'.format(PROJECT_NAME)), mock.call('Validating es index test_new_index'), - mock.call('Lifting over 3 variants (skipping 1 that are already lifted)'), + mock.call('Lifting over 2 variants (skipping 2 that are already lifted)'), mock.call('Successfully lifted over 4 variants'), mock.call('Successfully updated 4 variants'), mock.call('---Done---'), - mock.call('Succesfully lifted over 4 variants. Skipped 2 failed variants. Family data not updated for 1 variants') + mock.call('Succesfully lifted over 4 variants. Skipped 1 failed variants. 
Family data not updated for 1 variants') ] mock_logger.info.assert_has_calls(calls) From 868b726ea6f251664e312ea91e503aeb5ba90531 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 11 Mar 2024 14:15:28 -0400 Subject: [PATCH 21/28] only reload variants on correct genome build --- seqr/fixtures/1kg_project.json | 2 +- .../commands/check_for_new_samples_from_pipeline.py | 7 +++++-- .../tests/check_for_new_samples_from_pipeline_tests.py | 8 ++++++-- seqr/views/apis/summary_data_api_tests.py | 2 +- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/seqr/fixtures/1kg_project.json b/seqr/fixtures/1kg_project.json index ca050a560f..e3a01d8fa6 100644 --- a/seqr/fixtures/1kg_project.json +++ b/seqr/fixtures/1kg_project.json @@ -2084,7 +2084,7 @@ "alt": "T", "variant_id": "12-48367227-TC-T", "saved_variant_json": { - "liftedOverGenomeVersion": "38", "liftedOverPos": "", "genomeVersion": "37", "pos": 248367227, + "liftedOverGenomeVersion": "37", "liftedOverPos": "", "genomeVersion": "38", "pos": 248367227, "transcripts": {}, "chrom": "1", "genotypes": { "I000018_na21234": {"sampleId": "NA20885", "ab": 0.0, "gq": 99.0, "numAlt": 1}}}, "family": 14 diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py index 0ee5626438..f61d592596 100644 --- a/seqr/management/commands/check_for_new_samples_from_pipeline.py +++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py @@ -1,6 +1,7 @@ from collections import defaultdict from django.contrib.postgres.aggregates import ArrayAgg from django.core.management.base import BaseCommand, CommandError +from django.db.models import Q from django.db.models.functions import JSONObject import json import logging @@ -116,9 +117,10 @@ def handle(self, *args, **options): @staticmethod def _reload_shared_variant_annotations(updated_variants_by_id, updated_families, dataset_type, sample_type, genome_version): data_type = dataset_type + 
db_genome_version = genome_version.replace('GRCh', '') updated_annotation_samples = Sample.objects.filter( is_active=True, dataset_type=dataset_type, - individual__family__project__genome_version=genome_version.replace('GRCh', ''), + individual__family__project__genome_version=db_genome_version, ).exclude(individual__family__guid__in=updated_families) if dataset_type == Sample.DATASET_TYPE_SV_CALLS: updated_annotation_samples = updated_annotation_samples.filter(sample_type=sample_type) @@ -127,7 +129,8 @@ def _reload_shared_variant_annotations(updated_variants_by_id, updated_families, variant_models = SavedVariant.objects.filter( family_id__in=updated_annotation_samples.values_list('individual__family', flat=True).distinct(), **saved_variants_dataset_type_filter(dataset_type), - ) + ).filter(Q(saved_variant_json__genomeVersion__isnull=True) | Q(saved_variant_json__genomeVersion=db_genome_version)) + if not variant_models: logger.info('No additional saved variants to update') return diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index de2f1efd8a..5662885fa6 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -174,7 +174,11 @@ def test_command(self, mock_email, mock_airtable_utils): str(ce.exception), 'Data has genome version GRCh38 but the following projects have conflicting versions: R0003_test (GRCh37)') + # Update fixture data to allow testing edge cases Project.objects.filter(id__in=[1, 3]).update(genome_version=38) + sv = SavedVariant.objects.get(guid='SV0000002_1248367227_r0390_100') + sv.saved_variant_json['genomeVersion'] = '38' + sv.save() with self.assertRaises(ValueError) as ce: call_command('check_for_new_samples_from_pipeline', 'GRCh38/SNV_INDEL', 'auto__2023-08-08') @@ -199,7 +203,7 @@ def test_command(self, mock_email, mock_airtable_utils): 
{'individual_guid': 'I000018_na21234', 'family_guid': 'F000014_14', 'project_guid': 'R0004_non_analyst_project', 'affected': 'A', 'sample_id': 'NA21234'}, ]}}, ], reload_annotations_logs=[ - 'Reloading shared annotations for 4 saved variants (4 unique)', 'Fetched 1 additional variants', 'Updated 2 saved variants', + 'Reloading shared annotations for 3 saved variants (3 unique)', 'Fetched 1 additional variants', 'Updated 2 saved variants', ]) old_data_sample_guid = 'S000143_na20885' @@ -248,7 +252,7 @@ def test_command(self, mock_email, mock_airtable_utils): self.assertDictEqual(json.loads(multi_lookup_request.body), { 'genome_version': 'GRCh38', 'data_type': 'SNV_INDEL', - 'variant_ids': ['1-1562437-G-C', '1-46859832-G-A', '21-3343353-GAGA-G'], + 'variant_ids': ['1-1562437-G-C', '1-46859832-G-A'], }) updated_variants = SavedVariant.objects.filter(saved_variant_json__updated_field='updated_value') diff --git a/seqr/views/apis/summary_data_api_tests.py b/seqr/views/apis/summary_data_api_tests.py index 18c72f6fd3..6e55fc315c 100644 --- a/seqr/views/apis/summary_data_api_tests.py +++ b/seqr/views/apis/summary_data_api_tests.py @@ -151,7 +151,7 @@ 'end-1': None, 'ref-1': 'TC', 'zygosity-1': 'Heterozygous', - 'variant_reference_assembly-1': 'GRCh37', + 'variant_reference_assembly-1': 'GRCh38', 'allele_balance_or_heteroplasmy_percentage-1': None, 'gene-1': None, 'gene_id-1': None, From 14f03d07bdc37d551bb576d61b3aeb05e81a2d4c Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 11 Mar 2024 14:23:02 -0400 Subject: [PATCH 22/28] fix test --- matchmaker/views/matchmaker_api_tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/matchmaker/views/matchmaker_api_tests.py b/matchmaker/views/matchmaker_api_tests.py index 7b898c3c7f..149d0cde77 100644 --- a/matchmaker/views/matchmaker_api_tests.py +++ b/matchmaker/views/matchmaker_api_tests.py @@ -306,7 +306,7 @@ def test_search_individual_mme_matches(self, mock_email, mock_logger, mock_slack 'start': 
248367227, 'referenceBases': 'TC', 'alternateBases': 'T', - 'assembly': 'GRCh37', + 'assembly': 'GRCh38', }, 'zygosity': 1, }], @@ -332,7 +332,7 @@ def test_search_individual_mme_matches(self, mock_email, mock_logger, mock_slack 'ref': 'TC', 'alt': 'T', 'end': None, - 'genomeVersion': 'GRCh37', + 'genomeVersion': 'GRCh38', } } ], From 464cb3b6f13dd33b636b17d2da46f2b7088a7f1d Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 11 Mar 2024 14:33:29 -0400 Subject: [PATCH 23/28] properly parse variant ids --- .../check_for_new_samples_from_pipeline.py | 6 +++++- .../check_for_new_samples_from_pipeline_tests.py | 2 +- seqr/utils/search/utils.py | 14 +++++++++----- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py index f61d592596..2a4c0d71a9 100644 --- a/seqr/management/commands/check_for_new_samples_from_pipeline.py +++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py @@ -10,6 +10,7 @@ from seqr.models import Family, Sample, SavedVariant from seqr.utils.file_utils import file_iter, does_file_exist from seqr.utils.search.add_data_utils import notify_search_data_loaded +from seqr.utils.search.utils import parse_valid_variant_id from seqr.utils.search.hail_search_utils import hail_variant_multi_lookup from seqr.views.utils.dataset_utils import match_and_update_search_samples from seqr.views.utils.variant_utils import reset_cached_search_results, update_projects_saved_variant_json, \ @@ -117,12 +118,13 @@ def handle(self, *args, **options): @staticmethod def _reload_shared_variant_annotations(updated_variants_by_id, updated_families, dataset_type, sample_type, genome_version): data_type = dataset_type + is_sv = dataset_type == Sample.DATASET_TYPE_SV_CALLS db_genome_version = genome_version.replace('GRCh', '') updated_annotation_samples = Sample.objects.filter( is_active=True, dataset_type=dataset_type, 
individual__family__project__genome_version=db_genome_version, ).exclude(individual__family__guid__in=updated_families) - if dataset_type == Sample.DATASET_TYPE_SV_CALLS: + if is_sv: updated_annotation_samples = updated_annotation_samples.filter(sample_type=sample_type) data_type = f'{dataset_type}_{sample_type}' @@ -147,6 +149,8 @@ def _reload_shared_variant_annotations(updated_variants_by_id, updated_families, } fetch_variant_ids = set(variants_by_id.keys()) - set(updated_variants_by_id.keys()) if fetch_variant_ids: + if not is_sv: + fetch_variant_ids = [parse_valid_variant_id(variant_id) for variant_id in fetch_variant_ids] updated_variants = hail_variant_multi_lookup(USER_EMAIL, sorted(fetch_variant_ids), data_type, genome_version) logger.info(f'Fetched {len(updated_variants)} additional variants') updated_variants_by_id.update({variant['variantId']: variant for variant in updated_variants}) diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index 5662885fa6..4cb3e15983 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -252,7 +252,7 @@ def test_command(self, mock_email, mock_airtable_utils): self.assertDictEqual(json.loads(multi_lookup_request.body), { 'genome_version': 'GRCh38', 'data_type': 'SNV_INDEL', - 'variant_ids': ['1-1562437-G-C', '1-46859832-G-A'], + 'variant_ids': [['1', 1562437, 'G', 'C'], ['1', 46859832, 'G', 'A']], }) updated_variants = SavedVariant.objects.filter(saved_variant_json__updated_field='updated_value') diff --git a/seqr/utils/search/utils.py b/seqr/utils/search/utils.py index c1f0e6d4b4..69c302fd16 100644 --- a/seqr/utils/search/utils.py +++ b/seqr/utils/search/utils.py @@ -333,15 +333,19 @@ def _parse_variant_items(search_json): def _parse_variant_id(variant_id): try: - chrom, pos, ref, alt = variant_id.split('-') - chrom = 
format_chrom(chrom) - pos = int(pos) - get_xpos(chrom, pos) - return chrom, pos, ref, alt + return parse_valid_variant_id(variant_id) except (KeyError, ValueError): return None +def parse_valid_variant_id(variant_id): + chrom, pos, ref, alt = variant_id.split('-') + chrom = format_chrom(chrom) + pos = int(pos) + get_xpos(chrom, pos) + return chrom, pos, ref, alt + + def _validate_sort(sort, families): if sort == PRIORITIZED_GENE_SORT and len(families) > 1: raise InvalidSearchException('Phenotype sort is only supported for single-family search.') From bc2ed7e66927399ab78f829e082cc296e0c8c367 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 11 Mar 2024 16:59:06 -0400 Subject: [PATCH 24/28] specify which notification failed when reporting notification error --- matchmaker/views/external_api.py | 11 ++++------- matchmaker/views/external_api_tests.py | 4 ++-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/matchmaker/views/external_api.py b/matchmaker/views/external_api.py index 007278a886..f2cd6320e7 100644 --- a/matchmaker/views/external_api.py +++ b/matchmaker/views/external_api.py @@ -151,7 +151,7 @@ def _generate_notification_for_incoming_match(results, incoming_query, incoming_ emails = [email.strip() for email in submission.contact_href.replace('mailto:', '').split(',')] send_emails = emails if len(emails) < 2 else [email for email in emails if email!= MME_DEFAULT_CONTACT_EMAIL] all_emails.update(send_emails) - match_results.append((result_text, send_emails)) + match_results.append((result_text, send_emails, submission.submission_id)) match_results = sorted(match_results, key=lambda result_tuple: result_tuple[0]) base_message = """Dear collaborators, @@ -180,12 +180,11 @@ def _generate_notification_for_incoming_match(results, incoming_query, incoming_ We sent this email alert to: {email_addresses_alert_sent_to}\n{footer}.""" safe_post_to_slack(MME_SLACK_MATCH_NOTIFICATION_CHANNEL, message_template.format( - base_message=base_message, 
match_results='\n'.join([text for text, _ in match_results]), + base_message=base_message, match_results='\n'.join([result[0] for result in match_results]), email_addresses_alert_sent_to=', '.join(sorted(all_emails)), footer=MME_EMAIL_FOOTER )) - email_errors = [] - for result_text, emails in match_results: + for result_text, emails, submission_id in match_results: try: email_message = EmailMessage( subject='Received new MME match', @@ -198,9 +197,7 @@ def _generate_notification_for_incoming_match(results, incoming_query, incoming_ ) email_message.send() except Exception as e: - email_errors.append(str(e)) - if email_errors: - raise ValueError('\n'.join(email_errors)) + logger.error(f'Unable to send notification email for incoming MME match with {submission_id}: {e}') MME_EMAIL_FOOTER = """ diff --git a/matchmaker/views/external_api_tests.py b/matchmaker/views/external_api_tests.py index 7ec0e164ad..0956b5ee8d 100644 --- a/matchmaker/views/external_api_tests.py +++ b/matchmaker/views/external_api_tests.py @@ -284,8 +284,8 @@ def test_mme_match_proxy(self, mock_post_to_slack, mock_email, mock_logger): mock.call().send(), ]) - mock_logger.error.assert_called_with( - 'Unable to create notification for incoming MME match request for 3 matches (NA19675_1_01, P0004515, P0004517): Email error') + mock_logger.error.assert_called_once_with( + 'Unable to send notification email for incoming MME match with NA19675_1_01: Email error') # Test receive same request again mock_post_to_slack.reset_mock() From 4d15bacb3f3cc5992b5ba8fad965ade4a8dc888b Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 12 Mar 2024 16:19:17 -0400 Subject: [PATCH 25/28] remove unused error hndler --- matchmaker/views/external_api.py | 11 ++--------- matchmaker/views/external_api_tests.py | 13 +++++++------ 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/matchmaker/views/external_api.py b/matchmaker/views/external_api.py index f2cd6320e7..629ca4ba88 100644 --- 
a/matchmaker/views/external_api.py +++ b/matchmaker/views/external_api.py @@ -79,14 +79,7 @@ def mme_match_proxy(request, originating_node_name): patient_data=query_patient_data, origin_request_host=originating_node_name, ) - try: - _generate_notification_for_incoming_match(results, incoming_query, originating_node_name, query_patient_data) - except Exception as e: - logger.error('Unable to create notification for incoming MME match request for {} matches{}: {}'.format( - len(results), - ' ({})'.format(', '.join(sorted([result.get('patient', {}).get('id') for result in results]))) if results else '', - str(e)), - ) + _safe_generate_notification_for_incoming_match(results, incoming_query, originating_node_name, query_patient_data) return create_json_response({ 'results': sorted(results, key=lambda result: result['score']['patient'], reverse=True), @@ -94,7 +87,7 @@ def mme_match_proxy(request, originating_node_name): }) -def _generate_notification_for_incoming_match(results, incoming_query, incoming_request_node, incoming_patient): +def _safe_generate_notification_for_incoming_match(results, incoming_query, incoming_request_node, incoming_patient): """ Generate a SLACK notifcation to say that a VALID match request came in and the following results were sent back. If Slack is not supported, a message is not sent, but details persisted. 
diff --git a/matchmaker/views/external_api_tests.py b/matchmaker/views/external_api_tests.py index 0956b5ee8d..491cb4458f 100644 --- a/matchmaker/views/external_api_tests.py +++ b/matchmaker/views/external_api_tests.py @@ -399,9 +399,9 @@ def test_mme_match_proxy_phenotype_only(self, mock_post_to_slack, mock_email): mock.call().send(), ]) - @mock.patch('matchmaker.views.external_api.logger') + @mock.patch('seqr.utils.communication_utils.logger') @mock.patch('matchmaker.views.external_api.EmailMessage') - @mock.patch('matchmaker.views.external_api.safe_post_to_slack') + @mock.patch('seqr.utils.communication_utils._post_to_slack') def test_mme_match_proxy_no_results(self, mock_post_to_slack, mock_email, mock_logger): url = '/api/matchmaker/v1/match' request_body = { @@ -424,11 +424,12 @@ def test_mme_match_proxy_no_results(self, mock_post_to_slack, mock_email, mock_l self.assertEqual(incoming_query_q.count(), 1) self.assertIsNone(incoming_query_q.first().patient_id) - mock_post_to_slack.assert_called_with( - 'matchmaker_matches', - """A match request for 12345 came in from Test Institute today. + slack_message = """A match request for 12345 came in from Test Institute today. The contact information given was: test@test.com. 
We didn't find any individuals in matchbox that matched that query well, *so no results were sent back*.""" + mock_post_to_slack.assert_called_with( + 'matchmaker_matches', + slack_message ) mock_email.assert_not_called() @@ -440,6 +441,6 @@ def test_mme_match_proxy_no_results(self, mock_post_to_slack, mock_email, mock_l self.assertListEqual(response.json()['results'], []) self.assertEqual(MatchmakerIncomingQuery.objects.filter(institution='Test Institute').count(), 2) mock_logger.error.assert_called_with( - 'Unable to create notification for incoming MME match request for 0 matches: Slack connection error') + f'Slack error: Slack connection error: Original message in channel (matchmaker_matches) - {slack_message}') From 3e59c324ca7866936c92b3403e7e13550cd5a2a8 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 12 Mar 2024 17:33:36 -0400 Subject: [PATCH 26/28] include invalid sample id once per error message --- seqr/management/tests/load_rna_seq_tests.py | 3 ++- seqr/views/utils/dataset_utils.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/seqr/management/tests/load_rna_seq_tests.py b/seqr/management/tests/load_rna_seq_tests.py index 7c1cd0db02..6353809f5a 100644 --- a/seqr/management/tests/load_rna_seq_tests.py +++ b/seqr/management/tests/load_rna_seq_tests.py @@ -66,6 +66,7 @@ def test_tpm(self, mock_utils_logger): file_data=[ 'sample_id\tproject\tindividual_id\tgene_id\tTPM\ttissue\n', 'NA19675_D2\t1kg project nåme with uniçøde\t\tENSG00000240361\t12.6\t\n', + 'NA19675_D2\t1kg project nåme with uniçøde\t\tENSG00000233750\t1.26\t\n', 'NA19678_D1\t1kg project nåme with uniçøde\t\tENSG00000233750\t 6.04\twhole_blood\n', 'GTEX-001\t1kg project nåme with uniçøde\t\tENSG00000240361\t3.1\tinvalid\n', 'NA19677\t1kg project nåme with uniçøde\t\tENSG00000233750\t5.31\tmuscle\n', @@ -79,7 +80,7 @@ def test_tpm(self, mock_utils_logger): self.mock_gzip_file_iter.return_value = [ self.mock_gzip_file_iter.return_value[0], 'NA19678_D1\t1kg project 
nåme with uniçøde\tNA19678\tENSG00000233750\t 6.04\twhole_blood\n', - ] + self.mock_gzip_file_iter.return_value[2:] + ] + self.mock_gzip_file_iter.return_value[3:] call_command('load_rna_seq', 'tpm', RNA_FILE_ID, '--ignore-extra-samples') # Existing outlier data should be unchanged diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index b4e86c14ae..8824860637 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -347,7 +347,7 @@ def _load_rna_seq_file( loaded_samples = set() unmatched_samples = set() - missing_required_fields = defaultdict(list) + missing_required_fields = defaultdict(set) gene_ids = set() current_sample = None for line in tqdm(parsed_f, unit=' rows'): @@ -365,7 +365,7 @@ def _load_rna_seq_file( sample_id = row_dict.pop(SAMPLE_ID_COL) if SAMPLE_ID_COL in row_dict else row[SAMPLE_ID_COL] if missing_cols: for col in missing_cols: - missing_required_fields[col].append(sample_id) + missing_required_fields[col].add(sample_id) if missing_cols: continue From 3a89b87f1d391d9ef490205b741066f1fbcb2912 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 13 Mar 2024 12:26:29 -0400 Subject: [PATCH 27/28] add project permission check for bulk update analysis status --- seqr/views/apis/summary_data_api.py | 6 ++++-- seqr/views/apis/summary_data_api_tests.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/seqr/views/apis/summary_data_api.py b/seqr/views/apis/summary_data_api.py index 23f05f0393..33c4d9e8bb 100644 --- a/seqr/views/apis/summary_data_api.py +++ b/seqr/views/apis/summary_data_api.py @@ -158,8 +158,10 @@ def bulk_update_family_external_analysis(request): family_db_id_lookup = { (f['project__name'], f['family_id']): f['id'] for f in Family.objects.annotate( project_family=Concat('project__name', 'family_id', output_field=CharField()) - ).filter(project_family__in=[f'{project}{family}' for project, family in requested_families]) - .values('id', 'family_id', 
'project__name') + ).filter( + project_family__in=[f'{project}{family}' for project, family in requested_families], + project__guid__in=get_project_guids_user_can_view(request.user), + ).values('id', 'family_id', 'project__name') } warnings = [] diff --git a/seqr/views/apis/summary_data_api_tests.py b/seqr/views/apis/summary_data_api_tests.py index 6e55fc315c..62b682bc93 100644 --- a/seqr/views/apis/summary_data_api_tests.py +++ b/seqr/views/apis/summary_data_api_tests.py @@ -447,11 +447,12 @@ def test_bulk_update_family_external_analysis(self, mock_load_uploaded_file, moc ['Test Reprocessed Project', '12'], ['Test Reprocessed Project', 'not_a_family'], ['not_a_project', '2'], + ['Non-Analyst Project', '14'], ] response = self.client.post(url, content_type='application/json', data=json.dumps(body)) self.assertDictEqual(response.json(), { 'warnings': [ - 'No match found for the following families: not_a_family (Test Reprocessed Project), 2 (not_a_project)' + 'No match found for the following families: 14 (Non-Analyst Project), not_a_family (Test Reprocessed Project), 2 (not_a_project)' ], 'info': ['Updated "analysed by" for 2 families'], }) From a9c80e6485906ca22c8f9de6cedc4c1ddf57a474 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 13 Mar 2024 14:44:21 -0400 Subject: [PATCH 28/28] bump changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 80818ba807..ce09fe0a57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,8 @@ # _seqr_ Changes ## dev + +## 3/13/24 * Add "Probably Solved" analysis status (REQUIRES DB MIGRATION) ## 3/1/24