From 1e0c8643b767ca3f0158f464cc05768c8d70497e Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Thu, 2 May 2024 15:38:24 -0400
Subject: [PATCH 01/51] accept drs file path

---
 seqr/views/apis/igv_api.py | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/seqr/views/apis/igv_api.py b/seqr/views/apis/igv_api.py
index 58fc92c1f0..6d0ae529fe 100644
--- a/seqr/views/apis/igv_api.py
+++ b/seqr/views/apis/igv_api.py
@@ -147,12 +147,13 @@ def update_individual_igv_sample(request, individual_guid):
         if not file_path:
             raise ValueError('request must contain fields: filePath')
 
-        sample_type = next((st for suffix, st in SAMPLE_TYPE_MAP if file_path.endswith(suffix)), None)
+        file_name = _get_valid_file_name(file_path, request.user)
+        if not file_name:
+            raise Exception('Error accessing "{}"'.format(file_path))
+        sample_type = next((st for suffix, st in SAMPLE_TYPE_MAP if file_name.endswith(suffix)), None)
         if not sample_type:
             raise Exception('Invalid file extension for "{}" - valid extensions are {}'.format(
-                file_path, ', '.join([suffix for suffix, _ in SAMPLE_TYPE_MAP])))
-        if not does_file_exist(file_path, user=request.user):
-            raise Exception('Error accessing "{}"'.format(file_path))
+                file_name, ', '.join([suffix for suffix, _ in SAMPLE_TYPE_MAP])))
 
         sample, created = get_or_create_model_from_json(
             IgvSample, create_json={'individual': individual, 'sample_type': sample_type},
@@ -172,6 +173,21 @@ def update_individual_igv_sample(request, individual_guid):
         return create_json_response({'error': error}, status=400, reason=error)
 
 
+def _get_valid_file_name(file_path, user):
+    if not file_path.startswith('drs://'):
+        return file_path if does_file_exist(file_path, user=user) else None
+
+    drs_path = file_path.split('/')
+    response = requests.get(
+        f'https://{drs_path[-2]}/ga4gh/drs/v1/objects/{drs_path[-1]}',
+        headers=_get_gs_auth_api_headers(user),
+    )
+    if response.status_code != 200:
+        return None
+    #access = next((a['access_url'] for a in drs_info['access_methods'] if a.get('type') == 'https'), None)
+    return response.json()['name']
+
+
 @login_and_policies_required
 def fetch_igv_track(request, project_guid, igv_track_path):
 
@@ -198,8 +214,12 @@ def _stream_gs(request, gs_path):
                                  content_type='application/octet-stream')
 
 
+def _get_gs_auth_api_headers(user):
+    return {'Authorization': 'Bearer {}'.format(_get_access_token(user))}
+
+
 def _get_gs_rest_api_headers(range_header, gs_path, user=None):
-    headers = {'Authorization': 'Bearer {}'.format(_get_access_token(user))}
+    headers = _get_gs_auth_api_headers(user)
     if range_header:
         headers['Range'] = range_header
     google_project = get_google_project(gs_path)

From 944cd05d234175797336cccfcc174e8dc41ea7d7 Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Thu, 2 May 2024 15:58:52 -0400
Subject: [PATCH 02/51] stream drs uri for igv

---
 seqr/views/apis/igv_api.py | 54 +++++++++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 15 deletions(-)

diff --git a/seqr/views/apis/igv_api.py b/seqr/views/apis/igv_api.py
index 6d0ae529fe..262e354a9d 100644
--- a/seqr/views/apis/igv_api.py
+++ b/seqr/views/apis/igv_api.py
@@ -173,19 +173,26 @@ def update_individual_igv_sample(request, individual_guid):
         return create_json_response({'error': error}, status=400, reason=error)
 
 
-def _get_valid_file_name(file_path, user):
-    if not file_path.startswith('drs://'):
-        return file_path if does_file_exist(file_path, user=user) else None
+def _is_drs_uri_path(file_path):
+    return file_path.startswith('drs://')
+
 
+def _get_drs_info(file_path, user):
     drs_path = file_path.split('/')
     response = requests.get(
         f'https://{drs_path[-2]}/ga4gh/drs/v1/objects/{drs_path[-1]}',
         headers=_get_gs_auth_api_headers(user),
     )
-    if response.status_code != 200:
-        return None
-    #access = next((a['access_url'] for a in drs_info['access_methods'] if a.get('type') == 'https'), None)
-    return response.json()['name']
+
+    return response.json() if response.status_code == 200 else None
+
+
+def _get_valid_file_name(file_path, user):
+    if _is_drs_uri_path(file_path):
+        drs_info = _get_drs_info(file_path, user)
+        return None if drs_info is None else drs_info['name']
+
+    return file_path if does_file_exist(file_path, user=user) else None
 
 
 @login_and_policies_required
@@ -199,16 +206,35 @@ def fetch_igv_track(request, project_guid, igv_track_path):
     if is_google_bucket_file_path(igv_track_path):
         return _stream_gs(request, igv_track_path)
 
+    if _is_drs_uri_path(igv_track_path):
+        return _stream_drs(request, igv_track_path)
+
     return _stream_file(request, igv_track_path)
 
 
 def _stream_gs(request, gs_path):
-    headers = _get_gs_rest_api_headers(request.META.get('HTTP_RANGE'), gs_path, user=request.user)
+    return _stream_response(
+        url=f"{GS_STORAGE_URL}/{gs_path.replace('gs://', '', 1)}",
+        headers=_get_gs_rest_api_headers(gs_path, user=request.user),
+        request=request)
 
-    response = requests.get(
-        f"{GS_STORAGE_URL}/{gs_path.replace('gs://', '', 1)}",
-        headers=headers,
-        stream=True)
+
+def _stream_drs(request, file_path):
+    drs_info = _get_drs_info(file_path, request.user)
+    https_access = next(a['access_url'] for a in drs_info['access_methods'] if a['type'] == 'https')
+
+    return _stream_response(
+        https_access['url'],
+        headers=dict([h.split(': ') for h in https_access['headers']]),
+        request=request)
+
+
+def _stream_response(url, headers, request):
+    range_header = request.META.get('HTTP_RANGE')
+    if range_header:
+        headers['Range'] = range_header
+
+    response = requests.get(url, headers=headers, stream=True)
     return StreamingHttpResponse(response.iter_content(chunk_size=65536), status=response.status_code,
                                  content_type='application/octet-stream')
 
@@ -218,10 +244,8 @@ def _get_gs_auth_api_headers(user):
     return {'Authorization': 'Bearer {}'.format(_get_access_token(user))}
 
 
-def _get_gs_rest_api_headers(range_header, gs_path, user=None):
+def _get_gs_rest_api_headers(gs_path, user):
     headers = _get_gs_auth_api_headers(user)
-    if range_header:
-        headers['Range'] = range_header
     google_project = get_google_project(gs_path)
     if google_project:
         headers['x-goog-user-project'] = get_google_project(gs_path)

From c615b272e7bfff4dbe6f1ac868c80179b17a6057 Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Thu, 2 May 2024 16:32:40 -0400
Subject: [PATCH 03/51] actually store file type

---
 seqr/views/apis/igv_api.py | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/seqr/views/apis/igv_api.py b/seqr/views/apis/igv_api.py
index 262e354a9d..41f9845a50 100644
--- a/seqr/views/apis/igv_api.py
+++ b/seqr/views/apis/igv_api.py
@@ -147,13 +147,14 @@ def update_individual_igv_sample(request, individual_guid):
         if not file_path:
             raise ValueError('request must contain fields: filePath')
 
-        file_name = _get_valid_file_name(file_path, request.user)
-        if not file_name:
-            raise Exception('Error accessing "{}"'.format(file_path))
-        sample_type = next((st for suffix, st in SAMPLE_TYPE_MAP if file_name.endswith(suffix)), None)
+        original_file_path = file_path
+        file_path = _get_valid_file_path(file_path, request.user)
+        if not file_path:
+            raise Exception('Error accessing "{}"'.format(original_file_path))
+        sample_type = next((st for suffix, st in SAMPLE_TYPE_MAP if file_path.endswith(suffix)), None)
         if not sample_type:
             raise Exception('Invalid file extension for "{}" - valid extensions are {}'.format(
-                file_name, ', '.join([suffix for suffix, _ in SAMPLE_TYPE_MAP])))
+                file_path, ', '.join([suffix for suffix, _ in SAMPLE_TYPE_MAP])))
 
         sample, created = get_or_create_model_from_json(
             IgvSample, create_json={'individual': individual, 'sample_type': sample_type},
@@ -180,17 +181,17 @@ def _is_drs_uri_path(file_path):
 def _get_drs_info(file_path, user):
     drs_path = file_path.split('/')
     response = requests.get(
-        f'https://{drs_path[-2]}/ga4gh/drs/v1/objects/{drs_path[-1]}',
+        f'https://{drs_path[2]}/ga4gh/drs/v1/objects/{drs_path[3]}',
         headers=_get_gs_auth_api_headers(user),
     )
 
     return response.json() if response.status_code == 200 else None
 
 
-def _get_valid_file_name(file_path, user):
+def _get_valid_file_path(file_path, user):
     if _is_drs_uri_path(file_path):
         drs_info = _get_drs_info(file_path, user)
-        return None if drs_info is None else drs_info['name']
+        return None if drs_info is None else f"{file_path}/{drs_info['name']}"
 
     return file_path if does_file_exist(file_path, user=user) else None
 
@@ -221,12 +222,16 @@ def _stream_gs(request, gs_path):
 
 def _stream_drs(request, file_path):
     drs_info = _get_drs_info(file_path, request.user)
+
     https_access = next(a['access_url'] for a in drs_info['access_methods'] if a['type'] == 'https')
+    url = https_access['url']
+    headers = dict([h.split(': ') for h in https_access['headers']])
 
-    return _stream_response(
-        https_access['url'],
-        headers=dict([h.split(': ') for h in https_access['headers']]),
-        request=request)
+    # TODO this does not actually result in a valid index file
+    if file_path.endswith('.cram.crai'):
+        url = url.replace('.cram', '.crai')
+
+    return _stream_response(url, headers, request)

From 24de48d9b00b6335003bc409b63f723d0c54e7f5 Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Thu, 2 May 2024 17:12:25 -0400
Subject: [PATCH 04/51] use provided index url

---
 .../0064_igvsample_index_file_path.py         | 18 ++++++++++++++++
 seqr/models.py                                |  3 ++-
 seqr/views/apis/igv_api.py                    | 21 ++++++++++++-------
 ui/shared/components/form/IGVUploadField.jsx  |  2 +-
 .../components/panel/family/FamilyReads.jsx   |  8 +++----
 5 files changed, 38 insertions(+), 14 deletions(-)
 create mode 100644 seqr/migrations/0064_igvsample_index_file_path.py

diff --git a/seqr/migrations/0064_igvsample_index_file_path.py b/seqr/migrations/0064_igvsample_index_file_path.py
new file mode 100644
index 0000000000..437cabb23f
--- /dev/null
+++ b/seqr/migrations/0064_igvsample_index_file_path.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.2.23 on 2024-05-02 20:48
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('seqr', '0063_dynamicanalysisgroup'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='igvsample',
+            name='index_file_path',
+            field=models.TextField(blank=True, null=True),
+        ),
+    ]

diff --git a/seqr/models.py b/seqr/models.py
index aa89945fd6..46765f6702 100644
--- a/seqr/models.py
+++ b/seqr/models.py
@@ -742,6 +742,7 @@ class IgvSample(ModelWithGUID):
     individual = models.ForeignKey('Individual', on_delete=models.PROTECT)
     sample_type = models.CharField(max_length=15, choices=SAMPLE_TYPE_CHOICES)
     file_path = models.TextField()
+    index_file_path = models.TextField(null=True, blank=True)
     sample_id = models.TextField(null=True)
 
     def __unicode__(self):
@@ -753,7 +754,7 @@ def _compute_guid(self):
 
     class Meta:
         unique_together = ('individual', 'sample_type')
 
-    json_fields = ['guid', 'file_path', 'sample_type', 'sample_id']
+    json_fields = ['guid', 'file_path', 'index_file_path', 'sample_type', 'sample_id']
 
 
 class SavedVariant(ModelWithGUID):

diff --git a/seqr/views/apis/igv_api.py b/seqr/views/apis/igv_api.py
index 41f9845a50..3c14fd9e3a 100644
--- a/seqr/views/apis/igv_api.py
+++ b/seqr/views/apis/igv_api.py
@@ -31,7 +31,14 @@ def _process_alignment_records(rows, num_id_cols=1, **kwargs):
     parsed_records = defaultdict(list)
     for row in rows:
         row_id = row[0] if num_id_cols == 1 else tuple(row[:num_id_cols])
-        parsed_records[row_id].append({'filePath': row[num_id_cols], 'sampleId': row[num_cols] if len(row) > num_cols else None})
+        sample_id = None
+        index_file_path = None
+        if len(row) > num_cols:
+            if _is_drs_uri_path(row[num_cols]):
+                index_file_path = row[num_cols]
+            else:
+                sample_id = row[num_cols]
+        parsed_records[row_id].append({'filePath': row[num_id_cols], 'sampleId': sample_id, 'indexFilePath': index_file_path})
     return parsed_records
 
@@ -158,7 +165,10 @@ def update_individual_igv_sample(request, individual_guid):
         sample, created = get_or_create_model_from_json(
             IgvSample, create_json={'individual': individual, 'sample_type': sample_type},
-            update_json={'file_path': file_path, 'sample_id': request_json.get('sampleId')}, user=request.user)
+            update_json={
+                'file_path': file_path,
+                **{field: request_json.get(field) for field in ['sampleId', 'indexFilePath']}
+            }, user=request.user)
 
         response = {
             'igvSamplesByGuid': {
@@ -224,14 +234,9 @@ def _stream_drs(request, file_path):
     drs_info = _get_drs_info(file_path, request.user)
 
     https_access = next(a['access_url'] for a in drs_info['access_methods'] if a['type'] == 'https')
-    url = https_access['url']
     headers = dict([h.split(': ') for h in https_access['headers']])
 
-    # TODO this does not actually result in a valid index file
-    if file_path.endswith('.cram.crai'):
-        url = url.replace('.cram', '.crai')
-
-    return _stream_response(url, headers, request)
+    return _stream_response(https_access['url'], headers, request)

diff --git a/ui/shared/components/form/IGVUploadField.jsx b/ui/shared/components/form/IGVUploadField.jsx
index dc2f412fa6..af244d2e96 100644
--- a/ui/shared/components/form/IGVUploadField.jsx
+++ b/ui/shared/components/form/IGVUploadField.jsx
@@ -39,7 +39,7 @@ IgvDropzoneLabel.propTypes = {
 const NO_PROJECT_COLUMNS = [
   'Individual ID',
   'IGV Track File Path',
-  'gCNV Sample ID, to identify the sample in the gCNV batch path. Not used for other track types',
+  'For gCNV data: Sample ID, to identify the sample in the gCNV batch path. For other track types: Index File Path',
 ]
 
 // eslint-disable-next-line react-perf/jsx-no-new-array-as-prop

diff --git a/ui/shared/components/panel/family/FamilyReads.jsx b/ui/shared/components/panel/family/FamilyReads.jsx
index ddea5e7f07..bba09bd91f 100644
--- a/ui/shared/components/panel/family/FamilyReads.jsx
+++ b/ui/shared/components/panel/family/FamilyReads.jsx
@@ -30,6 +30,8 @@ const IGV = React.lazy(() => import('../../graph/IGV'))
 
 const MIN_LOCUS_RANGE_SIZE = 100
 
+const igvUrl = (sample, field = 'filePath') => `/api/project/${sample.projectGuid}/igv_track/${encodeURIComponent(sample[field])}`
+
 const getTrackOptions = (type, sample, individual) => {
   const name = ReactDOMServer.renderToString(
@@ -38,9 +40,7 @@ const getTrackOptions = (type, sample, individual) => {
     </span>,
   )
 
-  const url = `/api/project/${sample.projectGuid}/igv_track/${encodeURIComponent(sample.filePath)}`
-
-  return { url, name, type, ...TRACK_OPTIONS[type] }
+  return { url: igvUrl(sample), name, type, ...TRACK_OPTIONS[type] }
 }
 
 const getSampleColor = individual => (individual.affected === AFFECTED ? 'red' : 'blue')
@@ -75,7 +75,7 @@ const getIgvTracks = (igvSampleIndividuals, sortedIndividuals, sampleTypes) => {
     if (sample.filePath.endsWith('.cram')) {
       Object.assign(track, {
         format: 'cram',
-        indexURL: `${track.url}.crai`,
+        indexURL: sample.indexFilePath ? igvUrl(sample, 'indexFilePath') : `${track.url}.crai`,
       })
     } else {
       Object.assign(track, BAM_TRACK_OPTIONS)

From 59a04ba3d0825ee656dc9719983993334c066a00 Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Thu, 2 May 2024 17:13:26 -0400
Subject: [PATCH 05/51] add validation for index files

---
 seqr/views/apis/igv_api.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/seqr/views/apis/igv_api.py b/seqr/views/apis/igv_api.py
index 3c14fd9e3a..31f84eae7c 100644
--- a/seqr/views/apis/igv_api.py
+++ b/seqr/views/apis/igv_api.py
@@ -162,6 +162,8 @@ def update_individual_igv_sample(request, individual_guid):
         if not sample_type:
             raise Exception('Invalid file extension for "{}" - valid extensions are {}'.format(
                 file_path, ', '.join([suffix for suffix, _ in SAMPLE_TYPE_MAP])))
+        if _is_drs_uri_path(file_path) and not request_json.get('indexFilePath'):
+            raise Exception('Index File Path is required for DRS URIs')
 
         sample, created = get_or_create_model_from_json(
             IgvSample, create_json={'individual': individual, 'sample_type': sample_type},

From 3207fce3572dce069ce9c2cfb52f61ce4f067b58 Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Mon, 3 Jun 2024 11:25:51 -0400
Subject: [PATCH 06/51] fix migration

---
 CHANGELOG.md                                                  | 1 +
 ...e_index_file_path.py => 0067_igvsample_index_file_path.py} | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)
 rename seqr/migrations/{0064_igvsample_index_file_path.py => 0067_igvsample_index_file_path.py} (74%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 64626f8078..56a31c866d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,7 @@
 # _seqr_ Changes
 
 ## dev
+* Adds index_file_path to IGV Sample model (REQUIRES DB MIGRATION)
 
 ## 5/24/24
 * Adds external_data to Family model (REQUIRES DB MIGRATION)

diff --git a/seqr/migrations/0064_igvsample_index_file_path.py b/seqr/migrations/0067_igvsample_index_file_path.py
similarity index 74%
rename from seqr/migrations/0064_igvsample_index_file_path.py
rename to seqr/migrations/0067_igvsample_index_file_path.py
index 437cabb23f..56fd82555f 100644
--- a/seqr/migrations/0064_igvsample_index_file_path.py
+++ b/seqr/migrations/0067_igvsample_index_file_path.py
@@ -1,4 +1,4 @@
-# Generated by Django 3.2.23 on 2024-05-02 20:48
+# Generated by Django 3.2.23 on 2024-06-03 15:25
 
 from django.db import migrations, models
 
@@ -6,7 +6,7 @@ class Migration(migrations.Migration):
 
     dependencies = [
-        ('seqr', '0063_dynamicanalysisgroup'),
+        ('seqr', '0066_family_post_discovery_mondo_id'),
     ]
 
     operations = [

From ac9ac079c5de55d2017ad32a20efa4048be48773 Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Mon, 3 Jun 2024 11:32:53 -0400
Subject: [PATCH 07/51] allow index URI for non-DRS alignment files

---
 seqr/views/apis/igv_api.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/seqr/views/apis/igv_api.py b/seqr/views/apis/igv_api.py
index 822ebcc124..dff66d2e4d 100644
--- a/seqr/views/apis/igv_api.py
+++ b/seqr/views/apis/igv_api.py
@@ -32,13 +32,14 @@ def _process_alignment_records(rows, num_id_cols=1, **kwargs):
     parsed_records = defaultdict(list)
     for row in rows:
         row_id = row[0] if num_id_cols == 1 else tuple(row[:num_id_cols])
+        file_path = row[num_id_cols]
         sample_id = None
         index_file_path = None
         if len(row) > num_cols:
-            if _is_drs_uri_path(row[num_cols]):
-                index_file_path = row[num_cols]
-            else:
+            if file_path.endswith(GCNV_FILE_EXTENSIONS):
                 sample_id = row[num_cols]
+            else:
+                index_file_path = row[num_cols]
         parsed_records[row_id].append({'filePath': row[num_id_cols], 'sampleId': sample_id, 'indexFilePath': index_file_path})
     return parsed_records
 
@@ -141,6 +142,8 @@ def _get_valid_matched_individuals(individual_dataset_mapping):
     ('bed.gz', IgvSample.SAMPLE_TYPE_GCNV),
 ]
 
+GCNV_FILE_EXTENSIONS = tuple(ext for ext, sample_type in SAMPLE_TYPE_MAP if sample_type == IgvSample.SAMPLE_TYPE_GCNV)
+
 
 @pm_or_data_manager_required
 def update_individual_igv_sample(request, individual_guid):

From dd2740bfc22a8c9bc65084205174f2857ac91b2f Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Fri, 19 Jul 2024 16:29:22 -0400
Subject: [PATCH 08/51] shared sample ID utility

---
 seqr/views/utils/airtable_utils.py       | 18 ++++++++++++++++++
 seqr/views/utils/anvil_metadata_utils.py | 18 +-----------------
 2 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/seqr/views/utils/airtable_utils.py b/seqr/views/utils/airtable_utils.py
index 63a8767611..eb1a4f8d1b 100644
--- a/seqr/views/utils/airtable_utils.py
+++ b/seqr/views/utils/airtable_utils.py
@@ -104,3 +104,21 @@ def _populate_records(self, record_type, records, offset=None):
 
         if response_json.get('offset'):
             self._populate_records(record_type, records, offset=response_json['offset'])
+
+    def _get_samples_for_id_field(self, sample_ids, id_field, fields):
+        raw_records = self.fetch_records(
+            'Samples', fields=[id_field] + fields,
+            or_filters={f'{{{id_field}}}': sample_ids},
+        )
+
+        records_by_id = defaultdict(list)
+        for airtable_id, record in raw_records.items():
+            records_by_id[record[id_field]].append({**record, 'airtable_id': airtable_id})
+        return records_by_id
+
+    def get_samples_for_sample_ids(self, sample_ids, fields):
+        records_by_id = self._get_samples_for_id_field(sample_ids, 'CollaboratorSampleID', fields)
+        missing = set(sample_ids) - set(records_by_id.keys())
+        if missing:
+            records_by_id.update(self._get_samples_for_id_field(missing, 'SeqrCollaboratorSampleID', fields))
+        return records_by_id

diff --git a/seqr/views/utils/anvil_metadata_utils.py b/seqr/views/utils/anvil_metadata_utils.py
index 2f47aca5d0..f3a71193e1 100644
--- a/seqr/views/utils/anvil_metadata_utils.py
+++ b/seqr/views/utils/anvil_metadata_utils.py
@@ -550,27 +550,11 @@ def _get_variant_inheritance(individual, genotypes):
 LIST_SAMPLE_FIELDS = ['SequencingProduct', 'dbgap_submission']
 
 
-def _get_airtable_samples_for_id_field(sample_ids, id_field, fields, session):
-    raw_records = session.fetch_records(
-        'Samples', fields=[id_field] + fields,
-        or_filters={f'{{{id_field}}}': sample_ids},
-    )
-
-    records_by_id = defaultdict(list)
-    for airtable_id, record in raw_records.items():
-        records_by_id[record[id_field]].append({**record, 'airtable_id': airtable_id})
-    return records_by_id
-
-
 def _get_sample_airtable_metadata(sample_ids, user, airtable_fields):
     fields, list_fields = airtable_fields or [SINGLE_SAMPLE_FIELDS, LIST_SAMPLE_FIELDS]
     all_fields = fields + list_fields
 
-    session = AirtableSession(user)
-    records_by_id = _get_airtable_samples_for_id_field(sample_ids, 'CollaboratorSampleID', all_fields, session)
-    missing = set(sample_ids) - set(records_by_id.keys())
-    if missing:
-        records_by_id.update(_get_airtable_samples_for_id_field(missing, 'SeqrCollaboratorSampleID', all_fields, session))
+    records_by_id = AirtableSession(user).get_samples_for_sample_ids(sample_ids, all_fields)
 
     sample_records = {}
     for record_id, records in records_by_id.items():

From d0646073ac8101254537a10e2f3f14ed975829e3 Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Fri, 19 Jul 2024 17:11:16 -0400
Subject: [PATCH 09/51] check if missing samples already loaded

---
 seqr/views/apis/data_manager_api.py | 34 +++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py
index acbf6720b6..837790f091 100644
--- a/seqr/views/apis/data_manager_api.py
+++ b/seqr/views/apis/data_manager_api.py
@@ -451,6 +451,10 @@ def write_pedigree(request, project_guid):
     'On hold for phenotips, but ready to load',
     'Methods (Loading)',
 ]
+AVAILABLE_PDO_STATUSES = {
+    'Available in seqr',
+    'Historic',
+}
 
 
 @pm_or_data_manager_required
@@ -518,7 +522,7 @@ def load_data(request):
     additional_project_files = None
     individual_ids = None
     if dataset_type == Sample.DATASET_TYPE_VARIANT_CALLS:
-        individual_ids = _get_valid_project_samples(project_samples, sample_type)
+        individual_ids = _get_valid_project_samples(project_samples, sample_type, request.user)
         additional_project_files = {
            project_guid: (f'{project_guid}_ids', ['s'], [{'s': sample_id} for sample_id in sample_ids])
            for project_guid, sample_ids in project_samples.items()
        }
@@ -534,7 +538,7 @@ def load_data(request):
 
     return create_json_response({'success': True})
 
 
-def _get_valid_project_samples(project_samples, sample_type):
+def _get_valid_project_samples(project_samples, sample_type, user):
     individuals = {
         (i['project'], i['individual_id']): i for i in Individual.objects.filter(family__project__guid__in=project_samples).values(
             'id', 'individual_id',
@@ -560,11 +564,20 @@ def _get_valid_project_samples(project_samples, sample_type):
     if missing_samples:
         errors.append(f'The following samples are included in airtable but missing from seqr: {", ".join(missing_samples)}')
 
-    missing_family_samples = defaultdict(list)
+    missing_samples = {}
     for (project, sample_id), individual in individuals.items():
         family_key = (project, individual['family_name'])
         if sample_id not in project_samples[project] and family_key in airtable_families and individual['sampleCount']:
-            missing_family_samples[family_key].append(sample_id)
+            missing_samples[(project, sample_id)] = individual
+
+    loaded_samples = _get_loaded_samples(missing_samples.keys(), user) if missing_samples else []
+
+    missing_family_samples = defaultdict(list)
+    for (project, sample_id), individual in missing_samples.items():
+        if (project, sample_id) in loaded_samples:
+            individual_ids.append(individual['id'])
+        else:
+            missing_family_samples[(project, individual['family_name'])].append(sample_id)
 
     if missing_family_samples:
         family_errors = [
@@ -578,6 +591,19 @@ def _get_valid_project_samples(project_samples, sample_type):
     return individual_ids
 
 
+def _get_loaded_samples(sample_keys, user):
+    sample_ids = [sample_id for _, sample_id in sample_keys]
+    samples_by_id = AirtableSession(user).get_samples_for_sample_ids(sample_ids, ['PDOStatus', 'SeqrProject'])
+    return [(project, sample_id) for project, sample_id in sample_keys if any(
+        _is_loaded_airtable_sample(s, project) for s in samples_by_id.get(sample_id, [])
+    )]
+
+
+def _is_loaded_airtable_sample(sample, project_guid):
+    return f'https://seqr.broadinstitute.org/project/{project_guid}/project_page' in sample['SeqrProject'] and any(
+        status in AVAILABLE_PDO_STATUSES for status in sample['PDOStatus'])
+
+
 # Hop-by-hop HTTP response headers shouldn't be forwarded.
 # More info at: http://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.5.1
 EXCLUDE_HTTP_RESPONSE_HEADERS = {

From aef856e2a75e5a9469f7bbdc0cd3ef99daae86d0 Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Mon, 22 Jul 2024 11:09:42 -0400
Subject: [PATCH 10/51] merge separate class for load data test

---
 seqr/views/apis/data_manager_api_tests.py | 42 +++++++++++------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/seqr/views/apis/data_manager_api_tests.py b/seqr/views/apis/data_manager_api_tests.py
index 6c9290c0cb..57847aeef0 100644
--- a/seqr/views/apis/data_manager_api_tests.py
+++ b/seqr/views/apis/data_manager_api_tests.py
@@ -11,7 +11,7 @@
     update_rna_seq, load_rna_seq_sample_data, load_phenotype_prioritization_data, write_pedigree, validate_callset, \
     get_loaded_projects, load_data
 from seqr.views.utils.orm_to_json_utils import _get_json_for_models
-from seqr.views.utils.test_utils import AuthenticationTestCase, AnvilAuthenticationTestCase, AirflowTestCase, AirtableTest
+from seqr.views.utils.test_utils import AuthenticationTestCase, AirflowTestCase, AirtableTest
 from seqr.utils.search.elasticsearch.es_utils_tests import urllib3_responses
 from seqr.models import Individual, RnaSeqOutlier, RnaSeqTpm, RnaSeqSpliceOutlier, Sample, Project, PhenotypePrioritization
 from settings import SEQR_SLACK_LOADING_NOTIFICATION_CHANNEL
@@ -1495,9 +1495,15 @@ def _add_file_iter(self, stdout):
         self.mock_file_iter.return_value += stdout
 
 
-class AnvilDataManagerAPITest(AnvilAuthenticationTestCase, DataManagerAPITest):
+@mock.patch('seqr.views.utils.permissions_utils.PM_USER_GROUP', 'project-managers')
+class AnvilDataManagerAPITest(AirflowTestCase, DataManagerAPITest):
     fixtures = ['users', '1kg_project', 'reference_data']
 
+    DAG_NAME = 'v03_pipeline-MITO'
+    SECOND_DAG_NAME = 'v03_pipeline-GCNV'
+    LOADING_PROJECT_GUID = 'R0004_non_analyst_project'
+    PROJECTS = [PROJECT_GUID, LOADING_PROJECT_GUID]
+
     def setUp(self):
         patcher = mock.patch('seqr.utils.file_utils.subprocess.Popen')
         self.mock_subprocess = patcher.start()
@@ -1547,16 +1553,6 @@ def test_get_loaded_projects(self, *args, **kwargs):
         # Test relies on the local-only project data, and has no real difference for local/ non-local behavior
         pass
 
-
-@mock.patch('seqr.views.utils.permissions_utils.PM_USER_GROUP', 'project-managers')
-class LoadDataAPITest(AirflowTestCase):
-    fixtures = ['users', 'social_auth', '1kg_project']
-
-    DAG_NAME = 'v03_pipeline-MITO'
-    SECOND_DAG_NAME = 'v03_pipeline-GCNV'
-    LOADING_PROJECT_GUID = 'R0004_non_analyst_project'
-    PROJECTS = [PROJECT_GUID, LOADING_PROJECT_GUID]
-
     @staticmethod
     def _get_dag_variable_overrides(*args, **kwargs):
         return {
@@ -1568,14 +1564,16 @@ def _get_dag_variable_overrides(*args, **kwargs):
     @responses.activate
     @mock.patch('seqr.views.utils.export_utils.open')
     @mock.patch('seqr.views.utils.export_utils.TemporaryDirectory')
-    @mock.patch('seqr.utils.file_utils.subprocess.Popen')
-    def test_load_data(self, mock_subprocess, mock_temp_dir, mock_open):
+    def test_load_data(self, mock_temp_dir, mock_open):
         url = reverse(load_data)
         self.check_pm_login(url)
 
         mock_temp_dir.return_value.__enter__.return_value = '/mock/tmp'
-        mock_subprocess.return_value.wait.return_value = 0
-        mock_subprocess.return_value.communicate.return_value = b'', b'File not found'
+        mock_subprocess = mock.MagicMock()
+        self.mock_subprocess.side_effect = None
+        self.mock_subprocess.return_value = mock_subprocess
+        mock_subprocess.wait.return_value = 0
+        mock_subprocess.communicate.return_value = b'', b'File not found'
         body = {'filePath': 'gs://test_bucket/mito_callset.mt', 'datasetType': 'MITO', 'sampleType': 'WGS', 'projects': [
             json.dumps({'projectGuid': 'R0001_1kg'}), json.dumps(PROJECT_OPTION), json.dumps({'projectGuid': 'R0005_not_project'}),
        ]}
@@ -1589,7 +1587,7 @@ def test_load_data(self, mock_temp_dir, mock_open):
         self.assertDictEqual(response.json(), {'success': True})
         self.assert_airflow_calls()
-        self._has_expected_gs_calls(mock_subprocess, mock_open, 'MITO')
+        self._has_expected_gs_calls(mock_open, 'MITO')
 
         dag_json = """{
     "projects_to_run": [
@@ -1618,7 +1616,7 @@ def test_load_data(self, mock_temp_dir, mock_open):
         mock_open.reset_mock()
         responses.calls.reset()
         mock_subprocess.reset_mock()
-        mock_subprocess.return_value.communicate.return_value = b'gs://seqr-datasets/v02/GRCh38/RDG_WES_Broad_Internal_SV/\ngs://seqr-datasets/v02/GRCh38/RDG_WGS_Broad_Internal_SV/v01/\ngs://seqr-datasets/v02/GRCh38/RDG_WES_Broad_Internal_GCNV/v02/', b''
+        mock_subprocess.communicate.return_value = b'gs://seqr-datasets/v02/GRCh38/RDG_WES_Broad_Internal_SV/\ngs://seqr-datasets/v02/GRCh38/RDG_WGS_Broad_Internal_SV/v01/\ngs://seqr-datasets/v02/GRCh38/RDG_WES_Broad_Internal_GCNV/v02/', b''
 
         body.update({'datasetType': 'SV', 'filePath': 'gs://test_bucket/sv_callset.vcf', 'sampleType': 'WES'})
         response = self.client.post(url, content_type='application/json', data=json.dumps(body))
@@ -1626,7 +1624,7 @@ def test_load_data(self, mock_temp_dir, mock_open):
         self.assertDictEqual(response.json(), {'success': True})
         self.assert_airflow_calls(trigger_error=True, secondary_dag_name=self.SECOND_DAG_NAME)
 
-        self._has_expected_gs_calls(mock_subprocess, mock_open, 'SV', is_second_dag=True, sample_type='WES')
+        self._has_expected_gs_calls(mock_open, 'SV', is_second_dag=True, sample_type='WES')
         self.mock_airflow_logger.warning.assert_not_called()
         self.mock_airflow_logger.error.assert_called_with(mock.ANY, self.pm_user)
         errors = [call.args[0] for call in self.mock_airflow_logger.error.call_args_list]
@@ -1669,9 +1667,9 @@ def test_load_data(self, mock_temp_dir, mock_open):
         response = self.client.post(url, content_type='application/json', data=json.dumps(body))
         self.assertEqual(response.status_code, 200)
         self.assertDictEqual(response.json(), {'success': True})
-        self._has_expected_gs_calls(mock_subprocess, mock_open, 'SNV_INDEL', has_project_subset=True)
+        self._has_expected_gs_calls(mock_open, 'SNV_INDEL', has_project_subset=True)
 
-    def _has_expected_gs_calls(self, mock_subprocess, mock_open, dataset_type, sample_type='WGS', has_project_subset=False, **kwargs):
+    def _has_expected_gs_calls(self, mock_open, dataset_type, sample_type='WGS', has_project_subset=False, **kwargs):
         projects = self.PROJECTS[1:] if has_project_subset else self.PROJECTS
         mock_open.assert_has_calls([
             mock.call(f'/mock/tmp/{project}_pedigree.tsv', 'w') for project in projects
@@ -1698,7 +1696,7 @@ def _has_expected_gs_calls(self, mock_open, dataset_type, sample_type='WGS', has
             ['R0004_non_analyst_project', 'F000014_14', '14', 'NA21987', '', '', 'M'],
         ])
 
-        mock_subprocess.assert_has_calls([
+        self.mock_subprocess.assert_has_calls([
             mock.call(
                 f'gsutil mv /mock/tmp/* gs://seqr-datasets/v02/GRCh38/RDG_{sample_type}_Broad_Internal/base/projects/{project}/',
                 stdout=-1, stderr=-2, shell=True,  # nosec

From 95265c210baa56363a3de9346c7df1401603592f Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Mon, 22 Jul 2024 11:52:30 -0400
Subject: [PATCH 11/51] test check airtable for loaded samples

---
 seqr/fixtures/social_auth.json              | 16 ++++++
 seqr/views/apis/data_manager_api_tests.py   | 58 +++++++++++++++++++--
 seqr/views/apis/variant_search_api_tests.py | 15 +++---
 3 files changed, 80 insertions(+), 9 deletions(-)

diff --git a/seqr/fixtures/social_auth.json b/seqr/fixtures/social_auth.json
index e092f257fa..b4642e5919 100644
--- a/seqr/fixtures/social_auth.json
+++ b/seqr/fixtures/social_auth.json
@@ -95,5 +95,21 @@
     "created": "2020-03-12T23:09:54.180Z",
     "modified": "2020-03-12T23:09:54.180Z"
   }
+}, {
+  "model": "social_django.usersocialauth",
+  "pk": 7,
+  "fields": {
+    "user": 16,
+    "provider": "google-oauth2",
+    "uid": "test_data_manager@broadinstitute.org",
+    "extra_data": {
+      "expires": 3599,
+      "auth_time": 1603287741,
+      "token_type": "Bearer",
+      "access_token": "ya29.EXAMPLE"
+    },
+    "created": "2020-03-12T23:09:54.180Z",
+    "modified": "2020-03-12T23:09:54.180Z"
+  }
 }
 ]

diff --git a/seqr/views/apis/data_manager_api_tests.py b/seqr/views/apis/data_manager_api_tests.py
index 57847aeef0..ace672dd41 100644
--- a/seqr/views/apis/data_manager_api_tests.py
+++ b/seqr/views/apis/data_manager_api_tests.py
@@ -435,6 +435,30 @@
     },
   ]
 }
+AIRTABLE_SAMPLE_RECORDS = {
+    'records': [
+        {
+            'id': 'recW24C2CJW5lT64K',
+            'fields': {
+                'CollaboratorSampleID': 'NA19678',
+                'SeqrProject': ['https://seqr.broadinstitute.org/project/R0001_1kg/project_page'],
+                'PDOStatus': ['Available in seqr'],
+            }
+        },
+    ],
+}
+AIRTABLE_SECONDARY_SAMPLE_RECORDS = {
+    'records': [
+        {
+            'id': 'recW24C2CJW5lT64K',
+            'fields': {
+                'SeqrCollaboratorSampleID': 'NA21234',
+                'SeqrProject': ['https://seqr.broadinstitute.org/project/R0004_non_analyst_project/project_page'],
+                'PDOStatus': ['Hold for phenotips'],
+            }
+        },
+    ],
+}
 
 
 @mock.patch('seqr.views.utils.permissions_utils.PM_USER_GROUP', 'project-managers')
@@ -1497,7 +1521,7 @@ def _add_file_iter(self, stdout):
 @mock.patch('seqr.views.utils.permissions_utils.PM_USER_GROUP', 'project-managers')
 class AnvilDataManagerAPITest(AirflowTestCase, DataManagerAPITest):
-    fixtures = ['users', '1kg_project', 'reference_data']
+    fixtures = ['users', 'social_auth', '1kg_project', 'reference_data']
 
     DAG_NAME = 'v03_pipeline-MITO'
     SECOND_DAG_NAME = 'v03_pipeline-GCNV'
     LOADING_PROJECT_GUID = 'R0004_non_analyst_project'
@@ -1653,21 +1677,49 @@ def test_load_data(self, mock_temp_dir, mock_open):
         sample_ids = PROJECT_SAMPLES_OPTION['sampleIds']
         body['projects'] = [json.dumps({**PROJECT_OPTION, 'sampleIds': [sample_ids[1]]})]
+        airtable_samples_url = 'https://api.airtable.com/v0/app3Y97xtbbaOopVR/Samples'
+        responses.add(responses.GET, airtable_samples_url, json=AIRTABLE_SAMPLE_RECORDS, status=200)
+        responses.add(responses.GET, airtable_samples_url, json=AIRTABLE_SECONDARY_SAMPLE_RECORDS, status=200)
+
+        # Non-Broad users can not access airtable
+        response = self.client.post(url, content_type='application/json', data=json.dumps(body))
+        self.assertEqual(response.status_code, 403)
+
+        responses.calls.reset()
+        self.login_data_manager_user()
         response = self.client.post(url, content_type='application/json', data=json.dumps(body))
         self.assertEqual(response.status_code, 400)
         self.assertDictEqual(response.json(), {
             'warnings': None,
             'errors': ['The following families have previously loaded samples absent from airtable: 14 (NA21234)'],
         })
+        self.assert_expected_airtable_call(
+            call_index=0,
+            filter_formula="OR({CollaboratorSampleID}='NA21234')",
+            fields=['CollaboratorSampleID', 'PDOStatus', 'SeqrProject'],
+        )
+        self.assert_expected_airtable_call(
+            call_index=1,
+            filter_formula="OR({SeqrCollaboratorSampleID}='NA21234')",
+            fields=['SeqrCollaboratorSampleID', 'PDOStatus', 'SeqrProject'],
+        )
 
-        body['projects'] = [
-            json.dumps({'projectGuid': 'R0001_1kg', 'sampleIds': ['NA19675_1', 'NA19678', 'NA19679']}),
+        responses.calls.reset()
+        responses.add(responses.GET, airtable_samples_url, json=AIRTABLE_SAMPLE_RECORDS, status=200)
+        body['projects'] = [ # TODO
+            json.dumps({'projectGuid': 'R0001_1kg', 'sampleIds': ['NA19675_1', 'NA19679']}),
             json.dumps({**PROJECT_OPTION, 'sampleIds': sample_ids[:2]}),
         ]
         response = self.client.post(url, content_type='application/json', data=json.dumps(body))
         self.assertEqual(response.status_code, 200)
         self.assertDictEqual(response.json(), {'success': True})
         self._has_expected_gs_calls(mock_open, 'SNV_INDEL', has_project_subset=True)
+        self.assert_expected_airtable_call(
+            call_index=0,
+            filter_formula="OR({CollaboratorSampleID}='NA19678')",
+            fields=['CollaboratorSampleID', 'PDOStatus', 'SeqrProject'],
+        )

diff --git a/seqr/views/apis/variant_search_api_tests.py b/seqr/views/apis/variant_search_api_tests.py
index 5142b2154f..0cf449d4f0 100644
--- a/seqr/views/apis/variant_search_api_tests.py
+++ b/seqr/views/apis/variant_search_api_tests.py
@@ -1025,15 +1025,18 @@ def assert_no_list_ws_has_al(self, acl_call_count, group_call_count, workspace_n
     assert_ws_has_al(self, acl_call_count, group_call_count, workspace_name)
 
 
-def assert_has_list_ws(self):
-    self.mock_list_workspaces.assert_has_calls([
+def assert_has_list_ws(self, has_data_manager=False):
+    calls = [
         mock.call(self.no_access_user),
         mock.call(self.collaborator_user),
-    ])
+    ]
+    if has_data_manager:
+        calls.insert(1, mock.call(self.data_manager_user))
+    self.mock_list_workspaces.assert_has_calls(calls)
 
 
-def assert_no_al_has_list_ws(self, group_count=1):
-    assert_has_list_ws(self)
+def assert_no_al_has_list_ws(self, group_count=1, has_data_manager=False):
+    assert_has_list_ws(self, has_data_manager)
     self.mock_get_ws_access_level.assert_not_called()
     assert_workspace_calls(self, group_count)
 
@@ -1067,7 +1070,7 @@ def test_query_variants(self, *args):
 
     def test_query_all_projects_variants(self, *args):
         super(AnvilVariantSearchAPITest, self).test_query_all_projects_variants(*args)
-        assert_no_al_has_list_ws(self, group_count=3)
+        assert_no_al_has_list_ws(self, group_count=3, has_data_manager=True)
 
     def test_query_all_project_families_variants(self, *args):
         super(AnvilVariantSearchAPITest, self).test_query_all_project_families_variants(*args)

From c9bb84569b13a440dbc3bdf112bbc62abc1013f9 Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Mon, 22 Jul 2024 11:53:03 -0400
Subject: [PATCH 12/51] test check airtable for loaded samples

---
 seqr/views/apis/data_manager_api.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py
index 837790f091..f2bf9e5b8e 100644
--- a/seqr/views/apis/data_manager_api.py
+++ b/seqr/views/apis/data_manager_api.py
@@ -527,6 +527,7 @@ def load_data(request):
             project_guid: (f'{project_guid}_ids', ['s'], [{'s': sample_id} for sample_id in sample_ids])
             for project_guid, sample_ids in project_samples.items()
         }
+    import pdb; pdb.set_trace()
 
     success_message = f'*{request.user.email}* triggered loading internal {sample_type} {dataset_type} data for {len(projects)} projects'
     trigger_data_loading(

From 17b78d8a3fa3112b781bfc53d5d03918609a9b85 Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Mon, 22 Jul 2024 12:02:49 -0400
Subject: [PATCH 13/51] correctly update sample subset

---
 seqr/views/apis/data_manager_api.py       |  2 +-
 seqr/views/apis/data_manager_api_tests.py | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py
index f2bf9e5b8e..d6bc359e08 100644
--- a/seqr/views/apis/data_manager_api.py
+++ b/seqr/views/apis/data_manager_api.py
@@ -527,7 +527,6 @@ def load_data(request):
             project_guid: (f'{project_guid}_ids', ['s'], [{'s': sample_id} for sample_id in sample_ids])
             for project_guid, sample_ids in project_samples.items()
         }
-    import pdb; pdb.set_trace()
 
     success_message = f'*{request.user.email}* triggered loading internal {sample_type} {dataset_type} data for {len(projects)} projects'
     trigger_data_loading(
@@ -577,6 +576,7 @@ def _get_valid_project_samples(project_samples, sample_type, user):
     for (project, sample_id), individual in missing_samples.items():
         if (project, sample_id) in loaded_samples:
             individual_ids.append(individual['id'])
+            project_samples[project].append(sample_id)
         else:
             missing_family_samples[(project, individual['family_name'])].append(sample_id)

diff --git a/seqr/views/apis/data_manager_api_tests.py b/seqr/views/apis/data_manager_api_tests.py
index ace672dd41..f3b8bb130e 100644
--- a/seqr/views/apis/data_manager_api_tests.py
+++ b/seqr/views/apis/data_manager_api_tests.py
@@ -1704,9 +1704,10 @@ def test_load_data(self, mock_temp_dir, mock_open):
             fields=['SeqrCollaboratorSampleID', 'PDOStatus', 'SeqrProject'],
         )
 
+        mock_subprocess.reset_mock()
         responses.calls.reset()
         responses.add(responses.GET, airtable_samples_url, json=AIRTABLE_SAMPLE_RECORDS, status=200)
-        body['projects'] = [ # TODO
+        body['projects'] = [
             json.dumps({'projectGuid': 'R0001_1kg', 'sampleIds': ['NA19675_1', 'NA19679']}),
             json.dumps({**PROJECT_OPTION, 'sampleIds': sample_ids[:2]}),
         ]
+        body['sampleType'] = 'WES'
         response = self.client.post(url, content_type='application/json', data=json.dumps(body))
         self.assertEqual(response.status_code, 200)
         self.assertDictEqual(response.json(), {'success': True})
-        self._has_expected_gs_calls(mock_open, 'SNV_INDEL', has_project_subset=True)
+        self._has_expected_gs_calls(mock_open, 'SNV_INDEL', sample_type='WES', has_project_subset=True)
         self.assert_expected_airtable_call(
             call_index=0,
             filter_formula="OR({CollaboratorSampleID}='NA19678')",
@@ -1722,9 +1723,8 @@ def test_load_data(self, mock_temp_dir, mock_open):
         )
 
     def _has_expected_gs_calls(self, mock_open, dataset_type, sample_type='WGS', has_project_subset=False, **kwargs):
-        projects = self.PROJECTS[1:] if has_project_subset else self.PROJECTS
         mock_open.assert_has_calls([
-            mock.call(f'/mock/tmp/{project}_pedigree.tsv', 'w') for project in projects
+            mock.call(f'/mock/tmp/{project}_pedigree.tsv', 'w') for project in self.PROJECTS
         ], any_order=True)
         files = [
             [row.split('\t') for row in write_call.args[0].split('\n')]
@@ -1733,7 +1733,7 @@ def _has_expected_gs_calls(self, mock_open, dataset_type, sample_type='WGS', has
         self.assertEqual(len(files), 4 if has_project_subset else 2)
         if has_project_subset:
             self.assertEqual(len(files[1]), 4)
-            self.assertListEqual(files[1], [['s'], ['NA19675_1'], ['NA19678'], ['NA19679']])
+            self.assertListEqual(files[1], [['s'], ['NA19675_1'], ['NA19679'], ['NA19678']])
             self.assertEqual(len(files[3]), 3)
             self.assertListEqual(files[3], [['s'], ['NA21234'], ['NA21987']])
@@ -1752,10 +1752,10 @@ def _has_expected_gs_calls(self, mock_open, dataset_type, sample_type='WGS', has
             mock.call(
                 f'gsutil mv /mock/tmp/* gs://seqr-datasets/v02/GRCh38/RDG_{sample_type}_Broad_Internal/base/projects/{project}/',
                 stdout=-1, stderr=-2, shell=True,  # nosec
-            ) for project in projects
+            ) for project in self.PROJECTS
         ] + [
             mock.call(
                 f'gsutil rsync -r gs://seqr-datasets/v02/GRCh38/RDG_{sample_type}_Broad_Internal/base/projects/{project}/ gs://seqr-loading-temp/v03/GRCh38/{sample_type}/{dataset_type}/pedigrees/',
                 stdout=-1, stderr=-2, shell=True,  # nosec
-            ) for project in projects
+            ) for project in self.PROJECTS
         ], any_order=True)

From 47c55d3a67c4031eb02f03734d9477511b9c43ca Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Mon, 22 Jul 2024 14:59:04 -0400
Subject: [PATCH 14/51] add external data to bulk edit ui

---
 seqr/views/apis/project_api.py | 2 +-
 ui/pages/Project/constants.js  | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/seqr/views/apis/project_api.py b/seqr/views/apis/project_api.py
index e232f8fb58..3da4765471 100644
--- a/seqr/views/apis/project_api.py
+++ b/seqr/views/apis/project_api.py
@@ -189,7 +189,7 @@ def project_families(request, project_guid):
     families = family_models.values(
         'id', 'description',
         **{_to_camel_case(field): F(field) for field in [
-            'family_id', 'analysis_status', 'created_date', 'coded_phenotype', 'mondo_id',
+            'family_id', 'analysis_status', 'created_date', 'coded_phenotype', 'mondo_id', 'external_data',
         ]},
         familyGuid=F('guid'),
         projectGuid=Value(project_guid),

diff --git a/ui/pages/Project/constants.js b/ui/pages/Project/constants.js
index f63465019d..fd3f07640e 100644
--- a/ui/pages/Project/constants.js
+++ b/ui/pages/Project/constants.js
@@ -20,6 +20,7 @@ import {
   FAMILY_FIELD_MONDO_ID,
   FAMILY_FIELD_SAVED_VARIANTS,
   FAMILY_FIELD_NAME_LOOKUP,
+  FAMILY_FIELD_EXTERNAL_DATA,
   INDIVIDUAL_FIELD_ID,
   INDIVIDUAL_FIELD_PATERNAL_ID,
   INDIVIDUAL_FIELD_MATERNAL_ID,
@@ -37,6 +38,7 @@ import {
   FAMILY_NOTES_FIELDS,
   SNP_DATA_TYPE,
   MME_TAG_NAME,
+  FAMILY_EXTERNAL_DATA_LOOKUP,
 } from 'shared/utils/constants'
 
 export const CASE_REVIEW_TABLE_NAME = 'Case Review'
@@ -421,6 +423,10 @@ const FAMILY_FIELD_CONFIGS = Object.entries({
   [FAMILY_FIELD_ANALYSED_BY]: { format: analysedBy => analysedBy.map(o => o.createdBy).join(',') },
   [FAMILY_FIELD_CODED_PHENOTYPE]: { width: 4, description: "High level summary of the family's phenotype/disease" },
   [FAMILY_FIELD_MONDO_ID]: { width: 3, description: 'MONDO Disease Ontology ID' },
+  [FAMILY_FIELD_EXTERNAL_DATA]: {
+    description: 'Data types available external to seqr',
+    format: externalData => externalData.map(dataType => FAMILY_EXTERNAL_DATA_LOOKUP[dataType]?.name || dataType).join('; '),
+  },
   ...FAMILY_NOTES_FIELDS.reduce((acc, { id }) => ({ ...acc, [id]: { format: formatNotes } }), {}),
 }).reduce((acc, [field, config]) => ({ ...acc, [field]: { label: FAMILY_FIELD_NAME_LOOKUP[field], ...config } }), {})
 
@@ -447,6 +453,7 @@ export const FAMILY_BULK_EDIT_EXPORT_DATA = [
   FAMILY_FIELD_DESCRIPTION,
   FAMILY_FIELD_CODED_PHENOTYPE,
   FAMILY_FIELD_MONDO_ID,
+  FAMILY_FIELD_EXTERNAL_DATA,
 ].map(exportConfigForField(FAMILY_FIELD_CONFIGS))
 
 export const INDIVIDUAL_FIELDS = [

From dbc4a32551b8a71aced855ad1937becae460989c Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Mon, 22 Jul 2024 15:27:42 -0400
Subject: [PATCH 15/51] support external data in bulk upload

---
 seqr/views/apis/family_api.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/seqr/views/apis/family_api.py b/seqr/views/apis/family_api.py
index dcae71adeb..3e15337cb2 100644
--- a/seqr/views/apis/family_api.py
+++ b/seqr/views/apis/family_api.py
@@ -395,6 +395,12 @@ def update_family_analysis_groups(request, family_guid):
     })
 
 
+EXTERNAL_DATA_LOOKUP = {v: k for k, v in Family.EXTERNAL_DATA_CHOICES}
+PARSE_FAMILY_TABLE_FIELDS = {
+    'externalData': lambda data_type: [EXTERNAL_DATA_LOOKUP[dt.strip()] for dt in (data_type or '').split(';')],
+}
+
+
 @login_and_policies_required
 def receive_families_table_handler(request, project_guid):
     """Handler for the initial upload of an Excel or .tsv table of families. This handler
@@ -425,10 +431,12 @@ def _process_records(records, filename=''):
                 column_map['mondoId'] = i
             elif 'description' in key:
                 column_map['description'] = i
+            elif 'external' in key and 'data' in key:
+                column_map['externalData'] = i
         if FAMILY_ID_FIELD not in column_map:
             raise ValueError('Invalid header, missing family id column')
 
-        return [{column: row[index] if isinstance(index, int) else next((row[i] for i in index if row[i]), None)
+        return [{column: PARSE_FAMILY_TABLE_FIELDS.get(column, lambda v: v)(row[index])
                  for column, index in column_map.items()} for row in records[1:]]
 
     try:

From 8110d995b3336c0e78b5c2b8cab1477e7d73c91f Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Mon, 22 Jul 2024 16:00:38 -0400
Subject: [PATCH 16/51] update tests

---
 seqr/views/apis/family_api.py        |  2 +-
 seqr/views/apis/family_api_tests.py  | 22 +++++++++++++++++++---
 seqr/views/apis/project_api_tests.py |  4 +++-
 3 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/seqr/views/apis/family_api.py b/seqr/views/apis/family_api.py
index 3e15337cb2..9f2820dffc 100644
--- a/seqr/views/apis/family_api.py
+++ b/seqr/views/apis/family_api.py
@@ -397,7 +397,7 @@ def update_family_analysis_groups(request, family_guid):
 
 EXTERNAL_DATA_LOOKUP = {v: k for k, v in Family.EXTERNAL_DATA_CHOICES}
 PARSE_FAMILY_TABLE_FIELDS = {
-    'externalData': lambda data_type: [EXTERNAL_DATA_LOOKUP[dt.strip()] for dt in (data_type or '').split(';')],
+    'externalData': lambda data_type: [EXTERNAL_DATA_LOOKUP[dt.strip()] for dt in (data_type or '').split(';') if dt],
 }

diff --git a/seqr/views/apis/family_api_tests.py b/seqr/views/apis/family_api_tests.py
index bd43d8d697..c4175eb150 100644
--- a/seqr/views/apis/family_api_tests.py
+++ b/seqr/views/apis/family_api_tests.py
@@ -469,9 +469,9 @@ def test_receive_families_table_handler(self, mock_pm_group):
         self.assertSetEqual(set(response_json.keys()), {'info', 'errors', 'warnings', 'uploadedFileId'})
 
-        url = reverse(edit_families_handler, args=[PROJECT_GUID])
+        edit_url = reverse(edit_families_handler, args=[PROJECT_GUID])
 
-        response = self.client.post(url, content_type='application/json',
+        response = self.client.post(edit_url, content_type='application/json',
                                     data=json.dumps({'uploadedFileId': response_json['uploadedFileId']}))
         self.assertEqual(response.status_code, 200)
         response_json = response.json()
@@ -487,8 +487,18 @@ def test_receive_families_table_handler(self, mock_pm_group):
         self.assertEqual(family_2['description'], 'family two description')
         self.assertEqual(family_2['familyId'], '2')
 
+        internal_field_data = b'Family ID External Data\n\
+"11" ""\n\
+"12" "ONT lrGS; BioNano"'
+        response = self.client.post(url, {'f': SimpleUploadedFile('families.tsv', internal_field_data)})
+        self.assertEqual(response.status_code, 200)
+        response = self.client.post(
+            edit_url, content_type='application/json', data=json.dumps({'uploadedFileId': response.json()['uploadedFileId']}))
+        self.assertEqual(response.status_code, 403)
+
         # Test PM permission
         url = reverse(receive_families_table_handler, args=[PM_REQUIRED_PROJECT_GUID])
+        edit_url = reverse(edit_families_handler, args=[PM_REQUIRED_PROJECT_GUID])
         response = self.client.post(url)
         self.assertEqual(response.status_code, 403)
@@ -498,8 +508,14 @@ def test_receive_families_table_handler(self, mock_pm_group):
         mock_pm_group.__bool__.return_value = True
         mock_pm_group.resolve_expression.return_value = 'project-managers'
 
-        response = self.client.post(url, {'f': SimpleUploadedFile('families.tsv', 'Family ID\n1'.encode('utf-8'))})
+        response = self.client.post(url, {'f': SimpleUploadedFile('families.tsv', internal_field_data)})
+        self.assertEqual(response.status_code, 200)
+        response = self.client.post(
+            edit_url, content_type='application/json', data=json.dumps({'uploadedFileId': response.json()['uploadedFileId']}))
         self.assertEqual(response.status_code, 200)
+        response_json = response.json()
+        self.assertListEqual(response_json['familiesByGuid']['F000011_11']['externalData'], [])
+        self.assertListEqual(response_json['familiesByGuid']['F000012_12']['externalData'], ['L', 'B'])
 
     def test_create_update_and_delete_family_note(self):
         # create the note

diff --git a/seqr/views/apis/project_api_tests.py b/seqr/views/apis/project_api_tests.py
index 7c5c821542..a86b472dae 100644
--- a/seqr/views/apis/project_api_tests.py
+++ b/seqr/views/apis/project_api_tests.py
@@ -382,7 +382,7 @@ def test_project_families(self):
         family_3 = response_json['familiesByGuid']['F000003_3']
         family_fields = {
             'individualGuids', 'discoveryTags', 'caseReviewStatuses', 'caseReviewStatusLastModified', 'hasRequiredMetadata',
-            'parents', 'hasPhenotypePrioritization',
+            'parents', 'hasPhenotypePrioritization', 'externalData',
         }
         family_fields.update(SUMMARY_FAMILY_FIELDS)
         self.assertSetEqual(set(family_1.keys()), family_fields)
@@ -399,6 +399,8 @@ def test_project_families(self):
         self.assertListEqual(family_3['parents'], [])
         self.assertEqual(family_1['hasPhenotypePrioritization'], True)
         self.assertFalse(family_3['hasPhenotypePrioritization'], False)
+        self.assertListEqual(family_1['externalData'], ['M'])
+        self.assertListEqual(family_3['externalData'], [])
 
         self.assertListEqual(family_3['discoveryTags'], [])
         self.assertSetEqual({tag['variantGuid'] for tag in family_1['discoveryTags']}, {'SV0000001_2103343353_r0390_100'})

From a39d4023f1125fef63d182461e3359b9584748f9 Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Tue, 23 Jul 2024 13:51:03 -0400
Subject: [PATCH 17/51] validate families have affected individuals

---
 seqr/views/apis/anvil_workspace_api.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/seqr/views/apis/anvil_workspace_api.py b/seqr/views/apis/anvil_workspace_api.py
index 9a0f006c14..4682064bc8 100644
--- a/seqr/views/apis/anvil_workspace_api.py
+++ b/seqr/views/apis/anvil_workspace_api.py
@@ -13,7 +13,7 @@
 from django.shortcuts import redirect
 
 from reference_data.models import GENOME_VERSION_LOOKUP
-from seqr.models import Project, CAN_EDIT, Sample
+from seqr.models import Project, CAN_EDIT, Sample, Individual
 from seqr.views.react_app import render_app_html
 from seqr.views.utils.airtable_utils import AirtableSession, ANVIL_REQUEST_TRACKING_TABLE
 from seqr.utils.search.constants import VCF_FILE_EXTENSIONS
@@ -250,10 +250,26 @@ def _parse_uploaded_pedigree(request_json, project=None):
     missing_samples = [record['individualId'] for record in pedigree_records
                        if record['individualId'] not in request_json['vcfSamples']]
 
+    errors = []
     if missing_samples:
-        error = 'The following samples are included in the pedigree file but are missing from the VCF: {}'.format(
-            ', '.join(missing_samples))
-        raise ErrorsWarningsException([error], [])
+        errors.append('The following samples are included in the pedigree file but are missing from the VCF: {}'.format(
+            ', '.join(missing_samples)))
+
+    records_by_family = defaultdict(list)
+    for record in pedigree_records:
+        records_by_family[record[JsonConstants.FAMILY_ID_COLUMN]].append(record)
+
+    no_affected_families = []
+    for family_id, records in records_by_family.items():
+        affected = [record for record in records if record[JsonConstants.AFFECTED_COLUMN] == Individual.AFFECTED_STATUS_AFFECTED]
+        if not affected:
+            no_affected_families.append(family_id)
+
+    if no_affected_families:
+        errors.append('The following families do not have any affected individuals: {}'.format(', '.join(no_affected_families)))
+
+    if errors:
+        raise ErrorsWarningsException(errors, [])
 
     return pedigree_records

From 2e77253acf3214325cc1b25a59b9bda39f681deb Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Tue, 23 Jul 2024 14:20:36 -0400
Subject: [PATCH 18/51] update ui to require HPO terms

---
 .../components/panel/LoadWorkspaceDataForm.jsx | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/ui/shared/components/panel/LoadWorkspaceDataForm.jsx b/ui/shared/components/panel/LoadWorkspaceDataForm.jsx
index b134d57132..0ee64e81c2 100644
--- a/ui/shared/components/panel/LoadWorkspaceDataForm.jsx
+++ b/ui/shared/components/panel/LoadWorkspaceDataForm.jsx
@@ -12,6 +12,8 @@ import {
   FILE_FORMATS,
   INDIVIDUAL_CORE_EXPORT_DATA,
   INDIVIDUAL_ID_EXPORT_DATA,
+  INDIVIDUAL_HPO_EXPORT_DATA,
+  INDIVIDUAL_FIELD_FEATURES,
   INDIVIDUAL_FIELD_SEX,
   INDIVIDUAL_FIELD_AFFECTED,
   SAMPLE_TYPE_OPTIONS,
@@ -43,16 +45,19 @@ export const WORKSPACE_REQUIREMENTS = [
 ]
 
 const NON_ID_REQUIRED_FIELDS = [INDIVIDUAL_FIELD_SEX, INDIVIDUAL_FIELD_AFFECTED]
+const HPO_FIELD = { ...INDIVIDUAL_HPO_EXPORT_DATA[0], header: 'HPO Terms' }
 
 const FIELD_DESCRIPTIONS = {
   [FAMILY_FIELD_ID]: 'Family ID',
   [INDIVIDUAL_FIELD_ID]: 'Individual ID (needs to match the VCF ids)',
   [INDIVIDUAL_FIELD_SEX]: 'Male, Female, or Unknown',
   [INDIVIDUAL_FIELD_AFFECTED]: 'Affected, Unaffected, or Unknown',
+  [INDIVIDUAL_FIELD_FEATURES]: 'Semi-colon separated list of HPO terms. Required for affected individuals only.',
 }
 const REQUIRED_FIELDS = [
   ...INDIVIDUAL_ID_EXPORT_DATA,
   ...INDIVIDUAL_CORE_EXPORT_DATA.filter(({ field }) => NON_ID_REQUIRED_FIELDS.includes(field)),
+  HPO_FIELD,
 ].map(config => ({ ...config, description: FIELD_DESCRIPTIONS[config.field] }))
 
 const OPTIONAL_FIELDS = INDIVIDUAL_CORE_EXPORT_DATA.filter(({ field }) => !NON_ID_REQUIRED_FIELDS.includes(field))
 
 const BLANK_EXPORT = {
   filename: 'individuals_template',
   rawData: [],
-  headers: [...INDIVIDUAL_ID_EXPORT_DATA, ...INDIVIDUAL_CORE_EXPORT_DATA].map(config => config.header),
+  headers: [...INDIVIDUAL_ID_EXPORT_DATA, ...INDIVIDUAL_CORE_EXPORT_DATA, HPO_FIELD].map(config => config.header),
   processRow: val => val,
 }
@@ -68,11 +73,11 @@ const DEMO_EXPORT = {
   ...BLANK_EXPORT,
   filename: 'demo_individuals',
   rawData: [
-    ['FAM1', 'FAM1_1', 'FAM1_2', 'FAM1_3', 'Male', 'Affected', ''],
-    ['FAM1', 'FAM1_4', 'FAM1_2', 'FAM1_3', '', 'Affected', 'an affected sibling'],
-    ['FAM1', 'FAM1_2', '', '', 'Male', 'Unaffected', ''],
-    ['FAM1', 'FAM1_3', '', '', 'Female', '', 'affected status of mother unknown'],
-    ['FAM2', 'FAM2_1', '', '', 'Female', 'Affected', 'a proband-only family'],
+    ['FAM1', 'FAM1_1', 'FAM1_2', 'FAM1_3', 'Male', 'Affected', '', 'HP:0001324 (Muscle weakness)'],
+    ['FAM1', 'FAM1_4', 'FAM1_2', 'FAM1_3', '', 'Affected', 'an affected sibling', 'HP:0001250 (Seizure); HP:0001324 (Muscle weakness)'],
+    ['FAM1', 'FAM1_2', '', '', 'Male', 'Unaffected', '', ''],
+    ['FAM1', 'FAM1_3', '', '', 'Female', 'Unknown', 'affected status of mother unknown', ''],
+    ['FAM2', 'FAM2_1', '', '', 'Female', 'Affected', 'a proband-only family', 'HP:0001263 (Global developmental delay)'],
   ],
 }

From dc57e074428ebf55c7ebcb7e0994606c85c3be1c Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Tue, 23 Jul 2024 16:12:19 -0400
Subject: [PATCH 19/51] update hpo parsing for anvil upload

---
 seqr/views/apis/anvil_workspace_api.py  | 12 ++---
 seqr/views/apis/individual_api.py       | 50 ++++++++-----------
 seqr/views/utils/pedigree_info_utils.py | 60 ++++++++++++++++++-----
 3 files changed, 73 insertions(+), 49 deletions(-)

diff --git a/seqr/views/apis/anvil_workspace_api.py b/seqr/views/apis/anvil_workspace_api.py
index 4682064bc8..5cf6af9036 100644
--- a/seqr/views/apis/anvil_workspace_api.py
+++ b/seqr/views/apis/anvil_workspace_api.py
@@ -243,7 +243,7 @@ def _parse_uploaded_pedigree(request_json, project=None):
     # Parse families/individuals in the uploaded pedigree file
     json_records = load_uploaded_file(request_json['uploadedFileId'])
     pedigree_records, _ = parse_basic_pedigree_table(
-        project, json_records, 'uploaded pedigree file', required_columns=[
+        project, json_records, 'uploaded pedigree file', update_features=True, required_columns=[
             JsonConstants.SEX_COLUMN, JsonConstants.AFFECTED_COLUMN,
         ])
 
@@ -259,11 +259,10 @@ def _parse_uploaded_pedigree(request_json, project=None):
     for record in pedigree_records:
         records_by_family[record[JsonConstants.FAMILY_ID_COLUMN]].append(record)
 
-    no_affected_families = []
-    for family_id, records in records_by_family.items():
-        affected = [record for record in records if record[JsonConstants.AFFECTED_COLUMN] == Individual.AFFECTED_STATUS_AFFECTED]
-        if not affected:
-            no_affected_families.append(family_id)
+    no_affected_families = [
+        family_id for family_id, records in records_by_family.items()
+        if not any(record[JsonConstants.AFFECTED_COLUMN] == Individual.AFFECTED_STATUS_AFFECTED for record in records)
+    ]
 
     if no_affected_families:
         errors.append('The following families do not have any affected individuals: {}'.format(', '.join(no_affected_families)))
@@ -278,6 +277,7 @@ def _trigger_add_workspace_data(project, pedigree_records, user, data_path, samp
     # add families and individuals according to the uploaded individual records
     pedigree_json, sample_ids = add_or_update_individuals_and_families(
         project, individual_records=pedigree_records, user=user, get_update_json=get_pedigree_json, get_updated_individual_ids=True,
+        allow_features_update=True,
     )
     num_updated_individuals = len(sample_ids)
     sample_ids.update(previous_loaded_ids or [])

diff --git a/seqr/views/apis/individual_api.py b/seqr/views/apis/individual_api.py
index 7b6d6f0c59..cef3f8d1f9 100644
--- a/seqr/views/apis/individual_api.py
+++ b/seqr/views/apis/individual_api.py
@@ -20,7 +20,8 @@
 from seqr.views.utils.orm_to_json_utils import _get_json_for_model, _get_json_for_individuals, add_individual_hpo_details, \
     _get_json_for_families, get_json_for_rna_seq_outliers, get_project_collaborators_by_username, INDIVIDUAL_DISPLAY_NAME_EXPR, \
     GREGOR_FINDING_TAG_TYPE
-from seqr.views.utils.pedigree_info_utils import parse_pedigree_table, validate_fam_file_records, JsonConstants, ErrorsWarningsException
+from seqr.views.utils.pedigree_info_utils import parse_pedigree_table, validate_fam_file_records, parse_hpo_terms, \
+    get_valid_hpo_terms, JsonConstants, ErrorsWarningsException
 from seqr.views.utils.permissions_utils import get_project_and_check_permissions, check_project_permissions, \
     get_project_and_check_pm_permissions, login_and_policies_required, has_project_permissions, project_has_anvil, \
     is_internal_anvil_project, pm_or_data_manager_required, check_workspace_perm
@@ -387,7 +388,7 @@ def _set_parent_relationships(record, parents_by_guid, guid_key, parent_key, par
 INDIVIDUAL_GUID_COL = 'individual_guid'
 HPO_TERM_NUMBER_COL = 'hpo_number'
 AFFECTED_FEATURE_COL = 'affected'
-FEATURES_COL = 'features'
+FEATURES_COL = JsonConstants.FEATURES
 ABSENT_FEATURES_COL = 'absent_features'
 BIRTH_COL = 'birth_year'
 DEATH_COL = 'death_year'
@@ -440,8 +441,8 @@ def _gene_list_value(val):
 
 INDIVIDUAL_METADATA_FIELDS = {
-    FEATURES_COL: lambda val: [{'id': feature} for feature in set(val)],
-    ABSENT_FEATURES_COL: lambda val: [{'id': feature} for feature in val],
+    FEATURES_COL: list,
+    ABSENT_FEATURES_COL: list,
     BIRTH_COL: int,
     DEATH_COL: int,
     ONSET_AGE_COL: lambda val: Individual.ONSET_AGE_REVERSE_LOOKUP[val],
@@ -471,7 +472,7 @@ def _nested_val(nested_key):
 
 def _get_phenotips_features(observed):
     def get_observed_features(features):
-        return [feature['id'] for feature in features if feature['observed'] == observed]
+        return [{'id': feature['id']} for feature in features if feature['observed'] == observed]
     return get_observed_features
 
 PHENOTIPS_JSON_FIELD_MAP = {
@@ -592,8 +593,8 @@ def _process_hpo_records(records, filename, project, user):
 
     if FEATURES_COL in column_map or ABSENT_FEATURES_COL in column_map:
         for row in row_dicts:
-            row[FEATURES_COL] = _parse_hpo_terms(row.get(FEATURES_COL))
-            row[ABSENT_FEATURES_COL] = _parse_hpo_terms(row.get(ABSENT_FEATURES_COL))
+            row[FEATURES_COL] = parse_hpo_terms(row.get(FEATURES_COL))
+            row[ABSENT_FEATURES_COL] = parse_hpo_terms(row.get(ABSENT_FEATURES_COL))
 
     elif HPO_TERM_NUMBER_COL in column_map:
         aggregate_rows = defaultdict(lambda: {FEATURES_COL: set(), ABSENT_FEATURES_COL: set()})
@@ -608,30 +609,20 @@ def _process_hpo_records(records, filename, project, user):
                 aggregate_entry.update({k: v for k, v in row.items() if v})
 
         row_dicts = [
-            {**entry, FEATURES_COL: list(entry[FEATURES_COL]), ABSENT_FEATURES_COL: list(entry[ABSENT_FEATURES_COL])}
+            {**entry, **{col: [{'id': feature} for feature in entry[col]] for col in [FEATURES_COL, ABSENT_FEATURES_COL]}}
             for entry in aggregate_rows.values()
         ]
 
     return _parse_individual_hpo_terms(row_dicts, project, user)
 
 
-def _parse_hpo_terms(hpo_term_string):
-    if not hpo_term_string:
-        return []
-    return [hpo_term.strip() for hpo_term in re.sub(r'\(.*?\)', '', hpo_term_string).replace(',', ';').split(';')]
-
-
 def _has_same_features(individual, present_features, absent_features):
-    return {feature['id'] for feature in individual.features or []} == set(present_features or []) and \
-        {feature['id'] for feature in individual.absent_features or []} == set(absent_features or [])
+    return {feature['id'] for feature in individual.features or []} == {feature['id'] for feature in present_features or []} and \
+        {feature['id'] for feature in individual.absent_features or []} == {feature['id'] for feature in absent_features or []}
 
 
 def _get_valid_hpo_terms(json_records):
-    all_hpo_terms = set()
-    for record in json_records:
-        all_hpo_terms.update(record.get(FEATURES_COL, []))
-        all_hpo_terms.update(record.get(ABSENT_FEATURES_COL, []))
-    return set(HumanPhenotypeOntology.objects.filter(hpo_id__in=all_hpo_terms).values_list('hpo_id', flat=True))
+    return get_valid_hpo_terms(json_records, additional_feature_columns=[ABSENT_FEATURES_COL])
 
 
 def _parse_individual_hpo_terms(json_records, project, user):
@@ -704,14 +695,11 @@ def _get_record_individual(record, individual_lookup):
 
 def _remove_invalid_hpo_terms(record, hpo_terms):
     invalid_terms = set()
-    for feature in record.get(FEATURES_COL, []):
-        if feature not in hpo_terms:
-            invalid_terms.add(feature)
-            record[FEATURES_COL].remove(feature)
-    for feature in record.get(ABSENT_FEATURES_COL, []):
-        if feature not in hpo_terms:
-            invalid_terms.add(feature)
-            record[ABSENT_FEATURES_COL].remove(feature)
+    for col in [FEATURES_COL, ABSENT_FEATURES_COL]:
+        for feature in record.get(col, []):
+            if feature['id'] not in hpo_terms:
+                invalid_terms.add(feature['id'])
+                record[col].remove(feature)
     return invalid_terms
@@ -858,12 +846,12 @@ def import_gregor_metadata(request, project_guid):
             lambda r: r['participant_id'] in individuals_by_participant and r['ontology'] == 'HPO' and r['presence'] in {'Present', 'Absent'},
     ):
         col = FEATURES_COL if row['presence'] == 'Present' else ABSENT_FEATURES_COL
-        individuals_by_participant[row['participant_id']][col].append(row['term_id'])
+        individuals_by_participant[row['participant_id']][col].append({'id': row['term_id']})
 
     hpo_terms = _get_valid_hpo_terms(individuals)
     invalid_hpo_terms = set()
     for row in individuals:
         invalid_hpo_terms.update(_remove_invalid_hpo_terms(row, hpo_terms))
-        row.update({k: INDIVIDUAL_METADATA_FIELDS[k](v) for k, v in row.items() if k in [FEATURES_COL, ABSENT_FEATURES_COL]})
+        row.update({k: row[k] for k in [FEATURES_COL, ABSENT_FEATURES_COL] if k in row})
     if invalid_hpo_terms:
         warnings.append(f"Skipped the following unrecognized HPO terms: {', '.join(sorted(invalid_hpo_terms))}")

diff --git a/seqr/views/utils/pedigree_info_utils.py b/seqr/views/utils/pedigree_info_utils.py
index df2b0f026e..7472dbfd5f 100644
--- a/seqr/views/utils/pedigree_info_utils.py
+++ b/seqr/views/utils/pedigree_info_utils.py
@@ -2,11 +2,13 @@
 import difflib
 import os
 import json
+import re
 import tempfile
 import openpyxl as xl
 from collections import defaultdict
 from datetime import date
 from
reference_data.models import HumanPhenotypeOntology from seqr.utils.communication_utils import send_html_email from seqr.utils.logging_utils import SeqrLogger from seqr.utils.middleware import ErrorsWarningsException @@ -77,9 +79,12 @@ def parse_pedigree_table(parsed_file, filename, user, project): return json_records, warnings -def parse_basic_pedigree_table(project, parsed_file, filename, required_columns=None): +def parse_basic_pedigree_table(project, parsed_file, filename, required_columns=None, update_features=False): rows, header = _parse_pedigree_table_rows(parsed_file, filename) - return _parse_pedigree_table_json(project, rows, header=header, fail_on_warnings=True, required_columns=required_columns, allow_id_update=False) + return _parse_pedigree_table_json( + project, rows, header=header, fail_on_warnings=True, allow_id_update=False, + required_columns=required_columns, update_features=update_features, + ) def _parse_pedigree_table_rows(parsed_file, filename, header=None, rows=None): @@ -110,15 +115,15 @@ def _parse_pedigree_table_rows(parsed_file, filename, header=None, rows=None): raise ErrorsWarningsException(['Error while parsing file: {}. {}'.format(filename, e)], []) -def _parse_pedigree_table_json(project, rows, header=None, column_map=None, errors=None, fail_on_warnings=False, required_columns=None, allow_id_update=True): +def _parse_pedigree_table_json(project, rows, header=None, column_map=None, errors=None, fail_on_warnings=False, required_columns=None, allow_id_update=True, update_features=False): # convert to json and validate - column_map = column_map or (_parse_header_columns(header, allow_id_update) if header else None) + column_map = column_map or (_parse_header_columns(header, allow_id_update, update_features) if header else None) if column_map: - json_records = _convert_fam_file_rows_to_json(column_map, rows, required_columns=required_columns) + json_records = _convert_fam_file_rows_to_json(column_map, rows, required_columns=required_columns, update_features=update_features) else: json_records = rows - warnings = validate_fam_file_records(project, json_records, fail_on_warnings=fail_on_warnings, errors=errors) + warnings = validate_fam_file_records(project, json_records, fail_on_warnings=fail_on_warnings, errors=errors, update_features=update_features) return json_records, warnings @@ -142,7 +147,14 @@ def _parse_affected(affected): return None -def _convert_fam_file_rows_to_json(column_map, rows, required_columns=None): +def parse_hpo_terms(hpo_term_string): + if not hpo_term_string: + return [] + terms = {hpo_term.strip() for hpo_term in re.sub(r'\(.*?\)', '', hpo_term_string).replace(',', ';').split(';')} + return[{'id': term for term in terms if term}] + + +def _convert_fam_file_rows_to_json(column_map, rows, required_columns=None, update_features=False): """Parse the values in rows and convert them to a json representation. 
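
    A minimal illustration (hypothetical header keys and values; the real json keys
    come from JsonConstants and the real header from the uploaded file):

        column_map = {'family id': 'familyId', 'individual id': 'individualId', 'hpo terms': 'features'}
        rows = [{'family id': 'F1', 'individual id': 'F1_1', 'hpo terms': 'HP:0001250 (Seizure)'}]
        # expected output, with parse_hpo_terms applied to the features column:
        # [{'familyId': 'F1', 'individualId': 'F1_1', 'features': [{'id': 'HP:0001250'}]}]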
Args: @@ -170,10 +182,11 @@ def _convert_fam_file_rows_to_json(column_map, rows, required_columns=None): ValueError: if there are unexpected values or row sizes """ required_columns = [JsonConstants.FAMILY_ID_COLUMN, JsonConstants.INDIVIDUAL_ID_COLUMN] + (required_columns or []) - missing_cols = set(required_columns) - set(column_map.values()) + missing_cols = [_to_title_case(_to_snake_case(col)) for col in set(required_columns) - set(column_map.values())] + if update_features and JsonConstants.FEATURES not in column_map.values(): + missing_cols.append('HPO Terms') if missing_cols: - raise ErrorsWarningsException( - [f"Missing required columns: {', '.join([_to_title_case(_to_snake_case(col)) for col in sorted(missing_cols)])}"]) + raise ErrorsWarningsException([f"Missing required columns: {', '.join(sorted(missing_cols))}"]) json_results = [] errors = [] @@ -200,7 +213,7 @@ def _convert_fam_file_rows_to_json(column_map, rows, required_columns=None): return json_results -def _parse_header_columns(header, allow_id_update): +def _parse_header_columns(header, allow_id_update, update_features): column_map = {} for key in header: column = None @@ -215,6 +228,8 @@ def _parse_header_columns(header, allow_id_update): elif 'indiv' in key and 'previous' in key: if allow_id_update: column = JsonConstants.PREVIOUS_INDIVIDUAL_ID_COLUMN + elif update_features and 'hpo' in key and 'term' in key: + column = JsonConstants.FEATURES else: column = next(( col for col, substrings in JsonConstants.COLUMN_SUBSTRINGS @@ -238,7 +253,7 @@ def _format_value(value, column): return value -def validate_fam_file_records(project, records, fail_on_warnings=False, errors=None, clear_invalid_values=False): +def validate_fam_file_records(project, records, fail_on_warnings=False, errors=None, clear_invalid_values=False, update_features=False): """Basic validation such as checking that parents have the same family id as the child, etc. 
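
    For example (illustrative records, assuming update_features=True and the
    affected/feature checks added in this patch):

        {'individualId': 'A1', 'affected': 'A', 'features': []}
        # -> error: 'A1 is affected but has no HPO terms'
        {'individualId': 'A2', 'affected': 'A', 'features': [{'id': 'HP:9999999'}]}
        # -> error: 'A2 has invalid HPO terms: HP:9999999' (id not in HumanPhenotypeOntology)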
Args: @@ -259,6 +274,8 @@ def validate_fam_file_records(project, records, fail_on_warnings=False, errors=N loaded_individual_families = dict(Individual.objects.filter( family__project=project, sample__is_active=True).values_list('individual_id', 'family__family_id')) + hpo_terms = get_valid_hpo_terms(records) if update_features else None + errors = errors or [] warnings = [] individual_id_counts = defaultdict(int) @@ -298,6 +315,14 @@ def validate_fam_file_records(project, records, fail_on_warnings=False, errors=N ]: _validate_parent(r, *parent, individual_id, family_id, records_by_id, warnings, errors, clear_invalid_values) + if update_features: + features = r[JsonConstants.FEATURES] or [] + if not features and r[JsonConstants.AFFECTED_COLUMN] == Individual.AFFECTED_STATUS_AFFECTED: + errors.append(f'{individual_id} is affected but has no HPO terms') + invalid_features = {feature['id'] for feature in features if feature['id'] not in hpo_terms} + if invalid_features: + errors.append(f'{individual_id} has invalid HPO terms: {", ".join(sorted(invalid_features))}') + errors += [ f'{individual_id} is included as {count} separate records, but must be unique within the project' for individual_id, count in individual_id_counts.items() if count > 1 @@ -311,6 +336,15 @@ def validate_fam_file_records(project, records, fail_on_warnings=False, errors=N return warnings +def get_valid_hpo_terms(records, additional_feature_columns=None): + all_hpo_terms = set() + for record in records: + all_hpo_terms.update({feature['id'] for feature in record.get(JsonConstants.FEATURES, [])}) + for col in (additional_feature_columns or []): + all_hpo_terms.update({feature['id'] for feature in record.get(col, [])}) + return set(HumanPhenotypeOntology.objects.filter(hpo_id__in=all_hpo_terms).values_list('hpo_id', flat=True)) + + def _validate_parent(row, parent_id_type, parent_id_field, expected_sex, individual_id, family_id, records_by_id, warnings, errors, clear_invalid_values): parent_id = row.get(parent_id_field) if not parent_id: @@ -808,6 +842,7 @@ class JsonConstants: PRIMARY_BIOSAMPLE = 'primaryBiosample' ANALYTE_TYPE = 'analyteType' TISSUE_AFFECTED_STATUS = 'tissueAffectedStatus' + FEATURES = 'features' JSON_COLUMNS = {MATERNAL_ETHNICITY, PATERNAL_ETHNICITY, BIRTH_YEAR, DEATH_YEAR, ONSET_AGE, AFFECTED_RELATIVES} NULLABLE_COLUMNS = {TISSUE_AFFECTED_STATUS} @@ -823,6 +858,7 @@ class JsonConstants: (code for code, uberon_code in Individual.BIOSAMPLE_CHOICES if value.startswith(uberon_code)), None), ANALYTE_TYPE: Individual.ANALYTE_REVERSE_LOOKUP.get, TISSUE_AFFECTED_STATUS: lambda value: {'Yes': True, 'No': False, 'Unknown': None}[value], + FEATURES: lambda value: parse_hpo_terms(value), } FORMAT_COLUMNS.update({col: json.loads for col in JSON_COLUMNS}) From a7ddf14165cc7bcfd32177aaebf2e9f5fdcb9b34 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 23 Jul 2024 16:17:00 -0400 Subject: [PATCH 20/51] fix typo --- seqr/views/utils/pedigree_info_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seqr/views/utils/pedigree_info_utils.py b/seqr/views/utils/pedigree_info_utils.py index 7472dbfd5f..9d7d131bc1 100644 --- a/seqr/views/utils/pedigree_info_utils.py +++ b/seqr/views/utils/pedigree_info_utils.py @@ -151,7 +151,7 @@ def parse_hpo_terms(hpo_term_string): if not hpo_term_string: return [] terms = {hpo_term.strip() for hpo_term in re.sub(r'\(.*?\)', '', hpo_term_string).replace(',', ';').split(';')} - return[{'id': term for term in terms if term}] + return[{'id': term} for term in terms if 
term]


def _convert_fam_file_rows_to_json(column_map, rows, required_columns=None, update_features=False):

From ff65b4a914ff2efb021c2244015d5837935f4246 Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Tue, 23 Jul 2024 16:52:14 -0400
Subject: [PATCH 21/51] update tests

---
 seqr/views/apis/anvil_workspace_api_tests.py | 33 ++++++++++++--------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/seqr/views/apis/anvil_workspace_api_tests.py b/seqr/views/apis/anvil_workspace_api_tests.py
index e1b081adeb..d8bdf7e9e7 100644
--- a/seqr/views/apis/anvil_workspace_api_tests.py
+++ b/seqr/views/apis/anvil_workspace_api_tests.py
@@ -15,17 +15,19 @@
 LOAD_SAMPLE_DATA = [
     ["Family ID", "Individual ID", "Previous Individual ID", "Paternal ID", "Maternal ID", "Sex", "Affected Status",
-     "Notes", "familyNotes"],
-    ["1", " NA19675_1 ", "NA19675_1 ", "NA19678 ", "", "Female", "Affected", "A affected individual, test1-zsf", ""],
-    ["1", "NA19678", "", "", "", "Male", "Unaffected", "a individual note", ""],
-    ["21", " HG00735", "", "", "", "Unknown", "Unknown", "", "a new family"]]
+     "HPO Terms", "Notes", "familyNotes"],
+    ["1", " NA19675_1 ", "NA19675_1 ", "NA19678 ", "", "Female", "Affected", "HP:0012469 (Infantile spasms); HP:0011675 (Arrhythmia)", "A affected individual, test1-zsf", ""],
+    ["1", "NA19678", "", "", "", "Male", "Unaffected", "", "a individual note", ""],
+    ["21", " HG00735", "", "", "", "Unknown", "Affected", "HP:0001508,HP:0001508", "", "a new family"]]

-BAD_SAMPLE_DATA = [["1", "NA19674", "NA19674_1", "NA19678", "NA19679", "Female", "Affected", "A affected individual, test1-zsf", ""]]
-INVALID_ADDED_SAMPLE_DATA = [['22', 'HG00731', 'HG00731', '', '', 'Female', 'Affected', '', '']]
+BAD_SAMPLE_DATA = [["1", "NA19674", "NA19674_1", "NA19678", "NA19679", "Female", "Affected", "", "A affected individual, test1-zsf", ""],
+                   ["1", "NA19681", "", "", "", "Male", "Affected", "HP:0100258", "", ""]]
+INVALID_ADDED_SAMPLE_DATA = [['22', 'HG00731', 'HG00731', '', '', 'Female', 'Affected', 'HP:0011675', '', '']]

-MISSING_REQUIRED_SAMPLE_DATA = [["21", "HG00736", "", "", "", "", "", "", ""]]
+MISSING_REQUIRED_SAMPLE_DATA = [["21", "HG00736", "", "", "", "", "", "", "", ""]]

-LOAD_SAMPLE_DATA_EXTRA_SAMPLE = LOAD_SAMPLE_DATA + [["1", "NA19679", "", "", "", "Male", "Affected", "", ""]]
+LOAD_SAMPLE_DATA_EXTRA_SAMPLE = LOAD_SAMPLE_DATA + [["1", "NA19679", "", "", "", "Male", "Affected", "HP:0011675", "", ""],
+                                                    ["22", "HG00736", "", "", "", "Unknown", "Unknown", "", "", ""]]

 FILE_DATA = [
     '##fileformat=VCFv4.2\n',

@@ -468,7 +470,7 @@
 class LoadAnvilDataAPITest(AirflowTestCase):
-    fixtures = ['users', 'social_auth', '1kg_project']
+    fixtures = ['users', 'social_auth', 'reference_data', '1kg_project']

     LOADING_PROJECT_GUID = f'P_{TEST_NO_PROJECT_WORKSPACE_NAME}'
     DAG_NAME = 'v03_pipeline-SNV_INDEL'

@@ -676,7 +678,7 @@ def _test_errors(self, url, fields, workspace_name):
         response = self.client.post(url, content_type='application/json', data=json.dumps(REQUEST_BODY))
         self.assertEqual(response.status_code, 400)
         response_json = response.json()
-        self.assertListEqual(response_json['errors'], ['Missing required columns: Affected, Sex'])
+        self.assertListEqual(response_json['errors'], ['Missing required columns: Affected, HPO Terms, Sex'])

         self.mock_load_file.return_value = LOAD_SAMPLE_DATA + MISSING_REQUIRED_SAMPLE_DATA
         response = self.client.post(url, content_type='application/json', data=json.dumps(REQUEST_BODY))
@@ -690,6
+692,8 @@ def _test_errors(self, url, fields, workspace_name): self.assertEqual(response.status_code, 400) response_json = response.json() self.assertListEqual(response_json['errors'], [ + 'NA19674 is affected but has no HPO terms', + 'NA19681 has invalid HPO terms: HP:0100258', 'NA19679 is the mother of NA19674 but is not included. Make sure to create an additional record with NA19679 as the Individual ID', ]) @@ -699,7 +703,8 @@ def _test_errors(self, url, fields, workspace_name): self.assertEqual(response.status_code, 400) response_json = response.json() self.assertEqual(response_json['errors'], - ['The following samples are included in the pedigree file but are missing from the VCF: NA19679']) + ['The following samples are included in the pedigree file but are missing from the VCF: NA19679, HG00736', + 'The following families do not have any affected individuals: 22']) def _assert_valid_operation(self, project, test_add_data=True): genome_version = 'GRCh37' if test_add_data else 'GRCh38' @@ -798,19 +803,21 @@ def _assert_valid_operation(self, project, test_add_data=True): individual_model_data = list(Individual.objects.filter(family__project=project).values( 'family__family_id', 'individual_id', 'mother__individual_id', 'father__individual_id', 'sex', 'affected', 'notes', + 'features', )) self.assertEqual(len(individual_model_data), 15 if test_add_data else 3) self.assertIn({ 'family__family_id': '21', 'individual_id': 'HG00735', 'mother__individual_id': None, - 'father__individual_id': None, 'sex': 'U', 'affected': 'U', 'notes': None, + 'father__individual_id': None, 'sex': 'U', 'affected': 'A', 'notes': None, 'features': [{'id': 'HP:0001508'}], }, individual_model_data) self.assertIn({ 'family__family_id': '1', 'individual_id': 'NA19675_1', 'mother__individual_id': None, 'father__individual_id': 'NA19678', 'sex': 'F', 'affected': 'A', 'notes': 'A affected individual, test1-zsf', + 'features': [{'id': 'HP:0012469'}, {'id': 'HP:0011675'}], }, individual_model_data) self.assertIn({ 'family__family_id': '1', 'individual_id': 'NA19678', 'mother__individual_id': None, - 'father__individual_id': None, 'sex': 'M', 'affected': 'N', 'notes': 'a individual note' + 'father__individual_id': None, 'sex': 'M', 'affected': 'N', 'notes': 'a individual note', 'features': [], }, individual_model_data) def _test_mv_file_and_triggering_dag_exception(self, url, workspace, sample_data, genome_version, request_body, num_samples=None): From 948bf457dcb611233642f27ea21ce0f7bd4a48af Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 23 Jul 2024 16:53:43 -0400 Subject: [PATCH 22/51] fix empty features formatting --- seqr/views/utils/pedigree_info_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seqr/views/utils/pedigree_info_utils.py b/seqr/views/utils/pedigree_info_utils.py index 9d7d131bc1..1d3135d5b0 100644 --- a/seqr/views/utils/pedigree_info_utils.py +++ b/seqr/views/utils/pedigree_info_utils.py @@ -244,7 +244,7 @@ def _parse_header_columns(header, allow_id_update, update_features): def _format_value(value, column): format_func = JsonConstants.FORMAT_COLUMNS.get(column) if format_func: - if (value or column in {JsonConstants.SEX_COLUMN, JsonConstants.AFFECTED_COLUMN}): + if (value or column in {JsonConstants.SEX_COLUMN, JsonConstants.AFFECTED_COLUMN, JsonConstants.FEATURES}): value = format_func(value) if value is None and column not in JsonConstants.NULLABLE_COLUMNS: raise ValueError() From 74e997470b31745cd151fb861cdf3c5283241346 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: 
Wed, 24 Jul 2024 10:10:47 -0400 Subject: [PATCH 23/51] codacy clean up --- seqr/views/utils/pedigree_info_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/seqr/views/utils/pedigree_info_utils.py b/seqr/views/utils/pedigree_info_utils.py index 1d3135d5b0..fffe0426c7 100644 --- a/seqr/views/utils/pedigree_info_utils.py +++ b/seqr/views/utils/pedigree_info_utils.py @@ -83,7 +83,7 @@ def parse_basic_pedigree_table(project, parsed_file, filename, required_columns= rows, header = _parse_pedigree_table_rows(parsed_file, filename) return _parse_pedigree_table_json( project, rows, header=header, fail_on_warnings=True, allow_id_update=False, - required_columns=required_columns, update_features=update_features, + required_columns=required_columns, update_features=update_features, ) @@ -858,7 +858,7 @@ class JsonConstants: (code for code, uberon_code in Individual.BIOSAMPLE_CHOICES if value.startswith(uberon_code)), None), ANALYTE_TYPE: Individual.ANALYTE_REVERSE_LOOKUP.get, TISSUE_AFFECTED_STATUS: lambda value: {'Yes': True, 'No': False, 'Unknown': None}[value], - FEATURES: lambda value: parse_hpo_terms(value), + FEATURES: parse_hpo_terms, } FORMAT_COLUMNS.update({col: json.loads for col in JSON_COLUMNS}) From a7dd1d6e763c78ad0bfab5a36a4f81133c188a92 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 24 Jul 2024 10:34:50 -0400 Subject: [PATCH 24/51] migration --- ...e_index_file_path.py => 0071_igvsample_index_file_path.py} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename seqr/migrations/{0068_igvsample_index_file_path.py => 0071_igvsample_index_file_path.py} (71%) diff --git a/seqr/migrations/0068_igvsample_index_file_path.py b/seqr/migrations/0071_igvsample_index_file_path.py similarity index 71% rename from seqr/migrations/0068_igvsample_index_file_path.py rename to seqr/migrations/0071_igvsample_index_file_path.py index 8840dccc59..5d29558631 100644 --- a/seqr/migrations/0068_igvsample_index_file_path.py +++ b/seqr/migrations/0071_igvsample_index_file_path.py @@ -1,4 +1,4 @@ -# Generated by Django 3.2.23 on 2024-06-03 15:25 +# Generated by Django 4.2.13 on 2024-07-24 14:34 from django.db import migrations, models @@ -6,7 +6,7 @@ class Migration(migrations.Migration): dependencies = [ - ('seqr', '0067_alter_variantfunctionaldata_functional_data_tag'), + ('seqr', '0070_remove_rnasample_dataset_type_and_more'), ] operations = [ From 7294c4c895f3ed4b56f1081350716f573bb99c6e Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 24 Jul 2024 10:35:28 -0400 Subject: [PATCH 25/51] fix changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a0881fecb9..6a9587770b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,13 +1,13 @@ # _seqr_ Changes ## dev +* Adds index_file_path to IGV Sample model (REQUIRES DB MIGRATION) ## 7/8/24 * Add VLM contact for Projects (REQUIRES DB MIGRATION) ## 6/11/24 * Add "Partial Phenotype Contribution" functional tag (REQUIRES DB MIGRATION) -* Adds index_file_path to IGV Sample model (REQUIRES DB MIGRATION) ## 5/24/24 * Adds external_data to Family model (REQUIRES DB MIGRATION) From bf3b4066ffbd440541ea8e9d1afa9520b8058783 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 24 Jul 2024 10:47:22 -0400 Subject: [PATCH 26/51] remove drs code --- seqr/views/apis/igv_api.py | 68 ++++++-------------------------------- 1 file changed, 11 insertions(+), 57 deletions(-) diff --git a/seqr/views/apis/igv_api.py b/seqr/views/apis/igv_api.py index dff66d2e4d..7bf9320112 100644 --- 
a/seqr/views/apis/igv_api.py +++ b/seqr/views/apis/igv_api.py @@ -158,16 +158,12 @@ def update_individual_igv_sample(request, individual_guid): if not file_path: raise ValueError('request must contain fields: filePath') - original_file_path = file_path - file_path = _get_valid_file_path(file_path, request.user) - if not file_path: - raise Exception('Error accessing "{}"'.format(original_file_path)) sample_type = next((st for suffix, st in SAMPLE_TYPE_MAP if file_path.endswith(suffix)), None) if not sample_type: raise Exception('Invalid file extension for "{}" - valid extensions are {}'.format( file_path, ', '.join([suffix for suffix, _ in SAMPLE_TYPE_MAP]))) - if _is_drs_uri_path(file_path) and not request_json.get('indexFilePath'): - raise Exception('Index File Path is required for DRS URIs') + if not does_file_exist(file_path, user=request.user): + raise Exception('Error accessing "{}"'.format(file_path)) sample, created = get_or_create_model_from_json( IgvSample, create_json={'individual': individual, 'sample_type': sample_type}, @@ -190,28 +186,6 @@ def update_individual_igv_sample(request, individual_guid): return create_json_response({'error': error}, status=400, reason=error) -def _is_drs_uri_path(file_path): - return file_path.startswith('drs://') - - -def _get_drs_info(file_path, user): - drs_path = file_path.split('/') - response = requests.get( - f'https://{drs_path[2]}/ga4gh/drs/v1/objects/{drs_path[3]}', - headers=_get_gs_auth_api_headers(user), - ) - - return response.json() if response.status_code == 200 else None - - -def _get_valid_file_path(file_path, user): - if _is_drs_uri_path(file_path): - drs_info = _get_drs_info(file_path, user) - return None if drs_info is None else f"{file_path}/{drs_info['name']}" - - return file_path if does_file_exist(file_path, user=user) else None - - @login_and_policies_required def fetch_igv_track(request, project_guid, igv_track_path): @@ -223,45 +197,25 @@ def fetch_igv_track(request, project_guid, igv_track_path): if is_google_bucket_file_path(igv_track_path): return _stream_gs(request, igv_track_path) - if _is_drs_uri_path(igv_track_path): - return _stream_drs(request, igv_track_path) - return _stream_file(request, igv_track_path) def _stream_gs(request, gs_path): - return _stream_response( - url=f"{GS_STORAGE_URL}/{gs_path.replace('gs://', '', 1)}", - headers=_get_gs_rest_api_headers(gs_path, user=request.user), - request=request) - + headers = _get_gs_rest_api_headers(request.META.get('HTTP_RANGE'), gs_path, user=request.user) -def _stream_drs(request, file_path): - drs_info = _get_drs_info(file_path, request.user) - - https_access = next(a['access_url'] for a in drs_info['access_methods'] if a['type'] == 'https') - headers = dict([h.split(': ') for h in https_access['headers']]) - - return _stream_response(https_access['url'], headers, request) - - -def _stream_response(url, headers, request): - range_header = request.META.get('HTTP_RANGE') - if range_header: - headers['Range'] = range_header - - response = requests.get(url, headers=headers, stream=True, timeout=TIMEOUT) + response = requests.get( + f"{GS_STORAGE_URL}/{gs_path.replace('gs://', '', 1)}", + headers=headers, + stream=True, timeout=TIMEOUT) return StreamingHttpResponse(response.iter_content(chunk_size=65536), status=response.status_code, content_type='application/octet-stream') -def _get_gs_auth_api_headers(user): - return {'Authorization': 'Bearer {}'.format(_get_access_token(user))} - - -def _get_gs_rest_api_headers(gs_path, user): - headers = 
_get_gs_auth_api_headers(user) +def _get_gs_rest_api_headers(range_header, gs_path, user=None): + headers = {'Authorization': 'Bearer {}'.format(_get_access_token(user))} + if range_header: + headers['Range'] = range_header google_project = get_google_project(gs_path) if google_project: headers['x-goog-user-project'] = get_google_project(gs_path) From 140c7213db86f160dbb4ba71cc01a18d82044e09 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 24 Jul 2024 11:19:07 -0400 Subject: [PATCH 27/51] validate new index file --- seqr/views/apis/igv_api.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/seqr/views/apis/igv_api.py b/seqr/views/apis/igv_api.py index 7bf9320112..e74e26856a 100644 --- a/seqr/views/apis/igv_api.py +++ b/seqr/views/apis/igv_api.py @@ -58,8 +58,11 @@ def _process_igv_table_handler(parse_uploaded_file, get_valid_matched_individual info.append(message) existing_sample_files = defaultdict(set) + existing_sample_index_files = defaultdict(set) for sample in IgvSample.objects.select_related('individual').filter(individual__in=matched_individuals.keys()): existing_sample_files[sample.individual].add(sample.file_path) + if sample.index_file_path: + existing_sample_index_files[sample.individual].add(sample.index_file_path) num_unchanged_rows = 0 all_updates = [] @@ -68,6 +71,7 @@ def _process_igv_table_handler(parse_uploaded_file, get_valid_matched_individual dict(individualGuid=individual.guid, individualId=individual.individual_id, **update) for update in updates if update['filePath'] not in existing_sample_files[individual] + or (update['indexFilePath'] and update['indexFilePath'] not in existing_sample_index_files) ] all_updates += changed_updates num_unchanged_rows += len(updates) - len(changed_updates) @@ -164,6 +168,8 @@ def update_individual_igv_sample(request, individual_guid): file_path, ', '.join([suffix for suffix, _ in SAMPLE_TYPE_MAP]))) if not does_file_exist(file_path, user=request.user): raise Exception('Error accessing "{}"'.format(file_path)) + if request_json.get('indexFilePath') and not does_file_exist(request_json['indexFilePath'], user=request.user): + raise Exception('Error accessing "{}"'.format(request_json['indexFilePath'])) sample, created = get_or_create_model_from_json( IgvSample, create_json={'individual': individual, 'sample_type': sample_type}, From fa7a969ec13c3eb2dd632749b6301a56e9d1f8a1 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 24 Jul 2024 12:21:37 -0400 Subject: [PATCH 28/51] update tests --- seqr/views/apis/igv_api_tests.py | 52 ++++++++++++++++++++------------ seqr/views/utils/test_utils.py | 2 +- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/seqr/views/apis/igv_api_tests.py b/seqr/views/apis/igv_api_tests.py index 4de3bd6df5..2080f655cb 100644 --- a/seqr/views/apis/igv_api_tests.py +++ b/seqr/views/apis/igv_api_tests.py @@ -124,8 +124,8 @@ def test_receive_alignment_table_handler(self): self.assertListEqual( response_json['info'], ['Parsed 3 rows in 2 individuals from samples.csv', 'No change detected for 1 rows']) self.assertListEqual(sorted(response_json['updates'], key=lambda o: o['individualGuid']), [ - {'individualGuid': 'I000001_na19675', 'individualId': 'NA19675_1', 'filePath': 'gs://readviz/batch_10.dcr.bed.gz', 'sampleId': 'NA19675'}, - {'individualGuid': 'I000003_na19679', 'individualId': 'NA19679', 'filePath': 'gs://readviz/NA19679.bam', 'sampleId': None}, + {'individualGuid': 'I000001_na19675', 'individualId': 'NA19675_1', 'filePath': 'gs://readviz/batch_10.dcr.bed.gz', 'indexFilePath': None, 
'sampleId': 'NA19675'}, + {'individualGuid': 'I000003_na19679', 'individualId': 'NA19679', 'filePath': 'gs://readviz/NA19679.bam', 'indexFilePath': None, 'sampleId': None}, ]) # test data manager access @@ -147,7 +147,7 @@ def test_receive_bulk_alignment_table_handler(self, mock_load_uploaded_file): request_data = json.dumps({'mappingFile': {'uploadedFileId': uploaded_file_id}}) pm_projects_rows = [ ['1kg project nåme with uniçøde', 'NA19675_1', 'gs://readviz/batch_10.dcr.bed.gz', 'NA19675'], - ['1kg project nåme with uniçøde', 'NA19675_1', 'gs://readviz/NA19675_1.bam'], + ['1kg project nåme with uniçøde', 'NA19675_1', 'gs://readviz/NA19675_1.bam', 'gs://readviz-index/NA19675_1.bai'], ['1kg project nåme with uniçøde', 'NA20870', 'gs://readviz/NA20870.cram'], ['Test Reprocessed Project', 'NA20885', 'gs://readviz/NA20885.cram'], ] @@ -177,21 +177,26 @@ def test_receive_bulk_alignment_table_handler(self, mock_load_uploaded_file): self.assertListEqual(response_json['warnings'], []) self.assertListEqual(response_json['info'], ['Parsed 4 rows in 3 individuals', 'No change detected for 1 rows']) updates = [ - {'individualGuid': 'I000001_na19675', 'individualId': 'NA19675_1', 'filePath': 'gs://readviz/batch_10.dcr.bed.gz', 'sampleId': 'NA19675'}, - {'individualGuid': 'I000001_na19675', 'individualId': 'NA19675_1', 'filePath': 'gs://readviz/NA19675_1.bam', 'sampleId': None}, - {'individualGuid': 'I000015_na20885', 'individualId': 'NA20885', 'filePath': 'gs://readviz/NA20885.cram', 'sampleId': None}, + {'individualGuid': 'I000001_na19675', 'individualId': 'NA19675_1', 'filePath': 'gs://readviz/batch_10.dcr.bed.gz', 'indexFilePath': None, 'sampleId': 'NA19675'}, + {'individualGuid': 'I000001_na19675', 'individualId': 'NA19675_1', 'filePath': 'gs://readviz/NA19675_1.bam', + 'indexFilePath': 'gs://readviz-index/NA19675_1.bai', 'sampleId': None}, + {'individualGuid': 'I000015_na20885', 'individualId': 'NA20885', 'filePath': 'gs://readviz/NA20885.cram', 'indexFilePath': None, 'sampleId': None}, ] self.assertListEqual(sorted(response_json['updates'], key=lambda o: o['individualGuid']), updates) # test data manager access self.login_data_manager_user() + rows[2].append('gs://readviz-index/NA20870.crai') mock_load_uploaded_file.return_value = rows response = self.client.post(url, content_type='application/json', data=request_data) self.assertEqual(response.status_code, 200) response_json = response.json() - self.assertListEqual(response_json['info'], ['Parsed 5 rows in 4 individuals', 'No change detected for 1 rows']) - self.assertListEqual(sorted(response_json['updates'], key=lambda o: o['individualGuid']), updates + [ - {'individualGuid': 'I000018_na21234', 'individualId': 'NA21234', 'filePath': 'gs://readviz/NA21234.cram', 'sampleId': None} + self.assertListEqual(response_json['info'], ['Parsed 5 rows in 4 individuals']) + self.assertListEqual(sorted(response_json['updates'], key=lambda o: o['individualGuid']), updates[:2] + [ + {'individualGuid': 'I000007_na20870', 'individualId': 'NA20870', 'sampleId': None, + 'filePath': 'gs://readviz/NA20870.cram', 'indexFilePath': 'gs://readviz-index/NA20870.crai'}, + updates[2], + {'individualGuid': 'I000018_na21234', 'individualId': 'NA21234', 'filePath': 'gs://readviz/NA21234.cram', 'indexFilePath': None, 'sampleId': None} ]) @mock.patch('seqr.utils.file_utils.subprocess.Popen') @@ -206,7 +211,7 @@ def test_add_alignment_sample(self, mock_local_file_exists, mock_subprocess): self.assertEqual(response.reason_phrase, 'request must contain fields: filePath') 
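        # (Editor's note, illustrative: the requests below exercise the validation
        # order in update_individual_igv_sample from the preceding patches -- the
        # file extension is checked first, then filePath via does_file_exist, and
        # finally indexFilePath, when provided, via the same does_file_exist call.)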
response = self.client.post(url, content_type='application/json', data=json.dumps({ - 'filePath': 'invalid_path.txt', + 'filePath': 'invalid_path.txt', 'indexFilePath': None, })) self.assertEqual(response.status_code, 400) self.assertEqual( @@ -216,31 +221,40 @@ def test_add_alignment_sample(self, mock_local_file_exists, mock_subprocess): mock_local_file_exists.return_value = False mock_subprocess.return_value.wait.return_value = 1 response = self.client.post(url, content_type='application/json', data=json.dumps({ - 'filePath': '/readviz/NA19675_new.cram', + 'filePath': '/readviz/NA19675_new.cram', 'indexFilePath': None, })) self.assertEqual(response.status_code, 400) self.assertEqual(response.reason_phrase, 'Error accessing "/readviz/NA19675_new.cram"') response = self.client.post(url, content_type='application/json', data=json.dumps({ - 'filePath': 'gs://readviz/NA19675_new.cram', + 'filePath': 'gs://readviz/NA19675_new.cram', 'indexFilePath': None, })) self.assertEqual(response.status_code, 400) self.assertEqual(response.reason_phrase, 'Error accessing "gs://readviz/NA19675_new.cram"') - # Send valid request mock_local_file_exists.return_value = True + response = self.client.post(url, content_type='application/json', data=json.dumps({ + 'filePath': '/readviz/NA19675.new.cram', 'indexFilePath': 'gs://readviz/NA19675_new.crai', + })) + self.assertEqual(response.status_code, 400) + self.assertEqual(response.reason_phrase, 'Error accessing "gs://readviz/NA19675_new.crai"') + + # Send valid request mock_subprocess.return_value.wait.return_value = 0 response = self.client.post(url, content_type='application/json', data=json.dumps({ - 'filePath': '/readviz/NA19675.new.cram', + 'filePath': '/readviz/NA19675.new.cram', 'indexFilePath': '/readviz-index/NA19675.cram.crai', })) self.assertEqual(response.status_code, 200) self.assertDictEqual(response.json(), {'igvSamplesByGuid': {'S000145_na19675': { 'projectGuid': PROJECT_GUID, 'individualGuid': 'I000001_na19675', 'sampleGuid': 'S000145_na19675', - 'familyGuid': 'F000001_1', 'filePath': '/readviz/NA19675.new.cram', 'sampleId': None, 'sampleType': 'alignment'}}}) - mock_local_file_exists.assert_called_with('/readviz/NA19675.new.cram') + 'familyGuid': 'F000001_1', 'filePath': '/readviz/NA19675.new.cram', + 'indexFilePath': '/readviz-index/NA19675.cram.crai', 'sampleId': None, 'sampleType': 'alignment'}}}) + mock_local_file_exists.assert_has_calls([ + mock.call('/readviz/NA19675.new.cram'), mock.call('/readviz-index/NA19675.cram.crai'), + ]) response = self.client.post(url, content_type='application/json', data=json.dumps({ - 'filePath': 'gs://readviz/batch_10.dcr.bed.gz', 'sampleId': 'NA19675', + 'filePath': 'gs://readviz/batch_10.dcr.bed.gz', 'sampleId': 'NA19675', 'indexFilePath': None, })) self.assertEqual(response.status_code, 200) response_json = response.json() @@ -250,7 +264,7 @@ def test_add_alignment_sample(self, mock_local_file_exists, mock_subprocess): sample_guid = next(iter(response_json['igvSamplesByGuid'])) self.assertDictEqual(response_json['igvSamplesByGuid'][sample_guid], { 'projectGuid': PROJECT_GUID, 'individualGuid': 'I000001_na19675', 'sampleGuid': sample_guid, - 'familyGuid': 'F000001_1', 'filePath': 'gs://readviz/batch_10.dcr.bed.gz', 'sampleId': 'NA19675', 'sampleType': 'gcnv'}) + 'familyGuid': 'F000001_1', 'filePath': 'gs://readviz/batch_10.dcr.bed.gz', 'indexFilePath': None, 'sampleId': 'NA19675', 'sampleType': 'gcnv'}) self.assertListEqual(list(response_json['individualsByGuid'].keys()), ['I000001_na19675']) 
self.assertSetEqual( set(response_json['individualsByGuid']['I000001_na19675']['igvSampleGuids']), @@ -269,7 +283,7 @@ def test_add_alignment_sample(self, mock_local_file_exists, mock_subprocess): self.assertDictEqual(response_json['igvSamplesByGuid'][junctions_sample_guid], { 'projectGuid': PROJECT_GUID, 'individualGuid': 'I000001_na19675', 'sampleGuid': junctions_sample_guid, 'familyGuid': 'F000001_1', 'filePath': 'gs://readviz/batch_10.junctions.bed.gz', 'sampleId': 'NA19675', - 'sampleType': 'spliceJunctions'}) + 'indexFilePath': None, 'sampleType': 'spliceJunctions'}) # test data manager access self.login_data_manager_user() diff --git a/seqr/views/utils/test_utils.py b/seqr/views/utils/test_utils.py index 4b1b3780af..bc9fe1ffb9 100644 --- a/seqr/views/utils/test_utils.py +++ b/seqr/views/utils/test_utils.py @@ -797,7 +797,7 @@ def _get_list_param(call, param): } IGV_SAMPLE_FIELDS = { - 'projectGuid', 'familyGuid', 'individualGuid', 'sampleGuid', 'filePath', 'sampleId', 'sampleType', + 'projectGuid', 'familyGuid', 'individualGuid', 'sampleGuid', 'filePath', 'indexFilePath', 'sampleId', 'sampleType', } SAVED_VARIANT_FIELDS = {'variantGuid', 'variantId', 'familyGuids', 'xpos', 'ref', 'alt', 'selectedMainTranscriptId', 'acmgClassification'} From 0c7e2be24530f5cc0f989084378f30b89cce8c3b Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 24 Jul 2024 15:12:37 -0400 Subject: [PATCH 29/51] add family name edit button for anvil projects --- ui/shared/components/panel/family/Family.jsx | 31 +++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/ui/shared/components/panel/family/Family.jsx b/ui/shared/components/panel/family/Family.jsx index b6036941f6..b11876a74b 100644 --- a/ui/shared/components/panel/family/Family.jsx +++ b/ui/shared/components/panel/family/Family.jsx @@ -35,6 +35,8 @@ import { import { FirstSample, AnalystEmailDropdown, AnalysedBy, AnalysisGroups, analysisStatusIcon } from './FamilyFields' import FamilyLayout from './FamilyLayout' +const FAMILY_NAME_FIELD_PROPS = { label: 'Name' } + const ASSIGNED_ANALYST_EDIT_FIELDS = [ { name: 'assigned_analyst_username', @@ -221,25 +223,38 @@ class Family extends React.PureComponent { }) } + familyHeader = (displayName) => { + const { family, showFamilyPageLink } = this.props + const content = showFamilyPageLink ? + {displayName} : + displayName + return + } + render() { const { project, family, fields, rightContent, compact, useFullWidth, disablePedigreeZoom, disableEdit, - showFamilyPageLink, annotation, hidePedigree, toggleDetails, + annotation, hidePedigree, toggleDetails, updateFamily: dispatchUpdateFamily, } = this.props if (!family) { return
Family Not Found
} + const isEditable = !disableEdit && project.canEdit + let leftContent = null if (!hidePedigree) { const familyHeader = ( - {family.displayName} : - family.displayName} + ) leftContent = ( @@ -256,7 +271,7 @@ class Family extends React.PureComponent { key="pedigree" family={family} disablePedigreeZoom={disablePedigreeZoom} - isEditable={!disableEdit && project.canEdit} + isEditable={isEditable} /> )} From 1167b1d3868b958c988f534a0a8a1a4a416c768b Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 24 Jul 2024 15:17:32 -0400 Subject: [PATCH 30/51] edit underlyin id field --- ui/shared/components/panel/family/Family.jsx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ui/shared/components/panel/family/Family.jsx b/ui/shared/components/panel/family/Family.jsx index b11876a74b..efa5dd5dc9 100644 --- a/ui/shared/components/panel/family/Family.jsx +++ b/ui/shared/components/panel/family/Family.jsx @@ -223,11 +223,11 @@ class Family extends React.PureComponent { }) } - familyHeader = (displayName) => { + familyHeader = () => { const { family, showFamilyPageLink } = this.props const content = showFamilyPageLink ? - {displayName} : - displayName + {family.displayName} : + family.displayName return } @@ -247,7 +247,7 @@ class Family extends React.PureComponent { if (!hidePedigree) { const familyHeader = ( Date: Wed, 24 Jul 2024 15:36:11 -0400 Subject: [PATCH 31/51] actually edit family id --- seqr/views/apis/family_api.py | 13 +++++++++---- seqr/views/apis/individual_api.py | 13 ++++--------- seqr/views/utils/permissions_utils.py | 5 +++++ 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/seqr/views/apis/family_api.py b/seqr/views/apis/family_api.py index d12530007c..7e9bb08fbf 100644 --- a/seqr/views/apis/family_api.py +++ b/seqr/views/apis/family_api.py @@ -25,7 +25,7 @@ from seqr.models import Family, FamilyAnalysedBy, Individual, FamilyNote, Sample, VariantTag, AnalysisGroup, RnaSeqTpm, \ PhenotypePrioritization, Project, RnaSeqOutlier, RnaSeqSpliceOutlier, RnaSample from seqr.views.utils.permissions_utils import check_project_permissions, get_project_and_check_pm_permissions, \ - login_and_policies_required, user_is_analyst, has_case_review_permissions + login_and_policies_required, user_is_analyst, has_case_review_permissions, external_anvil_project_can_edit from seqr.views.utils.variant_utils import get_phenotype_prioritization, get_omim_intervals_query, DISCOVERY_CATEGORY from seqr.utils.xpos_utils import get_chrom_pos @@ -269,15 +269,20 @@ def update_family_fields_handler(request, family_guid): check_project_permissions(family.project, request.user) request_json = json.loads(request.body) + immutable_keys = [] if external_anvil_project_can_edit(family.project, request.user) else ['family_id'] update_family_from_json(family, request_json, user=request.user, allow_unknown_keys=True, immutable_keys=[ - 'family_id', 'display_name', - ]) + 'display_name', + ] + immutable_keys) return create_json_response({ - family.guid: _get_json_for_model(family, user=request.user) + family.guid: _get_json_for_model(family, user=request.user, process_result=_set_display_name) }) +def _set_display_name(family_json, family_model): + family_json['displayName'] = family_model.display_name or family_model.family_id + + @login_and_policies_required def update_family_assigned_analyst(request, family_guid): """Updates the specified field in the Family model. 
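
(Editor's sketch: the immutable_keys gate above, restated as a standalone helper for
clarity. _immutable_family_keys is a hypothetical name, not part of the patch;
external_anvil_project_can_edit is the real helper added to permissions_utils.py below.)

    def _immutable_family_keys(project, user):
        # display_name is always server-managed; family_id becomes editable only
        # for collaborators with can_edit access on an external AnVIL project
        keys = ['display_name']
        if not external_anvil_project_can_edit(project, user):
            keys.append('family_id')
        return keys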
diff --git a/seqr/views/apis/individual_api.py b/seqr/views/apis/individual_api.py index 7b6d6f0c59..1224dd16ad 100644 --- a/seqr/views/apis/individual_api.py +++ b/seqr/views/apis/individual_api.py @@ -22,8 +22,8 @@ GREGOR_FINDING_TAG_TYPE from seqr.views.utils.pedigree_info_utils import parse_pedigree_table, validate_fam_file_records, JsonConstants, ErrorsWarningsException from seqr.views.utils.permissions_utils import get_project_and_check_permissions, check_project_permissions, \ - get_project_and_check_pm_permissions, login_and_policies_required, has_project_permissions, project_has_anvil, \ - is_internal_anvil_project, pm_or_data_manager_required, check_workspace_perm + get_project_and_check_pm_permissions, login_and_policies_required, has_project_permissions, external_anvil_project_can_edit, \ + pm_or_data_manager_required, check_workspace_perm from seqr.views.utils.project_context_utils import add_project_tag_type_counts from seqr.views.utils.individual_utils import delete_individuals, add_or_update_individuals_and_families from seqr.views.utils.variant_utils import bulk_create_tagged_variants @@ -118,11 +118,6 @@ def _get_parsed_features(features): return list(parsed_features.values()) -def _anvil_project_can_edit_pedigree(project, user): - return project_has_anvil(project) and has_project_permissions(project, user, can_edit=True) and not \ - is_internal_anvil_project(project) - - @login_and_policies_required def edit_individuals_handler(request, project_guid): """Modify one or more Individual records. @@ -153,7 +148,7 @@ def edit_individuals_handler(request, project_guid): """ project = get_project_and_check_pm_permissions(project_guid, request.user, - override_permission_func=_anvil_project_can_edit_pedigree) + override_permission_func=external_anvil_project_can_edit) request_json = json.loads(request.body) @@ -230,7 +225,7 @@ def delete_individuals_handler(request, project_guid): # validate request project = get_project_and_check_pm_permissions(project_guid, request.user, - override_permission_func=_anvil_project_can_edit_pedigree) + override_permission_func=external_anvil_project_can_edit) request_json = json.loads(request.body) individuals_list = request_json.get('individuals') diff --git a/seqr/views/utils/permissions_utils.py b/seqr/views/utils/permissions_utils.py index 035df27b13..b440915052 100644 --- a/seqr/views/utils/permissions_utils.py +++ b/seqr/views/utils/permissions_utils.py @@ -145,6 +145,11 @@ def project_has_anvil(project): return anvil_enabled() and bool(project.workspace_namespace and project.workspace_name) +def external_anvil_project_can_edit(project, user): + return project_has_anvil(project) and has_project_permissions(project, user, can_edit=True) and not \ + is_internal_anvil_project(project) + + def _map_anvil_seqr_permission(anvil_permission): if anvil_permission.get('pending'): return None From 57b4bd74246a319d315ed2d23af89ac25eef1fd0 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 24 Jul 2024 15:53:16 -0400 Subject: [PATCH 32/51] add tests --- seqr/views/apis/family_api_tests.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/seqr/views/apis/family_api_tests.py b/seqr/views/apis/family_api_tests.py index d638e07f7a..7c93eded24 100644 --- a/seqr/views/apis/family_api_tests.py +++ b/seqr/views/apis/family_api_tests.py @@ -11,7 +11,8 @@ update_family_fields_handler, update_family_analysed_by, edit_families_handler, delete_families_handler, \ receive_families_table_handler, create_family_note, 
update_family_note, delete_family_note, family_page_data, \ family_variant_tag_summary, update_family_analysis_groups, get_family_rna_seq_data, get_family_phenotype_gene_scores -from seqr.views.utils.test_utils import AuthenticationTestCase, FAMILY_NOTE_FIELDS, FAMILY_FIELDS, IGV_SAMPLE_FIELDS, \ +from seqr.views.utils.test_utils import AuthenticationTestCase, AnvilAuthenticationTestCase, \ + FAMILY_NOTE_FIELDS, FAMILY_FIELDS, IGV_SAMPLE_FIELDS, \ SAMPLE_FIELDS, INDIVIDUAL_FIELDS, INTERNAL_INDIVIDUAL_FIELDS, INTERNAL_FAMILY_FIELDS, CASE_REVIEW_FAMILY_FIELDS, \ MATCHMAKER_SUBMISSION_FIELDS, TAG_TYPE_FIELDS, CASE_REVIEW_INDIVIDUAL_FIELDS from seqr.models import FamilyAnalysedBy, AnalysisGroup @@ -35,8 +36,7 @@ SAMPLE_GUIDS = ['S000129_na19675', 'S000130_na19678', 'S000131_na19679'] -class FamilyAPITest(AuthenticationTestCase): - fixtures = ['users', '1kg_project', 'reference_data'] +class FamilyAPITest(object): def test_family_page_data(self): url = reverse(family_page_data, args=[FAMILY_GUID]) @@ -253,6 +253,7 @@ def test_edit_families_handler(self, mock_pm_group): self.assertEqual(response.status_code, 403) mock_pm_group.__bool__.return_value = True mock_pm_group.resolve_expression.return_value = 'project-managers' + mock_pm_group.__eq__.side_effect = lambda s: s == 'project-managers' response = self.client.post(url, content_type='application/json', data=json.dumps({ 'families': [{'familyGuid': 'F000012_12'}]})) @@ -307,6 +308,7 @@ def test_delete_families_handler(self, mock_pm_group): self.assertEqual(response.status_code, 403) mock_pm_group.__bool__.return_value = True mock_pm_group.resolve_expression.return_value = 'project-managers' + mock_pm_group.__eq__.side_effect = lambda s: s == 'project-managers' response = self.client.post(url, content_type='application/json', data=json.dumps({ 'families': [{'familyGuid': 'F000012_12'}]})) @@ -436,6 +438,17 @@ def test_update_family_fields(self): self.assertEqual(response.status_code, 200) self.assertEqual(response.json()[FAMILY_GUID]['analysisStatusLastModifiedBy'], 'Test Collaborator User') + # Test External AnVIL projects + external_family_url = reverse(update_family_fields_handler, args=['F000014_14']) + response = self.client.post(external_family_url, content_type='application/json', data=json.dumps(body)) + self.assertEqual(response.status_code, 200) + response_json = response.json() + self.assertEqual(response_json['F000014_14']['description'], 'Updated description') + self.assertEqual(response_json['F000014_14'][FAMILY_ID_FIELD], 'new_id' if self._anvil_enabled() else '14') + + def _anvil_enabled(self): + return not self.ES_HOSTNAME + @mock.patch('seqr.views.utils.permissions_utils.PM_USER_GROUP') def test_receive_families_table_handler(self, mock_pm_group): url = reverse(receive_families_table_handler, args=[PROJECT_GUID]) @@ -499,6 +512,7 @@ def test_receive_families_table_handler(self, mock_pm_group): self.assertEqual(response.status_code, 403) mock_pm_group.__bool__.return_value = True mock_pm_group.resolve_expression.return_value = 'project-managers' + mock_pm_group.__eq__.side_effect = lambda s: s == 'project-managers' response = self.client.post(url, {'f': SimpleUploadedFile('families.tsv', 'Family ID\n1'.encode('utf-8'))}) self.assertEqual(response.status_code, 200) @@ -607,3 +621,11 @@ def test_get_family_phenotype_gene_scores(self): } } }) + + +class LocalFamilyAPITest(AuthenticationTestCase, FamilyAPITest): + fixtures = ['users', '1kg_project', 'reference_data'] + + +class AnvilFamilyAPITest(AnvilAuthenticationTestCase, 
FamilyAPITest): + fixtures = ['users', '1kg_project', 'reference_data'] From 4d8c599d93c582eae7d9bf254dad8826e2e9ae86 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 24 Jul 2024 15:58:33 -0400 Subject: [PATCH 33/51] test display name response --- seqr/views/apis/family_api_tests.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/seqr/views/apis/family_api_tests.py b/seqr/views/apis/family_api_tests.py index 7c93eded24..8d2a527c9a 100644 --- a/seqr/views/apis/family_api_tests.py +++ b/seqr/views/apis/family_api_tests.py @@ -428,6 +428,7 @@ def test_update_family_fields(self): response_json = response.json() self.assertEqual(response_json[FAMILY_GUID]['description'], 'Updated description') self.assertEqual(response_json[FAMILY_GUID][FAMILY_ID_FIELD], '1') + self.assertEqual(response_json[FAMILY_GUID]['displayName'], '1') self.assertEqual(response_json[FAMILY_GUID]['analysisStatus'], 'C') self.assertEqual(response_json[FAMILY_GUID]['analysisStatusLastModifiedBy'], 'Test Collaborator User') self.assertEqual(response_json[FAMILY_GUID]['analysisStatusLastModifiedDate'], '2020-01-01T00:00:00') @@ -444,7 +445,9 @@ def test_update_family_fields(self): self.assertEqual(response.status_code, 200) response_json = response.json() self.assertEqual(response_json['F000014_14']['description'], 'Updated description') - self.assertEqual(response_json['F000014_14'][FAMILY_ID_FIELD], 'new_id' if self._anvil_enabled() else '14') + expected_id = 'new_id' if self._anvil_enabled() else '14' + self.assertEqual(response_json['F000014_14'][FAMILY_ID_FIELD], expected_id) + self.assertEqual(response_json['F000014_14']['displayName'], expected_id) def _anvil_enabled(self): return not self.ES_HOSTNAME From 7c008863a69ee71025322b2acabb15d936025e24 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 24 Jul 2024 16:52:04 -0400 Subject: [PATCH 34/51] mock persist file to gs --- seqr/views/apis/family_api_tests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/seqr/views/apis/family_api_tests.py b/seqr/views/apis/family_api_tests.py index 8d2a527c9a..6d11edca20 100644 --- a/seqr/views/apis/family_api_tests.py +++ b/seqr/views/apis/family_api_tests.py @@ -452,6 +452,7 @@ def test_update_family_fields(self): def _anvil_enabled(self): return not self.ES_HOSTNAME + @mock.patch('seqr.views.utils.file_utils.mv_file_to_gs', lambda *args: True) @mock.patch('seqr.views.utils.permissions_utils.PM_USER_GROUP') def test_receive_families_table_handler(self, mock_pm_group): url = reverse(receive_families_table_handler, args=[PROJECT_GUID]) From 9d74524ee557fc42bb40747c33c5280586e99b8b Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 24 Jul 2024 17:14:59 -0400 Subject: [PATCH 35/51] fix test persist file --- seqr/views/apis/family_api_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seqr/views/apis/family_api_tests.py b/seqr/views/apis/family_api_tests.py index 6d11edca20..0544812056 100644 --- a/seqr/views/apis/family_api_tests.py +++ b/seqr/views/apis/family_api_tests.py @@ -452,7 +452,7 @@ def test_update_family_fields(self): def _anvil_enabled(self): return not self.ES_HOSTNAME - @mock.patch('seqr.views.utils.file_utils.mv_file_to_gs', lambda *args: True) + @mock.patch('seqr.views.utils.file_utils.anvil_enabled', lambda: False) @mock.patch('seqr.views.utils.permissions_utils.PM_USER_GROUP') def test_receive_families_table_handler(self, mock_pm_group): url = reverse(receive_families_table_handler, args=[PROJECT_GUID]) From 4c0c6563407e322e748008e1bb1e8c1f708b5275 Mon Sep 17 
00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 25 Jul 2024 15:56:20 -0400 Subject: [PATCH 36/51] Add batch size to rna seq load route (#4257) * Add batch size to rna seq load * Pass kwargs down --- seqr/models.py | 8 ++++---- seqr/views/apis/data_manager_api.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/seqr/models.py b/seqr/models.py index 4514644634..a3b6cbc36e 100644 --- a/seqr/models.py +++ b/seqr/models.py @@ -124,13 +124,13 @@ def delete_model(self, user, user_can_delete=False): log_model_update(logger, self, user, 'delete') @classmethod - def bulk_create(cls, user, new_models): + def bulk_create(cls, user, new_models, **kwargs): """Helper bulk create method that logs the creation""" for model in new_models: model.created_by = user model.created_date = timezone.now() model.guid = model._format_guid(random.randint(10**(cls.GUID_PRECISION-1), 10**cls.GUID_PRECISION)) # nosec - models = cls.objects.bulk_create(new_models) + models = cls.objects.bulk_create(new_models, **kwargs) log_model_bulk_update(logger, models, user, 'create') return models @@ -1139,11 +1139,11 @@ def log_model_no_guid_bulk_update(cls, models, user, update_type): logger.info(f'{update_type} {db_entity}s', user, db_update=db_update) @classmethod - def bulk_create(cls, user, new_models): + def bulk_create(cls, user, new_models, **kwargs): """Helper bulk create method that logs the creation""" for model in new_models: model.created_by = user - models = cls.objects.bulk_create(new_models) + models = cls.objects.bulk_create(new_models, **kwargs) cls.log_model_no_guid_bulk_update(models, user, 'create') return models diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index 0960c2d3eb..4b2f9d1e1b 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -336,7 +336,7 @@ def load_rna_seq_sample_data(request, sample_guid): return create_json_response({'error': error}, status=400) model_cls = config['model_class'] - model_cls.bulk_create(request.user, [model_cls(sample=sample, **data) for data in data_rows]) + model_cls.bulk_create(request.user, [model_cls(sample=sample, **data) for data in data_rows], batch_size=1000) update_model_from_json(sample, {'is_active': True}, user=request.user) return create_json_response({'success': True}) From 23fbddd497d4300d738c31ed565bf4b78a6c760e Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 25 Jul 2024 16:05:37 -0400 Subject: [PATCH 37/51] pr feedback --- seqr/views/apis/data_manager_api.py | 10 +++++----- seqr/views/apis/data_manager_api_tests.py | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index c268250fb0..1ad54f5d99 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -498,7 +498,7 @@ def _fetch_airtable_loadable_project_samples(user): project_samples = defaultdict(set) for pdo in pdos.values(): project_guid = re.match( - 'https://seqr.broadinstitute.org/project/([^/]+)/project_page', pdo['SeqrProjectURL'], + f'{BASE_URL}project/([^/]+)/project_page', pdo['SeqrProjectURL'], ).group(1) project_samples[project_guid].update([ sample_id for sample_id in pdo['PassingCollaboratorSampleIDs'] + pdo['SeqrIDs'] if sample_id @@ -592,16 +592,16 @@ def _get_valid_project_samples(project_samples, sample_type, user): return individual_ids -def _get_loaded_samples(sample_keys, user): - sample_ids = [sample_id for _, sample_id in sample_keys] +def 
_get_loaded_samples(project_samples, user): + sample_ids = [sample_id for _, sample_id in project_samples] samples_by_id = AirtableSession(user).get_samples_for_sample_ids(sample_ids, ['PDOStatus', 'SeqrProject']) - return [(project, sample_id) for project, sample_id in sample_keys if any( + return [(project, sample_id) for project, sample_id in project_samples if any( _is_loaded_airtable_sample(s, project) for s in samples_by_id.get(sample_id, []) )] def _is_loaded_airtable_sample(sample, project_guid): - return f'https://seqr.broadinstitute.org/project/{project_guid}/project_page' in sample['SeqrProject'] and any( + return f'{BASE_URL}project/{project_guid}/project_page' in sample['SeqrProject'] and any( status in AVAILABLE_PDO_STATUSES for status in sample['PDOStatus']) diff --git a/seqr/views/apis/data_manager_api_tests.py b/seqr/views/apis/data_manager_api_tests.py index 42cf958262..81e77f0fac 100644 --- a/seqr/views/apis/data_manager_api_tests.py +++ b/seqr/views/apis/data_manager_api_tests.py @@ -1460,6 +1460,7 @@ def test_validate_callset(self, mock_subprocess): response = self.client.post(url, content_type='application/json', data=json.dumps(body)) self.assertEqual(response.status_code, 200) + @mock.patch('seqr.views.apis.data_manager_api.BASE_URL', 'https://seqr.broadinstitute.org/') @mock.patch('seqr.views.utils.airtable_utils.is_google_authenticated', lambda x: True) @responses.activate def test_get_loaded_projects(self): @@ -1589,6 +1590,7 @@ def _get_dag_variable_overrides(*args, **kwargs): } @responses.activate + @mock.patch('seqr.views.apis.data_manager_api.BASE_URL', 'https://seqr.broadinstitute.org/') @mock.patch('seqr.views.utils.export_utils.open') @mock.patch('seqr.views.utils.export_utils.TemporaryDirectory') def test_load_data(self, mock_temp_dir, mock_open): From f3d2cf5dc037c43499c0efa9b623d55136c6759b Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 25 Jul 2024 17:18:21 -0400 Subject: [PATCH 38/51] no max families for hpo search --- ui/pages/Search/VariantSearch.jsx | 2 +- ui/pages/Search/components/PageHeader.jsx | 18 ++++++++-- ui/pages/Search/selectors.js | 8 +++-- ui/pages/SummaryData/components/Hpo.jsx | 40 +++++++++++++++-------- ui/redux/rootReducer.js | 5 +-- ui/redux/selectors.js | 1 + ui/redux/utils/configureStore.js | 2 +- 7 files changed, 53 insertions(+), 23 deletions(-) diff --git a/ui/pages/Search/VariantSearch.jsx b/ui/pages/Search/VariantSearch.jsx index 843f9f79db..59e4274206 100644 --- a/ui/pages/Search/VariantSearch.jsx +++ b/ui/pages/Search/VariantSearch.jsx @@ -28,7 +28,7 @@ const VariantSearch = ({ match }) => ( `${match.url}/${pagePath}`)} component={VariantSearchForm} /> - + diff --git a/ui/pages/Search/components/PageHeader.jsx b/ui/pages/Search/components/PageHeader.jsx index 58b2b74ca0..a8beb47253 100644 --- a/ui/pages/Search/components/PageHeader.jsx +++ b/ui/pages/Search/components/PageHeader.jsx @@ -2,7 +2,7 @@ import React from 'react' import PropTypes from 'prop-types' import { connect } from 'react-redux' -import { getProjectsByGuid, getFamiliesByGuid, getAnalysisGroupsByGuid, getSearchesByHash } from 'redux/selectors' +import { getProjectsByGuid, getFamiliesByGuid, getAnalysisGroupsByGuid, getSearchesByHash, getSearchFamiliesByHash } from 'redux/selectors' import PageHeaderLayout from 'shared/components/page/PageHeaderLayout' import { snakecaseToTitlecase } from 'shared/utils/stringUtils' @@ -19,7 +19,14 @@ const PAGE_CONFIGS = { entity: analysisGroupsByGuid[entityGuid], entityUrlPath: `analysis_group/${entityGuid}`, }), - 
families: entityGuid => ({ description: `Searching in ${entityGuid.split(/[,:]/).length} Families` }), + families: (entityGuid, p, f, a, s, searchFamiliesByHash) => { + const numFamilies = Object.values(searchFamiliesByHash[entityGuid] || {}).reduce( + (acc, familyGuids) => acc + familyGuids.length, 0, + ) + return { + description: `Searching in ${numFamilies} Families`, + } + }, results: (entityGuid, projectsByGuid, familiesByGuid, analysisGroupsByGuid, searchesByHash) => { const { projectFamilies } = searchesByHash[entityGuid] || {} let pageType @@ -62,12 +69,15 @@ const PAGE_CONFIGS = { variant: entityGuid => ({ entity: { name: entityGuid } }), } -const getPageHeaderProps = ({ projectsByGuid, familiesByGuid, analysisGroupsByGuid, searchesByHash, match }) => { +const getPageHeaderProps = ( + { projectsByGuid, familiesByGuid, analysisGroupsByGuid, searchesByHash, searchFamiliesByHash, match }, +) => { const { pageType, entityGuid, subPageType, subEntityGuid } = match.params const breadcrumbIdSections = [] const { entity, entityUrlPath, actualPageType, description } = PAGE_CONFIGS[subPageType || pageType]( subEntityGuid || entityGuid, projectsByGuid, familiesByGuid, analysisGroupsByGuid, searchesByHash, + searchFamiliesByHash, ) if (entity) { const project = projectsByGuid[entity.projectGuid || entityGuid] @@ -90,6 +100,7 @@ PageHeader.propTypes = { familiesByGuid: PropTypes.object, analysisGroupsByGuid: PropTypes.object, searchesByHash: PropTypes.object, + searchFamiliesByHash: PropTypes.object, match: PropTypes.object, } @@ -98,6 +109,7 @@ const mapStateToProps = state => ({ familiesByGuid: getFamiliesByGuid(state), analysisGroupsByGuid: getAnalysisGroupsByGuid(state), searchesByHash: getSearchesByHash(state), + searchFamiliesByHash: getSearchFamiliesByHash(state), }) export default connect(mapStateToProps)(PageHeader) diff --git a/ui/pages/Search/selectors.js b/ui/pages/Search/selectors.js index 04b5f042d2..929871dd3d 100644 --- a/ui/pages/Search/selectors.js +++ b/ui/pages/Search/selectors.js @@ -10,6 +10,7 @@ import { getCurrentSearchParams, getUser, getProjectDatasetTypes, + getSearchFamiliesByHash, } from 'redux/selectors' import { FAMILY_ANALYSIS_STATUS_LOOKUP } from 'shared/utils/constants' import { compareObjects } from 'shared/utils/sortUtils' @@ -66,9 +67,10 @@ export const getProjectFamilies = createSelector( export const getMultiProjectFamilies = createSelector( (state, props) => props.match.params, - params => ({ - projectFamilies: params.families.split(':').map(f => f.split(';')).map( - ([projectGuid, familyGuids]) => ({ projectGuid, familyGuids: familyGuids.split(',') }), + getSearchFamiliesByHash, + (params, searchFamiliesByHash) => ({ + projectFamilies: Object.entries(searchFamiliesByHash[params.familiesHash] || {}).map( + ([projectGuid, familyGuids]) => ({ projectGuid, familyGuids }), ), }), ) diff --git a/ui/pages/SummaryData/components/Hpo.jsx b/ui/pages/SummaryData/components/Hpo.jsx index 14bf7d0f18..97e4610e14 100644 --- a/ui/pages/SummaryData/components/Hpo.jsx +++ b/ui/pages/SummaryData/components/Hpo.jsx @@ -1,7 +1,9 @@ import React from 'react' -import { NavLink } from 'react-router-dom' +import { connect } from 'react-redux' +import PropTypes from 'prop-types' import { Divider, Button, Header } from 'semantic-ui-react' +import { navigateSavedHashedSearch } from 'redux/rootReducer' import { NoHoverFamilyLink } from 'shared/components/buttons/FamilyLink' import AwesomeBar from 'shared/components/page/AwesomeBar' import { Phenotypes } from 
'shared/components/panel/MatchmakerPanel' @@ -12,7 +14,6 @@ import { HttpRequestHelper } from 'shared/utils/httpRequestHelper' import { GENOME_VERSION_LOOKUP } from 'shared/utils/constants' const SEARCH_CATEGORIES = ['hpo_terms'] -const MAX_SEARCH_FAMILIES = 500 const ID_FIELD = 'individualGuid' const COLUMNS = [ { @@ -30,7 +31,9 @@ const COLUMNS = [ class Hpo extends React.PureComponent { - static propTypes = {} + static propTypes = { + navigateSearch: PropTypes.func.isRequired, + } state = { data: [], @@ -71,6 +74,7 @@ class Hpo extends React.PureComponent { render() { const { terms, data, loading, error } = this.state + const { navigateSearch } = this.props const familiesByGenomeVersion = data.reduce((acc, { familyData }) => { if (!acc[familyData.genomeVersion]) { @@ -84,9 +88,9 @@ class Hpo extends React.PureComponent { (acc, families) => acc + Object.keys(families).length, 0, ) - const genomeSearchPaths = Object.entries(familiesByGenomeVersion).map(([genomeVersion, familyAcc]) => { + const genomeProjectFamilies = Object.entries(familiesByGenomeVersion).map(([genomeVersion, familyAcc]) => { const families = Object.entries(familyAcc) - const searchPath = families.length < MAX_SEARCH_FAMILIES ? Object.entries(families.reduce( + const projectFamilies = families.reduce( (acc, [familyGuid, projectGuid]) => { if (!acc[projectGuid]) { acc[projectGuid] = [] @@ -94,8 +98,8 @@ class Hpo extends React.PureComponent { acc[projectGuid].push(familyGuid) return acc }, {}, - )).map(([projectGuid, familyGuids]) => `${projectGuid};${familyGuids.join(',')}`).join(':') : '' - return [genomeVersion, searchPath] + ) + return [genomeVersion, projectFamilies] }) return ( @@ -124,14 +128,12 @@ class Hpo extends React.PureComponent {
{`${numFamilies} Families, ${data.length} Individuals`} - {genomeSearchPaths.map(([genomeVersion, searchPath]) => ( + {genomeProjectFamilies.map(([genomeVersion, projectFamilies]) => ( {`${GENOME_VERSION_LOOKUP[genomeVersion]}: `} {`Variant Search - ${Object.keys(familiesByGenomeVersion[genomeVersion]).length} Families`} @@ -155,4 +157,16 @@ class Hpo extends React.PureComponent { } -export default Hpo +const mapDispatchToProps = dispatch => ({ + navigateSearch: (e, { projectFamilies }) => { + e.stopPropagation() + dispatch(navigateSavedHashedSearch( + projectFamilies, + resultsLink => window.open(resultsLink, '_blank'), + '/variant_search/families', + 'searchFamiliesByHash', + )) + }, +}) + +export default connect(null, mapDispatchToProps)(Hpo) diff --git a/ui/redux/rootReducer.js b/ui/redux/rootReducer.js index 8570df4b8e..8053d84340 100644 --- a/ui/redux/rootReducer.js +++ b/ui/redux/rootReducer.js @@ -187,11 +187,11 @@ export const updateGeneNote = values => updateEntity( values, RECEIVE_DATA, `/api/gene_info/${values.geneId || values.gene_id}/note`, 'noteGuid', ) -export const navigateSavedHashedSearch = (search, navigateSearch, resultsPath) => (dispatch) => { +export const navigateSavedHashedSearch = (search, navigateSearch, resultsPath, hashKey) => (dispatch) => { // lazy load object-hash library as it is not used anywhere else import('object-hash').then((hash) => { const searchHash = hash.default.MD5(search) - dispatch({ type: RECEIVE_SAVED_SEARCHES, updatesById: { searchesByHash: { [searchHash]: search } } }) + dispatch({ type: RECEIVE_SAVED_SEARCHES, updatesById: { [hashKey || 'searchesByHash']: { [searchHash]: search } } }) navigateSearch(`${resultsPath || '/variant_search/results'}/${searchHash}`) }) } @@ -343,6 +343,7 @@ const rootReducer = combineReducers({ variantNotesByGuid: createObjectsByIdReducer(RECEIVE_DATA, 'variantNotesByGuid'), variantFunctionalDataByGuid: createObjectsByIdReducer(RECEIVE_DATA, 'variantFunctionalDataByGuid'), searchesByHash: createObjectsByIdReducer(RECEIVE_SAVED_SEARCHES, 'searchesByHash'), + searchFamiliesByHash: createObjectsByIdReducer(RECEIVE_SAVED_SEARCHES, 'searchFamiliesByHash'), searchedVariants: createSingleValueReducer(RECEIVE_SEARCHED_VARIANTS, []), searchedVariantsLoading: loadingReducer(REQUEST_SEARCHED_VARIANTS, RECEIVE_SEARCHED_VARIANTS), searchGeneBreakdown: createObjectsByIdReducer(RECEIVE_SEARCH_GENE_BREAKDOWN, 'searchGeneBreakdown'), diff --git a/ui/redux/selectors.js b/ui/redux/selectors.js index 5db3c308e3..4ef20a4229 100644 --- a/ui/redux/selectors.js +++ b/ui/redux/selectors.js @@ -45,6 +45,7 @@ export const getAnvilLoadingDelayDate = state => state.meta.anvilLoadingDelayDat export const getSavedVariantsIsLoading = state => state.savedVariantsLoading.isLoading export const getSavedVariantsLoadingError = state => state.savedVariantsLoading.errorMessage export const getSearchesByHash = state => state.searchesByHash +export const getSearchFamiliesByHash = state => state.searchFamiliesByHash export const getSearchedVariants = state => state.searchedVariants export const getSearchedVariantsIsLoading = state => state.searchedVariantsLoading.isLoading export const getSearchedVariantsErrorMessage = state => state.searchedVariantsLoading.errorMessage diff --git a/ui/redux/utils/configureStore.js b/ui/redux/utils/configureStore.js index a35be8fe44..cbbaf06326 100644 --- a/ui/redux/utils/configureStore.js +++ b/ui/redux/utils/configureStore.js @@ -5,7 +5,7 @@ import { loadState, saveState } from 'shared/utils/localStorage' const 
PERSISTING_STATE = [ 'projectsTableState', 'familyTableState', 'savedVariantTableState', 'variantSearchDisplay', 'searchesByHash', - 'familyTableFilterState', + 'familyTableFilterState', 'searchFamiliesByHash', ] const persistStoreMiddleware = store => next => (action) => { From 73f70942e15e7ce02805bfa07c0c4e999ab5e394 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Fri, 26 Jul 2024 14:22:05 -0400 Subject: [PATCH 39/51] clear unused freq fields on header change --- ui/shared/components/panel/search/FrequencyFilter.jsx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ui/shared/components/panel/search/FrequencyFilter.jsx b/ui/shared/components/panel/search/FrequencyFilter.jsx index 2deb5ddf0d..6d35360694 100644 --- a/ui/shared/components/panel/search/FrequencyFilter.jsx +++ b/ui/shared/components/panel/search/FrequencyFilter.jsx @@ -145,9 +145,9 @@ const callsetChange = (onChange, initialValues) => val => onChange( { ...initialValues, [THIS_CALLSET_FREQUENCY]: val, [SV_CALLSET_FREQUENCY]: val }, ) -const freqChange = (onChange, initialValues) => val => onChange(FREQUENCIES.filter( +const freqChange = onChange => val => onChange(FREQUENCIES.filter( ({ name }) => name !== THIS_CALLSET_FREQUENCY && name !== SV_CALLSET_FREQUENCY, -).reduce((acc, { name }) => ({ ...acc, [name]: val }), initialValues || {})) +).reduce((acc, { name }) => ({ ...acc, [name]: val }), {})) export const HeaderFrequencyFilter = ({ value, onChange, esEnabled, ...props }) => { const { callset, sv_callset: svCallset, ...freqValues } = value || {} @@ -155,7 +155,7 @@ export const HeaderFrequencyFilter = ({ value, onChange, esEnabled, ...props }) const onCallsetChange = callsetChange(onChange, freqValues) - const onFreqChange = freqChange(onChange, value) + const onFreqChange = freqChange(onChange) const callsetTitle = esEnabled ? 
'Callset' : 'seqr'

   return (

From 13ef9ef23454b1183f5526222c4aa9a5eb6b2852 Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Fri, 26 Jul 2024 14:23:53 -0400
Subject: [PATCH 40/51] larger freq header inputs

---
 ui/shared/components/panel/search/VariantSearchFormPanels.jsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ui/shared/components/panel/search/VariantSearchFormPanels.jsx b/ui/shared/components/panel/search/VariantSearchFormPanels.jsx
index 619676072d..f1bd1f6e83 100644
--- a/ui/shared/components/panel/search/VariantSearchFormPanels.jsx
+++ b/ui/shared/components/panel/search/VariantSearchFormPanels.jsx
@@ -195,7 +195,7 @@ export const FREQUENCY_PANEL = {
   name: 'freqs',
   headerProps: {
     title: 'Frequency',
-    inputSize: 10,
+    inputSize: 12,
     inputProps: {
       component: HeaderFrequencyFilter,
       format: val => val || {},

From 85e578f4471cb92dcfb468730eebd7aeb4ea745a Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Fri, 26 Jul 2024 14:28:32 -0400
Subject: [PATCH 41/51] don't clear callset filter on freq update

---
 ui/shared/components/panel/search/FrequencyFilter.jsx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ui/shared/components/panel/search/FrequencyFilter.jsx b/ui/shared/components/panel/search/FrequencyFilter.jsx
index 6d35360694..695aa0fcff 100644
--- a/ui/shared/components/panel/search/FrequencyFilter.jsx
+++ b/ui/shared/components/panel/search/FrequencyFilter.jsx
@@ -145,9 +145,9 @@ const callsetChange = (onChange, initialValues) => val => onChange(
   { ...initialValues, [THIS_CALLSET_FREQUENCY]: val, [SV_CALLSET_FREQUENCY]: val },
 )

-const freqChange = onChange => val => onChange(FREQUENCIES.filter(
-  ({ name }) => name !== THIS_CALLSET_FREQUENCY && name !== SV_CALLSET_FREQUENCY,
-).reduce((acc, { name }) => ({ ...acc, [name]: val }), {}))
+const freqChange = (onChange, initialValues) => val => onChange(FREQUENCIES.reduce((acc, { name }) => ({
+  ...acc, [name]: name !== THIS_CALLSET_FREQUENCY && name !== SV_CALLSET_FREQUENCY ? val : initialValues[name],
+}), initialValues || {}))

 export const HeaderFrequencyFilter = ({ value, onChange, esEnabled, ...props }) => {
   const { callset, sv_callset: svCallset, ...freqValues } = value || {}
@@ -155,7 +155,7 @@ export const HeaderFrequencyFilter = ({ value, onChange, esEnabled, ...props })

   const onCallsetChange = callsetChange(onChange, freqValues)

-  const onFreqChange = freqChange(onChange)
+  const onFreqChange = freqChange(onChange, value)

   const callsetTitle = esEnabled ? 'Callset' : 'seqr'

   return (

From d5d9baf9d7220ef7cf7a11df15f981db3f83c80c Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Fri, 26 Jul 2024 14:34:55 -0400
Subject: [PATCH 42/51] add browser support to FAQ

---
 ui/pages/Public/components/Faq.jsx | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/ui/pages/Public/components/Faq.jsx b/ui/pages/Public/components/Faq.jsx
index f8b6129edc..5bed4f052c 100644
--- a/ui/pages/Public/components/Faq.jsx
+++ b/ui/pages/Public/components/Faq.jsx
@@ -236,6 +236,17 @@ const FAQS = [
         ),
       },
+  }, {
+    [ENGLISH]: {
+      header: 'Q. Which browsers are supported for seqr?',
+      content: `seqr is only supported in Google Chrome. While it may sometimes function in other browsers, to ensure
+        reliable behavior you should only use seqr in Chrome.`,
+    },
+    [SPANISH]: {
+      header: 'P: ¿Cuáles navegadores son compatibles con seqr?',
+      content: `seqr solamente es compatible con Google Chrome. 
Aunque a veces puede funcionar en otros navegadores, + para garantizar un funcionamiento fiable sólo debe usar seqr en Chrome.`, + }, }, { [ENGLISH]: { header: 'Q. How can I set up seqr locally?', From 0c96bc36400ee7fab6f15f80d4fef9bfb4734a14 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 31 Jul 2024 11:52:03 -0400 Subject: [PATCH 43/51] shared saved variant model util --- .../check_for_new_samples_from_pipeline.py | 10 +++++----- seqr/views/utils/variant_utils.py | 17 ++++++++++++++--- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py index c530eef6b6..bc68c142ba 100644 --- a/seqr/management/commands/check_for_new_samples_from_pipeline.py +++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py @@ -15,7 +15,7 @@ from seqr.utils.search.hail_search_utils import hail_variant_multi_lookup, search_data_type from seqr.views.utils.dataset_utils import match_and_update_search_samples from seqr.views.utils.variant_utils import reset_cached_search_results, update_projects_saved_variant_json, \ - saved_variants_dataset_type_filter + get_saved_variants from settings import SEQR_SLACK_LOADING_NOTIFICATION_CHANNEL logger = logging.getLogger(__name__) @@ -153,10 +153,10 @@ def _reload_shared_variant_annotations(data_type, genome_version, updated_varian if is_sv: updated_annotation_samples = updated_annotation_samples.filter(sample_type=data_type.split('_')[1]) - variant_models = SavedVariant.objects.filter( - family_id__in=updated_annotation_samples.values_list('individual__family', flat=True).distinct(), - **saved_variants_dataset_type_filter(dataset_type), - ).filter(Q(saved_variant_json__genomeVersion__isnull=True) | Q(saved_variant_json__genomeVersion=db_genome_version)) + variant_models = get_saved_variants( + family_guids=updated_annotation_samples.values_list('individual__family__guid', flat=True).distinct(), + dataset_type=dataset_type, genome_version=genome_version, + ) if not variant_models: logger.info('No additional saved variants to update') diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index 17d1f7e36a..5015c78f3a 100644 --- a/seqr/views/utils/variant_utils.py +++ b/seqr/views/utils/variant_utils.py @@ -66,13 +66,24 @@ def update_projects_saved_variant_json(projects, user_email, **kwargs): return updated_variants_by_id -def update_project_saved_variant_json(project_id, family_guids=None, dataset_type=None, user=None, user_email=None): - saved_variants = SavedVariant.objects.filter(family__project_id=project_id).select_related('family') +def get_saved_variants(project_id=None, family_guids=None, dataset_type=None, genome_version=None): + saved_variants = SavedVariant.objects.all() + if project_id: + saved_variants = saved_variants.filter(family__project_id=project_id) if family_guids: saved_variants = saved_variants.filter(family__guid__in=family_guids) - if dataset_type: saved_variants = saved_variants.filter(**saved_variants_dataset_type_filter(dataset_type)) + if genome_version: + db_genome_version = genome_version.replace('GRCh', '') + saved_variants = saved_variants.filter( + Q(saved_variant_json__genomeVersion__isnull=True) | Q(saved_variant_json__genomeVersion=db_genome_version) + ) + return saved_variants + + +def update_project_saved_variant_json(project_id, family_guids=None, dataset_type=None, user=None, user_email=None): + saved_variants = get_saved_variants(project_id, family_guids, 
dataset_type).select_related('family') if not saved_variants: return None From 3523d98e666848e0604c60f62ae8427f8d4fdf05 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 31 Jul 2024 12:02:45 -0400 Subject: [PATCH 44/51] only reload variants on project genome build --- .../check_for_new_samples_from_pipeline.py | 4 ++-- .../commands/reload_saved_variant_json.py | 2 +- seqr/views/apis/saved_variant_api.py | 2 +- seqr/views/utils/variant_utils.py | 20 +++++++++---------- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/seqr/management/commands/check_for_new_samples_from_pipeline.py b/seqr/management/commands/check_for_new_samples_from_pipeline.py index bc68c142ba..17824d8114 100644 --- a/seqr/management/commands/check_for_new_samples_from_pipeline.py +++ b/seqr/management/commands/check_for_new_samples_from_pipeline.py @@ -108,7 +108,7 @@ def handle(self, *args, **options): ) project_families = project_sample_data['family_guids'] updated_families.update(project_families) - updated_project_families.append((project.id, project.name, project_families)) + updated_project_families.append((project.id, project.name, project.genome_version, project_families)) # Send failure notifications failed_family_samples = metadata.get('failed_family_samples', {}) @@ -154,8 +154,8 @@ def _reload_shared_variant_annotations(data_type, genome_version, updated_varian updated_annotation_samples = updated_annotation_samples.filter(sample_type=data_type.split('_')[1]) variant_models = get_saved_variants( + genome_version, dataset_type=dataset_type, family_guids=updated_annotation_samples.values_list('individual__family__guid', flat=True).distinct(), - dataset_type=dataset_type, genome_version=genome_version, ) if not variant_models: diff --git a/seqr/management/commands/reload_saved_variant_json.py b/seqr/management/commands/reload_saved_variant_json.py index 2e83305eaa..eea208cf32 100644 --- a/seqr/management/commands/reload_saved_variant_json.py +++ b/seqr/management/commands/reload_saved_variant_json.py @@ -27,6 +27,6 @@ def handle(self, *args, **options): logging.info("Processing all %s projects" % len(projects)) family_ids = [family_guid] if family_guid else None - project_list = [(*project, family_ids) for project in projects.values_list('id', 'name')] + project_list = [(*project, family_ids) for project in projects.values_list('id', 'name', 'genome_version')] update_projects_saved_variant_json(project_list, user_email='manage_command') logger.info("Done") diff --git a/seqr/views/apis/saved_variant_api.py b/seqr/views/apis/saved_variant_api.py index 9220d0cbeb..18740ba3b9 100644 --- a/seqr/views/apis/saved_variant_api.py +++ b/seqr/views/apis/saved_variant_api.py @@ -303,7 +303,7 @@ def update_saved_variant_json(request, project_guid): project = get_project_and_check_permissions(project_guid, request.user, can_edit=True) reset_cached_search_results(project) try: - updated_saved_variant_guids = update_project_saved_variant_json(project.id, user=request.user) + updated_saved_variant_guids = update_project_saved_variant_json(project.id, project.genome_version, user=request.user) except Exception as e: logger.error('Unable to reset saved variant json for {}: {}'.format(project_guid, e)) updated_saved_variant_guids = [] diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index 5015c78f3a..859f73b9cc 100644 --- a/seqr/views/utils/variant_utils.py +++ b/seqr/views/utils/variant_utils.py @@ -37,10 +37,10 @@ def update_projects_saved_variant_json(projects, user_email, 
**kwargs): error = {} updated_variants_by_id = {} logger.info(f'Reloading saved variants in {len(projects)} projects') - for project_id, project_name, family_guids in tqdm(projects, unit=' project'): + for project_id, project_name, genome_version, family_guids in tqdm(projects, unit=' project'): try: updated_saved_variants = update_project_saved_variant_json( - project_id, user_email=user_email, family_guids=family_guids, **kwargs) + project_id, genome_version, user_email=user_email, family_guids=family_guids, **kwargs) if updated_saved_variants is None: skipped[project_name] = True else: @@ -66,24 +66,22 @@ def update_projects_saved_variant_json(projects, user_email, **kwargs): return updated_variants_by_id -def get_saved_variants(project_id=None, family_guids=None, dataset_type=None, genome_version=None): - saved_variants = SavedVariant.objects.all() +def get_saved_variants(genome_version, project_id=None, family_guids=None, dataset_type=None): + saved_variants = SavedVariant.objects.filter( + Q(saved_variant_json__genomeVersion__isnull=True) | + Q(saved_variant_json__genomeVersion=genome_version.replace('GRCh', '')) + ) if project_id: saved_variants = saved_variants.filter(family__project_id=project_id) if family_guids: saved_variants = saved_variants.filter(family__guid__in=family_guids) if dataset_type: saved_variants = saved_variants.filter(**saved_variants_dataset_type_filter(dataset_type)) - if genome_version: - db_genome_version = genome_version.replace('GRCh', '') - saved_variants = saved_variants.filter( - Q(saved_variant_json__genomeVersion__isnull=True) | Q(saved_variant_json__genomeVersion=db_genome_version) - ) return saved_variants -def update_project_saved_variant_json(project_id, family_guids=None, dataset_type=None, user=None, user_email=None): - saved_variants = get_saved_variants(project_id, family_guids, dataset_type).select_related('family') +def update_project_saved_variant_json(project_id, genome_version, family_guids=None, dataset_type=None, user=None, user_email=None): + saved_variants = get_saved_variants(genome_version, project_id, family_guids, dataset_type).select_related('family') if not saved_variants: return None From e5e3140954cacf9563e0f675facbf4eaa8186f60 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 31 Jul 2024 13:18:35 -0400 Subject: [PATCH 45/51] update unit tests --- .../check_for_new_samples_from_pipeline_tests.py | 7 ++++--- .../tests/reload_saved_variant_json_tests.py | 14 +++++++------- seqr/views/apis/saved_variant_api_tests.py | 6 +++--- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py index 193d7dee86..f9f5f83748 100644 --- a/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py +++ b/seqr/management/tests/check_for_new_samples_from_pipeline_tests.py @@ -193,9 +193,10 @@ def test_command(self, mock_email, mock_airtable_utils): # Update fixture data to allow testing edge cases Project.objects.filter(id__in=[1, 3]).update(genome_version=38) - sv = SavedVariant.objects.get(guid='SV0000002_1248367227_r0390_100') - sv.saved_variant_json['genomeVersion'] = '38' - sv.save() + svs = SavedVariant.objects.filter(guid__in=['SV0000002_1248367227_r0390_100', 'SV0000006_1248367227_r0003_tes']) + for sv in svs: + sv.saved_variant_json['genomeVersion'] = '38' + sv.save() with self.assertRaises(ValueError) as ce: call_command('check_for_new_samples_from_pipeline', 'GRCh38/SNV_INDEL', 
'auto__2023-08-08') diff --git a/seqr/management/tests/reload_saved_variant_json_tests.py b/seqr/management/tests/reload_saved_variant_json_tests.py index 04175a94fc..4ceb4314b6 100644 --- a/seqr/management/tests/reload_saved_variant_json_tests.py +++ b/seqr/management/tests/reload_saved_variant_json_tests.py @@ -27,12 +27,12 @@ def test_with_param_command(self, mock_get_variants, mock_logger): family_1 = Family.objects.get(id=1) mock_get_variants.assert_called_with( - [family_1], ['1-1562437-G-CA', '1-46859832-G-A','21-3343353-GAGA-G'], user=None, user_email='manage_command') + [family_1], ['1-46859832-G-A','21-3343353-GAGA-G'], user=None, user_email='manage_command') logger_info_calls = [ - mock.call('Updated 3 variants for project 1kg project n\xe5me with uni\xe7\xf8de'), + mock.call('Updated 2 variants for project 1kg project n\xe5me with uni\xe7\xf8de'), mock.call('Reload Summary: '), - mock.call(' 1kg project n\xe5me with uni\xe7\xf8de: Updated 3 variants') + mock.call(' 1kg project n\xe5me with uni\xe7\xf8de: Updated 2 variants') ] mock_logger.info.assert_has_calls(logger_info_calls) mock_get_variants.reset_mock() @@ -45,7 +45,7 @@ def test_with_param_command(self, mock_get_variants, mock_logger): family_2 = Family.objects.get(id=2) mock_get_variants.assert_has_calls([ mock.call( - [family_1, family_2], ['1-1562437-G-CA', '1-248367227-TC-T', '1-46859832-G-A', '21-3343353-GAGA-G'], user=None, user_email='manage_command', + [family_1, family_2], ['1-248367227-TC-T', '1-46859832-G-A', '21-3343353-GAGA-G'], user=None, user_email='manage_command', ), mock.call([Family.objects.get(id=12)], ['1-248367227-TC-T', 'prefix_19107_DEL'], user=None, user_email='manage_command'), mock.call([Family.objects.get(id=14)], ['1-248367227-TC-T'], user=None, user_email='manage_command') @@ -53,11 +53,11 @@ def test_with_param_command(self, mock_get_variants, mock_logger): logger_info_calls = [ mock.call('Reloading saved variants in 4 projects'), - mock.call('Updated 4 variants for project 1kg project n\xe5me with uni\xe7\xf8de'), + mock.call('Updated 3 variants for project 1kg project n\xe5me with uni\xe7\xf8de'), mock.call('Updated 2 variants for project Test Reprocessed Project'), mock.call('Updated 1 variants for project Non-Analyst Project'), mock.call('Reload Summary: '), - mock.call(' 1kg project n\xe5me with uni\xe7\xf8de: Updated 4 variants'), + mock.call(' 1kg project n\xe5me with uni\xe7\xf8de: Updated 3 variants'), mock.call(' Test Reprocessed Project: Updated 2 variants'), mock.call(' Non-Analyst Project: Updated 1 variants'), mock.call('Skipped the following 1 project with no saved variants: Empty Project'), @@ -72,7 +72,7 @@ def test_with_param_command(self, mock_get_variants, mock_logger): PROJECT_GUID, '--family-guid={}'.format(FAMILY_GUID)) - mock_get_variants.assert_called_with([family_1], ['1-1562437-G-CA', '1-46859832-G-A', '21-3343353-GAGA-G'], user=None, user_email='manage_command') + mock_get_variants.assert_called_with([family_1], ['1-46859832-G-A', '21-3343353-GAGA-G'], user=None, user_email='manage_command') logger_info_calls = [ mock.call('Reload Summary: '), diff --git a/seqr/views/apis/saved_variant_api_tests.py b/seqr/views/apis/saved_variant_api_tests.py index 7d13c1afb5..a2a09a9765 100644 --- a/seqr/views/apis/saved_variant_api_tests.py +++ b/seqr/views/apis/saved_variant_api_tests.py @@ -906,7 +906,7 @@ def test_update_compound_hets_variant_functional_data(self): self.assertEqual(response.status_code, 400) self.assertDictEqual(response.json(), {'error': 'Unable to find the 
following variant(s): not_variant'}) - @mock.patch('seqr.views.utils.variant_utils.MAX_VARIANTS_FETCH', 3) + @mock.patch('seqr.views.utils.variant_utils.MAX_VARIANTS_FETCH', 2) @mock.patch('seqr.utils.search.utils.es_backend_enabled') @mock.patch('seqr.views.apis.saved_variant_api.logger') @mock.patch('seqr.views.utils.variant_utils.get_variants_for_variant_ids') @@ -925,12 +925,12 @@ def test_update_saved_variant_json(self, mock_get_variants, mock_logger, mock_es self.assertDictEqual( response.json(), {'SV0000002_1248367227_r0390_100': None, 'SV0000001_2103343353_r0390_100': None, - 'SV0059957_11562437_f019313_1': None, 'SV0059956_11560662_f019313_1': None} + 'SV0059956_11560662_f019313_1': None} ) families = [Family.objects.get(guid='F000001_1'), Family.objects.get(guid='F000002_2')] mock_get_variants.assert_has_calls([ - mock.call(families, ['1-1562437-G-CA', '1-248367227-TC-T', '1-46859832-G-A'], user=self.manager_user, user_email=None), + mock.call(families, ['1-248367227-TC-T', '1-46859832-G-A'], user=self.manager_user, user_email=None), mock.call(families, ['21-3343353-GAGA-G'], user=self.manager_user, user_email=None), ]) mock_logger.error.assert_not_called() From d86935fe99dd9a48f6b0162e3ec610c0c23eafe1 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 1 Aug 2024 15:10:55 -0400 Subject: [PATCH 46/51] fix error for loading context when no variants present --- seqr/views/apis/saved_variant_api_tests.py | 30 +++++++++++++++------- seqr/views/utils/variant_utils.py | 3 +++ 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/seqr/views/apis/saved_variant_api_tests.py b/seqr/views/apis/saved_variant_api_tests.py index 7d13c1afb5..3ea5f54ece 100644 --- a/seqr/views/apis/saved_variant_api_tests.py +++ b/seqr/views/apis/saved_variant_api_tests.py @@ -25,10 +25,12 @@ COMPOUND_HET_2_GUID = 'SV0059957_11562437_f019313_1' GENE_GUID_2 = 'ENSG00000197530' +VARIANT_TAG_RESPONSE_KEYS = { + 'variantTagsByGuid', 'variantNotesByGuid', 'variantFunctionalDataByGuid', 'savedVariantsByGuid', +} SAVED_VARIANT_RESPONSE_KEYS = { - 'variantTagsByGuid', 'variantNotesByGuid', 'variantFunctionalDataByGuid', 'savedVariantsByGuid', 'familiesByGuid', + *VARIANT_TAG_RESPONSE_KEYS, 'familiesByGuid', 'omimIntervals', 'genesById', 'locusListsByGuid', 'rnaSeqData', 'mmeSubmissionsByGuid', 'transcriptsById', 'phenotypeGeneScores', - 'omimIntervals', } COMPOUND_HET_3_JSON = { @@ -235,6 +237,10 @@ def test_saved_variant_data(self): # get variants with no tags for whole project response = self.client.get('{}?includeNoteVariants=true'.format(url)) self.assertEqual(response.status_code, 200) + no_families_response_keys = {*SAVED_VARIANT_RESPONSE_KEYS} + no_families_response_keys.remove('familiesByGuid') + no_families_response_keys.remove('transcriptsById') + self.assertSetEqual(set(response.json().keys()), no_families_response_keys) variants = response.json()['savedVariantsByGuid'] self.assertSetEqual(set(variants.keys()), {COMPOUND_HET_1_GUID, COMPOUND_HET_2_GUID}) self.assertListEqual(variants[COMPOUND_HET_1_GUID]['tagGuids'], []) @@ -266,10 +272,7 @@ def test_saved_variant_data(self): response = self.client.get(url.replace(PROJECT_GUID, 'R0003_test')) self.assertEqual(response.status_code, 200) response_json = response.json() - response_keys = {*SAVED_VARIANT_RESPONSE_KEYS} - response_keys.remove('familiesByGuid') - response_keys.remove('transcriptsById') - self.assertSetEqual(set(response_json.keys()), response_keys) + self.assertSetEqual(set(response_json.keys()), no_families_response_keys) 
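+        # no_families_response_keys (built above) is SAVED_VARIANT_RESPONSE_KEYS minus
+        # 'familiesByGuid' and 'transcriptsById': the payload shape returned whenever
+        # family context is not loaded for the requested variants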
self.assertSetEqual( set(response_json['savedVariantsByGuid'].keys()), @@ -330,6 +333,17 @@ def test_saved_variant_data(self): self.assertListEqual(variants['SV0000002_1248367227_r0390_100']['familyGuids'], ['F000002_2']) self.assertEqual(set(response_json['familiesByGuid'].keys()), {'F000001_1', 'F000002_2', 'F000012_12'}) + # Test empty project + empty_project_url = url.replace(PROJECT_GUID, 'R0002_empty') + response = self.client.get(empty_project_url) + self.assertEqual(response.status_code, 200) + empty_response = {k: {} for k in VARIANT_TAG_RESPONSE_KEYS} + self.assertDictEqual(response.json(), empty_response) + + response = self.client.get(f'{empty_project_url}?loadProjectTagTypes=true&loadFamilyContext=true') + self.assertEqual(response.status_code, 200) + self.assertDictEqual(response.json(), empty_response) + def test_create_saved_variant(self): create_saved_variant_url = reverse(create_saved_variant_handler) self.check_collaborator_login(create_saved_variant_url, request_data={'familyGuid': 'F000001_1'}) @@ -410,9 +424,7 @@ def test_create_saved_sv_variant(self): self.assertEqual(response.status_code, 200) response_json = response.json() - self.assertSetEqual(set(response_json.keys()), { - 'variantTagsByGuid', 'variantNotesByGuid', 'variantFunctionalDataByGuid', 'savedVariantsByGuid', 'genesById', - }) + self.assertSetEqual(set(response_json.keys()), {*VARIANT_TAG_RESPONSE_KEYS, 'genesById'}) self.assertEqual(len(response_json['savedVariantsByGuid']), 1) variant_guid = next(iter(response_json['savedVariantsByGuid'])) diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index 17d1f7e36a..7f98e14653 100644 --- a/seqr/views/utils/variant_utils.py +++ b/seqr/views/utils/variant_utils.py @@ -374,6 +374,9 @@ def get_variants_response(request, saved_variants, response_variants=None, add_a if saved_variants is not None else {'savedVariantsByGuid': {}} variants = list(response['savedVariantsByGuid'].values()) if response_variants is None else response_variants + if not variants: + return response + genes, transcripts, family_genes = _saved_variant_genes_transcripts(variants) projects = Project.objects.filter(family__guid__in=family_genes.keys()).distinct() From b3ebaec088dd9ddffe26023a378f2749b7cf2c82 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 1 Aug 2024 15:25:09 -0400 Subject: [PATCH 47/51] fix unit tests --- seqr/views/apis/summary_data_api_tests.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/seqr/views/apis/summary_data_api_tests.py b/seqr/views/apis/summary_data_api_tests.py index 8d90812dbf..cc80bc86fb 100644 --- a/seqr/views/apis/summary_data_api_tests.py +++ b/seqr/views/apis/summary_data_api_tests.py @@ -26,10 +26,12 @@ u'dateGenerated': '2020-04-27' } +VARIANT_TAG_RESPONSE_KEYS = { + 'variantTagsByGuid', 'variantNotesByGuid', 'variantFunctionalDataByGuid', 'savedVariantsByGuid', +} SAVED_VARIANT_RESPONSE_KEYS = { - 'projectsByGuid', 'locusListsByGuid', 'savedVariantsByGuid', 'variantFunctionalDataByGuid', 'genesById', - 'variantNotesByGuid', 'individualsByGuid', 'variantTagsByGuid', 'familiesByGuid', 'familyNotesByGuid', - 'mmeSubmissionsByGuid', 'transcriptsById', + *VARIANT_TAG_RESPONSE_KEYS, 'projectsByGuid', 'locusListsByGuid', 'genesById', + 'individualsByGuid', 'familiesByGuid', 'familyNotesByGuid', 'mmeSubmissionsByGuid', 'transcriptsById', } EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW = { @@ -317,7 +319,7 @@ def test_saved_variants_page(self): response = 
self.client.get('{}?gene=ENSG00000135953'.format(url)) self.assertEqual(response.status_code, 200) - self.assertDictEqual(response.json(), {k: {} for k in SAVED_VARIANT_RESPONSE_KEYS if k != 'transcriptsById'}) + self.assertDictEqual(response.json(), {k: {} for k in VARIANT_TAG_RESPONSE_KEYS}) self.login_manager() response = self.client.get(url) From a18bf966e14b8352ee6ef50bb88132aae34c419b Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 1 Aug 2024 16:00:37 -0400 Subject: [PATCH 48/51] fix unit tests --- seqr/views/apis/saved_variant_api_tests.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/seqr/views/apis/saved_variant_api_tests.py b/seqr/views/apis/saved_variant_api_tests.py index 3ea5f54ece..e70c410aa7 100644 --- a/seqr/views/apis/saved_variant_api_tests.py +++ b/seqr/views/apis/saved_variant_api_tests.py @@ -1011,8 +1011,10 @@ def test_saved_variant_data(self, *args): super(AnvilSavedVariantAPITest, self).test_saved_variant_data(*args) self.mock_list_workspaces.assert_called_with(self.analyst_user) self.mock_get_ws_access_level.assert_called_with( + mock.ANY, 'my-seqr-billing', 'empty') + self.mock_get_ws_access_level.assert_any_call( mock.ANY, 'my-seqr-billing', 'anvil-1kg project n\u00e5me with uni\u00e7\u00f8de') - self.assertEqual(self.mock_get_ws_access_level.call_count, 15) + self.assertEqual(self.mock_get_ws_access_level.call_count, 17) self.mock_get_groups.assert_has_calls([mock.call(self.collaborator_user), mock.call(self.analyst_user)]) self.assertEqual(self.mock_get_groups.call_count, 11) self.mock_get_ws_acl.assert_not_called() From f5c9b02fafa33e2c7fb67067540b502ef96c56cb Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Fri, 2 Aug 2024 13:35:53 -0400 Subject: [PATCH 49/51] bump changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e0184a538f..e730f2820d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,8 @@ # _seqr_ Changes ## dev + +## 8/2/24 * Adds index_file_path to IGV Sample model (REQUIRES DB MIGRATION) ## 7/24/24 From 0d7dba43cfb0d76e55f655d06cb0305e876b8ab5 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Fri, 2 Aug 2024 13:49:08 -0400 Subject: [PATCH 50/51] fix flapping test --- seqr/views/apis/anvil_workspace_api_tests.py | 2 +- seqr/views/utils/pedigree_info_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/seqr/views/apis/anvil_workspace_api_tests.py b/seqr/views/apis/anvil_workspace_api_tests.py index d8bdf7e9e7..2265fb7507 100644 --- a/seqr/views/apis/anvil_workspace_api_tests.py +++ b/seqr/views/apis/anvil_workspace_api_tests.py @@ -813,7 +813,7 @@ def _assert_valid_operation(self, project, test_add_data=True): self.assertIn({ 'family__family_id': '1', 'individual_id': 'NA19675_1', 'mother__individual_id': None, 'father__individual_id': 'NA19678', 'sex': 'F', 'affected': 'A', 'notes': 'A affected individual, test1-zsf', - 'features': [{'id': 'HP:0012469'}, {'id': 'HP:0011675'}], + 'features': [{'id': 'HP:0011675'}, {'id': 'HP:0012469'}], }, individual_model_data) self.assertIn({ 'family__family_id': '1', 'individual_id': 'NA19678', 'mother__individual_id': None, diff --git a/seqr/views/utils/pedigree_info_utils.py b/seqr/views/utils/pedigree_info_utils.py index fffe0426c7..91b74f8566 100644 --- a/seqr/views/utils/pedigree_info_utils.py +++ b/seqr/views/utils/pedigree_info_utils.py @@ -151,7 +151,7 @@ def parse_hpo_terms(hpo_term_string): if not hpo_term_string: return [] terms = {hpo_term.strip() for hpo_term in re.sub(r'\(.*?\)', '', 
hpo_term_string).replace(',', ';').split(';')} - return[{'id': term} for term in terms if term] + return[{'id': term} for term in sorted(terms) if term] def _convert_fam_file_rows_to_json(column_map, rows, required_columns=None, update_features=False): From 640ee46ad86642b384ae4afeea9108f42cece5d4 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Fri, 2 Aug 2024 14:07:57 -0400 Subject: [PATCH 51/51] update to latest docker compose syntax --- deploy/LOCAL_DEVELOPMENT_INSTALL.md | 2 +- deploy/LOCAL_INSTALL.md | 36 ++++++++++++++--------------- test_local_deployment.sh | 20 ++++++++-------- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/deploy/LOCAL_DEVELOPMENT_INSTALL.md b/deploy/LOCAL_DEVELOPMENT_INSTALL.md index 919a0acc9b..02e34f8859 100644 --- a/deploy/LOCAL_DEVELOPMENT_INSTALL.md +++ b/deploy/LOCAL_DEVELOPMENT_INSTALL.md @@ -116,7 +116,7 @@ Before running seqr, make sure the following are currently running/ started: - If you want ES running but do not need production data/ are working with a standalone seqr instance, use docker-compose ```bash - docker-compose up elasticsearch + docker compose up elasticsearch ``` ### Run ui asset server diff --git a/deploy/LOCAL_INSTALL.md b/deploy/LOCAL_INSTALL.md index 6ed9162d40..d766931d45 100644 --- a/deploy/LOCAL_INSTALL.md +++ b/deploy/LOCAL_INSTALL.md @@ -31,10 +31,10 @@ SEQR_DIR=$(pwd) wget https://raw.githubusercontent.com/broadinstitute/seqr/master/docker-compose.yml -docker-compose up -d seqr # start up the seqr docker image in the background after also starting other components it depends on (postgres, redis, elasticsearch). This may take 10+ minutes. -docker-compose logs -f seqr # (optional) continuously print seqr logs to see when it is done starting up or if there are any errors. Type Ctrl-C to exit from the logs. +docker compose up -d seqr # start up the seqr docker image in the background after also starting other components it depends on (postgres, redis, elasticsearch). This may take 10+ minutes. +docker compose logs -f seqr # (optional) continuously print seqr logs to see when it is done starting up or if there are any errors. Type Ctrl-C to exit from the logs. -docker-compose exec seqr python manage.py createsuperuser # create a seqr Admin user +docker compose exec seqr python manage.py createsuperuser # create a seqr Admin user open http://localhost # open the seqr landing page in your browser. Log in to seqr using the email and password from the previous step ``` @@ -45,15 +45,15 @@ Updating your local installation of seqr involves pulling the latest version of ```bash # run this from the directory containing your docker-compose.yml file -docker-compose pull -docker-compose up -d seqr +docker compose pull +docker compose up -d seqr -docker-compose logs -f seqr # (optional) continuously print seqr logs to see when it is done starting up or if there are any errors. Type Ctrl-C to exit from the logs. +docker compose logs -f seqr # (optional) continuously print seqr logs to see when it is done starting up or if there are any errors. Type Ctrl-C to exit from the logs. ``` To update reference data in seqr, such as OMIM, HPO, etc., run the following ```bash -docker-compose exec seqr ./manage.py update_all_reference_data --use-cached-omim --skip-gencode +docker compose exec seqr ./manage.py update_all_reference_data --use-cached-omim --skip-gencode ``` ### Annotating and loading VCF callsets @@ -79,7 +79,7 @@ The steps below describe how to annotate a callset and then load it into your on 1. 
start a pipeline-runner container which has the necessary tools and environment for starting and submitting jobs to a Dataproc cluster. ```bash - docker-compose up -d pipeline-runner # start the pipeline-runner container + docker compose up -d pipeline-runner # start the pipeline-runner container ``` 1. if you haven't already, upload reference data to your own google bucket. @@ -88,7 +88,7 @@ This is expected to take a while ```bash BUILD_VERSION=38 # can be 37 or 38 - docker-compose exec pipeline-runner copy_reference_data_to_gs.sh $BUILD_VERSION $GS_BUCKET + docker compose exec pipeline-runner copy_reference_data_to_gs.sh $BUILD_VERSION $GS_BUCKET ``` Periodically, you may want to update the reference data in order to get the latest versions of these annotations. @@ -115,7 +115,7 @@ annotations, but you will need to re-load previously loaded projects to get the INPUT_FILE_PATH=/${GS_FILE_PATH}/${FILENAME} - docker-compose exec pipeline-runner load_data_dataproc.sh $BUILD_VERSION $SAMPLE_TYPE $INDEX_NAME $GS_BUCKET $INPUT_FILE_PATH + docker compose exec pipeline-runner load_data_dataproc.sh $BUILD_VERSION $SAMPLE_TYPE $INDEX_NAME $GS_BUCKET $INPUT_FILE_PATH ``` @@ -138,13 +138,13 @@ The steps below describe how to annotate a callset and then load it into your on 1. start a pipeline-runner container ```bash - docker-compose up -d pipeline-runner # start the pipeline-runner container + docker compose up -d pipeline-runner # start the pipeline-runner container ``` 1. authenticate into your google cloud account. This is required for hail to access buckets hosted on gcloud. ```bash - docker-compose exec pipeline-runner gcloud auth application-default login + docker compose exec pipeline-runner gcloud auth application-default login ``` 1. if you haven't already, download VEP and other reference data to the docker image's mounted directories. @@ -153,7 +153,7 @@ This is expected to take a while ```bash BUILD_VERSION=38 # can be 37 or 38 - docker-compose exec pipeline-runner download_reference_data.sh $BUILD_VERSION + docker compose exec pipeline-runner download_reference_data.sh $BUILD_VERSION ``` Periodically, you may want to update the reference data in order to get the latest versions of these annotations. 
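   A minimal sketch of scripting the periodic refresh shown in the hunk below, assuming the pipeline-runner service from docker-compose.yml is already up; the standalone script and the `refresh_ht` helper are illustrative, not files in the seqr repo:

   ```bash
   #!/usr/bin/env bash
   # Illustrative helper, not part of the seqr repo: drop a stale Hail table and
   # re-sync the latest copy from the public seqr-reference-data bucket.
   set -e

   BUILD_VERSION=38  # can be 37 or 38

   refresh_ht() {
     local src=$1 dest=$2
     docker compose exec pipeline-runner rm -rf "$dest"
     docker compose exec pipeline-runner gsutil rsync -r "$src" "$dest"
   }

   refresh_ht \
     "gs://seqr-reference-data/GRCh${BUILD_VERSION}/clinvar/clinvar.GRCh${BUILD_VERSION}.ht" \
     "/seqr-reference-data/GRCh${BUILD_VERSION}/clinvar.GRCh${BUILD_VERSION}.ht"
   ```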
@@ -163,12 +163,12 @@ annotations, but you will need to re-load previously loaded projects to get the
   BUILD_VERSION=38 # can be 37 or 38

   # Update clinvar
-   docker-compose exec pipeline-runner rm -rf "/seqr-reference-data/GRCh${BUILD_VERSION}/clinvar.GRCh${BUILD_VERSION}.ht"
-   docker-compose exec pipeline-runner gsutil rsync -r "gs://seqr-reference-data/GRCh${BUILD_VERSION}/clinvar/clinvar.GRCh${BUILD_VERSION}.ht" "/seqr-reference-data/GRCh${BUILD_VERSION}/clinvar.GRCh${BUILD_VERSION}.ht"
+   docker compose exec pipeline-runner rm -rf "/seqr-reference-data/GRCh${BUILD_VERSION}/clinvar.GRCh${BUILD_VERSION}.ht"
+   docker compose exec pipeline-runner gsutil rsync -r "gs://seqr-reference-data/GRCh${BUILD_VERSION}/clinvar/clinvar.GRCh${BUILD_VERSION}.ht" "/seqr-reference-data/GRCh${BUILD_VERSION}/clinvar.GRCh${BUILD_VERSION}.ht"

   # Update all other reference data
-   docker-compose exec pipeline-runner rm -rf "/seqr-reference-data/GRCh${BUILD_VERSION}/combined_reference_data_grch${BUILD_VERSION}.ht"
-   docker-compose exec pipeline-runner gsutil rsync -r "gs://seqr-reference-data/GRCh${BUILD_VERSION}/all_reference_data/combined_reference_data_grch${BUILD_VERSION}.ht" "/seqr-reference-data/GRCh${BUILD_VERSION}/combined_reference_data_grch${BUILD_VERSION}.ht"
+   docker compose exec pipeline-runner rm -rf "/seqr-reference-data/GRCh${BUILD_VERSION}/combined_reference_data_grch${BUILD_VERSION}.ht"
+   docker compose exec pipeline-runner gsutil rsync -r "gs://seqr-reference-data/GRCh${BUILD_VERSION}/all_reference_data/combined_reference_data_grch${BUILD_VERSION}.ht" "/seqr-reference-data/GRCh${BUILD_VERSION}/combined_reference_data_grch${BUILD_VERSION}.ht"
   ```

1. run the loading command in the pipeline-runner container. Adjust the arguments as needed

   ```bash
   BUILD_VERSION=38 # can be 37 or 38
   SAMPLE_TYPE=WES
   INDEX_NAME=my-dataset

   INPUT_FILE_PATH=${FILE_PATH}/${FILENAME}

-   docker-compose exec pipeline-runner load_data.sh $BUILD_VERSION $SAMPLE_TYPE $INDEX_NAME $INPUT_FILE_PATH
+   docker compose exec pipeline-runner load_data.sh $BUILD_VERSION $SAMPLE_TYPE $INDEX_NAME $INPUT_FILE_PATH
   ```

diff --git a/test_local_deployment.sh b/test_local_deployment.sh
index e6d38d908f..b964a2e235 100755
--- a/test_local_deployment.sh
+++ b/test_local_deployment.sh
@@ -3,15 +3,15 @@
 set -ex

 # Due to travis filesystem issues, need to explicitly grant permissions for the volume mount from the container
-# This is not required to use docker-compose locally, only for testing
-docker-compose up -d elasticsearch
-docker-compose exec -T elasticsearch chmod 777 ./data
+# This is not required to use docker compose locally, only for testing
+docker compose up -d elasticsearch
+docker compose exec -T elasticsearch chmod 777 ./data

-docker-compose up -d seqr
-docker-compose logs postgres
-docker-compose logs elasticsearch
-docker-compose logs redis
-docker-compose exec -T seqr curl elasticsearch:9200
+docker compose up -d seqr
+docker compose logs postgres
+docker compose logs elasticsearch
+docker compose logs redis
+docker compose exec -T seqr curl elasticsearch:9200
 sleep 30
-docker-compose logs seqr
-echo -ne 'testpassword\n' docker-compose exec -T seqr python manage.py createsuperuser --username test --email test@test.com
+docker compose logs seqr
+echo -ne 'testpassword\n' | docker compose exec -T seqr python manage.py createsuperuser --username test --email test@test.com