Merge pull request #4272 from broadinstitute/dev
Dev
hanars authored Aug 2, 2024
2 parents 9bfe662 + 9ef435d commit ed7a173
Showing 47 changed files with 642 additions and 284 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,9 @@

## dev

## 8/2/24
* Adds index_file_path to IGV Sample model (REQUIRES DB MIGRATION)

## 7/24/24
* Split RNA Sample models (REQUIRES DB MIGRATION)

2 changes: 1 addition & 1 deletion deploy/LOCAL_DEVELOPMENT_INSTALL.md
@@ -116,7 +116,7 @@ Before running seqr, make sure the following are currently running/ started:
- If you want ES running but do not need production data/ are working with a standalone seqr instance,
use docker-compose
```bash
docker-compose up elasticsearch
docker compose up elasticsearch
```

### Run ui asset server
36 changes: 18 additions & 18 deletions deploy/LOCAL_INSTALL.md
@@ -31,10 +31,10 @@ SEQR_DIR=$(pwd)
wget https://raw.githubusercontent.com/broadinstitute/seqr/master/docker-compose.yml
docker-compose up -d seqr # start up the seqr docker image in the background after also starting other components it depends on (postgres, redis, elasticsearch). This may take 10+ minutes.
docker-compose logs -f seqr # (optional) continuously print seqr logs to see when it is done starting up or if there are any errors. Type Ctrl-C to exit from the logs.
docker compose up -d seqr # start up the seqr docker image in the background after also starting other components it depends on (postgres, redis, elasticsearch). This may take 10+ minutes.
docker compose logs -f seqr # (optional) continuously print seqr logs to see when it is done starting up or if there are any errors. Type Ctrl-C to exit from the logs.
docker-compose exec seqr python manage.py createsuperuser # create a seqr Admin user
docker compose exec seqr python manage.py createsuperuser # create a seqr Admin user
open http://localhost # open the seqr landing page in your browser. Log in to seqr using the email and password from the previous step
```
@@ -45,15 +45,15 @@ Updating your local installation of seqr involves pulling the latest version of

```bash
# run this from the directory containing your docker-compose.yml file
docker-compose pull
docker-compose up -d seqr
docker compose pull
docker compose up -d seqr
docker-compose logs -f seqr # (optional) continuously print seqr logs to see when it is done starting up or if there are any errors. Type Ctrl-C to exit from the logs.
docker compose logs -f seqr # (optional) continuously print seqr logs to see when it is done starting up or if there are any errors. Type Ctrl-C to exit from the logs.
```

To update reference data in seqr, such as OMIM, HPO, etc., run the following
```bash
docker-compose exec seqr ./manage.py update_all_reference_data --use-cached-omim --skip-gencode
docker compose exec seqr ./manage.py update_all_reference_data --use-cached-omim --skip-gencode
```

### Annotating and loading VCF callsets
@@ -79,7 +79,7 @@ The steps below describe how to annotate a callset and then load it into your on

1. start a pipeline-runner container which has the necessary tools and environment for starting and submitting jobs to a Dataproc cluster.
```bash
docker-compose up -d pipeline-runner # start the pipeline-runner container
docker compose up -d pipeline-runner # start the pipeline-runner container
```

1. if you haven't already, upload reference data to your own google bucket.
@@ -88,7 +88,7 @@ This is expected to take a while
```bash
BUILD_VERSION=38 # can be 37 or 38
docker-compose exec pipeline-runner copy_reference_data_to_gs.sh $BUILD_VERSION $GS_BUCKET
docker compose exec pipeline-runner copy_reference_data_to_gs.sh $BUILD_VERSION $GS_BUCKET
```
Periodically, you may want to update the reference data in order to get the latest versions of these annotations.
@@ -115,7 +115,7 @@ annotations, but you will need to re-load previously loaded projects to get the
INPUT_FILE_PATH=/${GS_FILE_PATH}/${FILENAME}
docker-compose exec pipeline-runner load_data_dataproc.sh $BUILD_VERSION $SAMPLE_TYPE $INDEX_NAME $GS_BUCKET $INPUT_FILE_PATH
docker compose exec pipeline-runner load_data_dataproc.sh $BUILD_VERSION $SAMPLE_TYPE $INDEX_NAME $GS_BUCKET $INPUT_FILE_PATH
```
@@ -138,13 +138,13 @@ The steps below describe how to annotate a callset and then load it into your on
1. start a pipeline-runner container
```bash
docker-compose up -d pipeline-runner # start the pipeline-runner container
docker compose up -d pipeline-runner # start the pipeline-runner container
```
1. authenticate into your google cloud account.
This is required for hail to access buckets hosted on gcloud.
```bash
docker-compose exec pipeline-runner gcloud auth application-default login
docker compose exec pipeline-runner gcloud auth application-default login
```
1. if you haven't already, download VEP and other reference data to the docker image's mounted directories.
@@ -153,7 +153,7 @@ This is expected to take a while
```bash
BUILD_VERSION=38 # can be 37 or 38
docker-compose exec pipeline-runner download_reference_data.sh $BUILD_VERSION
docker compose exec pipeline-runner download_reference_data.sh $BUILD_VERSION
```
Periodically, you may want to update the reference data in order to get the latest versions of these annotations.
@@ -163,12 +163,12 @@ annotations, but you will need to re-load previously loaded projects to get the
BUILD_VERSION=38 # can be 37 or 38
# Update clinvar
docker-compose exec pipeline-runner rm -rf "/seqr-reference-data/GRCh${BUILD_VERSION}/clinvar.GRCh${BUILD_VERSION}.ht"
docker-compose exec pipeline-runner gsutil rsync -r "gs://seqr-reference-data/GRCh${BUILD_VERSION}/clinvar/clinvar.GRCh${BUILD_VERSION}.ht" "/seqr-reference-data/GRCh${BUILD_VERSION}/clinvar.GRCh${BUILD_VERSION}.ht"
docker compose exec pipeline-runner rm -rf "/seqr-reference-data/GRCh${BUILD_VERSION}/clinvar.GRCh${BUILD_VERSION}.ht"
docker compose exec pipeline-runner gsutil rsync -r "gs://seqr-reference-data/GRCh${BUILD_VERSION}/clinvar/clinvar.GRCh${BUILD_VERSION}.ht" "/seqr-reference-data/GRCh${BUILD_VERSION}/clinvar.GRCh${BUILD_VERSION}.ht"
# Update all other reference data
docker-compose exec pipeline-runner rm -rf "/seqr-reference-data/GRCh${BUILD_VERSION}/combined_reference_data_grch${BUILD_VERSION}.ht"
docker-compose exec pipeline-runner gsutil rsync -r "gs://seqr-reference-data/GRCh${BUILD_VERSION}/all_reference_data/combined_reference_data_grch${BUILD_VERSION}.ht" "/seqr-reference-data/GRCh${BUILD_VERSION}/combined_reference_data_grch${BUILD_VERSION}.ht"
docker compose exec pipeline-runner rm -rf "/seqr-reference-data/GRCh${BUILD_VERSION}/combined_reference_data_grch${BUILD_VERSION}.ht"
docker compose exec pipeline-runner gsutil rsync -r "gs://seqr-reference-data/GRCh${BUILD_VERSION}/all_reference_data/combined_reference_data_grch${BUILD_VERSION}.ht" "/seqr-reference-data/GRCh${BUILD_VERSION}/combined_reference_data_grch${BUILD_VERSION}.ht"
```
1. run the loading command in the pipeline-runner container. Adjust the arguments as needed
@@ -179,7 +179,7 @@ annotations, but you will need to re-load previously loaded projects to get the
INPUT_FILE_PATH=${FILE_PATH}/${FILENAME}
docker-compose exec pipeline-runner load_data.sh $BUILD_VERSION $SAMPLE_TYPE $INDEX_NAME $INPUT_FILE_PATH
docker compose exec pipeline-runner load_data.sh $BUILD_VERSION $SAMPLE_TYPE $INDEX_NAME $INPUT_FILE_PATH
```
16 changes: 16 additions & 0 deletions seqr/fixtures/social_auth.json
@@ -95,5 +95,21 @@
"created": "2020-03-12T23:09:54.180Z",
"modified": "2020-03-12T23:09:54.180Z"
}
}, {
"model": "social_django.usersocialauth",
"pk": 7,
"fields": {
"user": 16,
"provider": "google-oauth2",
"uid": "[email protected]",
"extra_data": {
"expires": 3599,
"auth_time": 1603287741,
"token_type": "Bearer",
"access_token": "ya29.EXAMPLE"
},
"created": "2020-03-12T23:09:54.180Z",
"modified": "2020-03-12T23:09:54.180Z"
}
}
]
12 changes: 6 additions & 6 deletions seqr/management/commands/check_for_new_samples_from_pipeline.py
@@ -15,7 +15,7 @@
from seqr.utils.search.hail_search_utils import hail_variant_multi_lookup, search_data_type
from seqr.views.utils.dataset_utils import match_and_update_search_samples
from seqr.views.utils.variant_utils import reset_cached_search_results, update_projects_saved_variant_json, \
saved_variants_dataset_type_filter
get_saved_variants
from settings import SEQR_SLACK_LOADING_NOTIFICATION_CHANNEL

logger = logging.getLogger(__name__)
@@ -108,7 +108,7 @@ def handle(self, *args, **options):
)
project_families = project_sample_data['family_guids']
updated_families.update(project_families)
updated_project_families.append((project.id, project.name, project_families))
updated_project_families.append((project.id, project.name, project.genome_version, project_families))

# Send failure notifications
failed_family_samples = metadata.get('failed_family_samples', {})
@@ -153,10 +153,10 @@ def _reload_shared_variant_annotations(data_type, genome_version, updated_varian
if is_sv:
updated_annotation_samples = updated_annotation_samples.filter(sample_type=data_type.split('_')[1])

variant_models = SavedVariant.objects.filter(
family_id__in=updated_annotation_samples.values_list('individual__family', flat=True).distinct(),
**saved_variants_dataset_type_filter(dataset_type),
).filter(Q(saved_variant_json__genomeVersion__isnull=True) | Q(saved_variant_json__genomeVersion=db_genome_version))
variant_models = get_saved_variants(
genome_version, dataset_type=dataset_type,
family_guids=updated_annotation_samples.values_list('individual__family__guid', flat=True).distinct(),
)

if not variant_models:
logger.info('No additional saved variants to update')
2 changes: 1 addition & 1 deletion seqr/management/commands/reload_saved_variant_json.py
@@ -27,6 +27,6 @@ def handle(self, *args, **options):
logging.info("Processing all %s projects" % len(projects))

family_ids = [family_guid] if family_guid else None
project_list = [(*project, family_ids) for project in projects.values_list('id', 'name')]
project_list = [(*project, family_ids) for project in projects.values_list('id', 'name', 'genome_version')]
update_projects_saved_variant_json(project_list, user_email='manage_command')
logger.info("Done")
seqr/management/tests/check_for_new_samples_from_pipeline_tests.py
@@ -193,9 +193,10 @@ def test_command(self, mock_email, mock_airtable_utils):

# Update fixture data to allow testing edge cases
Project.objects.filter(id__in=[1, 3]).update(genome_version=38)
sv = SavedVariant.objects.get(guid='SV0000002_1248367227_r0390_100')
sv.saved_variant_json['genomeVersion'] = '38'
sv.save()
svs = SavedVariant.objects.filter(guid__in=['SV0000002_1248367227_r0390_100', 'SV0000006_1248367227_r0003_tes'])
for sv in svs:
sv.saved_variant_json['genomeVersion'] = '38'
sv.save()

with self.assertRaises(ValueError) as ce:
call_command('check_for_new_samples_from_pipeline', 'GRCh38/SNV_INDEL', 'auto__2023-08-08')
14 changes: 7 additions & 7 deletions seqr/management/tests/reload_saved_variant_json_tests.py
@@ -27,12 +27,12 @@ def test_with_param_command(self, mock_get_variants, mock_logger):

family_1 = Family.objects.get(id=1)
mock_get_variants.assert_called_with(
[family_1], ['1-1562437-G-CA', '1-46859832-G-A','21-3343353-GAGA-G'], user=None, user_email='manage_command')
[family_1], ['1-46859832-G-A','21-3343353-GAGA-G'], user=None, user_email='manage_command')

logger_info_calls = [
mock.call('Updated 3 variants for project 1kg project n\xe5me with uni\xe7\xf8de'),
mock.call('Updated 2 variants for project 1kg project n\xe5me with uni\xe7\xf8de'),
mock.call('Reload Summary: '),
mock.call(' 1kg project n\xe5me with uni\xe7\xf8de: Updated 3 variants')
mock.call(' 1kg project n\xe5me with uni\xe7\xf8de: Updated 2 variants')
]
mock_logger.info.assert_has_calls(logger_info_calls)
mock_get_variants.reset_mock()
@@ -45,19 +45,19 @@ def test_with_param_command(self, mock_get_variants, mock_logger):
family_2 = Family.objects.get(id=2)
mock_get_variants.assert_has_calls([
mock.call(
[family_1, family_2], ['1-1562437-G-CA', '1-248367227-TC-T', '1-46859832-G-A', '21-3343353-GAGA-G'], user=None, user_email='manage_command',
[family_1, family_2], ['1-248367227-TC-T', '1-46859832-G-A', '21-3343353-GAGA-G'], user=None, user_email='manage_command',
),
mock.call([Family.objects.get(id=12)], ['1-248367227-TC-T', 'prefix_19107_DEL'], user=None, user_email='manage_command'),
mock.call([Family.objects.get(id=14)], ['1-248367227-TC-T'], user=None, user_email='manage_command')
], any_order=True)

logger_info_calls = [
mock.call('Reloading saved variants in 4 projects'),
mock.call('Updated 4 variants for project 1kg project n\xe5me with uni\xe7\xf8de'),
mock.call('Updated 3 variants for project 1kg project n\xe5me with uni\xe7\xf8de'),
mock.call('Updated 2 variants for project Test Reprocessed Project'),
mock.call('Updated 1 variants for project Non-Analyst Project'),
mock.call('Reload Summary: '),
mock.call(' 1kg project n\xe5me with uni\xe7\xf8de: Updated 4 variants'),
mock.call(' 1kg project n\xe5me with uni\xe7\xf8de: Updated 3 variants'),
mock.call(' Test Reprocessed Project: Updated 2 variants'),
mock.call(' Non-Analyst Project: Updated 1 variants'),
mock.call('Skipped the following 1 project with no saved variants: Empty Project'),
@@ -72,7 +72,7 @@ def test_with_param_command(self, mock_get_variants, mock_logger):
PROJECT_GUID,
'--family-guid={}'.format(FAMILY_GUID))

mock_get_variants.assert_called_with([family_1], ['1-1562437-G-CA', '1-46859832-G-A', '21-3343353-GAGA-G'], user=None, user_email='manage_command')
mock_get_variants.assert_called_with([family_1], ['1-46859832-G-A', '21-3343353-GAGA-G'], user=None, user_email='manage_command')

logger_info_calls = [
mock.call('Reload Summary: '),
18 changes: 18 additions & 0 deletions seqr/migrations/0071_igvsample_index_file_path.py
@@ -0,0 +1,18 @@
# Generated by Django 4.2.13 on 2024-07-24 14:34

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
('seqr', '0070_remove_rnasample_dataset_type_and_more'),
]

operations = [
migrations.AddField(
model_name='igvsample',
name='index_file_path',
field=models.TextField(blank=True, null=True),
),
]
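
The changelog above flags this change as requiring a DB migration. On a local install, the new migration would presumably be applied with the same `docker compose exec` / `manage.py` pattern used elsewhere in LOCAL_INSTALL.md; a minimal sketch, assuming the `seqr` service from docker-compose.yml is already running:

```bash
# Apply pending Django migrations, including 0071_igvsample_index_file_path
docker compose exec seqr python manage.py migrate
```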
11 changes: 6 additions & 5 deletions seqr/models.py
@@ -124,13 +124,13 @@ def delete_model(self, user, user_can_delete=False):
log_model_update(logger, self, user, 'delete')

@classmethod
def bulk_create(cls, user, new_models):
def bulk_create(cls, user, new_models, **kwargs):
"""Helper bulk create method that logs the creation"""
for model in new_models:
model.created_by = user
model.created_date = timezone.now()
model.guid = model._format_guid(random.randint(10**(cls.GUID_PRECISION-1), 10**cls.GUID_PRECISION)) # nosec
models = cls.objects.bulk_create(new_models)
models = cls.objects.bulk_create(new_models, **kwargs)
log_model_bulk_update(logger, models, user, 'create')
return models

@@ -785,6 +785,7 @@ class IgvSample(ModelWithGUID):
individual = models.ForeignKey('Individual', on_delete=models.PROTECT)
sample_type = models.CharField(max_length=15, choices=SAMPLE_TYPE_CHOICES)
file_path = models.TextField()
index_file_path = models.TextField(null=True, blank=True)
sample_id = models.TextField(null=True)

def __unicode__(self):
@@ -796,7 +797,7 @@ def __unicode__(self):
class Meta:
unique_together = ('individual', 'sample_type')

json_fields = ['guid', 'file_path', 'sample_type', 'sample_id']
json_fields = ['guid', 'file_path', 'index_file_path', 'sample_type', 'sample_id']


class SavedVariant(ModelWithGUID):
@@ -1139,11 +1140,11 @@ def log_model_no_guid_bulk_update(cls, models, user, update_type):
logger.info(f'{update_type} {db_entity}s', user, db_update=db_update)

@classmethod
def bulk_create(cls, user, new_models):
def bulk_create(cls, user, new_models, **kwargs):
"""Helper bulk create method that logs the creation"""
for model in new_models:
model.created_by = user
models = cls.objects.bulk_create(new_models)
models = cls.objects.bulk_create(new_models, **kwargs)
cls.log_model_no_guid_bulk_update(models, user, 'create')
return models
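
For context, the `**kwargs` added to both `bulk_create` helpers are passed straight through to Django's `QuerySet.bulk_create`, so callers can now use its standard options (e.g. `batch_size`) while still getting the GUID assignment and bulk-create logging. A minimal sketch of a hypothetical caller; the individual, user, and file paths are made up for illustration:

```python
# Hypothetical usage of the updated helper: batch the inserts while the helper
# still stamps created_by/created_date/guid and logs the bulk creation.
new_samples = [
    IgvSample(
        individual=individual,  # an existing Individual instance
        sample_type='alignment',  # assumed to be one of SAMPLE_TYPE_CHOICES
        file_path='gs://my-bucket/sample_1.cram',
        index_file_path='gs://my-bucket/sample_1.cram.crai',  # new field from this PR
    ),
]
IgvSample.bulk_create(user, new_samples, batch_size=500)
```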

26 changes: 21 additions & 5 deletions seqr/views/apis/anvil_workspace_api.py
@@ -13,7 +13,7 @@
from django.shortcuts import redirect

from reference_data.models import GENOME_VERSION_LOOKUP
from seqr.models import Project, CAN_EDIT, Sample
from seqr.models import Project, CAN_EDIT, Sample, Individual
from seqr.views.react_app import render_app_html
from seqr.views.utils.airtable_utils import AirtableSession, ANVIL_REQUEST_TRACKING_TABLE
from seqr.utils.search.constants import VCF_FILE_EXTENSIONS
@@ -243,17 +243,32 @@ def _parse_uploaded_pedigree(request_json, project=None):
# Parse families/individuals in the uploaded pedigree file
json_records = load_uploaded_file(request_json['uploadedFileId'])
pedigree_records, _ = parse_basic_pedigree_table(
project, json_records, 'uploaded pedigree file', required_columns=[
project, json_records, 'uploaded pedigree file', update_features=True, required_columns=[
JsonConstants.SEX_COLUMN, JsonConstants.AFFECTED_COLUMN,
])

missing_samples = [record['individualId'] for record in pedigree_records
if record['individualId'] not in request_json['vcfSamples']]

errors = []
if missing_samples:
error = 'The following samples are included in the pedigree file but are missing from the VCF: {}'.format(
', '.join(missing_samples))
raise ErrorsWarningsException([error], [])
errors.append('The following samples are included in the pedigree file but are missing from the VCF: {}'.format(
', '.join(missing_samples)))

records_by_family = defaultdict(list)
for record in pedigree_records:
records_by_family[record[JsonConstants.FAMILY_ID_COLUMN]].append(record)

no_affected_families = [
family_id for family_id, records in records_by_family.items()
if not any(record[JsonConstants.AFFECTED_COLUMN] == Individual.AFFECTED_STATUS_AFFECTED for record in records)
]

if no_affected_families:
errors.append('The following families do not have any affected individuals: {}'.format(', '.join(no_affected_families)))

if errors:
raise ErrorsWarningsException(errors, [])

return pedigree_records

@@ -262,6 +277,7 @@ def _trigger_add_workspace_data(project, pedigree_records, user, data_path, samp
# add families and individuals according to the uploaded individual records
pedigree_json, sample_ids = add_or_update_individuals_and_families(
project, individual_records=pedigree_records, user=user, get_update_json=get_pedigree_json, get_updated_individual_ids=True,
allow_features_update=True,
)
num_updated_individuals = len(sample_ids)
sample_ids.update(previous_loaded_ids or [])