Merge pull request #3463 from broadinstitute/bulk-load-igv

Bulk load igv

hanars authored Jul 11, 2023
2 parents fe6e943 + d1441ab commit 55da63e
Showing 7 changed files with 229 additions and 52 deletions.
3 changes: 2 additions & 1 deletion seqr/urls.py
@@ -134,7 +134,7 @@
 from seqr.views.apis.awesomebar_api import awesomebar_autocomplete_handler
 from seqr.views.apis.auth_api import login_required_error, login_view, logout_view, policies_required_error
 from seqr.views.apis.igv_api import fetch_igv_track, receive_igv_table_handler, update_individual_igv_sample, \
-    igv_genomes_proxy
+    igv_genomes_proxy, receive_bulk_igv_table_handler
 from seqr.views.apis.analysis_group_api import update_analysis_group_handler, delete_analysis_group_handler
 from seqr.views.apis.project_api import create_project_handler, update_project_handler, delete_project_handler, \
     project_page_data, project_families, project_overview, project_mme_submisssions, project_individuals, \
@@ -324,6 +324,7 @@
     'data_management/validate_callset': validate_callset,
     'data_management/loaded_projects/(?P<sample_type>[^/]+)/(?P<dataset_type>[^/]+)': get_loaded_projects,
     'data_management/load_data': load_data,
+    'data_management/add_igv': receive_bulk_igv_table_handler,
 
     'summary_data/saved_variants/(?P<tag>[^/]+)': saved_variants_page,
     'summary_data/hpo/(?P<hpo_id>[^/]+)': hpo_summary_data,
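Note: the new data_management/add_igv endpoint reads a JSON body rather than a multipart form — the handler looks up mappingFile.uploadedFileId in request.body (see igv_api.py below). A minimal client-side sketch, assuming seqr's usual /api URL prefix and an already-authenticated requests session (both assumptions, not shown in this diff):

    import json
    import requests

    def add_igv_bulk(session, base_url, uploaded_file_id):
        # POSTs the id of a previously uploaded mapping file to the bulk endpoint.
        # base_url and the '/api' prefix are assumptions; the session must already
        # be authenticated as a PM or data manager user.
        response = session.post(
            f'{base_url}/api/data_management/add_igv',
            data=json.dumps({'mappingFile': {'uploadedFileId': uploaded_file_id}}),
            headers={'Content-Type': 'application/json'},
        )
        response.raise_for_status()
        # A successful response contains 'updates', 'uploadedFileId', 'errors',
        # 'warnings', and 'info' (see the response dict in igv_api.py below)
        return response.json()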
119 changes: 82 additions & 37 deletions seqr/views/apis/igv_api.py
@@ -8,12 +8,12 @@
 from seqr.models import Individual, IgvSample
 from seqr.utils.file_utils import file_iter, does_file_exist, is_google_bucket_file_path, run_command, get_google_project
 from seqr.utils.redis_utils import safe_redis_get_json, safe_redis_set_json
-from seqr.views.utils.file_utils import save_uploaded_file
+from seqr.views.utils.file_utils import save_uploaded_file, load_uploaded_file
 from seqr.views.utils.json_to_orm_utils import get_or_create_model_from_json
 from seqr.views.utils.json_utils import create_json_response
 from seqr.views.utils.orm_to_json_utils import get_json_for_sample
 from seqr.views.utils.permissions_utils import get_project_and_check_permissions, check_project_permissions, \
-    login_and_policies_required, pm_or_data_manager_required
+    login_and_policies_required, pm_or_data_manager_required, get_project_guids_user_can_view
 
 GS_STORAGE_ACCESS_CACHE_KEY = 'gs_storage_access_cache_entry'
 GS_STORAGE_URL = 'https://storage.googleapis.com'
@@ -22,51 +22,49 @@
     'gs': GS_STORAGE_URL,
 }
 
-@pm_or_data_manager_required
-def receive_igv_table_handler(request, project_guid):
-    project = get_project_and_check_permissions(project_guid, request.user, can_edit=True)
-    info = []
-
-    def _process_alignment_records(rows, **kwargs):
-        invalid_row = next((row for row in rows if not 2 <= len(row) <= 3), None)
-        if invalid_row:
-            raise ValueError("Must contain 2 or 3 columns: " + ', '.join(invalid_row))
-        parsed_records = defaultdict(list)
-        for row in rows:
-            parsed_records[row[0]].append({'filePath': row[1], 'sampleId': row[2] if len(row) > 2 else None})
-        return parsed_records
+def _process_alignment_records(rows, num_id_cols=1, **kwargs):
+    num_cols = num_id_cols + 1
+    invalid_row = next((row for row in rows if not num_cols <= len(row) <= num_cols+1), None)
+    if invalid_row:
+        raise ValueError(f"Must contain {num_cols} or {num_cols+1} columns: {', '.join(invalid_row)}")
+    parsed_records = defaultdict(list)
+    for row in rows:
+        row_id = row[0] if num_id_cols == 1 else tuple(row[:num_id_cols])
+        parsed_records[row_id].append({'filePath': row[num_id_cols], 'sampleId': row[num_cols] if len(row) > num_cols else None})
+    return parsed_records
 
-    try:
-        uploaded_file_id, filename, individual_dataset_mapping = save_uploaded_file(request, process_records=_process_alignment_records)
-
-        matched_individuals = Individual.objects.filter(family__project=project, individual_id__in=individual_dataset_mapping.keys())
-        unmatched_individuals = set(individual_dataset_mapping.keys()) - {i.individual_id for i in matched_individuals}
-        if len(unmatched_individuals) > 0:
-            raise Exception('The following Individual IDs do not exist: {}'.format(", ".join(unmatched_individuals)))
+def _process_igv_table_handler(parse_uploaded_file, get_valid_matched_individuals):
+    info = []
 
-        info.append('Parsed {} rows in {} individuals from {}'.format(
-            sum([len(rows) for rows in individual_dataset_mapping.values()]), len(individual_dataset_mapping), filename))
+    try:
+        uploaded_file_id, filename, individual_dataset_mapping = parse_uploaded_file()
 
-        existing_sample_files = defaultdict(set)
-        for sample in IgvSample.objects.select_related('individual').filter(individual__in=matched_individuals):
-            existing_sample_files[sample.individual.individual_id].add(sample.file_path)
+        matched_individuals = get_valid_matched_individuals(individual_dataset_mapping)
 
-        unchanged_rows = set()
-        for individual_id, updates in individual_dataset_mapping.items():
-            unchanged_rows.update([
-                (individual_id, update['filePath']) for update in updates
-                if update['filePath'] in existing_sample_files[individual_id]
-            ])
+        message = f'Parsed {sum([len(rows) for rows in individual_dataset_mapping.values()])} rows in {len(matched_individuals)} individuals'
+        if filename:
+            message += f' from {filename}'
+        info.append(message)
 
-        if unchanged_rows:
-            info.append('No change detected for {} rows'.format(len(unchanged_rows)))
+        existing_sample_files = defaultdict(set)
+        for sample in IgvSample.objects.select_related('individual').filter(individual__in=matched_individuals.keys()):
+            existing_sample_files[sample.individual].add(sample.file_path)
 
+        num_unchanged_rows = 0
         all_updates = []
-        for i in matched_individuals:
-            all_updates += [
-                dict(individualGuid=i.guid, individualId=i.individual_id, **update) for update in individual_dataset_mapping[i.individual_id]
-                if (i.individual_id, update['filePath']) not in unchanged_rows
-            ]
+        for individual, updates in matched_individuals.items():
+            changed_updates = [
+                dict(individualGuid=individual.guid, individualId=individual.individual_id, **update)
+                for update in updates
+                if update['filePath'] not in existing_sample_files[individual]
+            ]
+            all_updates += changed_updates
+            num_unchanged_rows += len(updates) - len(changed_updates)
+
+        if num_unchanged_rows:
+            info.append('No change detected for {} rows'.format(num_unchanged_rows))
 
     except Exception as e:
         return create_json_response({'errors': [str(e)]}, status=400)
@@ -75,11 +73,58 @@ def _process_alignment_records(rows, **kwargs):
         'updates': all_updates,
         'uploadedFileId': uploaded_file_id,
         'errors': [],
+        'warnings': [],
         'info': info,
     }
     return create_json_response(response)
 
 
+@pm_or_data_manager_required
+def receive_igv_table_handler(request, project_guid):
+    project = get_project_and_check_permissions(project_guid, request.user, can_edit=True)
+
+    def _get_valid_matched_individuals(individual_dataset_mapping):
+        matched_individuals = Individual.objects.filter(
+            family__project=project, individual_id__in=individual_dataset_mapping.keys()
+        )
+        unmatched_individuals = set(individual_dataset_mapping.keys()) - {i.individual_id for i in matched_individuals}
+        if len(unmatched_individuals) > 0:
+            raise Exception('The following Individual IDs do not exist: {}'.format(", ".join(unmatched_individuals)))
+
+        return {i: individual_dataset_mapping[i.individual_id] for i in matched_individuals}
+
+    return _process_igv_table_handler(
+        lambda: save_uploaded_file(request, process_records=_process_alignment_records),
+        _get_valid_matched_individuals,
+    )
+
+
+@pm_or_data_manager_required
+def receive_bulk_igv_table_handler(request):
+    def _parse_uploaded_file():
+        uploaded_file_id = json.loads(request.body).get('mappingFile', {}).get('uploadedFileId')
+        if not uploaded_file_id:
+            raise ValueError('No file uploaded')
+        records = _process_alignment_records(load_uploaded_file(uploaded_file_id), num_id_cols=2)
+        return uploaded_file_id, None, records
+
+    def _get_valid_matched_individuals(individual_dataset_mapping):
+        individuals = Individual.objects.filter(
+            family__project__guid__in=get_project_guids_user_can_view(request.user, limit_data_manager=False),
+            family__project__name__in={k[0] for k in individual_dataset_mapping.keys()},
+            individual_id__in={k[1] for k in individual_dataset_mapping.keys()},
+        ).select_related('family__project')
+        individuals_by_project_id = {(i.family.project.name, i.individual_id): i for i in individuals}
+        unmatched = set(individual_dataset_mapping.keys()) - set(individuals_by_project_id.keys())
+        if len(unmatched) > 0:
+            raise Exception(
+                f'The following Individuals do not exist: {", ".join([f"{i} ({p})" for p, i in sorted(unmatched)])}')
+
+        return {v: individual_dataset_mapping[k] for k, v in individuals_by_project_id.items() if individual_dataset_mapping[k]}
+
+    return _process_igv_table_handler(_parse_uploaded_file, _get_valid_matched_individuals)
 
 
 SAMPLE_TYPE_MAP = [
     ('bam', IgvSample.SAMPLE_TYPE_ALIGNMENT),
     ('cram', IgvSample.SAMPLE_TYPE_ALIGNMENT),
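The generalized _process_alignment_records is the crux of the change: with num_id_cols=2, records are keyed by a (project name, individual ID) tuple instead of a bare individual ID, which is what lets the bulk handler span projects. A standalone sketch of that parsing behavior, runnable without Django (rows, project names, and paths are hypothetical):

    from collections import defaultdict

    def process_alignment_records(rows, num_id_cols=1):
        # Mirrors the diff's _process_alignment_records: one required file-path
        # column after the id column(s), plus an optional trailing sample id.
        num_cols = num_id_cols + 1
        invalid_row = next((row for row in rows if not num_cols <= len(row) <= num_cols + 1), None)
        if invalid_row:
            raise ValueError(f"Must contain {num_cols} or {num_cols + 1} columns: {', '.join(invalid_row)}")
        parsed_records = defaultdict(list)
        for row in rows:
            # With num_id_cols=2 the key is a (project name, individual ID) tuple
            row_id = row[0] if num_id_cols == 1 else tuple(row[:num_id_cols])
            parsed_records[row_id].append(
                {'filePath': row[num_id_cols], 'sampleId': row[num_cols] if len(row) > num_cols else None})
        return parsed_records

    rows = [
        ['Project A', 'IND_1', 'gs://bucket/IND_1.cram'],
        ['Project A', 'IND_1', 'gs://bucket/batch_1.dcr.bed.gz', 'SAMPLE_1'],  # optional 4th column
        ['Project B', 'IND_2', 'gs://bucket/IND_2.bam'],
    ]
    records = process_alignment_records(rows, num_id_cols=2)
    assert records[('Project A', 'IND_1')][1]['sampleId'] == 'SAMPLE_1'
    assert records[('Project B', 'IND_2')] == [{'filePath': 'gs://bucket/IND_2.bam', 'sampleId': None}]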
66 changes: 64 additions & 2 deletions seqr/views/apis/igv_api_tests.py
@@ -6,7 +6,7 @@
 from django.core.files.uploadedfile import SimpleUploadedFile
 from django.urls.base import reverse
 from seqr.views.apis.igv_api import fetch_igv_track, receive_igv_table_handler, update_individual_igv_sample, \
-    igv_genomes_proxy
+    igv_genomes_proxy, receive_bulk_igv_table_handler
 from seqr.views.apis.igv_api import GS_STORAGE_ACCESS_CACHE_KEY
 from seqr.views.utils.test_utils import AuthenticationTestCase
 
@@ -118,8 +118,9 @@ def test_receive_alignment_table_handler(self):
         self.assertEqual(response.status_code, 200)
 
         response_json = response.json()
-        self.assertSetEqual(set(response_json.keys()), {'uploadedFileId', 'errors', 'info', 'updates'})
+        self.assertSetEqual(set(response_json.keys()), {'uploadedFileId', 'errors', 'warnings', 'info', 'updates'})
         self.assertListEqual(response_json['errors'], [])
+        self.assertListEqual(response_json['warnings'], [])
         self.assertListEqual(
             response_json['info'], ['Parsed 3 rows in 2 individuals from samples.csv', 'No change detected for 1 rows'])
         self.assertListEqual(sorted(response_json['updates'], key=lambda o: o['individualGuid']), [
@@ -132,6 +133,67 @@ def test_receive_alignment_table_handler(self):
         response = self.client.post(url, data={'f': f})
         self.assertEqual(response.status_code, 200)
 
+    @mock.patch('seqr.views.apis.igv_api.load_uploaded_file')
+    def test_receive_bulk_alignment_table_handler(self, mock_load_uploaded_file):
+        url = reverse(receive_bulk_igv_table_handler)
+        self.check_pm_login(url)
+
+        # Send invalid requests
+        response = self.client.post(url, content_type='application/json', data=json.dumps({}))
+        self.assertEqual(response.status_code, 400)
+        self.assertDictEqual(response.json(), {'errors': ['No file uploaded']})
+
+        uploaded_file_id = 'test_file_id'
+        request_data = json.dumps({'mappingFile': {'uploadedFileId': uploaded_file_id}})
+        pm_projects_rows = [
+            ['1kg project nåme with uniçøde', 'NA19675_1', 'gs://readviz/batch_10.dcr.bed.gz', 'NA19675'],
+            ['1kg project nåme with uniçøde', 'NA19675_1', 'gs://readviz/NA19675_1.bam'],
+            ['1kg project nåme with uniçøde', 'NA20870', 'gs://readviz/NA20870.cram'],
+            ['Test Reprocessed Project', 'NA20885', 'gs://readviz/NA20885.cram'],
+        ]
+        rows = pm_projects_rows + [['Non-Analyst Project', 'NA21234', 'gs://readviz/NA21234.cram']]
+        mock_load_uploaded_file.return_value = [['NA19675']] + rows
+        response = self.client.post(url, content_type='application/json', data=request_data)
+        self.assertEqual(response.status_code, 400)
+        self.assertDictEqual(response.json(), {'errors': ['Must contain 3 or 4 columns: NA19675']})
+
+        mock_load_uploaded_file.return_value = rows + [
+            ['Non-project', 'NA19675_1', 'gs://readviz/NA19679.bam'],
+            ['1kg project nåme with uniçøde', 'NA19675', 'gs://readviz/batch_10.dcr.bed.gz'],
+        ]
+        response = self.client.post(url, content_type='application/json', data=request_data)
+        self.assertEqual(response.status_code, 400)
+        self.assertDictEqual(response.json(), {'errors': [
+            'The following Individuals do not exist: NA19675 (1kg project nåme with uniçøde), NA21234 (Non-Analyst Project), NA19675_1 (Non-project)']})
+
+        # Send valid request
+        mock_load_uploaded_file.return_value = pm_projects_rows
+        response = self.client.post(url, content_type='application/json', data=request_data)
+        self.assertEqual(response.status_code, 200)
+
+        response_json = response.json()
+        self.assertSetEqual(set(response_json.keys()), {'uploadedFileId', 'errors', 'warnings', 'info', 'updates'})
+        self.assertListEqual(response_json['errors'], [])
+        self.assertListEqual(response_json['warnings'], [])
+        self.assertListEqual(response_json['info'], ['Parsed 4 rows in 3 individuals', 'No change detected for 1 rows'])
+        updates = [
+            {'individualGuid': 'I000001_na19675', 'individualId': 'NA19675_1', 'filePath': 'gs://readviz/batch_10.dcr.bed.gz', 'sampleId': 'NA19675'},
+            {'individualGuid': 'I000001_na19675', 'individualId': 'NA19675_1', 'filePath': 'gs://readviz/NA19675_1.bam', 'sampleId': None},
+            {'individualGuid': 'I000015_na20885', 'individualId': 'NA20885', 'filePath': 'gs://readviz/NA20885.cram', 'sampleId': None},
+        ]
+        self.assertListEqual(sorted(response_json['updates'], key=lambda o: o['individualGuid']), updates)
+
+        # test data manager access
+        self.login_data_manager_user()
+        mock_load_uploaded_file.return_value = rows
+        response = self.client.post(url, content_type='application/json', data=request_data)
+        self.assertEqual(response.status_code, 200)
+        response_json = response.json()
+        self.assertListEqual(response_json['info'], ['Parsed 5 rows in 4 individuals', 'No change detected for 1 rows'])
+        self.assertListEqual(sorted(response_json['updates'], key=lambda o: o['individualGuid']), updates + [
+            {'individualGuid': 'I000018_na21234', 'individualId': 'NA21234', 'filePath': 'gs://readviz/NA21234.cram', 'sampleId': None}
+        ])
+
     @mock.patch('seqr.utils.file_utils.subprocess.Popen')
     @mock.patch('seqr.utils.file_utils.os.path.isfile')
     def test_add_alignment_sample(self, mock_local_file_exists, mock_subprocess):
2 changes: 2 additions & 0 deletions ui/pages/DataManagement/DataManagement.jsx
@@ -7,6 +7,7 @@ import { getUser, getElasticsearchEnabled } from 'redux/selectors'
 import { Error404, Error401 } from 'shared/components/page/Errors'
 import { SimplePageHeader } from 'shared/components/page/PageHeaderLayout'
 
+import AddIGV from './components/AddIGV'
 import ElasticsearchStatus from './components/ElasticsearchStatus'
 import LoadData from './components/LoadData'
 import RnaSeq from './components/RnaSeq'
@@ -19,6 +20,7 @@ const IFRAME_STYLE = { position: 'fixed', left: '0', top: '95px' }
 
 const PM_DATA_MANAGEMENT_PAGES = [
   { path: 'load_data', component: LoadData },
+  { path: 'add_igv', component: AddIGV },
 ]
 
 const DATA_MANAGEMENT_PAGES = [
44 changes: 44 additions & 0 deletions ui/pages/DataManagement/components/AddIGV.jsx
@@ -0,0 +1,44 @@
+import React from 'react'
+import { connect } from 'react-redux'
+import { List, Segment } from 'semantic-ui-react'
+
+import FileUploadField, { validateUploadedFile } from 'shared/components/form/XHRUploaderField'
+import UploadFormPage from 'shared/components/page/UploadFormPage'
+
+import { getIgvUploadStats } from '../selectors'
+import { addIgv } from '../reducers'
+
+const mapStateToProps = state => ({
+  fields: [
+    {
+      name: 'mappingFile',
+      validate: validateUploadedFile,
+      component: FileUploadField,
+      dropzoneLabel: (
+        <Segment basic textAlign="left">
+          Upload a file with desired IGV tracks. Include one row per track.
+          For merged RNA tracks, include one row for coverage and one for junctions.
+          <br />
+          Columns are as follows:
+          <br />
+          <List ordered>
+            <List.Item>Project</List.Item>
+            <List.Item>Individual ID</List.Item>
+            <List.Item>IGV Track File Path</List.Item>
+            <List.Item>
+              Optional: Sample ID if different from Individual ID.
+              Used primarily for gCNV files to identify the sample in the batch path
+            </List.Item>
+          </List>
+        </Segment>
+      ),
+    },
+  ],
+  uploadStats: getIgvUploadStats(state),
+})
+
+const mapDispatchToProps = {
+  onSubmit: addIgv,
+}
+
+export default connect(mapStateToProps, mapDispatchToProps)(UploadFormPage)
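Per the dropzone label above, each row needs a project, an individual ID, and a track file path, with an optional fourth sample-ID column and one row per track (two rows for merged RNA). A hypothetical example of assembling such a file — all project, individual, and file names below are invented for illustration:

    # Sketch only: writes a tab-separated mapping file in the 3-or-4-column
    # layout the dropzone describes; every name and path here is made up.
    example_rows = [
        ['My Project', 'IND_1', 'gs://my-bucket/IND_1.cram'],
        ['My Project', 'IND_1', 'gs://my-bucket/gcnv_batch_1.bed.gz', 'SAMPLE_1'],  # gCNV batch path
        ['Other Project', 'IND_2', 'gs://my-bucket/IND_2.coverage.bigWig'],   # merged RNA: coverage row
        ['Other Project', 'IND_2', 'gs://my-bucket/IND_2.junctions.bed.gz'],  # merged RNA: junctions row
    ]
    with open('igv_tracks.tsv', 'w') as f:
        f.write('\n'.join('\t'.join(row) for row in example_rows))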