From b22f5fce369440ce1fe238f59b18f9a3f18a909d Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Mon, 14 Aug 2023 15:06:24 -0400
Subject: [PATCH 01/19] use data type specific gregor metadata

---
 seqr/views/apis/report_api.py | 97 ++++++++++++++++++++++++++---------
 1 file changed, 72 insertions(+), 25 deletions(-)

diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py
index d5ae198e92..90a047add8 100644
--- a/seqr/views/apis/report_api.py
+++ b/seqr/views/apis/report_api.py
@@ -655,7 +655,10 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False):
 
 # GREGoR metadata
 
+GREGOR_DATA_TYPES = ['wes', 'wgs']
 SMID_FIELD = 'SMID'
+PARTICIPANT_ID_FIELD = 'CollaboratorParticipantID'
+COLLABORATOR_SAMPLE_ID_FIELD = 'CollaboratorSampleID'
 PARTICIPANT_TABLE_COLUMNS = [
     'participant_id', 'internal_project_id', 'gregor_center', 'consent_code', 'recontactable', 'prior_testing',
     'pmid_id', 'family_id', 'paternal_id', 'maternal_id', 'twin_id', 'proband_relationship',
@@ -694,7 +697,22 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False):
     'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum',
     'caller_software', 'variant_types', 'analysis_details',
 ]
-ALL_AIRTABLE_COLUMNS = EXPERIMENT_TABLE_AIRTABLE_FIELDS + READ_TABLE_AIRTABLE_FIELDS + CALLED_TABLE_COLUMNS
+
+DATA_TYPE_OMIT = {'wgs': ['targeted_regions_method'], 'wes': []}
+MAPPED_AIRTABLE_FIELDS = {'alignment_software': 'alignment_software_dna'}
+NO_DATA_TYPE_FIELDS = {'targeted_region_bed_file', 'reference_assembly', 'analysis_details'}
+
+DATA_TYPE_AIRTABLE_COLUMNS = EXPERIMENT_TABLE_AIRTABLE_FIELDS + READ_TABLE_AIRTABLE_FIELDS + [
+    COLLABORATOR_SAMPLE_ID_FIELD, SMID_FIELD]
+ALL_AIRTABLE_COLUMNS = DATA_TYPE_AIRTABLE_COLUMNS + CALLED_TABLE_COLUMNS
+AIRTABLE_QUERY_COLUMNS = set(CALLED_TABLE_COLUMNS)
+AIRTABLE_QUERY_COLUMNS.remove('md5sum')
+AIRTABLE_QUERY_COLUMNS.update(NO_DATA_TYPE_FIELDS)
+AIRTABLE_QUERY_COLUMNS.update(MAPPED_AIRTABLE_FIELDS.values())
+for data_type in GREGOR_DATA_TYPES:
+    data_type_columns = set(DATA_TYPE_AIRTABLE_COLUMNS) - NO_DATA_TYPE_FIELDS - set(
+        MAPPED_AIRTABLE_FIELDS.keys()) - set(DATA_TYPE_OMIT[data_type])
+    AIRTABLE_QUERY_COLUMNS.update({f'{field}_{data_type}' for field in data_type_columns})
 
 TABLE_COLUMNS = {
     'participant': PARTICIPANT_TABLE_COLUMNS,
@@ -789,18 +807,31 @@ def gregor_export(request):
         consent_code=consent_code[0],
         projectcategory__name='GREGoR',
     )
-    individuals = Individual.objects.filter(
-        sample__in=get_search_samples(projects, active_only=False),
-    ).distinct().prefetch_related('family__project', 'mother', 'father')
+    sample_types = get_search_samples(projects, active_only=False).values_list('individual_id', 'sample_type')
+    individual_data_types = defaultdict(set)
+    for individual_db_id, sample_type in sample_types:
+        individual_data_types[individual_db_id].add(sample_type)
+    individuals = Individual.objects.filter(id__in=individual_data_types).prefetch_related(
+        'family__project', 'mother', 'father')
+
+    grouped_data_type_individuals = defaultdict(dict)
+    for i in individuals:
+        grouped_data_type_individuals[i.individual_id].update({data_type: i for data_type in individual_data_types[i.id]})
 
-    airtable_sample_records, airtable_metadata_by_smid = _get_gregor_airtable_data(individuals, request.user)
+    airtable_sample_records, airtable_metadata_by_participant = _get_gregor_airtable_data(
+        grouped_data_type_individuals.keys(), request.user)
 
     participant_rows = []
     family_map = {}
     phenotype_rows = []
     analyte_rows = []
     airtable_rows = []
-    for individual in individuals:
+    for data_type_individuals in grouped_data_type_individuals.values():
+        # If multiple individual records, prefer WGS
+        individual = next(
+            data_type_individuals[data_type] for data_type in ['WGS', 'WES'] if data_type_individuals.get(data_type)
+        )
+
         # family table
         family = individual.family
         if family not in family_map:
@@ -830,18 +861,24 @@ def gregor_export(request):
             dict(**base_phenotype_row, **_get_phenotype_row(feature)) for feature in individual.absent_features or []
         ]
 
-        analyte_id = None
+        analyte_ids = set()
         # airtable data
         if airtable_sample:
-            sm_id = airtable_sample[SMID_FIELD]
-            analyte_id = f'Broad_{sm_id}'
-            airtable_metadata = airtable_metadata_by_smid.get(sm_id)
-            if airtable_metadata:
-                experiment_ids = _get_experiment_ids(airtable_sample, airtable_metadata)
-                airtable_rows.append(dict(analyte_id=analyte_id, **airtable_metadata, **experiment_ids))
+            airtable_metadata = airtable_metadata_by_participant.get(airtable_sample[PARTICIPANT_ID_FIELD]) or {}
+            for data_type in data_type_individuals:
+                data_type_metadata = airtable_metadata.get(data_type)
+                if data_type_metadata:
+                    experiment_ids = _get_experiment_ids(data_type_metadata)
+                    analyte_ids.add(experiment_ids['analyte_id'])
+                    row = {**airtable_metadata, **data_type_metadata, **experiment_ids}
+                    row.update({k: row[v] for k, v in MAPPED_AIRTABLE_FIELDS.items()})
+                    airtable_rows.append(row)
 
         # analyte table
-        analyte_rows.append(dict(participant_id=participant_id, analyte_id=analyte_id, **_get_analyte_row(individual)))
+        if not analyte_ids:
+            analyte_ids.add(_get_analyte_id(airtable_sample))
+        for analyte_id in analyte_ids:
+            analyte_rows.append(dict(participant_id=participant_id, analyte_id=analyte_id, **_get_analyte_row(individual)))
 
     files, warnings = _get_validated_gregor_files([
         ('participant', participant_rows),
@@ -861,21 +898,25 @@ def gregor_export(request):
     })
 
 
-def _get_gregor_airtable_data(individuals, user):
+def _get_gregor_airtable_data(individual_ids, user):
     sample_records, session = _get_airtable_samples(
-        individuals.order_by('individual_id').values_list('individual_id', flat=True), user,
-        fields=[SMID_FIELD, 'CollaboratorSampleID', 'Recontactable'],
+        individual_ids, user, fields=[SMID_FIELD, PARTICIPANT_ID_FIELD, 'Recontactable'],
     )
 
-    fields = ALL_AIRTABLE_COLUMNS
     airtable_metadata = session.fetch_records(
         'GREGoR Data Model',
-        fields=[SMID_FIELD] + sorted(fields),
-        or_filters={f'{SMID_FIELD}': {r[SMID_FIELD] for r in sample_records.values()}},
+        fields=[PARTICIPANT_ID_FIELD] + sorted(AIRTABLE_QUERY_COLUMNS),
+        or_filters={f'{PARTICIPANT_ID_FIELD}': {r[PARTICIPANT_ID_FIELD] for r in sample_records.values()}},
     )
-    airtable_metadata_by_smid = {r[SMID_FIELD]: r for r in airtable_metadata.values()}
 
-    return sample_records, airtable_metadata_by_smid
+    airtable_metadata_by_participant = {r[PARTICIPANT_ID_FIELD]: r for r in airtable_metadata.values()}
+    for data_type in GREGOR_DATA_TYPES:
+        for r in airtable_metadata_by_participant.values():
+            data_type_fields = [f for f in r if f.endswith(f'_{data_type}')]
+            if data_type_fields:
+                r[data_type.upper()] = {f.replace(f'_{data_type}', ''): r.pop(f) for f in data_type_fields}
+
+    return sample_records, airtable_metadata_by_participant
 
 
 def _get_gregor_family_row(family):
@@ -932,16 +973,22 @@ def _get_analyte_row(individual):
     }
 
 
-def _get_experiment_ids(airtable_sample, airtable_metadata):
-    collaborator_sample_id = airtable_sample['CollaboratorSampleID']
-    experiment_dna_short_read_id = f'Broad_{airtable_metadata.get("experiment_type", "NA")}_{collaborator_sample_id}'
+def _get_experiment_ids(data_type_metadata):
+    collaborator_sample_id = data_type_metadata[COLLABORATOR_SAMPLE_ID_FIELD]
+    experiment_dna_short_read_id = f'Broad_{data_type_metadata.get("experiment_type", "NA")}_{collaborator_sample_id}'
     return {
+        'analyte_id': _get_analyte_id(data_type_metadata),
         'experiment_dna_short_read_id': experiment_dna_short_read_id,
         'experiment_sample_id': collaborator_sample_id,
         'aligned_dna_short_read_id': f'{experiment_dna_short_read_id}_1'
     }
 
 
+def _get_analyte_id(airtable_metadata):
+    sm_id = airtable_metadata[SMID_FIELD]
+    return f'Broad_{sm_id}' if sm_id else None
+
+
 def _get_validated_gregor_files(file_data):
     errors = []
     warnings = []

From ca2322db4393f0bfd9d584f0222fff3ac6cbf025 Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Mon, 14 Aug 2023 16:37:28 -0400
Subject: [PATCH 02/19] update test to pass

---
 seqr/views/apis/report_api.py       |  2 +-
 seqr/views/apis/report_api_tests.py | 98 ++++++++++++++++++-----------
 2 files changed, 63 insertions(+), 37 deletions(-)

diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py
index 90a047add8..5055e12a63 100644
--- a/seqr/views/apis/report_api.py
+++ b/seqr/views/apis/report_api.py
@@ -985,7 +985,7 @@ def _get_experiment_ids(data_type_metadata):
 
 
 def _get_analyte_id(airtable_metadata):
-    sm_id = airtable_metadata[SMID_FIELD]
+    sm_id = airtable_metadata.get(SMID_FIELD)
     return f'Broad_{sm_id}' if sm_id else None
 
 
diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py
index 11fb5818e0..5ddfcb4caf 100644
--- a/seqr/views/apis/report_api_tests.py
+++ b/seqr/views/apis/report_api_tests.py
@@ -156,6 +156,7 @@
       "fields": {
         "SeqrCollaboratorSampleID": "VCGS_FAM203_621_D1",
         "CollaboratorSampleID": "NA19675_1",
+        'CollaboratorParticipantID': 'NA19675',
         'SMID': 'SM-AGHT',
         'Recontactable': 'Yes',
       },
@@ -165,30 +166,36 @@
       "fields": {
         "SeqrCollaboratorSampleID": "HG00731",
         "CollaboratorSampleID": "VCGS_FAM203_621_D2",
+        'CollaboratorParticipantID': 'VCGS_FAM203_621',
         'SMID': 'SM-JDBTM',
       },
     }
 ]}
+# TODO test grouped individuals multi data type
+# TODO test has data type in airtable but not seqr samples
+# TODO test analyte id fallback from airtable sample
 AIRTABLE_GREGOR_RECORDS = {
   "records": [
     {
       "id": "rec2B6OGmQpAkQW3s",
       "fields": {
-        'SMID': 'SM-JDBTM',
-        'seq_library_prep_kit_method': 'Kapa HyperPrep',
-        'read_length': '151',
-        'experiment_type': 'exome',
-        'targeted_regions_method': 'Twist',
+        'CollaboratorParticipantID': 'VCGS_FAM203_621',
+        'CollaboratorSampleID_wes': 'VCGS_FAM203_621_D2',
+        'SMID_wes': 'SM-JDBTM',
+        'seq_library_prep_kit_method_wes': 'Kapa HyperPrep',
+        'read_length_wes': '151',
+        'experiment_type_wes': 'exome',
+        'targeted_regions_method_wes': 'Twist',
         'targeted_region_bed_file': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/SR_experiment.bed',
-        'date_data_generation': '2022-08-15',
-        'target_insert_size': '385',
-        'sequencing_platform': 'NovaSeq',
-        'aligned_dna_short_read_file': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.cram',
-        'aligned_dna_short_read_index_file': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.crai',
-        'md5sum': '129c28163df082',
+        'date_data_generation_wes': '2022-08-15',
+        'target_insert_size_wes': '385',
+        'sequencing_platform_wes': 'NovaSeq',
+        'aligned_dna_short_read_file_wes': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.cram',
+        'aligned_dna_short_read_index_file_wes': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.crai',
+        'md5sum_wes': '129c28163df082',
         'reference_assembly': 'GRCh38',
-        'alignment_software': 'BWA-MEM-2.3',
-        'mean_coverage': '42.4',
+        'alignment_software_dna': 'BWA-MEM-2.3',
+        'mean_coverage_wgs': '42.4',
         'analysis_details': 'DOI:10.5281/zenodo.4469317',
         'called_variants_dna_short_read_id': 'SX2-3',
         'aligned_dna_short_read_set_id': 'BCM_H7YG5DSX2',
@@ -197,12 +204,12 @@
         'variant_types': 'SNV',
       },
     },
-    {
-      "id": "rec2B6OGmCVzkQW3s",
-      "fields": {
-        'SMID': 'SM-AGHT',
-      },
-    },
+    # {
+    #   "id": "rec2B6OGmCVzkQW3s",
+    #   "fields": {
+    #     'SMID': 'SM-AGHT',
+    #   },
+    # },
 ]}
 
 EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW = {
@@ -299,6 +306,23 @@
                 {'column': 'age_at_enrollment'},
             ],
         },
+        {
+            'table': 'aligned_dna_short_read',
+            'columns': [
+                {'column': 'aligned_dna_short_read_id', 'required': True},
+                {'column': 'experiment_dna_short_read_id', 'required': True},
+                {'column': 'aligned_dna_short_read_file'},
+                {'column': 'aligned_dna_short_read_index_file'},
+                {'column': 'alignment_software'},
+                {'column': 'analysis_details'},
+                {'column': 'md5sum'},
+                {'column': 'mean_coverage', 'required': True},
+                {'column': 'reference_assembly'},
+                {'column': 'reference_assembly_details'},
+                {'column': 'reference_assembly_uri'},
+                {'column': 'quality_issues'},
+            ],
+        },
         {
             'table': 'aligned_dna_short_read_set',
             'columns': [
@@ -691,11 +715,11 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
             'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881',
             'The following entries are missing recommended "phenotype_description" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881',
             'The following entries are missing recommended "age_at_enrollment" in the "participant" table: Broad_HG00731, Broad_NA20870, Broad_NA20872, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881',
-        ] + skipped_file_validation_warnings[1:6] + skipped_file_validation_warnings[7:])
+        ] + skipped_file_validation_warnings[1:5] + skipped_file_validation_warnings[7:])
         self.assertListEqual(response.json()['errors'], [
             'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881',
             'The following entries have invalid values for "reported_race" in the "participant" table. Allowed values: Asian, White, Black. Invalid values: Broad_NA19675_1 (Middle Eastern or North African)',
-            'The following entries are missing required "aligned_dna_short_read_set_id" (from Airtable) in the "aligned_dna_short_read_set" table: NA19675_1',
+            'The following entries are missing required "mean_coverage" (from Airtable) in the "aligned_dna_short_read" table: VCGS_FAM203_621_D2',
         ])
 
         responses.add(responses.GET, MOCK_DATA_MODEL_URL, status=404)
@@ -769,7 +793,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
             ['Broad_SM-AGHT', 'Broad_NA19675_1', 'DNA', '', 'UBERON:0003714', '', '', 'No', '', '', '', '', '', '', '', ''],
             row)
 
-        self.assertEqual(len(experiment_file), 3)
+        self.assertEqual(len(experiment_file), 2)
         self.assertEqual(experiment_file[0], [
             'experiment_dna_short_read_id', 'analyte_id', 'experiment_sample_id', 'seq_library_prep_kit_method',
             'read_length', 'experiment_type', 'targeted_regions_method', 'targeted_region_bed_file',
@@ -779,22 +803,21 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
             'Broad_exome_VCGS_FAM203_621_D2', 'Broad_SM-JDBTM', 'VCGS_FAM203_621_D2', 'Kapa HyperPrep', '151', 'exome',
             'Twist', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/SR_experiment.bed', '2022-08-15', '385', 'NovaSeq',
         ], experiment_file)
-        self.assertIn(['Broad_NA_NA19675_1', 'Broad_SM-AGHT', 'NA19675_1', '', '', '', '', '', '', '', ''], experiment_file)
 
-        self.assertEqual(len(read_file), 3)
+        self.assertEqual(len(read_file), 2)
         self.assertEqual(read_file[0], [
             'aligned_dna_short_read_id', 'experiment_dna_short_read_id', 'aligned_dna_short_read_file',
             'aligned_dna_short_read_index_file', 'md5sum', 'reference_assembly', 'reference_assembly_uri', 'reference_assembly_details',
             'alignment_software', 'mean_coverage', 'analysis_details',  'quality_issues',
         ])
-        self.assertIn([
+        self.assertEqual(read_file[1], [
             'Broad_exome_VCGS_FAM203_621_D2_1', 'Broad_exome_VCGS_FAM203_621_D2',
             'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.cram',
             'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.crai', '129c28163df082', 'GRCh38',
-            '', '', 'BWA-MEM-2.3', '42.4', 'DOI:10.5281/zenodo.4469317', '',
-        ], read_file)
+            '', '', 'BWA-MEM-2.3', '', 'DOI:10.5281/zenodo.4469317', '',
+        ])
 
-        self.assertEqual(len(read_set_file), 3)
+        self.assertEqual(len(read_set_file), 2)
         self.assertEqual(read_set_file[0], ['aligned_dna_short_read_set_id', 'aligned_dna_short_read_id'])
         self.assertIn(['BCM_H7YG5DSX2', 'Broad_exome_VCGS_FAM203_621_D2_1'], read_set_file)
 
@@ -815,7 +838,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
                         "{CollaboratorSampleID}='NA20870',{CollaboratorSampleID}='NA20872',{CollaboratorSampleID}='NA20874'," \
                         "{CollaboratorSampleID}='NA20875',{CollaboratorSampleID}='NA20876',{CollaboratorSampleID}='NA20877'," \
                         "{CollaboratorSampleID}='NA20881')"
-        sample_fields = ['CollaboratorSampleID', 'SMID', 'CollaboratorSampleID', 'Recontactable']
+        sample_fields = ['CollaboratorSampleID', 'SMID', 'CollaboratorParticipantID', 'Recontactable']
         self._assert_expected_airtable_call(0, sample_filter, sample_fields)
         secondary_sample_filter = "OR({SeqrCollaboratorSampleID}='HG00731',{SeqrCollaboratorSampleID}='HG00732'," \
                         "{SeqrCollaboratorSampleID}='HG00733',{SeqrCollaboratorSampleID}='NA19678'," \
@@ -825,14 +848,17 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
         sample_fields[0] = 'SeqrCollaboratorSampleID'
         self._assert_expected_airtable_call(1, secondary_sample_filter, sample_fields)
         metadata_fields = [
-            'SMID', 'aligned_dna_short_read_file', 'aligned_dna_short_read_index_file', 'aligned_dna_short_read_set_id',
-            'alignment_software', 'analysis_details', 'analysis_details', 'called_variants_dna_file',
-            'called_variants_dna_short_read_id', 'caller_software', 'date_data_generation', 'experiment_type',
-            'md5sum', 'md5sum', 'mean_coverage', 'read_length', 'reference_assembly', 'seq_library_prep_kit_method',
-            'sequencing_platform', 'target_insert_size', 'targeted_region_bed_file', 'targeted_regions_method',
-            'variant_types',
+            'CollaboratorParticipantID', 'CollaboratorSampleID_wes', 'CollaboratorSampleID_wgs', 'SMID_wes', 'SMID_wgs',
+            'aligned_dna_short_read_file_wes', 'aligned_dna_short_read_file_wgs', 'aligned_dna_short_read_index_file_wes',
+            'aligned_dna_short_read_index_file_wgs', 'aligned_dna_short_read_set_id', 'alignment_software_dna',
+            'analysis_details', 'called_variants_dna_file', 'called_variants_dna_short_read_id', 'caller_software',
+            'date_data_generation_wes', 'date_data_generation_wgs', 'experiment_type_wes', 'experiment_type_wgs',
+            'md5sum_wes', 'md5sum_wgs', 'mean_coverage_wes', 'mean_coverage_wgs', 'read_length_wes', 'read_length_wgs',
+            'reference_assembly', 'seq_library_prep_kit_method_wes', 'seq_library_prep_kit_method_wgs',
+            'sequencing_platform_wes', 'sequencing_platform_wgs', 'target_insert_size_wes', 'target_insert_size_wgs',
+            'targeted_region_bed_file', 'targeted_regions_method_wes', 'variant_types',
         ]
-        self._assert_expected_airtable_call(2, "OR(SMID='SM-AGHT',SMID='SM-JDBTM')", metadata_fields)
+        self._assert_expected_airtable_call(2, "OR(CollaboratorParticipantID='NA19675',CollaboratorParticipantID='VCGS_FAM203_621')", metadata_fields)
 
         self.assertEqual(responses.calls[3].request.url, MOCK_DATA_MODEL_URL)
 

From 1bc7cb0b5d01a3900aada72007aa9e6457ee6ed8 Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Mon, 14 Aug 2023 16:38:24 -0400
Subject: [PATCH 03/19] update test to pass

---
 seqr/views/apis/report_api_tests.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py
index 5ddfcb4caf..290d585d24 100644
--- a/seqr/views/apis/report_api_tests.py
+++ b/seqr/views/apis/report_api_tests.py
@@ -204,12 +204,14 @@
         'variant_types': 'SNV',
       },
     },
-    # {
-    #   "id": "rec2B6OGmCVzkQW3s",
-    #   "fields": {
-    #     'SMID': 'SM-AGHT',
-    #   },
-    # },
+    {
+      "id": "rec2B6OGmCVzkQW3s",
+      "fields": {
+        'CollaboratorParticipantID': 'NA19675',
+        'CollaboratorSampleID_wgs': 'NA19675_1',
+        'SMID_wgs': 'SM-AGHT-2',
+      },
+    },
 ]}
 
 EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW = {

From 1912dc6c5002d4228d7ff83af136d3f065dec087 Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Mon, 14 Aug 2023 16:39:46 -0400
Subject: [PATCH 04/19] test wrong data in airtable case

---
 seqr/views/apis/report_api_tests.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py
index 290d585d24..862e87cf84 100644
--- a/seqr/views/apis/report_api_tests.py
+++ b/seqr/views/apis/report_api_tests.py
@@ -172,8 +172,6 @@
     }
 ]}
 # TODO test grouped individuals multi data type
-# TODO test has data type in airtable but not seqr samples
-# TODO test analyte id fallback from airtable sample
 AIRTABLE_GREGOR_RECORDS = {
   "records": [
     {
@@ -210,6 +208,7 @@
         'CollaboratorParticipantID': 'NA19675',
         'CollaboratorSampleID_wgs': 'NA19675_1',
         'SMID_wgs': 'SM-AGHT-2',
+        'experiment_type_wgs': 'genome',
       },
     },
 ]}

From c7d56a48f8783a596485b24b2e2ba1eb24a5d335 Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Tue, 15 Aug 2023 11:02:46 -0400
Subject: [PATCH 05/19] update test fixtures

---
 seqr/fixtures/1kg_project.json      |  4 ++--
 seqr/views/apis/report_api_tests.py | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/seqr/fixtures/1kg_project.json b/seqr/fixtures/1kg_project.json
index 1c522b8bf8..3e4d01bd96 100644
--- a/seqr/fixtures/1kg_project.json
+++ b/seqr/fixtures/1kg_project.json
@@ -695,7 +695,7 @@
         "created_by": null,
         "last_modified_date": "2017-03-13T09:07:50.158Z",
         "family": 8,
-        "individual_id": "NA20877",
+        "individual_id": "NA20888",
         "mother_id": null,
         "father_id": null,
         "sex": "M",
@@ -1144,7 +1144,7 @@
         "last_modified_date": "2017-03-13T09:07:50.277Z",
 
         "sample_id": "NA20888",
-        "sample_type": "WES",
+        "sample_type": "WGS",
         "is_active": false,
         "individual": 16,
         "dataset_type": "VARIANTS",
diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py
index 862e87cf84..2b3f44e13d 100644
--- a/seqr/views/apis/report_api_tests.py
+++ b/seqr/views/apis/report_api_tests.py
@@ -712,13 +712,13 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
             'The following tables are required in the data model but absent from the reports: subject',
             'The following columns are included in the "participant" table but are missing from the data model: age_at_last_observation, ancestry_detail, pmid_id, proband_relationship_detail, sex_detail, twin_id',
             'The following columns are included in the "participant" data model but are missing in the report: ancestry_metadata',
-            'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881',
-            'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881',
-            'The following entries are missing recommended "phenotype_description" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881',
-            'The following entries are missing recommended "age_at_enrollment" in the "participant" table: Broad_HG00731, Broad_NA20870, Broad_NA20872, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881',
+            'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
+            'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
+            'The following entries are missing recommended "phenotype_description" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
+            'The following entries are missing recommended "age_at_enrollment" in the "participant" table: Broad_HG00731, Broad_NA20870, Broad_NA20872, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
         ] + skipped_file_validation_warnings[1:5] + skipped_file_validation_warnings[7:])
         self.assertListEqual(response.json()['errors'], [
-            'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881',
+            'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
             'The following entries have invalid values for "reported_race" in the "participant" table. Allowed values: Asian, White, Black. Invalid values: Broad_NA19675_1 (Middle Eastern or North African)',
             'The following entries are missing required "mean_coverage" (from Airtable) in the "aligned_dna_short_read" table: VCGS_FAM203_621_D2',
         ])
@@ -837,15 +837,15 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
         sample_filter = "OR({CollaboratorSampleID}='HG00731',{CollaboratorSampleID}='HG00732',{CollaboratorSampleID}='HG00733'," \
                         "{CollaboratorSampleID}='NA19675_1',{CollaboratorSampleID}='NA19678',{CollaboratorSampleID}='NA19679'," \
                         "{CollaboratorSampleID}='NA20870',{CollaboratorSampleID}='NA20872',{CollaboratorSampleID}='NA20874'," \
-                        "{CollaboratorSampleID}='NA20875',{CollaboratorSampleID}='NA20876',{CollaboratorSampleID}='NA20877'," \
-                        "{CollaboratorSampleID}='NA20881')"
+                        "{CollaboratorSampleID}='NA20875',{CollaboratorSampleID}='NA20876',{CollaboratorSampleID}='NA20881'," \
+                        "{CollaboratorSampleID}='NA20888')"
         sample_fields = ['CollaboratorSampleID', 'SMID', 'CollaboratorParticipantID', 'Recontactable']
         self._assert_expected_airtable_call(0, sample_filter, sample_fields)
         secondary_sample_filter = "OR({SeqrCollaboratorSampleID}='HG00731',{SeqrCollaboratorSampleID}='HG00732'," \
                         "{SeqrCollaboratorSampleID}='HG00733',{SeqrCollaboratorSampleID}='NA19678'," \
                         "{SeqrCollaboratorSampleID}='NA19679',{SeqrCollaboratorSampleID}='NA20870',{SeqrCollaboratorSampleID}='NA20872'," \
                         "{SeqrCollaboratorSampleID}='NA20874',{SeqrCollaboratorSampleID}='NA20875',{SeqrCollaboratorSampleID}='NA20876'," \
-                        "{SeqrCollaboratorSampleID}='NA20877',{SeqrCollaboratorSampleID}='NA20881')"
+                        "{SeqrCollaboratorSampleID}='NA20881',{SeqrCollaboratorSampleID}='NA20888')"
         sample_fields[0] = 'SeqrCollaboratorSampleID'
         self._assert_expected_airtable_call(1, secondary_sample_filter, sample_fields)
         metadata_fields = [

From 6d55da7f20ae61878210ee14faecf743cb9c70fa Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Tue, 15 Aug 2023 11:44:22 -0400
Subject: [PATCH 06/19] add mock airtable data

---
 seqr/views/apis/report_api_tests.py | 90 ++++++++++++++++++++++++-----
 1 file changed, 77 insertions(+), 13 deletions(-)

diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py
index 2b3f44e13d..95d3702aeb 100644
--- a/seqr/views/apis/report_api_tests.py
+++ b/seqr/views/apis/report_api_tests.py
@@ -169,7 +169,17 @@
         'CollaboratorParticipantID': 'VCGS_FAM203_621',
         'SMID': 'SM-JDBTM',
       },
-    }
+    },
+    {
+      "id": "rec2Nkg1fKssJc7",
+      "fields": {
+        'SeqrCollaboratorSampleID': 'NA20888',
+        'CollaboratorSampleID': 'NA20888',
+        'CollaboratorParticipantID': 'NA20888',
+        'SMID': 'SM-L5QMP',
+        'Recontactable': 'No',
+      },
+    },
 ]}
 # TODO test grouped individuals multi data type
 AIRTABLE_GREGOR_RECORDS = {
@@ -211,6 +221,46 @@
         'experiment_type_wgs': 'genome',
       },
     },
+    {
+      "id": "rec2BFCGmQpAkQ7x",
+      "fields": {
+        'CollaboratorParticipantID': 'NA20888',
+        'CollaboratorSampleID_wes': 'NA20888',
+        'CollaboratorSampleID_wgs': 'NA20888_1',
+        'SMID_wes': 'SM-L5QMP',
+        'SMID_wgs': 'SM-L5QMWP',
+        'seq_library_prep_kit_method_wes': 'Kapa HyperPrep',
+        'seq_library_prep_kit_method_wgs': 'Kapa HyperPrep w/o amplification',
+        'read_length_wes': '151',
+        'read_length_wgs': '200',
+        'experiment_type_wes': 'exome',
+        'experiment_type_wgs': 'genome',
+        'targeted_regions_method_wes': 'Twist',
+        'targeted_region_bed_file': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/SR_experiment.bed',
+        'date_data_generation_wes': '2022-06-05',
+        'date_data_generation_wgs': '2023-03-13',
+        'target_insert_size_wes': '380',
+        'target_insert_size_wgs': '450',
+        'sequencing_platform_wes': 'NovaSeq',
+        'sequencing_platform_wgs': 'NovaSeq2',
+        'aligned_dna_short_read_file_wes': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.cram',
+        'aligned_dna_short_read_index_file_wes': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.crai',
+        'aligned_dna_short_read_file_wgs': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888_1.cram',
+        'aligned_dna_short_read_index_file_wgs': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888_1.crai',
+        'md5sum_wes': 'a6f6308866765ce8',
+        'md5sum_wgs': '2aa33e8c32020b1c',
+        'reference_assembly': 'GRCh38',
+        'alignment_software_dna': 'BWA 0.7.15.r1140',
+        'mean_coverage_wes': '42.8',
+        'mean_coverage_wgs': '36.1',
+        'analysis_details': '',
+        'called_variants_dna_short_read_id': 'NA',
+        'aligned_dna_short_read_set_id': 'Broad_NA20888_D1',
+        'called_variants_dna_file': 'NA',
+        'caller_software': 'NA',
+        'variant_types': 'SNV',
+      },
+    },
 ]}
 
 EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW = {
@@ -642,7 +692,9 @@ def test_sample_metadata_export(self, mock_google_authenticated):
             'NA19679', 'NA20870', 'HG00732', 'NA20876', 'NA20874', 'NA20875', 'NA19678', 'NA19675', 'HG00731',
             'NA20872', 'NA20881', 'HG00733',
         })
-        expected_samples.update(self.ADDITIONAL_SAMPLES)
+        if self.ADDITIONAL_SAMPLES:
+            expected_samples.remove('NA20888')
+            expected_samples.update(self.ADDITIONAL_SAMPLES)
         self.assertSetEqual({r['sample_id'] for r in response_json['rows']}, expected_samples)
         test_row = next(r for r in response_json['rows'] if r['sample_id'] == 'NA20889')
         self.assertDictEqual(EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW, test_row)
@@ -712,7 +764,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
             'The following tables are required in the data model but absent from the reports: subject',
             'The following columns are included in the "participant" table but are missing from the data model: age_at_last_observation, ancestry_detail, pmid_id, proband_relationship_detail, sex_detail, twin_id',
             'The following columns are included in the "participant" data model but are missing in the report: ancestry_metadata',
-            'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
+            'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881',
             'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
             'The following entries are missing recommended "phenotype_description" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
             'The following entries are missing recommended "age_at_enrollment" in the "participant" table: Broad_HG00731, Broad_NA20870, Broad_NA20872, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
@@ -794,7 +846,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
             ['Broad_SM-AGHT', 'Broad_NA19675_1', 'DNA', '', 'UBERON:0003714', '', '', 'No', '', '', '', '', '', '', '', ''],
             row)
 
-        self.assertEqual(len(experiment_file), 2)
+        self.assertEqual(len(experiment_file), 3)
         self.assertEqual(experiment_file[0], [
             'experiment_dna_short_read_id', 'analyte_id', 'experiment_sample_id', 'seq_library_prep_kit_method',
             'read_length', 'experiment_type', 'targeted_regions_method', 'targeted_region_bed_file',
@@ -804,33 +856,45 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
             'Broad_exome_VCGS_FAM203_621_D2', 'Broad_SM-JDBTM', 'VCGS_FAM203_621_D2', 'Kapa HyperPrep', '151', 'exome',
             'Twist', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/SR_experiment.bed', '2022-08-15', '385', 'NovaSeq',
         ], experiment_file)
+        self.assertIn([
+            'Broad_exome_NA20888', 'Broad_SM-L5QMP', 'NA20888', 'Kapa HyperPrep', '151', 'exome',
+            'Twist', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/SR_experiment.bed', '2022-06-05', '380', 'NovaSeq',
+        ], experiment_file)
 
-        self.assertEqual(len(read_file), 2)
+        self.assertEqual(len(read_file), 3)
         self.assertEqual(read_file[0], [
             'aligned_dna_short_read_id', 'experiment_dna_short_read_id', 'aligned_dna_short_read_file',
             'aligned_dna_short_read_index_file', 'md5sum', 'reference_assembly', 'reference_assembly_uri', 'reference_assembly_details',
             'alignment_software', 'mean_coverage', 'analysis_details',  'quality_issues',
         ])
-        self.assertEqual(read_file[1], [
+        self.assertIn([
             'Broad_exome_VCGS_FAM203_621_D2_1', 'Broad_exome_VCGS_FAM203_621_D2',
             'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.cram',
             'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.crai', '129c28163df082', 'GRCh38',
             '', '', 'BWA-MEM-2.3', '', 'DOI:10.5281/zenodo.4469317', '',
-        ])
+        ], read_file)
+        self.assertIn([
+            'Broad_exome_NA20888_1', 'Broad_exome_NA20888',
+            'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.cram',
+            'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.crai', 'a6f6308866765ce8', 'GRCh38',
+            '', '', 'BWA 0.7.15.r1140', '42.8', '', '',
+        ], read_file)
 
-        self.assertEqual(len(read_set_file), 2)
+        self.assertEqual(len(read_set_file), 3)
         self.assertEqual(read_set_file[0], ['aligned_dna_short_read_set_id', 'aligned_dna_short_read_id'])
         self.assertIn(['BCM_H7YG5DSX2', 'Broad_exome_VCGS_FAM203_621_D2_1'], read_set_file)
+        self.assertIn(['Broad_NA20888_D1', 'Broad_exome_NA20888_1'], read_set_file)
 
-        self.assertEqual(len(called_file), 2)
+        self.assertEqual(len(called_file), 3)
         self.assertEqual(called_file[0], [
             'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum',
             'caller_software', 'variant_types', 'analysis_details',
         ])
-        self.assertEqual(called_file[1], [
+        self.assertIn([
             'SX2-3', 'BCM_H7YG5DSX2', 'gs://fc-fed09429-e563-44a7-aaeb-776c8336ba02/COL_FAM1_1_D1.SV.vcf',
             '129c28163df082', 'gatk4.1.2', 'SNV', 'DOI:10.5281/zenodo.4469317',
-        ])
+        ], called_file)
+        self.assertIn(['NA', 'Broad_NA20888_D1', 'NA', 'a6f6308866765ce8', 'NA', 'SNV', ''], called_file)
 
         # test airtable calls
         self.assertEqual(len(responses.calls), 4)
@@ -845,7 +909,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
                         "{SeqrCollaboratorSampleID}='HG00733',{SeqrCollaboratorSampleID}='NA19678'," \
                         "{SeqrCollaboratorSampleID}='NA19679',{SeqrCollaboratorSampleID}='NA20870',{SeqrCollaboratorSampleID}='NA20872'," \
                         "{SeqrCollaboratorSampleID}='NA20874',{SeqrCollaboratorSampleID}='NA20875',{SeqrCollaboratorSampleID}='NA20876'," \
-                        "{SeqrCollaboratorSampleID}='NA20881',{SeqrCollaboratorSampleID}='NA20888')"
+                        "{SeqrCollaboratorSampleID}='NA20881')"
         sample_fields[0] = 'SeqrCollaboratorSampleID'
         self._assert_expected_airtable_call(1, secondary_sample_filter, sample_fields)
         metadata_fields = [
@@ -859,7 +923,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
             'sequencing_platform_wes', 'sequencing_platform_wgs', 'target_insert_size_wes', 'target_insert_size_wgs',
             'targeted_region_bed_file', 'targeted_regions_method_wes', 'variant_types',
         ]
-        self._assert_expected_airtable_call(2, "OR(CollaboratorParticipantID='NA19675',CollaboratorParticipantID='VCGS_FAM203_621')", metadata_fields)
+        self._assert_expected_airtable_call(2, "OR(CollaboratorParticipantID='NA19675',CollaboratorParticipantID='NA20888',CollaboratorParticipantID='VCGS_FAM203_621')", metadata_fields)
 
         self.assertEqual(responses.calls[3].request.url, MOCK_DATA_MODEL_URL)
 

From 8ab2004b260d8f4c2c4febc2926c06692c79c180 Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Tue, 15 Aug 2023 11:51:54 -0400
Subject: [PATCH 07/19] abstract gregor checks

---
 seqr/views/apis/report_api_tests.py | 46 ++++++++++++++++-------------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py
index 95d3702aeb..33faafe7dc 100644
--- a/seqr/views/apis/report_api_tests.py
+++ b/seqr/views/apis/report_api_tests.py
@@ -181,7 +181,7 @@
       },
     },
 ]}
-# TODO test grouped individuals multi data type
+
 AIRTABLE_GREGOR_RECORDS = {
   "records": [
     {
@@ -262,6 +262,10 @@
       },
     },
 ]}
+EXPECTED_GREGOR_FILES = [
+    'participant', 'family', 'phenotype', 'analyte', 'experiment_dna_short_read',
+    'aligned_dna_short_read', 'aligned_dna_short_read_set', 'called_variants_dna_short_read',
+]
 
 EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW = {
     "project_guid": "R0003_test",
@@ -693,7 +697,7 @@ def test_sample_metadata_export(self, mock_google_authenticated):
             'NA20872', 'NA20881', 'HG00733',
         })
         if self.ADDITIONAL_SAMPLES:
-            expected_samples.remove('NA20888')
+            expected_samples.remove('NA20888')  # TODO investigate
             expected_samples.update(self.ADDITIONAL_SAMPLES)
         self.assertSetEqual({r['sample_id'] for r in response_json['rows']}, expected_samples)
         test_row = next(r for r in response_json['rows'] if r['sample_id'] == 'NA20889')
@@ -753,12 +757,8 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
         mock_google_authenticated.return_value = True
         response = self.client.post(url, content_type='application/json', data=json.dumps(body))
         self.assertEqual(response.status_code, 400)
-        expected_files = [
-            'participant', 'family', 'phenotype', 'analyte', 'experiment_dna_short_read',
-            'aligned_dna_short_read', 'aligned_dna_short_read_set', 'called_variants_dna_short_read',
-        ]
         skipped_file_validation_warnings = [
-            f'No data model found for "{file}" table so no validation was performed' for file in expected_files
+            f'No data model found for "{file}" table so no validation was performed' for file in EXPECTED_GREGOR_FILES
         ]
         self.assertListEqual(response.json()['warnings'], [
             'The following tables are required in the data model but absent from the reports: subject',
@@ -781,15 +781,29 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
         mock_google_authenticated.return_value = True
         response = self.client.post(url, content_type='application/json', data=json.dumps(body))
         self.assertEqual(response.status_code, 200)
-        self.assertDictEqual(response.json(), {
+        expected_response = {
             'info': ['Successfully validated and uploaded Gregor Report for 9 families'],
             'warnings': [
                 'Unable to load data model for validation: 404 Client Error: Not Found for url: http://raw.githubusercontent.com/gregor_data_model.json',
             ] + skipped_file_validation_warnings,
-        })
+        }
+        self.assertDictEqual(response.json(), expected_response)
+        self._assert_expected_gregor_files(mock_open)
+        self._test_expected_gregor_airtable_calls()
+
+        # test gsutil commands
+        mock_subprocess.assert_has_calls([
+            mock.call('gsutil ls gs://anvil-upload', stdout=-1, stderr=-2, shell=True),
+            mock.call().wait(),
+            mock.call('gsutil mv /mock/tmp/* gs://anvil-upload', stdout=-1, stderr=-2, shell=True),
+            mock.call().wait(),
+        ])
 
+        self.check_no_analyst_no_access(url)
+
+    def _assert_expected_gregor_files(self, mock_open):
         self.assertListEqual(
-            mock_open.call_args_list, [mock.call(f'/mock/tmp/{file}.tsv', 'w') for file in expected_files])
+            mock_open.call_args_list, [mock.call(f'/mock/tmp/{file}.tsv', 'w') for file in EXPECTED_GREGOR_FILES])
         files = [
             [row.split('\t') for row in write_call.args[0].split('\n')]
             for write_call in mock_open.return_value.__enter__.return_value.write.call_args_list
@@ -896,7 +910,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
         ], called_file)
         self.assertIn(['NA', 'Broad_NA20888_D1', 'NA', 'a6f6308866765ce8', 'NA', 'SNV', ''], called_file)
 
-        # test airtable calls
+    def _test_expected_gregor_airtable_calls(self):
         self.assertEqual(len(responses.calls), 4)
         sample_filter = "OR({CollaboratorSampleID}='HG00731',{CollaboratorSampleID}='HG00732',{CollaboratorSampleID}='HG00733'," \
                         "{CollaboratorSampleID}='NA19675_1',{CollaboratorSampleID}='NA19678',{CollaboratorSampleID}='NA19679'," \
@@ -927,16 +941,6 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
 
         self.assertEqual(responses.calls[3].request.url, MOCK_DATA_MODEL_URL)
 
-        # test gsutil commands
-        mock_subprocess.assert_has_calls([
-            mock.call('gsutil ls gs://anvil-upload', stdout=-1, stderr=-2, shell=True),
-            mock.call().wait(),
-            mock.call('gsutil mv /mock/tmp/* gs://anvil-upload', stdout=-1, stderr=-2, shell=True),
-            mock.call().wait(),
-        ])
-
-        self.check_no_analyst_no_access(url)
-
 
 class LocalReportAPITest(AuthenticationTestCase, ReportAPITest):
     fixtures = ['users', '1kg_project', 'reference_data', 'report_variants']

From a31e7ed9636da57285e448b1d241f94afd8a47ef Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Tue, 15 Aug 2023 12:42:04 -0400
Subject: [PATCH 08/19] test multiple data types

---
 seqr/fixtures/1kg_project.json      |  2 +-
 seqr/views/apis/report_api_tests.py | 98 +++++++++++++++++++++--------
 2 files changed, 73 insertions(+), 27 deletions(-)

diff --git a/seqr/fixtures/1kg_project.json b/seqr/fixtures/1kg_project.json
index 3e4d01bd96..8dabf313b5 100644
--- a/seqr/fixtures/1kg_project.json
+++ b/seqr/fixtures/1kg_project.json
@@ -698,7 +698,7 @@
         "individual_id": "NA20888",
         "mother_id": null,
         "father_id": null,
-        "sex": "M",
+        "sex": "F",
         "affected": "A",
         "display_name": "",
 	"notes": "",
diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py
index 33faafe7dc..aea400a8b4 100644
--- a/seqr/views/apis/report_api_tests.py
+++ b/seqr/views/apis/report_api_tests.py
@@ -799,9 +799,23 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
             mock.call().wait(),
         ])
 
+        # Test multiple projects with shared sample IDs
+        project = Project.objects.get(id=3)
+        project.consent_code = 'H'
+        project.save()
+
+        responses.calls.reset()
+        mock_open.reset_mock()
+        response = self.client.post(url, content_type='application/json', data=json.dumps(body))
+        self.assertEqual(response.status_code, 200)
+        expected_response['info'][0] = expected_response['info'][0].replace('9', '10')
+        self.assertDictEqual(response.json(), expected_response)
+        self._assert_expected_gregor_files(mock_open, has_second_project=True)
+        self._test_expected_gregor_airtable_calls(additional_samples=['NA20885', 'NA20889'])
+
         self.check_no_analyst_no_access(url)
 
-    def _assert_expected_gregor_files(self, mock_open):
+    def _assert_expected_gregor_files(self, mock_open, has_second_project=False):
         self.assertListEqual(
             mock_open.call_args_list, [mock.call(f'/mock/tmp/{file}.tsv', 'w') for file in EXPECTED_GREGOR_FILES])
         files = [
@@ -810,7 +824,7 @@ def _assert_expected_gregor_files(self, mock_open):
         ]
         participant_file, family_file, phenotype_file, analyte_file, experiment_file, read_file, read_set_file, called_file = files
 
-        self.assertEqual(len(participant_file), 14)
+        self.assertEqual(len(participant_file), 16 if has_second_project else 14)
         self.assertEqual(participant_file[0], [
             'participant_id', 'internal_project_id', 'gregor_center', 'consent_code', 'recontactable', 'prior_testing',
             'pmid_id', 'family_id', 'paternal_id', 'maternal_id', 'twin_id', 'proband_relationship',
@@ -828,15 +842,29 @@ def _assert_expected_gregor_files(self, mock_open):
             'Broad_HG00731', 'Broad_1kg project nme with unide', 'BROAD', 'HMB', '', '', '', 'Broad_2', 'Broad_HG00732',
             'Broad_HG00733', '', '', '', 'Female', '', '', 'Hispanic or Latino', 'Other', '', 'Affected', '', '',
         ], hispanic_row)
+        multi_data_type_row = next(r for r in participant_file if r[0] == 'Broad_NA20888')
+        self.assertListEqual([
+            'Broad_NA20888', 'Broad_Test Reprocessed Project' if has_second_project else 'Broad_1kg project nme with unide',
+            'BROAD', 'HMB', 'No', '', '', 'Broad_12' if has_second_project else 'Broad_8', '0', '0', '', '', '',
+            'Male' if has_second_project else 'Female', '', '', '', '', '', 'Affected', '', '',
+        ], multi_data_type_row)
 
-        self.assertEqual(len(family_file), 10)
+        self.assertEqual(len(family_file), 11 if has_second_project else 10)
         self.assertEqual(family_file[0], [
             'family_id', 'consanguinity', 'consanguinity_detail', 'pedigree_file', 'pedigree_file_detail',
             'family_history_detail',
         ])
         self.assertIn(['Broad_1', 'Present', '', '', '', ''], family_file)
-
-        self.assertEqual(len(phenotype_file), 10)
+        fam_8_row = ['Broad_8', 'Unknown', '', '', '', '']
+        fam_11_row = ['Broad_11', 'None suspected', '', '', '', '']
+        if has_second_project:
+            self.assertIn(fam_11_row, family_file)
+            self.assertNotIn(fam_8_row, family_file)
+        else:
+            self.assertIn(fam_8_row, family_file)
+            self.assertNotIn(fam_11_row, family_file)
+
+        self.assertEqual(len(phenotype_file), 14 if has_second_project else 10)
         self.assertEqual(phenotype_file[0], [
             'phenotype_id', 'participant_id', 'term_id', 'presence', 'ontology', 'additional_details',
             'onset_age_range', 'additional_modifiers',
@@ -848,7 +876,7 @@ def _assert_expected_gregor_files(self, mock_open):
             '', 'Broad_NA19675_1', 'HP:0001674', 'Absent', 'HPO', 'originally indicated', '', '',
         ], phenotype_file)
 
-        self.assertEqual(len(analyte_file), 14)
+        self.assertEqual(len(analyte_file), 17 if has_second_project else 14)
         self.assertEqual(analyte_file[0], [
             'analyte_id', 'participant_id', 'analyte_type', 'analyte_processing_details', 'primary_biosample',
             'primary_biosample_id', 'primary_biosample_details', 'tissue_affected_status', 'age_at_collection',
@@ -859,8 +887,15 @@ def _assert_expected_gregor_files(self, mock_open):
         self.assertListEqual(
             ['Broad_SM-AGHT', 'Broad_NA19675_1', 'DNA', '', 'UBERON:0003714', '', '', 'No', '', '', '', '', '', '', '', ''],
             row)
+        self.assertIn(
+            ['Broad_SM-L5QMP', 'Broad_NA20888', '', '', '', '', '', 'No', '', '', '', '', '', '', '', ''], analyte_file)
+        self.assertEqual(
+            ['Broad_SM-L5QMWP', 'Broad_NA20888', '', '', '', '', '', 'No', '', '', '', '', '', '', '', ''] in analyte_file,
+            has_second_project
+        )
 
-        self.assertEqual(len(experiment_file), 3)
+        num_airtable_rows = 4 if has_second_project else 3
+        self.assertEqual(len(experiment_file), num_airtable_rows)
         self.assertEqual(experiment_file[0], [
             'experiment_dna_short_read_id', 'analyte_id', 'experiment_sample_id', 'seq_library_prep_kit_method',
             'read_length', 'experiment_type', 'targeted_regions_method', 'targeted_region_bed_file',
@@ -874,8 +909,12 @@ def _assert_expected_gregor_files(self, mock_open):
             'Broad_exome_NA20888', 'Broad_SM-L5QMP', 'NA20888', 'Kapa HyperPrep', '151', 'exome',
             'Twist', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/SR_experiment.bed', '2022-06-05', '380', 'NovaSeq',
         ], experiment_file)
+        self.assertEqual([
+             'Broad_genome_NA20888_1', 'Broad_SM-L5QMWP', 'NA20888_1', 'Kapa HyperPrep w/o amplification', '200', 'genome',
+             '', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/SR_experiment.bed', '2023-03-13', '450', 'NovaSeq2',
+        ] in experiment_file, has_second_project)
 
-        self.assertEqual(len(read_file), 3)
+        self.assertEqual(len(read_file), num_airtable_rows)
         self.assertEqual(read_file[0], [
             'aligned_dna_short_read_id', 'experiment_dna_short_read_id', 'aligned_dna_short_read_file',
             'aligned_dna_short_read_index_file', 'md5sum', 'reference_assembly', 'reference_assembly_uri', 'reference_assembly_details',
@@ -890,16 +929,23 @@ def _assert_expected_gregor_files(self, mock_open):
         self.assertIn([
             'Broad_exome_NA20888_1', 'Broad_exome_NA20888',
             'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.cram',
-            'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.crai', 'a6f6308866765ce8', 'GRCh38',
-            '', '', 'BWA 0.7.15.r1140', '42.8', '', '',
+            'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.crai', 'a6f6308866765ce8', 'GRCh38', '', '',
+            'BWA 0.7.15.r1140', '42.8', '', '',
         ], read_file)
-
-        self.assertEqual(len(read_set_file), 3)
+        self.assertEqual([
+             'Broad_genome_NA20888_1_1', 'Broad_genome_NA20888_1',
+             'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888_1.cram',
+             'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888_1.crai', '2aa33e8c32020b1c', 'GRCh38', '', '',
+             'BWA 0.7.15.r1140', '36.1', '', '',
+        ] in read_file, has_second_project)
+
+        self.assertEqual(len(read_set_file), num_airtable_rows)
         self.assertEqual(read_set_file[0], ['aligned_dna_short_read_set_id', 'aligned_dna_short_read_id'])
         self.assertIn(['BCM_H7YG5DSX2', 'Broad_exome_VCGS_FAM203_621_D2_1'], read_set_file)
         self.assertIn(['Broad_NA20888_D1', 'Broad_exome_NA20888_1'], read_set_file)
+        self.assertEqual(['Broad_NA20888_D1', 'Broad_genome_NA20888_1_1'] in read_set_file, has_second_project)
 
-        self.assertEqual(len(called_file), 3)
+        self.assertEqual(len(called_file), num_airtable_rows)
         self.assertEqual(called_file[0], [
             'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum',
             'caller_software', 'variant_types', 'analysis_details',
@@ -909,23 +955,23 @@ def _assert_expected_gregor_files(self, mock_open):
             '129c28163df082', 'gatk4.1.2', 'SNV', 'DOI:10.5281/zenodo.4469317',
         ], called_file)
         self.assertIn(['NA', 'Broad_NA20888_D1', 'NA', 'a6f6308866765ce8', 'NA', 'SNV', ''], called_file)
+        self.assertEqual(
+            ['NA', 'Broad_NA20888_D1', 'NA', '2aa33e8c32020b1c', 'NA', 'SNV', ''] in called_file, has_second_project)
 
-    def _test_expected_gregor_airtable_calls(self):
+    def _test_expected_gregor_airtable_calls(self, additional_samples=None):
         self.assertEqual(len(responses.calls), 4)
-        sample_filter = "OR({CollaboratorSampleID}='HG00731',{CollaboratorSampleID}='HG00732',{CollaboratorSampleID}='HG00733'," \
-                        "{CollaboratorSampleID}='NA19675_1',{CollaboratorSampleID}='NA19678',{CollaboratorSampleID}='NA19679'," \
-                        "{CollaboratorSampleID}='NA20870',{CollaboratorSampleID}='NA20872',{CollaboratorSampleID}='NA20874'," \
-                        "{CollaboratorSampleID}='NA20875',{CollaboratorSampleID}='NA20876',{CollaboratorSampleID}='NA20881'," \
-                        "{CollaboratorSampleID}='NA20888')"
+        sample_ids = {
+             'HG00731', 'HG00732', 'HG00733', 'NA19675_1', 'NA19678', 'NA19679', 'NA20870', 'NA20872', 'NA20874',
+             'NA20875', 'NA20876', 'NA20881', 'NA20888',
+        }
+        sample_ids.update(additional_samples or [])
+        sample_filter = ','.join([f"{{CollaboratorSampleID}}='{sample_id}'" for sample_id in sorted(sample_ids)])
         sample_fields = ['CollaboratorSampleID', 'SMID', 'CollaboratorParticipantID', 'Recontactable']
-        self._assert_expected_airtable_call(0, sample_filter, sample_fields)
-        secondary_sample_filter = "OR({SeqrCollaboratorSampleID}='HG00731',{SeqrCollaboratorSampleID}='HG00732'," \
-                        "{SeqrCollaboratorSampleID}='HG00733',{SeqrCollaboratorSampleID}='NA19678'," \
-                        "{SeqrCollaboratorSampleID}='NA19679',{SeqrCollaboratorSampleID}='NA20870',{SeqrCollaboratorSampleID}='NA20872'," \
-                        "{SeqrCollaboratorSampleID}='NA20874',{SeqrCollaboratorSampleID}='NA20875',{SeqrCollaboratorSampleID}='NA20876'," \
-                        "{SeqrCollaboratorSampleID}='NA20881')"
+        self._assert_expected_airtable_call(0, f"OR({sample_filter})", sample_fields)
+        sample_ids -= {'NA19675_1', 'NA20888'}
+        secondary_sample_filter = ','.join([f"{{SeqrCollaboratorSampleID}}='{sample_id}'" for sample_id in sorted(sample_ids)])
         sample_fields[0] = 'SeqrCollaboratorSampleID'
-        self._assert_expected_airtable_call(1, secondary_sample_filter, sample_fields)
+        self._assert_expected_airtable_call(1, f"OR({secondary_sample_filter})", sample_fields)
         metadata_fields = [
             'CollaboratorParticipantID', 'CollaboratorSampleID_wes', 'CollaboratorSampleID_wgs', 'SMID_wes', 'SMID_wgs',
             'aligned_dna_short_read_file_wes', 'aligned_dna_short_read_file_wgs', 'aligned_dna_short_read_index_file_wes',

From b01440a73410039a170658c64d54f13a79e73b31 Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Tue, 15 Aug 2023 12:48:23 -0400
Subject: [PATCH 09/19] clean up

---
 seqr/views/apis/report_api_tests.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py
index aea400a8b4..15abc03bc3 100644
--- a/seqr/views/apis/report_api_tests.py
+++ b/seqr/views/apis/report_api_tests.py
@@ -696,9 +696,7 @@ def test_sample_metadata_export(self, mock_google_authenticated):
             'NA19679', 'NA20870', 'HG00732', 'NA20876', 'NA20874', 'NA20875', 'NA19678', 'NA19675', 'HG00731',
             'NA20872', 'NA20881', 'HG00733',
         })
-        if self.ADDITIONAL_SAMPLES:
-            expected_samples.remove('NA20888')  # TODO investigate
-            expected_samples.update(self.ADDITIONAL_SAMPLES)
+        expected_samples.update(self.ADDITIONAL_SAMPLES)
         self.assertSetEqual({r['sample_id'] for r in response_json['rows']}, expected_samples)
         test_row = next(r for r in response_json['rows'] if r['sample_id'] == 'NA20889')
         self.assertDictEqual(EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW, test_row)

From f18f909b6787e8a98fe7091a15875db7d156867b Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Tue, 15 Aug 2023 15:27:35 -0400
Subject: [PATCH 10/19] add conditional validation

---
 seqr/views/apis/report_api.py       | 26 +++++++++++++++++++++++---
 seqr/views/apis/report_api_tests.py |  8 +++++++-
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py
index d0d4f812d6..7105f62bcb 100644
--- a/seqr/views/apis/report_api.py
+++ b/seqr/views/apis/report_api.py
@@ -8,6 +8,7 @@
 from django.db.models.functions import Replace, JSONObject
 from django.utils import timezone
 import json
+import re
 import requests
 
 from seqr.utils.file_utils import is_google_bucket_file_path, does_file_exist
@@ -999,9 +1000,12 @@ def _get_validated_gregor_files(file_data):
     except Exception as e:
         warnings.append(f'Unable to load data model for validation: {e}')
         validators = {}
-        required_tables = set()
+        required_tables = {}
 
-    missing_tables = required_tables.difference({f[0] for f in file_data})
+    tables = {f[0] for f in file_data}
+    missing_tables = [
+        table for table, validator in required_tables.items() if not _has_required_table(table, validator, tables)
+    ]
     if missing_tables:
         warnings.append(
             f'The following tables are required in the data model but absent from the reports: {", ".join(missing_tables)}'
@@ -1050,10 +1054,26 @@ def _load_data_model_validators():
         t['table']: {c['column']: c for c in t['columns']}
         for t in table_models
     }
-    required_tables = {t['table'] for t in table_models if t.get('required')}
+    required_tables = {t['table']: _parse_table_required(t['required']) for t in table_models if t.get('required')}
     return validators, required_tables
 
 
+def _parse_table_required(required_validator):
+    if required_validator is True:
+        return True
+
+    match = re.match(r'CONDITIONAL \(([\w+(\s,)?]+)\)', required_validator)
+    return match and match.group(1).split(', ')
+
+
+def _has_required_table(table, validator, tables):
+    if table in tables:
+        return True
+    if validator is True:
+        return False
+    return tables.isdisjoint(validator)
+
+
 def _validate_column_data(column, file_name, data, column_validator, warnings, errors):
     enum = column_validator.get('enumerations')
     required = column_validator.get('required')
diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py
index b071cdaddb..78f7fafbb3 100644
--- a/seqr/views/apis/report_api_tests.py
+++ b/seqr/views/apis/report_api_tests.py
@@ -363,6 +363,7 @@
         },
         {
             'table': 'aligned_dna_short_read',
+            'required': 'CONDITIONAL (aligned_dna_short_read_set, called_variants_dna_short_read)',
             'columns': [
                 {'column': 'aligned_dna_short_read_id', 'required': True},
                 {'column': 'experiment_dna_short_read_id', 'required': True},
@@ -389,6 +390,11 @@
             'table': 'dna_read_data',
             'columns': [{'column': 'analyte_id', 'required': True}],
         },
+        {
+            'table': 'dna_read_data_set',
+            'required': 'CONDITIONAL (aligned_dna_short_read_set, dna_read_data)',
+            'columns': [{'column': 'analyte_id', 'required': True}],
+        },
     ]
 }
 
@@ -761,7 +767,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
             f'No data model found for "{file}" table so no validation was performed' for file in EXPECTED_GREGOR_FILES
         ]
         self.assertListEqual(response.json()['warnings'], [
-            'The following tables are required in the data model but absent from the reports: subject',
+            'The following tables are required in the data model but absent from the reports: subject, dna_read_data_set',
             'The following columns are included in the "participant" table but are missing from the data model: age_at_last_observation, ancestry_detail, pmid_id, proband_relationship_detail, sex_detail, twin_id',
             'The following columns are included in the "participant" data model but are missing in the report: ancestry_metadata',
             'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881',

From faa522363388f1b95823995f16b2137e9464d156 Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Tue, 15 Aug 2023 15:53:55 -0400
Subject: [PATCH 11/19] add rna columns

---
 seqr/views/apis/report_api.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py
index 7105f62bcb..71d71347c7 100644
--- a/seqr/views/apis/report_api.py
+++ b/seqr/views/apis/report_api.py
@@ -688,6 +688,12 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False):
 EXPERIMENT_TABLE_COLUMNS = [
     'experiment_dna_short_read_id', 'analyte_id', 'experiment_sample_id',
 ] + EXPERIMENT_TABLE_AIRTABLE_FIELDS
+EXPERIMENT_RNA_TABLE_COLUMNS = ['experiment_rna_short_read_id'] + [
+    c for c in EXPERIMENT_TABLE_COLUMNS[1:] if not c.startswith('target')] + [
+    'single_or_paired_ends', 'within_site_batch_name', 'RIN', 'estimated_library_size', 'total_reads', 'percent_rRNA',
+    'percent_mRNA', 'percent_mtRNA', 'percent_Globin', 'percent_UMI', '5prime3prime_bias', 'percent_GC', 'percent_chrX_Y',
+]
+EXPERIMENT_RNA_TABLE_COLUMNS.insert(4, 'library_prep_type')
 READ_TABLE_AIRTABLE_FIELDS = [
     'aligned_dna_short_read_file', 'aligned_dna_short_read_index_file', 'md5sum', 'reference_assembly',
     'alignment_software', 'mean_coverage', 'analysis_details',
@@ -695,6 +701,14 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False):
 READ_TABLE_COLUMNS = ['aligned_dna_short_read_id', 'experiment_dna_short_read_id'] + READ_TABLE_AIRTABLE_FIELDS + ['quality_issues']
 READ_TABLE_COLUMNS.insert(6, 'reference_assembly_details')
 READ_TABLE_COLUMNS.insert(6, 'reference_assembly_uri')
+READ_RNA_TABLE_COLUMNS = [
+    'aligned_rna_short_read_id', 'experiment_rna_short_read_id', 'aligned_rna_short_read_file',
+    'aligned_rna_short_read_index_file',
+] + READ_TABLE_COLUMNS[4:-2] + ['percent_uniquely_aligned', 'percent_multimapped', 'percent_unaligned', 'quality_issues']
+READ_RNA_TABLE_COLUMNS.insert(6, 'gene_annotation_details')
+READ_RNA_TABLE_COLUMNS.insert(6, 'gene_annotation')
+READ_RNA_TABLE_COLUMNS.insert(7, 'alignment_postprocessing')
+READ_RNA_TABLE_COLUMNS.insert(7, 'alignment_log_file')
 READ_SET_TABLE_COLUMNS = ['aligned_dna_short_read_set_id', 'aligned_dna_short_read_id']
 CALLED_TABLE_COLUMNS = [
     'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum',
@@ -726,6 +740,8 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False):
     'aligned_dna_short_read': READ_TABLE_COLUMNS,
     'aligned_dna_short_read_set': READ_SET_TABLE_COLUMNS,
     'called_variants_dna_short_read': CALLED_TABLE_COLUMNS,
+    'experiment_rna_short_read': EXPERIMENT_RNA_TABLE_COLUMNS,
+    'aligned_rna_short_read': READ_RNA_TABLE_COLUMNS,
 }
 WARN_MISSING_TABLE_COLUMNS = {
     'participant': ['recontactable',  'reported_race', 'affected_status', 'phenotype_description', 'age_at_enrollment'],
@@ -892,6 +908,8 @@ def gregor_export(request):
         ('aligned_dna_short_read', airtable_rows),
         ('aligned_dna_short_read_set', airtable_rows),
         ('called_variants_dna_short_read', airtable_rows),
+        ('aligned_rna_short_read', airtable_rows),
+        ('experiment_rna_short_read', airtable_rows),
     ])
     write_multiple_files_to_gs(files, file_path, request.user, file_format='tsv')
 

From 3d0516b9d8de3590894d7faced1b8433b549905c Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Tue, 15 Aug 2023 16:43:29 -0400
Subject: [PATCH 12/19] add rna airtable data

---
 seqr/views/apis/report_api.py | 75 +++++++++++++++++++++--------------
 1 file changed, 45 insertions(+), 30 deletions(-)

diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py
index 71d71347c7..638408e379 100644
--- a/seqr/views/apis/report_api.py
+++ b/seqr/views/apis/report_api.py
@@ -658,7 +658,7 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False):
 
 # GREGoR metadata
 
-GREGOR_DATA_TYPES = ['wes', 'wgs']
+GREGOR_DATA_TYPES = ['wgs', 'wes', 'rna']
 SMID_FIELD = 'SMID'
 PARTICIPANT_ID_FIELD = 'CollaboratorParticipantID'
 COLLABORATOR_SAMPLE_ID_FIELD = 'CollaboratorSampleID'
@@ -688,12 +688,14 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False):
 EXPERIMENT_TABLE_COLUMNS = [
     'experiment_dna_short_read_id', 'analyte_id', 'experiment_sample_id',
 ] + EXPERIMENT_TABLE_AIRTABLE_FIELDS
+EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS = [
+    'library_prep_type', 'single_or_paired_ends', 'within_site_batch_name', 'RIN', 'estimated_library_size',
+    'total_reads', 'percent_rRNA', 'percent_mRNA', '5prime3prime_bias',
+]
 EXPERIMENT_RNA_TABLE_COLUMNS = ['experiment_rna_short_read_id'] + [
-    c for c in EXPERIMENT_TABLE_COLUMNS[1:] if not c.startswith('target')] + [
-    'single_or_paired_ends', 'within_site_batch_name', 'RIN', 'estimated_library_size', 'total_reads', 'percent_rRNA',
-    'percent_mRNA', 'percent_mtRNA', 'percent_Globin', 'percent_UMI', '5prime3prime_bias', 'percent_GC', 'percent_chrX_Y',
+    c for c in EXPERIMENT_TABLE_COLUMNS[1:] if not c.startswith('target')] + EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS + [
+    'percent_mtRNA', 'percent_Globin', 'percent_UMI',  'percent_GC', 'percent_chrX_Y',
 ]
-EXPERIMENT_RNA_TABLE_COLUMNS.insert(4, 'library_prep_type')
 READ_TABLE_AIRTABLE_FIELDS = [
     'aligned_dna_short_read_file', 'aligned_dna_short_read_index_file', 'md5sum', 'reference_assembly',
     'alignment_software', 'mean_coverage', 'analysis_details',
@@ -701,34 +703,42 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False):
 READ_TABLE_COLUMNS = ['aligned_dna_short_read_id', 'experiment_dna_short_read_id'] + READ_TABLE_AIRTABLE_FIELDS + ['quality_issues']
 READ_TABLE_COLUMNS.insert(6, 'reference_assembly_details')
 READ_TABLE_COLUMNS.insert(6, 'reference_assembly_uri')
-READ_RNA_TABLE_COLUMNS = [
-    'aligned_rna_short_read_id', 'experiment_rna_short_read_id', 'aligned_rna_short_read_file',
-    'aligned_rna_short_read_index_file',
-] + READ_TABLE_COLUMNS[4:-2] + ['percent_uniquely_aligned', 'percent_multimapped', 'percent_unaligned', 'quality_issues']
-READ_RNA_TABLE_COLUMNS.insert(6, 'gene_annotation_details')
-READ_RNA_TABLE_COLUMNS.insert(6, 'gene_annotation')
+READ_RNA_TABLE_AIRTABLE_ID_FIELDS = ['aligned_rna_short_read_file', 'aligned_rna_short_read_index_file']
+READ_RNA_TABLE_AIRTABLE_FIELDS = [
+    'gene_annotation', 'alignment_log_file', 'percent_uniquely_aligned', 'percent_multimapped', 'percent_unaligned',
+]
+READ_RNA_TABLE_COLUMNS = ['aligned_rna_short_read_id', 'experiment_rna_short_read_id'] + \
+    READ_RNA_TABLE_AIRTABLE_ID_FIELDS + READ_TABLE_COLUMNS[4:-2] + READ_RNA_TABLE_AIRTABLE_FIELDS + ['quality_issues']
+READ_RNA_TABLE_COLUMNS.insert(6, 'gene_annotation_details')  # TODO placement
 READ_RNA_TABLE_COLUMNS.insert(7, 'alignment_postprocessing')
-READ_RNA_TABLE_COLUMNS.insert(7, 'alignment_log_file')
 READ_SET_TABLE_COLUMNS = ['aligned_dna_short_read_set_id', 'aligned_dna_short_read_id']
 CALLED_TABLE_COLUMNS = [
     'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum',
     'caller_software', 'variant_types', 'analysis_details',
 ]
 
-DATA_TYPE_OMIT = {'wgs': ['targeted_regions_method'], 'wes': []}
-MAPPED_AIRTABLE_FIELDS = {'alignment_software': 'alignment_software_dna'}
-NO_DATA_TYPE_FIELDS = {'targeted_region_bed_file', 'reference_assembly', 'analysis_details'}
+RNA_ONLY = EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS + READ_RNA_TABLE_AIRTABLE_FIELDS + [
+    'reference_assembly_uri', 'alignment_software']
+DATA_TYPE_OMIT = {
+    'wgs': ['targeted_regions_method'] + RNA_ONLY, 'wes': RNA_ONLY, 'rna': [
+        'targeted_regions_method', 'target_insert_size', 'mean_coverage', 'aligned_dna_short_read_file',
+        'aligned_dna_short_read_index_file',
+    ],
+}
+NO_DATA_TYPE_FIELDS = {
+    'targeted_region_bed_file', 'reference_assembly', 'analysis_details', 'percent_rRNA', 'percent_mRNA',
+    'alignment_software_dna',
+}
+NO_DATA_TYPE_FIELDS.update(READ_RNA_TABLE_AIRTABLE_ID_FIELDS)
 
-DATA_TYPE_AIRTABLE_COLUMNS = EXPERIMENT_TABLE_AIRTABLE_FIELDS + READ_TABLE_AIRTABLE_FIELDS + [
+DATA_TYPE_AIRTABLE_COLUMNS = EXPERIMENT_TABLE_AIRTABLE_FIELDS + READ_TABLE_AIRTABLE_FIELDS + RNA_ONLY + [
     COLLABORATOR_SAMPLE_ID_FIELD, SMID_FIELD]
 ALL_AIRTABLE_COLUMNS = DATA_TYPE_AIRTABLE_COLUMNS + CALLED_TABLE_COLUMNS
 AIRTABLE_QUERY_COLUMNS = set(CALLED_TABLE_COLUMNS)
 AIRTABLE_QUERY_COLUMNS.remove('md5sum')
 AIRTABLE_QUERY_COLUMNS.update(NO_DATA_TYPE_FIELDS)
-AIRTABLE_QUERY_COLUMNS.update(MAPPED_AIRTABLE_FIELDS.values())
 for data_type in GREGOR_DATA_TYPES:
-    data_type_columns = set(DATA_TYPE_AIRTABLE_COLUMNS) - NO_DATA_TYPE_FIELDS - set(
-        MAPPED_AIRTABLE_FIELDS.keys()) - set(DATA_TYPE_OMIT[data_type])
+    data_type_columns = set(DATA_TYPE_AIRTABLE_COLUMNS) - NO_DATA_TYPE_FIELDS - set(DATA_TYPE_OMIT[data_type])
     AIRTABLE_QUERY_COLUMNS.update({f'{field}_{data_type}' for field in data_type_columns})
 
 TABLE_COLUMNS = {
@@ -826,7 +836,7 @@ def gregor_export(request):
         consent_code=consent_code[0],
         projectcategory__name='GREGoR',
     )
-    sample_types = get_search_samples(projects, active_only=False).values_list('individual_id', 'sample_type')
+    sample_types = Sample.objects.filter(individual__family__project__in=projects).values_list('individual_id', 'sample_type')
     individual_data_types = defaultdict(set)
     for individual_db_id, sample_type in sample_types:
         individual_data_types[individual_db_id].add(sample_type)
@@ -845,10 +855,12 @@ def gregor_export(request):
     phenotype_rows = []
     analyte_rows = []
     airtable_rows = []
+    airtable_rna_rows = []
     for data_type_individuals in grouped_data_type_individuals.values():
         # If multiple individual records, prefer WGS
         individual = next(
-            data_type_individuals[data_type] for data_type in ['WGS', 'WES'] if data_type_individuals.get(data_type)
+            data_type_individuals[data_type.upper()] for data_type in GREGOR_DATA_TYPES
+            if data_type_individuals.get(data_type.upper())
         )
 
         # family table
@@ -887,11 +899,14 @@ def gregor_export(request):
             for data_type in data_type_individuals:
                 data_type_metadata = airtable_metadata.get(data_type)
                 if data_type_metadata:
-                    experiment_ids = _get_experiment_ids(data_type_metadata)
+                    is_rna = data_type == 'RNA'
+                    experiment_ids = _get_experiment_ids(data_type_metadata, is_rna)
                     analyte_ids.add(experiment_ids['analyte_id'])
                     row = {**airtable_metadata, **data_type_metadata, **experiment_ids}
-                    row.update({k: row[v] for k, v in MAPPED_AIRTABLE_FIELDS.items()})
-                    airtable_rows.append(row)
+                    if not is_rna:
+                        row['alignment_software'] = row['alignment_software_dna']
+                    rows = airtable_rna_rows if is_rna else airtable_rows
+                    rows.append(row)
 
         # analyte table
         if not analyte_ids:
@@ -908,8 +923,8 @@ def gregor_export(request):
         ('aligned_dna_short_read', airtable_rows),
         ('aligned_dna_short_read_set', airtable_rows),
         ('called_variants_dna_short_read', airtable_rows),
-        ('aligned_rna_short_read', airtable_rows),
-        ('experiment_rna_short_read', airtable_rows),
+        ('aligned_rna_short_read', airtable_rna_rows),
+        ('experiment_rna_short_read', airtable_rna_rows),
     ])
     write_multiple_files_to_gs(files, file_path, request.user, file_format='tsv')
 
@@ -994,14 +1009,14 @@ def _get_analyte_row(individual):
     }
 
 
-def _get_experiment_ids(data_type_metadata):
+def _get_experiment_ids(data_type_metadata, is_rna):
     collaborator_sample_id = data_type_metadata[COLLABORATOR_SAMPLE_ID_FIELD]
-    experiment_dna_short_read_id = f'Broad_{data_type_metadata.get("experiment_type", "NA")}_{collaborator_sample_id}'
+    experiment_short_read_id = f'Broad_{data_type_metadata.get("experiment_type", "NA")}_{collaborator_sample_id}'
     return {
         'analyte_id': _get_analyte_id(data_type_metadata),
-        'experiment_dna_short_read_id': experiment_dna_short_read_id,
+        f'experiment_{"rna" if is_rna else "dna"}_short_read_id': experiment_short_read_id,
         'experiment_sample_id': collaborator_sample_id,
-        'aligned_dna_short_read_id': f'{experiment_dna_short_read_id}_1'
+        f'aligned_{"rna" if is_rna else "dna"}_short_read_id': f'{experiment_short_read_id}_1'
     }
 
 

From 3fb1aeb5c211f61e5e76ac064c6ee1b182158741 Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Tue, 15 Aug 2023 16:53:08 -0400
Subject: [PATCH 13/19] clean up column order

---
 seqr/views/apis/report_api.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py
index 638408e379..988a1ac047 100644
--- a/seqr/views/apis/report_api.py
+++ b/seqr/views/apis/report_api.py
@@ -698,27 +698,26 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False):
 ]
 READ_TABLE_AIRTABLE_FIELDS = [
     'aligned_dna_short_read_file', 'aligned_dna_short_read_index_file', 'md5sum', 'reference_assembly',
-    'alignment_software', 'mean_coverage', 'analysis_details',
+    'mean_coverage', 'alignment_software', 'analysis_details',
 ]
 READ_TABLE_COLUMNS = ['aligned_dna_short_read_id', 'experiment_dna_short_read_id'] + READ_TABLE_AIRTABLE_FIELDS + ['quality_issues']
 READ_TABLE_COLUMNS.insert(6, 'reference_assembly_details')
 READ_TABLE_COLUMNS.insert(6, 'reference_assembly_uri')
 READ_RNA_TABLE_AIRTABLE_ID_FIELDS = ['aligned_rna_short_read_file', 'aligned_rna_short_read_index_file']
 READ_RNA_TABLE_AIRTABLE_FIELDS = [
-    'gene_annotation', 'alignment_log_file', 'percent_uniquely_aligned', 'percent_multimapped', 'percent_unaligned',
+    'gene_annotation', 'alignment_software', 'alignment_log_file', 'percent_uniquely_aligned', 'percent_multimapped', 'percent_unaligned',
 ]
 READ_RNA_TABLE_COLUMNS = ['aligned_rna_short_read_id', 'experiment_rna_short_read_id'] + \
-    READ_RNA_TABLE_AIRTABLE_ID_FIELDS + READ_TABLE_COLUMNS[4:-2] + READ_RNA_TABLE_AIRTABLE_FIELDS + ['quality_issues']
-READ_RNA_TABLE_COLUMNS.insert(6, 'gene_annotation_details')  # TODO placement
-READ_RNA_TABLE_COLUMNS.insert(7, 'alignment_postprocessing')
+    READ_RNA_TABLE_AIRTABLE_ID_FIELDS + READ_TABLE_COLUMNS[4:-3] + READ_RNA_TABLE_AIRTABLE_FIELDS + ['quality_issues']
+READ_RNA_TABLE_COLUMNS.insert(10, 'gene_annotation_details')
+READ_RNA_TABLE_COLUMNS.insert(13, 'alignment_postprocessing')
 READ_SET_TABLE_COLUMNS = ['aligned_dna_short_read_set_id', 'aligned_dna_short_read_id']
 CALLED_TABLE_COLUMNS = [
     'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum',
     'caller_software', 'variant_types', 'analysis_details',
 ]
 
-RNA_ONLY = EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS + READ_RNA_TABLE_AIRTABLE_FIELDS + [
-    'reference_assembly_uri', 'alignment_software']
+RNA_ONLY = EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS + READ_RNA_TABLE_AIRTABLE_FIELDS + ['reference_assembly_uri']
 DATA_TYPE_OMIT = {
     'wgs': ['targeted_regions_method'] + RNA_ONLY, 'wes': RNA_ONLY, 'rna': [
         'targeted_regions_method', 'target_insert_size', 'mean_coverage', 'aligned_dna_short_read_file',

From a30987e19a4edce9d7bbf0a33e85a43e7f33ed3a Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Tue, 15 Aug 2023 17:22:52 -0400
Subject: [PATCH 14/19] get tests passing

---
 seqr/views/apis/report_api.py       |  4 ++--
 seqr/views/apis/report_api_tests.py | 37 ++++++++++++++++++-----------
 2 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py
index 988a1ac047..37ff1e3d80 100644
--- a/seqr/views/apis/report_api.py
+++ b/seqr/views/apis/report_api.py
@@ -751,7 +751,7 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False):
     'called_variants_dna_short_read': CALLED_TABLE_COLUMNS,
     'experiment_rna_short_read': EXPERIMENT_RNA_TABLE_COLUMNS,
     'aligned_rna_short_read': READ_RNA_TABLE_COLUMNS,
-}
+}  # TODO add experiment table
 WARN_MISSING_TABLE_COLUMNS = {
     'participant': ['recontactable',  'reported_race', 'affected_status', 'phenotype_description', 'age_at_enrollment'],
 }
@@ -922,8 +922,8 @@ def gregor_export(request):
         ('aligned_dna_short_read', airtable_rows),
         ('aligned_dna_short_read_set', airtable_rows),
         ('called_variants_dna_short_read', airtable_rows),
-        ('aligned_rna_short_read', airtable_rna_rows),
         ('experiment_rna_short_read', airtable_rna_rows),
+        ('aligned_rna_short_read', airtable_rna_rows),
     ])
     write_multiple_files_to_gs(files, file_path, request.user, file_format='tsv')
 
diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py
index 78f7fafbb3..458334d7ab 100644
--- a/seqr/views/apis/report_api_tests.py
+++ b/seqr/views/apis/report_api_tests.py
@@ -265,6 +265,7 @@
 EXPECTED_GREGOR_FILES = [
     'participant', 'family', 'phenotype', 'analyte', 'experiment_dna_short_read',
     'aligned_dna_short_read', 'aligned_dna_short_read_set', 'called_variants_dna_short_read',
+    'experiment_rna_short_read', 'aligned_rna_short_read',
 ]
 
 EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW = {
@@ -828,7 +829,8 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False):
             [row.split('\t') for row in write_call.args[0].split('\n')]
             for write_call in mock_open.return_value.__enter__.return_value.write.call_args_list
         ]
-        participant_file, family_file, phenotype_file, analyte_file, experiment_file, read_file, read_set_file, called_file = files
+        participant_file, family_file, phenotype_file, analyte_file, experiment_file, read_file, read_set_file, \
+        called_file, experiment_rna_file, aligned_rna_file = files
 
         self.assertEqual(len(participant_file), 16 if has_second_project else 14)
         self.assertEqual(participant_file[0], [
@@ -924,25 +926,25 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False):
         self.assertEqual(read_file[0], [
             'aligned_dna_short_read_id', 'experiment_dna_short_read_id', 'aligned_dna_short_read_file',
             'aligned_dna_short_read_index_file', 'md5sum', 'reference_assembly', 'reference_assembly_uri', 'reference_assembly_details',
-            'alignment_software', 'mean_coverage', 'analysis_details',  'quality_issues',
+            'mean_coverage', 'alignment_software', 'analysis_details',  'quality_issues',
         ])
         self.assertIn([
             'Broad_exome_VCGS_FAM203_621_D2_1', 'Broad_exome_VCGS_FAM203_621_D2',
             'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.cram',
             'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.crai', '129c28163df082', 'GRCh38',
-            '', '', 'BWA-MEM-2.3', '', 'DOI:10.5281/zenodo.4469317', '',
+            '', '', '', 'BWA-MEM-2.3', 'DOI:10.5281/zenodo.4469317', '',
         ], read_file)
         self.assertIn([
             'Broad_exome_NA20888_1', 'Broad_exome_NA20888',
             'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.cram',
             'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.crai', 'a6f6308866765ce8', 'GRCh38', '', '',
-            'BWA 0.7.15.r1140', '42.8', '', '',
+            '42.8', 'BWA 0.7.15.r1140', '', '',
         ], read_file)
         self.assertEqual([
              'Broad_genome_NA20888_1_1', 'Broad_genome_NA20888_1',
              'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888_1.cram',
              'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888_1.crai', '2aa33e8c32020b1c', 'GRCh38', '', '',
-             'BWA 0.7.15.r1140', '36.1', '', '',
+             '36.1', 'BWA 0.7.15.r1140', '', '',
         ] in read_file, has_second_project)
 
         self.assertEqual(len(read_set_file), num_airtable_rows)
@@ -979,15 +981,22 @@ def _test_expected_gregor_airtable_calls(self, additional_samples=None):
         sample_fields[0] = 'SeqrCollaboratorSampleID'
         self._assert_expected_airtable_call(1, f"OR({secondary_sample_filter})", sample_fields)
         metadata_fields = [
-            'CollaboratorParticipantID', 'CollaboratorSampleID_wes', 'CollaboratorSampleID_wgs', 'SMID_wes', 'SMID_wgs',
-            'aligned_dna_short_read_file_wes', 'aligned_dna_short_read_file_wgs', 'aligned_dna_short_read_index_file_wes',
-            'aligned_dna_short_read_index_file_wgs', 'aligned_dna_short_read_set_id', 'alignment_software_dna',
-            'analysis_details', 'called_variants_dna_file', 'called_variants_dna_short_read_id', 'caller_software',
-            'date_data_generation_wes', 'date_data_generation_wgs', 'experiment_type_wes', 'experiment_type_wgs',
-            'md5sum_wes', 'md5sum_wgs', 'mean_coverage_wes', 'mean_coverage_wgs', 'read_length_wes', 'read_length_wgs',
-            'reference_assembly', 'seq_library_prep_kit_method_wes', 'seq_library_prep_kit_method_wgs',
-            'sequencing_platform_wes', 'sequencing_platform_wgs', 'target_insert_size_wes', 'target_insert_size_wgs',
-            'targeted_region_bed_file', 'targeted_regions_method_wes', 'variant_types',
+            'CollaboratorParticipantID', '5prime3prime_bias_rna', 'CollaboratorSampleID_rna', 'CollaboratorSampleID_wes',
+            'CollaboratorSampleID_wgs', 'RIN_rna', 'SMID_rna', 'SMID_wes', 'SMID_wgs', 'aligned_dna_short_read_file_wes',
+            'aligned_dna_short_read_file_wgs', 'aligned_dna_short_read_index_file_wes',
+            'aligned_dna_short_read_index_file_wgs', 'aligned_dna_short_read_set_id',
+            'aligned_rna_short_read_file', 'aligned_rna_short_read_index_file', 'alignment_log_file_rna',
+            'alignment_software_dna', 'alignment_software_rna', 'analysis_details', 'called_variants_dna_file',
+            'called_variants_dna_short_read_id', 'caller_software', 'date_data_generation_rna', 'date_data_generation_wes',
+            'date_data_generation_wgs', 'estimated_library_size_rna', 'experiment_type_rna', 'experiment_type_wes',
+            'experiment_type_wgs', 'gene_annotation_rna', 'library_prep_type_rna', 'md5sum_rna', 'md5sum_wes',
+            'md5sum_wgs', 'mean_coverage_wes', 'mean_coverage_wgs', 'percent_mRNA', 'percent_multimapped_rna',
+            'percent_rRNA', 'percent_unaligned_rna', 'percent_uniquely_aligned_rna', 'read_length_rna', 'read_length_wes',
+            'read_length_wgs', 'reference_assembly', 'reference_assembly_uri_rna', 'seq_library_prep_kit_method_rna',
+            'seq_library_prep_kit_method_wes', 'seq_library_prep_kit_method_wgs', 'sequencing_platform_rna',
+            'sequencing_platform_wes', 'sequencing_platform_wgs', 'single_or_paired_ends_rna', 'target_insert_size_wes',
+            'target_insert_size_wgs', 'targeted_region_bed_file', 'targeted_regions_method_wes', 'total_reads_rna',
+            'variant_types', 'within_site_batch_name_rna',
         ]
         self._assert_expected_airtable_call(2, "OR(CollaboratorParticipantID='NA19675',CollaboratorParticipantID='NA20888',CollaboratorParticipantID='VCGS_FAM203_621')", metadata_fields)
 

From dd5a245fef08ce9df57565c4adf98f8200302b5b Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Wed, 16 Aug 2023 10:13:09 -0400
Subject: [PATCH 15/19] clean up gregor file config

---
 seqr/views/apis/report_api.py | 48 +++++++++++++----------------------
 1 file changed, 17 insertions(+), 31 deletions(-)

diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py
index 37ff1e3d80..566915d851 100644
--- a/seqr/views/apis/report_api.py
+++ b/seqr/views/apis/report_api.py
@@ -740,18 +740,7 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False):
     data_type_columns = set(DATA_TYPE_AIRTABLE_COLUMNS) - NO_DATA_TYPE_FIELDS - set(DATA_TYPE_OMIT[data_type])
     AIRTABLE_QUERY_COLUMNS.update({f'{field}_{data_type}' for field in data_type_columns})
 
-TABLE_COLUMNS = {
-    'participant': PARTICIPANT_TABLE_COLUMNS,
-    'family': GREGOR_FAMILY_TABLE_COLUMNS,
-    'phenotype': PHENOTYPE_TABLE_COLUMNS,
-    'analyte': ANALYTE_TABLE_COLUMNS,
-    'experiment_dna_short_read': EXPERIMENT_TABLE_COLUMNS,
-    'aligned_dna_short_read': READ_TABLE_COLUMNS,
-    'aligned_dna_short_read_set': READ_SET_TABLE_COLUMNS,
-    'called_variants_dna_short_read': CALLED_TABLE_COLUMNS,
-    'experiment_rna_short_read': EXPERIMENT_RNA_TABLE_COLUMNS,
-    'aligned_rna_short_read': READ_RNA_TABLE_COLUMNS,
-}  # TODO add experiment table
+# 'experiment': ['experiment_id', 'table_name', 'id_in_table', 'participant_id'], # TODO
 WARN_MISSING_TABLE_COLUMNS = {
     'participant': ['recontactable',  'reported_race', 'affected_status', 'phenotype_description', 'age_at_enrollment'],
 }
@@ -913,18 +902,19 @@ def gregor_export(request):
         for analyte_id in analyte_ids:
             analyte_rows.append(dict(participant_id=participant_id, analyte_id=analyte_id, **_get_analyte_row(individual)))
 
-    files, warnings = _get_validated_gregor_files([
-        ('participant', participant_rows),
-        ('family', list(family_map.values())),
-        ('phenotype', phenotype_rows),
-        ('analyte', analyte_rows),
-        ('experiment_dna_short_read', airtable_rows),
-        ('aligned_dna_short_read', airtable_rows),
-        ('aligned_dna_short_read_set', airtable_rows),
-        ('called_variants_dna_short_read', airtable_rows),
-        ('experiment_rna_short_read', airtable_rna_rows),
-        ('aligned_rna_short_read', airtable_rna_rows),
-    ])
+    files = [
+        ('participant', PARTICIPANT_TABLE_COLUMNS, participant_rows),
+        ('family', GREGOR_FAMILY_TABLE_COLUMNS, list(family_map.values())),
+        ('phenotype', PHENOTYPE_TABLE_COLUMNS, phenotype_rows),
+        ('analyte', ANALYTE_TABLE_COLUMNS, analyte_rows),
+        ('experiment_dna_short_read', EXPERIMENT_TABLE_COLUMNS, airtable_rows),
+        ('aligned_dna_short_read', READ_TABLE_COLUMNS, airtable_rows),
+        ('aligned_dna_short_read_set', READ_SET_TABLE_COLUMNS, airtable_rows),
+        ('called_variants_dna_short_read', CALLED_TABLE_COLUMNS, airtable_rows),
+        ('experiment_rna_short_read', EXPERIMENT_RNA_TABLE_COLUMNS, airtable_rna_rows),
+        ('aligned_rna_short_read', READ_RNA_TABLE_COLUMNS, airtable_rna_rows),
+    ]
+    warnings = _validate_gregor_files(files)
     write_multiple_files_to_gs(files, file_path, request.user, file_format='tsv')
 
     return create_json_response({
@@ -1024,7 +1014,7 @@ def _get_analyte_id(airtable_metadata):
     return f'Broad_{sm_id}' if sm_id else None
 
 
-def _get_validated_gregor_files(file_data):
+def _validate_gregor_files(file_data):
     errors = []
     warnings = []
     try:
@@ -1043,11 +1033,7 @@ def _get_validated_gregor_files(file_data):
             f'The following tables are required in the data model but absent from the reports: {", ".join(missing_tables)}'
         )
 
-    files = []
-    for file_name, data in file_data:
-        columns = TABLE_COLUMNS[file_name]
-        files.append([file_name, columns, data])
-
+    for file_name, columns, data in file_data:
         table_validator = validators.get(file_name)
         if not table_validator:
             warnings.append(f'No data model found for "{file_name}" table so no validation was performed')
@@ -1075,7 +1061,7 @@ def _get_validated_gregor_files(file_data):
     if errors:
         raise ErrorsWarningsException(errors, warnings)
 
-    return files, warnings
+    return warnings
 
 
 def _load_data_model_validators():

From 9d5b3ce99a9d9a17e31ba6b5df484fe78f2a6674 Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Wed, 16 Aug 2023 10:36:53 -0400
Subject: [PATCH 16/19] add experiment lookup file

---
 seqr/views/apis/report_api.py       | 47 ++++++++++++++++++++---------
 seqr/views/apis/report_api_tests.py |  4 +--
 2 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py
index 566915d851..3e1e67995c 100644
--- a/seqr/views/apis/report_api.py
+++ b/seqr/views/apis/report_api.py
@@ -696,6 +696,7 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False):
     c for c in EXPERIMENT_TABLE_COLUMNS[1:] if not c.startswith('target')] + EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS + [
     'percent_mtRNA', 'percent_Globin', 'percent_UMI',  'percent_GC', 'percent_chrX_Y',
 ]
+EXPERIMENT_LOOKUP_TABLE_COLUMNS = ['experiment_id', 'table_name', 'id_in_table', 'participant_id']
 READ_TABLE_AIRTABLE_FIELDS = [
     'aligned_dna_short_read_file', 'aligned_dna_short_read_index_file', 'md5sum', 'reference_assembly',
     'mean_coverage', 'alignment_software', 'analysis_details',
@@ -740,7 +741,6 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False):
     data_type_columns = set(DATA_TYPE_AIRTABLE_COLUMNS) - NO_DATA_TYPE_FIELDS - set(DATA_TYPE_OMIT[data_type])
     AIRTABLE_QUERY_COLUMNS.update({f'{field}_{data_type}' for field in data_type_columns})
 
-# 'experiment': ['experiment_id', 'table_name', 'id_in_table', 'participant_id'], # TODO
 WARN_MISSING_TABLE_COLUMNS = {
     'participant': ['recontactable',  'reported_race', 'affected_status', 'phenotype_description', 'age_at_enrollment'],
 }
@@ -844,6 +844,7 @@ def gregor_export(request):
     analyte_rows = []
     airtable_rows = []
     airtable_rna_rows = []
+    experiment_lookup_rows = []
     for data_type_individuals in grouped_data_type_individuals.values():
         # If multiple individual records, prefer WGS
         individual = next(
@@ -885,16 +886,17 @@ def gregor_export(request):
         if airtable_sample:
             airtable_metadata = airtable_metadata_by_participant.get(airtable_sample[PARTICIPANT_ID_FIELD]) or {}
             for data_type in data_type_individuals:
-                data_type_metadata = airtable_metadata.get(data_type)
-                if data_type_metadata:
-                    is_rna = data_type == 'RNA'
-                    experiment_ids = _get_experiment_ids(data_type_metadata, is_rna)
-                    analyte_ids.add(experiment_ids['analyte_id'])
-                    row = {**airtable_metadata, **data_type_metadata, **experiment_ids}
-                    if not is_rna:
-                        row['alignment_software'] = row['alignment_software_dna']
-                    rows = airtable_rna_rows if is_rna else airtable_rows
-                    rows.append(row)
+                if data_type not in airtable_metadata:
+                    continue
+                row = _get_airtable_row(data_type, airtable_metadata)
+                analyte_ids.add(row['analyte_id'])
+                is_rna = data_type == 'RNA'
+                if not is_rna:
+                    row['alignment_software'] = row['alignment_software_dna']
+                (airtable_rna_rows if is_rna else airtable_rows).append(row)
+                experiment_lookup_rows.append(
+                    {'participant_id': participant_id, **_get_experiment_lookup_row(is_rna, row)}
+                )
 
         # analyte table
         if not analyte_ids:
@@ -913,6 +915,7 @@ def gregor_export(request):
         ('called_variants_dna_short_read', CALLED_TABLE_COLUMNS, airtable_rows),
         ('experiment_rna_short_read', EXPERIMENT_RNA_TABLE_COLUMNS, airtable_rna_rows),
         ('aligned_rna_short_read', READ_RNA_TABLE_COLUMNS, airtable_rna_rows),
+        ('experiment', EXPERIMENT_LOOKUP_TABLE_COLUMNS, experiment_lookup_rows),
     ]
     warnings = _validate_gregor_files(files)
     write_multiple_files_to_gs(files, file_path, request.user, file_format='tsv')
@@ -998,14 +1001,20 @@ def _get_analyte_row(individual):
     }
 
 
-def _get_experiment_ids(data_type_metadata, is_rna):
+def _get_airtable_row(data_type, airtable_metadata):
+    data_type_metadata = airtable_metadata[data_type]
     collaborator_sample_id = data_type_metadata[COLLABORATOR_SAMPLE_ID_FIELD]
     experiment_short_read_id = f'Broad_{data_type_metadata.get("experiment_type", "NA")}_{collaborator_sample_id}'
+    aligned_short_read_id = f'{experiment_short_read_id}_1'
     return {
         'analyte_id': _get_analyte_id(data_type_metadata),
-        f'experiment_{"rna" if is_rna else "dna"}_short_read_id': experiment_short_read_id,
+        'experiment_dna_short_read_id': experiment_short_read_id,
+        'experiment_rna_short_read_id': experiment_short_read_id,
         'experiment_sample_id': collaborator_sample_id,
-        f'aligned_{"rna" if is_rna else "dna"}_short_read_id': f'{experiment_short_read_id}_1'
+        'aligned_dna_short_read_id': aligned_short_read_id,
+        'aligned_rna_short_read_id': aligned_short_read_id,
+        **airtable_metadata,
+        **data_type_metadata,
     }
 
 
@@ -1014,6 +1023,16 @@ def _get_analyte_id(airtable_metadata):
     return f'Broad_{sm_id}' if sm_id else None
 
 
+def _get_experiment_lookup_row(is_rna, row_data):
+    table_name = f'experiment_{"rna" if is_rna else "dna"}_short_read'
+    id_in_table = row_data[f'{table_name}_id']
+    return {
+        'table_name': table_name,
+        'id_in_table': id_in_table,
+        'experiment_id': f'{table_name}.{id_in_table}',
+    }
+
+
 def _validate_gregor_files(file_data):
     errors = []
     warnings = []
diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py
index 458334d7ab..0ff383f8c4 100644
--- a/seqr/views/apis/report_api_tests.py
+++ b/seqr/views/apis/report_api_tests.py
@@ -265,7 +265,7 @@
 EXPECTED_GREGOR_FILES = [
     'participant', 'family', 'phenotype', 'analyte', 'experiment_dna_short_read',
     'aligned_dna_short_read', 'aligned_dna_short_read_set', 'called_variants_dna_short_read',
-    'experiment_rna_short_read', 'aligned_rna_short_read',
+    'experiment_rna_short_read', 'aligned_rna_short_read', 'experiment',
 ]
 
 EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW = {
@@ -830,7 +830,7 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False):
             for write_call in mock_open.return_value.__enter__.return_value.write.call_args_list
         ]
         participant_file, family_file, phenotype_file, analyte_file, experiment_file, read_file, read_set_file, \
-        called_file, experiment_rna_file, aligned_rna_file = files
+        called_file, experiment_rna_file, aligned_rna_file, experiment_lookup_file = files
 
         self.assertEqual(len(participant_file), 16 if has_second_project else 14)
         self.assertEqual(participant_file[0], [

From 9ea6297883552cbaa8ff205f75d3ce09f69a8913 Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Wed, 16 Aug 2023 10:57:07 -0400
Subject: [PATCH 17/19] test empty rna data; drop trailing newline from header-only exports

---
 seqr/views/apis/report_api_tests.py | 33 +++++++++++++++++++++++++++++
 seqr/views/utils/export_utils.py    |  5 ++---
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py
index 0ff383f8c4..e7d9f5a3f8 100644
--- a/seqr/views/apis/report_api_tests.py
+++ b/seqr/views/apis/report_api_tests.py
@@ -966,6 +966,39 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False):
         self.assertEqual(
             ['NA', 'Broad_NA20888_D1', 'NA', '2aa33e8c32020b1c', 'NA', 'SNV', ''] in called_file, has_second_project)
 
+        self.assertEqual(len(experiment_rna_file), 1)
+        self.assertEqual(experiment_rna_file[0], [
+            'experiment_rna_short_read_id', 'analyte_id', 'experiment_sample_id', 'seq_library_prep_kit_method',
+            'read_length', 'experiment_type', 'date_data_generation', 'sequencing_platform', 'library_prep_type',
+            'single_or_paired_ends', 'within_site_batch_name', 'RIN', 'estimated_library_size', 'total_reads',
+            'percent_rRNA', 'percent_mRNA', '5prime3prime_bias', 'percent_mtRNA', 'percent_Globin', 'percent_UMI',
+            'percent_GC', 'percent_chrX_Y',
+        ])
+
+        self.assertEqual(len(aligned_rna_file), 1)
+        self.assertEqual(aligned_rna_file[0], [
+            'aligned_rna_short_read_id', 'experiment_rna_short_read_id', 'aligned_rna_short_read_file',
+            'aligned_rna_short_read_index_file', 'md5sum', 'reference_assembly', 'reference_assembly_uri',
+            'reference_assembly_details', 'mean_coverage', 'gene_annotation', 'gene_annotation_details',
+            'alignment_software', 'alignment_log_file', 'alignment_postprocessing', 'percent_uniquely_aligned',
+            'percent_multimapped', 'percent_unaligned', 'quality_issues'
+        ])
+
+        self.assertEqual(len(experiment_lookup_file), num_airtable_rows)
+        self.assertEqual(experiment_lookup_file[0], ['experiment_id', 'table_name', 'id_in_table', 'participant_id'])
+        self.assertIn([
+            'experiment_dna_short_read.Broad_exome_VCGS_FAM203_621_D2', 'experiment_dna_short_read',
+            'Broad_exome_VCGS_FAM203_621_D2', 'Broad_HG00731',
+        ], experiment_lookup_file)
+        self.assertIn([
+            'experiment_dna_short_read.Broad_exome_NA20888', 'experiment_dna_short_read',
+            'Broad_exome_NA20888', 'Broad_NA20888',
+        ], experiment_lookup_file)
+        self.assertEqual([
+            'experiment_dna_short_read.Broad_genome_NA20888_1', 'experiment_dna_short_read', 'Broad_genome_NA20888_1',
+            'Broad_NA20888',
+        ] in experiment_lookup_file, has_second_project)
+
     def _test_expected_gregor_airtable_calls(self, additional_samples=None):
         self.assertEqual(len(responses.calls), 4)
         sample_ids = {
diff --git a/seqr/views/utils/export_utils.py b/seqr/views/utils/export_utils.py
index c82c16645e..f60bb7efc0 100644
--- a/seqr/views/utils/export_utils.py
+++ b/seqr/views/utils/export_utils.py
@@ -75,10 +75,9 @@ def _format_files_content(files,  file_format='csv', add_header_prefix=False, bl
             header_display = ['{}-{}'.format(str(header_tuple[0]).zfill(2), header_tuple[1]) for header_tuple in
                               enumerate(header)]
             header_display[0] = header[0]
-        content = DELIMITERS[file_format].join(header_display) + '\n'
         content_rows = [[row.get(key) or blank_value for key in header] for row in rows]
-        content += '\n'.join([
-            DELIMITERS[file_format].join(row) for row in content_rows
+        content = '\n'.join([
+            DELIMITERS[file_format].join(row) for row in [header_display] + content_rows
             if any(val != blank_value for val in row)
         ])
         content = str(content.encode('utf-8'), 'ascii', errors='ignore')  # Strip unicode chars in the content

From a61c81171e07b910653f94c9e51cddeb4ed1eeec Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Wed, 16 Aug 2023 11:35:42 -0400
Subject: [PATCH 18/19] test rna data

---
 seqr/views/apis/report_api_tests.py | 80 ++++++++++++++++++++++++++---
 1 file changed, 72 insertions(+), 8 deletions(-)

diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py
index e7d9f5a3f8..b03234cebe 100644
--- a/seqr/views/apis/report_api_tests.py
+++ b/seqr/views/apis/report_api_tests.py
@@ -161,6 +161,16 @@
         'Recontactable': 'Yes',
       },
     },
+    {
+      "id": "rec2B67GmXpAkQW8z",
+      "fields": {
+        'SeqrCollaboratorSampleID': 'NA19679',
+        'CollaboratorSampleID': 'NA19679',
+        'CollaboratorParticipantID': 'NA19679',
+        'SMID': 'SM-N1P91',
+        'Recontactable': 'Yes',
+      },
+    },
     {
       "id": "rec2Nkg10N1KssPc3",
       "fields": {
@@ -221,6 +231,41 @@
         'experiment_type_wgs': 'genome',
       },
     },
+    {
+      "id": "rec4B7OGmQpVkQW7z",
+      "fields": {
+        'CollaboratorParticipantID': 'NA19679',
+        'CollaboratorSampleID_rna': 'NA19679',
+        'SMID_rna': 'SM-N1P91',
+        'seq_library_prep_kit_method_rna': 'Unknown',
+        'library_prep_type_rna': 'stranded poly-A pulldown',
+        'read_length_rna': '151',
+        'experiment_type_rna': 'paired-end',
+        'single_or_paired_ends_rna': 'paired-end',
+        'date_data_generation_rna': '2023-02-11',
+        'sequencing_platform_rna': 'NovaSeq',
+        'aligned_rna_short_read_file': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/NA19679.Aligned.out.cram',
+        'aligned_rna_short_read_index_file': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/NA19679.Aligned.out.crai',
+        'aligned_rna_short_read_id': '',
+        'md5sum_rna': 'f6490b8ebdf2',
+        '5prime3prime_bias_rna': '1.05',
+        'gene_annotation_rna': 'GENCODEv26',
+        'reference_assembly': 'GRCh38',
+        'reference_assembly_uri_rna': 'gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta',
+        'alignment_software_rna': 'STARv2.7.10b',
+        'alignment_log_file_rna': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/NA19679.Log.final.out',
+        'percent_uniquely_aligned_rna': '80.53',
+        'percent_multimapped_rna': '17.08',
+        'percent_unaligned_rna': '1.71',
+        'percent_mRNA': '80.2',
+        'percent_rRNA': '5.9',
+        'RIN_rna': '8.9818',
+        'total_reads_rna': '106,842,386',
+        'within_site_batch_name_rna': 'LCSET-26942',
+        'estimated_library_size_rna': '19,480,858',
+        'variant_types': 'SNV',
+      },
+    },
     {
       "id": "rec2BFCGmQpAkQ7x",
       "fields": {
@@ -771,7 +816,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
             'The following tables are required in the data model but absent from the reports: subject, dna_read_data_set',
             'The following columns are included in the "participant" table but are missing from the data model: age_at_last_observation, ancestry_detail, pmid_id, proband_relationship_detail, sex_detail, twin_id',
             'The following columns are included in the "participant" data model but are missing in the report: ancestry_metadata',
-            'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881',
+            'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881',
             'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
             'The following entries are missing recommended "phenotype_description" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
             'The following entries are missing recommended "age_at_enrollment" in the "participant" table: Broad_HG00731, Broad_NA20870, Broad_NA20872, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
@@ -966,7 +1011,7 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False):
         self.assertEqual(
             ['NA', 'Broad_NA20888_D1', 'NA', '2aa33e8c32020b1c', 'NA', 'SNV', ''] in called_file, has_second_project)
 
-        self.assertEqual(len(experiment_rna_file), 1)
+        self.assertEqual(len(experiment_rna_file), 2)
         self.assertEqual(experiment_rna_file[0], [
             'experiment_rna_short_read_id', 'analyte_id', 'experiment_sample_id', 'seq_library_prep_kit_method',
             'read_length', 'experiment_type', 'date_data_generation', 'sequencing_platform', 'library_prep_type',
@@ -974,8 +1019,13 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False):
             'percent_rRNA', 'percent_mRNA', '5prime3prime_bias', 'percent_mtRNA', 'percent_Globin', 'percent_UMI',
             'percent_GC', 'percent_chrX_Y',
         ])
+        self.assertEqual(experiment_rna_file[1], [
+            'Broad_paired-end_NA19679', 'Broad_SM-N1P91', 'NA19679', 'Unknown', '151', 'paired-end', '2023-02-11',
+            'NovaSeq', 'stranded poly-A pulldown', 'paired-end', 'LCSET-26942', '8.9818', '19,480,858', '106,842,386',
+            '5.9', '80.2', '1.05', '', '', '', '', '',
+        ])
 
-        self.assertEqual(len(aligned_rna_file), 1)
+        self.assertEqual(len(aligned_rna_file), 2)
         self.assertEqual(aligned_rna_file[0], [
             'aligned_rna_short_read_id', 'experiment_rna_short_read_id', 'aligned_rna_short_read_file',
             'aligned_rna_short_read_index_file', 'md5sum', 'reference_assembly', 'reference_assembly_uri',
@@ -983,16 +1033,27 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False):
             'alignment_software', 'alignment_log_file', 'alignment_postprocessing', 'percent_uniquely_aligned',
             'percent_multimapped', 'percent_unaligned', 'quality_issues'
         ])
+        self.assertEqual(aligned_rna_file[1], [
+            '', 'Broad_paired-end_NA19679', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/NA19679.Aligned.out.cram',
+            'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/NA19679.Aligned.out.crai', 'f6490b8ebdf2', 'GRCh38',
+            'gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta', '', '', 'GENCODEv26', '',
+            'STARv2.7.10b', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/NA19679.Log.final.out', '', '80.53', '17.08',
+            '1.71', ''
+        ])
 
-        self.assertEqual(len(experiment_lookup_file), num_airtable_rows)
+        self.assertEqual(len(experiment_lookup_file), num_airtable_rows + 1)
         self.assertEqual(experiment_lookup_file[0], ['experiment_id', 'table_name', 'id_in_table', 'participant_id'])
+        self.assertIn([
+            'experiment_rna_short_read.Broad_paired-end_NA19679', 'experiment_rna_short_read',
+            'Broad_paired-end_NA19679', 'Broad_NA19679',
+        ], experiment_lookup_file)
         self.assertIn([
             'experiment_dna_short_read.Broad_exome_VCGS_FAM203_621_D2', 'experiment_dna_short_read',
             'Broad_exome_VCGS_FAM203_621_D2', 'Broad_HG00731',
         ], experiment_lookup_file)
         self.assertIn([
-            'experiment_dna_short_read.Broad_exome_NA20888', 'experiment_dna_short_read',
-            'Broad_exome_NA20888', 'Broad_NA20888',
+            'experiment_dna_short_read.Broad_exome_NA20888', 'experiment_dna_short_read', 'Broad_exome_NA20888',
+            'Broad_NA20888',
         ], experiment_lookup_file)
         self.assertEqual([
             'experiment_dna_short_read.Broad_genome_NA20888_1', 'experiment_dna_short_read', 'Broad_genome_NA20888_1',
@@ -1009,7 +1070,7 @@ def _test_expected_gregor_airtable_calls(self, additional_samples=None):
         sample_filter = ','.join([f"{{CollaboratorSampleID}}='{sample_id}'" for sample_id in sorted(sample_ids)])
         sample_fields = ['CollaboratorSampleID', 'SMID', 'CollaboratorParticipantID', 'Recontactable']
         self._assert_expected_airtable_call(0, f"OR({sample_filter})", sample_fields)
-        sample_ids -= {'NA19675_1', 'NA20888'}
+        sample_ids -= {'NA19675_1', 'NA19679', 'NA20888'}
         secondary_sample_filter = ','.join([f"{{SeqrCollaboratorSampleID}}='{sample_id}'" for sample_id in sorted(sample_ids)])
         sample_fields[0] = 'SeqrCollaboratorSampleID'
         self._assert_expected_airtable_call(1, f"OR({secondary_sample_filter})", sample_fields)
@@ -1031,7 +1092,10 @@ def _test_expected_gregor_airtable_calls(self, additional_samples=None):
             'target_insert_size_wgs', 'targeted_region_bed_file', 'targeted_regions_method_wes', 'total_reads_rna',
             'variant_types', 'within_site_batch_name_rna',
         ]
-        self._assert_expected_airtable_call(2, "OR(CollaboratorParticipantID='NA19675',CollaboratorParticipantID='NA20888',CollaboratorParticipantID='VCGS_FAM203_621')", metadata_fields)
+        self._assert_expected_airtable_call(
+            2, "OR(CollaboratorParticipantID='NA19675',CollaboratorParticipantID='NA19679',CollaboratorParticipantID='NA20888',CollaboratorParticipantID='VCGS_FAM203_621')",
+            metadata_fields,
+        )
 
         self.assertEqual(responses.calls[3].request.url, MOCK_DATA_MODEL_URL)
 

From 252b90a8ef8287dc55c59a63bdc86b7a8b1ba668 Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Thu, 17 Aug 2023 15:20:50 -0400
Subject: [PATCH 19/19] pr feedback: locate RNA read table insert positions by column name

---
 seqr/views/apis/report_api.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py
index 3e1e67995c..cdc87d1cdb 100644
--- a/seqr/views/apis/report_api.py
+++ b/seqr/views/apis/report_api.py
@@ -710,8 +710,8 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False):
 ]
 READ_RNA_TABLE_COLUMNS = ['aligned_rna_short_read_id', 'experiment_rna_short_read_id'] + \
     READ_RNA_TABLE_AIRTABLE_ID_FIELDS + READ_TABLE_COLUMNS[4:-3] + READ_RNA_TABLE_AIRTABLE_FIELDS + ['quality_issues']
-READ_RNA_TABLE_COLUMNS.insert(10, 'gene_annotation_details')
-READ_RNA_TABLE_COLUMNS.insert(13, 'alignment_postprocessing')
+READ_RNA_TABLE_COLUMNS.insert(READ_RNA_TABLE_COLUMNS.index('gene_annotation')+1, 'gene_annotation_details')
+READ_RNA_TABLE_COLUMNS.insert(READ_RNA_TABLE_COLUMNS.index('alignment_log_file')+1, 'alignment_postprocessing')
 READ_SET_TABLE_COLUMNS = ['aligned_dna_short_read_set_id', 'aligned_dna_short_read_id']
 CALLED_TABLE_COLUMNS = [
     'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum',