broadinstitute · hanars · Aug 17, 2023 · Aug 14, 2023 · Aug 14, 2023 · Aug 14, 2023
diff --git a/seqr/fixtures/1kg_project.json b/seqr/fixtures/1kg_project.json
@@ -698,7 +698,7 @@
         "individual_id": "NA20888",
         "mother_id": null,
         "father_id": null,
-        "sex": "M",
+        "sex": "F",
         "affected": "A",
         "display_name": "",
 	"notes": "",
@@ -1144,7 +1144,7 @@
         "last_modified_date": "2017-03-13T09:07:50.277Z",
 
         "sample_id": "NA20888",
-        "sample_type": "WES",
+        "sample_type": "WGS",
         "is_active": false,
         "individual": 16,
         "dataset_type": "VARIANTS",

diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py
@@ -8,6 +8,7 @@
 from django.db.models.functions import Replace, JSONObject
 from django.utils import timezone
 import json
+import re
 import requests
 
 from seqr.utils.file_utils import is_google_bucket_file_path, does_file_exist
@@ -657,7 +658,10 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False):
 
 # GREGoR metadata
 
+GREGOR_DATA_TYPES = ['wgs', 'wes', 'rna']
 SMID_FIELD = 'SMID'
+PARTICIPANT_ID_FIELD = 'CollaboratorParticipantID'
+COLLABORATOR_SAMPLE_ID_FIELD = 'CollaboratorSampleID'
 PARTICIPANT_TABLE_COLUMNS = [
     'participant_id', 'internal_project_id', 'gregor_center', 'consent_code', 'recontactable', 'prior_testing',
     'pmid_id', 'family_id', 'paternal_id', 'maternal_id', 'twin_id', 'proband_relationship',
@@ -684,30 +688,59 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False):
 EXPERIMENT_TABLE_COLUMNS = [
     'experiment_dna_short_read_id', 'analyte_id', 'experiment_sample_id',
 ] + EXPERIMENT_TABLE_AIRTABLE_FIELDS
+EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS = [
+    'library_prep_type', 'single_or_paired_ends', 'within_site_batch_name', 'RIN', 'estimated_library_size',
+    'total_reads', 'percent_rRNA', 'percent_mRNA', '5prime3prime_bias',
+]
+EXPERIMENT_RNA_TABLE_COLUMNS = ['experiment_rna_short_read_id'] + [
+    c for c in EXPERIMENT_TABLE_COLUMNS[1:] if not c.startswith('target')] + EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS + [
+    'percent_mtRNA', 'percent_Globin', 'percent_UMI',  'percent_GC', 'percent_chrX_Y',
+]
+EXPERIMENT_LOOKUP_TABLE_COLUMNS = ['experiment_id', 'table_name', 'id_in_table', 'participant_id']
 READ_TABLE_AIRTABLE_FIELDS = [
     'aligned_dna_short_read_file', 'aligned_dna_short_read_index_file', 'md5sum', 'reference_assembly',
-    'alignment_software', 'mean_coverage', 'analysis_details',
+    'mean_coverage', 'alignment_software', 'analysis_details',
 ]
 READ_TABLE_COLUMNS = ['aligned_dna_short_read_id', 'experiment_dna_short_read_id'] + READ_TABLE_AIRTABLE_FIELDS + ['quality_issues']
 READ_TABLE_COLUMNS.insert(6, 'reference_assembly_details')
 READ_TABLE_COLUMNS.insert(6, 'reference_assembly_uri')
+READ_RNA_TABLE_AIRTABLE_ID_FIELDS = ['aligned_rna_short_read_file', 'aligned_rna_short_read_index_file']
+READ_RNA_TABLE_AIRTABLE_FIELDS = [
+    'gene_annotation', 'alignment_software', 'alignment_log_file', 'percent_uniquely_aligned', 'percent_multimapped', 'percent_unaligned',
+]
+READ_RNA_TABLE_COLUMNS = ['aligned_rna_short_read_id', 'experiment_rna_short_read_id'] + \
+    READ_RNA_TABLE_AIRTABLE_ID_FIELDS + READ_TABLE_COLUMNS[4:-3] + READ_RNA_TABLE_AIRTABLE_FIELDS + ['quality_issues']
+READ_RNA_TABLE_COLUMNS.insert(READ_RNA_TABLE_COLUMNS.index('gene_annotation')+1, 'gene_annotation_details')
+READ_RNA_TABLE_COLUMNS.insert(READ_RNA_TABLE_COLUMNS.index('alignment_log_file')+1, 'alignment_postprocessing')
 READ_SET_TABLE_COLUMNS = ['aligned_dna_short_read_set_id', 'aligned_dna_short_read_id']
 CALLED_TABLE_COLUMNS = [
     'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum',
     'caller_software', 'variant_types', 'analysis_details',
 ]
-ALL_AIRTABLE_COLUMNS = EXPERIMENT_TABLE_AIRTABLE_FIELDS + READ_TABLE_AIRTABLE_FIELDS + CALLED_TABLE_COLUMNS
-
-TABLE_COLUMNS = {
-    'participant': PARTICIPANT_TABLE_COLUMNS,
-    'family': GREGOR_FAMILY_TABLE_COLUMNS,
-    'phenotype': PHENOTYPE_TABLE_COLUMNS,
-    'analyte': ANALYTE_TABLE_COLUMNS,
-    'experiment_dna_short_read': EXPERIMENT_TABLE_COLUMNS,
-    'aligned_dna_short_read': READ_TABLE_COLUMNS,
-    'aligned_dna_short_read_set': READ_SET_TABLE_COLUMNS,
-    'called_variants_dna_short_read': CALLED_TABLE_COLUMNS,
+
+RNA_ONLY = EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS + READ_RNA_TABLE_AIRTABLE_FIELDS + ['reference_assembly_uri']
+DATA_TYPE_OMIT = {
+    'wgs': ['targeted_regions_method'] + RNA_ONLY, 'wes': RNA_ONLY, 'rna': [
+        'targeted_regions_method', 'target_insert_size', 'mean_coverage', 'aligned_dna_short_read_file',
+        'aligned_dna_short_read_index_file',
+    ],
+}
+NO_DATA_TYPE_FIELDS = {
+    'targeted_region_bed_file', 'reference_assembly', 'analysis_details', 'percent_rRNA', 'percent_mRNA',
+    'alignment_software_dna',
 }
+NO_DATA_TYPE_FIELDS.update(READ_RNA_TABLE_AIRTABLE_ID_FIELDS)
+
+DATA_TYPE_AIRTABLE_COLUMNS = EXPERIMENT_TABLE_AIRTABLE_FIELDS + READ_TABLE_AIRTABLE_FIELDS + RNA_ONLY + [
+    COLLABORATOR_SAMPLE_ID_FIELD, SMID_FIELD]
+ALL_AIRTABLE_COLUMNS = DATA_TYPE_AIRTABLE_COLUMNS + CALLED_TABLE_COLUMNS
+AIRTABLE_QUERY_COLUMNS = set(CALLED_TABLE_COLUMNS)
+AIRTABLE_QUERY_COLUMNS.remove('md5sum')
+AIRTABLE_QUERY_COLUMNS.update(NO_DATA_TYPE_FIELDS)
+for data_type in GREGOR_DATA_TYPES:
+    data_type_columns = set(DATA_TYPE_AIRTABLE_COLUMNS) - NO_DATA_TYPE_FIELDS - set(DATA_TYPE_OMIT[data_type])
+    AIRTABLE_QUERY_COLUMNS.update({f'{field}_{data_type}' for field in data_type_columns})
+
 WARN_MISSING_TABLE_COLUMNS = {
     'participant': ['recontactable',  'reported_race', 'affected_status', 'phenotype_description', 'age_at_enrollment'],
 }
@@ -791,18 +824,34 @@ def gregor_export(request):
         consent_code=consent_code[0],
         projectcategory__name='GREGoR',
     )
-    individuals = Individual.objects.filter(
-        sample__in=get_search_samples(projects, active_only=False),
-    ).distinct().prefetch_related('family__project', 'mother', 'father')
+    sample_types = Sample.objects.filter(individual__family__project__in=projects).values_list('individual_id', 'sample_type')
+    individual_data_types = defaultdict(set)
+    for individual_db_id, sample_type in sample_types:
+        individual_data_types[individual_db_id].add(sample_type)
+    individuals = Individual.objects.filter(id__in=individual_data_types).prefetch_related(
+        'family__project', 'mother', 'father')
+
+    grouped_data_type_individuals = defaultdict(dict)
+    for i in individuals:
+        grouped_data_type_individuals[i.individual_id].update({data_type: i for data_type in individual_data_types[i.id]})
 
-    airtable_sample_records, airtable_metadata_by_smid = _get_gregor_airtable_data(individuals, request.user)
+    airtable_sample_records, airtable_metadata_by_participant = _get_gregor_airtable_data(
+        grouped_data_type_individuals.keys(), request.user)
 
     participant_rows = []
     family_map = {}
     phenotype_rows = []
     analyte_rows = []
     airtable_rows = []
-    for individual in individuals:
+    airtable_rna_rows = []
+    experiment_lookup_rows = []
+    for data_type_individuals in grouped_data_type_individuals.values():
+        # If multiple individual records, prefer WGS
+        individual = next(
+            data_type_individuals[data_type.upper()] for data_type in GREGOR_DATA_TYPES
+            if data_type_individuals.get(data_type.upper())
+        )
+
         # family table
         family = individual.family
         if family not in family_map:
@@ -832,29 +881,43 @@ def gregor_export(request):
             dict(**base_phenotype_row, **_get_phenotype_row(feature)) for feature in individual.absent_features or []
         ]
 
-        analyte_id = None
+        analyte_ids = set()
         # airtable data
         if airtable_sample:
-            sm_id = airtable_sample[SMID_FIELD]
-            analyte_id = f'Broad_{sm_id}'
-            airtable_metadata = airtable_metadata_by_smid.get(sm_id)
-            if airtable_metadata:
-                experiment_ids = _get_experiment_ids(airtable_sample, airtable_metadata)
-                airtable_rows.append(dict(analyte_id=analyte_id, **airtable_metadata, **experiment_ids))
+            airtable_metadata = airtable_metadata_by_participant.get(airtable_sample[PARTICIPANT_ID_FIELD]) or {}
+            for data_type in data_type_individuals:
+                if data_type not in airtable_metadata:
+                    continue
+                row = _get_airtable_row(data_type, airtable_metadata)
+                analyte_ids.add(row['analyte_id'])
+                is_rna = data_type == 'RNA'
+                if not is_rna:
+                    row['alignment_software'] = row['alignment_software_dna']
+                (airtable_rna_rows if is_rna else airtable_rows).append(row)
+                experiment_lookup_rows.append(
+                    {'participant_id': participant_id, **_get_experiment_lookup_row(is_rna, row)}
+                )
 
         # analyte table
-        analyte_rows.append(dict(participant_id=participant_id, analyte_id=analyte_id, **_get_analyte_row(individual)))
-
-    files, warnings = _get_validated_gregor_files([
-        ('participant', participant_rows),
-        ('family', list(family_map.values())),
-        ('phenotype', phenotype_rows),
-        ('analyte', analyte_rows),
-        ('experiment_dna_short_read', airtable_rows),
-        ('aligned_dna_short_read', airtable_rows),
-        ('aligned_dna_short_read_set', airtable_rows),
-        ('called_variants_dna_short_read', airtable_rows),
-    ])
+        if not analyte_ids:
+            analyte_ids.add(_get_analyte_id(airtable_sample))
+        for analyte_id in analyte_ids:
+            analyte_rows.append(dict(participant_id=participant_id, analyte_id=analyte_id, **_get_analyte_row(individual)))
+
+    files = [
+        ('participant', PARTICIPANT_TABLE_COLUMNS, participant_rows),
+        ('family', GREGOR_FAMILY_TABLE_COLUMNS, list(family_map.values())),
+        ('phenotype', PHENOTYPE_TABLE_COLUMNS, phenotype_rows),
+        ('analyte', ANALYTE_TABLE_COLUMNS, analyte_rows),
+        ('experiment_dna_short_read', EXPERIMENT_TABLE_COLUMNS, airtable_rows),
+        ('aligned_dna_short_read', READ_TABLE_COLUMNS, airtable_rows),
+        ('aligned_dna_short_read_set', READ_SET_TABLE_COLUMNS, airtable_rows),
+        ('called_variants_dna_short_read', CALLED_TABLE_COLUMNS, airtable_rows),
+        ('experiment_rna_short_read', EXPERIMENT_RNA_TABLE_COLUMNS, airtable_rna_rows),
+        ('aligned_rna_short_read', READ_RNA_TABLE_COLUMNS, airtable_rna_rows),
+        ('experiment', EXPERIMENT_LOOKUP_TABLE_COLUMNS, experiment_lookup_rows),
+    ]
+    warnings = _validate_gregor_files(files)
     write_multiple_files_to_gs(files, file_path, request.user, file_format='tsv')
 
     return create_json_response({
@@ -863,21 +926,25 @@ def gregor_export(request):
     })
 
 
-def _get_gregor_airtable_data(individuals, user):
+def _get_gregor_airtable_data(individual_ids, user):
     sample_records, session = _get_airtable_samples(
-        individuals.order_by('individual_id').values_list('individual_id', flat=True), user,
-        fields=[SMID_FIELD, 'CollaboratorSampleID', 'Recontactable'],
+        individual_ids, user, fields=[SMID_FIELD, PARTICIPANT_ID_FIELD, 'Recontactable'],
     )
 
-    fields = ALL_AIRTABLE_COLUMNS
     airtable_metadata = session.fetch_records(
         'GREGoR Data Model',
-        fields=[SMID_FIELD] + sorted(fields),
-        or_filters={f'{SMID_FIELD}': {r[SMID_FIELD] for r in sample_records.values()}},
+        fields=[PARTICIPANT_ID_FIELD] + sorted(AIRTABLE_QUERY_COLUMNS),
+        or_filters={f'{PARTICIPANT_ID_FIELD}': {r[PARTICIPANT_ID_FIELD] for r in sample_records.values()}},
     )
-    airtable_metadata_by_smid = {r[SMID_FIELD]: r for r in airtable_metadata.values()}
 
-    return sample_records, airtable_metadata_by_smid
+    airtable_metadata_by_participant = {r[PARTICIPANT_ID_FIELD]: r for r in airtable_metadata.values()}
+    for data_type in GREGOR_DATA_TYPES:
+        for r in airtable_metadata_by_participant.values():
+            data_type_fields = [f for f in r if f.endswith(f'_{data_type}')]
+            if data_type_fields:
+                r[data_type.upper()] = {f.replace(f'_{data_type}', ''): r.pop(f) for f in data_type_fields}
+
+    return sample_records, airtable_metadata_by_participant
 
 
 def _get_gregor_family_row(family):
@@ -934,37 +1001,58 @@ def _get_analyte_row(individual):
     }
 
 
-def _get_experiment_ids(airtable_sample, airtable_metadata):
-    collaborator_sample_id = airtable_sample['CollaboratorSampleID']
-    experiment_dna_short_read_id = f'Broad_{airtable_metadata.get("experiment_type", "NA")}_{collaborator_sample_id}'
+def _get_airtable_row(data_type, airtable_metadata):
+    data_type_metadata = airtable_metadata[data_type]
+    collaborator_sample_id = data_type_metadata[COLLABORATOR_SAMPLE_ID_FIELD]
+    experiment_short_read_id = f'Broad_{data_type_metadata.get("experiment_type", "NA")}_{collaborator_sample_id}'
+    aligned_short_read_id = f'{experiment_short_read_id}_1'
     return {
-        'experiment_dna_short_read_id': experiment_dna_short_read_id,
+        'analyte_id': _get_analyte_id(data_type_metadata),
+        'experiment_dna_short_read_id': experiment_short_read_id,
+        'experiment_rna_short_read_id': experiment_short_read_id,
         'experiment_sample_id': collaborator_sample_id,
-        'aligned_dna_short_read_id': f'{experiment_dna_short_read_id}_1'
+        'aligned_dna_short_read_id': aligned_short_read_id,
+        'aligned_rna_short_read_id': aligned_short_read_id,
+        **airtable_metadata,
+        **data_type_metadata,
+    }
+
+
+def _get_analyte_id(airtable_metadata):
+    sm_id = airtable_metadata.get(SMID_FIELD)
+    return f'Broad_{sm_id}' if sm_id else None
+
+
+def _get_experiment_lookup_row(is_rna, row_data):
+    table_name = f'experiment_{"rna" if is_rna else "dna"}_short_read'
+    id_in_table = row_data[f'{table_name}_id']
+    return {
+        'table_name': table_name,
+        'id_in_table': id_in_table,
+        'experiment_id': f'{table_name}.{id_in_table}',
     }
 
 
-def _get_validated_gregor_files(file_data):
+def _validate_gregor_files(file_data):
     errors = []
     warnings = []
     try:
         validators, required_tables = _load_data_model_validators()
     except Exception as e:
         warnings.append(f'Unable to load data model for validation: {e}')
         validators = {}
-        required_tables = set()
+        required_tables = {}
 
-    missing_tables = required_tables.difference({f[0] for f in file_data})
+    tables = {f[0] for f in file_data}
+    missing_tables = [
+        table for table, validator in required_tables.items() if not _has_required_table(table, validator, tables)
+    ]
     if missing_tables:
         warnings.append(
             f'The following tables are required in the data model but absent from the reports: {", ".join(missing_tables)}'
         )
 
-    files = []
-    for file_name, data in file_data:
-        columns = TABLE_COLUMNS[file_name]
-        files.append([file_name, columns, data])
-
+    for file_name, columns, data in file_data:
         table_validator = validators.get(file_name)
         if not table_validator:
             warnings.append(f'No data model found for "{file_name}" table so no validation was performed')
@@ -992,7 +1080,7 @@ def _get_validated_gregor_files(file_data):
     if errors:
         raise ErrorsWarningsException(errors, warnings)
 
-    return files, warnings
+    return warnings
 
 
 def _load_data_model_validators():
@@ -1003,10 +1091,26 @@ def _load_data_model_validators():
         t['table']: {c['column']: c for c in t['columns']}
         for t in table_models
     }
-    required_tables = {t['table'] for t in table_models if t.get('required')}
+    required_tables = {t['table']: _parse_table_required(t['required']) for t in table_models if t.get('required')}
     return validators, required_tables
 
 
+def _parse_table_required(required_validator):
+    if required_validator is True:
+        return True
+
+    match = re.match(r'CONDITIONAL \(([\w+(\s,)?]+)\)', required_validator)
+    return match and match.group(1).split(', ')
+
+
+def _has_required_table(table, validator, tables):
+    if table in tables:
+        return True
+    if validator is True:
+        return False
+    return tables.isdisjoint(validator)
+
+
 def _validate_column_data(column, file_name, data, column_validator, warnings, errors):
     enum = column_validator.get('enumerations')
     required = column_validator.get('required')