
Populate from gregor metadata #3990

Merged · 50 commits · Mar 25, 2024
d685da2
metadata import UI
hanars Mar 14, 2024
ee4375c
initial endpoint
hanars Mar 14, 2024
1e08b1a
read in all tables
hanars Mar 15, 2024
32a34d6
handle features parsing
hanars Mar 15, 2024
8428f5a
report creation stats
hanars Mar 15, 2024
ead4588
assign population
hanars Mar 15, 2024
15187fb
fix syntax
hanars Mar 15, 2024
da30607
fix enum formatting
hanars Mar 15, 2024
4e29fb0
first pass correctly load parents
hanars Mar 15, 2024
3c6a49e
fix display name mapping
hanars Mar 18, 2024
a6b1ea9
properly load all relevant family members
hanars Mar 18, 2024
22b3c83
read findings table
hanars Mar 18, 2024
3154433
Merge branch 'dev' of https://github.com/broadinstitute/seqr into pop…
hanars Mar 18, 2024
0fd09ad
bulk create variant helper
hanars Mar 18, 2024
f075d9b
add helper for bulk creating variants with tags
hanars Mar 18, 2024
64fb268
first pass create findings tags
hanars Mar 18, 2024
6916156
Merge branch 'dev' of https://github.com/broadinstitute/seqr into pop…
hanars Mar 19, 2024
50a99db
first pass variant formatting
hanars Mar 19, 2024
cd8fecd
gene mapping
hanars Mar 20, 2024
45ccce0
shared logic for tags response
hanars Mar 20, 2024
71a425a
condition import on tag existing
hanars Mar 20, 2024
5e2223d
fix variant parsing
hanars Mar 20, 2024
20c5c5c
display finding metadata
hanars Mar 20, 2024
5944882
include candidate status in metadata
hanars Mar 20, 2024
b79d40c
Merge branch 'dev' of https://github.com/broadinstitute/seqr into pop…
hanars Mar 20, 2024
e06f427
fix test
hanars Mar 20, 2024
ceffcff
clean up test code
hanars Mar 20, 2024
9b3fc56
shared test gregor data
hanars Mar 20, 2024
fea3035
add tag type fixture
hanars Mar 21, 2024
c2f2295
improve test coverage
hanars Mar 21, 2024
08427bb
parse hpo for all participants
hanars Mar 21, 2024
118361e
add asian test case
hanars Mar 21, 2024
8073336
test missing gene
hanars Mar 21, 2024
340c605
add response tests
hanars Mar 21, 2024
2bda5c4
test individual models
hanars Mar 21, 2024
0a321f3
test individual models
hanars Mar 21, 2024
da3e996
fix population parsing
hanars Mar 21, 2024
54d88f6
test correctly set parents
hanars Mar 21, 2024
cffa82f
test correctly set parents
hanars Mar 21, 2024
6146d7d
Merge branch 'fix-variant-fixture-id' of https://github.com/broadinst…
hanars Mar 21, 2024
ef2059a
test db models
hanars Mar 21, 2024
4b9350f
test db models
hanars Mar 21, 2024
134b4a7
test on reload
hanars Mar 21, 2024
4267dcc
shared constants
hanars Mar 21, 2024
618c7eb
shared helpers
hanars Mar 21, 2024
d2a1c6a
clean up
hanars Mar 21, 2024
2f34287
codacy clean up
hanars Mar 21, 2024
e49f7ea
fix mock
hanars Mar 21, 2024
8dd3774
Merge branch 'dev' of https://github.com/broadinstitute/seqr into pop…
hanars Mar 25, 2024
bd82ee1
add comments to updateIndividuals
hanars Mar 25, 2024
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,11 @@
# _seqr_ Changes

## dev
* Add ability to import project metadata from gregor metadata
* Only enabled for a project if tag is first created via
```
./manage.py add_project_tag --name="GREGoR Finding" --order=0.5 --color=#c25fc4 --project=<project>
```

## 3/13/24
* Add "Probably Solved" analysis status (REQUIRES DB MIGRATION)
20 changes: 20 additions & 0 deletions seqr/fixtures/1kg_project.json
@@ -148,6 +148,7 @@
"description": "",
"pedigree_image": "ped_2.png",
"analysis_status": "Q",
"coded_phenotype": "microcephaly, seizures",
"mondo_id": "MONDO:0044970",
"case_review_notes": "<div>internal notes 2</div>\n<div>&nbsp;</div>",
"case_review_summary": "<div>internal case review summary 2</div>\n<div>&nbsp;</div>"
@@ -556,6 +557,8 @@
"sex": "M",
"affected": "N",
"display_name": "",
"population": "NFE",
"proband_relationship": "F",
"notes": "",
"case_review_status": "R",
"case_review_status_last_modified_date": null,
@@ -798,6 +801,7 @@
"individual_id": "NA20888",
"mother_id": null,
"father_id": null,
"population": "SAS",
"sex": "M",
"affected": "A",
"display_name": "",
@@ -1714,6 +1718,22 @@
"order": 17.5
}
},
{
"model": "seqr.varianttagtype",
"pk": 6,
"fields": {
"guid": "VTT_gregor_finding",
"created_date": "2021-11-14T00:00:00.000Z",
"created_by": null,
"last_modified_date": null,
"project": 3,
"name": "GREGoR Finding",
"category": "",
"description": "",
"color": "#c25fc4",
"order": 0.5
}
},
{
"model": "seqr.savedvariant",
"pk": 1,
6 changes: 3 additions & 3 deletions seqr/management/tests/copy_project_tags_tests.py
@@ -19,10 +19,10 @@ def test_command(self, mock_logger):
'Error: the following arguments are required: --source, --target'])

# Test user did confirm.
call_command('copy_project_tags', '--source=R0001_1kg', '--target=R0003_test')
mock_logger.info.assert_called_with('Saved tag Excluded (new id = 6)')
call_command('copy_project_tags', '--source=R0001_1kg', '--target=R0002_empty')
mock_logger.info.assert_called_with('Saved tag Excluded (new id = 7)')

src_tags = VariantTagType.objects.filter(project__guid = 'R0001_1kg')
target_tags = VariantTagType.objects.filter(project__guid = 'R0003_test')
target_tags = VariantTagType.objects.filter(project__guid = 'R0002_empty')
self.assertEqual(src_tags.count(), target_tags.count())
self.assertEqual(target_tags.all()[0].name, 'Excluded')
2 changes: 2 additions & 0 deletions seqr/urls.py
@@ -37,6 +37,7 @@
update_individual_handler, \
edit_individuals_handler, \
delete_individuals_handler, \
import_gregor_metadata, \
receive_individuals_table_handler, \
save_individuals_table_handler, \
receive_individuals_metadata_handler, \
@@ -226,6 +227,7 @@
'project/(?P<project_guid>[^/]+)/delete_families': delete_families_handler,
'project/(?P<project_guid>[^/]+)/edit_individuals': edit_individuals_handler,
'project/(?P<project_guid>[^/]+)/delete_individuals': delete_individuals_handler,
'project/(?P<project_guid>[^/]+)/import_gregor_metadata': import_gregor_metadata,
'project/(?P<project_guid>[^/]+)/upload_families_table': receive_families_table_handler,

'project/(?P<project_guid>[^/]+)/upload_individuals_table': receive_individuals_table_handler,
12 changes: 6 additions & 6 deletions seqr/utils/file_utils.py
@@ -12,12 +12,12 @@ def run_command(command, user=None, pipe_errors=False):
return subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE if pipe_errors else subprocess.STDOUT, shell=True) # nosec


def _run_gsutil_command(command, gs_path, gunzip=False, user=None, pipe_errors=False):
def _run_gsutil_command(command, gs_path, gunzip=False, user=None, pipe_errors=False, no_project=False):
if not is_google_bucket_file_path(gs_path):
raise Exception('A Google Storage path is expected.')

# Anvil buckets are requester-pays and we bill them to the anvil project
google_project = get_google_project(gs_path)
google_project = get_google_project(gs_path) if not no_project else None
Contributor: why aren't we including a billing project when importing the gregor metadata file?

Collaborator (Author): because that workspace is a special snowflake somehow and including the billing project causes it to fail *shrug*

project_arg = '-u {} '.format(google_project) if google_project else ''
command = 'gsutil {project_arg}{command} {gs_path}'.format(
project_arg=project_arg, command=command, gs_path=gs_path,
@@ -47,9 +47,9 @@ def does_file_exist(file_path, user=None):
return os.path.isfile(file_path)


def file_iter(file_path, byte_range=None, raw_content=False, user=None):
def file_iter(file_path, byte_range=None, raw_content=False, user=None, **kwargs):
if is_google_bucket_file_path(file_path):
for line in _google_bucket_file_iter(file_path, byte_range=byte_range, raw_content=raw_content, user=user):
for line in _google_bucket_file_iter(file_path, byte_range=byte_range, raw_content=raw_content, user=user, **kwargs):
yield line
elif byte_range:
command = 'dd skip={offset} count={size} bs=1 if={file_path} status="none"'.format(
@@ -68,11 +68,11 @@ def file_iter(file_path, byte_range=None, raw_content=False, user=None):
yield line


def _google_bucket_file_iter(gs_path, byte_range=None, raw_content=False, user=None):
def _google_bucket_file_iter(gs_path, byte_range=None, raw_content=False, user=None, **kwargs):
"""Iterate over lines in the given file"""
range_arg = ' -r {}-{}'.format(byte_range[0], byte_range[1]) if byte_range else ''
process = _run_gsutil_command(
'cat{}'.format(range_arg), gs_path, gunzip=gs_path.endswith("gz") and not raw_content, user=user)
'cat{}'.format(range_arg), gs_path, gunzip=gs_path.endswith("gz") and not raw_content, user=user, **kwargs)
for line in process.stdout:
if not raw_content:
line = line.decode('utf-8')
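The `file_utils.py` change above threads a `no_project` flag from `file_iter` through `_google_bucket_file_iter` down to `_run_gsutil_command`, so the GREGoR workspace bucket can be read without a `-u <billing-project>` argument. A minimal sketch of the resulting command construction (the default project name and lookup callable here are hypothetical stand-ins, not seqr's real `get_google_project`):

```python
def build_gsutil_command(command, gs_path, no_project=False,
                         get_google_project=lambda path: 'anvil-billing-project'):
    # Requester-pays buckets normally need a billing project (-u <project>);
    # no_project=True skips the lookup for workspaces where billing the
    # project causes the gsutil call to fail.
    google_project = get_google_project(gs_path) if not no_project else None
    project_arg = '-u {} '.format(google_project) if google_project else ''
    return 'gsutil {project_arg}{command} {gs_path}'.format(
        project_arg=project_arg, command=command, gs_path=gs_path)
```

With the default billing project the `-u` flag is included; with `no_project=True` the command degrades to a plain `gsutil cat` on the path.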
191 changes: 182 additions & 9 deletions seqr/views/apis/individual_api.py
@@ -9,19 +9,24 @@
from django.db.models import prefetch_related_objects

from reference_data.models import HumanPhenotypeOntology
from seqr.models import Individual, Family
from seqr.utils.gene_utils import get_genes
from seqr.views.utils.file_utils import save_uploaded_file, load_uploaded_file
from seqr.models import Individual, Family, CAN_VIEW
from seqr.utils.file_utils import file_iter
from seqr.utils.gene_utils import get_genes, get_gene_ids_for_gene_symbols
from seqr.views.utils.anvil_metadata_utils import PARTICIPANT_TABLE, PHENOTYPE_TABLE, EXPERIMENT_TABLE, \
EXPERIMENT_LOOKUP_TABLE, FINDINGS_TABLE, FINDING_METADATA_COLUMNS, TRANSCRIPT_FIELDS, parse_population
from seqr.views.utils.file_utils import save_uploaded_file, load_uploaded_file, parse_file
from seqr.views.utils.json_to_orm_utils import update_individual_from_json, update_model_from_json
from seqr.views.utils.json_utils import create_json_response, _to_snake_case
from seqr.views.utils.json_utils import create_json_response, _to_snake_case, _to_camel_case
from seqr.views.utils.orm_to_json_utils import _get_json_for_model, _get_json_for_individuals, add_individual_hpo_details, \
_get_json_for_families, get_json_for_rna_seq_outliers, get_project_collaborators_by_username
_get_json_for_families, get_json_for_rna_seq_outliers, get_project_collaborators_by_username, INDIVIDUAL_DISPLAY_NAME_EXPR, \
GREGOR_FINDING_TAG_TYPE
from seqr.views.utils.pedigree_info_utils import parse_pedigree_table, validate_fam_file_records, JsonConstants, ErrorsWarningsException
from seqr.views.utils.permissions_utils import get_project_and_check_permissions, check_project_permissions, \
get_project_and_check_pm_permissions, login_and_policies_required, has_project_permissions, project_has_anvil, \
is_internal_anvil_project
is_internal_anvil_project, pm_or_data_manager_required, check_workspace_perm
from seqr.views.utils.project_context_utils import add_project_tag_types
from seqr.views.utils.individual_utils import delete_individuals, add_or_update_individuals_and_families

from seqr.views.utils.variant_utils import bulk_create_tagged_variants

_SEX_TO_EXPORTED_VALUE = dict(Individual.SEX_LOOKUP)
_SEX_TO_EXPORTED_VALUE['U'] = ''
@@ -621,12 +626,16 @@ def _has_same_features(individual, present_features, absent_features):
{feature['id'] for feature in individual.absent_features or []} == set(absent_features or [])


def _parse_individual_hpo_terms(json_records, project, user):
def _get_valid_hpo_terms(json_records):
all_hpo_terms = set()
for record in json_records:
all_hpo_terms.update(record.get(FEATURES_COL, []))
all_hpo_terms.update(record.get(ABSENT_FEATURES_COL, []))
hpo_terms = set(HumanPhenotypeOntology.objects.filter(hpo_id__in=all_hpo_terms).values_list('hpo_id', flat=True))
return set(HumanPhenotypeOntology.objects.filter(hpo_id__in=all_hpo_terms).values_list('hpo_id', flat=True))


def _parse_individual_hpo_terms(json_records, project, user):
hpo_terms = _get_valid_hpo_terms(json_records)

individual_ids = [record[INDIVIDUAL_ID_COL] for record in json_records]
individual_ids += ['{}_{}'.format(record[FAMILY_ID_COL], record[INDIVIDUAL_ID_COL])
@@ -806,6 +815,170 @@ def save_individuals_metadata_table_handler(request, project_guid, upload_file_i
return create_json_response(response)


@pm_or_data_manager_required
def import_gregor_metadata(request, project_guid):
request_json = json.loads(request.body)
sample_type = request_json.get('sampleType', 'genome')
project = get_project_and_check_permissions(project_guid, request.user, can_edit=True)
workspace_meta = check_workspace_perm(
request.user, CAN_VIEW, request_json['workspaceNamespace'], request_json['workspaceName'],
meta_fields=['workspace.bucketName']
)
bucket_name = workspace_meta['workspace']['bucketName']
metadata_files_path = f'gs://{bucket_name}/data_tables'

experiment_sample_lookup = {
row['experiment_dna_short_read_id']: row['experiment_sample_id'] for row in _iter_metadata_table(
metadata_files_path, EXPERIMENT_TABLE, request.user,
lambda r: r['experiment_type'] == sample_type and r['experiment_sample_id'] != 'NA',
)
}
participant_sample_lookup = {
row['participant_id']: experiment_sample_lookup[row['id_in_table']] for row in _iter_metadata_table(
metadata_files_path, EXPERIMENT_LOOKUP_TABLE, request.user,
lambda r: r['id_in_table'] in experiment_sample_lookup and r['table_name'] == 'experiment_dna_short_read',
)
}

participant_rows = list(_iter_metadata_table(metadata_files_path, PARTICIPANT_TABLE, request.user, lambda r: True))
family_ids = {row['family_id'] for row in participant_rows if row['participant_id'] in participant_sample_lookup}
individuals_by_participant = {row['participant_id']: {
JsonConstants.INDIVIDUAL_ID_COLUMN: participant_sample_lookup.get(row['participant_id'], row['participant_id']),
**dict([_parse_participant_val(k, v, participant_sample_lookup) for k, v in row.items()]),
'population': parse_population(row),
FEATURES_COL: [],
ABSENT_FEATURES_COL: [],
} for row in participant_rows if row['family_id'] in family_ids}
individuals = individuals_by_participant.values()

warnings = validate_fam_file_records(project, individuals, clear_invalid_values=True)

for row in _iter_metadata_table(
metadata_files_path, PHENOTYPE_TABLE, request.user,
lambda r: r['participant_id'] in individuals_by_participant and r['ontology'] == 'HPO' and r['presence'] in {'Present', 'Absent'},
):
col = FEATURES_COL if row['presence'] == 'Present' else ABSENT_FEATURES_COL
individuals_by_participant[row['participant_id']][col].append(row['term_id'])
hpo_terms = _get_valid_hpo_terms(individuals)
invalid_hpo_terms = set()
for row in individuals:
invalid_hpo_terms.update(_remove_invalid_hpo_terms(row, hpo_terms))
row.update({k: INDIVIDUAL_METADATA_FIELDS[k](v) for k, v in row.items() if k in [FEATURES_COL, ABSENT_FEATURES_COL]})
if invalid_hpo_terms:
warnings.append(f"Skipped the following unrecognized HPO terms: {', '.join(sorted(invalid_hpo_terms))}")

response_json, num_created_families, num_created_individuals = add_or_update_individuals_and_families(
project, individuals, request.user, get_created_counts=True, allow_features_update=True,
)

num_updated_families = len(response_json['familiesByGuid'])
num_updated_individuals = len(response_json['individualsByGuid'])
info = [
f'Imported {len(individuals)} individuals',
f'Created {num_created_families} new families, {num_created_individuals} new individuals',
f'Updated {num_updated_families - num_created_families} existing families, {num_updated_individuals - num_created_individuals} existing individuals',
f'Skipped {len(individuals) - num_updated_individuals} unchanged individuals',
]

participant_individual_map = {
i['participant_id']: i for i in Individual.objects.annotate(
participant_id=INDIVIDUAL_DISPLAY_NAME_EXPR).filter(
family__project=project, participant_id__in=individuals_by_participant,
).values('participant_id', 'guid', 'family_id')
}

family_variant_data = {}
finding_id_map = {}
genes = set()
for row in _iter_metadata_table(
metadata_files_path, FINDINGS_TABLE, request.user,
lambda r: r['participant_id'] in participant_individual_map and r['variant_type'] == 'SNV/INDEL',
):
individual = participant_individual_map[row['participant_id']]
variant_id = '-'.join([row[col] for col in ['chrom', 'pos', 'ref', 'alt']])
key = (individual['family_id'], variant_id)
variant = {k: v for k, v in row.items() if v and v != 'NA'}
variant.update({
'pos': int(variant['pos']),
'genomeVersion': variant['variant_reference_assembly'].replace('GRCh', ''),
'transcripts': {},
'transcript': {
config.get('seqr_field', k): variant.pop(k, None) for k, config in TRANSCRIPT_FIELDS.items()
},
'genotypes': {individual['guid']: {'numAlt': 2 if variant['zygosity'] == 'Homozygous' else 1}},
'support_vars': [],
})
family_variant_data[key] = variant
genes.add(variant['gene'])
finding_id_map[variant['genetic_findings_id']] = variant_id

gene_symbols_to_ids = {k: v[0] for k, v in get_gene_ids_for_gene_symbols(genes).items()}
missing_genes = set()
for variant in family_variant_data.values():
gene = variant['gene']
transcript = variant.pop('transcript')
if gene in gene_symbols_to_ids:
variant.update({
'transcripts': {gene_symbols_to_ids[gene]: [transcript]},
'mainTranscriptId': transcript['transcriptId'],
})
else:
missing_genes.add(gene)
if variant.get('linked_variant') in finding_id_map:
variant['support_vars'].append(finding_id_map[variant['linked_variant']])

if missing_genes:
warnings.append(f'The following unknown genes were omitted in the findings tags: {", ".join(sorted(missing_genes))}')

num_new, num_updated = bulk_create_tagged_variants(
family_variant_data, tag_name=GREGOR_FINDING_TAG_TYPE, user=request.user, project=project,
get_metadata=lambda v: {k: v[k] for k in FINDING_METADATA_COLUMNS if k in v}
)
info.append(f'Loaded {num_new} new and {num_updated} updated findings tags')

response_json['projectsByGuid'] = {project_guid: {}}
response_json['familyTagTypeCounts'] = add_project_tag_types(response_json['projectsByGuid'], add_counts=True)

response_json['importStats'] = {'gregorMetadata': {'info': info, 'warnings': warnings}}
return create_json_response(response_json)


def _iter_metadata_table(file_path, table_name, user, filter_row):
file_name = f'{file_path}/{table_name}.tsv'
@jklugherz (Contributor), Mar 22, 2024: Do these files come from the gregor report generated by seqr?

@hanars (Collaborator, Author), Mar 25, 2024: Kind of. All the centers submit the same reports and then the DCC makes them into one big multi-site report. But in general, we want the schema for the report we generate to match the schema for the metadata we import here
file_rows = parse_file(file_name, file_iter(file_name, user=user, no_project=True), iter_file=True)
header = next(file_rows)
for row in file_rows:
row_dict = dict(zip(header, row))
if filter_row(row_dict):
yield row_dict
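The generator above zips each TSV row against the header and yields only the rows matching the caller's predicate. A self-contained stand-in for that pattern (the real `_iter_metadata_table` streams lines from the GCS bucket via `file_iter`/`parse_file`; here any iterable of lines works):

```python
import io

def iter_table_rows(tsv_lines, filter_row):
    # Split each tab-separated line, pair values with the header row,
    # and yield only dicts that pass the filter — same shape as
    # seqr's _iter_metadata_table, minus the GCS streaming.
    rows = (line.rstrip('\n').split('\t') for line in tsv_lines)
    header = next(rows)
    for row in rows:
        row_dict = dict(zip(header, row))
        if filter_row(row_dict):
            yield row_dict

tsv = 'participant_id\tontology\nP1\tHPO\nP2\tMONDO\n'
hpo_rows = list(iter_table_rows(io.StringIO(tsv), lambda r: r['ontology'] == 'HPO'))
```

This keeps memory flat for large metadata tables, since rows are filtered as they stream rather than loaded up front.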


GREGOR_PARTICIPANT_COLUMN_MAP = {
'participant_id': 'displayName',
'affected_status': JsonConstants.AFFECTED_COLUMN,
'phenotype_description': JsonConstants.CODED_PHENOTYPE_COLUMN,
}
ENUM_COLUMNS = {
column: {v: k for k, v in choices} for column, choices in [
(JsonConstants.SEX_COLUMN, Individual.SEX_CHOICES),
(JsonConstants.AFFECTED_COLUMN, Individual.AFFECTED_STATUS_CHOICES),
(JsonConstants.PROBAND_RELATIONSHIP, Individual.RELATIONSHIP_CHOICES),
]
}


def _parse_participant_val(column, value, participant_sample_lookup):
column = GREGOR_PARTICIPANT_COLUMN_MAP.get(column, _to_camel_case(column))
if column in ENUM_COLUMNS and value:
value = ENUM_COLUMNS[column].get(value, 'U')
if column in {JsonConstants.MATERNAL_ID_COLUMN, JsonConstants.PATERNAL_ID_COLUMN}:
if value == '0':
value = None
elif value in participant_sample_lookup:
value = participant_sample_lookup[value]
return column, value
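`_parse_participant_val` reverses the model's `(code, label)` choice tuples so human-readable GREGoR values map back to single-letter codes, and treats `'0'` parent IDs as "no parent". A sketch of both behaviors under assumed choice tuples (the real ones live on seqr's `Individual` model):

```python
# Assumed subset of Individual.SEX_CHOICES for illustration only.
SEX_CHOICES = [('M', 'Male'), ('F', 'Female'), ('U', 'Unknown')]
SEX_LOOKUP = {label: code for code, label in SEX_CHOICES}

def parse_enum_value(value, lookup):
    # Map a human-readable metadata value to its model code,
    # defaulting unrecognized non-empty values to 'U' (unknown).
    return lookup.get(value, 'U') if value else value

def parse_parent_id(value, sample_lookup):
    # '0' means "no parent" in the GREGoR pedigree tables; otherwise
    # remap participant IDs to experiment sample IDs when available.
    if value == '0':
        return None
    return sample_lookup.get(value, value)
```

The fallback to `'U'` mirrors the diff's `ENUM_COLUMNS[column].get(value, 'U')`, so a novel enum label degrades to "unknown" rather than raising.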


@login_and_policies_required
def get_individual_rna_seq_data(request, individual_guid):
individual = Individual.objects.get(guid=individual_guid)