-
Notifications
You must be signed in to change notification settings - Fork 88
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Populate from gregor metadata #3990
Changes from all commits
d685da2
ee4375c
1e08b1a
32a34d6
8428f5a
ead4588
15187fb
da30607
4e29fb0
3c6a49e
a6b1ea9
22b3c83
3154433
0fd09ad
f075d9b
64fb268
6916156
50a99db
cd8fecd
45ccce0
71a425a
5e2223d
20c5c5c
5944882
b79d40c
e06f427
ceffcff
9b3fc56
fea3035
c2f2295
08427bb
118361e
8073336
340c605
2bda5c4
0a321f3
da3e996
54d88f6
cffa82f
6146d7d
ef2059a
4b9350f
134b4a7
4267dcc
618c7eb
d2a1c6a
2f34287
e49f7ea
8dd3774
bd82ee1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,19 +9,24 @@ | |
from django.db.models import prefetch_related_objects | ||
|
||
from reference_data.models import HumanPhenotypeOntology | ||
from seqr.models import Individual, Family | ||
from seqr.utils.gene_utils import get_genes | ||
from seqr.views.utils.file_utils import save_uploaded_file, load_uploaded_file | ||
from seqr.models import Individual, Family, CAN_VIEW | ||
from seqr.utils.file_utils import file_iter | ||
from seqr.utils.gene_utils import get_genes, get_gene_ids_for_gene_symbols | ||
from seqr.views.utils.anvil_metadata_utils import PARTICIPANT_TABLE, PHENOTYPE_TABLE, EXPERIMENT_TABLE, \ | ||
EXPERIMENT_LOOKUP_TABLE, FINDINGS_TABLE, FINDING_METADATA_COLUMNS, TRANSCRIPT_FIELDS, parse_population | ||
from seqr.views.utils.file_utils import save_uploaded_file, load_uploaded_file, parse_file | ||
from seqr.views.utils.json_to_orm_utils import update_individual_from_json, update_model_from_json | ||
from seqr.views.utils.json_utils import create_json_response, _to_snake_case | ||
from seqr.views.utils.json_utils import create_json_response, _to_snake_case, _to_camel_case | ||
from seqr.views.utils.orm_to_json_utils import _get_json_for_model, _get_json_for_individuals, add_individual_hpo_details, \ | ||
_get_json_for_families, get_json_for_rna_seq_outliers, get_project_collaborators_by_username | ||
_get_json_for_families, get_json_for_rna_seq_outliers, get_project_collaborators_by_username, INDIVIDUAL_DISPLAY_NAME_EXPR, \ | ||
GREGOR_FINDING_TAG_TYPE | ||
from seqr.views.utils.pedigree_info_utils import parse_pedigree_table, validate_fam_file_records, JsonConstants, ErrorsWarningsException | ||
from seqr.views.utils.permissions_utils import get_project_and_check_permissions, check_project_permissions, \ | ||
get_project_and_check_pm_permissions, login_and_policies_required, has_project_permissions, project_has_anvil, \ | ||
is_internal_anvil_project | ||
is_internal_anvil_project, pm_or_data_manager_required, check_workspace_perm | ||
from seqr.views.utils.project_context_utils import add_project_tag_types | ||
from seqr.views.utils.individual_utils import delete_individuals, add_or_update_individuals_and_families | ||
|
||
from seqr.views.utils.variant_utils import bulk_create_tagged_variants | ||
|
||
_SEX_TO_EXPORTED_VALUE = dict(Individual.SEX_LOOKUP) | ||
_SEX_TO_EXPORTED_VALUE['U'] = '' | ||
|
@@ -621,12 +626,16 @@ def _has_same_features(individual, present_features, absent_features): | |
{feature['id'] for feature in individual.absent_features or []} == set(absent_features or []) | ||
|
||
|
||
def _parse_individual_hpo_terms(json_records, project, user): | ||
def _get_valid_hpo_terms(json_records): | ||
all_hpo_terms = set() | ||
for record in json_records: | ||
all_hpo_terms.update(record.get(FEATURES_COL, [])) | ||
all_hpo_terms.update(record.get(ABSENT_FEATURES_COL, [])) | ||
hpo_terms = set(HumanPhenotypeOntology.objects.filter(hpo_id__in=all_hpo_terms).values_list('hpo_id', flat=True)) | ||
return set(HumanPhenotypeOntology.objects.filter(hpo_id__in=all_hpo_terms).values_list('hpo_id', flat=True)) | ||
|
||
|
||
def _parse_individual_hpo_terms(json_records, project, user): | ||
hpo_terms = _get_valid_hpo_terms(json_records) | ||
|
||
individual_ids = [record[INDIVIDUAL_ID_COL] for record in json_records] | ||
individual_ids += ['{}_{}'.format(record[FAMILY_ID_COL], record[INDIVIDUAL_ID_COL]) | ||
|
@@ -806,6 +815,170 @@ def save_individuals_metadata_table_handler(request, project_guid, upload_file_i | |
return create_json_response(response) | ||
|
||
|
||
@pm_or_data_manager_required | ||
def import_gregor_metadata(request, project_guid): | ||
request_json = json.loads(request.body) | ||
sample_type = request_json.get('sampleType', 'genome') | ||
project = get_project_and_check_permissions(project_guid, request.user, can_edit=True) | ||
workspace_meta = check_workspace_perm( | ||
request.user, CAN_VIEW, request_json['workspaceNamespace'], request_json['workspaceName'], | ||
meta_fields=['workspace.bucketName'] | ||
) | ||
bucket_name = workspace_meta['workspace']['bucketName'] | ||
metadata_files_path = f'gs://{bucket_name}/data_tables' | ||
|
||
experiment_sample_lookup = { | ||
row['experiment_dna_short_read_id']: row['experiment_sample_id'] for row in _iter_metadata_table( | ||
metadata_files_path, EXPERIMENT_TABLE, request.user, | ||
lambda r: r['experiment_type'] == sample_type and r['experiment_sample_id'] != 'NA', | ||
) | ||
} | ||
participant_sample_lookup = { | ||
row['participant_id']: experiment_sample_lookup[row['id_in_table']] for row in _iter_metadata_table( | ||
metadata_files_path, EXPERIMENT_LOOKUP_TABLE, request.user, | ||
lambda r: r['id_in_table'] in experiment_sample_lookup and r['table_name'] == 'experiment_dna_short_read', | ||
) | ||
} | ||
|
||
participant_rows = list(_iter_metadata_table(metadata_files_path, PARTICIPANT_TABLE, request.user, lambda r: True)) | ||
family_ids = {row['family_id'] for row in participant_rows if row['participant_id'] in participant_sample_lookup} | ||
individuals_by_participant = {row['participant_id']: { | ||
JsonConstants.INDIVIDUAL_ID_COLUMN: participant_sample_lookup.get(row['participant_id'], row['participant_id']), | ||
**dict([_parse_participant_val(k, v, participant_sample_lookup) for k, v in row.items()]), | ||
'population': parse_population(row), | ||
FEATURES_COL: [], | ||
ABSENT_FEATURES_COL: [], | ||
} for row in participant_rows if row['family_id'] in family_ids} | ||
individuals = individuals_by_participant.values() | ||
|
||
warnings = validate_fam_file_records(project, individuals, clear_invalid_values=True) | ||
|
||
for row in _iter_metadata_table( | ||
metadata_files_path, PHENOTYPE_TABLE, request.user, | ||
lambda r: r['participant_id'] in individuals_by_participant and r['ontology'] == 'HPO' and r['presence'] in {'Present', 'Absent'}, | ||
): | ||
col = FEATURES_COL if row['presence'] == 'Present' else ABSENT_FEATURES_COL | ||
individuals_by_participant[row['participant_id']][col].append(row['term_id']) | ||
hpo_terms = _get_valid_hpo_terms(individuals) | ||
invalid_hpo_terms = set() | ||
for row in individuals: | ||
invalid_hpo_terms.update(_remove_invalid_hpo_terms(row, hpo_terms)) | ||
row.update({k: INDIVIDUAL_METADATA_FIELDS[k](v) for k, v in row.items() if k in [FEATURES_COL, ABSENT_FEATURES_COL]}) | ||
if invalid_hpo_terms: | ||
warnings.append(f"Skipped the following unrecognized HPO terms: {', '.join(sorted(invalid_hpo_terms))}") | ||
|
||
response_json, num_created_families, num_created_individuals = add_or_update_individuals_and_families( | ||
project, individuals, request.user, get_created_counts=True, allow_features_update=True, | ||
) | ||
|
||
num_updated_families = len(response_json['familiesByGuid']) | ||
num_updated_individuals = len(response_json['individualsByGuid']) | ||
info = [ | ||
f'Imported {len(individuals)} individuals', | ||
f'Created {num_created_families} new families, {num_created_individuals} new individuals', | ||
f'Updated {num_updated_families - num_created_families} existing families, {num_updated_individuals - num_created_individuals} existing individuals', | ||
f'Skipped {len(individuals) - num_updated_individuals} unchanged individuals', | ||
] | ||
|
||
participant_individual_map = { | ||
i['participant_id']: i for i in Individual.objects.annotate( | ||
participant_id=INDIVIDUAL_DISPLAY_NAME_EXPR).filter( | ||
family__project=project, participant_id__in=individuals_by_participant, | ||
).values('participant_id', 'guid', 'family_id') | ||
} | ||
|
||
family_variant_data = {} | ||
finding_id_map = {} | ||
genes = set() | ||
for row in _iter_metadata_table( | ||
metadata_files_path, FINDINGS_TABLE, request.user, | ||
lambda r: r['participant_id'] in participant_individual_map and r['variant_type'] == 'SNV/INDEL', | ||
): | ||
individual = participant_individual_map[row['participant_id']] | ||
variant_id = '-'.join([row[col] for col in ['chrom', 'pos', 'ref', 'alt']]) | ||
key = (individual['family_id'], variant_id) | ||
variant = {k: v for k, v in row.items() if v and v != 'NA'} | ||
variant.update({ | ||
'pos': int(variant['pos']), | ||
'genomeVersion': variant['variant_reference_assembly'].replace('GRCh', ''), | ||
'transcripts': {}, | ||
'transcript': { | ||
config.get('seqr_field', k): variant.pop(k, None) for k, config in TRANSCRIPT_FIELDS.items() | ||
}, | ||
'genotypes': {individual['guid']: {'numAlt': 2 if variant['zygosity'] == 'Homozygous' else 1}}, | ||
'support_vars': [], | ||
}) | ||
family_variant_data[key] = variant | ||
genes.add(variant['gene']) | ||
finding_id_map[variant['genetic_findings_id']] = variant_id | ||
|
||
gene_symbols_to_ids = {k: v[0] for k, v in get_gene_ids_for_gene_symbols(genes).items()} | ||
missing_genes = set() | ||
for variant in family_variant_data.values(): | ||
gene = variant['gene'] | ||
transcript = variant.pop('transcript') | ||
if gene in gene_symbols_to_ids: | ||
variant.update({ | ||
'transcripts': {gene_symbols_to_ids[gene]: [transcript]}, | ||
'mainTranscriptId': transcript['transcriptId'], | ||
}) | ||
else: | ||
missing_genes.add(gene) | ||
if variant.get('linked_variant') in finding_id_map: | ||
variant['support_vars'].append(finding_id_map[variant['linked_variant']]) | ||
|
||
if missing_genes: | ||
warnings.append(f'The following unknown genes were omitted in the findings tags: {", ".join(sorted(missing_genes))}') | ||
|
||
num_new, num_updated = bulk_create_tagged_variants( | ||
family_variant_data, tag_name=GREGOR_FINDING_TAG_TYPE, user=request.user, project=project, | ||
get_metadata=lambda v: {k: v[k] for k in FINDING_METADATA_COLUMNS if k in v} | ||
) | ||
info.append(f'Loaded {num_new} new and {num_updated} updated findings tags') | ||
|
||
response_json['projectsByGuid'] = {project_guid: {}} | ||
response_json['familyTagTypeCounts'] = add_project_tag_types(response_json['projectsByGuid'], add_counts=True) | ||
|
||
response_json['importStats'] = {'gregorMetadata': {'info': info, 'warnings': warnings}} | ||
return create_json_response(response_json) | ||
|
||
|
||
def _iter_metadata_table(file_path, table_name, user, filter_row): | ||
file_name = f'{file_path}/{table_name}.tsv' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do these files come from the gregor report generated by seqr? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Kind of. All the centers submit the same reports and then the DCC makes them into one big multi-site report. But in general, we want the schema for the report we generate to match the schema for the metadata we import here |
||
file_rows = parse_file(file_name, file_iter(file_name, user=user, no_project=True), iter_file=True) | ||
header = next(file_rows) | ||
for row in file_rows: | ||
row_dict = dict(zip(header, row)) | ||
if filter_row(row_dict): | ||
yield row_dict | ||
|
||
|
||
GREGOR_PARTICIPANT_COLUMN_MAP = { | ||
'participant_id': 'displayName', | ||
'affected_status': JsonConstants.AFFECTED_COLUMN, | ||
'phenotype_description': JsonConstants.CODED_PHENOTYPE_COLUMN, | ||
} | ||
ENUM_COLUMNS = { | ||
column: {v: k for k, v in choices} for column, choices in [ | ||
(JsonConstants.SEX_COLUMN, Individual.SEX_CHOICES), | ||
(JsonConstants.AFFECTED_COLUMN, Individual.AFFECTED_STATUS_CHOICES), | ||
(JsonConstants.PROBAND_RELATIONSHIP, Individual.RELATIONSHIP_CHOICES), | ||
] | ||
} | ||
|
||
|
||
def _parse_participant_val(column, value, participant_sample_lookup): | ||
column = GREGOR_PARTICIPANT_COLUMN_MAP.get(column, _to_camel_case(column)) | ||
if column in ENUM_COLUMNS and value: | ||
value = ENUM_COLUMNS[column].get(value, 'U') | ||
if column in {JsonConstants.MATERNAL_ID_COLUMN, JsonConstants.PATERNAL_ID_COLUMN}: | ||
if value == '0': | ||
value = None | ||
elif value in participant_sample_lookup: | ||
value = participant_sample_lookup[value] | ||
return column, value | ||
|
||
|
||
@login_and_policies_required | ||
def get_individual_rna_seq_data(request, individual_guid): | ||
individual = Individual.objects.get(guid=individual_guid) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why aren't we including a billing project when importing the gregor metadata file?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
because that workspace is a special snowflake somehow and including the billing project causes it to fail shrug