Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gregor metadata airtable update #3556

Merged
merged 20 commits into from
Aug 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions seqr/fixtures/1kg_project.json
Original file line number Diff line number Diff line change
Expand Up @@ -698,7 +698,7 @@
"individual_id": "NA20888",
"mother_id": null,
"father_id": null,
"sex": "M",
"sex": "F",
"affected": "A",
"display_name": "",
"notes": "",
Expand Down Expand Up @@ -1144,7 +1144,7 @@
"last_modified_date": "2017-03-13T09:07:50.277Z",

"sample_id": "NA20888",
"sample_type": "WES",
"sample_type": "WGS",
"is_active": false,
"individual": 16,
"dataset_type": "VARIANTS",
Expand Down
222 changes: 163 additions & 59 deletions seqr/views/apis/report_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from django.db.models.functions import Replace, JSONObject
from django.utils import timezone
import json
import re
import requests

from seqr.utils.file_utils import is_google_bucket_file_path, does_file_exist
Expand Down Expand Up @@ -657,7 +658,10 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False):

# GREGoR metadata

GREGOR_DATA_TYPES = ['wgs', 'wes', 'rna']
SMID_FIELD = 'SMID'
PARTICIPANT_ID_FIELD = 'CollaboratorParticipantID'
COLLABORATOR_SAMPLE_ID_FIELD = 'CollaboratorSampleID'
PARTICIPANT_TABLE_COLUMNS = [
'participant_id', 'internal_project_id', 'gregor_center', 'consent_code', 'recontactable', 'prior_testing',
'pmid_id', 'family_id', 'paternal_id', 'maternal_id', 'twin_id', 'proband_relationship',
Expand All @@ -684,30 +688,59 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False):
EXPERIMENT_TABLE_COLUMNS = [
'experiment_dna_short_read_id', 'analyte_id', 'experiment_sample_id',
] + EXPERIMENT_TABLE_AIRTABLE_FIELDS
EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS = [
'library_prep_type', 'single_or_paired_ends', 'within_site_batch_name', 'RIN', 'estimated_library_size',
'total_reads', 'percent_rRNA', 'percent_mRNA', '5prime3prime_bias',
]
EXPERIMENT_RNA_TABLE_COLUMNS = ['experiment_rna_short_read_id'] + [
c for c in EXPERIMENT_TABLE_COLUMNS[1:] if not c.startswith('target')] + EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS + [
'percent_mtRNA', 'percent_Globin', 'percent_UMI', 'percent_GC', 'percent_chrX_Y',
]
EXPERIMENT_LOOKUP_TABLE_COLUMNS = ['experiment_id', 'table_name', 'id_in_table', 'participant_id']
READ_TABLE_AIRTABLE_FIELDS = [
'aligned_dna_short_read_file', 'aligned_dna_short_read_index_file', 'md5sum', 'reference_assembly',
'alignment_software', 'mean_coverage', 'analysis_details',
'mean_coverage', 'alignment_software', 'analysis_details',
]
READ_TABLE_COLUMNS = ['aligned_dna_short_read_id', 'experiment_dna_short_read_id'] + READ_TABLE_AIRTABLE_FIELDS + ['quality_issues']
READ_TABLE_COLUMNS.insert(6, 'reference_assembly_details')
READ_TABLE_COLUMNS.insert(6, 'reference_assembly_uri')
READ_RNA_TABLE_AIRTABLE_ID_FIELDS = ['aligned_rna_short_read_file', 'aligned_rna_short_read_index_file']
READ_RNA_TABLE_AIRTABLE_FIELDS = [
'gene_annotation', 'alignment_software', 'alignment_log_file', 'percent_uniquely_aligned', 'percent_multimapped', 'percent_unaligned',
]
READ_RNA_TABLE_COLUMNS = ['aligned_rna_short_read_id', 'experiment_rna_short_read_id'] + \
READ_RNA_TABLE_AIRTABLE_ID_FIELDS + READ_TABLE_COLUMNS[4:-3] + READ_RNA_TABLE_AIRTABLE_FIELDS + ['quality_issues']
READ_RNA_TABLE_COLUMNS.insert(READ_RNA_TABLE_COLUMNS.index('gene_annotation')+1, 'gene_annotation_details')
READ_RNA_TABLE_COLUMNS.insert(READ_RNA_TABLE_COLUMNS.index('alignment_log_file')+1, 'alignment_postprocessing')
READ_SET_TABLE_COLUMNS = ['aligned_dna_short_read_set_id', 'aligned_dna_short_read_id']
CALLED_TABLE_COLUMNS = [
'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum',
'caller_software', 'variant_types', 'analysis_details',
]
ALL_AIRTABLE_COLUMNS = EXPERIMENT_TABLE_AIRTABLE_FIELDS + READ_TABLE_AIRTABLE_FIELDS + CALLED_TABLE_COLUMNS

TABLE_COLUMNS = {
'participant': PARTICIPANT_TABLE_COLUMNS,
'family': GREGOR_FAMILY_TABLE_COLUMNS,
'phenotype': PHENOTYPE_TABLE_COLUMNS,
'analyte': ANALYTE_TABLE_COLUMNS,
'experiment_dna_short_read': EXPERIMENT_TABLE_COLUMNS,
'aligned_dna_short_read': READ_TABLE_COLUMNS,
'aligned_dna_short_read_set': READ_SET_TABLE_COLUMNS,
'called_variants_dna_short_read': CALLED_TABLE_COLUMNS,

RNA_ONLY = EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS + READ_RNA_TABLE_AIRTABLE_FIELDS + ['reference_assembly_uri']
DATA_TYPE_OMIT = {
'wgs': ['targeted_regions_method'] + RNA_ONLY, 'wes': RNA_ONLY, 'rna': [
'targeted_regions_method', 'target_insert_size', 'mean_coverage', 'aligned_dna_short_read_file',
'aligned_dna_short_read_index_file',
],
}
NO_DATA_TYPE_FIELDS = {
'targeted_region_bed_file', 'reference_assembly', 'analysis_details', 'percent_rRNA', 'percent_mRNA',
'alignment_software_dna',
Comment on lines +721 to +730
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is worth defining constants for these field names.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why? We don't for the other 7 tables already in this report, why would we do it for this new one

}
NO_DATA_TYPE_FIELDS.update(READ_RNA_TABLE_AIRTABLE_ID_FIELDS)

DATA_TYPE_AIRTABLE_COLUMNS = EXPERIMENT_TABLE_AIRTABLE_FIELDS + READ_TABLE_AIRTABLE_FIELDS + RNA_ONLY + [
COLLABORATOR_SAMPLE_ID_FIELD, SMID_FIELD]
ALL_AIRTABLE_COLUMNS = DATA_TYPE_AIRTABLE_COLUMNS + CALLED_TABLE_COLUMNS
AIRTABLE_QUERY_COLUMNS = set(CALLED_TABLE_COLUMNS)
AIRTABLE_QUERY_COLUMNS.remove('md5sum')
AIRTABLE_QUERY_COLUMNS.update(NO_DATA_TYPE_FIELDS)
for data_type in GREGOR_DATA_TYPES:
data_type_columns = set(DATA_TYPE_AIRTABLE_COLUMNS) - NO_DATA_TYPE_FIELDS - set(DATA_TYPE_OMIT[data_type])
AIRTABLE_QUERY_COLUMNS.update({f'{field}_{data_type}' for field in data_type_columns})

WARN_MISSING_TABLE_COLUMNS = {
'participant': ['recontactable', 'reported_race', 'affected_status', 'phenotype_description', 'age_at_enrollment'],
}
Expand Down Expand Up @@ -791,18 +824,34 @@ def gregor_export(request):
consent_code=consent_code[0],
projectcategory__name='GREGoR',
)
individuals = Individual.objects.filter(
sample__in=get_search_samples(projects, active_only=False),
).distinct().prefetch_related('family__project', 'mother', 'father')
sample_types = Sample.objects.filter(individual__family__project__in=projects).values_list('individual_id', 'sample_type')
individual_data_types = defaultdict(set)
for individual_db_id, sample_type in sample_types:
individual_data_types[individual_db_id].add(sample_type)
individuals = Individual.objects.filter(id__in=individual_data_types).prefetch_related(
Comment on lines +828 to +831
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interestingly, we don't have to use individual_data_types.keys() for Individual.objects.filter(id__in=individual_data_types).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thats just how iterating a dictionary in python3 works

'family__project', 'mother', 'father')

grouped_data_type_individuals = defaultdict(dict)
for i in individuals:
grouped_data_type_individuals[i.individual_id].update({data_type: i for data_type in individual_data_types[i.id]})

airtable_sample_records, airtable_metadata_by_smid = _get_gregor_airtable_data(individuals, request.user)
airtable_sample_records, airtable_metadata_by_participant = _get_gregor_airtable_data(
grouped_data_type_individuals.keys(), request.user)

participant_rows = []
family_map = {}
phenotype_rows = []
analyte_rows = []
airtable_rows = []
for individual in individuals:
airtable_rna_rows = []
experiment_lookup_rows = []
for data_type_individuals in grouped_data_type_individuals.values():
# If multiple individual records, prefer WGS
individual = next(
data_type_individuals[data_type.upper()] for data_type in GREGOR_DATA_TYPES
if data_type_individuals.get(data_type.upper())
)

# family table
family = individual.family
if family not in family_map:
Expand Down Expand Up @@ -832,29 +881,43 @@ def gregor_export(request):
dict(**base_phenotype_row, **_get_phenotype_row(feature)) for feature in individual.absent_features or []
]

analyte_id = None
analyte_ids = set()
# airtable data
if airtable_sample:
sm_id = airtable_sample[SMID_FIELD]
analyte_id = f'Broad_{sm_id}'
airtable_metadata = airtable_metadata_by_smid.get(sm_id)
if airtable_metadata:
experiment_ids = _get_experiment_ids(airtable_sample, airtable_metadata)
airtable_rows.append(dict(analyte_id=analyte_id, **airtable_metadata, **experiment_ids))
airtable_metadata = airtable_metadata_by_participant.get(airtable_sample[PARTICIPANT_ID_FIELD]) or {}
for data_type in data_type_individuals:
if data_type not in airtable_metadata:
continue
row = _get_airtable_row(data_type, airtable_metadata)
analyte_ids.add(row['analyte_id'])
is_rna = data_type == 'RNA'
if not is_rna:
row['alignment_software'] = row['alignment_software_dna']
(airtable_rna_rows if is_rna else airtable_rows).append(row)
experiment_lookup_rows.append(
{'participant_id': participant_id, **_get_experiment_lookup_row(is_rna, row)}
)

# analyte table
analyte_rows.append(dict(participant_id=participant_id, analyte_id=analyte_id, **_get_analyte_row(individual)))

files, warnings = _get_validated_gregor_files([
('participant', participant_rows),
('family', list(family_map.values())),
('phenotype', phenotype_rows),
('analyte', analyte_rows),
('experiment_dna_short_read', airtable_rows),
('aligned_dna_short_read', airtable_rows),
('aligned_dna_short_read_set', airtable_rows),
('called_variants_dna_short_read', airtable_rows),
])
if not analyte_ids:
analyte_ids.add(_get_analyte_id(airtable_sample))
for analyte_id in analyte_ids:
analyte_rows.append(dict(participant_id=participant_id, analyte_id=analyte_id, **_get_analyte_row(individual)))

files = [
('participant', PARTICIPANT_TABLE_COLUMNS, participant_rows),
('family', GREGOR_FAMILY_TABLE_COLUMNS, list(family_map.values())),
('phenotype', PHENOTYPE_TABLE_COLUMNS, phenotype_rows),
('analyte', ANALYTE_TABLE_COLUMNS, analyte_rows),
('experiment_dna_short_read', EXPERIMENT_TABLE_COLUMNS, airtable_rows),
('aligned_dna_short_read', READ_TABLE_COLUMNS, airtable_rows),
('aligned_dna_short_read_set', READ_SET_TABLE_COLUMNS, airtable_rows),
('called_variants_dna_short_read', CALLED_TABLE_COLUMNS, airtable_rows),
('experiment_rna_short_read', EXPERIMENT_RNA_TABLE_COLUMNS, airtable_rna_rows),
('aligned_rna_short_read', READ_RNA_TABLE_COLUMNS, airtable_rna_rows),
('experiment', EXPERIMENT_LOOKUP_TABLE_COLUMNS, experiment_lookup_rows),
]
warnings = _validate_gregor_files(files)
write_multiple_files_to_gs(files, file_path, request.user, file_format='tsv')

return create_json_response({
Expand All @@ -863,21 +926,25 @@ def gregor_export(request):
})


def _get_gregor_airtable_data(individuals, user):
def _get_gregor_airtable_data(individual_ids, user):
sample_records, session = _get_airtable_samples(
individuals.order_by('individual_id').values_list('individual_id', flat=True), user,
fields=[SMID_FIELD, 'CollaboratorSampleID', 'Recontactable'],
individual_ids, user, fields=[SMID_FIELD, PARTICIPANT_ID_FIELD, 'Recontactable'],
)

fields = ALL_AIRTABLE_COLUMNS
airtable_metadata = session.fetch_records(
'GREGoR Data Model',
fields=[SMID_FIELD] + sorted(fields),
or_filters={f'{SMID_FIELD}': {r[SMID_FIELD] for r in sample_records.values()}},
fields=[PARTICIPANT_ID_FIELD] + sorted(AIRTABLE_QUERY_COLUMNS),
or_filters={f'{PARTICIPANT_ID_FIELD}': {r[PARTICIPANT_ID_FIELD] for r in sample_records.values()}},
)
airtable_metadata_by_smid = {r[SMID_FIELD]: r for r in airtable_metadata.values()}

return sample_records, airtable_metadata_by_smid
airtable_metadata_by_participant = {r[PARTICIPANT_ID_FIELD]: r for r in airtable_metadata.values()}
for data_type in GREGOR_DATA_TYPES:
for r in airtable_metadata_by_participant.values():
data_type_fields = [f for f in r if f.endswith(f'_{data_type}')]
if data_type_fields:
r[data_type.upper()] = {f.replace(f'_{data_type}', ''): r.pop(f) for f in data_type_fields}

return sample_records, airtable_metadata_by_participant


def _get_gregor_family_row(family):
Expand Down Expand Up @@ -934,37 +1001,58 @@ def _get_analyte_row(individual):
}


def _get_experiment_ids(airtable_sample, airtable_metadata):
collaborator_sample_id = airtable_sample['CollaboratorSampleID']
experiment_dna_short_read_id = f'Broad_{airtable_metadata.get("experiment_type", "NA")}_{collaborator_sample_id}'
def _get_airtable_row(data_type, airtable_metadata):
data_type_metadata = airtable_metadata[data_type]
collaborator_sample_id = data_type_metadata[COLLABORATOR_SAMPLE_ID_FIELD]
experiment_short_read_id = f'Broad_{data_type_metadata.get("experiment_type", "NA")}_{collaborator_sample_id}'
aligned_short_read_id = f'{experiment_short_read_id}_1'
return {
'experiment_dna_short_read_id': experiment_dna_short_read_id,
'analyte_id': _get_analyte_id(data_type_metadata),
'experiment_dna_short_read_id': experiment_short_read_id,
'experiment_rna_short_read_id': experiment_short_read_id,
'experiment_sample_id': collaborator_sample_id,
'aligned_dna_short_read_id': f'{experiment_dna_short_read_id}_1'
'aligned_dna_short_read_id': aligned_short_read_id,
'aligned_rna_short_read_id': aligned_short_read_id,
**airtable_metadata,
**data_type_metadata,
}


def _get_analyte_id(airtable_metadata):
sm_id = airtable_metadata.get(SMID_FIELD)
return f'Broad_{sm_id}' if sm_id else None


def _get_experiment_lookup_row(is_rna, row_data):
table_name = f'experiment_{"rna" if is_rna else "dna"}_short_read'
id_in_table = row_data[f'{table_name}_id']
return {
'table_name': table_name,
'id_in_table': id_in_table,
'experiment_id': f'{table_name}.{id_in_table}',
}


def _get_validated_gregor_files(file_data):
def _validate_gregor_files(file_data):
errors = []
warnings = []
try:
validators, required_tables = _load_data_model_validators()
except Exception as e:
warnings.append(f'Unable to load data model for validation: {e}')
validators = {}
required_tables = set()
required_tables = {}

missing_tables = required_tables.difference({f[0] for f in file_data})
tables = {f[0] for f in file_data}
missing_tables = [
table for table, validator in required_tables.items() if not _has_required_table(table, validator, tables)
]
if missing_tables:
warnings.append(
f'The following tables are required in the data model but absent from the reports: {", ".join(missing_tables)}'
)

files = []
for file_name, data in file_data:
columns = TABLE_COLUMNS[file_name]
files.append([file_name, columns, data])

for file_name, columns, data in file_data:
table_validator = validators.get(file_name)
if not table_validator:
warnings.append(f'No data model found for "{file_name}" table so no validation was performed')
Expand Down Expand Up @@ -992,7 +1080,7 @@ def _get_validated_gregor_files(file_data):
if errors:
raise ErrorsWarningsException(errors, warnings)

return files, warnings
return warnings


def _load_data_model_validators():
Expand All @@ -1003,10 +1091,26 @@ def _load_data_model_validators():
t['table']: {c['column']: c for c in t['columns']}
for t in table_models
}
required_tables = {t['table'] for t in table_models if t.get('required')}
required_tables = {t['table']: _parse_table_required(t['required']) for t in table_models if t.get('required')}
return validators, required_tables


def _parse_table_required(required_validator):
if required_validator is True:
return True

match = re.match(r'CONDITIONAL \(([\w+(\s,)?]+)\)', required_validator)
return match and match.group(1).split(', ')


def _has_required_table(table, validator, tables):
if table in tables:
return True
if validator is True:
return False
return tables.isdisjoint(validator)


def _validate_column_data(column, file_name, data, column_validator, warnings, errors):
enum = column_validator.get('enumerations')
required = column_validator.get('required')
Expand Down
Loading
Loading