-
Notifications
You must be signed in to change notification settings - Fork 88
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Gregor metadata airtable update #3556
Changes from all commits
b22f5fc
ca2322d
1bc7cb0
1912dc6
c7d56a4
6d55da7
8ab2004
a31e7ed
b01440a
5ef7e05
f18f909
faa5223
3d0516b
3fb1aeb
a30987e
dd5a245
9d5b3ce
9ea6297
a61c811
252b90a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,7 @@ | |
from django.db.models.functions import Replace, JSONObject | ||
from django.utils import timezone | ||
import json | ||
import re | ||
import requests | ||
|
||
from seqr.utils.file_utils import is_google_bucket_file_path, does_file_exist | ||
|
@@ -657,7 +658,10 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False): | |
|
||
# GREGoR metadata | ||
|
||
GREGOR_DATA_TYPES = ['wgs', 'wes', 'rna'] | ||
SMID_FIELD = 'SMID' | ||
PARTICIPANT_ID_FIELD = 'CollaboratorParticipantID' | ||
COLLABORATOR_SAMPLE_ID_FIELD = 'CollaboratorSampleID' | ||
PARTICIPANT_TABLE_COLUMNS = [ | ||
'participant_id', 'internal_project_id', 'gregor_center', 'consent_code', 'recontactable', 'prior_testing', | ||
'pmid_id', 'family_id', 'paternal_id', 'maternal_id', 'twin_id', 'proband_relationship', | ||
|
@@ -684,30 +688,59 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False): | |
EXPERIMENT_TABLE_COLUMNS = [ | ||
'experiment_dna_short_read_id', 'analyte_id', 'experiment_sample_id', | ||
] + EXPERIMENT_TABLE_AIRTABLE_FIELDS | ||
EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS = [ | ||
'library_prep_type', 'single_or_paired_ends', 'within_site_batch_name', 'RIN', 'estimated_library_size', | ||
'total_reads', 'percent_rRNA', 'percent_mRNA', '5prime3prime_bias', | ||
] | ||
EXPERIMENT_RNA_TABLE_COLUMNS = ['experiment_rna_short_read_id'] + [ | ||
c for c in EXPERIMENT_TABLE_COLUMNS[1:] if not c.startswith('target')] + EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS + [ | ||
'percent_mtRNA', 'percent_Globin', 'percent_UMI', 'percent_GC', 'percent_chrX_Y', | ||
] | ||
EXPERIMENT_LOOKUP_TABLE_COLUMNS = ['experiment_id', 'table_name', 'id_in_table', 'participant_id'] | ||
READ_TABLE_AIRTABLE_FIELDS = [ | ||
'aligned_dna_short_read_file', 'aligned_dna_short_read_index_file', 'md5sum', 'reference_assembly', | ||
'alignment_software', 'mean_coverage', 'analysis_details', | ||
'mean_coverage', 'alignment_software', 'analysis_details', | ||
] | ||
READ_TABLE_COLUMNS = ['aligned_dna_short_read_id', 'experiment_dna_short_read_id'] + READ_TABLE_AIRTABLE_FIELDS + ['quality_issues'] | ||
READ_TABLE_COLUMNS.insert(6, 'reference_assembly_details') | ||
READ_TABLE_COLUMNS.insert(6, 'reference_assembly_uri') | ||
READ_RNA_TABLE_AIRTABLE_ID_FIELDS = ['aligned_rna_short_read_file', 'aligned_rna_short_read_index_file'] | ||
READ_RNA_TABLE_AIRTABLE_FIELDS = [ | ||
'gene_annotation', 'alignment_software', 'alignment_log_file', 'percent_uniquely_aligned', 'percent_multimapped', 'percent_unaligned', | ||
] | ||
READ_RNA_TABLE_COLUMNS = ['aligned_rna_short_read_id', 'experiment_rna_short_read_id'] + \ | ||
READ_RNA_TABLE_AIRTABLE_ID_FIELDS + READ_TABLE_COLUMNS[4:-3] + READ_RNA_TABLE_AIRTABLE_FIELDS + ['quality_issues'] | ||
READ_RNA_TABLE_COLUMNS.insert(READ_RNA_TABLE_COLUMNS.index('gene_annotation')+1, 'gene_annotation_details') | ||
READ_RNA_TABLE_COLUMNS.insert(READ_RNA_TABLE_COLUMNS.index('alignment_log_file')+1, 'alignment_postprocessing') | ||
READ_SET_TABLE_COLUMNS = ['aligned_dna_short_read_set_id', 'aligned_dna_short_read_id'] | ||
CALLED_TABLE_COLUMNS = [ | ||
'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum', | ||
'caller_software', 'variant_types', 'analysis_details', | ||
] | ||
ALL_AIRTABLE_COLUMNS = EXPERIMENT_TABLE_AIRTABLE_FIELDS + READ_TABLE_AIRTABLE_FIELDS + CALLED_TABLE_COLUMNS | ||
|
||
TABLE_COLUMNS = { | ||
'participant': PARTICIPANT_TABLE_COLUMNS, | ||
'family': GREGOR_FAMILY_TABLE_COLUMNS, | ||
'phenotype': PHENOTYPE_TABLE_COLUMNS, | ||
'analyte': ANALYTE_TABLE_COLUMNS, | ||
'experiment_dna_short_read': EXPERIMENT_TABLE_COLUMNS, | ||
'aligned_dna_short_read': READ_TABLE_COLUMNS, | ||
'aligned_dna_short_read_set': READ_SET_TABLE_COLUMNS, | ||
'called_variants_dna_short_read': CALLED_TABLE_COLUMNS, | ||
|
||
RNA_ONLY = EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS + READ_RNA_TABLE_AIRTABLE_FIELDS + ['reference_assembly_uri'] | ||
DATA_TYPE_OMIT = { | ||
'wgs': ['targeted_regions_method'] + RNA_ONLY, 'wes': RNA_ONLY, 'rna': [ | ||
'targeted_regions_method', 'target_insert_size', 'mean_coverage', 'aligned_dna_short_read_file', | ||
'aligned_dna_short_read_index_file', | ||
], | ||
} | ||
NO_DATA_TYPE_FIELDS = { | ||
'targeted_region_bed_file', 'reference_assembly', 'analysis_details', 'percent_rRNA', 'percent_mRNA', | ||
'alignment_software_dna', | ||
} | ||
NO_DATA_TYPE_FIELDS.update(READ_RNA_TABLE_AIRTABLE_ID_FIELDS) | ||
|
||
DATA_TYPE_AIRTABLE_COLUMNS = EXPERIMENT_TABLE_AIRTABLE_FIELDS + READ_TABLE_AIRTABLE_FIELDS + RNA_ONLY + [ | ||
COLLABORATOR_SAMPLE_ID_FIELD, SMID_FIELD] | ||
ALL_AIRTABLE_COLUMNS = DATA_TYPE_AIRTABLE_COLUMNS + CALLED_TABLE_COLUMNS | ||
AIRTABLE_QUERY_COLUMNS = set(CALLED_TABLE_COLUMNS) | ||
AIRTABLE_QUERY_COLUMNS.remove('md5sum') | ||
AIRTABLE_QUERY_COLUMNS.update(NO_DATA_TYPE_FIELDS) | ||
for data_type in GREGOR_DATA_TYPES: | ||
data_type_columns = set(DATA_TYPE_AIRTABLE_COLUMNS) - NO_DATA_TYPE_FIELDS - set(DATA_TYPE_OMIT[data_type]) | ||
AIRTABLE_QUERY_COLUMNS.update({f'{field}_{data_type}' for field in data_type_columns}) | ||
|
||
WARN_MISSING_TABLE_COLUMNS = { | ||
'participant': ['recontactable', 'reported_race', 'affected_status', 'phenotype_description', 'age_at_enrollment'], | ||
} | ||
|
@@ -791,18 +824,34 @@ def gregor_export(request): | |
consent_code=consent_code[0], | ||
projectcategory__name='GREGoR', | ||
) | ||
individuals = Individual.objects.filter( | ||
sample__in=get_search_samples(projects, active_only=False), | ||
).distinct().prefetch_related('family__project', 'mother', 'father') | ||
sample_types = Sample.objects.filter(individual__family__project__in=projects).values_list('individual_id', 'sample_type') | ||
individual_data_types = defaultdict(set) | ||
for individual_db_id, sample_type in sample_types: | ||
individual_data_types[individual_db_id].add(sample_type) | ||
individuals = Individual.objects.filter(id__in=individual_data_types).prefetch_related( | ||
Comment on lines
+828
to
+831
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Interestingly, we don't have to use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thats just how iterating a dictionary in python3 works |
||
'family__project', 'mother', 'father') | ||
|
||
grouped_data_type_individuals = defaultdict(dict) | ||
for i in individuals: | ||
grouped_data_type_individuals[i.individual_id].update({data_type: i for data_type in individual_data_types[i.id]}) | ||
|
||
airtable_sample_records, airtable_metadata_by_smid = _get_gregor_airtable_data(individuals, request.user) | ||
airtable_sample_records, airtable_metadata_by_participant = _get_gregor_airtable_data( | ||
grouped_data_type_individuals.keys(), request.user) | ||
|
||
participant_rows = [] | ||
family_map = {} | ||
phenotype_rows = [] | ||
analyte_rows = [] | ||
airtable_rows = [] | ||
for individual in individuals: | ||
airtable_rna_rows = [] | ||
experiment_lookup_rows = [] | ||
for data_type_individuals in grouped_data_type_individuals.values(): | ||
# If multiple individual records, prefer WGS | ||
individual = next( | ||
data_type_individuals[data_type.upper()] for data_type in GREGOR_DATA_TYPES | ||
if data_type_individuals.get(data_type.upper()) | ||
) | ||
|
||
# family table | ||
family = individual.family | ||
if family not in family_map: | ||
|
@@ -832,29 +881,43 @@ def gregor_export(request): | |
dict(**base_phenotype_row, **_get_phenotype_row(feature)) for feature in individual.absent_features or [] | ||
] | ||
|
||
analyte_id = None | ||
analyte_ids = set() | ||
# airtable data | ||
if airtable_sample: | ||
sm_id = airtable_sample[SMID_FIELD] | ||
analyte_id = f'Broad_{sm_id}' | ||
airtable_metadata = airtable_metadata_by_smid.get(sm_id) | ||
if airtable_metadata: | ||
experiment_ids = _get_experiment_ids(airtable_sample, airtable_metadata) | ||
airtable_rows.append(dict(analyte_id=analyte_id, **airtable_metadata, **experiment_ids)) | ||
airtable_metadata = airtable_metadata_by_participant.get(airtable_sample[PARTICIPANT_ID_FIELD]) or {} | ||
for data_type in data_type_individuals: | ||
if data_type not in airtable_metadata: | ||
continue | ||
row = _get_airtable_row(data_type, airtable_metadata) | ||
analyte_ids.add(row['analyte_id']) | ||
is_rna = data_type == 'RNA' | ||
if not is_rna: | ||
row['alignment_software'] = row['alignment_software_dna'] | ||
(airtable_rna_rows if is_rna else airtable_rows).append(row) | ||
experiment_lookup_rows.append( | ||
{'participant_id': participant_id, **_get_experiment_lookup_row(is_rna, row)} | ||
) | ||
|
||
# analyte table | ||
analyte_rows.append(dict(participant_id=participant_id, analyte_id=analyte_id, **_get_analyte_row(individual))) | ||
|
||
files, warnings = _get_validated_gregor_files([ | ||
('participant', participant_rows), | ||
('family', list(family_map.values())), | ||
('phenotype', phenotype_rows), | ||
('analyte', analyte_rows), | ||
('experiment_dna_short_read', airtable_rows), | ||
('aligned_dna_short_read', airtable_rows), | ||
('aligned_dna_short_read_set', airtable_rows), | ||
('called_variants_dna_short_read', airtable_rows), | ||
]) | ||
if not analyte_ids: | ||
analyte_ids.add(_get_analyte_id(airtable_sample)) | ||
for analyte_id in analyte_ids: | ||
analyte_rows.append(dict(participant_id=participant_id, analyte_id=analyte_id, **_get_analyte_row(individual))) | ||
|
||
files = [ | ||
('participant', PARTICIPANT_TABLE_COLUMNS, participant_rows), | ||
('family', GREGOR_FAMILY_TABLE_COLUMNS, list(family_map.values())), | ||
('phenotype', PHENOTYPE_TABLE_COLUMNS, phenotype_rows), | ||
('analyte', ANALYTE_TABLE_COLUMNS, analyte_rows), | ||
('experiment_dna_short_read', EXPERIMENT_TABLE_COLUMNS, airtable_rows), | ||
('aligned_dna_short_read', READ_TABLE_COLUMNS, airtable_rows), | ||
('aligned_dna_short_read_set', READ_SET_TABLE_COLUMNS, airtable_rows), | ||
('called_variants_dna_short_read', CALLED_TABLE_COLUMNS, airtable_rows), | ||
('experiment_rna_short_read', EXPERIMENT_RNA_TABLE_COLUMNS, airtable_rna_rows), | ||
('aligned_rna_short_read', READ_RNA_TABLE_COLUMNS, airtable_rna_rows), | ||
('experiment', EXPERIMENT_LOOKUP_TABLE_COLUMNS, experiment_lookup_rows), | ||
] | ||
warnings = _validate_gregor_files(files) | ||
write_multiple_files_to_gs(files, file_path, request.user, file_format='tsv') | ||
|
||
return create_json_response({ | ||
|
@@ -863,21 +926,25 @@ def gregor_export(request): | |
}) | ||
|
||
|
||
def _get_gregor_airtable_data(individuals, user): | ||
def _get_gregor_airtable_data(individual_ids, user): | ||
sample_records, session = _get_airtable_samples( | ||
individuals.order_by('individual_id').values_list('individual_id', flat=True), user, | ||
fields=[SMID_FIELD, 'CollaboratorSampleID', 'Recontactable'], | ||
individual_ids, user, fields=[SMID_FIELD, PARTICIPANT_ID_FIELD, 'Recontactable'], | ||
) | ||
|
||
fields = ALL_AIRTABLE_COLUMNS | ||
airtable_metadata = session.fetch_records( | ||
'GREGoR Data Model', | ||
fields=[SMID_FIELD] + sorted(fields), | ||
or_filters={f'{SMID_FIELD}': {r[SMID_FIELD] for r in sample_records.values()}}, | ||
fields=[PARTICIPANT_ID_FIELD] + sorted(AIRTABLE_QUERY_COLUMNS), | ||
or_filters={f'{PARTICIPANT_ID_FIELD}': {r[PARTICIPANT_ID_FIELD] for r in sample_records.values()}}, | ||
) | ||
airtable_metadata_by_smid = {r[SMID_FIELD]: r for r in airtable_metadata.values()} | ||
|
||
return sample_records, airtable_metadata_by_smid | ||
airtable_metadata_by_participant = {r[PARTICIPANT_ID_FIELD]: r for r in airtable_metadata.values()} | ||
for data_type in GREGOR_DATA_TYPES: | ||
for r in airtable_metadata_by_participant.values(): | ||
data_type_fields = [f for f in r if f.endswith(f'_{data_type}')] | ||
if data_type_fields: | ||
r[data_type.upper()] = {f.replace(f'_{data_type}', ''): r.pop(f) for f in data_type_fields} | ||
|
||
return sample_records, airtable_metadata_by_participant | ||
|
||
|
||
def _get_gregor_family_row(family): | ||
|
@@ -934,37 +1001,58 @@ def _get_analyte_row(individual): | |
} | ||
|
||
|
||
def _get_experiment_ids(airtable_sample, airtable_metadata): | ||
collaborator_sample_id = airtable_sample['CollaboratorSampleID'] | ||
experiment_dna_short_read_id = f'Broad_{airtable_metadata.get("experiment_type", "NA")}_{collaborator_sample_id}' | ||
def _get_airtable_row(data_type, airtable_metadata): | ||
data_type_metadata = airtable_metadata[data_type] | ||
collaborator_sample_id = data_type_metadata[COLLABORATOR_SAMPLE_ID_FIELD] | ||
experiment_short_read_id = f'Broad_{data_type_metadata.get("experiment_type", "NA")}_{collaborator_sample_id}' | ||
aligned_short_read_id = f'{experiment_short_read_id}_1' | ||
return { | ||
'experiment_dna_short_read_id': experiment_dna_short_read_id, | ||
'analyte_id': _get_analyte_id(data_type_metadata), | ||
'experiment_dna_short_read_id': experiment_short_read_id, | ||
'experiment_rna_short_read_id': experiment_short_read_id, | ||
'experiment_sample_id': collaborator_sample_id, | ||
'aligned_dna_short_read_id': f'{experiment_dna_short_read_id}_1' | ||
'aligned_dna_short_read_id': aligned_short_read_id, | ||
'aligned_rna_short_read_id': aligned_short_read_id, | ||
**airtable_metadata, | ||
**data_type_metadata, | ||
} | ||
|
||
|
||
def _get_analyte_id(airtable_metadata): | ||
sm_id = airtable_metadata.get(SMID_FIELD) | ||
return f'Broad_{sm_id}' if sm_id else None | ||
|
||
|
||
def _get_experiment_lookup_row(is_rna, row_data): | ||
table_name = f'experiment_{"rna" if is_rna else "dna"}_short_read' | ||
id_in_table = row_data[f'{table_name}_id'] | ||
return { | ||
'table_name': table_name, | ||
'id_in_table': id_in_table, | ||
'experiment_id': f'{table_name}.{id_in_table}', | ||
} | ||
|
||
|
||
def _get_validated_gregor_files(file_data): | ||
def _validate_gregor_files(file_data): | ||
errors = [] | ||
warnings = [] | ||
try: | ||
validators, required_tables = _load_data_model_validators() | ||
except Exception as e: | ||
warnings.append(f'Unable to load data model for validation: {e}') | ||
validators = {} | ||
required_tables = set() | ||
required_tables = {} | ||
|
||
missing_tables = required_tables.difference({f[0] for f in file_data}) | ||
tables = {f[0] for f in file_data} | ||
missing_tables = [ | ||
table for table, validator in required_tables.items() if not _has_required_table(table, validator, tables) | ||
] | ||
if missing_tables: | ||
warnings.append( | ||
f'The following tables are required in the data model but absent from the reports: {", ".join(missing_tables)}' | ||
) | ||
|
||
files = [] | ||
for file_name, data in file_data: | ||
columns = TABLE_COLUMNS[file_name] | ||
files.append([file_name, columns, data]) | ||
|
||
for file_name, columns, data in file_data: | ||
table_validator = validators.get(file_name) | ||
if not table_validator: | ||
warnings.append(f'No data model found for "{file_name}" table so no validation was performed') | ||
|
@@ -992,7 +1080,7 @@ def _get_validated_gregor_files(file_data): | |
if errors: | ||
raise ErrorsWarningsException(errors, warnings) | ||
|
||
return files, warnings | ||
return warnings | ||
|
||
|
||
def _load_data_model_validators(): | ||
|
@@ -1003,10 +1091,26 @@ def _load_data_model_validators(): | |
t['table']: {c['column']: c for c in t['columns']} | ||
for t in table_models | ||
} | ||
required_tables = {t['table'] for t in table_models if t.get('required')} | ||
required_tables = {t['table']: _parse_table_required(t['required']) for t in table_models if t.get('required')} | ||
return validators, required_tables | ||
|
||
|
||
def _parse_table_required(required_validator): | ||
if required_validator is True: | ||
return True | ||
|
||
match = re.match(r'CONDITIONAL \(([\w+(\s,)?]+)\)', required_validator) | ||
return match and match.group(1).split(', ') | ||
|
||
|
||
def _has_required_table(table, validator, tables): | ||
if table in tables: | ||
return True | ||
if validator is True: | ||
return False | ||
return tables.isdisjoint(validator) | ||
|
||
|
||
def _validate_column_data(column, file_name, data, column_validator, warnings, errors): | ||
enum = column_validator.get('enumerations') | ||
required = column_validator.get('required') | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is worth defining constants for these field names.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why? We don't for the other 7 tables already in this report, why would we do it for this new one