From b22f5fce369440ce1fe238f59b18f9a3f18a909d Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 14 Aug 2023 15:06:24 -0400 Subject: [PATCH 01/19] use data type specific gregor metadata --- seqr/views/apis/report_api.py | 97 ++++++++++++++++++++++++++--------- 1 file changed, 72 insertions(+), 25 deletions(-) diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py index d5ae198e92..90a047add8 100644 --- a/seqr/views/apis/report_api.py +++ b/seqr/views/apis/report_api.py @@ -655,7 +655,10 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False): # GREGoR metadata +GREGOR_DATA_TYPES = ['wes', 'wgs'] SMID_FIELD = 'SMID' +PARTICIPANT_ID_FIELD = 'CollaboratorParticipantID' +COLLABORATOR_SAMPLE_ID_FIELD = 'CollaboratorSampleID' PARTICIPANT_TABLE_COLUMNS = [ 'participant_id', 'internal_project_id', 'gregor_center', 'consent_code', 'recontactable', 'prior_testing', 'pmid_id', 'family_id', 'paternal_id', 'maternal_id', 'twin_id', 'proband_relationship', @@ -694,7 +697,22 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False): 'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum', 'caller_software', 'variant_types', 'analysis_details', ] -ALL_AIRTABLE_COLUMNS = EXPERIMENT_TABLE_AIRTABLE_FIELDS + READ_TABLE_AIRTABLE_FIELDS + CALLED_TABLE_COLUMNS + +DATA_TYPE_OMIT = {'wgs': ['targeted_regions_method'], 'wes': []} +MAPPED_AIRTABLE_FIELDS = {'alignment_software': 'alignment_software_dna'} +NO_DATA_TYPE_FIELDS = {'targeted_region_bed_file', 'reference_assembly', 'analysis_details'} + +DATA_TYPE_AIRTABLE_COLUMNS = EXPERIMENT_TABLE_AIRTABLE_FIELDS + READ_TABLE_AIRTABLE_FIELDS + [ + COLLABORATOR_SAMPLE_ID_FIELD, SMID_FIELD] +ALL_AIRTABLE_COLUMNS = DATA_TYPE_AIRTABLE_COLUMNS + CALLED_TABLE_COLUMNS +AIRTABLE_QUERY_COLUMNS = set(CALLED_TABLE_COLUMNS) +AIRTABLE_QUERY_COLUMNS.remove('md5sum') +AIRTABLE_QUERY_COLUMNS.update(NO_DATA_TYPE_FIELDS) +AIRTABLE_QUERY_COLUMNS.update(MAPPED_AIRTABLE_FIELDS.values()) +for data_type in GREGOR_DATA_TYPES: + data_type_columns = set(DATA_TYPE_AIRTABLE_COLUMNS) - NO_DATA_TYPE_FIELDS - set( + MAPPED_AIRTABLE_FIELDS.keys()) - set(DATA_TYPE_OMIT[data_type]) + AIRTABLE_QUERY_COLUMNS.update({f'{field}_{data_type}' for field in data_type_columns}) TABLE_COLUMNS = { 'participant': PARTICIPANT_TABLE_COLUMNS, @@ -789,18 +807,31 @@ def gregor_export(request): consent_code=consent_code[0], projectcategory__name='GREGoR', ) - individuals = Individual.objects.filter( - sample__in=get_search_samples(projects, active_only=False), - ).distinct().prefetch_related('family__project', 'mother', 'father') + sample_types = get_search_samples(projects, active_only=False).values_list('individual_id', 'sample_type') + individual_data_types = defaultdict(set) + for individual_db_id, sample_type in sample_types: + individual_data_types[individual_db_id].add(sample_type) + individuals = Individual.objects.filter(id__in=individual_data_types).prefetch_related( + 'family__project', 'mother', 'father') + + grouped_data_type_individuals = defaultdict(dict) + for i in individuals: + grouped_data_type_individuals[i.individual_id].update({data_type: i for data_type in individual_data_types[i.id]}) - airtable_sample_records, airtable_metadata_by_smid = _get_gregor_airtable_data(individuals, request.user) + airtable_sample_records, airtable_metadata_by_participant = _get_gregor_airtable_data( + grouped_data_type_individuals.keys(), request.user) participant_rows = [] family_map = {} phenotype_rows = [] analyte_rows = [] airtable_rows = [] - for individual in individuals: + for data_type_individuals in grouped_data_type_individuals.values(): + # If multiple individual records, prefer WGS + individual = next( + data_type_individuals[data_type] for data_type in ['WGS', 'WES'] if data_type_individuals.get(data_type) + ) + # family table family = individual.family if family not in family_map: @@ -830,18 +861,24 @@ def gregor_export(request): dict(**base_phenotype_row, **_get_phenotype_row(feature)) for feature in individual.absent_features or [] ] - analyte_id = None + analyte_ids = set() # airtable data if airtable_sample: - sm_id = airtable_sample[SMID_FIELD] - analyte_id = f'Broad_{sm_id}' - airtable_metadata = airtable_metadata_by_smid.get(sm_id) - if airtable_metadata: - experiment_ids = _get_experiment_ids(airtable_sample, airtable_metadata) - airtable_rows.append(dict(analyte_id=analyte_id, **airtable_metadata, **experiment_ids)) + airtable_metadata = airtable_metadata_by_participant.get(airtable_sample[PARTICIPANT_ID_FIELD]) or {} + for data_type in data_type_individuals: + data_type_metadata = airtable_metadata.get(data_type) + if data_type_metadata: + experiment_ids = _get_experiment_ids(data_type_metadata) + analyte_ids.add(experiment_ids['analyte_id']) + row = {**airtable_metadata, **data_type_metadata, **experiment_ids} + row.update({k: row[v] for k, v in MAPPED_AIRTABLE_FIELDS.items()}) + airtable_rows.append(row) # analyte table - analyte_rows.append(dict(participant_id=participant_id, analyte_id=analyte_id, **_get_analyte_row(individual))) + if not analyte_ids: + analyte_ids.add(_get_analyte_id(airtable_sample)) + for analyte_id in analyte_ids: + analyte_rows.append(dict(participant_id=participant_id, analyte_id=analyte_id, **_get_analyte_row(individual))) files, warnings = _get_validated_gregor_files([ ('participant', participant_rows), @@ -861,21 +898,25 @@ def gregor_export(request): }) -def _get_gregor_airtable_data(individuals, user): +def _get_gregor_airtable_data(individual_ids, user): sample_records, session = _get_airtable_samples( - individuals.order_by('individual_id').values_list('individual_id', flat=True), user, - fields=[SMID_FIELD, 'CollaboratorSampleID', 'Recontactable'], + individual_ids, user, fields=[SMID_FIELD, PARTICIPANT_ID_FIELD, 'Recontactable'], ) - fields = ALL_AIRTABLE_COLUMNS airtable_metadata = session.fetch_records( 'GREGoR Data Model', - fields=[SMID_FIELD] + sorted(fields), - or_filters={f'{SMID_FIELD}': {r[SMID_FIELD] for r in sample_records.values()}}, + fields=[PARTICIPANT_ID_FIELD] + sorted(AIRTABLE_QUERY_COLUMNS), + or_filters={f'{PARTICIPANT_ID_FIELD}': {r[PARTICIPANT_ID_FIELD] for r in sample_records.values()}}, ) - airtable_metadata_by_smid = {r[SMID_FIELD]: r for r in airtable_metadata.values()} - return sample_records, airtable_metadata_by_smid + airtable_metadata_by_participant = {r[PARTICIPANT_ID_FIELD]: r for r in airtable_metadata.values()} + for data_type in GREGOR_DATA_TYPES: + for r in airtable_metadata_by_participant.values(): + data_type_fields = [f for f in r if f.endswith(f'_{data_type}')] + if data_type_fields: + r[data_type.upper()] = {f.replace(f'_{data_type}', ''): r.pop(f) for f in data_type_fields} + + return sample_records, airtable_metadata_by_participant def _get_gregor_family_row(family): @@ -932,16 +973,22 @@ def _get_analyte_row(individual): } -def _get_experiment_ids(airtable_sample, airtable_metadata): - collaborator_sample_id = airtable_sample['CollaboratorSampleID'] - experiment_dna_short_read_id = f'Broad_{airtable_metadata.get("experiment_type", "NA")}_{collaborator_sample_id}' +def _get_experiment_ids(data_type_metadata): + collaborator_sample_id = data_type_metadata[COLLABORATOR_SAMPLE_ID_FIELD] + experiment_dna_short_read_id = f'Broad_{data_type_metadata.get("experiment_type", "NA")}_{collaborator_sample_id}' return { + 'analyte_id': _get_analyte_id(data_type_metadata), 'experiment_dna_short_read_id': experiment_dna_short_read_id, 'experiment_sample_id': collaborator_sample_id, 'aligned_dna_short_read_id': f'{experiment_dna_short_read_id}_1' } +def _get_analyte_id(airtable_metadata): + sm_id = airtable_metadata[SMID_FIELD] + return f'Broad_{sm_id}' if sm_id else None + + def _get_validated_gregor_files(file_data): errors = [] warnings = [] From ca2322db4393f0bfd9d584f0222fff3ac6cbf025 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 14 Aug 2023 16:37:28 -0400 Subject: [PATCH 02/19] update test to pass --- seqr/views/apis/report_api.py | 2 +- seqr/views/apis/report_api_tests.py | 98 ++++++++++++++++++----------- 2 files changed, 63 insertions(+), 37 deletions(-) diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py index 90a047add8..5055e12a63 100644 --- a/seqr/views/apis/report_api.py +++ b/seqr/views/apis/report_api.py @@ -985,7 +985,7 @@ def _get_experiment_ids(data_type_metadata): def _get_analyte_id(airtable_metadata): - sm_id = airtable_metadata[SMID_FIELD] + sm_id = airtable_metadata.get(SMID_FIELD) return f'Broad_{sm_id}' if sm_id else None diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index 11fb5818e0..5ddfcb4caf 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -156,6 +156,7 @@ "fields": { "SeqrCollaboratorSampleID": "VCGS_FAM203_621_D1", "CollaboratorSampleID": "NA19675_1", + 'CollaboratorParticipantID': 'NA19675', 'SMID': 'SM-AGHT', 'Recontactable': 'Yes', }, @@ -165,30 +166,36 @@ "fields": { "SeqrCollaboratorSampleID": "HG00731", "CollaboratorSampleID": "VCGS_FAM203_621_D2", + 'CollaboratorParticipantID': 'VCGS_FAM203_621', 'SMID': 'SM-JDBTM', }, } ]} +# TODO test grouped individuals multi data type +# TODO test has data type in airtable but not seqr samples +# TODO test analyte id fallback from airtable sample AIRTABLE_GREGOR_RECORDS = { "records": [ { "id": "rec2B6OGmQpAkQW3s", "fields": { - 'SMID': 'SM-JDBTM', - 'seq_library_prep_kit_method': 'Kapa HyperPrep', - 'read_length': '151', - 'experiment_type': 'exome', - 'targeted_regions_method': 'Twist', + 'CollaboratorParticipantID': 'VCGS_FAM203_621', + 'CollaboratorSampleID_wes': 'VCGS_FAM203_621_D2', + 'SMID_wes': 'SM-JDBTM', + 'seq_library_prep_kit_method_wes': 'Kapa HyperPrep', + 'read_length_wes': '151', + 'experiment_type_wes': 'exome', + 'targeted_regions_method_wes': 'Twist', 'targeted_region_bed_file': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/SR_experiment.bed', - 'date_data_generation': '2022-08-15', - 'target_insert_size': '385', - 'sequencing_platform': 'NovaSeq', - 'aligned_dna_short_read_file': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.cram', - 'aligned_dna_short_read_index_file': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.crai', - 'md5sum': '129c28163df082', + 'date_data_generation_wes': '2022-08-15', + 'target_insert_size_wes': '385', + 'sequencing_platform_wes': 'NovaSeq', + 'aligned_dna_short_read_file_wes': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.cram', + 'aligned_dna_short_read_index_file_wes': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.crai', + 'md5sum_wes': '129c28163df082', 'reference_assembly': 'GRCh38', - 'alignment_software': 'BWA-MEM-2.3', - 'mean_coverage': '42.4', + 'alignment_software_dna': 'BWA-MEM-2.3', + 'mean_coverage_wgs': '42.4', 'analysis_details': 'DOI:10.5281/zenodo.4469317', 'called_variants_dna_short_read_id': 'SX2-3', 'aligned_dna_short_read_set_id': 'BCM_H7YG5DSX2', @@ -197,12 +204,12 @@ 'variant_types': 'SNV', }, }, - { - "id": "rec2B6OGmCVzkQW3s", - "fields": { - 'SMID': 'SM-AGHT', - }, - }, + # { + # "id": "rec2B6OGmCVzkQW3s", + # "fields": { + # 'SMID': 'SM-AGHT', + # }, + # }, ]} EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW = { @@ -299,6 +306,23 @@ {'column': 'age_at_enrollment'}, ], }, + { + 'table': 'aligned_dna_short_read', + 'columns': [ + {'column': 'aligned_dna_short_read_id', 'required': True}, + {'column': 'experiment_dna_short_read_id', 'required': True}, + {'column': 'aligned_dna_short_read_file'}, + {'column': 'aligned_dna_short_read_index_file'}, + {'column': 'alignment_software'}, + {'column': 'analysis_details'}, + {'column': 'md5sum'}, + {'column': 'mean_coverage', 'required': True}, + {'column': 'reference_assembly'}, + {'column': 'reference_assembly_details'}, + {'column': 'reference_assembly_uri'}, + {'column': 'quality_issues'}, + ], + }, { 'table': 'aligned_dna_short_read_set', 'columns': [ @@ -691,11 +715,11 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat 'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881', 'The following entries are missing recommended "phenotype_description" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881', 'The following entries are missing recommended "age_at_enrollment" in the "participant" table: Broad_HG00731, Broad_NA20870, Broad_NA20872, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881', - ] + skipped_file_validation_warnings[1:6] + skipped_file_validation_warnings[7:]) + ] + skipped_file_validation_warnings[1:5] + skipped_file_validation_warnings[7:]) self.assertListEqual(response.json()['errors'], [ 'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881', 'The following entries have invalid values for "reported_race" in the "participant" table. Allowed values: Asian, White, Black. Invalid values: Broad_NA19675_1 (Middle Eastern or North African)', - 'The following entries are missing required "aligned_dna_short_read_set_id" (from Airtable) in the "aligned_dna_short_read_set" table: NA19675_1', + 'The following entries are missing required "mean_coverage" (from Airtable) in the "aligned_dna_short_read" table: VCGS_FAM203_621_D2', ]) responses.add(responses.GET, MOCK_DATA_MODEL_URL, status=404) @@ -769,7 +793,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat ['Broad_SM-AGHT', 'Broad_NA19675_1', 'DNA', '', 'UBERON:0003714', '', '', 'No', '', '', '', '', '', '', '', ''], row) - self.assertEqual(len(experiment_file), 3) + self.assertEqual(len(experiment_file), 2) self.assertEqual(experiment_file[0], [ 'experiment_dna_short_read_id', 'analyte_id', 'experiment_sample_id', 'seq_library_prep_kit_method', 'read_length', 'experiment_type', 'targeted_regions_method', 'targeted_region_bed_file', @@ -779,22 +803,21 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat 'Broad_exome_VCGS_FAM203_621_D2', 'Broad_SM-JDBTM', 'VCGS_FAM203_621_D2', 'Kapa HyperPrep', '151', 'exome', 'Twist', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/SR_experiment.bed', '2022-08-15', '385', 'NovaSeq', ], experiment_file) - self.assertIn(['Broad_NA_NA19675_1', 'Broad_SM-AGHT', 'NA19675_1', '', '', '', '', '', '', '', ''], experiment_file) - self.assertEqual(len(read_file), 3) + self.assertEqual(len(read_file), 2) self.assertEqual(read_file[0], [ 'aligned_dna_short_read_id', 'experiment_dna_short_read_id', 'aligned_dna_short_read_file', 'aligned_dna_short_read_index_file', 'md5sum', 'reference_assembly', 'reference_assembly_uri', 'reference_assembly_details', 'alignment_software', 'mean_coverage', 'analysis_details', 'quality_issues', ]) - self.assertIn([ + self.assertEqual(read_file[1], [ 'Broad_exome_VCGS_FAM203_621_D2_1', 'Broad_exome_VCGS_FAM203_621_D2', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.cram', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.crai', '129c28163df082', 'GRCh38', - '', '', 'BWA-MEM-2.3', '42.4', 'DOI:10.5281/zenodo.4469317', '', - ], read_file) + '', '', 'BWA-MEM-2.3', '', 'DOI:10.5281/zenodo.4469317', '', + ]) - self.assertEqual(len(read_set_file), 3) + self.assertEqual(len(read_set_file), 2) self.assertEqual(read_set_file[0], ['aligned_dna_short_read_set_id', 'aligned_dna_short_read_id']) self.assertIn(['BCM_H7YG5DSX2', 'Broad_exome_VCGS_FAM203_621_D2_1'], read_set_file) @@ -815,7 +838,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat "{CollaboratorSampleID}='NA20870',{CollaboratorSampleID}='NA20872',{CollaboratorSampleID}='NA20874'," \ "{CollaboratorSampleID}='NA20875',{CollaboratorSampleID}='NA20876',{CollaboratorSampleID}='NA20877'," \ "{CollaboratorSampleID}='NA20881')" - sample_fields = ['CollaboratorSampleID', 'SMID', 'CollaboratorSampleID', 'Recontactable'] + sample_fields = ['CollaboratorSampleID', 'SMID', 'CollaboratorParticipantID', 'Recontactable'] self._assert_expected_airtable_call(0, sample_filter, sample_fields) secondary_sample_filter = "OR({SeqrCollaboratorSampleID}='HG00731',{SeqrCollaboratorSampleID}='HG00732'," \ "{SeqrCollaboratorSampleID}='HG00733',{SeqrCollaboratorSampleID}='NA19678'," \ @@ -825,14 +848,17 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat sample_fields[0] = 'SeqrCollaboratorSampleID' self._assert_expected_airtable_call(1, secondary_sample_filter, sample_fields) metadata_fields = [ - 'SMID', 'aligned_dna_short_read_file', 'aligned_dna_short_read_index_file', 'aligned_dna_short_read_set_id', - 'alignment_software', 'analysis_details', 'analysis_details', 'called_variants_dna_file', - 'called_variants_dna_short_read_id', 'caller_software', 'date_data_generation', 'experiment_type', - 'md5sum', 'md5sum', 'mean_coverage', 'read_length', 'reference_assembly', 'seq_library_prep_kit_method', - 'sequencing_platform', 'target_insert_size', 'targeted_region_bed_file', 'targeted_regions_method', - 'variant_types', + 'CollaboratorParticipantID', 'CollaboratorSampleID_wes', 'CollaboratorSampleID_wgs', 'SMID_wes', 'SMID_wgs', + 'aligned_dna_short_read_file_wes', 'aligned_dna_short_read_file_wgs', 'aligned_dna_short_read_index_file_wes', + 'aligned_dna_short_read_index_file_wgs', 'aligned_dna_short_read_set_id', 'alignment_software_dna', + 'analysis_details', 'called_variants_dna_file', 'called_variants_dna_short_read_id', 'caller_software', + 'date_data_generation_wes', 'date_data_generation_wgs', 'experiment_type_wes', 'experiment_type_wgs', + 'md5sum_wes', 'md5sum_wgs', 'mean_coverage_wes', 'mean_coverage_wgs', 'read_length_wes', 'read_length_wgs', + 'reference_assembly', 'seq_library_prep_kit_method_wes', 'seq_library_prep_kit_method_wgs', + 'sequencing_platform_wes', 'sequencing_platform_wgs', 'target_insert_size_wes', 'target_insert_size_wgs', + 'targeted_region_bed_file', 'targeted_regions_method_wes', 'variant_types', ] - self._assert_expected_airtable_call(2, "OR(SMID='SM-AGHT',SMID='SM-JDBTM')", metadata_fields) + self._assert_expected_airtable_call(2, "OR(CollaboratorParticipantID='NA19675',CollaboratorParticipantID='VCGS_FAM203_621')", metadata_fields) self.assertEqual(responses.calls[3].request.url, MOCK_DATA_MODEL_URL) From 1bc7cb0b5d01a3900aada72007aa9e6457ee6ed8 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 14 Aug 2023 16:38:24 -0400 Subject: [PATCH 03/19] update test to pass --- seqr/views/apis/report_api_tests.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index 5ddfcb4caf..290d585d24 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -204,12 +204,14 @@ 'variant_types': 'SNV', }, }, - # { - # "id": "rec2B6OGmCVzkQW3s", - # "fields": { - # 'SMID': 'SM-AGHT', - # }, - # }, + { + "id": "rec2B6OGmCVzkQW3s", + "fields": { + 'CollaboratorParticipantID': 'NA19675', + 'CollaboratorSampleID_wgs': 'NA19675_1', + 'SMID_wgs': 'SM-AGHT-2', + }, + }, ]} EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW = { From 1912dc6c5002d4228d7ff83af136d3f065dec087 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 14 Aug 2023 16:39:46 -0400 Subject: [PATCH 04/19] test wrond data in airtable case --- seqr/views/apis/report_api_tests.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index 290d585d24..862e87cf84 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -172,8 +172,6 @@ } ]} # TODO test grouped individuals multi data type -# TODO test has data type in airtable but not seqr samples -# TODO test analyte id fallback from airtable sample AIRTABLE_GREGOR_RECORDS = { "records": [ { @@ -210,6 +208,7 @@ 'CollaboratorParticipantID': 'NA19675', 'CollaboratorSampleID_wgs': 'NA19675_1', 'SMID_wgs': 'SM-AGHT-2', + 'experiment_type_wgs': 'genome', }, }, ]} From c7d56a48f8783a596485b24b2e2ba1eb24a5d335 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 15 Aug 2023 11:02:46 -0400 Subject: [PATCH 05/19] update test fixtures --- seqr/fixtures/1kg_project.json | 4 ++-- seqr/views/apis/report_api_tests.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/seqr/fixtures/1kg_project.json b/seqr/fixtures/1kg_project.json index 1c522b8bf8..3e4d01bd96 100644 --- a/seqr/fixtures/1kg_project.json +++ b/seqr/fixtures/1kg_project.json @@ -695,7 +695,7 @@ "created_by": null, "last_modified_date": "2017-03-13T09:07:50.158Z", "family": 8, - "individual_id": "NA20877", + "individual_id": "NA20888", "mother_id": null, "father_id": null, "sex": "M", @@ -1144,7 +1144,7 @@ "last_modified_date": "2017-03-13T09:07:50.277Z", "sample_id": "NA20888", - "sample_type": "WES", + "sample_type": "WGS", "is_active": false, "individual": 16, "dataset_type": "VARIANTS", diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index 862e87cf84..2b3f44e13d 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -712,13 +712,13 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat 'The following tables are required in the data model but absent from the reports: subject', 'The following columns are included in the "participant" table but are missing from the data model: age_at_last_observation, ancestry_detail, pmid_id, proband_relationship_detail, sex_detail, twin_id', 'The following columns are included in the "participant" data model but are missing in the report: ancestry_metadata', - 'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881', - 'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881', - 'The following entries are missing recommended "phenotype_description" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881', - 'The following entries are missing recommended "age_at_enrollment" in the "participant" table: Broad_HG00731, Broad_NA20870, Broad_NA20872, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881', + 'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', + 'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', + 'The following entries are missing recommended "phenotype_description" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', + 'The following entries are missing recommended "age_at_enrollment" in the "participant" table: Broad_HG00731, Broad_NA20870, Broad_NA20872, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', ] + skipped_file_validation_warnings[1:5] + skipped_file_validation_warnings[7:]) self.assertListEqual(response.json()['errors'], [ - 'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881', + 'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', 'The following entries have invalid values for "reported_race" in the "participant" table. Allowed values: Asian, White, Black. Invalid values: Broad_NA19675_1 (Middle Eastern or North African)', 'The following entries are missing required "mean_coverage" (from Airtable) in the "aligned_dna_short_read" table: VCGS_FAM203_621_D2', ]) @@ -837,15 +837,15 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat sample_filter = "OR({CollaboratorSampleID}='HG00731',{CollaboratorSampleID}='HG00732',{CollaboratorSampleID}='HG00733'," \ "{CollaboratorSampleID}='NA19675_1',{CollaboratorSampleID}='NA19678',{CollaboratorSampleID}='NA19679'," \ "{CollaboratorSampleID}='NA20870',{CollaboratorSampleID}='NA20872',{CollaboratorSampleID}='NA20874'," \ - "{CollaboratorSampleID}='NA20875',{CollaboratorSampleID}='NA20876',{CollaboratorSampleID}='NA20877'," \ - "{CollaboratorSampleID}='NA20881')" + "{CollaboratorSampleID}='NA20875',{CollaboratorSampleID}='NA20876',{CollaboratorSampleID}='NA20881'," \ + "{CollaboratorSampleID}='NA20888')" sample_fields = ['CollaboratorSampleID', 'SMID', 'CollaboratorParticipantID', 'Recontactable'] self._assert_expected_airtable_call(0, sample_filter, sample_fields) secondary_sample_filter = "OR({SeqrCollaboratorSampleID}='HG00731',{SeqrCollaboratorSampleID}='HG00732'," \ "{SeqrCollaboratorSampleID}='HG00733',{SeqrCollaboratorSampleID}='NA19678'," \ "{SeqrCollaboratorSampleID}='NA19679',{SeqrCollaboratorSampleID}='NA20870',{SeqrCollaboratorSampleID}='NA20872'," \ "{SeqrCollaboratorSampleID}='NA20874',{SeqrCollaboratorSampleID}='NA20875',{SeqrCollaboratorSampleID}='NA20876'," \ - "{SeqrCollaboratorSampleID}='NA20877',{SeqrCollaboratorSampleID}='NA20881')" + "{SeqrCollaboratorSampleID}='NA20881',{SeqrCollaboratorSampleID}='NA20888')" sample_fields[0] = 'SeqrCollaboratorSampleID' self._assert_expected_airtable_call(1, secondary_sample_filter, sample_fields) metadata_fields = [ From 6d55da7f20ae61878210ee14faecf743cb9c70fa Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 15 Aug 2023 11:44:22 -0400 Subject: [PATCH 06/19] add mock airtable data --- seqr/views/apis/report_api_tests.py | 90 ++++++++++++++++++++++++----- 1 file changed, 77 insertions(+), 13 deletions(-) diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index 2b3f44e13d..95d3702aeb 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -169,7 +169,17 @@ 'CollaboratorParticipantID': 'VCGS_FAM203_621', 'SMID': 'SM-JDBTM', }, - } + }, + { + "id": "rec2Nkg1fKssJc7", + "fields": { + 'SeqrCollaboratorSampleID': 'NA20888', + 'CollaboratorSampleID': 'NA20888', + 'CollaboratorParticipantID': 'NA20888', + 'SMID': 'SM-L5QMP', + 'Recontactable': 'No', + }, + }, ]} # TODO test grouped individuals multi data type AIRTABLE_GREGOR_RECORDS = { @@ -211,6 +221,46 @@ 'experiment_type_wgs': 'genome', }, }, + { + "id": "rec2BFCGmQpAkQ7x", + "fields": { + 'CollaboratorParticipantID': 'NA20888', + 'CollaboratorSampleID_wes': 'NA20888', + 'CollaboratorSampleID_wgs': 'NA20888_1', + 'SMID_wes': 'SM-L5QMP', + 'SMID_wgs': 'SM-L5QMWP', + 'seq_library_prep_kit_method_wes': 'Kapa HyperPrep', + 'seq_library_prep_kit_method_wgs': 'Kapa HyperPrep w/o amplification', + 'read_length_wes': '151', + 'read_length_wgs': '200', + 'experiment_type_wes': 'exome', + 'experiment_type_wgs': 'genome', + 'targeted_regions_method_wes': 'Twist', + 'targeted_region_bed_file': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/SR_experiment.bed', + 'date_data_generation_wes': '2022-06-05', + 'date_data_generation_wgs': '2023-03-13', + 'target_insert_size_wes': '380', + 'target_insert_size_wgs': '450', + 'sequencing_platform_wes': 'NovaSeq', + 'sequencing_platform_wgs': 'NovaSeq2', + 'aligned_dna_short_read_file_wes': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.cram', + 'aligned_dna_short_read_index_file_wes': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.crai', + 'aligned_dna_short_read_file_wgs': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888_1.cram', + 'aligned_dna_short_read_index_file_wgs': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888_1.crai', + 'md5sum_wes': 'a6f6308866765ce8', + 'md5sum_wgs': '2aa33e8c32020b1c', + 'reference_assembly': 'GRCh38', + 'alignment_software_dna': 'BWA 0.7.15.r1140', + 'mean_coverage_wes': '42.8', + 'mean_coverage_wgs': '36.1', + 'analysis_details': '', + 'called_variants_dna_short_read_id': 'NA', + 'aligned_dna_short_read_set_id': 'Broad_NA20888_D1', + 'called_variants_dna_file': 'NA', + 'caller_software': 'NA', + 'variant_types': 'SNV', + }, + }, ]} EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW = { @@ -642,7 +692,9 @@ def test_sample_metadata_export(self, mock_google_authenticated): 'NA19679', 'NA20870', 'HG00732', 'NA20876', 'NA20874', 'NA20875', 'NA19678', 'NA19675', 'HG00731', 'NA20872', 'NA20881', 'HG00733', }) - expected_samples.update(self.ADDITIONAL_SAMPLES) + if self.ADDITIONAL_SAMPLES: + expected_samples.remove('NA20888') + expected_samples.update(self.ADDITIONAL_SAMPLES) self.assertSetEqual({r['sample_id'] for r in response_json['rows']}, expected_samples) test_row = next(r for r in response_json['rows'] if r['sample_id'] == 'NA20889') self.assertDictEqual(EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW, test_row) @@ -712,7 +764,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat 'The following tables are required in the data model but absent from the reports: subject', 'The following columns are included in the "participant" table but are missing from the data model: age_at_last_observation, ancestry_detail, pmid_id, proband_relationship_detail, sex_detail, twin_id', 'The following columns are included in the "participant" data model but are missing in the report: ancestry_metadata', - 'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', + 'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881', 'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', 'The following entries are missing recommended "phenotype_description" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', 'The following entries are missing recommended "age_at_enrollment" in the "participant" table: Broad_HG00731, Broad_NA20870, Broad_NA20872, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', @@ -794,7 +846,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat ['Broad_SM-AGHT', 'Broad_NA19675_1', 'DNA', '', 'UBERON:0003714', '', '', 'No', '', '', '', '', '', '', '', ''], row) - self.assertEqual(len(experiment_file), 2) + self.assertEqual(len(experiment_file), 3) self.assertEqual(experiment_file[0], [ 'experiment_dna_short_read_id', 'analyte_id', 'experiment_sample_id', 'seq_library_prep_kit_method', 'read_length', 'experiment_type', 'targeted_regions_method', 'targeted_region_bed_file', @@ -804,33 +856,45 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat 'Broad_exome_VCGS_FAM203_621_D2', 'Broad_SM-JDBTM', 'VCGS_FAM203_621_D2', 'Kapa HyperPrep', '151', 'exome', 'Twist', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/SR_experiment.bed', '2022-08-15', '385', 'NovaSeq', ], experiment_file) + self.assertIn([ + 'Broad_exome_NA20888', 'Broad_SM-L5QMP', 'NA20888', 'Kapa HyperPrep', '151', 'exome', + 'Twist', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/SR_experiment.bed', '2022-06-05', '380', 'NovaSeq', + ], experiment_file) - self.assertEqual(len(read_file), 2) + self.assertEqual(len(read_file), 3) self.assertEqual(read_file[0], [ 'aligned_dna_short_read_id', 'experiment_dna_short_read_id', 'aligned_dna_short_read_file', 'aligned_dna_short_read_index_file', 'md5sum', 'reference_assembly', 'reference_assembly_uri', 'reference_assembly_details', 'alignment_software', 'mean_coverage', 'analysis_details', 'quality_issues', ]) - self.assertEqual(read_file[1], [ + self.assertIn([ 'Broad_exome_VCGS_FAM203_621_D2_1', 'Broad_exome_VCGS_FAM203_621_D2', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.cram', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.crai', '129c28163df082', 'GRCh38', '', '', 'BWA-MEM-2.3', '', 'DOI:10.5281/zenodo.4469317', '', - ]) + ], read_file) + self.assertIn([ + 'Broad_exome_NA20888_1', 'Broad_exome_NA20888', + 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.cram', + 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.crai', 'a6f6308866765ce8', 'GRCh38', + '', '', 'BWA 0.7.15.r1140', '42.8', '', '', + ], read_file) - self.assertEqual(len(read_set_file), 2) + self.assertEqual(len(read_set_file), 3) self.assertEqual(read_set_file[0], ['aligned_dna_short_read_set_id', 'aligned_dna_short_read_id']) self.assertIn(['BCM_H7YG5DSX2', 'Broad_exome_VCGS_FAM203_621_D2_1'], read_set_file) + self.assertIn(['Broad_NA20888_D1', 'Broad_exome_NA20888_1'], read_set_file) - self.assertEqual(len(called_file), 2) + self.assertEqual(len(called_file), 3) self.assertEqual(called_file[0], [ 'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum', 'caller_software', 'variant_types', 'analysis_details', ]) - self.assertEqual(called_file[1], [ + self.assertIn([ 'SX2-3', 'BCM_H7YG5DSX2', 'gs://fc-fed09429-e563-44a7-aaeb-776c8336ba02/COL_FAM1_1_D1.SV.vcf', '129c28163df082', 'gatk4.1.2', 'SNV', 'DOI:10.5281/zenodo.4469317', - ]) + ], called_file) + self.assertIn(['NA', 'Broad_NA20888_D1', 'NA', 'a6f6308866765ce8', 'NA', 'SNV', ''], called_file) # test airtable calls self.assertEqual(len(responses.calls), 4) @@ -845,7 +909,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat "{SeqrCollaboratorSampleID}='HG00733',{SeqrCollaboratorSampleID}='NA19678'," \ "{SeqrCollaboratorSampleID}='NA19679',{SeqrCollaboratorSampleID}='NA20870',{SeqrCollaboratorSampleID}='NA20872'," \ "{SeqrCollaboratorSampleID}='NA20874',{SeqrCollaboratorSampleID}='NA20875',{SeqrCollaboratorSampleID}='NA20876'," \ - "{SeqrCollaboratorSampleID}='NA20881',{SeqrCollaboratorSampleID}='NA20888')" + "{SeqrCollaboratorSampleID}='NA20881')" sample_fields[0] = 'SeqrCollaboratorSampleID' self._assert_expected_airtable_call(1, secondary_sample_filter, sample_fields) metadata_fields = [ @@ -859,7 +923,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat 'sequencing_platform_wes', 'sequencing_platform_wgs', 'target_insert_size_wes', 'target_insert_size_wgs', 'targeted_region_bed_file', 'targeted_regions_method_wes', 'variant_types', ] - self._assert_expected_airtable_call(2, "OR(CollaboratorParticipantID='NA19675',CollaboratorParticipantID='VCGS_FAM203_621')", metadata_fields) + self._assert_expected_airtable_call(2, "OR(CollaboratorParticipantID='NA19675',CollaboratorParticipantID='NA20888',CollaboratorParticipantID='VCGS_FAM203_621')", metadata_fields) self.assertEqual(responses.calls[3].request.url, MOCK_DATA_MODEL_URL) From 8ab2004b260d8f4c2c4febc2926c06692c79c180 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 15 Aug 2023 11:51:54 -0400 Subject: [PATCH 07/19] abstract gregor checks --- seqr/views/apis/report_api_tests.py | 46 ++++++++++++++++------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index 95d3702aeb..33faafe7dc 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -181,7 +181,7 @@ }, }, ]} -# TODO test grouped individuals multi data type + AIRTABLE_GREGOR_RECORDS = { "records": [ { @@ -262,6 +262,10 @@ }, }, ]} +EXPECTED_GREGOR_FILES = [ + 'participant', 'family', 'phenotype', 'analyte', 'experiment_dna_short_read', + 'aligned_dna_short_read', 'aligned_dna_short_read_set', 'called_variants_dna_short_read', +] EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW = { "project_guid": "R0003_test", @@ -693,7 +697,7 @@ def test_sample_metadata_export(self, mock_google_authenticated): 'NA20872', 'NA20881', 'HG00733', }) if self.ADDITIONAL_SAMPLES: - expected_samples.remove('NA20888') + expected_samples.remove('NA20888') # TODO investigate expected_samples.update(self.ADDITIONAL_SAMPLES) self.assertSetEqual({r['sample_id'] for r in response_json['rows']}, expected_samples) test_row = next(r for r in response_json['rows'] if r['sample_id'] == 'NA20889') @@ -753,12 +757,8 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat mock_google_authenticated.return_value = True response = self.client.post(url, content_type='application/json', data=json.dumps(body)) self.assertEqual(response.status_code, 400) - expected_files = [ - 'participant', 'family', 'phenotype', 'analyte', 'experiment_dna_short_read', - 'aligned_dna_short_read', 'aligned_dna_short_read_set', 'called_variants_dna_short_read', - ] skipped_file_validation_warnings = [ - f'No data model found for "{file}" table so no validation was performed' for file in expected_files + f'No data model found for "{file}" table so no validation was performed' for file in EXPECTED_GREGOR_FILES ] self.assertListEqual(response.json()['warnings'], [ 'The following tables are required in the data model but absent from the reports: subject', @@ -781,15 +781,29 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat mock_google_authenticated.return_value = True response = self.client.post(url, content_type='application/json', data=json.dumps(body)) self.assertEqual(response.status_code, 200) - self.assertDictEqual(response.json(), { + expected_response = { 'info': ['Successfully validated and uploaded Gregor Report for 9 families'], 'warnings': [ 'Unable to load data model for validation: 404 Client Error: Not Found for url: http://raw.githubusercontent.com/gregor_data_model.json', ] + skipped_file_validation_warnings, - }) + } + self.assertDictEqual(response.json(), expected_response) + self._assert_expected_gregor_files(mock_open) + self._test_expected_gregor_airtable_calls() + + # test gsutil commands + mock_subprocess.assert_has_calls([ + mock.call('gsutil ls gs://anvil-upload', stdout=-1, stderr=-2, shell=True), + mock.call().wait(), + mock.call('gsutil mv /mock/tmp/* gs://anvil-upload', stdout=-1, stderr=-2, shell=True), + mock.call().wait(), + ]) + self.check_no_analyst_no_access(url) + + def _assert_expected_gregor_files(self, mock_open): self.assertListEqual( - mock_open.call_args_list, [mock.call(f'/mock/tmp/{file}.tsv', 'w') for file in expected_files]) + mock_open.call_args_list, [mock.call(f'/mock/tmp/{file}.tsv', 'w') for file in EXPECTED_GREGOR_FILES]) files = [ [row.split('\t') for row in write_call.args[0].split('\n')] for write_call in mock_open.return_value.__enter__.return_value.write.call_args_list @@ -896,7 +910,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat ], called_file) self.assertIn(['NA', 'Broad_NA20888_D1', 'NA', 'a6f6308866765ce8', 'NA', 'SNV', ''], called_file) - # test airtable calls + def _test_expected_gregor_airtable_calls(self): self.assertEqual(len(responses.calls), 4) sample_filter = "OR({CollaboratorSampleID}='HG00731',{CollaboratorSampleID}='HG00732',{CollaboratorSampleID}='HG00733'," \ "{CollaboratorSampleID}='NA19675_1',{CollaboratorSampleID}='NA19678',{CollaboratorSampleID}='NA19679'," \ @@ -927,16 +941,6 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat self.assertEqual(responses.calls[3].request.url, MOCK_DATA_MODEL_URL) - # test gsutil commands - mock_subprocess.assert_has_calls([ - mock.call('gsutil ls gs://anvil-upload', stdout=-1, stderr=-2, shell=True), - mock.call().wait(), - mock.call('gsutil mv /mock/tmp/* gs://anvil-upload', stdout=-1, stderr=-2, shell=True), - mock.call().wait(), - ]) - - self.check_no_analyst_no_access(url) - class LocalReportAPITest(AuthenticationTestCase, ReportAPITest): fixtures = ['users', '1kg_project', 'reference_data', 'report_variants'] From a31e7ed9636da57285e448b1d241f94afd8a47ef Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 15 Aug 2023 12:42:04 -0400 Subject: [PATCH 08/19] test multiple data types --- seqr/fixtures/1kg_project.json | 2 +- seqr/views/apis/report_api_tests.py | 98 +++++++++++++++++++++-------- 2 files changed, 73 insertions(+), 27 deletions(-) diff --git a/seqr/fixtures/1kg_project.json b/seqr/fixtures/1kg_project.json index 3e4d01bd96..8dabf313b5 100644 --- a/seqr/fixtures/1kg_project.json +++ b/seqr/fixtures/1kg_project.json @@ -698,7 +698,7 @@ "individual_id": "NA20888", "mother_id": null, "father_id": null, - "sex": "M", + "sex": "F", "affected": "A", "display_name": "", "notes": "", diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index 33faafe7dc..aea400a8b4 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -799,9 +799,23 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat mock.call().wait(), ]) + # Test multiple project with shared sample IDs + project = Project.objects.get(id=3) + project.consent_code = 'H' + project.save() + + responses.calls.reset() + mock_open.reset_mock() + response = self.client.post(url, content_type='application/json', data=json.dumps(body)) + self.assertEqual(response.status_code, 200) + expected_response['info'][0] = expected_response['info'][0].replace('9', '10') + self.assertDictEqual(response.json(), expected_response) + self._assert_expected_gregor_files(mock_open, has_second_project=True) + self._test_expected_gregor_airtable_calls(additional_samples=['NA20885', 'NA20889']) + self.check_no_analyst_no_access(url) - def _assert_expected_gregor_files(self, mock_open): + def _assert_expected_gregor_files(self, mock_open, has_second_project=False): self.assertListEqual( mock_open.call_args_list, [mock.call(f'/mock/tmp/{file}.tsv', 'w') for file in EXPECTED_GREGOR_FILES]) files = [ @@ -810,7 +824,7 @@ def _assert_expected_gregor_files(self, mock_open): ] participant_file, family_file, phenotype_file, analyte_file, experiment_file, read_file, read_set_file, called_file = files - self.assertEqual(len(participant_file), 14) + self.assertEqual(len(participant_file), 16 if has_second_project else 14) self.assertEqual(participant_file[0], [ 'participant_id', 'internal_project_id', 'gregor_center', 'consent_code', 'recontactable', 'prior_testing', 'pmid_id', 'family_id', 'paternal_id', 'maternal_id', 'twin_id', 'proband_relationship', @@ -828,15 +842,29 @@ def _assert_expected_gregor_files(self, mock_open): 'Broad_HG00731', 'Broad_1kg project nme with unide', 'BROAD', 'HMB', '', '', '', 'Broad_2', 'Broad_HG00732', 'Broad_HG00733', '', '', '', 'Female', '', '', 'Hispanic or Latino', 'Other', '', 'Affected', '', '', ], hispanic_row) + multi_data_type_row = next(r for r in participant_file if r[0] == 'Broad_NA20888') + self.assertListEqual([ + 'Broad_NA20888', 'Broad_Test Reprocessed Project' if has_second_project else 'Broad_1kg project nme with unide', + 'BROAD', 'HMB', 'No', '', '', 'Broad_12' if has_second_project else 'Broad_8', '0', '0', '', '', '', + 'Male' if has_second_project else 'Female', '', '', '', '', '', 'Affected', '', '', + ], multi_data_type_row) - self.assertEqual(len(family_file), 10) + self.assertEqual(len(family_file), 11 if has_second_project else 10) self.assertEqual(family_file[0], [ 'family_id', 'consanguinity', 'consanguinity_detail', 'pedigree_file', 'pedigree_file_detail', 'family_history_detail', ]) self.assertIn(['Broad_1', 'Present', '', '', '', ''], family_file) - - self.assertEqual(len(phenotype_file), 10) + fam_8_row = ['Broad_8', 'Unknown', '', '', '', ''] + fam_11_row = ['Broad_11', 'None suspected', '', '', '', ''] + if has_second_project: + self.assertIn(fam_11_row, family_file) + self.assertNotIn(fam_8_row, family_file) + else: + self.assertIn(fam_8_row, family_file) + self.assertNotIn(fam_11_row, family_file) + + self.assertEqual(len(phenotype_file), 14 if has_second_project else 10) self.assertEqual(phenotype_file[0], [ 'phenotype_id', 'participant_id', 'term_id', 'presence', 'ontology', 'additional_details', 'onset_age_range', 'additional_modifiers', @@ -848,7 +876,7 @@ def _assert_expected_gregor_files(self, mock_open): '', 'Broad_NA19675_1', 'HP:0001674', 'Absent', 'HPO', 'originally indicated', '', '', ], phenotype_file) - self.assertEqual(len(analyte_file), 14) + self.assertEqual(len(analyte_file), 17 if has_second_project else 14) self.assertEqual(analyte_file[0], [ 'analyte_id', 'participant_id', 'analyte_type', 'analyte_processing_details', 'primary_biosample', 'primary_biosample_id', 'primary_biosample_details', 'tissue_affected_status', 'age_at_collection', @@ -859,8 +887,15 @@ def _assert_expected_gregor_files(self, mock_open): self.assertListEqual( ['Broad_SM-AGHT', 'Broad_NA19675_1', 'DNA', '', 'UBERON:0003714', '', '', 'No', '', '', '', '', '', '', '', ''], row) + self.assertIn( + ['Broad_SM-L5QMP', 'Broad_NA20888', '', '', '', '', '', 'No', '', '', '', '', '', '', '', ''], analyte_file) + self.assertEqual( + ['Broad_SM-L5QMWP', 'Broad_NA20888', '', '', '', '', '', 'No', '', '', '', '', '', '', '', ''] in analyte_file, + has_second_project + ) - self.assertEqual(len(experiment_file), 3) + num_airtable_rows = 4 if has_second_project else 3 + self.assertEqual(len(experiment_file), num_airtable_rows) self.assertEqual(experiment_file[0], [ 'experiment_dna_short_read_id', 'analyte_id', 'experiment_sample_id', 'seq_library_prep_kit_method', 'read_length', 'experiment_type', 'targeted_regions_method', 'targeted_region_bed_file', @@ -874,8 +909,12 @@ def _assert_expected_gregor_files(self, mock_open): 'Broad_exome_NA20888', 'Broad_SM-L5QMP', 'NA20888', 'Kapa HyperPrep', '151', 'exome', 'Twist', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/SR_experiment.bed', '2022-06-05', '380', 'NovaSeq', ], experiment_file) + self.assertEqual([ + 'Broad_genome_NA20888_1', 'Broad_SM-L5QMWP', 'NA20888_1', 'Kapa HyperPrep w/o amplification', '200', 'genome', + '', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/SR_experiment.bed', '2023-03-13', '450', 'NovaSeq2', + ] in experiment_file, has_second_project) - self.assertEqual(len(read_file), 3) + self.assertEqual(len(read_file), num_airtable_rows) self.assertEqual(read_file[0], [ 'aligned_dna_short_read_id', 'experiment_dna_short_read_id', 'aligned_dna_short_read_file', 'aligned_dna_short_read_index_file', 'md5sum', 'reference_assembly', 'reference_assembly_uri', 'reference_assembly_details', @@ -890,16 +929,23 @@ def _assert_expected_gregor_files(self, mock_open): self.assertIn([ 'Broad_exome_NA20888_1', 'Broad_exome_NA20888', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.cram', - 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.crai', 'a6f6308866765ce8', 'GRCh38', - '', '', 'BWA 0.7.15.r1140', '42.8', '', '', + 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.crai', 'a6f6308866765ce8', 'GRCh38', '', '', + 'BWA 0.7.15.r1140', '42.8', '', '', ], read_file) - - self.assertEqual(len(read_set_file), 3) + self.assertEqual([ + 'Broad_genome_NA20888_1_1', 'Broad_genome_NA20888_1', + 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888_1.cram', + 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888_1.crai', '2aa33e8c32020b1c', 'GRCh38', '', '', + 'BWA 0.7.15.r1140', '36.1', '', '', + ] in read_file, has_second_project) + + self.assertEqual(len(read_set_file), num_airtable_rows) self.assertEqual(read_set_file[0], ['aligned_dna_short_read_set_id', 'aligned_dna_short_read_id']) self.assertIn(['BCM_H7YG5DSX2', 'Broad_exome_VCGS_FAM203_621_D2_1'], read_set_file) self.assertIn(['Broad_NA20888_D1', 'Broad_exome_NA20888_1'], read_set_file) + self.assertEqual(['Broad_NA20888_D1', 'Broad_genome_NA20888_1_1'] in read_set_file, has_second_project) - self.assertEqual(len(called_file), 3) + self.assertEqual(len(called_file), num_airtable_rows) self.assertEqual(called_file[0], [ 'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum', 'caller_software', 'variant_types', 'analysis_details', @@ -909,23 +955,23 @@ def _assert_expected_gregor_files(self, mock_open): '129c28163df082', 'gatk4.1.2', 'SNV', 'DOI:10.5281/zenodo.4469317', ], called_file) self.assertIn(['NA', 'Broad_NA20888_D1', 'NA', 'a6f6308866765ce8', 'NA', 'SNV', ''], called_file) + self.assertEqual( + ['NA', 'Broad_NA20888_D1', 'NA', '2aa33e8c32020b1c', 'NA', 'SNV', ''] in called_file, has_second_project) - def _test_expected_gregor_airtable_calls(self): + def _test_expected_gregor_airtable_calls(self, additional_samples=None): self.assertEqual(len(responses.calls), 4) - sample_filter = "OR({CollaboratorSampleID}='HG00731',{CollaboratorSampleID}='HG00732',{CollaboratorSampleID}='HG00733'," \ - "{CollaboratorSampleID}='NA19675_1',{CollaboratorSampleID}='NA19678',{CollaboratorSampleID}='NA19679'," \ - "{CollaboratorSampleID}='NA20870',{CollaboratorSampleID}='NA20872',{CollaboratorSampleID}='NA20874'," \ - "{CollaboratorSampleID}='NA20875',{CollaboratorSampleID}='NA20876',{CollaboratorSampleID}='NA20881'," \ - "{CollaboratorSampleID}='NA20888')" + sample_ids = { + 'HG00731', 'HG00732', 'HG00733', 'NA19675_1', 'NA19678', 'NA19679', 'NA20870', 'NA20872', 'NA20874', + 'NA20875', 'NA20876', 'NA20881', 'NA20888', + } + sample_ids.update(additional_samples or []) + sample_filter = ','.join([f"{{CollaboratorSampleID}}='{sample_id}'" for sample_id in sorted(sample_ids)]) sample_fields = ['CollaboratorSampleID', 'SMID', 'CollaboratorParticipantID', 'Recontactable'] - self._assert_expected_airtable_call(0, sample_filter, sample_fields) - secondary_sample_filter = "OR({SeqrCollaboratorSampleID}='HG00731',{SeqrCollaboratorSampleID}='HG00732'," \ - "{SeqrCollaboratorSampleID}='HG00733',{SeqrCollaboratorSampleID}='NA19678'," \ - "{SeqrCollaboratorSampleID}='NA19679',{SeqrCollaboratorSampleID}='NA20870',{SeqrCollaboratorSampleID}='NA20872'," \ - "{SeqrCollaboratorSampleID}='NA20874',{SeqrCollaboratorSampleID}='NA20875',{SeqrCollaboratorSampleID}='NA20876'," \ - "{SeqrCollaboratorSampleID}='NA20881')" + self._assert_expected_airtable_call(0, f"OR({sample_filter})", sample_fields) + sample_ids -= {'NA19675_1', 'NA20888'} + secondary_sample_filter = ','.join([f"{{SeqrCollaboratorSampleID}}='{sample_id}'" for sample_id in sorted(sample_ids)]) sample_fields[0] = 'SeqrCollaboratorSampleID' - self._assert_expected_airtable_call(1, secondary_sample_filter, sample_fields) + self._assert_expected_airtable_call(1, f"OR({secondary_sample_filter})", sample_fields) metadata_fields = [ 'CollaboratorParticipantID', 'CollaboratorSampleID_wes', 'CollaboratorSampleID_wgs', 'SMID_wes', 'SMID_wgs', 'aligned_dna_short_read_file_wes', 'aligned_dna_short_read_file_wgs', 'aligned_dna_short_read_index_file_wes', From b01440a73410039a170658c64d54f13a79e73b31 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 15 Aug 2023 12:48:23 -0400 Subject: [PATCH 09/19] clean up --- seqr/views/apis/report_api_tests.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index aea400a8b4..15abc03bc3 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -696,9 +696,7 @@ def test_sample_metadata_export(self, mock_google_authenticated): 'NA19679', 'NA20870', 'HG00732', 'NA20876', 'NA20874', 'NA20875', 'NA19678', 'NA19675', 'HG00731', 'NA20872', 'NA20881', 'HG00733', }) - if self.ADDITIONAL_SAMPLES: - expected_samples.remove('NA20888') # TODO investigate - expected_samples.update(self.ADDITIONAL_SAMPLES) + expected_samples.update(self.ADDITIONAL_SAMPLES) self.assertSetEqual({r['sample_id'] for r in response_json['rows']}, expected_samples) test_row = next(r for r in response_json['rows'] if r['sample_id'] == 'NA20889') self.assertDictEqual(EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW, test_row) From f18f909b6787e8a98fe7091a15875db7d156867b Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 15 Aug 2023 15:27:35 -0400 Subject: [PATCH 10/19] add conditional validation --- seqr/views/apis/report_api.py | 26 +++++++++++++++++++++++--- seqr/views/apis/report_api_tests.py | 8 +++++++- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py index d0d4f812d6..7105f62bcb 100644 --- a/seqr/views/apis/report_api.py +++ b/seqr/views/apis/report_api.py @@ -8,6 +8,7 @@ from django.db.models.functions import Replace, JSONObject from django.utils import timezone import json +import re import requests from seqr.utils.file_utils import is_google_bucket_file_path, does_file_exist @@ -999,9 +1000,12 @@ def _get_validated_gregor_files(file_data): except Exception as e: warnings.append(f'Unable to load data model for validation: {e}') validators = {} - required_tables = set() + required_tables = {} - missing_tables = required_tables.difference({f[0] for f in file_data}) + tables = {f[0] for f in file_data} + missing_tables = [ + table for table, validator in required_tables.items() if not _has_required_table(table, validator, tables) + ] if missing_tables: warnings.append( f'The following tables are required in the data model but absent from the reports: {", ".join(missing_tables)}' @@ -1050,10 +1054,26 @@ def _load_data_model_validators(): t['table']: {c['column']: c for c in t['columns']} for t in table_models } - required_tables = {t['table'] for t in table_models if t.get('required')} + required_tables = {t['table']: _parse_table_required(t['required']) for t in table_models if t.get('required')} return validators, required_tables +def _parse_table_required(required_validator): + if required_validator is True: + return True + + match = re.match(r'CONDITIONAL \(([\w+(\s,)?]+)\)', required_validator) + return match and match.group(1).split(', ') + + +def _has_required_table(table, validator, tables): + if table in tables: + return True + if validator is True: + return False + return tables.isdisjoint(validator) + + def _validate_column_data(column, file_name, data, column_validator, warnings, errors): enum = column_validator.get('enumerations') required = column_validator.get('required') diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index b071cdaddb..78f7fafbb3 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -363,6 +363,7 @@ }, { 'table': 'aligned_dna_short_read', + 'required': 'CONDITIONAL (aligned_dna_short_read_set, called_variants_dna_short_read)', 'columns': [ {'column': 'aligned_dna_short_read_id', 'required': True}, {'column': 'experiment_dna_short_read_id', 'required': True}, @@ -389,6 +390,11 @@ 'table': 'dna_read_data', 'columns': [{'column': 'analyte_id', 'required': True}], }, + { + 'table': 'dna_read_data_set', + 'required': 'CONDITIONAL (aligned_dna_short_read_set, dna_read_data)', + 'columns': [{'column': 'analyte_id', 'required': True}], + }, ] } @@ -761,7 +767,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat f'No data model found for "{file}" table so no validation was performed' for file in EXPECTED_GREGOR_FILES ] self.assertListEqual(response.json()['warnings'], [ - 'The following tables are required in the data model but absent from the reports: subject', + 'The following tables are required in the data model but absent from the reports: subject, dna_read_data_set', 'The following columns are included in the "participant" table but are missing from the data model: age_at_last_observation, ancestry_detail, pmid_id, proband_relationship_detail, sex_detail, twin_id', 'The following columns are included in the "participant" data model but are missing in the report: ancestry_metadata', 'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881', From faa522363388f1b95823995f16b2137e9464d156 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 15 Aug 2023 15:53:55 -0400 Subject: [PATCH 11/19] add rna columns --- seqr/views/apis/report_api.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py index 7105f62bcb..71d71347c7 100644 --- a/seqr/views/apis/report_api.py +++ b/seqr/views/apis/report_api.py @@ -688,6 +688,12 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False): EXPERIMENT_TABLE_COLUMNS = [ 'experiment_dna_short_read_id', 'analyte_id', 'experiment_sample_id', ] + EXPERIMENT_TABLE_AIRTABLE_FIELDS +EXPERIMENT_RNA_TABLE_COLUMNS = ['experiment_rna_short_read_id'] + [ + c for c in EXPERIMENT_TABLE_COLUMNS[1:] if not c.startswith('target')] + [ + 'single_or_paired_ends', 'within_site_batch_name', 'RIN', 'estimated_library_size', 'total_reads', 'percent_rRNA', + 'percent_mRNA', 'percent_mtRNA', 'percent_Globin', 'percent_UMI', '5prime3prime_bias', 'percent_GC', 'percent_chrX_Y', +] +EXPERIMENT_RNA_TABLE_COLUMNS.insert(4, 'library_prep_type') READ_TABLE_AIRTABLE_FIELDS = [ 'aligned_dna_short_read_file', 'aligned_dna_short_read_index_file', 'md5sum', 'reference_assembly', 'alignment_software', 'mean_coverage', 'analysis_details', @@ -695,6 +701,14 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False): READ_TABLE_COLUMNS = ['aligned_dna_short_read_id', 'experiment_dna_short_read_id'] + READ_TABLE_AIRTABLE_FIELDS + ['quality_issues'] READ_TABLE_COLUMNS.insert(6, 'reference_assembly_details') READ_TABLE_COLUMNS.insert(6, 'reference_assembly_uri') +READ_RNA_TABLE_COLUMNS = [ + 'aligned_rna_short_read_id', 'experiment_rna_short_read_id', 'aligned_rna_short_read_file', + 'aligned_rna_short_read_index_file', +] + READ_TABLE_COLUMNS[4:-2] + ['percent_uniquely_aligned', 'percent_multimapped', 'percent_unaligned', 'quality_issues'] +READ_RNA_TABLE_COLUMNS.insert(6, 'gene_annotation_details') +READ_RNA_TABLE_COLUMNS.insert(6, 'gene_annotation') +READ_RNA_TABLE_COLUMNS.insert(7, 'alignment_postprocessing') +READ_RNA_TABLE_COLUMNS.insert(7, 'alignment_log_file') READ_SET_TABLE_COLUMNS = ['aligned_dna_short_read_set_id', 'aligned_dna_short_read_id'] CALLED_TABLE_COLUMNS = [ 'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum', @@ -726,6 +740,8 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False): 'aligned_dna_short_read': READ_TABLE_COLUMNS, 'aligned_dna_short_read_set': READ_SET_TABLE_COLUMNS, 'called_variants_dna_short_read': CALLED_TABLE_COLUMNS, + 'experiment_rna_short_read': EXPERIMENT_RNA_TABLE_COLUMNS, + 'aligned_rna_short_read': READ_RNA_TABLE_COLUMNS, } WARN_MISSING_TABLE_COLUMNS = { 'participant': ['recontactable', 'reported_race', 'affected_status', 'phenotype_description', 'age_at_enrollment'], @@ -892,6 +908,8 @@ def gregor_export(request): ('aligned_dna_short_read', airtable_rows), ('aligned_dna_short_read_set', airtable_rows), ('called_variants_dna_short_read', airtable_rows), + ('aligned_rna_short_read', airtable_rows), + ('experiment_rna_short_read', airtable_rows), ]) write_multiple_files_to_gs(files, file_path, request.user, file_format='tsv') From 3d0516b9d8de3590894d7faced1b8433b549905c Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 15 Aug 2023 16:43:29 -0400 Subject: [PATCH 12/19] add rna airtable data --- seqr/views/apis/report_api.py | 75 +++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 30 deletions(-) diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py index 71d71347c7..638408e379 100644 --- a/seqr/views/apis/report_api.py +++ b/seqr/views/apis/report_api.py @@ -658,7 +658,7 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False): # GREGoR metadata -GREGOR_DATA_TYPES = ['wes', 'wgs'] +GREGOR_DATA_TYPES = ['wgs', 'wes', 'rna'] SMID_FIELD = 'SMID' PARTICIPANT_ID_FIELD = 'CollaboratorParticipantID' COLLABORATOR_SAMPLE_ID_FIELD = 'CollaboratorSampleID' @@ -688,12 +688,14 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False): EXPERIMENT_TABLE_COLUMNS = [ 'experiment_dna_short_read_id', 'analyte_id', 'experiment_sample_id', ] + EXPERIMENT_TABLE_AIRTABLE_FIELDS +EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS = [ + 'library_prep_type', 'single_or_paired_ends', 'within_site_batch_name', 'RIN', 'estimated_library_size', + 'total_reads', 'percent_rRNA', 'percent_mRNA', '5prime3prime_bias', +] EXPERIMENT_RNA_TABLE_COLUMNS = ['experiment_rna_short_read_id'] + [ - c for c in EXPERIMENT_TABLE_COLUMNS[1:] if not c.startswith('target')] + [ - 'single_or_paired_ends', 'within_site_batch_name', 'RIN', 'estimated_library_size', 'total_reads', 'percent_rRNA', - 'percent_mRNA', 'percent_mtRNA', 'percent_Globin', 'percent_UMI', '5prime3prime_bias', 'percent_GC', 'percent_chrX_Y', + c for c in EXPERIMENT_TABLE_COLUMNS[1:] if not c.startswith('target')] + EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS + [ + 'percent_mtRNA', 'percent_Globin', 'percent_UMI', 'percent_GC', 'percent_chrX_Y', ] -EXPERIMENT_RNA_TABLE_COLUMNS.insert(4, 'library_prep_type') READ_TABLE_AIRTABLE_FIELDS = [ 'aligned_dna_short_read_file', 'aligned_dna_short_read_index_file', 'md5sum', 'reference_assembly', 'alignment_software', 'mean_coverage', 'analysis_details', @@ -701,34 +703,42 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False): READ_TABLE_COLUMNS = ['aligned_dna_short_read_id', 'experiment_dna_short_read_id'] + READ_TABLE_AIRTABLE_FIELDS + ['quality_issues'] READ_TABLE_COLUMNS.insert(6, 'reference_assembly_details') READ_TABLE_COLUMNS.insert(6, 'reference_assembly_uri') -READ_RNA_TABLE_COLUMNS = [ - 'aligned_rna_short_read_id', 'experiment_rna_short_read_id', 'aligned_rna_short_read_file', - 'aligned_rna_short_read_index_file', -] + READ_TABLE_COLUMNS[4:-2] + ['percent_uniquely_aligned', 'percent_multimapped', 'percent_unaligned', 'quality_issues'] -READ_RNA_TABLE_COLUMNS.insert(6, 'gene_annotation_details') -READ_RNA_TABLE_COLUMNS.insert(6, 'gene_annotation') +READ_RNA_TABLE_AIRTABLE_ID_FIELDS = ['aligned_rna_short_read_file', 'aligned_rna_short_read_index_file'] +READ_RNA_TABLE_AIRTABLE_FIELDS = [ + 'gene_annotation', 'alignment_log_file', 'percent_uniquely_aligned', 'percent_multimapped', 'percent_unaligned', +] +READ_RNA_TABLE_COLUMNS = ['aligned_rna_short_read_id', 'experiment_rna_short_read_id'] + \ + READ_RNA_TABLE_AIRTABLE_ID_FIELDS + READ_TABLE_COLUMNS[4:-2] + READ_RNA_TABLE_AIRTABLE_FIELDS + ['quality_issues'] +READ_RNA_TABLE_COLUMNS.insert(6, 'gene_annotation_details') # TODO placement READ_RNA_TABLE_COLUMNS.insert(7, 'alignment_postprocessing') -READ_RNA_TABLE_COLUMNS.insert(7, 'alignment_log_file') READ_SET_TABLE_COLUMNS = ['aligned_dna_short_read_set_id', 'aligned_dna_short_read_id'] CALLED_TABLE_COLUMNS = [ 'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum', 'caller_software', 'variant_types', 'analysis_details', ] -DATA_TYPE_OMIT = {'wgs': ['targeted_regions_method'], 'wes': []} -MAPPED_AIRTABLE_FIELDS = {'alignment_software': 'alignment_software_dna'} -NO_DATA_TYPE_FIELDS = {'targeted_region_bed_file', 'reference_assembly', 'analysis_details'} +RNA_ONLY = EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS + READ_RNA_TABLE_AIRTABLE_FIELDS + [ + 'reference_assembly_uri', 'alignment_software'] +DATA_TYPE_OMIT = { + 'wgs': ['targeted_regions_method'] + RNA_ONLY, 'wes': RNA_ONLY, 'rna': [ + 'targeted_regions_method', 'target_insert_size', 'mean_coverage', 'aligned_dna_short_read_file', + 'aligned_dna_short_read_index_file', + ], +} +NO_DATA_TYPE_FIELDS = { + 'targeted_region_bed_file', 'reference_assembly', 'analysis_details', 'percent_rRNA', 'percent_mRNA', + 'alignment_software_dna', +} +NO_DATA_TYPE_FIELDS.update(READ_RNA_TABLE_AIRTABLE_ID_FIELDS) -DATA_TYPE_AIRTABLE_COLUMNS = EXPERIMENT_TABLE_AIRTABLE_FIELDS + READ_TABLE_AIRTABLE_FIELDS + [ +DATA_TYPE_AIRTABLE_COLUMNS = EXPERIMENT_TABLE_AIRTABLE_FIELDS + READ_TABLE_AIRTABLE_FIELDS + RNA_ONLY + [ COLLABORATOR_SAMPLE_ID_FIELD, SMID_FIELD] ALL_AIRTABLE_COLUMNS = DATA_TYPE_AIRTABLE_COLUMNS + CALLED_TABLE_COLUMNS AIRTABLE_QUERY_COLUMNS = set(CALLED_TABLE_COLUMNS) AIRTABLE_QUERY_COLUMNS.remove('md5sum') AIRTABLE_QUERY_COLUMNS.update(NO_DATA_TYPE_FIELDS) -AIRTABLE_QUERY_COLUMNS.update(MAPPED_AIRTABLE_FIELDS.values()) for data_type in GREGOR_DATA_TYPES: - data_type_columns = set(DATA_TYPE_AIRTABLE_COLUMNS) - NO_DATA_TYPE_FIELDS - set( - MAPPED_AIRTABLE_FIELDS.keys()) - set(DATA_TYPE_OMIT[data_type]) + data_type_columns = set(DATA_TYPE_AIRTABLE_COLUMNS) - NO_DATA_TYPE_FIELDS - set(DATA_TYPE_OMIT[data_type]) AIRTABLE_QUERY_COLUMNS.update({f'{field}_{data_type}' for field in data_type_columns}) TABLE_COLUMNS = { @@ -826,7 +836,7 @@ def gregor_export(request): consent_code=consent_code[0], projectcategory__name='GREGoR', ) - sample_types = get_search_samples(projects, active_only=False).values_list('individual_id', 'sample_type') + sample_types = Sample.objects.filter(individual__family__project__in=projects).values_list('individual_id', 'sample_type') individual_data_types = defaultdict(set) for individual_db_id, sample_type in sample_types: individual_data_types[individual_db_id].add(sample_type) @@ -845,10 +855,12 @@ def gregor_export(request): phenotype_rows = [] analyte_rows = [] airtable_rows = [] + airtable_rna_rows = [] for data_type_individuals in grouped_data_type_individuals.values(): # If multiple individual records, prefer WGS individual = next( - data_type_individuals[data_type] for data_type in ['WGS', 'WES'] if data_type_individuals.get(data_type) + data_type_individuals[data_type.upper()] for data_type in GREGOR_DATA_TYPES + if data_type_individuals.get(data_type.upper()) ) # family table @@ -887,11 +899,14 @@ def gregor_export(request): for data_type in data_type_individuals: data_type_metadata = airtable_metadata.get(data_type) if data_type_metadata: - experiment_ids = _get_experiment_ids(data_type_metadata) + is_rna = data_type == 'RNA' + experiment_ids = _get_experiment_ids(data_type_metadata, is_rna) analyte_ids.add(experiment_ids['analyte_id']) row = {**airtable_metadata, **data_type_metadata, **experiment_ids} - row.update({k: row[v] for k, v in MAPPED_AIRTABLE_FIELDS.items()}) - airtable_rows.append(row) + if not is_rna: + row['alignment_software'] = row['alignment_software_dna'] + rows = airtable_rna_rows if is_rna else airtable_rows + rows.append(row) # analyte table if not analyte_ids: @@ -908,8 +923,8 @@ def gregor_export(request): ('aligned_dna_short_read', airtable_rows), ('aligned_dna_short_read_set', airtable_rows), ('called_variants_dna_short_read', airtable_rows), - ('aligned_rna_short_read', airtable_rows), - ('experiment_rna_short_read', airtable_rows), + ('aligned_rna_short_read', airtable_rna_rows), + ('experiment_rna_short_read', airtable_rna_rows), ]) write_multiple_files_to_gs(files, file_path, request.user, file_format='tsv') @@ -994,14 +1009,14 @@ def _get_analyte_row(individual): } -def _get_experiment_ids(data_type_metadata): +def _get_experiment_ids(data_type_metadata, is_rna): collaborator_sample_id = data_type_metadata[COLLABORATOR_SAMPLE_ID_FIELD] - experiment_dna_short_read_id = f'Broad_{data_type_metadata.get("experiment_type", "NA")}_{collaborator_sample_id}' + experiment_short_read_id = f'Broad_{data_type_metadata.get("experiment_type", "NA")}_{collaborator_sample_id}' return { 'analyte_id': _get_analyte_id(data_type_metadata), - 'experiment_dna_short_read_id': experiment_dna_short_read_id, + f'experiment_{"rna" if is_rna else "dna"}_short_read_id': experiment_short_read_id, 'experiment_sample_id': collaborator_sample_id, - 'aligned_dna_short_read_id': f'{experiment_dna_short_read_id}_1' + f'aligned_{"rna" if is_rna else "dna"}_short_read_id': f'{experiment_short_read_id}_1' } From 3fb1aeb5c211f61e5e76ac064c6ee1b182158741 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 15 Aug 2023 16:53:08 -0400 Subject: [PATCH 13/19] clean up column order --- seqr/views/apis/report_api.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py index 638408e379..988a1ac047 100644 --- a/seqr/views/apis/report_api.py +++ b/seqr/views/apis/report_api.py @@ -698,27 +698,26 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False): ] READ_TABLE_AIRTABLE_FIELDS = [ 'aligned_dna_short_read_file', 'aligned_dna_short_read_index_file', 'md5sum', 'reference_assembly', - 'alignment_software', 'mean_coverage', 'analysis_details', + 'mean_coverage', 'alignment_software', 'analysis_details', ] READ_TABLE_COLUMNS = ['aligned_dna_short_read_id', 'experiment_dna_short_read_id'] + READ_TABLE_AIRTABLE_FIELDS + ['quality_issues'] READ_TABLE_COLUMNS.insert(6, 'reference_assembly_details') READ_TABLE_COLUMNS.insert(6, 'reference_assembly_uri') READ_RNA_TABLE_AIRTABLE_ID_FIELDS = ['aligned_rna_short_read_file', 'aligned_rna_short_read_index_file'] READ_RNA_TABLE_AIRTABLE_FIELDS = [ - 'gene_annotation', 'alignment_log_file', 'percent_uniquely_aligned', 'percent_multimapped', 'percent_unaligned', + 'gene_annotation', 'alignment_software', 'alignment_log_file', 'percent_uniquely_aligned', 'percent_multimapped', 'percent_unaligned', ] READ_RNA_TABLE_COLUMNS = ['aligned_rna_short_read_id', 'experiment_rna_short_read_id'] + \ - READ_RNA_TABLE_AIRTABLE_ID_FIELDS + READ_TABLE_COLUMNS[4:-2] + READ_RNA_TABLE_AIRTABLE_FIELDS + ['quality_issues'] -READ_RNA_TABLE_COLUMNS.insert(6, 'gene_annotation_details') # TODO placement -READ_RNA_TABLE_COLUMNS.insert(7, 'alignment_postprocessing') + READ_RNA_TABLE_AIRTABLE_ID_FIELDS + READ_TABLE_COLUMNS[4:-3] + READ_RNA_TABLE_AIRTABLE_FIELDS + ['quality_issues'] +READ_RNA_TABLE_COLUMNS.insert(10, 'gene_annotation_details') +READ_RNA_TABLE_COLUMNS.insert(13, 'alignment_postprocessing') READ_SET_TABLE_COLUMNS = ['aligned_dna_short_read_set_id', 'aligned_dna_short_read_id'] CALLED_TABLE_COLUMNS = [ 'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum', 'caller_software', 'variant_types', 'analysis_details', ] -RNA_ONLY = EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS + READ_RNA_TABLE_AIRTABLE_FIELDS + [ - 'reference_assembly_uri', 'alignment_software'] +RNA_ONLY = EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS + READ_RNA_TABLE_AIRTABLE_FIELDS + ['reference_assembly_uri'] DATA_TYPE_OMIT = { 'wgs': ['targeted_regions_method'] + RNA_ONLY, 'wes': RNA_ONLY, 'rna': [ 'targeted_regions_method', 'target_insert_size', 'mean_coverage', 'aligned_dna_short_read_file', From a30987e19a4edce9d7bbf0a33e85a43e7f33ed3a Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 15 Aug 2023 17:22:52 -0400 Subject: [PATCH 14/19] get tests passing --- seqr/views/apis/report_api.py | 4 ++-- seqr/views/apis/report_api_tests.py | 37 ++++++++++++++++++----------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py index 988a1ac047..37ff1e3d80 100644 --- a/seqr/views/apis/report_api.py +++ b/seqr/views/apis/report_api.py @@ -751,7 +751,7 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False): 'called_variants_dna_short_read': CALLED_TABLE_COLUMNS, 'experiment_rna_short_read': EXPERIMENT_RNA_TABLE_COLUMNS, 'aligned_rna_short_read': READ_RNA_TABLE_COLUMNS, -} +} # TODO add experiment table WARN_MISSING_TABLE_COLUMNS = { 'participant': ['recontactable', 'reported_race', 'affected_status', 'phenotype_description', 'age_at_enrollment'], } @@ -922,8 +922,8 @@ def gregor_export(request): ('aligned_dna_short_read', airtable_rows), ('aligned_dna_short_read_set', airtable_rows), ('called_variants_dna_short_read', airtable_rows), - ('aligned_rna_short_read', airtable_rna_rows), ('experiment_rna_short_read', airtable_rna_rows), + ('aligned_rna_short_read', airtable_rna_rows), ]) write_multiple_files_to_gs(files, file_path, request.user, file_format='tsv') diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index 78f7fafbb3..458334d7ab 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -265,6 +265,7 @@ EXPECTED_GREGOR_FILES = [ 'participant', 'family', 'phenotype', 'analyte', 'experiment_dna_short_read', 'aligned_dna_short_read', 'aligned_dna_short_read_set', 'called_variants_dna_short_read', + 'experiment_rna_short_read', 'aligned_rna_short_read', ] EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW = { @@ -828,7 +829,8 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False): [row.split('\t') for row in write_call.args[0].split('\n')] for write_call in mock_open.return_value.__enter__.return_value.write.call_args_list ] - participant_file, family_file, phenotype_file, analyte_file, experiment_file, read_file, read_set_file, called_file = files + participant_file, family_file, phenotype_file, analyte_file, experiment_file, read_file, read_set_file, \ + called_file, experiment_rna_file, aligned_rna_file = files self.assertEqual(len(participant_file), 16 if has_second_project else 14) self.assertEqual(participant_file[0], [ @@ -924,25 +926,25 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False): self.assertEqual(read_file[0], [ 'aligned_dna_short_read_id', 'experiment_dna_short_read_id', 'aligned_dna_short_read_file', 'aligned_dna_short_read_index_file', 'md5sum', 'reference_assembly', 'reference_assembly_uri', 'reference_assembly_details', - 'alignment_software', 'mean_coverage', 'analysis_details', 'quality_issues', + 'mean_coverage', 'alignment_software', 'analysis_details', 'quality_issues', ]) self.assertIn([ 'Broad_exome_VCGS_FAM203_621_D2_1', 'Broad_exome_VCGS_FAM203_621_D2', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.cram', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.crai', '129c28163df082', 'GRCh38', - '', '', 'BWA-MEM-2.3', '', 'DOI:10.5281/zenodo.4469317', '', + '', '', '', 'BWA-MEM-2.3', 'DOI:10.5281/zenodo.4469317', '', ], read_file) self.assertIn([ 'Broad_exome_NA20888_1', 'Broad_exome_NA20888', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.cram', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.crai', 'a6f6308866765ce8', 'GRCh38', '', '', - 'BWA 0.7.15.r1140', '42.8', '', '', + '42.8', 'BWA 0.7.15.r1140', '', '', ], read_file) self.assertEqual([ 'Broad_genome_NA20888_1_1', 'Broad_genome_NA20888_1', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888_1.cram', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888_1.crai', '2aa33e8c32020b1c', 'GRCh38', '', '', - 'BWA 0.7.15.r1140', '36.1', '', '', + '36.1', 'BWA 0.7.15.r1140', '', '', ] in read_file, has_second_project) self.assertEqual(len(read_set_file), num_airtable_rows) @@ -979,15 +981,22 @@ def _test_expected_gregor_airtable_calls(self, additional_samples=None): sample_fields[0] = 'SeqrCollaboratorSampleID' self._assert_expected_airtable_call(1, f"OR({secondary_sample_filter})", sample_fields) metadata_fields = [ - 'CollaboratorParticipantID', 'CollaboratorSampleID_wes', 'CollaboratorSampleID_wgs', 'SMID_wes', 'SMID_wgs', - 'aligned_dna_short_read_file_wes', 'aligned_dna_short_read_file_wgs', 'aligned_dna_short_read_index_file_wes', - 'aligned_dna_short_read_index_file_wgs', 'aligned_dna_short_read_set_id', 'alignment_software_dna', - 'analysis_details', 'called_variants_dna_file', 'called_variants_dna_short_read_id', 'caller_software', - 'date_data_generation_wes', 'date_data_generation_wgs', 'experiment_type_wes', 'experiment_type_wgs', - 'md5sum_wes', 'md5sum_wgs', 'mean_coverage_wes', 'mean_coverage_wgs', 'read_length_wes', 'read_length_wgs', - 'reference_assembly', 'seq_library_prep_kit_method_wes', 'seq_library_prep_kit_method_wgs', - 'sequencing_platform_wes', 'sequencing_platform_wgs', 'target_insert_size_wes', 'target_insert_size_wgs', - 'targeted_region_bed_file', 'targeted_regions_method_wes', 'variant_types', + 'CollaboratorParticipantID', '5prime3prime_bias_rna', 'CollaboratorSampleID_rna', 'CollaboratorSampleID_wes', + 'CollaboratorSampleID_wgs', 'RIN_rna', 'SMID_rna', 'SMID_wes', 'SMID_wgs', 'aligned_dna_short_read_file_wes', + 'aligned_dna_short_read_file_wgs', 'aligned_dna_short_read_index_file_wes', + 'aligned_dna_short_read_index_file_wgs', 'aligned_dna_short_read_set_id', + 'aligned_rna_short_read_file', 'aligned_rna_short_read_index_file', 'alignment_log_file_rna', + 'alignment_software_dna', 'alignment_software_rna', 'analysis_details', 'called_variants_dna_file', + 'called_variants_dna_short_read_id', 'caller_software', 'date_data_generation_rna', 'date_data_generation_wes', + 'date_data_generation_wgs', 'estimated_library_size_rna', 'experiment_type_rna', 'experiment_type_wes', + 'experiment_type_wgs', 'gene_annotation_rna', 'library_prep_type_rna', 'md5sum_rna', 'md5sum_wes', + 'md5sum_wgs', 'mean_coverage_wes', 'mean_coverage_wgs', 'percent_mRNA', 'percent_multimapped_rna', + 'percent_rRNA', 'percent_unaligned_rna', 'percent_uniquely_aligned_rna', 'read_length_rna', 'read_length_wes', + 'read_length_wgs', 'reference_assembly', 'reference_assembly_uri_rna', 'seq_library_prep_kit_method_rna', + 'seq_library_prep_kit_method_wes', 'seq_library_prep_kit_method_wgs', 'sequencing_platform_rna', + 'sequencing_platform_wes', 'sequencing_platform_wgs', 'single_or_paired_ends_rna', 'target_insert_size_wes', + 'target_insert_size_wgs', 'targeted_region_bed_file', 'targeted_regions_method_wes', 'total_reads_rna', + 'variant_types', 'within_site_batch_name_rna', ] self._assert_expected_airtable_call(2, "OR(CollaboratorParticipantID='NA19675',CollaboratorParticipantID='NA20888',CollaboratorParticipantID='VCGS_FAM203_621')", metadata_fields) From dd5a245fef08ce9df57565c4adf98f8200302b5b Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 16 Aug 2023 10:13:09 -0400 Subject: [PATCH 15/19] clean up gregor file config --- seqr/views/apis/report_api.py | 48 +++++++++++++---------------------- 1 file changed, 17 insertions(+), 31 deletions(-) diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py index 37ff1e3d80..566915d851 100644 --- a/seqr/views/apis/report_api.py +++ b/seqr/views/apis/report_api.py @@ -740,18 +740,7 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False): data_type_columns = set(DATA_TYPE_AIRTABLE_COLUMNS) - NO_DATA_TYPE_FIELDS - set(DATA_TYPE_OMIT[data_type]) AIRTABLE_QUERY_COLUMNS.update({f'{field}_{data_type}' for field in data_type_columns}) -TABLE_COLUMNS = { - 'participant': PARTICIPANT_TABLE_COLUMNS, - 'family': GREGOR_FAMILY_TABLE_COLUMNS, - 'phenotype': PHENOTYPE_TABLE_COLUMNS, - 'analyte': ANALYTE_TABLE_COLUMNS, - 'experiment_dna_short_read': EXPERIMENT_TABLE_COLUMNS, - 'aligned_dna_short_read': READ_TABLE_COLUMNS, - 'aligned_dna_short_read_set': READ_SET_TABLE_COLUMNS, - 'called_variants_dna_short_read': CALLED_TABLE_COLUMNS, - 'experiment_rna_short_read': EXPERIMENT_RNA_TABLE_COLUMNS, - 'aligned_rna_short_read': READ_RNA_TABLE_COLUMNS, -} # TODO add experiment table +# 'experiment': ['experiment_id', 'table_name', 'id_in_table', 'participant_id'], # TODO WARN_MISSING_TABLE_COLUMNS = { 'participant': ['recontactable', 'reported_race', 'affected_status', 'phenotype_description', 'age_at_enrollment'], } @@ -913,18 +902,19 @@ def gregor_export(request): for analyte_id in analyte_ids: analyte_rows.append(dict(participant_id=participant_id, analyte_id=analyte_id, **_get_analyte_row(individual))) - files, warnings = _get_validated_gregor_files([ - ('participant', participant_rows), - ('family', list(family_map.values())), - ('phenotype', phenotype_rows), - ('analyte', analyte_rows), - ('experiment_dna_short_read', airtable_rows), - ('aligned_dna_short_read', airtable_rows), - ('aligned_dna_short_read_set', airtable_rows), - ('called_variants_dna_short_read', airtable_rows), - ('experiment_rna_short_read', airtable_rna_rows), - ('aligned_rna_short_read', airtable_rna_rows), - ]) + files = [ + ('participant', PARTICIPANT_TABLE_COLUMNS, participant_rows), + ('family', GREGOR_FAMILY_TABLE_COLUMNS, list(family_map.values())), + ('phenotype', PHENOTYPE_TABLE_COLUMNS, phenotype_rows), + ('analyte', ANALYTE_TABLE_COLUMNS, analyte_rows), + ('experiment_dna_short_read', EXPERIMENT_TABLE_COLUMNS, airtable_rows), + ('aligned_dna_short_read', READ_TABLE_COLUMNS, airtable_rows), + ('aligned_dna_short_read_set', READ_SET_TABLE_COLUMNS, airtable_rows), + ('called_variants_dna_short_read', CALLED_TABLE_COLUMNS, airtable_rows), + ('experiment_rna_short_read', EXPERIMENT_RNA_TABLE_COLUMNS, airtable_rna_rows), + ('aligned_rna_short_read', READ_RNA_TABLE_COLUMNS, airtable_rna_rows), + ] + warnings = _validate_gregor_files(files) write_multiple_files_to_gs(files, file_path, request.user, file_format='tsv') return create_json_response({ @@ -1024,7 +1014,7 @@ def _get_analyte_id(airtable_metadata): return f'Broad_{sm_id}' if sm_id else None -def _get_validated_gregor_files(file_data): +def _validate_gregor_files(file_data): errors = [] warnings = [] try: @@ -1043,11 +1033,7 @@ def _get_validated_gregor_files(file_data): f'The following tables are required in the data model but absent from the reports: {", ".join(missing_tables)}' ) - files = [] - for file_name, data in file_data: - columns = TABLE_COLUMNS[file_name] - files.append([file_name, columns, data]) - + for file_name, columns, data in file_data: table_validator = validators.get(file_name) if not table_validator: warnings.append(f'No data model found for "{file_name}" table so no validation was performed') @@ -1075,7 +1061,7 @@ def _get_validated_gregor_files(file_data): if errors: raise ErrorsWarningsException(errors, warnings) - return files, warnings + return warnings def _load_data_model_validators(): From 9d5b3ce99a9d9a17e31ba6b5df484fe78f2a6674 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 16 Aug 2023 10:36:53 -0400 Subject: [PATCH 16/19] add experiment lookup file --- seqr/views/apis/report_api.py | 47 ++++++++++++++++++++--------- seqr/views/apis/report_api_tests.py | 4 +-- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py index 566915d851..3e1e67995c 100644 --- a/seqr/views/apis/report_api.py +++ b/seqr/views/apis/report_api.py @@ -696,6 +696,7 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False): c for c in EXPERIMENT_TABLE_COLUMNS[1:] if not c.startswith('target')] + EXPERIMENT_RNA_TABLE_AIRTABLE_FIELDS + [ 'percent_mtRNA', 'percent_Globin', 'percent_UMI', 'percent_GC', 'percent_chrX_Y', ] +EXPERIMENT_LOOKUP_TABLE_COLUMNS = ['experiment_id', 'table_name', 'id_in_table', 'participant_id'] READ_TABLE_AIRTABLE_FIELDS = [ 'aligned_dna_short_read_file', 'aligned_dna_short_read_index_file', 'md5sum', 'reference_assembly', 'mean_coverage', 'alignment_software', 'analysis_details', @@ -740,7 +741,6 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False): data_type_columns = set(DATA_TYPE_AIRTABLE_COLUMNS) - NO_DATA_TYPE_FIELDS - set(DATA_TYPE_OMIT[data_type]) AIRTABLE_QUERY_COLUMNS.update({f'{field}_{data_type}' for field in data_type_columns}) -# 'experiment': ['experiment_id', 'table_name', 'id_in_table', 'participant_id'], # TODO WARN_MISSING_TABLE_COLUMNS = { 'participant': ['recontactable', 'reported_race', 'affected_status', 'phenotype_description', 'age_at_enrollment'], } @@ -844,6 +844,7 @@ def gregor_export(request): analyte_rows = [] airtable_rows = [] airtable_rna_rows = [] + experiment_lookup_rows = [] for data_type_individuals in grouped_data_type_individuals.values(): # If multiple individual records, prefer WGS individual = next( @@ -885,16 +886,17 @@ def gregor_export(request): if airtable_sample: airtable_metadata = airtable_metadata_by_participant.get(airtable_sample[PARTICIPANT_ID_FIELD]) or {} for data_type in data_type_individuals: - data_type_metadata = airtable_metadata.get(data_type) - if data_type_metadata: - is_rna = data_type == 'RNA' - experiment_ids = _get_experiment_ids(data_type_metadata, is_rna) - analyte_ids.add(experiment_ids['analyte_id']) - row = {**airtable_metadata, **data_type_metadata, **experiment_ids} - if not is_rna: - row['alignment_software'] = row['alignment_software_dna'] - rows = airtable_rna_rows if is_rna else airtable_rows - rows.append(row) + if data_type not in airtable_metadata: + continue + row = _get_airtable_row(data_type, airtable_metadata) + analyte_ids.add(row['analyte_id']) + is_rna = data_type == 'RNA' + if not is_rna: + row['alignment_software'] = row['alignment_software_dna'] + (airtable_rna_rows if is_rna else airtable_rows).append(row) + experiment_lookup_rows.append( + {'participant_id': participant_id, **_get_experiment_lookup_row(is_rna, row)} + ) # analyte table if not analyte_ids: @@ -913,6 +915,7 @@ def gregor_export(request): ('called_variants_dna_short_read', CALLED_TABLE_COLUMNS, airtable_rows), ('experiment_rna_short_read', EXPERIMENT_RNA_TABLE_COLUMNS, airtable_rna_rows), ('aligned_rna_short_read', READ_RNA_TABLE_COLUMNS, airtable_rna_rows), + ('experiment', EXPERIMENT_LOOKUP_TABLE_COLUMNS, experiment_lookup_rows), ] warnings = _validate_gregor_files(files) write_multiple_files_to_gs(files, file_path, request.user, file_format='tsv') @@ -998,14 +1001,20 @@ def _get_analyte_row(individual): } -def _get_experiment_ids(data_type_metadata, is_rna): +def _get_airtable_row(data_type, airtable_metadata): + data_type_metadata = airtable_metadata[data_type] collaborator_sample_id = data_type_metadata[COLLABORATOR_SAMPLE_ID_FIELD] experiment_short_read_id = f'Broad_{data_type_metadata.get("experiment_type", "NA")}_{collaborator_sample_id}' + aligned_short_read_id = f'{experiment_short_read_id}_1' return { 'analyte_id': _get_analyte_id(data_type_metadata), - f'experiment_{"rna" if is_rna else "dna"}_short_read_id': experiment_short_read_id, + 'experiment_dna_short_read_id': experiment_short_read_id, + 'experiment_rna_short_read_id': experiment_short_read_id, 'experiment_sample_id': collaborator_sample_id, - f'aligned_{"rna" if is_rna else "dna"}_short_read_id': f'{experiment_short_read_id}_1' + 'aligned_dna_short_read_id': aligned_short_read_id, + 'aligned_rna_short_read_id': aligned_short_read_id, + **airtable_metadata, + **data_type_metadata, } @@ -1014,6 +1023,16 @@ def _get_analyte_id(airtable_metadata): return f'Broad_{sm_id}' if sm_id else None +def _get_experiment_lookup_row(is_rna, row_data): + table_name = f'experiment_{"rna" if is_rna else "dna"}_short_read' + id_in_table = row_data[f'{table_name}_id'] + return { + 'table_name': table_name, + 'id_in_table': id_in_table, + 'experiment_id': f'{table_name}.{id_in_table}', + } + + def _validate_gregor_files(file_data): errors = [] warnings = [] diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index 458334d7ab..0ff383f8c4 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -265,7 +265,7 @@ EXPECTED_GREGOR_FILES = [ 'participant', 'family', 'phenotype', 'analyte', 'experiment_dna_short_read', 'aligned_dna_short_read', 'aligned_dna_short_read_set', 'called_variants_dna_short_read', - 'experiment_rna_short_read', 'aligned_rna_short_read', + 'experiment_rna_short_read', 'aligned_rna_short_read', 'experiment', ] EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW = { @@ -830,7 +830,7 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False): for write_call in mock_open.return_value.__enter__.return_value.write.call_args_list ] participant_file, family_file, phenotype_file, analyte_file, experiment_file, read_file, read_set_file, \ - called_file, experiment_rna_file, aligned_rna_file = files + called_file, experiment_rna_file, aligned_rna_file, experiment_lookup_file = files self.assertEqual(len(participant_file), 16 if has_second_project else 14) self.assertEqual(participant_file[0], [ From 9ea6297883552cbaa8ff205f75d3ce09f69a8913 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 16 Aug 2023 10:57:07 -0400 Subject: [PATCH 17/19] test empty rna data --- seqr/views/apis/report_api_tests.py | 33 +++++++++++++++++++++++++++++ seqr/views/utils/export_utils.py | 5 ++--- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index 0ff383f8c4..e7d9f5a3f8 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -966,6 +966,39 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False): self.assertEqual( ['NA', 'Broad_NA20888_D1', 'NA', '2aa33e8c32020b1c', 'NA', 'SNV', ''] in called_file, has_second_project) + self.assertEqual(len(experiment_rna_file), 1) + self.assertEqual(experiment_rna_file[0], [ + 'experiment_rna_short_read_id', 'analyte_id', 'experiment_sample_id', 'seq_library_prep_kit_method', + 'read_length', 'experiment_type', 'date_data_generation', 'sequencing_platform', 'library_prep_type', + 'single_or_paired_ends', 'within_site_batch_name', 'RIN', 'estimated_library_size', 'total_reads', + 'percent_rRNA', 'percent_mRNA', '5prime3prime_bias', 'percent_mtRNA', 'percent_Globin', 'percent_UMI', + 'percent_GC', 'percent_chrX_Y', + ]) + + self.assertEqual(len(aligned_rna_file), 1) + self.assertEqual(aligned_rna_file[0], [ + 'aligned_rna_short_read_id', 'experiment_rna_short_read_id', 'aligned_rna_short_read_file', + 'aligned_rna_short_read_index_file', 'md5sum', 'reference_assembly', 'reference_assembly_uri', + 'reference_assembly_details', 'mean_coverage', 'gene_annotation', 'gene_annotation_details', + 'alignment_software', 'alignment_log_file', 'alignment_postprocessing', 'percent_uniquely_aligned', + 'percent_multimapped', 'percent_unaligned', 'quality_issues' + ]) + + self.assertEqual(len(experiment_lookup_file), num_airtable_rows) + self.assertEqual(experiment_lookup_file[0], ['experiment_id', 'table_name', 'id_in_table', 'participant_id']) + self.assertIn([ + 'experiment_dna_short_read.Broad_exome_VCGS_FAM203_621_D2', 'experiment_dna_short_read', + 'Broad_exome_VCGS_FAM203_621_D2', 'Broad_HG00731', + ], experiment_lookup_file) + self.assertIn([ + 'experiment_dna_short_read.Broad_exome_NA20888', 'experiment_dna_short_read', + 'Broad_exome_NA20888', 'Broad_NA20888', + ], experiment_lookup_file) + self.assertEqual([ + 'experiment_dna_short_read.Broad_genome_NA20888_1', 'experiment_dna_short_read', 'Broad_genome_NA20888_1', + 'Broad_NA20888', + ] in experiment_lookup_file, has_second_project) + def _test_expected_gregor_airtable_calls(self, additional_samples=None): self.assertEqual(len(responses.calls), 4) sample_ids = { diff --git a/seqr/views/utils/export_utils.py b/seqr/views/utils/export_utils.py index c82c16645e..f60bb7efc0 100644 --- a/seqr/views/utils/export_utils.py +++ b/seqr/views/utils/export_utils.py @@ -75,10 +75,9 @@ def _format_files_content(files, file_format='csv', add_header_prefix=False, bl header_display = ['{}-{}'.format(str(header_tuple[0]).zfill(2), header_tuple[1]) for header_tuple in enumerate(header)] header_display[0] = header[0] - content = DELIMITERS[file_format].join(header_display) + '\n' content_rows = [[row.get(key) or blank_value for key in header] for row in rows] - content += '\n'.join([ - DELIMITERS[file_format].join(row) for row in content_rows + content = '\n'.join([ + DELIMITERS[file_format].join(row) for row in [header_display] + content_rows if any(val != blank_value for val in row) ]) content = str(content.encode('utf-8'), 'ascii', errors='ignore') # Strip unicode chars in the content From a61c81171e07b910653f94c9e51cddeb4ed1eeec Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 16 Aug 2023 11:35:42 -0400 Subject: [PATCH 18/19] test rna data --- seqr/views/apis/report_api_tests.py | 80 ++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 8 deletions(-) diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index e7d9f5a3f8..b03234cebe 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -161,6 +161,16 @@ 'Recontactable': 'Yes', }, }, + { + "id": "rec2B67GmXpAkQW8z", + "fields": { + 'SeqrCollaboratorSampleID': 'NA19679', + 'CollaboratorSampleID': 'NA19679', + 'CollaboratorParticipantID': 'NA19679', + 'SMID': 'SM-N1P91', + 'Recontactable': 'Yes', + }, + }, { "id": "rec2Nkg10N1KssPc3", "fields": { @@ -221,6 +231,41 @@ 'experiment_type_wgs': 'genome', }, }, + { + "id": "rec4B7OGmQpVkQW7z", + "fields": { + 'CollaboratorParticipantID': 'NA19679', + 'CollaboratorSampleID_rna': 'NA19679', + 'SMID_rna': 'SM-N1P91', + 'seq_library_prep_kit_method_rna': 'Unknown', + 'library_prep_type_rna': 'stranded poly-A pulldown', + 'read_length_rna': '151', + 'experiment_type_rna': 'paired-end', + 'single_or_paired_ends_rna': 'paired-end', + 'date_data_generation_rna': '2023-02-11', + 'sequencing_platform_rna': 'NovaSeq', + 'aligned_rna_short_read_file': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/NA19679.Aligned.out.cram', + 'aligned_rna_short_read_index_file': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/NA19679.Aligned.out.crai', + 'aligned_rna_short_read_id': '', + 'md5sum_rna': 'f6490b8ebdf2', + '5prime3prime_bias_rna': '1.05', + 'gene_annotation_rna': 'GENCODEv26', + 'reference_assembly': 'GRCh38', + 'reference_assembly_uri_rna': 'gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta', + 'alignment_software_rna': 'STARv2.7.10b', + 'alignment_log_file_rna': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/NA19679.Log.final.out', + 'percent_uniquely_aligned_rna': '80.53', + 'percent_multimapped_rna': '17.08', + 'percent_unaligned_rna': '1.71', + 'percent_mRNA': '80.2', + 'percent_rRNA': '5.9', + 'RIN_rna': '8.9818', + 'total_reads_rna': '106,842,386', + 'within_site_batch_name_rna': 'LCSET-26942', + 'estimated_library_size_rna': '19,480,858', + 'variant_types': 'SNV', + }, + }, { "id": "rec2BFCGmQpAkQ7x", "fields": { @@ -771,7 +816,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat 'The following tables are required in the data model but absent from the reports: subject, dna_read_data_set', 'The following columns are included in the "participant" table but are missing from the data model: age_at_last_observation, ancestry_detail, pmid_id, proband_relationship_detail, sex_detail, twin_id', 'The following columns are included in the "participant" data model but are missing in the report: ancestry_metadata', - 'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881', + 'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881', 'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', 'The following entries are missing recommended "phenotype_description" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', 'The following entries are missing recommended "age_at_enrollment" in the "participant" table: Broad_HG00731, Broad_NA20870, Broad_NA20872, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', @@ -966,7 +1011,7 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False): self.assertEqual( ['NA', 'Broad_NA20888_D1', 'NA', '2aa33e8c32020b1c', 'NA', 'SNV', ''] in called_file, has_second_project) - self.assertEqual(len(experiment_rna_file), 1) + self.assertEqual(len(experiment_rna_file), 2) self.assertEqual(experiment_rna_file[0], [ 'experiment_rna_short_read_id', 'analyte_id', 'experiment_sample_id', 'seq_library_prep_kit_method', 'read_length', 'experiment_type', 'date_data_generation', 'sequencing_platform', 'library_prep_type', @@ -974,8 +1019,13 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False): 'percent_rRNA', 'percent_mRNA', '5prime3prime_bias', 'percent_mtRNA', 'percent_Globin', 'percent_UMI', 'percent_GC', 'percent_chrX_Y', ]) + self.assertEqual(experiment_rna_file[1], [ + 'Broad_paired-end_NA19679', 'Broad_SM-N1P91', 'NA19679', 'Unknown', '151', 'paired-end', '2023-02-11', + 'NovaSeq', 'stranded poly-A pulldown', 'paired-end', 'LCSET-26942', '8.9818', '19,480,858', '106,842,386', + '5.9', '80.2', '1.05', '', '', '', '', '', + ]) - self.assertEqual(len(aligned_rna_file), 1) + self.assertEqual(len(aligned_rna_file), 2) self.assertEqual(aligned_rna_file[0], [ 'aligned_rna_short_read_id', 'experiment_rna_short_read_id', 'aligned_rna_short_read_file', 'aligned_rna_short_read_index_file', 'md5sum', 'reference_assembly', 'reference_assembly_uri', @@ -983,16 +1033,27 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False): 'alignment_software', 'alignment_log_file', 'alignment_postprocessing', 'percent_uniquely_aligned', 'percent_multimapped', 'percent_unaligned', 'quality_issues' ]) + self.assertEqual(aligned_rna_file[1], [ + '', 'Broad_paired-end_NA19679', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/NA19679.Aligned.out.cram', + 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/NA19679.Aligned.out.crai', 'f6490b8ebdf2', 'GRCh38', + 'gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta', '', '', 'GENCODEv26', '', + 'STARv2.7.10b', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/NA19679.Log.final.out', '', '80.53', '17.08', + '1.71', '' + ]) - self.assertEqual(len(experiment_lookup_file), num_airtable_rows) + self.assertEqual(len(experiment_lookup_file), num_airtable_rows + 1) self.assertEqual(experiment_lookup_file[0], ['experiment_id', 'table_name', 'id_in_table', 'participant_id']) + self.assertIn([ + 'experiment_rna_short_read.Broad_paired-end_NA19679', 'experiment_rna_short_read', + 'Broad_paired-end_NA19679', 'Broad_NA19679', + ], experiment_lookup_file) self.assertIn([ 'experiment_dna_short_read.Broad_exome_VCGS_FAM203_621_D2', 'experiment_dna_short_read', 'Broad_exome_VCGS_FAM203_621_D2', 'Broad_HG00731', ], experiment_lookup_file) self.assertIn([ - 'experiment_dna_short_read.Broad_exome_NA20888', 'experiment_dna_short_read', - 'Broad_exome_NA20888', 'Broad_NA20888', + 'experiment_dna_short_read.Broad_exome_NA20888', 'experiment_dna_short_read', 'Broad_exome_NA20888', + 'Broad_NA20888', ], experiment_lookup_file) self.assertEqual([ 'experiment_dna_short_read.Broad_genome_NA20888_1', 'experiment_dna_short_read', 'Broad_genome_NA20888_1', @@ -1009,7 +1070,7 @@ def _test_expected_gregor_airtable_calls(self, additional_samples=None): sample_filter = ','.join([f"{{CollaboratorSampleID}}='{sample_id}'" for sample_id in sorted(sample_ids)]) sample_fields = ['CollaboratorSampleID', 'SMID', 'CollaboratorParticipantID', 'Recontactable'] self._assert_expected_airtable_call(0, f"OR({sample_filter})", sample_fields) - sample_ids -= {'NA19675_1', 'NA20888'} + sample_ids -= {'NA19675_1', 'NA19679', 'NA20888'} secondary_sample_filter = ','.join([f"{{SeqrCollaboratorSampleID}}='{sample_id}'" for sample_id in sorted(sample_ids)]) sample_fields[0] = 'SeqrCollaboratorSampleID' self._assert_expected_airtable_call(1, f"OR({secondary_sample_filter})", sample_fields) @@ -1031,7 +1092,10 @@ def _test_expected_gregor_airtable_calls(self, additional_samples=None): 'target_insert_size_wgs', 'targeted_region_bed_file', 'targeted_regions_method_wes', 'total_reads_rna', 'variant_types', 'within_site_batch_name_rna', ] - self._assert_expected_airtable_call(2, "OR(CollaboratorParticipantID='NA19675',CollaboratorParticipantID='NA20888',CollaboratorParticipantID='VCGS_FAM203_621')", metadata_fields) + self._assert_expected_airtable_call( + 2, "OR(CollaboratorParticipantID='NA19675',CollaboratorParticipantID='NA19679',CollaboratorParticipantID='NA20888',CollaboratorParticipantID='VCGS_FAM203_621')", + metadata_fields, + ) self.assertEqual(responses.calls[3].request.url, MOCK_DATA_MODEL_URL) From 252b90a8ef8287dc55c59a63bdc86b7a8b1ba668 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 17 Aug 2023 15:20:50 -0400 Subject: [PATCH 19/19] pr feedback --- seqr/views/apis/report_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py index 3e1e67995c..cdc87d1cdb 100644 --- a/seqr/views/apis/report_api.py +++ b/seqr/views/apis/report_api.py @@ -710,8 +710,8 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False): ] READ_RNA_TABLE_COLUMNS = ['aligned_rna_short_read_id', 'experiment_rna_short_read_id'] + \ READ_RNA_TABLE_AIRTABLE_ID_FIELDS + READ_TABLE_COLUMNS[4:-3] + READ_RNA_TABLE_AIRTABLE_FIELDS + ['quality_issues'] -READ_RNA_TABLE_COLUMNS.insert(10, 'gene_annotation_details') -READ_RNA_TABLE_COLUMNS.insert(13, 'alignment_postprocessing') +READ_RNA_TABLE_COLUMNS.insert(READ_RNA_TABLE_COLUMNS.index('gene_annotation')+1, 'gene_annotation_details') +READ_RNA_TABLE_COLUMNS.insert(READ_RNA_TABLE_COLUMNS.index('alignment_log_file')+1, 'alignment_postprocessing') READ_SET_TABLE_COLUMNS = ['aligned_dna_short_read_set_id', 'aligned_dna_short_read_id'] CALLED_TABLE_COLUMNS = [ 'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum',