From 641edd57a72725aff3172932cbe4ab36609ce431 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 13 Mar 2024 17:04:06 -0400 Subject: [PATCH 01/19] stream to file no dedup --- seqr/views/apis/data_manager_api.py | 15 +++++++++------ seqr/views/utils/dataset_utils.py | 19 ++++++++++--------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index 3e1739435a..cd83b47e2d 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -272,10 +272,13 @@ def update_rna_seq(request): file_name_prefix = f'rna_sample_data__{data_type}__{datetime.now().isoformat()}' + sample_files = {} + def _save_sample_data(sample_guid, sample_data): - file_name = os.path.join(get_temp_upload_directory(), _get_sample_file_name(file_name_prefix, sample_guid)) - with gzip.open(file_name, 'wt') as f: - json.dump(sample_data, f) + if sample_guid not in sample_files: + file_name = os.path.join(get_temp_upload_directory(), _get_sample_file_name(file_name_prefix, sample_guid)) + sample_files[sample_guid] = gzip.open(file_name, 'wt') + sample_files[sample_guid].wirte(json.dumps(sample_data)) try: sample_guids, info, warnings = load_rna_seq( @@ -300,7 +303,7 @@ def _load_saved_sample_data(file_name_prefix, sample_guid): file_name = os.path.join(get_temp_upload_directory(), _get_sample_file_name(file_name_prefix, sample_guid)) if os.path.exists(file_name): with gzip.open(file_name, 'rt') as f: - return json.load(f) + return [json.loads(line) for line in f.readlines()] return None @@ -312,10 +315,10 @@ def load_rna_seq_sample_data(request, sample_guid): request_json = json.loads(request.body) file_name = request_json['fileName'] data_type = request_json['dataType'] - data_by_gene = _load_saved_sample_data(file_name, sample_guid) + data_rows = _load_saved_sample_data(file_name, sample_guid) model_cls = RNA_DATA_TYPE_CONFIGS[data_type]['model_class'] - model_cls.bulk_create(request.user, [model_cls(sample=sample, **data) for data in data_by_gene.values()]) + model_cls.bulk_create(request.user, [model_cls(sample=sample, **data) for data in data_rows]) update_model_from_json(sample, {'is_active': True}, user=request.user) return create_json_response({'success': True}) diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 8824860637..9a196a51ee 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -402,7 +402,8 @@ def _load_rna_seq_file( if existing_data and existing_data != row_dict: mismatches[sample_guid].add(gene_or_unique_id) - samples_by_guid[sample_guid][gene_or_unique_id] = row_dict + #samples_by_guid[sample_guid][gene_or_unique_id] = row_dict + save_sample_data(sample_guid, row_dict) errors, warnings = _process_rna_errors( gene_ids, missing_required_fields, unmatched_samples, ignore_extra_samples, loaded_samples, @@ -508,18 +509,18 @@ def save_sample_data(sample_guid, sample_data): update_sample_models() created_samples.update(samples_to_create.keys()) - prev_data = load_saved_data(sample_guid) or {} - new_mismatches = {k for k, v in prev_data.items() if k in sample_data and v != sample_data[k]} - if new_mismatches: - mismatches[sample_guid].update(new_mismatches) - sample_data.update(prev_data) + # prev_data = load_saved_data(sample_guid) or {} + # new_mismatches = {k for k, v in prev_data.items() if k in sample_data and v != sample_data[k]} + # if new_mismatches: + # mismatches[sample_guid].update(new_mismatches) + # 
sample_data.update(prev_data) - if post_process: - post_process(sample_data) + # if post_process: + # post_process(sample_data) sample_guids_to_load.add(sample_guid) save_data(sample_guid, sample_data) - return new_mismatches + #return new_mismatches def get_matched_sample(sample_key, unmatched_samples, sample_id_to_individual_id_mapping): if sample_key in potential_samples: From fa2eee179c3a211235421d67466f8c93919bcd22 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 14 Mar 2024 10:51:23 -0400 Subject: [PATCH 02/19] fix typo --- seqr/views/apis/data_manager_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index cd83b47e2d..aa41218645 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -278,7 +278,7 @@ def _save_sample_data(sample_guid, sample_data): if sample_guid not in sample_files: file_name = os.path.join(get_temp_upload_directory(), _get_sample_file_name(file_name_prefix, sample_guid)) sample_files[sample_guid] = gzip.open(file_name, 'wt') - sample_files[sample_guid].wirte(json.dumps(sample_data)) + sample_files[sample_guid].write(json.dumps(sample_data)) try: sample_guids, info, warnings = load_rna_seq( From 9c4056f65602e79febb07b6bad4c10145a984b01 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 14 Mar 2024 11:34:21 -0400 Subject: [PATCH 03/19] update parsing code --- seqr/views/apis/data_manager_api.py | 12 +++-- seqr/views/utils/dataset_utils.py | 72 +++++++++++------------------ 2 files changed, 35 insertions(+), 49 deletions(-) diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index aa41218645..b726d32eba 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -20,7 +20,8 @@ from seqr.utils.vcf_utils import validate_vcf_exists from seqr.views.utils.airflow_utils import trigger_data_loading, write_data_loading_pedigree -from seqr.views.utils.dataset_utils import load_rna_seq, load_phenotype_prioritization_data_file, RNA_DATA_TYPE_CONFIGS +from seqr.views.utils.dataset_utils import load_rna_seq, load_phenotype_prioritization_data_file, RNA_DATA_TYPE_CONFIGS, \ + post_process_rna_data from seqr.views.utils.file_utils import parse_file, get_temp_upload_directory, load_uploaded_file from seqr.views.utils.json_utils import create_json_response from seqr.views.utils.json_to_orm_utils import update_model_from_json @@ -277,12 +278,12 @@ def update_rna_seq(request): def _save_sample_data(sample_guid, sample_data): if sample_guid not in sample_files: file_name = os.path.join(get_temp_upload_directory(), _get_sample_file_name(file_name_prefix, sample_guid)) - sample_files[sample_guid] = gzip.open(file_name, 'wt') + sample_files[sample_guid] = gzip.open(file_name, 'a') sample_files[sample_guid].write(json.dumps(sample_data)) try: sample_guids, info, warnings = load_rna_seq( - data_type, file_path, _save_sample_data, lambda sample_guid: _load_saved_sample_data(file_name_prefix, sample_guid), + data_type, file_path, _save_sample_data, user=request.user, mapping_file=mapping_file, ignore_extra_samples=request_json.get('ignoreExtraSamples')) except ValueError as e: return create_json_response({'error': str(e)}, status=400) @@ -315,9 +316,12 @@ def load_rna_seq_sample_data(request, sample_guid): request_json = json.loads(request.body) file_name = request_json['fileName'] data_type = request_json['dataType'] + config = RNA_DATA_TYPE_CONFIGS[data_type] + data_rows = 
_load_saved_sample_data(file_name, sample_guid) + post_process_rna_data(sample_guid, data_rows, **config.get('post_process_kwargs', {})) - model_cls = RNA_DATA_TYPE_CONFIGS[data_type]['model_class'] + model_cls = config['model_class'] model_cls.bulk_create(request.user, [model_cls(sample=sample, **data) for data in data_rows]) update_model_from_json(sample, {'is_active': True}, user=request.user) diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 9a196a51ee..42cfc7b6d1 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -20,9 +20,6 @@ logger = SeqrLogger(__name__) -MAX_UNSAVED_DATA_PER_SAMPLE = 5000 - - def load_mapping_file(mapping_file_path, user): file_content = parse_file(mapping_file_path, file_iter(mapping_file_path, user=user)) return load_mapping_file_content(file_content) @@ -283,7 +280,7 @@ def _get_splice_id(row): def _add_splice_rank(sample_data_rows): - sorted_data_rows = sorted([data_row for data_row in sample_data_rows.values()], key=lambda d: d[P_VALUE_COL]) + sorted_data_rows = sorted([data_row for data_row in sample_data_rows], key=lambda d: d[P_VALUE_COL]) for i, data_row in enumerate(sorted_data_rows): data_row['rank'] = i @@ -305,8 +302,10 @@ def _add_splice_rank(sample_data_rows): 'additional_kwargs': { 'format_fields': SPLICE_OUTLIER_FORMATTER, 'allow_missing_gene': True, - 'get_unique_key': _get_splice_id, + }, + 'post_process_kwargs': { 'post_process': _add_splice_rank, + 'get_unique_key': _get_splice_id, }, }, } @@ -330,8 +329,8 @@ def _validate_rna_header(header, column_map): def _load_rna_seq_file( - file_path, user, potential_loaded_samples, update_sample_models, save_sample_data, get_matched_sample, mismatches, - column_map, mapping_file=None, get_unique_key=None, allow_missing_gene=False, ignore_extra_samples=False, + file_path, user, potential_loaded_samples, update_sample_models, save_sample_data, get_matched_sample, + column_map, mapping_file=None, allow_missing_gene=False, ignore_extra_samples=False, should_skip=None, format_fields=None, ): @@ -339,7 +338,6 @@ def _load_rna_seq_file( if mapping_file: sample_id_to_individual_id_mapping = load_mapping_file_content(mapping_file) - samples_by_guid = defaultdict(dict) f = file_iter(file_path, user=user) parsed_f = parse_file(file_path.replace('.gz', ''), f, iter_file=True) header = next(parsed_f) @@ -349,7 +347,6 @@ def _load_rna_seq_file( unmatched_samples = set() missing_required_fields = defaultdict(set) gene_ids = set() - current_sample = None for line in tqdm(parsed_f, unit=' rows'): row = dict(zip(header, line)) if should_skip and should_skip(row): @@ -390,35 +387,12 @@ def _load_rna_seq_file( # If there are definite errors, do not process/save data, just continue to check for additional errors continue - if current_sample != sample_guid: - # If a large amount of data has been parsed for the previous sample, save and do not keep in memory - if len(samples_by_guid[current_sample]) > MAX_UNSAVED_DATA_PER_SAMPLE: - save_sample_data(current_sample, samples_by_guid[current_sample]) - del samples_by_guid[current_sample] - current_sample = sample_guid - - gene_or_unique_id = get_unique_key(row_dict) if get_unique_key else gene_id - existing_data = samples_by_guid[sample_guid].get(gene_or_unique_id) - if existing_data and existing_data != row_dict: - mismatches[sample_guid].add(gene_or_unique_id) - - #samples_by_guid[sample_guid][gene_or_unique_id] = row_dict save_sample_data(sample_guid, row_dict) errors, warnings = _process_rna_errors( 
gene_ids, missing_required_fields, unmatched_samples, ignore_extra_samples, loaded_samples, ) - if not errors: - for sample_guid, sample_data in samples_by_guid.items(): - save_sample_data(sample_guid, sample_data) - - if mismatches: - errors = [ - f'Error in {sample_guid.split("_", 1)[-1].upper()}: mismatched entries for {", ".join(mismatch_ids)}' - for sample_guid, mismatch_ids in mismatches.items() - ] + errors - if errors: raise ErrorsWarningsException(errors) @@ -454,7 +428,7 @@ def _process_rna_errors(gene_ids, missing_required_fields, unmatched_samples, ig return errors, warnings -def _load_rna_seq(model_cls, file_path, save_data, load_saved_data, *args, user=None, create_models_before_save=False, post_process=None, **kwargs): +def _load_rna_seq(model_cls, file_path, save_data, *args, user=None, create_models_before_save=False, **kwargs): projects = get_internal_projects() data_source = file_path.split('/')[-1].split('_-_')[-1] @@ -474,7 +448,6 @@ def _load_rna_seq(model_cls, file_path, save_data, load_saved_data, *args, user= existing_samples_by_guid = {} samples_to_create = {} created_samples = set() - mismatches = defaultdict(set) def update_sample_models(): remaining_samples_to_create = [s for key, s in samples_to_create.items() if key not in created_samples] @@ -509,18 +482,8 @@ def save_sample_data(sample_guid, sample_data): update_sample_models() created_samples.update(samples_to_create.keys()) - # prev_data = load_saved_data(sample_guid) or {} - # new_mismatches = {k for k, v in prev_data.items() if k in sample_data and v != sample_data[k]} - # if new_mismatches: - # mismatches[sample_guid].update(new_mismatches) - # sample_data.update(prev_data) - - # if post_process: - # post_process(sample_data) - sample_guids_to_load.add(sample_guid) save_data(sample_guid, sample_data) - #return new_mismatches def get_matched_sample(sample_key, unmatched_samples, sample_id_to_individual_id_mapping): if sample_key in potential_samples: @@ -542,7 +505,7 @@ def get_matched_sample(sample_key, unmatched_samples, sample_id_to_individual_id warnings, not_loaded_count = _load_rna_seq_file( file_path, user, potential_loaded_samples, update_sample_models, save_sample_data, get_matched_sample, - mismatches, *args, **kwargs) + *args, **kwargs) message = f'Parsed {len(sample_guids_to_load) + not_loaded_count} RNA-seq samples' info = [message] logger.info(message, user) @@ -565,6 +528,25 @@ def get_matched_sample(sample_key, unmatched_samples, sample_id_to_individual_id return sample_guids_to_load, info, warnings +def post_process_rna_data(sample_guid, data, get_unique_key=None, post_process=None): + mismatches = set() + + data_by_key = {} + for row in data: + gene_or_unique_id = get_unique_key(row) if get_unique_key else row[GENE_ID_COL] + existing_data = data_by_key.get(gene_or_unique_id) + if existing_data and existing_data != row: + mismatches.add(gene_or_unique_id) + + if mismatches: + raise ErrorsWarningsException([ + f'Error in {sample_guid.split("_", 1)[-1].upper()}: mismatched entries for {", ".join(mismatches)}' + ]) + + if post_process: + post_process(data) + + RNA_MODEL_DISPLAY_NAME = { RnaSeqOutlier: 'Expression Outlier', RnaSeqSpliceOutlier: 'Splice Outlier', From ac512b2da28c31493407be7fcb62dedaade12d7c Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 14 Mar 2024 12:11:38 -0400 Subject: [PATCH 04/19] fix sample processing --- seqr/views/apis/data_manager_api.py | 8 +++++--- seqr/views/utils/dataset_utils.py | 12 ++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff 
--git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index b726d32eba..a9706a8e02 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -278,8 +278,8 @@ def update_rna_seq(request): def _save_sample_data(sample_guid, sample_data): if sample_guid not in sample_files: file_name = os.path.join(get_temp_upload_directory(), _get_sample_file_name(file_name_prefix, sample_guid)) - sample_files[sample_guid] = gzip.open(file_name, 'a') - sample_files[sample_guid].write(json.dumps(sample_data)) + sample_files[sample_guid] = gzip.open(file_name, 'at') + sample_files[sample_guid].write(f'{json.dumps(sample_data)}\n') try: sample_guids, info, warnings = load_rna_seq( @@ -319,7 +319,9 @@ def load_rna_seq_sample_data(request, sample_guid): config = RNA_DATA_TYPE_CONFIGS[data_type] data_rows = _load_saved_sample_data(file_name, sample_guid) - post_process_rna_data(sample_guid, data_rows, **config.get('post_process_kwargs', {})) + data_rows, error = post_process_rna_data(sample_guid, data_rows, **config.get('post_process_kwargs', {})) + if error: + return create_json_response({'error': error}, status=400) model_cls = config['model_class'] model_cls.bulk_create(request.user, [model_cls(sample=sample, **data) for data in data_rows]) diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 42cfc7b6d1..88d4a553d3 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -537,15 +537,15 @@ def post_process_rna_data(sample_guid, data, get_unique_key=None, post_process=N existing_data = data_by_key.get(gene_or_unique_id) if existing_data and existing_data != row: mismatches.add(gene_or_unique_id) + data_by_key[gene_or_unique_id] = row - if mismatches: - raise ErrorsWarningsException([ - f'Error in {sample_guid.split("_", 1)[-1].upper()}: mismatched entries for {", ".join(mismatches)}' - ]) - - if post_process: + error = f'Error in {sample_guid.split("_", 1)[-1].upper()}: mismatched entries for {", ".join(mismatches)}' if mismatches else None + data = data_by_key.values() + if post_process and not error: post_process(data) + return data, error + RNA_MODEL_DISPLAY_NAME = { RnaSeqOutlier: 'Expression Outlier', From 175bfbeebb07781ec3e880fa19f780749193bb97 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 14 Mar 2024 12:50:39 -0400 Subject: [PATCH 05/19] update tests --- seqr/views/apis/data_manager_api_tests.py | 104 +++++++++++----------- 1 file changed, 51 insertions(+), 53 deletions(-) diff --git a/seqr/views/apis/data_manager_api_tests.py b/seqr/views/apis/data_manager_api_tests.py index 71eb3ccf4d..59d0a7ed12 100644 --- a/seqr/views/apis/data_manager_api_tests.py +++ b/seqr/views/apis/data_manager_api_tests.py @@ -275,44 +275,42 @@ RNA_SPLICE_SAMPLE_GUID = 'S000151_na19675_1' PLACEHOLDER_GUID = 'S0000100' RNA_FILE_ID = 'gs://rna_data/new_muscle_samples.tsv.gz' -SAMPLE_GENE_OUTLIER_DATA = { - 'ENSG00000240361': {'gene_id': 'ENSG00000240361', 'p_value': '0.01', 'p_adjust': '0.13', 'z_score': '-3.1'}, - 'ENSG00000233750': {'gene_id': 'ENSG00000233750', 'p_value': '0.064', 'p_adjust': '0.0000057', 'z_score': '7.8'}, -} -SAMPLE_GENE_TPM_DATA = { - 'ENSG00000240361': {'gene_id': 'ENSG00000240361', 'tpm': '7.8'}, - 'ENSG00000233750': {'gene_id': 'ENSG00000233750', 'tpm': '0.0'}, -} -SAMPLE_GENE_SPLICE_DATA = { - 'ENSG00000233750-2-167254166-167258349-*-psi3': { +SAMPLE_GENE_OUTLIER_DATA = [ + {'gene_id': 'ENSG00000240361', 'p_value': '0.01', 'p_adjust': '0.13', 'z_score': '-3.1'}, + 
{'gene_id': 'ENSG00000233750', 'p_value': '0.064', 'p_adjust': '0.0000057', 'z_score': '7.8'}, +] +SAMPLE_GENE_TPM_DATA = [ + {'gene_id': 'ENSG00000240361', 'tpm': '7.8'}, + {'gene_id': 'ENSG00000233750', 'tpm': '0.0'}, +] +SAMPLE_GENE_SPLICE_DATA = [ + { 'chrom': '2', 'start': 167254166, 'end': 167258349, 'strand': '*', 'type': 'psi3', 'p_value': 1.56e-25, 'z_score': -4.9, 'delta_psi': -0.46, 'read_count': 166, 'gene_id': 'ENSG00000233750', - 'rare_disease_samples_with_junction': 1, 'rare_disease_samples_total': 20, 'rank': 1, + 'rare_disease_samples_with_junction': 1, 'rare_disease_samples_total': 20, }, - 'ENSG00000240361-7-132885746-132975168-*-psi5': { + { 'chrom': '7', 'start': 132885746, 'end': 132975168, 'strand': '*', 'type': 'psi5', 'p_value': 1.08e-56, 'z_score': -6.53, 'delta_psi': -0.85, 'read_count': 231, 'gene_id': 'ENSG00000240361', - 'rare_disease_samples_with_junction': 1, 'rare_disease_samples_total': 20, 'rank': 0, + 'rare_disease_samples_with_junction': 1, 'rare_disease_samples_total': 20, }, -} +] SAMPLE_GENE_SPLICE_DATA2 = { - '-2-167258096-167258349-*-psi3': { 'chrom': '2', 'start': 167258096, 'end': 167258349, 'strand': '*', 'type': 'psi3', 'p_value': 1.56e-25, 'z_score': 6.33, 'delta_psi': 0.45, 'read_count': 143, 'gene_id': '', - 'rare_disease_samples_with_junction': 1, 'rare_disease_samples_total': 20, 'rank': 0, + 'rare_disease_samples_with_junction': 1, 'rare_disease_samples_total': 20, } -} RNA_OUTLIER_SAMPLE_DATA = { - RNA_MUSCLE_SAMPLE_GUID: json.dumps(SAMPLE_GENE_OUTLIER_DATA), - PLACEHOLDER_GUID: json.dumps({'ENSG00000240361': {'gene_id': 'ENSG00000240361', 'p_value': '0.04', 'p_adjust': '0.112', 'z_score': '1.9'}}), + RNA_MUSCLE_SAMPLE_GUID: '\n'.join([json.dumps(row) for row in SAMPLE_GENE_OUTLIER_DATA]) + '\n', + PLACEHOLDER_GUID: json.dumps({'gene_id': 'ENSG00000240361', 'p_value': '0.04', 'p_adjust': '0.112', 'z_score': '1.9'}) + '\n', } RNA_TPM_SAMPLE_DATA = { - RNA_MUSCLE_SAMPLE_GUID: json.dumps(SAMPLE_GENE_TPM_DATA), - PLACEHOLDER_GUID: json.dumps({'ENSG00000240361': {'gene_id': 'ENSG00000240361', 'tpm': '0.112'}}), + RNA_MUSCLE_SAMPLE_GUID: '\n'.join([json.dumps(row) for row in SAMPLE_GENE_TPM_DATA]) + '\n', + PLACEHOLDER_GUID: json.dumps({'gene_id': 'ENSG00000240361', 'tpm': '0.112'}) + '\n', } RNA_SPLICE_SAMPLE_DATA = { - RNA_SPLICE_SAMPLE_GUID: json.dumps(SAMPLE_GENE_SPLICE_DATA), - PLACEHOLDER_GUID: json.dumps(SAMPLE_GENE_SPLICE_DATA2), + RNA_SPLICE_SAMPLE_GUID: '\n'.join([json.dumps(row) for row in SAMPLE_GENE_SPLICE_DATA]) + '\n', + PLACEHOLDER_GUID: json.dumps(SAMPLE_GENE_SPLICE_DATA2) + '\n', } RNA_FILENAME_TEMPLATE = 'rna_sample_data__{}__2020-04-15T00:00:00' @@ -670,8 +668,8 @@ def test_kibana_proxy(self): ['NA20870', '1kg project nåme with uniçøde', 'ENSG00000240361', 'fibroblasts', 'detail2', 0.01, 0.13, -3.1], ], 'write_data': { - '{"ENSG00000233750": {"gene_id": "ENSG00000233750", "p_value": "0.064", "p_adjust": "0.0000057", "z_score": "7.8"}}', - '{"ENSG00000240361": {"gene_id": "ENSG00000240361", "p_value": "0.01", "p_adjust": "0.13", "z_score": "-3.1"}}' + '{"gene_id": "ENSG00000233750", "p_value": "0.064", "p_adjust": "0.0000057", "z_score": "7.8"}\n', + '{"gene_id": "ENSG00000240361", "p_value": "0.01", "p_adjust": "0.13", "z_score": "-3.1"}\n' }, 'new_data': [ ['NA19675_D2', '1kg project nåme with uniçøde', 'ENSG00000240361', 'muscle', 'detail1', 0.01, 0.13, -3.1], @@ -702,8 +700,8 @@ def test_kibana_proxy(self): ['NA20870', 'Test Reprocessed Project', 'ENSG00000240361', 'NA20870', 'muscle', 7.8], ['NA20870', '1kg project nåme 
with uniçøde', 'ENSG00000233750', 'NA20870', 'fibroblasts', 0.0], ], - 'write_data': {'{"ENSG00000240361": {"gene_id": "ENSG00000240361", "tpm": "7.8"}}', - '{"ENSG00000233750": {"gene_id": "ENSG00000233750", "tpm": "0.0"}}'}, + 'write_data': {'{"gene_id": "ENSG00000240361", "tpm": "7.8"}\n', + '{"gene_id": "ENSG00000233750", "tpm": "0.0"}\n'}, 'new_data': [ # existing sample NA19675_D2 ['NA19675_D2', '1kg project nåme with uniçøde', 'ENSG00000240361', 'NA19675_D2', 'muscle', 7.8], @@ -726,6 +724,7 @@ def test_kibana_proxy(self): 'get_models_json': lambda models: list(models.values_list('gene_id', 'tpm')), 'expected_models_json': [('ENSG00000240361', 7.8), ('ENSG00000233750', 0.0)], 'sample_guid': RNA_MUSCLE_SAMPLE_GUID, + 'mismatch_field': 'tpm', }, 'splice_outlier': { 'model_cls': RnaSeqSpliceOutlier, @@ -744,14 +743,14 @@ def test_kibana_proxy(self): ['NA20870', '1kg project nåme with uniçøde', 'ENSG00000135953', 'chr2', 167258096, 167258349, '*', 'XIRP2', 'psi3', 1.56E-25, 6.33, 0.45, 143, 'muscle', 0.03454739, 1, 20], ], - 'write_data': {'{"ENSG00000233750-2-167258096-167258349-*-psi3": {"chrom": "2", "start": 167258096,' + 'write_data': {'{"chrom": "2", "start": 167258096,' ' "end": 167258349, "strand": "*", "type": "psi3", "p_value": 1.56e-25, "z_score": 6.33,' ' "delta_psi": 0.45, "read_count": 143, "gene_id": "ENSG00000233750",' - ' "rare_disease_samples_with_junction": 1, "rare_disease_samples_total": 20, "rank": 0}}', - '{"ENSG00000135953-2-167258096-167258349-*-psi3": {"chrom": "2", "start": 167258096,' + ' "rare_disease_samples_with_junction": 1, "rare_disease_samples_total": 20}\n', + '{"chrom": "2", "start": 167258096,' ' "end": 167258349, "strand": "*", "type": "psi3", "p_value": 1.56e-25, "z_score": 6.33,' ' "delta_psi": 0.45, "read_count": 143, "gene_id": "ENSG00000135953",' - ' "rare_disease_samples_with_junction": 1, "rare_disease_samples_total": 20, "rank": 0}}', + ' "rare_disease_samples_with_junction": 1, "rare_disease_samples_total": 20}\n', }, 'new_data': [ # existing sample NA19675_1 @@ -778,13 +777,13 @@ def test_kibana_proxy(self): 'allow_missing_gene': True, 'get_models_json': lambda models: list( models.values_list('gene_id', 'chrom', 'start', 'end', 'strand', 'type', 'p_value', 'z_score', 'delta_psi', - 'read_count', 'rare_disease_samples_with_junction', 'rare_disease_samples_total')), + 'read_count', 'rare_disease_samples_with_junction', 'rare_disease_samples_total', 'rank')), 'expected_models_json': [ - ('ENSG00000233750', '2', 167254166, 167258349, '*', 'psi3', 1.56e-25, -4.9, -0.46, 166, 1, 20), - ('ENSG00000240361', '7', 132885746, 132975168, '*', 'psi5', 1.08e-56, -6.53, -0.85, 231, 1, 20) + ('ENSG00000233750', '2', 167254166, 167258349, '*', 'psi3', 1.56e-25, -4.9, -0.46, 166, 1, 20, 1), + ('ENSG00000240361', '7', 132885746, 132975168, '*', 'psi5', 1.08e-56, -6.53, -0.85, 231, 1, 20, 0) ], 'sample_guid': RNA_SPLICE_SAMPLE_GUID, - 'row_id': 'ENSG00000240361-7-132885746-132886973-*-psi5', + 'row_id': 'ENSG00000233750-2-167254166-167258349-*-psi3', }, } @@ -876,15 +875,6 @@ def _set_file_iter_stdout(rows): f'{", ".join(sorted([col for col in header if col not in params["optional_headers"]]))}', }) - mismatch_row = loaded_data_row[:-1] + [loaded_data_row[-1] - 2] - _set_file_iter_stdout([header, loaded_data_row, loaded_data_row, mismatch_row]) - response = self.client.post(url, content_type='application/json', data=json.dumps(body)) - self.assertEqual(response.status_code, 400) - self.assertDictEqual(response.json(), { - 'errors': [f'Error in 
{loaded_data_row[0]}: mismatched entries for {params.get("row_id", mismatch_row[2])}'], - 'warnings': None, - }) - missing_sample_row = ['NA19675_D3'] + loaded_data_row[1:] _set_file_iter_stdout([header, loaded_data_row, missing_sample_row]) response = self.client.post(url, content_type='application/json', data=json.dumps(body)) @@ -1007,9 +997,9 @@ def _test_basic_data_loading(data, num_parsed_samples, num_loaded_samples, new_s f'{RNA_FILENAME_TEMPLATE.format(data_type)}__{new_sample_guid if sample_guid == PLACEHOLDER_GUID else sample_guid}.json.gz': data for sample_guid, data in params['parsed_file_data'].items() } - mock_open.assert_has_calls([mock.call(filename, 'wt') for filename in expected_files]) + mock_open.assert_has_calls([mock.call(filename, 'at') for filename in expected_files]) self.assertEqual( - ''.join([call.args[0] for call in mock_files[filename].__enter__.return_value.write.call_args_list]), + ''.join([call.args[0] for call in mock_files[filename].write.call_args_list]), expected_files[filename], ) @@ -1025,7 +1015,7 @@ def _test_basic_data_loading(data, num_parsed_samples, num_loaded_samples, new_s num_created_samples=2) self.assertSetEqual( - {''.join([call.args[0] for call in mock_file.__enter__.return_value.write.call_args_list]) for mock_file in mock_files.values()}, + {''.join([call.args[0] for call in mock_file.write.call_args_list]) for mock_file in mock_files.values()}, params['write_data'], ) @@ -1042,11 +1032,11 @@ def _test_basic_data_loading(data, num_parsed_samples, num_loaded_samples, new_s self.assertTrue(second_tissue_sample_guid != new_sample_guid) self.assertTrue(second_tissue_sample_guid in response_json['sampleGuids']) mock_open.assert_has_calls([ - mock.call(f'{RNA_FILENAME_TEMPLATE.format(data_type)}__{sample_guid}.json.gz', 'wt') + mock.call(f'{RNA_FILENAME_TEMPLATE.format(data_type)}__{sample_guid}.json.gz', 'at') for sample_guid in response_json['sampleGuids'] ]) self.assertSetEqual( - {''.join([call.args[0] for call in mock_file.__enter__.return_value.write.call_args_list]) for mock_file in mock_files.values()}, + {''.join([call.args[0] for call in mock_file.write.call_args_list]) for mock_file in mock_files.values()}, params['write_data'], ) @@ -1066,12 +1056,12 @@ def test_load_rna_seq_sample_data(self, mock_open, mock_os): model_cls = params['model_cls'] model_cls.objects.all().delete() self.reset_logs() - mock_open.return_value.__enter__.return_value.read.return_value = params['parsed_file_data'][sample_guid] + parsed_file_lines = params['parsed_file_data'][sample_guid].strip().split('\n') + mock_open.return_value.__enter__.return_value.readlines.return_value = parsed_file_lines file_name = RNA_FILENAME_TEMPLATE.format(data_type) - response = self.client.post(url, content_type='application/json', data=json.dumps({ - 'fileName': file_name, 'dataType': data_type, - })) + body = {'fileName': file_name, 'dataType': data_type} + response = self.client.post(url, content_type='application/json', data=json.dumps(body)) self.assertEqual(response.status_code, 200) self.assertDictEqual(response.json(), {'success': True}) @@ -1092,6 +1082,14 @@ def test_load_rna_seq_sample_data(self, mock_open, mock_os): self.assertListEqual(list(params['get_models_json'](models)), params['expected_models_json']) + mismatch_row = {**json.loads(parsed_file_lines[0]), params.get('mismatch_field', 'p_value'): '0.05'} + mock_open.return_value.__enter__.return_value.readlines.return_value = parsed_file_lines + [json.dumps(mismatch_row)] + response = self.client.post(url, 
content_type='application/json', data=json.dumps(body)) + self.assertEqual(response.status_code, 400) + self.assertDictEqual(response.json(), { + 'error': f'Error in {sample_guid.split("_", 1)[-1].upper()}: mismatched entries for {params.get("row_id", mismatch_row["gene_id"])}' + }) + @classmethod def _join_data(cls, data): return ['\t'.join(line).encode('utf-8') for line in data] From 71d732d2047c2a51bde2f901d0aec95c74fa5a22 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 14 Mar 2024 15:25:45 -0400 Subject: [PATCH 06/19] fix loading command --- seqr/management/commands/load_rna_seq.py | 40 +++++++++++++++++------ seqr/views/apis/data_manager_api_tests.py | 4 +-- seqr/views/utils/dataset_utils.py | 14 +++----- 3 files changed, 36 insertions(+), 22 deletions(-) diff --git a/seqr/management/commands/load_rna_seq.py b/seqr/management/commands/load_rna_seq.py index 8aff327956..fa00715bb3 100644 --- a/seqr/management/commands/load_rna_seq.py +++ b/seqr/management/commands/load_rna_seq.py @@ -1,9 +1,10 @@ import logging +from collections import defaultdict from django.core.management.base import BaseCommand from seqr.models import Sample from seqr.views.utils.file_utils import parse_file -from seqr.views.utils.dataset_utils import load_rna_seq, RNA_DATA_TYPE_CONFIGS +from seqr.views.utils.dataset_utils import load_rna_seq, post_process_rna_data, RNA_DATA_TYPE_CONFIGS logger = logging.getLogger(__name__) @@ -24,18 +25,37 @@ def handle(self, *args, **options): mapping_file = parse_file(options['mapping_file'], f) data_type = options['data_type'] - self.model_cls = RNA_DATA_TYPE_CONFIGS[data_type]['model_class'] + model_cls = RNA_DATA_TYPE_CONFIGS[data_type]['model_class'] - sample_guids, _, _ = load_rna_seq( - data_type, options['input_file'], self._save_sample_data, lambda *args: {}, create_models_before_save=True, + sample_data_by_guid = defaultdict(list) + + def _save_sample_data(sample_guid, row): + sample_data_by_guid[sample_guid].append(row) + + possible_sample_guids, _, _ = load_rna_seq( + data_type, options['input_file'], _save_sample_data, mapping_file=mapping_file, ignore_extra_samples=options['ignore_extra_samples']) + sample_models_by_guid = { + s['guid']: s for s in Sample.objects.filter(guid__in=sample_data_by_guid).values('guid', 'id', 'sample_id') + } + errors = [] + sample_guids = [] + for sample_guid in possible_sample_guids: + data_rows, error = post_process_rna_data(sample_guid, sample_data_by_guid[sample_guid]) + if error: + errors.append(error) + continue + + sample_guids.append(sample_guid) + sample_model = sample_models_by_guid[sample_guid] + models = model_cls.objects.bulk_create( + [model_cls(sample_id=sample_model['id'], **data) for data in data_rows], batch_size=1000) + logger.info(f'create {len(models)} {model_cls.__name__} for {sample_model["sample_id"]}') + Sample.bulk_update(user=None, update_json={'is_active': True}, guid__in=sample_guids) - logger.info('DONE') + for error in errors: + logger.info(error) - def _save_sample_data(self, sample_guid, data_by_gene): - sample = Sample.objects.get(guid=sample_guid) - models = self.model_cls.objects.bulk_create( - [self.model_cls(sample=sample, **data) for data in data_by_gene.values()], batch_size=1000) - logger.info(f'create {len(models)} {self.model_cls.__name__} for {sample.sample_id}') + logger.info('DONE') diff --git a/seqr/views/apis/data_manager_api_tests.py b/seqr/views/apis/data_manager_api_tests.py index 59d0a7ed12..cbaf6fe977 100644 --- a/seqr/views/apis/data_manager_api_tests.py +++ 
b/seqr/views/apis/data_manager_api_tests.py @@ -969,7 +969,7 @@ def _test_basic_data_loading(data, num_parsed_samples, num_loaded_samples, new_s 'parentEntityIds': [params['sample_guid']], 'updateType': 'bulk_delete'}}), ('update 1 Samples', {'dbUpdate': { 'dbEntity': 'Sample', 'entityIds': [params['sample_guid']], - 'updateType': 'bulk_update', 'updateFields': ['data_source']}}), + 'updateType': 'bulk_update', 'updateFields': ['data_source', 'is_active']}}), ]) self.assertTrue(params['sample_guid'] in response_json['sampleGuids']) self.assertEqual(mock_send_slack.call_count, 2) @@ -987,7 +987,7 @@ def _test_basic_data_loading(data, num_parsed_samples, num_loaded_samples, new_s # test database models are correct self.assertEqual(model_cls.objects.count(), params['initial_model_count'] - deleted_count) sample_guid = self._check_rna_sample_model(individual_id=1, data_source='new_muscle_samples.tsv.gz', - tissue_type=params.get('sample_tissue_type')) + tissue_type=params.get('sample_tissue_type'), is_active_sample=False) self.assertSetEqual(set(response_json['sampleGuids']), {sample_guid, new_sample_guid}) # test correct file interactions diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 88d4a553d3..fdf1f4746b 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -428,7 +428,7 @@ def _process_rna_errors(gene_ids, missing_required_fields, unmatched_samples, ig return errors, warnings -def _load_rna_seq(model_cls, file_path, save_data, *args, user=None, create_models_before_save=False, **kwargs): +def _load_rna_seq(model_cls, file_path, save_data, *args, user=None, **kwargs): projects = get_internal_projects() data_source = file_path.split('/')[-1].split('_-_')[-1] @@ -447,13 +447,11 @@ def _load_rna_seq(model_cls, file_path, save_data, *args, user=None, create_mode sample_guids_to_load = set() existing_samples_by_guid = {} samples_to_create = {} - created_samples = set() def update_sample_models(): - remaining_samples_to_create = [s for key, s in samples_to_create.items() if key not in created_samples] - if remaining_samples_to_create: + if samples_to_create: _create_samples( - remaining_samples_to_create, + samples_to_create.values(), user=user, data_source=data_source, sample_type=Sample.SAMPLE_TYPE_RNA, @@ -470,7 +468,7 @@ def update_sample_models(): if to_delete: model_cls.bulk_delete(user, to_delete) - Sample.bulk_update(user, {'data_source': data_source}, guid__in=existing_samples_by_guid) + Sample.bulk_update(user, {'data_source': data_source, 'is_active': False}, guid__in=existing_samples_by_guid) for guid in to_delete_sample_individuals: existing_samples_by_guid[guid]['dataSource'] = data_source @@ -478,10 +476,6 @@ def save_sample_data(sample_guid, sample_data): if not sample_data: return - if create_models_before_save: - update_sample_models() - created_samples.update(samples_to_create.keys()) - sample_guids_to_load.add(sample_guid) save_data(sample_guid, sample_data) From 796c71b0c8484789eaa753910113f702cd7e19cc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 16 Mar 2024 23:13:43 +0000 Subject: [PATCH 07/19] Bump follow-redirects from 1.15.4 to 1.15.6 in /ui Bumps [follow-redirects](https://github.com/follow-redirects/follow-redirects) from 1.15.4 to 1.15.6. 
- [Release notes](https://github.com/follow-redirects/follow-redirects/releases) - [Commits](https://github.com/follow-redirects/follow-redirects/compare/v1.15.4...v1.15.6) --- updated-dependencies: - dependency-name: follow-redirects dependency-type: indirect ... Signed-off-by: dependabot[bot] --- ui/package-lock.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ui/package-lock.json b/ui/package-lock.json index 080a18222d..ab391c8066 100644 --- a/ui/package-lock.json +++ b/ui/package-lock.json @@ -8485,9 +8485,9 @@ "dev": true }, "node_modules/follow-redirects": { - "version": "1.15.4", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", - "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true, "funding": [ { @@ -25853,9 +25853,9 @@ "dev": true }, "follow-redirects": { - "version": "1.15.4", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", - "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true }, "foreach": { From b83121a3dee1e89587761d942d530d8f4a2fb317 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 18 Mar 2024 12:30:20 -0400 Subject: [PATCH 08/19] properly parse clinvar significance --- ui/pages/Report/components/VariantMetadata.jsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ui/pages/Report/components/VariantMetadata.jsx b/ui/pages/Report/components/VariantMetadata.jsx index 6f03815d0e..2f7799961b 100644 --- a/ui/pages/Report/components/VariantMetadata.jsx +++ b/ui/pages/Report/components/VariantMetadata.jsx @@ -1,7 +1,7 @@ import React from 'react' import LoadReportTable from 'shared/components/table/LoadReportTable' -import { VARIANT_METADATA_COLUMNS } from 'shared/utils/constants' +import { clinvarSignificance, VARIANT_METADATA_COLUMNS } from 'shared/utils/constants' const VIEW_ALL_PAGES = [ { name: 'GREGoR', downloadName: 'GREGoR', path: 'gregor' }, @@ -13,7 +13,7 @@ const COLUMNS = [ ...VARIANT_METADATA_COLUMNS.slice(0, -1), { name: 'allele_balance_or_heteroplasmy_percentage' }, { name: 'ClinGen allele ID', format: ({ clinvar }) => clinvar?.alleleId }, - { name: 'ClinVar Clinical Significance', format: ({ clinvar }) => clinvar?.clinicalSignificance }, + { name: 'ClinVar Clinical Significance', format: ({ clinvar }) => clinvarSignificance(clinvar).pathogenicity }, { name: 'ClinVar gold star', format: ({ clinvar }) => clinvar?.goldStars }, { name: 'known_condition_name' }, { name: 'condition_id' }, From e14de3e3383b68b3707cdef69fcaafd46e50cfe0 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 18 Mar 2024 13:24:25 -0400 Subject: [PATCH 09/19] add better filtration for reloading SNV/INDEL variants --- seqr/views/utils/variant_utils.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index 3daba835ca..4af1fb3450 100644 --- a/seqr/views/utils/variant_utils.py 
+++ b/seqr/views/utils/variant_utils.py @@ -102,10 +102,13 @@ def update_project_saved_variant_json(project_id, family_guids=None, dataset_typ def saved_variants_dataset_type_filter(dataset_type): xpos_filter_key = 'xpos__gte' if dataset_type == Sample.DATASET_TYPE_MITO_CALLS else 'xpos__lt' - return { - 'alt__isnull': dataset_type == Sample.DATASET_TYPE_SV_CALLS, - xpos_filter_key: get_xpos('M', 1), - } + dataset_filter = {xpos_filter_key: get_xpos('M', 1)} + if dataset_type == Sample.DATASET_TYPE_SV_CALLS: + dataset_filter['alt__isnull'] = True + else: + # Filter out manual variants with invalid characters, such as those used for STRs + dataset_filter['alt__regex'] = '^[ACGT]$' + return dataset_filter def parse_saved_variant_json(variant_json, family): From a4c09a4ccb54d209eba1bb39a3b6719fbd36a145 Mon Sep 17 00:00:00 2001 From: Julia Klugherz Date: Mon, 18 Mar 2024 13:25:56 -0400 Subject: [PATCH 10/19] add clinvar pathogenicity to ui constants --- ui/shared/utils/constants.js | 1 + 1 file changed, 1 insertion(+) diff --git a/ui/shared/utils/constants.js b/ui/shared/utils/constants.js index c8d0671dc7..a7c0a96132 100644 --- a/ui/shared/utils/constants.js +++ b/ui/shared/utils/constants.js @@ -513,6 +513,7 @@ const CLINVAR_MIN_RISK_PATHOGENICITY = 'likely_risk_allele' const CLINVAR_PATHOGENICITIES = [ 'pathogenic', 'pathogenic/likely_pathogenic', + 'pathogenic/likely_pathogenic/established_risk_allele', 'pathogenic/likely_pathogenic/likely_risk_allele', 'pathogenic/likely_risk_allele', 'likely_pathogenic', From 4d8ca4d372b958f8496f526cf0065d88dd7694af Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 18 Mar 2024 16:43:51 -0400 Subject: [PATCH 11/19] add conditional column validation --- seqr/views/apis/report_api.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py index fceb6bfcdf..840300aa7b 100644 --- a/seqr/views/apis/report_api.py +++ b/seqr/views/apis/report_api.py @@ -695,6 +695,24 @@ def _has_required_table(table, validator, tables): return tables.isdisjoint(validator) +def _is_required_col(required_validator, row): + if not required_validator: + return False + + if required_validator is True: + return True + + match = re.match(r'CONDITIONAL \(([\w+(\s)?]+) = ([\w+(\s)?]+)\)', required_validator) + if not match: + return True + + field, value = match.groups() + return row[field] == value + + + + + def _validate_column_data(column, file_name, data, column_validator, warnings, errors): data_type = column_validator.get('data_type') data_type_validator = DATA_TYPE_VALIDATORS.get(data_type) @@ -712,7 +730,7 @@ def _validate_column_data(column, file_name, data, column_validator, warnings, e for row in data: value = row.get(column) if not value: - if required: + if _is_required_col(required, row): missing.append(_get_row_id(row)) elif recommended: check_recommend_condition = WARN_MISSING_CONDITIONAL_COLUMNS.get(column) From 4879c39262cdc2b96b310d360150780f179bc255 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 18 Mar 2024 17:16:31 -0400 Subject: [PATCH 12/19] update tests --- seqr/views/apis/report_api_tests.py | 6 +++--- seqr/views/utils/anvil_metadata_utils.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index 647ff3d730..f37031f63b 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -447,7 +447,7 @@ 'participant': { 'internal_project_id': 
{'data_type': 'reference'}, 'prior_testing': {'data_type': 'enumeration'}, - 'proband_relationship': {'required': True}, + 'proband_relationship': {'required': 'CONDITIONAL (sex = Male)'}, 'reported_race': {'enumerations': ['Asian', 'White', 'Black']}, 'age_at_enrollment': {'data_type': 'date'} }, @@ -717,7 +717,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat ] + [ 'The following tables are required in the data model but absent from the reports: subject, dna_read_data_set', ] + [ - 'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', + 'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00732, Broad_NA19678, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881', 'The following entries have invalid values for "reported_race" in the "participant" table. Allowed values: Asian, White, Black. Invalid values: Broad_NA19675_1 (Middle Eastern or North African)', 'The following entries have invalid values for "age_at_enrollment" in the "participant" table. Allowed values have data type date. Invalid values: Broad_NA19675_1 (18)', 'The following entries have invalid values for "reference_assembly" (from Airtable) in the "aligned_dna_short_read" table. Allowed values have data type integer. Invalid values: NA20888 (GRCh38), VCGS_FAM203_621_D2 (GRCh38)', @@ -1126,7 +1126,7 @@ def test_family_metadata(self): 'consanguinity': 'Unknown', 'condition_id': 'OMIM:615123', 'known_condition_name': '', - 'condition_inheritance': '', + 'condition_inheritance': 'Unknown', }) # Test empty project diff --git a/seqr/views/utils/anvil_metadata_utils.py b/seqr/views/utils/anvil_metadata_utils.py index 800887229e..4ba6094f0e 100644 --- a/seqr/views/utils/anvil_metadata_utils.py +++ b/seqr/views/utils/anvil_metadata_utils.py @@ -565,5 +565,5 @@ def _format_omim_conditions(conditions): 'known_condition_name': '|'.join(sorted({o['phenotype_description'] for o in conditions if o.get('phenotype_description')})), 'condition_inheritance': '|'.join(sorted({ MIM_INHERITANCE_MAP.get(i, i) for o in conditions if o.get('phenotype_inheritance') for i in o['phenotype_inheritance'].split(', ') - })) + })) or 'Unknown', } From ed11ef2d82b0d47dbbd263036d300633c4394322 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 18 Mar 2024 21:51:08 +0000 Subject: [PATCH 13/19] Bump django from 3.2.24 to 3.2.25 Bumps [django](https://github.com/django/django) from 3.2.24 to 3.2.25. - [Commits](https://github.com/django/django/compare/3.2.24...3.2.25) --- updated-dependencies: - dependency-name: django dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- requirements-dev.txt | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index e22f64d378..20a944635d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -22,7 +22,7 @@ click==8.1.3 # via pip-tools coverage==5.1 # via -r requirements-dev.in -django==3.2.24 +django==3.2.25 # via # -c requirements.txt # django-appconf diff --git a/requirements.txt b/requirements.txt index 774e584491..453c702f08 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,7 +26,7 @@ defusedxml==0.7.1 # via # python3-openid # social-auth-core -django==3.2.24 +django==3.2.25 # via # -r requirements.in # django-anymail From eb3a67b3884b1ce3da5d186df638cb03767bf3d0 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 19 Mar 2024 15:57:55 -0400 Subject: [PATCH 14/19] use tags to determine gene_known_for_phenotype --- seqr/views/apis/report_api.py | 1 - seqr/views/utils/anvil_metadata_utils.py | 17 +++++------------ 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py index 840300aa7b..682eba561d 100644 --- a/seqr/views/apis/report_api.py +++ b/seqr/views/apis/report_api.py @@ -893,7 +893,6 @@ def _add_row(row, family_id, row_type): individual_data_types={i.individual_id: i.data_types for i in individuals}, add_row=_add_row, variant_json_fields=['clinvar', 'variantId'], - saved_variant_annotations={'tags': ArrayAgg('varianttag__variant_tag_type__name', distinct=True)}, mme_values={'variant_ids': ArrayAgg('matchmakersubmissiongenes__saved_variant__saved_variant_json__variantId')}, include_metadata=True, include_mondo=True, diff --git a/seqr/views/utils/anvil_metadata_utils.py b/seqr/views/utils/anvil_metadata_utils.py index 4ba6094f0e..e7dc41a648 100644 --- a/seqr/views/utils/anvil_metadata_utils.py +++ b/seqr/views/utils/anvil_metadata_utils.py @@ -126,7 +126,7 @@ def parse_anvil_metadata( variant_json_fields: Iterable[str] = None, post_process_variant: Callable[[dict, list[dict]], dict] = None, include_no_individual_families: bool = False, omit_airtable: bool = False, include_metadata: bool = False, include_discovery_sample_id: bool = False, include_mondo: bool = False, include_parent_mnvs: bool = False, - proband_only_variants: bool = False, saved_variant_annotations: dict = None): + proband_only_variants: bool = False): individual_samples = individual_samples or (_get_loaded_before_date_project_individual_samples(projects, max_loaded_date) \ if max_loaded_date else _get_all_project_individual_samples(projects)) @@ -147,7 +147,6 @@ def parse_anvil_metadata( saved_variants_by_family = _get_parsed_saved_discovery_variants_by_family( list(family_data_by_id.keys()), variant_filter=variant_filter, variant_json_fields=variant_json_fields, - saved_variant_annotations=saved_variant_annotations, ) condition_map = _get_condition_map(family_data_by_id.values()) @@ -285,21 +284,14 @@ def _post_process_variant_metadata(v, gene_variants, include_parent_mnvs=False): def _get_parsed_saved_discovery_variants_by_family( - families: Iterable[Family], variant_filter: dict, variant_json_fields: list[str], saved_variant_annotations: dict, + families: Iterable[Family], variant_filter: dict, variant_json_fields: list[str], ): tag_types = VariantTagType.objects.filter(project__isnull=True, category=DISCOVERY_CATEGORY) - annotations = { - 'gene_known_for_phenotype': Case(When( - Q(family__post_discovery_omim_numbers__len=0, 
family__mondo_id__isnull=True), - then=Value('Candidate')), default=Value('Known') - ), - **(saved_variant_annotations or {}), - } project_saved_variants = SavedVariant.objects.filter( varianttag__variant_tag_type__in=tag_types, family__id__in=families, **(variant_filter or {}), - ).order_by('created_date').distinct().annotate(**annotations) + ).order_by('created_date').distinct().annotate(tags=ArrayAgg('varianttag__variant_tag_type__name', distinct=True)) variants = [] gene_ids = set() @@ -321,8 +313,9 @@ def _get_parsed_saved_discovery_variants_by_family( 'hgvsc': (main_transcript.get('hgvsc') or '').split(':')[-1], 'hgvsp': (main_transcript.get('hgvsp') or '').split(':')[-1], 'seqr_chosen_consequence': main_transcript.get('majorConsequence'), + 'gene_known_for_phenotype': 'Known' if 'Known gene for phenotype' in variant.tags else 'Candidate', **{k: variant_json.get(k) for k in ['genotypes', 'svType', 'svName', 'end'] + (variant_json_fields or [])}, - **{k: getattr(variant, k) for k in ['family_id', 'ref', 'alt', *annotations.keys()]}, + **{k: getattr(variant, k) for k in ['family_id', 'ref', 'alt', 'tags']}, }) genes_by_id = get_genes(gene_ids) From c8c8b4df376e3882b7d4e60eaf0d0beec9bb11b2 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 19 Mar 2024 16:09:51 -0400 Subject: [PATCH 15/19] update tests --- seqr/views/apis/report_api_tests.py | 12 ++++++------ seqr/views/apis/summary_data_api_tests.py | 7 +++++-- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index f37031f63b..55ba602a9a 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -616,7 +616,7 @@ def test_anvil_export(self, mock_google_authenticated, mock_zip): '1_248367227_HG00731', 'HG00731', 'HG00731', 'RP11', 'Known', 'paternal', 'Homozygous', 'GRCh37', '1', '248367227', 'TC', 'T', '-', '-', '-', '-', '-', '-', '-'], discovery_file) self.assertIn([ - '21_3343353_NA19675_1', 'NA19675_1', 'NA19675', 'RP11', 'Known', 'de novo', + '21_3343353_NA19675_1', 'NA19675_1', 'NA19675', 'RP11', 'Candidate', 'de novo', 'Heterozygous', 'GRCh37', '21', '3343353', 'GAGA', 'G', 'c.375_377delTCT', 'p.Leu126del', 'ENST00000258436', '-', '-', '-', '-'], discovery_file) self.assertIn([ @@ -993,7 +993,7 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False): ]) self.assertIn([ 'Broad_NA19675_1_21_3343353', 'Broad_NA19675_1', '', 'SNV/INDEL', 'GRCh37', '21', '3343353', 'GAGA', 'G', '', - 'RP11', 'ENST00000258436', 'c.375_377delTCT', 'p.Leu126del', 'Heterozygous', '', 'de novo', '', '', 'Known', + 'RP11', 'ENST00000258436', 'c.375_377delTCT', 'p.Leu126del', 'Heterozygous', '', 'de novo', '', '', 'Candidate', 'Myasthenic syndrome, congenital, 8, with pre- and postsynaptic defects', 'OMIM:615120', 'Autosomal recessive|X-linked', 'Full', '', '', 'SR-ES', '', ], genetic_findings_file) @@ -1006,12 +1006,12 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False): self.assertIn([ 'Broad_NA20889_1_248367227', 'Broad_NA20889', '', 'SNV/INDEL', 'GRCh37', '1', '248367227', 'TC', 'T', '', 'OR4G11P', 'ENST00000505820', 'c.3955G>A', 'c.1586-17C>G', 'Heterozygous', '', 'unknown', - 'Broad_NA20889_1_249045487', '', 'Known', 'IRIDA syndrome', 'MONDO:0008788', 'Autosomal dominant', + 'Broad_NA20889_1_249045487', '', 'Candidate', 'IRIDA syndrome', 'MONDO:0008788', 'Autosomal dominant', 'Full', '', '', 'SR-ES', '', ], genetic_findings_file) self.assertIn([ 'Broad_NA20889_1_249045487', 
'Broad_NA20889', '', 'SNV/INDEL', 'GRCh37', '1', '249045487', 'A', 'G', '', - 'OR4G11P', '', '', '', 'Heterozygous', '', 'unknown', 'Broad_NA20889_1_248367227', '', 'Known', + 'OR4G11P', '', '', '', 'Heterozygous', '', 'unknown', 'Broad_NA20889_1_248367227', '', 'Candidate', 'IRIDA syndrome', 'MONDO:0008788', 'Autosomal dominant', 'Full', '', '', 'SR-ES', '', ], genetic_findings_file) @@ -1225,7 +1225,7 @@ def test_variant_metadata(self): 'family_history': 'Yes', 'gene': 'OR4G11P', 'gene_id': 'ENSG00000240361', - 'gene_known_for_phenotype': 'Known', + 'gene_known_for_phenotype': 'Candidate', 'genetic_findings_id': 'NA20889_1_248367227', 'hgvsc': 'c.3955G>A', 'hgvsp': 'c.1586-17C>G', @@ -1253,7 +1253,7 @@ def test_variant_metadata(self): 'family_history': 'Yes', 'gene': None, 'gene_id': None, - 'gene_known_for_phenotype': 'Known', + 'gene_known_for_phenotype': 'Candidate', 'genetic_findings_id': 'NA20889_1_249045487', 'participant_id': 'NA20889', 'pos': 249045487, diff --git a/seqr/views/apis/summary_data_api_tests.py b/seqr/views/apis/summary_data_api_tests.py index 62b682bc93..c1b94f1e08 100644 --- a/seqr/views/apis/summary_data_api_tests.py +++ b/seqr/views/apis/summary_data_api_tests.py @@ -37,8 +37,8 @@ "num_saved_variants": 2, "solve_status": "Unsolved", "sample_id": "NA20889", - "gene_known_for_phenotype-1": "Known", - "gene_known_for_phenotype-2": "Known", + "gene_known_for_phenotype-1": "Candidate", + "gene_known_for_phenotype-2": "Candidate", "variant_inheritance-1": "unknown", "variant_inheritance-2": "unknown", 'genetic_findings_id-1': 'NA20889_1_248367227', @@ -105,6 +105,8 @@ 'allele_balance_or_heteroplasmy_percentage-2': None, 'notes-1': None, 'notes-2': None, + 'tags-1': ['Tier 1 - Novel gene and phenotype'], + 'tags-2': ['Tier 1 - Novel gene and phenotype'], } EXPECTED_SAMPLE_METADATA_ROW = { "dbgap_submission": "No", @@ -147,6 +149,7 @@ 'alt-1': 'T', 'chrom-1': '1', 'gene_known_for_phenotype-1': 'Candidate', + 'tags-1': ['Tier 1 - Novel gene and phenotype'], 'pos-1': 248367227, 'end-1': None, 'ref-1': 'TC', From 4a37a44f55aaadcd4bc03b9eeebf287cdc59f9dd Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 19 Mar 2024 16:12:22 -0400 Subject: [PATCH 16/19] add hail to status endpoint --- hail_search/web_app.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/hail_search/web_app.py b/hail_search/web_app.py index fc274d2c31..bc5b3aab61 100644 --- a/hail_search/web_app.py +++ b/hail_search/web_app.py @@ -47,8 +47,12 @@ def hl_json_dumps(obj): return json.dumps(obj, default=_hl_json_default) async def sync_to_async_hail_query(request: web.Request, query: Callable, *args, timeout_s=QUERY_TIMEOUT_S, **kwargs): + request_body = None + if request.body_exists: + request_body = await request.json() + loop = asyncio.get_running_loop() - future = loop.run_in_executor(request.app.pool, functools.partial(query, await request.json(), *args, **kwargs)) + future = loop.run_in_executor(request.app.pool, functools.partial(query, request_body, *args, **kwargs)) try: return await asyncio.wait_for(future, timeout_s) except asyncio.TimeoutError: @@ -94,6 +98,7 @@ async def multi_lookup(request: web.Request) -> web.Response: async def status(request: web.Request) -> web.Response: + _ = await sync_to_async_hail_query(request, lambda _: hl.eval(1 + 1)) return web.json_response({'success': True}) From 5d4bf803379e6de8cce69f3f5c75c0e23ca2c2c6 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 19 Mar 2024 16:17:04 -0400 Subject: [PATCH 17/19] do not allow unneccessary 
GTEX data in rna upload --- seqr/management/tests/load_rna_seq_tests.py | 2 -- seqr/views/apis/data_manager_api_tests.py | 2 -- seqr/views/utils/dataset_utils.py | 6 ++---- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/seqr/management/tests/load_rna_seq_tests.py b/seqr/management/tests/load_rna_seq_tests.py index 6353809f5a..936967322a 100644 --- a/seqr/management/tests/load_rna_seq_tests.py +++ b/seqr/management/tests/load_rna_seq_tests.py @@ -68,9 +68,7 @@ def test_tpm(self, mock_utils_logger): 'NA19675_D2\t1kg project nåme with uniçøde\t\tENSG00000240361\t12.6\t\n', 'NA19675_D2\t1kg project nåme with uniçøde\t\tENSG00000233750\t1.26\t\n', 'NA19678_D1\t1kg project nåme with uniçøde\t\tENSG00000233750\t 6.04\twhole_blood\n', - 'GTEX-001\t1kg project nåme with uniçøde\t\tENSG00000240361\t3.1\tinvalid\n', 'NA19677\t1kg project nåme with uniçøde\t\tENSG00000233750\t5.31\tmuscle\n', - 'GTEX-001\t1kg project nåme with uniçøde\t\tENSG00000233750\t7.8\tmuscle\n', 'NA19678\tTest Reprocessed Project\t\tENSG00000240361\t0.2\twhole_blood\n', ], unmatched_samples='NA19677, NA19678, NA19678_D1', diff --git a/seqr/views/apis/data_manager_api_tests.py b/seqr/views/apis/data_manager_api_tests.py index cbaf6fe977..5f6c6bbc9b 100644 --- a/seqr/views/apis/data_manager_api_tests.py +++ b/seqr/views/apis/data_manager_api_tests.py @@ -708,8 +708,6 @@ def test_kibana_proxy(self): ['NA19675_D2', '1kg project nåme with uniçøde', 'ENSG00000233750', 'NA19675_D2', 'muscle', 0.0], # no matched individual NA19675_D3 ['NA19675_D3', '1kg project nåme with uniçøde', 'ENSG00000233750', 'NA19675_D3', 'fibroblasts', 0.064], - # skip GTEX samples - ['GTEX_001', '1kg project nåme with uniçøde', 'ENSG00000233750', 'NA19675_D3', 'whole_blood', 1.95], # a different project sample NA20888 ['NA20888', 'Test Reprocessed Project', 'ENSG00000240361', 'NA20888', 'muscle', 0.112], # a project mismatched sample NA20878 diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index fdf1f4746b..2738652a76 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -294,7 +294,7 @@ def _add_splice_rank(sample_data_rows): 'tpm': { 'model_class': RnaSeqTpm, 'columns': TPM_HEADER_COLS, - 'additional_kwargs': {'should_skip': lambda row: row[SAMPLE_ID_COL].startswith('GTEX')}, + 'additional_kwargs': {}, }, 'splice_outlier': { 'model_class': RnaSeqSpliceOutlier, @@ -331,7 +331,7 @@ def _validate_rna_header(header, column_map): def _load_rna_seq_file( file_path, user, potential_loaded_samples, update_sample_models, save_sample_data, get_matched_sample, column_map, mapping_file=None, allow_missing_gene=False, ignore_extra_samples=False, - should_skip=None, format_fields=None, + format_fields=None, ): sample_id_to_individual_id_mapping = {} @@ -349,8 +349,6 @@ def _load_rna_seq_file( gene_ids = set() for line in tqdm(parsed_f, unit=' rows'): row = dict(zip(header, line)) - if should_skip and should_skip(row): - continue row_dict = {mapped_key: row[col] for mapped_key, col in column_map.items()} for mapped_key, format_func in (format_fields or {}).items(): From 2f348151f7bdb5d3af6f3f4396a2a24a9e722b75 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 19 Mar 2024 16:19:49 -0400 Subject: [PATCH 18/19] Add comment --- hail_search/web_app.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hail_search/web_app.py b/hail_search/web_app.py index bc5b3aab61..83efad67af 100644 --- a/hail_search/web_app.py +++ b/hail_search/web_app.py @@ -98,7 +98,8 @@ 
async def multi_lookup(request: web.Request) -> web.Response: async def status(request: web.Request) -> web.Response: - _ = await sync_to_async_hail_query(request, lambda _: hl.eval(1 + 1)) + # Make sure the hail backend process is still alive. + await sync_to_async_hail_query(request, lambda _: hl.eval(1 + 1)) return web.json_response({'success': True}) From 841188995c35df5ab482413761d449e045bf9f4f Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 19 Mar 2024 16:38:39 -0400 Subject: [PATCH 19/19] more decriptive error for unmatched sample ID --- seqr/management/tests/load_rna_seq_tests.py | 8 ++++---- seqr/views/apis/data_manager_api_tests.py | 8 ++++---- seqr/views/utils/dataset_utils.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/seqr/management/tests/load_rna_seq_tests.py b/seqr/management/tests/load_rna_seq_tests.py index 936967322a..2b95be2185 100644 --- a/seqr/management/tests/load_rna_seq_tests.py +++ b/seqr/management/tests/load_rna_seq_tests.py @@ -71,7 +71,7 @@ def test_tpm(self, mock_utils_logger): 'NA19677\t1kg project nåme with uniçøde\t\tENSG00000233750\t5.31\tmuscle\n', 'NA19678\tTest Reprocessed Project\t\tENSG00000240361\t0.2\twhole_blood\n', ], - unmatched_samples='NA19677, NA19678, NA19678_D1', + unmatched_samples='NA19677 (1kg project nåme with uniçøde), NA19678 (Test Reprocessed Project), NA19678_D1 (1kg project nåme with uniçøde)', additional_errors=['Samples missing required "tissue": NA19675_D2'], ) @@ -106,7 +106,7 @@ def test_tpm(self, mock_utils_logger): mock.call('DONE'), ]) mock_utils_logger.warning.assert_has_calls([ - mock.call('Skipped loading for the following 2 unmatched samples: NA19677, NA19678', None), + mock.call('Skipped loading for the following 2 unmatched samples: NA19677 (1kg project nåme with uniçøde), NA19678 (Test Reprocessed Project)', None), ]) # Test a new sample created for a mismatched tissue and a row with 0.0 tpm @@ -134,13 +134,13 @@ def test_outlier(self): 'NA19675_D3\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n', 'NA19675_D4\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n', ], - unmatched_samples='NA19675_D3, NA19675_D4', + unmatched_samples='NA19675_D3 (1kg project nåme with uniçøde), NA19675_D4 (1kg project nåme with uniçøde)', ) self.mock_open.return_value.__enter__.return_value.__iter__.return_value = ['NA19675_D4\tNA19678'] with self.assertRaises(ErrorsWarningsException) as e: call_command('load_rna_seq', 'outlier', RNA_FILE_ID, '--mapping-file', 'map.tsv') - self.assertEqual(e.exception.errors, ['Unable to find matches for the following samples: NA19675_D3']) + self.assertEqual(e.exception.errors, ['Unable to find matches for the following samples: NA19675_D3 (1kg project nåme with uniçøde)']) call_command('load_rna_seq', 'outlier', RNA_FILE_ID, '--ignore-extra-samples') diff --git a/seqr/views/apis/data_manager_api_tests.py b/seqr/views/apis/data_manager_api_tests.py index 5f6c6bbc9b..1fbadfd7ef 100644 --- a/seqr/views/apis/data_manager_api_tests.py +++ b/seqr/views/apis/data_manager_api_tests.py @@ -678,7 +678,7 @@ def test_kibana_proxy(self): ['NA19675_D3', 'Test Reprocessed Project', 'ENSG00000233750', 'muscle', 'detail1', 0.064, '0.0000057', 7.8], ['NA20888', 'Test Reprocessed Project', 'ENSG00000240361', 'muscle', '', 0.04, 0.112, 1.9], ], - 'skipped_samples': 'NA19675_D3', + 'skipped_samples': 'NA19675_D3 (Test Reprocessed Project)', 'sample_tissue_type': 'M', 'num_parsed_samples': 3, 'initial_model_count': 
3, @@ -713,7 +713,7 @@ def test_kibana_proxy(self): # a project mismatched sample NA20878 ['NA20878', 'Test Reprocessed Project', 'ENSG00000233750', 'NA20878', 'fibroblasts', 0.064], ], - 'skipped_samples': 'NA19675_D3, NA20878', + 'skipped_samples': 'NA19675_D3 (1kg project nåme with uniçøde), NA20878 (Test Reprocessed Project)', 'sample_tissue_type': 'M', 'num_parsed_samples': 4, 'initial_model_count': 4, @@ -766,7 +766,7 @@ def test_kibana_proxy(self): ['NA20878', 'Test Reprocessed Project', 'ENSG00000233750', 'chr2', 167258096, 167258349, '*', 'XIRP2', 'psi3', 1.56E-25, 6.33, 0.45, 143, 'fibroblasts', 0.03454739, 1, 20], ], - 'skipped_samples': 'NA19675_D3, NA20878', + 'skipped_samples': 'NA19675_D3 (1kg project nåme with uniçøde), NA20878 (Test Reprocessed Project)', 'sample_tissue_type': 'F', 'num_parsed_samples': 4, 'initial_model_count': 7, @@ -877,7 +877,7 @@ def _set_file_iter_stdout(rows): _set_file_iter_stdout([header, loaded_data_row, missing_sample_row]) response = self.client.post(url, content_type='application/json', data=json.dumps(body)) self.assertEqual(response.status_code, 400) - self.assertDictEqual(response.json(), {'errors': ['Unable to find matches for the following samples: NA19675_D3'], 'warnings': None}) + self.assertDictEqual(response.json(), {'errors': ['Unable to find matches for the following samples: NA19675_D3 (1kg project nåme with uniçøde)'], 'warnings': None}) unknown_gene_id_row1 = loaded_data_row[:2] + ['NOT_A_GENE_ID1'] + loaded_data_row[3:] unknown_gene_id_row2 = loaded_data_row[:2] + ['NOT_A_GENE_ID2'] + loaded_data_row[3:] diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 2738652a76..087b327718 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -414,7 +414,7 @@ def _process_rna_errors(gene_ids, missing_required_fields, unmatched_samples, ig errors.append(f'Unknown Gene IDs: {", ".join(sorted(unknown_gene_ids))}') if unmatched_samples: - unmatched_sample_ids = ', '.join(sorted([sample_key[0] for sample_key in unmatched_samples])) + unmatched_sample_ids = ', '.join(sorted({f'{sample_key[0]} ({sample_key[1]})' for sample_key in unmatched_samples})) if ignore_extra_samples: warnings.append(f'Skipped loading for the following {len(unmatched_samples)} unmatched samples: {unmatched_sample_ids}') else:
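
For reference, the behaviour PATCH 19/19 is driving at can be sketched in isolation as below. This is an illustrative stand-alone snippet, not part of the patch: the helper name format_unmatched_samples and the (sample_id, project_name) tuple layout are assumptions for the example, while in seqr the logic lives inside _process_rna_errors in seqr/views/utils/dataset_utils.py. Unmatched sample IDs are now reported together with their project name, de-duplicated via a set comprehension before sorting, and routed to a warning when ignore_extra_samples is set or to an error otherwise, matching the updated expectations in load_rna_seq_tests.py and data_manager_api_tests.py.

# Stand-alone sketch of the reworked unmatched-sample reporting (assumed names, see note above).
def format_unmatched_samples(unmatched_samples, ignore_extra_samples):
    errors, warnings = [], []
    if unmatched_samples:
        # The set comprehension de-duplicates repeated (sample_id, project) pairs before sorting.
        unmatched_sample_ids = ', '.join(sorted(
            {f'{sample_id} ({project})' for sample_id, project in unmatched_samples}))
        if ignore_extra_samples:
            warnings.append(
                f'Skipped loading for the following {len(unmatched_samples)} unmatched samples: {unmatched_sample_ids}')
        else:
            errors.append(f'Unable to find matches for the following samples: {unmatched_sample_ids}')
    return errors, warnings

# Example with two hypothetical samples from different projects:
# format_unmatched_samples({('NA19677', 'Project A'), ('NA19678', 'Project B')}, ignore_extra_samples=False)
# -> (['Unable to find matches for the following samples: NA19677 (Project A), NA19678 (Project B)'], [])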