From ad46fbc4a2c3323a5674de22745262a68dd1e58b Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 5 Feb 2024 18:10:03 -0500 Subject: [PATCH 1/5] generic load rna function --- seqr/views/apis/data_manager_api.py | 15 ++-------- seqr/views/utils/dataset_utils.py | 46 ++++++++++++++++++----------- 2 files changed, 32 insertions(+), 29 deletions(-) diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index 2205345302..8df488edf6 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -20,8 +20,7 @@ from seqr.utils.vcf_utils import validate_vcf_exists from seqr.views.utils.airflow_utils import trigger_data_loading, write_data_loading_pedigree -from seqr.views.utils.dataset_utils import load_rna_seq_outlier, load_rna_seq_tpm, load_phenotype_prioritization_data_file, \ - load_rna_seq_splice_outlier +from seqr.views.utils.dataset_utils import load_rna_seq, load_phenotype_prioritization_data_file, RNA_DATA_TYPE_CONFIGS from seqr.views.utils.file_utils import parse_file, get_temp_upload_directory, load_uploaded_file from seqr.views.utils.json_utils import create_json_response from seqr.views.utils.json_to_orm_utils import update_model_from_json @@ -257,13 +256,6 @@ def _update_individuals_sv_qc(json_records, user): 'kl_temp_manton_orphan-diseases_cmg-samples_exomes_v1', 'Interview Exomes', 'v02_loading_test_project', ] - -RNA_DATA_TYPE_CONFIGS = { - 'outlier': {'load_func': load_rna_seq_outlier, 'model_class': RnaSeqOutlier}, - 'tpm': {'load_func': load_rna_seq_tpm, 'model_class': RnaSeqTpm}, - 'splice_outlier': {'load_func': load_rna_seq_splice_outlier, 'model_class': RnaSeqSpliceOutlier} -} - @data_manager_required def update_rna_seq(request): request_json = json.loads(request.body) @@ -286,9 +278,8 @@ def _save_sample_data(sample_guid, sample_data): json.dump(sample_data, f) try: - load_func = RNA_DATA_TYPE_CONFIGS[data_type]['load_func'] - sample_guids, info, warnings = load_func( - file_path, _save_sample_data, lambda sample_guid: _load_saved_sample_data(file_name_prefix, sample_guid), + sample_guids, info, warnings = load_rna_seq( + data_type, file_path, _save_sample_data, lambda sample_guid: _load_saved_sample_data(file_name_prefix, sample_guid), user=request.user, mapping_file=mapping_file, ignore_extra_samples=request_json.get('ignoreExtraSamples')) except ValueError as e: return create_json_response({'error': str(e)}, status=400) diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 1185575486..8497c1df25 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -274,34 +274,46 @@ def _parse_tsv_row(row): TISSUE_TYPE_MAP = {v: k for k, v in REVERSE_TISSUE_TYPE.items() if k != Sample.NO_TISSUE_TYPE} -def load_rna_seq_outlier(*args, **kwargs): - return _load_rna_seq(RnaSeqOutlier, *args, RNA_OUTLIER_COLUMNS, **kwargs) - - -def load_rna_seq_tpm(*args, **kwargs): - return _load_rna_seq( - RnaSeqTpm, *args, TPM_HEADER_COLS, should_skip=lambda row: row[SAMPLE_ID_COL].startswith('GTEX'), **kwargs, - ) - - def _get_splice_id(row): return '-'.join([row[GENE_ID_COL], row[CHROM_COL], str(row[START_COL]), str(row[END_COL]), row[STRAND_COL], row[SPLICE_TYPE_COL]]) -def load_rna_seq_splice_outlier(*args, **kwargs): - return _load_rna_seq( - RnaSeqSpliceOutlier, *args, SPLICE_OUTLIER_HEADER_COLS, format_fields=SPLICE_OUTLIER_FORMATTER, - get_unique_key=_get_splice_id, allow_missing_gene=True, post_process=_add_splice_rank, **kwargs - ) - - def _add_splice_rank(sample_data_rows): sorted_data_rows = sorted([data_row for data_row in sample_data_rows.values()], key=lambda d: d[P_VALUE_COL]) for i, data_row in enumerate(sorted_data_rows): data_row['rank'] = i +RNA_DATA_TYPE_CONFIGS = { + 'outlier': { + 'model_class': RnaSeqOutlier, + 'columns': RNA_OUTLIER_COLUMNS, + 'additional_kwargs': {}, + }, + 'tpm': { + 'model_class': RnaSeqTpm, + 'columns': TPM_HEADER_COLS, + 'additional_kwargs': {'should_skip': lambda row: row[SAMPLE_ID_COL].startswith('GTEX')}, + }, + 'splice_outlier': { + 'model_class': RnaSeqSpliceOutlier, + 'columns': SPLICE_OUTLIER_HEADER_COLS, + 'additional_kwargs': { + 'format_fields': SPLICE_OUTLIER_FORMATTER, + 'allow_missing_gene': True, + 'get_unique_key': _get_splice_id, + 'post_process': _add_splice_rank, + }, + }, +} + + +def load_rna_seq(data_type, *args, **kwargs): + config = RNA_DATA_TYPE_CONFIGS[data_type] + return _load_rna_seq(config['model_class'], *args, config['columns'], **config['additional_kwargs'], **kwargs) + + def _validate_rna_header(header, column_map): required_column_map = { column_map.get(col, col): col for col in [SAMPLE_ID_COL, PROJECT_COL, GENE_ID_COL, TISSUE_COL] From ec572d53ff2e2362d192e8b2c73208cf7b62d334 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 5 Feb 2024 18:21:16 -0500 Subject: [PATCH 2/5] generic manage command --- .../{load_rna_seq_tpm.py => load_rna_seq.py} | 32 ++++++++----------- ...seq_tpm_tests.py => load_rna_seq_tests.py} | 19 +++++++---- 2 files changed, 25 insertions(+), 26 deletions(-) rename seqr/management/commands/{load_rna_seq_tpm.py => load_rna_seq.py} (56%) rename seqr/management/tests/{load_rna_seq_tpm_tests.py => load_rna_seq_tests.py} (84%) diff --git a/seqr/management/commands/load_rna_seq_tpm.py b/seqr/management/commands/load_rna_seq.py similarity index 56% rename from seqr/management/commands/load_rna_seq_tpm.py rename to seqr/management/commands/load_rna_seq.py index 9f1e4fc1d1..d47deef6ca 100644 --- a/seqr/management/commands/load_rna_seq_tpm.py +++ b/seqr/management/commands/load_rna_seq.py @@ -1,27 +1,19 @@ import logging from django.core.management.base import BaseCommand -from seqr.models import RnaSeqTpm, Sample +from seqr.models import Sample from seqr.views.utils.file_utils import parse_file -from seqr.views.utils.dataset_utils import load_rna_seq_tpm +from seqr.views.utils.dataset_utils import load_rna_seq, RNA_DATA_TYPE_CONFIGS logger = logging.getLogger(__name__) -TISSUE_TYPE_MAP = { - 'whole_blood': 'WB', - 'fibroblasts': 'F', - 'muscle': 'M', - 'lymphocytes': 'L', -} - -REVERSE_TISSUE_TYPE = {v: k for k, v in TISSUE_TYPE_MAP.items()} - class Command(BaseCommand): - help = 'Load RNA-Seq TPM data' + help = 'Load RNA-Seq data' def add_arguments(self, parser): - parser.add_argument('input_file', help='tsv file with TPM data') + parser.add_argument('data_type', help='RNA data type', choices=sorted(RNA_DATA_TYPE_CONFIGS.keys())) + parser.add_argument('input_file', help='tsv file with RNA data') parser.add_argument('--mapping-file', help='optional file to map sample IDs to seqr individual IDs') parser.add_argument('--ignore-extra-samples', action='store_true', help='whether to suppress errors about extra samples') @@ -31,17 +23,19 @@ def handle(self, *args, **options): with open(options['mapping_file']) as f: mapping_file = parse_file(options['mapping_file'], f) - sample_guids, _, _ = load_rna_seq_tpm( - options['input_file'], self._save_sample_data, lambda *args: {}, create_models_before_save=True, + data_type = options['data_type'] + self.model_cls = RNA_DATA_TYPE_CONFIGS[data_type]['model_class'] + + sample_guids, _, _ = load_rna_seq( + data_type, options['input_file'], self._save_sample_data, lambda *args: {}, create_models_before_save=True, mapping_file=mapping_file, ignore_extra_samples=options['ignore_extra_samples']) Sample.bulk_update(user=None, update_json={'is_active': True}, guid__in=sample_guids) logger.info('DONE') - @staticmethod - def _save_sample_data(sample_guid, data_by_gene): + def _save_sample_data(self, sample_guid, data_by_gene): sample = Sample.objects.get(guid=sample_guid) - models = RnaSeqTpm.objects.bulk_create( - [RnaSeqTpm(sample=sample, **data) for data in data_by_gene.values()], batch_size=1000) + models = self.model_cls.objects.bulk_create( + [self.model_cls(sample=sample, **data) for data in data_by_gene.values()], batch_size=1000) logger.info(f'create {len(models)} RnaSeqTpm for {sample.sample_id}') diff --git a/seqr/management/tests/load_rna_seq_tpm_tests.py b/seqr/management/tests/load_rna_seq_tests.py similarity index 84% rename from seqr/management/tests/load_rna_seq_tpm_tests.py rename to seqr/management/tests/load_rna_seq_tests.py index f0170eb5c8..ce533d2901 100644 --- a/seqr/management/tests/load_rna_seq_tpm_tests.py +++ b/seqr/management/tests/load_rna_seq_tests.py @@ -2,6 +2,7 @@ import mock from django.core.management import call_command +from django.core.management.base import CommandError from seqr.models import Sample, RnaSeqTpm, RnaSeqOutlier from seqr.utils.middleware import ErrorsWarningsException @@ -16,9 +17,9 @@ class LoadRnaSeqTest(AuthenticationTestCase): @mock.patch('seqr.utils.file_utils.gzip.open') @mock.patch('seqr.views.utils.dataset_utils.logger') - @mock.patch('seqr.management.commands.load_rna_seq_tpm.logger') - @mock.patch('seqr.management.commands.load_rna_seq_tpm.open') - def test_command(self, mock_open, mock_logger, mock_utils_logger, mock_gzip_open): + @mock.patch('seqr.management.commands.load_rna_seq.logger') + @mock.patch('seqr.management.commands.load_rna_seq.open') + def test_tpm(self, mock_open, mock_logger, mock_utils_logger, mock_gzip_open): mock_gzip_file = mock_gzip_open.return_value.__enter__.return_value mock_gzip_file.__iter__.return_value = [ '', @@ -30,13 +31,17 @@ def test_command(self, mock_open, mock_logger, mock_utils_logger, mock_gzip_open 'NA19678\tTest Reprocessed Project\t\tENSG00000240361\t0.2\twhole_blood\n', ] + with self.assertRaises(CommandError) as e: + call_command('load_rna_seq', 'not_a_type', RNA_FILE_ID) + self.assertEqual(str(e.exception), "Error: argument data_type: invalid choice: 'not_a_type' (choose from 'outlier', 'splice_outlier', 'tpm')") + with self.assertRaises(ValueError) as e: - call_command('load_rna_seq_tpm', RNA_FILE_ID) + call_command('load_rna_seq', 'tpm', RNA_FILE_ID) self.assertEqual(str(e.exception), 'Invalid file: missing column(s): TPM, gene_id, project, sample_id, tissue') mock_gzip_file.__iter__.return_value[0] = 'sample_id\tproject\tindividual_id\tgene_id\tTPM\ttissue\n' with self.assertRaises(ErrorsWarningsException) as e: - call_command('load_rna_seq_tpm', RNA_FILE_ID) + call_command('load_rna_seq', 'tpm', RNA_FILE_ID) self.assertListEqual(e.exception.errors, [ 'Samples missing required "tissue": NA19675_D2', 'Unable to find matches for the following samples: NA19677, NA19678, NA19678_D1', @@ -46,7 +51,7 @@ def test_command(self, mock_open, mock_logger, mock_utils_logger, mock_gzip_open mock_gzip_file.__iter__.return_value[0], 'NA19678_D1\t1kg project nåme with uniçøde\tNA19678\tENSG00000233750\t 6.04\twhole_blood\n', ] + mock_gzip_file.__iter__.return_value[2:] - call_command('load_rna_seq_tpm', RNA_FILE_ID, '--ignore-extra-samples') + call_command('load_rna_seq', 'tpm', RNA_FILE_ID, '--ignore-extra-samples') # Existing outlier data should be unchanged self.assertEqual(RnaSeqOutlier.objects.count(), 3) @@ -83,7 +88,7 @@ def test_command(self, mock_open, mock_logger, mock_utils_logger, mock_gzip_open # Test a new sample created for a mismatched tissue and a row with 0.0 tpm mock_gzip_file.__iter__.return_value[1] = 'NA19678_D1\t1kg project nåme with uniçøde\tNA19678\tENSG00000233750\t0.0\tfibroblasts\n' - call_command('load_rna_seq_tpm', 'new_file.tsv.gz', '--ignore-extra-samples') + call_command('load_rna_seq', 'tpm', 'new_file.tsv.gz', '--ignore-extra-samples') models = RnaSeqTpm.objects.select_related('sample').filter(sample__sample_id='NA19678_D1') self.assertEqual(models.count(), 2) self.assertSetEqual(set(models.values_list('sample__tissue_type', flat=True)), {'F', 'WB'}) From 8b26e245fb8dbee6e6a877286b0f92a69b77f154 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 5 Feb 2024 18:21:57 -0500 Subject: [PATCH 3/5] generic manage command --- seqr/management/commands/load_rna_seq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seqr/management/commands/load_rna_seq.py b/seqr/management/commands/load_rna_seq.py index d47deef6ca..fb3d7ab205 100644 --- a/seqr/management/commands/load_rna_seq.py +++ b/seqr/management/commands/load_rna_seq.py @@ -38,4 +38,4 @@ def _save_sample_data(self, sample_guid, data_by_gene): sample = Sample.objects.get(guid=sample_guid) models = self.model_cls.objects.bulk_create( [self.model_cls(sample=sample, **data) for data in data_by_gene.values()], batch_size=1000) - logger.info(f'create {len(models)} RnaSeqTpm for {sample.sample_id}') + logger.info(f'create {len(models)} {self.model_cls} for {sample.sample_id}') From e5a1c34a613cef5f6535176346d652cf52df5e64 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 5 Feb 2024 18:27:02 -0500 Subject: [PATCH 4/5] fix outlier manage tests --- seqr/management/commands/load_rna_seq.py | 2 +- .../commands/load_rna_seq_outlier.py | 38 ----------- .../tests/load_rna_seq_outlier_tests.py | 63 ------------------- seqr/management/tests/load_rna_seq_tests.py | 54 ++++++++++++++++ 4 files changed, 55 insertions(+), 102 deletions(-) delete mode 100644 seqr/management/commands/load_rna_seq_outlier.py delete mode 100644 seqr/management/tests/load_rna_seq_outlier_tests.py diff --git a/seqr/management/commands/load_rna_seq.py b/seqr/management/commands/load_rna_seq.py index fb3d7ab205..8aff327956 100644 --- a/seqr/management/commands/load_rna_seq.py +++ b/seqr/management/commands/load_rna_seq.py @@ -38,4 +38,4 @@ def _save_sample_data(self, sample_guid, data_by_gene): sample = Sample.objects.get(guid=sample_guid) models = self.model_cls.objects.bulk_create( [self.model_cls(sample=sample, **data) for data in data_by_gene.values()], batch_size=1000) - logger.info(f'create {len(models)} {self.model_cls} for {sample.sample_id}') + logger.info(f'create {len(models)} {self.model_cls.__name__} for {sample.sample_id}') diff --git a/seqr/management/commands/load_rna_seq_outlier.py b/seqr/management/commands/load_rna_seq_outlier.py deleted file mode 100644 index 49d2d7091b..0000000000 --- a/seqr/management/commands/load_rna_seq_outlier.py +++ /dev/null @@ -1,38 +0,0 @@ -from django.core.management.base import BaseCommand -import logging - -from seqr.models import RnaSeqOutlier, Sample -from seqr.views.utils.dataset_utils import load_rna_seq_outlier -from seqr.views.utils.file_utils import parse_file - -logger = logging.getLogger(__name__) - -class Command(BaseCommand): - help = 'Load RNA-Seq Outlier data' - - def add_arguments(self, parser): - parser.add_argument('input_file') - parser.add_argument('--mapping-file') - parser.add_argument('--ignore-extra-samples', action='store_true') - - def handle(self, *args, **options): - mapping_file = None - if options['mapping_file']: - with open(options['mapping_file']) as f: - mapping_file = parse_file(options['mapping_file'], f) - - sample_guids, _, _ = load_rna_seq_outlier( - options['input_file'], self._save_sample_data, lambda *args: {}, create_models_before_save=True, - mapping_file=mapping_file, ignore_extra_samples=options['ignore_extra_samples']) - - Sample.bulk_update(user=None, update_json={'is_active': True}, guid__in=sample_guids) - - @staticmethod - def _save_sample_data(sample_guid, data_by_gene): - sample = Sample.objects.get(guid=sample_guid) - models = RnaSeqOutlier.objects.bulk_create( - [RnaSeqOutlier(sample=sample, **data) for data in data_by_gene.values()]) - logger.info(f'create {len(models)} RnaSeqOutliers for {sample.sample_id}') - - - diff --git a/seqr/management/tests/load_rna_seq_outlier_tests.py b/seqr/management/tests/load_rna_seq_outlier_tests.py deleted file mode 100644 index 703b1c029f..0000000000 --- a/seqr/management/tests/load_rna_seq_outlier_tests.py +++ /dev/null @@ -1,63 +0,0 @@ -# -*- coding: utf-8 -*- -import mock - -from django.core.management import call_command -from django.test import TestCase - -from seqr.models import Sample, RnaSeqOutlier -from seqr.utils.middleware import ErrorsWarningsException - -RNA_FILE_ID = 'tmp_-_2021-03-01T00:00:00_-_test_data_manager_-_new_muscle_samples.tsv.gz' -EXISTING_SAMPLE_GUID = 'S000152_na19675_d2' - -class LoadRnaSeqTest(TestCase): - databases = '__all__' - fixtures = ['users', '1kg_project', 'reference_data'] - - @mock.patch('seqr.management.commands.load_rna_seq_outlier.logger.info') - @mock.patch('seqr.management.commands.load_rna_seq_outlier.open') - @mock.patch('seqr.utils.file_utils.gzip.open') - def test_command(self, mock_gzip_open, mock_open, mock_logger): - mock_gzip_file = mock_gzip_open.return_value.__enter__.return_value - mock_gzip_file.__iter__.return_value = ['invalid\theader'] - - with self.assertRaises(ValueError) as e: - call_command('load_rna_seq_outlier', RNA_FILE_ID) - self.assertEqual(str(e.exception), 'Invalid file: missing column(s): geneID, pValue, padjust, project, sampleID, tissue, zScore') - - mock_gzip_file.__iter__.return_value = [ - 'sampleID\tproject\tgeneID\tdetail\tpValue\tpadjust\tzScore\ttissue\n', - 'NA19675_D2\t1kg project nåme with uniçøde\tENSG00000240361\tdetail1\t0.01\t0.13\t-3.1\tmuscle\n', - 'NA19675_D2\t1kg project nåme with uniçøde\tENSG00000240361\tdetail2\t0.01\t0.13\t-3.1\tmuscle\n', - 'NA19675_D2\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n', - 'NA19675_D3\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n', - 'NA19675_D4\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n', - ] - mock_open.return_value.__enter__.return_value.__iter__.return_value = ['NA19675_D4\tNA19678'] - - with self.assertRaises(ErrorsWarningsException) as e: - call_command('load_rna_seq_outlier', RNA_FILE_ID) - self.assertEqual(e.exception.errors, ['Unable to find matches for the following samples: NA19675_D3, NA19675_D4']) - - with self.assertRaises(ErrorsWarningsException) as e: - call_command('load_rna_seq_outlier', RNA_FILE_ID, '--mapping-file', 'map.tsv') - self.assertEqual(e.exception.errors, ['Unable to find matches for the following samples: NA19675_D3']) - - call_command('load_rna_seq_outlier', RNA_FILE_ID, '--ignore-extra-samples') - - rna_samples = Sample.objects.filter(individual_id=1, sample_id='NA19675_D2', sample_type='RNA') - self.assertEqual(len(rna_samples), 1) - sample = rna_samples.first() - self.assertEqual(sample.guid, EXISTING_SAMPLE_GUID) - self.assertTrue(sample.is_active) - self.assertIsNone(sample.elasticsearch_index) - self.assertEqual(sample.data_source, 'new_muscle_samples.tsv.gz') - self.assertEqual(sample.tissue_type, 'M') - - models = RnaSeqOutlier.objects.all() - self.assertEqual(models.count(), 2) - self.assertSetEqual({model.sample for model in models}, {sample}) - self.assertListEqual(list(models.values_list('gene_id', 'p_adjust', 'p_value', 'z_score')), [ - ('ENSG00000240361', 0.13, 0.01, -3.1), ('ENSG00000233750', 0.0000057, 0.064, 7.8), - ]) - mock_logger.assert_called_with('create 2 RnaSeqOutliers for NA19675_D2') diff --git a/seqr/management/tests/load_rna_seq_tests.py b/seqr/management/tests/load_rna_seq_tests.py index ce533d2901..f7f618c7df 100644 --- a/seqr/management/tests/load_rna_seq_tests.py +++ b/seqr/management/tests/load_rna_seq_tests.py @@ -96,4 +96,58 @@ def test_tpm(self, mock_open, mock_logger, mock_utils_logger, mock_gzip_open): self.assertEqual(models.values('sample').distinct().count(), 2) mock_logger.info.assert_has_calls([ mock.call('create 1 RnaSeqTpm for NA19678_D1'), + mock.call('DONE'), + ]) + + @mock.patch('seqr.management.commands.load_rna_seq.logger.info') + @mock.patch('seqr.management.commands.load_rna_seq.open') + @mock.patch('seqr.utils.file_utils.gzip.open') + def test_outlier(self, mock_gzip_open, mock_open, mock_logger): + mock_gzip_file = mock_gzip_open.return_value.__enter__.return_value + mock_gzip_file.__iter__.return_value = ['invalid\theader'] + + with self.assertRaises(ValueError) as e: + call_command('load_rna_seq', 'outlier', RNA_FILE_ID) + self.assertEqual(str(e.exception), + 'Invalid file: missing column(s): geneID, pValue, padjust, project, sampleID, tissue, zScore') + + mock_gzip_file.__iter__.return_value = [ + 'sampleID\tproject\tgeneID\tdetail\tpValue\tpadjust\tzScore\ttissue\n', + 'NA19675_D2\t1kg project nåme with uniçøde\tENSG00000240361\tdetail1\t0.01\t0.13\t-3.1\tmuscle\n', + 'NA19675_D2\t1kg project nåme with uniçøde\tENSG00000240361\tdetail2\t0.01\t0.13\t-3.1\tmuscle\n', + 'NA19675_D2\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n', + 'NA19675_D3\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n', + 'NA19675_D4\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n', + ] + mock_open.return_value.__enter__.return_value.__iter__.return_value = ['NA19675_D4\tNA19678'] + + with self.assertRaises(ErrorsWarningsException) as e: + call_command('load_rna_seq', 'outlier', RNA_FILE_ID) + self.assertEqual(e.exception.errors, + ['Unable to find matches for the following samples: NA19675_D3, NA19675_D4']) + + with self.assertRaises(ErrorsWarningsException) as e: + call_command('load_rna_seq', 'outlier', RNA_FILE_ID, '--mapping-file', 'map.tsv') + self.assertEqual(e.exception.errors, ['Unable to find matches for the following samples: NA19675_D3']) + + call_command('load_rna_seq', 'outlier', RNA_FILE_ID, '--ignore-extra-samples') + + rna_samples = Sample.objects.filter(individual_id=1, sample_id='NA19675_D2', sample_type='RNA') + self.assertEqual(len(rna_samples), 1) + sample = rna_samples.first() + self.assertEqual(sample.guid, EXISTING_SAMPLE_GUID) + self.assertTrue(sample.is_active) + self.assertIsNone(sample.elasticsearch_index) + #self.assertEqual(sample.data_source, 'new_muscle_samples.tsv.gz') TODO? + self.assertEqual(sample.tissue_type, 'M') + + models = RnaSeqOutlier.objects.all() + self.assertEqual(models.count(), 2) + self.assertSetEqual({model.sample for model in models}, {sample}) + self.assertListEqual(list(models.values_list('gene_id', 'p_adjust', 'p_value', 'z_score')), [ + ('ENSG00000240361', 0.13, 0.01, -3.1), ('ENSG00000233750', 0.0000057, 0.064, 7.8), + ]) + mock_logger.assert_has_calls([ + mock.call('create 2 RnaSeqOutlier for NA19675_D2'), + mock.call('DONE'), ]) From e6b9a9e4d586a122c1693d1c6978e27bbfdaf71d Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 5 Feb 2024 18:55:27 -0500 Subject: [PATCH 5/5] better shared test behavior --- seqr/management/tests/load_rna_seq_tests.py | 150 ++++++++++---------- 1 file changed, 78 insertions(+), 72 deletions(-) diff --git a/seqr/management/tests/load_rna_seq_tests.py b/seqr/management/tests/load_rna_seq_tests.py index f7f618c7df..7c1cd0db02 100644 --- a/seqr/management/tests/load_rna_seq_tests.py +++ b/seqr/management/tests/load_rna_seq_tests.py @@ -12,59 +12,82 @@ MAPPING_FILE_ID = 'mapping.tsv' EXISTING_SAMPLE_GUID = 'S000152_na19675_d2' + class LoadRnaSeqTest(AuthenticationTestCase): fixtures = ['users', '1kg_project', 'reference_data'] - @mock.patch('seqr.utils.file_utils.gzip.open') - @mock.patch('seqr.views.utils.dataset_utils.logger') - @mock.patch('seqr.management.commands.load_rna_seq.logger') - @mock.patch('seqr.management.commands.load_rna_seq.open') - def test_tpm(self, mock_open, mock_logger, mock_utils_logger, mock_gzip_open): - mock_gzip_file = mock_gzip_open.return_value.__enter__.return_value - mock_gzip_file.__iter__.return_value = [ - '', - 'NA19675_D2\t1kg project nåme with uniçøde\t\tENSG00000240361\t12.6\t\n', - 'NA19678_D1\t1kg project nåme with uniçøde\t\tENSG00000233750\t 6.04\twhole_blood\n', - 'GTEX-001\t1kg project nåme with uniçøde\t\tENSG00000240361\t3.1\tinvalid\n', - 'NA19677\t1kg project nåme with uniçøde\t\tENSG00000233750\t5.31\tmuscle\n', - 'GTEX-001\t1kg project nåme with uniçøde\t\tENSG00000233750\t7.8\tmuscle\n', - 'NA19678\tTest Reprocessed Project\t\tENSG00000240361\t0.2\twhole_blood\n', - ] + def setUp(self): + patcher = mock.patch('seqr.utils.file_utils.gzip.open') + mock_gzip_open = patcher.start() + self.mock_gzip_file_iter = mock_gzip_open.return_value.__enter__.return_value.__iter__ + self.addCleanup(patcher.stop) + patcher = mock.patch('seqr.management.commands.load_rna_seq.open') + self.mock_open = patcher.start() + self.addCleanup(patcher.stop) + patcher = mock.patch('seqr.management.commands.load_rna_seq.logger') + self.mock_logger = patcher.start() + self.addCleanup(patcher.stop) + + def _test_invalid_calls(self, data_type, expected_columns, file_data, unmatched_samples, additional_errors=None): + self.mock_gzip_file_iter.return_value = ['invalid\theader'] with self.assertRaises(CommandError) as e: call_command('load_rna_seq', 'not_a_type', RNA_FILE_ID) - self.assertEqual(str(e.exception), "Error: argument data_type: invalid choice: 'not_a_type' (choose from 'outlier', 'splice_outlier', 'tpm')") + self.assertEqual( + str(e.exception), + "Error: argument data_type: invalid choice: 'not_a_type' (choose from 'outlier', 'splice_outlier', 'tpm')") with self.assertRaises(ValueError) as e: - call_command('load_rna_seq', 'tpm', RNA_FILE_ID) - self.assertEqual(str(e.exception), 'Invalid file: missing column(s): TPM, gene_id, project, sample_id, tissue') + call_command('load_rna_seq', data_type, RNA_FILE_ID) + self.assertEqual(str(e.exception), f'Invalid file: missing column(s): {expected_columns}') - mock_gzip_file.__iter__.return_value[0] = 'sample_id\tproject\tindividual_id\tgene_id\tTPM\ttissue\n' + self.mock_gzip_file_iter.return_value = file_data with self.assertRaises(ErrorsWarningsException) as e: - call_command('load_rna_seq', 'tpm', RNA_FILE_ID) - self.assertListEqual(e.exception.errors, [ - 'Samples missing required "tissue": NA19675_D2', - 'Unable to find matches for the following samples: NA19677, NA19678, NA19678_D1', + call_command('load_rna_seq', data_type, RNA_FILE_ID) + self.assertListEqual(e.exception.errors, (additional_errors or []) + [ + f'Unable to find matches for the following samples: {unmatched_samples}', ]) - mock_gzip_file.__iter__.return_value = [ - mock_gzip_file.__iter__.return_value[0], + def _assert_expected_existing_sample(self, data_source): + existing_sample = Sample.objects.get(individual_id=1, sample_id='NA19675_D2', sample_type='RNA') + self.assertEqual(existing_sample.guid, EXISTING_SAMPLE_GUID) + self.assertEqual(existing_sample.sample_id, 'NA19675_D2') + self.assertTrue(existing_sample.is_active) + self.assertIsNone(existing_sample.elasticsearch_index) + self.assertEqual(existing_sample.tissue_type, 'M') + self.assertEqual(existing_sample.data_source, data_source) + return existing_sample + + @mock.patch('seqr.views.utils.dataset_utils.logger') + def test_tpm(self, mock_utils_logger): + self._test_invalid_calls( + 'tpm', + expected_columns='TPM, gene_id, project, sample_id, tissue', + file_data=[ + 'sample_id\tproject\tindividual_id\tgene_id\tTPM\ttissue\n', + 'NA19675_D2\t1kg project nåme with uniçøde\t\tENSG00000240361\t12.6\t\n', + 'NA19678_D1\t1kg project nåme with uniçøde\t\tENSG00000233750\t 6.04\twhole_blood\n', + 'GTEX-001\t1kg project nåme with uniçøde\t\tENSG00000240361\t3.1\tinvalid\n', + 'NA19677\t1kg project nåme with uniçøde\t\tENSG00000233750\t5.31\tmuscle\n', + 'GTEX-001\t1kg project nåme with uniçøde\t\tENSG00000233750\t7.8\tmuscle\n', + 'NA19678\tTest Reprocessed Project\t\tENSG00000240361\t0.2\twhole_blood\n', + ], + unmatched_samples='NA19677, NA19678, NA19678_D1', + additional_errors=['Samples missing required "tissue": NA19675_D2'], + ) + + self.mock_gzip_file_iter.return_value = [ + self.mock_gzip_file_iter.return_value[0], 'NA19678_D1\t1kg project nåme with uniçøde\tNA19678\tENSG00000233750\t 6.04\twhole_blood\n', - ] + mock_gzip_file.__iter__.return_value[2:] + ] + self.mock_gzip_file_iter.return_value[2:] call_command('load_rna_seq', 'tpm', RNA_FILE_ID, '--ignore-extra-samples') # Existing outlier data should be unchanged self.assertEqual(RnaSeqOutlier.objects.count(), 3) # Test database models - existing_sample = Sample.objects.get(individual_id=1, sample_type='RNA', tissue_type='M') + existing_sample = self._assert_expected_existing_sample('muscle_samples.tsv.gz') existing_rna_samples = Sample.objects.filter(sample_type='RNA', rnaseqtpm__isnull=False) - self.assertEqual(existing_sample.guid, EXISTING_SAMPLE_GUID) - self.assertEqual(existing_sample.sample_id, 'NA19675_D2') - self.assertTrue(existing_sample.is_active) - self.assertIsNone(existing_sample.elasticsearch_index) - self.assertEqual(existing_sample.data_source, 'muscle_samples.tsv.gz') - self.assertEqual(existing_sample.tissue_type, 'M') new_sample = Sample.objects.get(individual_id=2, sample_type='RNA') self.assertEqual(new_sample.sample_id, 'NA19678_D1') @@ -79,67 +102,50 @@ def test_tpm(self, mock_open, mock_logger, mock_utils_logger, mock_gzip_open): self.assertEqual(models.filter(sample=existing_sample, gene_id='ENSG00000240361').count(), 0) self.assertEqual(models.get(sample=new_sample, gene_id='ENSG00000233750').tpm, 6.04) - mock_logger.info.assert_has_calls([ + self.mock_logger.info.assert_has_calls([ mock.call('create 1 RnaSeqTpm for NA19678_D1'), + mock.call('DONE'), ]) mock_utils_logger.warning.assert_has_calls([ mock.call('Skipped loading for the following 2 unmatched samples: NA19677, NA19678', None), ]) # Test a new sample created for a mismatched tissue and a row with 0.0 tpm - mock_gzip_file.__iter__.return_value[1] = 'NA19678_D1\t1kg project nåme with uniçøde\tNA19678\tENSG00000233750\t0.0\tfibroblasts\n' + self.mock_gzip_file_iter.return_value[1] = 'NA19678_D1\t1kg project nåme with uniçøde\tNA19678\tENSG00000233750\t0.0\tfibroblasts\n' call_command('load_rna_seq', 'tpm', 'new_file.tsv.gz', '--ignore-extra-samples') models = RnaSeqTpm.objects.select_related('sample').filter(sample__sample_id='NA19678_D1') self.assertEqual(models.count(), 2) self.assertSetEqual(set(models.values_list('sample__tissue_type', flat=True)), {'F', 'WB'}) self.assertEqual(models.get(gene_id='ENSG00000233750', sample__tissue_type='F').tpm, 0.0) self.assertEqual(models.values('sample').distinct().count(), 2) - mock_logger.info.assert_has_calls([ + self.mock_logger.info.assert_has_calls([ mock.call('create 1 RnaSeqTpm for NA19678_D1'), mock.call('DONE'), ]) - @mock.patch('seqr.management.commands.load_rna_seq.logger.info') - @mock.patch('seqr.management.commands.load_rna_seq.open') - @mock.patch('seqr.utils.file_utils.gzip.open') - def test_outlier(self, mock_gzip_open, mock_open, mock_logger): - mock_gzip_file = mock_gzip_open.return_value.__enter__.return_value - mock_gzip_file.__iter__.return_value = ['invalid\theader'] - - with self.assertRaises(ValueError) as e: - call_command('load_rna_seq', 'outlier', RNA_FILE_ID) - self.assertEqual(str(e.exception), - 'Invalid file: missing column(s): geneID, pValue, padjust, project, sampleID, tissue, zScore') - - mock_gzip_file.__iter__.return_value = [ - 'sampleID\tproject\tgeneID\tdetail\tpValue\tpadjust\tzScore\ttissue\n', - 'NA19675_D2\t1kg project nåme with uniçøde\tENSG00000240361\tdetail1\t0.01\t0.13\t-3.1\tmuscle\n', - 'NA19675_D2\t1kg project nåme with uniçøde\tENSG00000240361\tdetail2\t0.01\t0.13\t-3.1\tmuscle\n', - 'NA19675_D2\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n', - 'NA19675_D3\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n', - 'NA19675_D4\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n', - ] - mock_open.return_value.__enter__.return_value.__iter__.return_value = ['NA19675_D4\tNA19678'] - - with self.assertRaises(ErrorsWarningsException) as e: - call_command('load_rna_seq', 'outlier', RNA_FILE_ID) - self.assertEqual(e.exception.errors, - ['Unable to find matches for the following samples: NA19675_D3, NA19675_D4']) - + def test_outlier(self): + self._test_invalid_calls( + 'outlier', + expected_columns='geneID, pValue, padjust, project, sampleID, tissue, zScore', + file_data=[ + 'sampleID\tproject\tgeneID\tdetail\tpValue\tpadjust\tzScore\ttissue\n', + 'NA19675_D2\t1kg project nåme with uniçøde\tENSG00000240361\tdetail1\t0.01\t0.13\t-3.1\tmuscle\n', + 'NA19675_D2\t1kg project nåme with uniçøde\tENSG00000240361\tdetail2\t0.01\t0.13\t-3.1\tmuscle\n', + 'NA19675_D2\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n', + 'NA19675_D3\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n', + 'NA19675_D4\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n', + ], + unmatched_samples='NA19675_D3, NA19675_D4', + ) + + self.mock_open.return_value.__enter__.return_value.__iter__.return_value = ['NA19675_D4\tNA19678'] with self.assertRaises(ErrorsWarningsException) as e: call_command('load_rna_seq', 'outlier', RNA_FILE_ID, '--mapping-file', 'map.tsv') self.assertEqual(e.exception.errors, ['Unable to find matches for the following samples: NA19675_D3']) call_command('load_rna_seq', 'outlier', RNA_FILE_ID, '--ignore-extra-samples') - rna_samples = Sample.objects.filter(individual_id=1, sample_id='NA19675_D2', sample_type='RNA') - self.assertEqual(len(rna_samples), 1) - sample = rna_samples.first() - self.assertEqual(sample.guid, EXISTING_SAMPLE_GUID) - self.assertTrue(sample.is_active) - self.assertIsNone(sample.elasticsearch_index) - #self.assertEqual(sample.data_source, 'new_muscle_samples.tsv.gz') TODO? - self.assertEqual(sample.tissue_type, 'M') + sample = self._assert_expected_existing_sample('all_tissue_tpms.tsv.gz') models = RnaSeqOutlier.objects.all() self.assertEqual(models.count(), 2) @@ -147,7 +153,7 @@ def test_outlier(self, mock_gzip_open, mock_open, mock_logger): self.assertListEqual(list(models.values_list('gene_id', 'p_adjust', 'p_value', 'z_score')), [ ('ENSG00000240361', 0.13, 0.01, -3.1), ('ENSG00000233750', 0.0000057, 0.064, 7.8), ]) - mock_logger.assert_has_calls([ + self.mock_logger.info.assert_has_calls([ mock.call('create 2 RnaSeqOutlier for NA19675_D2'), mock.call('DONE'), ])