diff --git a/hail_search/web_app.py b/hail_search/web_app.py index fc274d2c31..83efad67af 100644 --- a/hail_search/web_app.py +++ b/hail_search/web_app.py @@ -47,8 +47,12 @@ def hl_json_dumps(obj): return json.dumps(obj, default=_hl_json_default) async def sync_to_async_hail_query(request: web.Request, query: Callable, *args, timeout_s=QUERY_TIMEOUT_S, **kwargs): + request_body = None + if request.body_exists: + request_body = await request.json() + loop = asyncio.get_running_loop() - future = loop.run_in_executor(request.app.pool, functools.partial(query, await request.json(), *args, **kwargs)) + future = loop.run_in_executor(request.app.pool, functools.partial(query, request_body, *args, **kwargs)) try: return await asyncio.wait_for(future, timeout_s) except asyncio.TimeoutError: @@ -94,6 +98,8 @@ async def multi_lookup(request: web.Request) -> web.Response: async def status(request: web.Request) -> web.Response: + # Make sure the hail backend process is still alive. + await sync_to_async_hail_query(request, lambda _: hl.eval(1 + 1)) return web.json_response({'success': True}) diff --git a/requirements-dev.txt b/requirements-dev.txt index e22f64d378..20a944635d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -22,7 +22,7 @@ click==8.1.3 # via pip-tools coverage==5.1 # via -r requirements-dev.in -django==3.2.24 +django==3.2.25 # via # -c requirements.txt # django-appconf diff --git a/requirements.txt b/requirements.txt index 774e584491..453c702f08 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,7 +26,7 @@ defusedxml==0.7.1 # via # python3-openid # social-auth-core -django==3.2.24 +django==3.2.25 # via # -r requirements.in # django-anymail diff --git a/seqr/management/commands/load_rna_seq.py b/seqr/management/commands/load_rna_seq.py index 8aff327956..fa00715bb3 100644 --- a/seqr/management/commands/load_rna_seq.py +++ b/seqr/management/commands/load_rna_seq.py @@ -1,9 +1,10 @@ import logging +from collections import defaultdict from django.core.management.base import BaseCommand from seqr.models import Sample from seqr.views.utils.file_utils import parse_file -from seqr.views.utils.dataset_utils import load_rna_seq, RNA_DATA_TYPE_CONFIGS +from seqr.views.utils.dataset_utils import load_rna_seq, post_process_rna_data, RNA_DATA_TYPE_CONFIGS logger = logging.getLogger(__name__) @@ -24,18 +25,37 @@ def handle(self, *args, **options): mapping_file = parse_file(options['mapping_file'], f) data_type = options['data_type'] - self.model_cls = RNA_DATA_TYPE_CONFIGS[data_type]['model_class'] + model_cls = RNA_DATA_TYPE_CONFIGS[data_type]['model_class'] - sample_guids, _, _ = load_rna_seq( - data_type, options['input_file'], self._save_sample_data, lambda *args: {}, create_models_before_save=True, + sample_data_by_guid = defaultdict(list) + + def _save_sample_data(sample_guid, row): + sample_data_by_guid[sample_guid].append(row) + + possible_sample_guids, _, _ = load_rna_seq( + data_type, options['input_file'], _save_sample_data, mapping_file=mapping_file, ignore_extra_samples=options['ignore_extra_samples']) + sample_models_by_guid = { + s['guid']: s for s in Sample.objects.filter(guid__in=sample_data_by_guid).values('guid', 'id', 'sample_id') + } + errors = [] + sample_guids = [] + for sample_guid in possible_sample_guids: + data_rows, error = post_process_rna_data(sample_guid, sample_data_by_guid[sample_guid]) + if error: + errors.append(error) + continue + + sample_guids.append(sample_guid) + sample_model = sample_models_by_guid[sample_guid] + models = model_cls.objects.bulk_create( + [model_cls(sample_id=sample_model['id'], **data) for data in data_rows], batch_size=1000) + logger.info(f'create {len(models)} {model_cls.__name__} for {sample_model["sample_id"]}') + Sample.bulk_update(user=None, update_json={'is_active': True}, guid__in=sample_guids) - logger.info('DONE') + for error in errors: + logger.info(error) - def _save_sample_data(self, sample_guid, data_by_gene): - sample = Sample.objects.get(guid=sample_guid) - models = self.model_cls.objects.bulk_create( - [self.model_cls(sample=sample, **data) for data in data_by_gene.values()], batch_size=1000) - logger.info(f'create {len(models)} {self.model_cls.__name__} for {sample.sample_id}') + logger.info('DONE') diff --git a/seqr/management/tests/load_rna_seq_tests.py b/seqr/management/tests/load_rna_seq_tests.py index 6353809f5a..2b95be2185 100644 --- a/seqr/management/tests/load_rna_seq_tests.py +++ b/seqr/management/tests/load_rna_seq_tests.py @@ -68,12 +68,10 @@ def test_tpm(self, mock_utils_logger): 'NA19675_D2\t1kg project nåme with uniçøde\t\tENSG00000240361\t12.6\t\n', 'NA19675_D2\t1kg project nåme with uniçøde\t\tENSG00000233750\t1.26\t\n', 'NA19678_D1\t1kg project nåme with uniçøde\t\tENSG00000233750\t 6.04\twhole_blood\n', - 'GTEX-001\t1kg project nåme with uniçøde\t\tENSG00000240361\t3.1\tinvalid\n', 'NA19677\t1kg project nåme with uniçøde\t\tENSG00000233750\t5.31\tmuscle\n', - 'GTEX-001\t1kg project nåme with uniçøde\t\tENSG00000233750\t7.8\tmuscle\n', 'NA19678\tTest Reprocessed Project\t\tENSG00000240361\t0.2\twhole_blood\n', ], - unmatched_samples='NA19677, NA19678, NA19678_D1', + unmatched_samples='NA19677 (1kg project nåme with uniçøde), NA19678 (Test Reprocessed Project), NA19678_D1 (1kg project nåme with uniçøde)', additional_errors=['Samples missing required "tissue": NA19675_D2'], ) @@ -108,7 +106,7 @@ def test_tpm(self, mock_utils_logger): mock.call('DONE'), ]) mock_utils_logger.warning.assert_has_calls([ - mock.call('Skipped loading for the following 2 unmatched samples: NA19677, NA19678', None), + mock.call('Skipped loading for the following 2 unmatched samples: NA19677 (1kg project nåme with uniçøde), NA19678 (Test Reprocessed Project)', None), ]) # Test a new sample created for a mismatched tissue and a row with 0.0 tpm @@ -136,13 +134,13 @@ def test_outlier(self): 'NA19675_D3\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n', 'NA19675_D4\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n', ], - unmatched_samples='NA19675_D3, NA19675_D4', + unmatched_samples='NA19675_D3 (1kg project nåme with uniçøde), NA19675_D4 (1kg project nåme with uniçøde)', ) self.mock_open.return_value.__enter__.return_value.__iter__.return_value = ['NA19675_D4\tNA19678'] with self.assertRaises(ErrorsWarningsException) as e: call_command('load_rna_seq', 'outlier', RNA_FILE_ID, '--mapping-file', 'map.tsv') - self.assertEqual(e.exception.errors, ['Unable to find matches for the following samples: NA19675_D3']) + self.assertEqual(e.exception.errors, ['Unable to find matches for the following samples: NA19675_D3 (1kg project nåme with uniçøde)']) call_command('load_rna_seq', 'outlier', RNA_FILE_ID, '--ignore-extra-samples') diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index 3e1739435a..a9706a8e02 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -20,7 +20,8 @@ from seqr.utils.vcf_utils import validate_vcf_exists from seqr.views.utils.airflow_utils import trigger_data_loading, write_data_loading_pedigree -from seqr.views.utils.dataset_utils import load_rna_seq, load_phenotype_prioritization_data_file, RNA_DATA_TYPE_CONFIGS +from seqr.views.utils.dataset_utils import load_rna_seq, load_phenotype_prioritization_data_file, RNA_DATA_TYPE_CONFIGS, \ + post_process_rna_data from seqr.views.utils.file_utils import parse_file, get_temp_upload_directory, load_uploaded_file from seqr.views.utils.json_utils import create_json_response from seqr.views.utils.json_to_orm_utils import update_model_from_json @@ -272,14 +273,17 @@ def update_rna_seq(request): file_name_prefix = f'rna_sample_data__{data_type}__{datetime.now().isoformat()}' + sample_files = {} + def _save_sample_data(sample_guid, sample_data): - file_name = os.path.join(get_temp_upload_directory(), _get_sample_file_name(file_name_prefix, sample_guid)) - with gzip.open(file_name, 'wt') as f: - json.dump(sample_data, f) + if sample_guid not in sample_files: + file_name = os.path.join(get_temp_upload_directory(), _get_sample_file_name(file_name_prefix, sample_guid)) + sample_files[sample_guid] = gzip.open(file_name, 'at') + sample_files[sample_guid].write(f'{json.dumps(sample_data)}\n') try: sample_guids, info, warnings = load_rna_seq( - data_type, file_path, _save_sample_data, lambda sample_guid: _load_saved_sample_data(file_name_prefix, sample_guid), + data_type, file_path, _save_sample_data, user=request.user, mapping_file=mapping_file, ignore_extra_samples=request_json.get('ignoreExtraSamples')) except ValueError as e: return create_json_response({'error': str(e)}, status=400) @@ -300,7 +304,7 @@ def _load_saved_sample_data(file_name_prefix, sample_guid): file_name = os.path.join(get_temp_upload_directory(), _get_sample_file_name(file_name_prefix, sample_guid)) if os.path.exists(file_name): with gzip.open(file_name, 'rt') as f: - return json.load(f) + return [json.loads(line) for line in f.readlines()] return None @@ -312,10 +316,15 @@ def load_rna_seq_sample_data(request, sample_guid): request_json = json.loads(request.body) file_name = request_json['fileName'] data_type = request_json['dataType'] - data_by_gene = _load_saved_sample_data(file_name, sample_guid) + config = RNA_DATA_TYPE_CONFIGS[data_type] + + data_rows = _load_saved_sample_data(file_name, sample_guid) + data_rows, error = post_process_rna_data(sample_guid, data_rows, **config.get('post_process_kwargs', {})) + if error: + return create_json_response({'error': error}, status=400) - model_cls = RNA_DATA_TYPE_CONFIGS[data_type]['model_class'] - model_cls.bulk_create(request.user, [model_cls(sample=sample, **data) for data in data_by_gene.values()]) + model_cls = config['model_class'] + model_cls.bulk_create(request.user, [model_cls(sample=sample, **data) for data in data_rows]) update_model_from_json(sample, {'is_active': True}, user=request.user) return create_json_response({'success': True}) diff --git a/seqr/views/apis/data_manager_api_tests.py b/seqr/views/apis/data_manager_api_tests.py index 71eb3ccf4d..1fbadfd7ef 100644 --- a/seqr/views/apis/data_manager_api_tests.py +++ b/seqr/views/apis/data_manager_api_tests.py @@ -275,44 +275,42 @@ RNA_SPLICE_SAMPLE_GUID = 'S000151_na19675_1' PLACEHOLDER_GUID = 'S0000100' RNA_FILE_ID = 'gs://rna_data/new_muscle_samples.tsv.gz' -SAMPLE_GENE_OUTLIER_DATA = { - 'ENSG00000240361': {'gene_id': 'ENSG00000240361', 'p_value': '0.01', 'p_adjust': '0.13', 'z_score': '-3.1'}, - 'ENSG00000233750': {'gene_id': 'ENSG00000233750', 'p_value': '0.064', 'p_adjust': '0.0000057', 'z_score': '7.8'}, -} -SAMPLE_GENE_TPM_DATA = { - 'ENSG00000240361': {'gene_id': 'ENSG00000240361', 'tpm': '7.8'}, - 'ENSG00000233750': {'gene_id': 'ENSG00000233750', 'tpm': '0.0'}, -} -SAMPLE_GENE_SPLICE_DATA = { - 'ENSG00000233750-2-167254166-167258349-*-psi3': { +SAMPLE_GENE_OUTLIER_DATA = [ + {'gene_id': 'ENSG00000240361', 'p_value': '0.01', 'p_adjust': '0.13', 'z_score': '-3.1'}, + {'gene_id': 'ENSG00000233750', 'p_value': '0.064', 'p_adjust': '0.0000057', 'z_score': '7.8'}, +] +SAMPLE_GENE_TPM_DATA = [ + {'gene_id': 'ENSG00000240361', 'tpm': '7.8'}, + {'gene_id': 'ENSG00000233750', 'tpm': '0.0'}, +] +SAMPLE_GENE_SPLICE_DATA = [ + { 'chrom': '2', 'start': 167254166, 'end': 167258349, 'strand': '*', 'type': 'psi3', 'p_value': 1.56e-25, 'z_score': -4.9, 'delta_psi': -0.46, 'read_count': 166, 'gene_id': 'ENSG00000233750', - 'rare_disease_samples_with_junction': 1, 'rare_disease_samples_total': 20, 'rank': 1, + 'rare_disease_samples_with_junction': 1, 'rare_disease_samples_total': 20, }, - 'ENSG00000240361-7-132885746-132975168-*-psi5': { + { 'chrom': '7', 'start': 132885746, 'end': 132975168, 'strand': '*', 'type': 'psi5', 'p_value': 1.08e-56, 'z_score': -6.53, 'delta_psi': -0.85, 'read_count': 231, 'gene_id': 'ENSG00000240361', - 'rare_disease_samples_with_junction': 1, 'rare_disease_samples_total': 20, 'rank': 0, + 'rare_disease_samples_with_junction': 1, 'rare_disease_samples_total': 20, }, -} +] SAMPLE_GENE_SPLICE_DATA2 = { - '-2-167258096-167258349-*-psi3': { 'chrom': '2', 'start': 167258096, 'end': 167258349, 'strand': '*', 'type': 'psi3', 'p_value': 1.56e-25, 'z_score': 6.33, 'delta_psi': 0.45, 'read_count': 143, 'gene_id': '', - 'rare_disease_samples_with_junction': 1, 'rare_disease_samples_total': 20, 'rank': 0, + 'rare_disease_samples_with_junction': 1, 'rare_disease_samples_total': 20, } -} RNA_OUTLIER_SAMPLE_DATA = { - RNA_MUSCLE_SAMPLE_GUID: json.dumps(SAMPLE_GENE_OUTLIER_DATA), - PLACEHOLDER_GUID: json.dumps({'ENSG00000240361': {'gene_id': 'ENSG00000240361', 'p_value': '0.04', 'p_adjust': '0.112', 'z_score': '1.9'}}), + RNA_MUSCLE_SAMPLE_GUID: '\n'.join([json.dumps(row) for row in SAMPLE_GENE_OUTLIER_DATA]) + '\n', + PLACEHOLDER_GUID: json.dumps({'gene_id': 'ENSG00000240361', 'p_value': '0.04', 'p_adjust': '0.112', 'z_score': '1.9'}) + '\n', } RNA_TPM_SAMPLE_DATA = { - RNA_MUSCLE_SAMPLE_GUID: json.dumps(SAMPLE_GENE_TPM_DATA), - PLACEHOLDER_GUID: json.dumps({'ENSG00000240361': {'gene_id': 'ENSG00000240361', 'tpm': '0.112'}}), + RNA_MUSCLE_SAMPLE_GUID: '\n'.join([json.dumps(row) for row in SAMPLE_GENE_TPM_DATA]) + '\n', + PLACEHOLDER_GUID: json.dumps({'gene_id': 'ENSG00000240361', 'tpm': '0.112'}) + '\n', } RNA_SPLICE_SAMPLE_DATA = { - RNA_SPLICE_SAMPLE_GUID: json.dumps(SAMPLE_GENE_SPLICE_DATA), - PLACEHOLDER_GUID: json.dumps(SAMPLE_GENE_SPLICE_DATA2), + RNA_SPLICE_SAMPLE_GUID: '\n'.join([json.dumps(row) for row in SAMPLE_GENE_SPLICE_DATA]) + '\n', + PLACEHOLDER_GUID: json.dumps(SAMPLE_GENE_SPLICE_DATA2) + '\n', } RNA_FILENAME_TEMPLATE = 'rna_sample_data__{}__2020-04-15T00:00:00' @@ -670,8 +668,8 @@ def test_kibana_proxy(self): ['NA20870', '1kg project nåme with uniçøde', 'ENSG00000240361', 'fibroblasts', 'detail2', 0.01, 0.13, -3.1], ], 'write_data': { - '{"ENSG00000233750": {"gene_id": "ENSG00000233750", "p_value": "0.064", "p_adjust": "0.0000057", "z_score": "7.8"}}', - '{"ENSG00000240361": {"gene_id": "ENSG00000240361", "p_value": "0.01", "p_adjust": "0.13", "z_score": "-3.1"}}' + '{"gene_id": "ENSG00000233750", "p_value": "0.064", "p_adjust": "0.0000057", "z_score": "7.8"}\n', + '{"gene_id": "ENSG00000240361", "p_value": "0.01", "p_adjust": "0.13", "z_score": "-3.1"}\n' }, 'new_data': [ ['NA19675_D2', '1kg project nåme with uniçøde', 'ENSG00000240361', 'muscle', 'detail1', 0.01, 0.13, -3.1], @@ -680,7 +678,7 @@ def test_kibana_proxy(self): ['NA19675_D3', 'Test Reprocessed Project', 'ENSG00000233750', 'muscle', 'detail1', 0.064, '0.0000057', 7.8], ['NA20888', 'Test Reprocessed Project', 'ENSG00000240361', 'muscle', '', 0.04, 0.112, 1.9], ], - 'skipped_samples': 'NA19675_D3', + 'skipped_samples': 'NA19675_D3 (Test Reprocessed Project)', 'sample_tissue_type': 'M', 'num_parsed_samples': 3, 'initial_model_count': 3, @@ -702,22 +700,20 @@ def test_kibana_proxy(self): ['NA20870', 'Test Reprocessed Project', 'ENSG00000240361', 'NA20870', 'muscle', 7.8], ['NA20870', '1kg project nåme with uniçøde', 'ENSG00000233750', 'NA20870', 'fibroblasts', 0.0], ], - 'write_data': {'{"ENSG00000240361": {"gene_id": "ENSG00000240361", "tpm": "7.8"}}', - '{"ENSG00000233750": {"gene_id": "ENSG00000233750", "tpm": "0.0"}}'}, + 'write_data': {'{"gene_id": "ENSG00000240361", "tpm": "7.8"}\n', + '{"gene_id": "ENSG00000233750", "tpm": "0.0"}\n'}, 'new_data': [ # existing sample NA19675_D2 ['NA19675_D2', '1kg project nåme with uniçøde', 'ENSG00000240361', 'NA19675_D2', 'muscle', 7.8], ['NA19675_D2', '1kg project nåme with uniçøde', 'ENSG00000233750', 'NA19675_D2', 'muscle', 0.0], # no matched individual NA19675_D3 ['NA19675_D3', '1kg project nåme with uniçøde', 'ENSG00000233750', 'NA19675_D3', 'fibroblasts', 0.064], - # skip GTEX samples - ['GTEX_001', '1kg project nåme with uniçøde', 'ENSG00000233750', 'NA19675_D3', 'whole_blood', 1.95], # a different project sample NA20888 ['NA20888', 'Test Reprocessed Project', 'ENSG00000240361', 'NA20888', 'muscle', 0.112], # a project mismatched sample NA20878 ['NA20878', 'Test Reprocessed Project', 'ENSG00000233750', 'NA20878', 'fibroblasts', 0.064], ], - 'skipped_samples': 'NA19675_D3, NA20878', + 'skipped_samples': 'NA19675_D3 (1kg project nåme with uniçøde), NA20878 (Test Reprocessed Project)', 'sample_tissue_type': 'M', 'num_parsed_samples': 4, 'initial_model_count': 4, @@ -726,6 +722,7 @@ def test_kibana_proxy(self): 'get_models_json': lambda models: list(models.values_list('gene_id', 'tpm')), 'expected_models_json': [('ENSG00000240361', 7.8), ('ENSG00000233750', 0.0)], 'sample_guid': RNA_MUSCLE_SAMPLE_GUID, + 'mismatch_field': 'tpm', }, 'splice_outlier': { 'model_cls': RnaSeqSpliceOutlier, @@ -744,14 +741,14 @@ def test_kibana_proxy(self): ['NA20870', '1kg project nåme with uniçøde', 'ENSG00000135953', 'chr2', 167258096, 167258349, '*', 'XIRP2', 'psi3', 1.56E-25, 6.33, 0.45, 143, 'muscle', 0.03454739, 1, 20], ], - 'write_data': {'{"ENSG00000233750-2-167258096-167258349-*-psi3": {"chrom": "2", "start": 167258096,' + 'write_data': {'{"chrom": "2", "start": 167258096,' ' "end": 167258349, "strand": "*", "type": "psi3", "p_value": 1.56e-25, "z_score": 6.33,' ' "delta_psi": 0.45, "read_count": 143, "gene_id": "ENSG00000233750",' - ' "rare_disease_samples_with_junction": 1, "rare_disease_samples_total": 20, "rank": 0}}', - '{"ENSG00000135953-2-167258096-167258349-*-psi3": {"chrom": "2", "start": 167258096,' + ' "rare_disease_samples_with_junction": 1, "rare_disease_samples_total": 20}\n', + '{"chrom": "2", "start": 167258096,' ' "end": 167258349, "strand": "*", "type": "psi3", "p_value": 1.56e-25, "z_score": 6.33,' ' "delta_psi": 0.45, "read_count": 143, "gene_id": "ENSG00000135953",' - ' "rare_disease_samples_with_junction": 1, "rare_disease_samples_total": 20, "rank": 0}}', + ' "rare_disease_samples_with_junction": 1, "rare_disease_samples_total": 20}\n', }, 'new_data': [ # existing sample NA19675_1 @@ -769,7 +766,7 @@ def test_kibana_proxy(self): ['NA20878', 'Test Reprocessed Project', 'ENSG00000233750', 'chr2', 167258096, 167258349, '*', 'XIRP2', 'psi3', 1.56E-25, 6.33, 0.45, 143, 'fibroblasts', 0.03454739, 1, 20], ], - 'skipped_samples': 'NA19675_D3, NA20878', + 'skipped_samples': 'NA19675_D3 (1kg project nåme with uniçøde), NA20878 (Test Reprocessed Project)', 'sample_tissue_type': 'F', 'num_parsed_samples': 4, 'initial_model_count': 7, @@ -778,13 +775,13 @@ def test_kibana_proxy(self): 'allow_missing_gene': True, 'get_models_json': lambda models: list( models.values_list('gene_id', 'chrom', 'start', 'end', 'strand', 'type', 'p_value', 'z_score', 'delta_psi', - 'read_count', 'rare_disease_samples_with_junction', 'rare_disease_samples_total')), + 'read_count', 'rare_disease_samples_with_junction', 'rare_disease_samples_total', 'rank')), 'expected_models_json': [ - ('ENSG00000233750', '2', 167254166, 167258349, '*', 'psi3', 1.56e-25, -4.9, -0.46, 166, 1, 20), - ('ENSG00000240361', '7', 132885746, 132975168, '*', 'psi5', 1.08e-56, -6.53, -0.85, 231, 1, 20) + ('ENSG00000233750', '2', 167254166, 167258349, '*', 'psi3', 1.56e-25, -4.9, -0.46, 166, 1, 20, 1), + ('ENSG00000240361', '7', 132885746, 132975168, '*', 'psi5', 1.08e-56, -6.53, -0.85, 231, 1, 20, 0) ], 'sample_guid': RNA_SPLICE_SAMPLE_GUID, - 'row_id': 'ENSG00000240361-7-132885746-132886973-*-psi5', + 'row_id': 'ENSG00000233750-2-167254166-167258349-*-psi3', }, } @@ -876,20 +873,11 @@ def _set_file_iter_stdout(rows): f'{", ".join(sorted([col for col in header if col not in params["optional_headers"]]))}', }) - mismatch_row = loaded_data_row[:-1] + [loaded_data_row[-1] - 2] - _set_file_iter_stdout([header, loaded_data_row, loaded_data_row, mismatch_row]) - response = self.client.post(url, content_type='application/json', data=json.dumps(body)) - self.assertEqual(response.status_code, 400) - self.assertDictEqual(response.json(), { - 'errors': [f'Error in {loaded_data_row[0]}: mismatched entries for {params.get("row_id", mismatch_row[2])}'], - 'warnings': None, - }) - missing_sample_row = ['NA19675_D3'] + loaded_data_row[1:] _set_file_iter_stdout([header, loaded_data_row, missing_sample_row]) response = self.client.post(url, content_type='application/json', data=json.dumps(body)) self.assertEqual(response.status_code, 400) - self.assertDictEqual(response.json(), {'errors': ['Unable to find matches for the following samples: NA19675_D3'], 'warnings': None}) + self.assertDictEqual(response.json(), {'errors': ['Unable to find matches for the following samples: NA19675_D3 (1kg project nåme with uniçøde)'], 'warnings': None}) unknown_gene_id_row1 = loaded_data_row[:2] + ['NOT_A_GENE_ID1'] + loaded_data_row[3:] unknown_gene_id_row2 = loaded_data_row[:2] + ['NOT_A_GENE_ID2'] + loaded_data_row[3:] @@ -979,7 +967,7 @@ def _test_basic_data_loading(data, num_parsed_samples, num_loaded_samples, new_s 'parentEntityIds': [params['sample_guid']], 'updateType': 'bulk_delete'}}), ('update 1 Samples', {'dbUpdate': { 'dbEntity': 'Sample', 'entityIds': [params['sample_guid']], - 'updateType': 'bulk_update', 'updateFields': ['data_source']}}), + 'updateType': 'bulk_update', 'updateFields': ['data_source', 'is_active']}}), ]) self.assertTrue(params['sample_guid'] in response_json['sampleGuids']) self.assertEqual(mock_send_slack.call_count, 2) @@ -997,7 +985,7 @@ def _test_basic_data_loading(data, num_parsed_samples, num_loaded_samples, new_s # test database models are correct self.assertEqual(model_cls.objects.count(), params['initial_model_count'] - deleted_count) sample_guid = self._check_rna_sample_model(individual_id=1, data_source='new_muscle_samples.tsv.gz', - tissue_type=params.get('sample_tissue_type')) + tissue_type=params.get('sample_tissue_type'), is_active_sample=False) self.assertSetEqual(set(response_json['sampleGuids']), {sample_guid, new_sample_guid}) # test correct file interactions @@ -1007,9 +995,9 @@ def _test_basic_data_loading(data, num_parsed_samples, num_loaded_samples, new_s f'{RNA_FILENAME_TEMPLATE.format(data_type)}__{new_sample_guid if sample_guid == PLACEHOLDER_GUID else sample_guid}.json.gz': data for sample_guid, data in params['parsed_file_data'].items() } - mock_open.assert_has_calls([mock.call(filename, 'wt') for filename in expected_files]) + mock_open.assert_has_calls([mock.call(filename, 'at') for filename in expected_files]) self.assertEqual( - ''.join([call.args[0] for call in mock_files[filename].__enter__.return_value.write.call_args_list]), + ''.join([call.args[0] for call in mock_files[filename].write.call_args_list]), expected_files[filename], ) @@ -1025,7 +1013,7 @@ def _test_basic_data_loading(data, num_parsed_samples, num_loaded_samples, new_s num_created_samples=2) self.assertSetEqual( - {''.join([call.args[0] for call in mock_file.__enter__.return_value.write.call_args_list]) for mock_file in mock_files.values()}, + {''.join([call.args[0] for call in mock_file.write.call_args_list]) for mock_file in mock_files.values()}, params['write_data'], ) @@ -1042,11 +1030,11 @@ def _test_basic_data_loading(data, num_parsed_samples, num_loaded_samples, new_s self.assertTrue(second_tissue_sample_guid != new_sample_guid) self.assertTrue(second_tissue_sample_guid in response_json['sampleGuids']) mock_open.assert_has_calls([ - mock.call(f'{RNA_FILENAME_TEMPLATE.format(data_type)}__{sample_guid}.json.gz', 'wt') + mock.call(f'{RNA_FILENAME_TEMPLATE.format(data_type)}__{sample_guid}.json.gz', 'at') for sample_guid in response_json['sampleGuids'] ]) self.assertSetEqual( - {''.join([call.args[0] for call in mock_file.__enter__.return_value.write.call_args_list]) for mock_file in mock_files.values()}, + {''.join([call.args[0] for call in mock_file.write.call_args_list]) for mock_file in mock_files.values()}, params['write_data'], ) @@ -1066,12 +1054,12 @@ def test_load_rna_seq_sample_data(self, mock_open, mock_os): model_cls = params['model_cls'] model_cls.objects.all().delete() self.reset_logs() - mock_open.return_value.__enter__.return_value.read.return_value = params['parsed_file_data'][sample_guid] + parsed_file_lines = params['parsed_file_data'][sample_guid].strip().split('\n') + mock_open.return_value.__enter__.return_value.readlines.return_value = parsed_file_lines file_name = RNA_FILENAME_TEMPLATE.format(data_type) - response = self.client.post(url, content_type='application/json', data=json.dumps({ - 'fileName': file_name, 'dataType': data_type, - })) + body = {'fileName': file_name, 'dataType': data_type} + response = self.client.post(url, content_type='application/json', data=json.dumps(body)) self.assertEqual(response.status_code, 200) self.assertDictEqual(response.json(), {'success': True}) @@ -1092,6 +1080,14 @@ def test_load_rna_seq_sample_data(self, mock_open, mock_os): self.assertListEqual(list(params['get_models_json'](models)), params['expected_models_json']) + mismatch_row = {**json.loads(parsed_file_lines[0]), params.get('mismatch_field', 'p_value'): '0.05'} + mock_open.return_value.__enter__.return_value.readlines.return_value = parsed_file_lines + [json.dumps(mismatch_row)] + response = self.client.post(url, content_type='application/json', data=json.dumps(body)) + self.assertEqual(response.status_code, 400) + self.assertDictEqual(response.json(), { + 'error': f'Error in {sample_guid.split("_", 1)[-1].upper()}: mismatched entries for {params.get("row_id", mismatch_row["gene_id"])}' + }) + @classmethod def _join_data(cls, data): return ['\t'.join(line).encode('utf-8') for line in data] diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py index fceb6bfcdf..682eba561d 100644 --- a/seqr/views/apis/report_api.py +++ b/seqr/views/apis/report_api.py @@ -695,6 +695,24 @@ def _has_required_table(table, validator, tables): return tables.isdisjoint(validator) +def _is_required_col(required_validator, row): + if not required_validator: + return False + + if required_validator is True: + return True + + match = re.match(r'CONDITIONAL \(([\w+(\s)?]+) = ([\w+(\s)?]+)\)', required_validator) + if not match: + return True + + field, value = match.groups() + return row[field] == value + + + + + def _validate_column_data(column, file_name, data, column_validator, warnings, errors): data_type = column_validator.get('data_type') data_type_validator = DATA_TYPE_VALIDATORS.get(data_type) @@ -712,7 +730,7 @@ def _validate_column_data(column, file_name, data, column_validator, warnings, e for row in data: value = row.get(column) if not value: - if required: + if _is_required_col(required, row): missing.append(_get_row_id(row)) elif recommended: check_recommend_condition = WARN_MISSING_CONDITIONAL_COLUMNS.get(column) @@ -875,7 +893,6 @@ def _add_row(row, family_id, row_type): individual_data_types={i.individual_id: i.data_types for i in individuals}, add_row=_add_row, variant_json_fields=['clinvar', 'variantId'], - saved_variant_annotations={'tags': ArrayAgg('varianttag__variant_tag_type__name', distinct=True)}, mme_values={'variant_ids': ArrayAgg('matchmakersubmissiongenes__saved_variant__saved_variant_json__variantId')}, include_metadata=True, include_mondo=True, diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index 647ff3d730..55ba602a9a 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -447,7 +447,7 @@ 'participant': { 'internal_project_id': {'data_type': 'reference'}, 'prior_testing': {'data_type': 'enumeration'}, - 'proband_relationship': {'required': True}, + 'proband_relationship': {'required': 'CONDITIONAL (sex = Male)'}, 'reported_race': {'enumerations': ['Asian', 'White', 'Black']}, 'age_at_enrollment': {'data_type': 'date'} }, @@ -616,7 +616,7 @@ def test_anvil_export(self, mock_google_authenticated, mock_zip): '1_248367227_HG00731', 'HG00731', 'HG00731', 'RP11', 'Known', 'paternal', 'Homozygous', 'GRCh37', '1', '248367227', 'TC', 'T', '-', '-', '-', '-', '-', '-', '-'], discovery_file) self.assertIn([ - '21_3343353_NA19675_1', 'NA19675_1', 'NA19675', 'RP11', 'Known', 'de novo', + '21_3343353_NA19675_1', 'NA19675_1', 'NA19675', 'RP11', 'Candidate', 'de novo', 'Heterozygous', 'GRCh37', '21', '3343353', 'GAGA', 'G', 'c.375_377delTCT', 'p.Leu126del', 'ENST00000258436', '-', '-', '-', '-'], discovery_file) self.assertIn([ @@ -717,7 +717,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat ] + [ 'The following tables are required in the data model but absent from the reports: subject, dna_read_data_set', ] + [ - 'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', + 'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00732, Broad_NA19678, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881', 'The following entries have invalid values for "reported_race" in the "participant" table. Allowed values: Asian, White, Black. Invalid values: Broad_NA19675_1 (Middle Eastern or North African)', 'The following entries have invalid values for "age_at_enrollment" in the "participant" table. Allowed values have data type date. Invalid values: Broad_NA19675_1 (18)', 'The following entries have invalid values for "reference_assembly" (from Airtable) in the "aligned_dna_short_read" table. Allowed values have data type integer. Invalid values: NA20888 (GRCh38), VCGS_FAM203_621_D2 (GRCh38)', @@ -993,7 +993,7 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False): ]) self.assertIn([ 'Broad_NA19675_1_21_3343353', 'Broad_NA19675_1', '', 'SNV/INDEL', 'GRCh37', '21', '3343353', 'GAGA', 'G', '', - 'RP11', 'ENST00000258436', 'c.375_377delTCT', 'p.Leu126del', 'Heterozygous', '', 'de novo', '', '', 'Known', + 'RP11', 'ENST00000258436', 'c.375_377delTCT', 'p.Leu126del', 'Heterozygous', '', 'de novo', '', '', 'Candidate', 'Myasthenic syndrome, congenital, 8, with pre- and postsynaptic defects', 'OMIM:615120', 'Autosomal recessive|X-linked', 'Full', '', '', 'SR-ES', '', ], genetic_findings_file) @@ -1006,12 +1006,12 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False): self.assertIn([ 'Broad_NA20889_1_248367227', 'Broad_NA20889', '', 'SNV/INDEL', 'GRCh37', '1', '248367227', 'TC', 'T', '', 'OR4G11P', 'ENST00000505820', 'c.3955G>A', 'c.1586-17C>G', 'Heterozygous', '', 'unknown', - 'Broad_NA20889_1_249045487', '', 'Known', 'IRIDA syndrome', 'MONDO:0008788', 'Autosomal dominant', + 'Broad_NA20889_1_249045487', '', 'Candidate', 'IRIDA syndrome', 'MONDO:0008788', 'Autosomal dominant', 'Full', '', '', 'SR-ES', '', ], genetic_findings_file) self.assertIn([ 'Broad_NA20889_1_249045487', 'Broad_NA20889', '', 'SNV/INDEL', 'GRCh37', '1', '249045487', 'A', 'G', '', - 'OR4G11P', '', '', '', 'Heterozygous', '', 'unknown', 'Broad_NA20889_1_248367227', '', 'Known', + 'OR4G11P', '', '', '', 'Heterozygous', '', 'unknown', 'Broad_NA20889_1_248367227', '', 'Candidate', 'IRIDA syndrome', 'MONDO:0008788', 'Autosomal dominant', 'Full', '', '', 'SR-ES', '', ], genetic_findings_file) @@ -1126,7 +1126,7 @@ def test_family_metadata(self): 'consanguinity': 'Unknown', 'condition_id': 'OMIM:615123', 'known_condition_name': '', - 'condition_inheritance': '', + 'condition_inheritance': 'Unknown', }) # Test empty project @@ -1225,7 +1225,7 @@ def test_variant_metadata(self): 'family_history': 'Yes', 'gene': 'OR4G11P', 'gene_id': 'ENSG00000240361', - 'gene_known_for_phenotype': 'Known', + 'gene_known_for_phenotype': 'Candidate', 'genetic_findings_id': 'NA20889_1_248367227', 'hgvsc': 'c.3955G>A', 'hgvsp': 'c.1586-17C>G', @@ -1253,7 +1253,7 @@ def test_variant_metadata(self): 'family_history': 'Yes', 'gene': None, 'gene_id': None, - 'gene_known_for_phenotype': 'Known', + 'gene_known_for_phenotype': 'Candidate', 'genetic_findings_id': 'NA20889_1_249045487', 'participant_id': 'NA20889', 'pos': 249045487, diff --git a/seqr/views/apis/summary_data_api_tests.py b/seqr/views/apis/summary_data_api_tests.py index 62b682bc93..c1b94f1e08 100644 --- a/seqr/views/apis/summary_data_api_tests.py +++ b/seqr/views/apis/summary_data_api_tests.py @@ -37,8 +37,8 @@ "num_saved_variants": 2, "solve_status": "Unsolved", "sample_id": "NA20889", - "gene_known_for_phenotype-1": "Known", - "gene_known_for_phenotype-2": "Known", + "gene_known_for_phenotype-1": "Candidate", + "gene_known_for_phenotype-2": "Candidate", "variant_inheritance-1": "unknown", "variant_inheritance-2": "unknown", 'genetic_findings_id-1': 'NA20889_1_248367227', @@ -105,6 +105,8 @@ 'allele_balance_or_heteroplasmy_percentage-2': None, 'notes-1': None, 'notes-2': None, + 'tags-1': ['Tier 1 - Novel gene and phenotype'], + 'tags-2': ['Tier 1 - Novel gene and phenotype'], } EXPECTED_SAMPLE_METADATA_ROW = { "dbgap_submission": "No", @@ -147,6 +149,7 @@ 'alt-1': 'T', 'chrom-1': '1', 'gene_known_for_phenotype-1': 'Candidate', + 'tags-1': ['Tier 1 - Novel gene and phenotype'], 'pos-1': 248367227, 'end-1': None, 'ref-1': 'TC', diff --git a/seqr/views/utils/anvil_metadata_utils.py b/seqr/views/utils/anvil_metadata_utils.py index 800887229e..e7dc41a648 100644 --- a/seqr/views/utils/anvil_metadata_utils.py +++ b/seqr/views/utils/anvil_metadata_utils.py @@ -126,7 +126,7 @@ def parse_anvil_metadata( variant_json_fields: Iterable[str] = None, post_process_variant: Callable[[dict, list[dict]], dict] = None, include_no_individual_families: bool = False, omit_airtable: bool = False, include_metadata: bool = False, include_discovery_sample_id: bool = False, include_mondo: bool = False, include_parent_mnvs: bool = False, - proband_only_variants: bool = False, saved_variant_annotations: dict = None): + proband_only_variants: bool = False): individual_samples = individual_samples or (_get_loaded_before_date_project_individual_samples(projects, max_loaded_date) \ if max_loaded_date else _get_all_project_individual_samples(projects)) @@ -147,7 +147,6 @@ def parse_anvil_metadata( saved_variants_by_family = _get_parsed_saved_discovery_variants_by_family( list(family_data_by_id.keys()), variant_filter=variant_filter, variant_json_fields=variant_json_fields, - saved_variant_annotations=saved_variant_annotations, ) condition_map = _get_condition_map(family_data_by_id.values()) @@ -285,21 +284,14 @@ def _post_process_variant_metadata(v, gene_variants, include_parent_mnvs=False): def _get_parsed_saved_discovery_variants_by_family( - families: Iterable[Family], variant_filter: dict, variant_json_fields: list[str], saved_variant_annotations: dict, + families: Iterable[Family], variant_filter: dict, variant_json_fields: list[str], ): tag_types = VariantTagType.objects.filter(project__isnull=True, category=DISCOVERY_CATEGORY) - annotations = { - 'gene_known_for_phenotype': Case(When( - Q(family__post_discovery_omim_numbers__len=0, family__mondo_id__isnull=True), - then=Value('Candidate')), default=Value('Known') - ), - **(saved_variant_annotations or {}), - } project_saved_variants = SavedVariant.objects.filter( varianttag__variant_tag_type__in=tag_types, family__id__in=families, **(variant_filter or {}), - ).order_by('created_date').distinct().annotate(**annotations) + ).order_by('created_date').distinct().annotate(tags=ArrayAgg('varianttag__variant_tag_type__name', distinct=True)) variants = [] gene_ids = set() @@ -321,8 +313,9 @@ def _get_parsed_saved_discovery_variants_by_family( 'hgvsc': (main_transcript.get('hgvsc') or '').split(':')[-1], 'hgvsp': (main_transcript.get('hgvsp') or '').split(':')[-1], 'seqr_chosen_consequence': main_transcript.get('majorConsequence'), + 'gene_known_for_phenotype': 'Known' if 'Known gene for phenotype' in variant.tags else 'Candidate', **{k: variant_json.get(k) for k in ['genotypes', 'svType', 'svName', 'end'] + (variant_json_fields or [])}, - **{k: getattr(variant, k) for k in ['family_id', 'ref', 'alt', *annotations.keys()]}, + **{k: getattr(variant, k) for k in ['family_id', 'ref', 'alt', 'tags']}, }) genes_by_id = get_genes(gene_ids) @@ -565,5 +558,5 @@ def _format_omim_conditions(conditions): 'known_condition_name': '|'.join(sorted({o['phenotype_description'] for o in conditions if o.get('phenotype_description')})), 'condition_inheritance': '|'.join(sorted({ MIM_INHERITANCE_MAP.get(i, i) for o in conditions if o.get('phenotype_inheritance') for i in o['phenotype_inheritance'].split(', ') - })) + })) or 'Unknown', } diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 8824860637..087b327718 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -20,9 +20,6 @@ logger = SeqrLogger(__name__) -MAX_UNSAVED_DATA_PER_SAMPLE = 5000 - - def load_mapping_file(mapping_file_path, user): file_content = parse_file(mapping_file_path, file_iter(mapping_file_path, user=user)) return load_mapping_file_content(file_content) @@ -283,7 +280,7 @@ def _get_splice_id(row): def _add_splice_rank(sample_data_rows): - sorted_data_rows = sorted([data_row for data_row in sample_data_rows.values()], key=lambda d: d[P_VALUE_COL]) + sorted_data_rows = sorted([data_row for data_row in sample_data_rows], key=lambda d: d[P_VALUE_COL]) for i, data_row in enumerate(sorted_data_rows): data_row['rank'] = i @@ -297,7 +294,7 @@ def _add_splice_rank(sample_data_rows): 'tpm': { 'model_class': RnaSeqTpm, 'columns': TPM_HEADER_COLS, - 'additional_kwargs': {'should_skip': lambda row: row[SAMPLE_ID_COL].startswith('GTEX')}, + 'additional_kwargs': {}, }, 'splice_outlier': { 'model_class': RnaSeqSpliceOutlier, @@ -305,8 +302,10 @@ def _add_splice_rank(sample_data_rows): 'additional_kwargs': { 'format_fields': SPLICE_OUTLIER_FORMATTER, 'allow_missing_gene': True, - 'get_unique_key': _get_splice_id, + }, + 'post_process_kwargs': { 'post_process': _add_splice_rank, + 'get_unique_key': _get_splice_id, }, }, } @@ -330,16 +329,15 @@ def _validate_rna_header(header, column_map): def _load_rna_seq_file( - file_path, user, potential_loaded_samples, update_sample_models, save_sample_data, get_matched_sample, mismatches, - column_map, mapping_file=None, get_unique_key=None, allow_missing_gene=False, ignore_extra_samples=False, - should_skip=None, format_fields=None, + file_path, user, potential_loaded_samples, update_sample_models, save_sample_data, get_matched_sample, + column_map, mapping_file=None, allow_missing_gene=False, ignore_extra_samples=False, + format_fields=None, ): sample_id_to_individual_id_mapping = {} if mapping_file: sample_id_to_individual_id_mapping = load_mapping_file_content(mapping_file) - samples_by_guid = defaultdict(dict) f = file_iter(file_path, user=user) parsed_f = parse_file(file_path.replace('.gz', ''), f, iter_file=True) header = next(parsed_f) @@ -349,11 +347,8 @@ def _load_rna_seq_file( unmatched_samples = set() missing_required_fields = defaultdict(set) gene_ids = set() - current_sample = None for line in tqdm(parsed_f, unit=' rows'): row = dict(zip(header, line)) - if should_skip and should_skip(row): - continue row_dict = {mapped_key: row[col] for mapped_key, col in column_map.items()} for mapped_key, format_func in (format_fields or {}).items(): @@ -390,34 +385,12 @@ def _load_rna_seq_file( # If there are definite errors, do not process/save data, just continue to check for additional errors continue - if current_sample != sample_guid: - # If a large amount of data has been parsed for the previous sample, save and do not keep in memory - if len(samples_by_guid[current_sample]) > MAX_UNSAVED_DATA_PER_SAMPLE: - save_sample_data(current_sample, samples_by_guid[current_sample]) - del samples_by_guid[current_sample] - current_sample = sample_guid - - gene_or_unique_id = get_unique_key(row_dict) if get_unique_key else gene_id - existing_data = samples_by_guid[sample_guid].get(gene_or_unique_id) - if existing_data and existing_data != row_dict: - mismatches[sample_guid].add(gene_or_unique_id) - - samples_by_guid[sample_guid][gene_or_unique_id] = row_dict + save_sample_data(sample_guid, row_dict) errors, warnings = _process_rna_errors( gene_ids, missing_required_fields, unmatched_samples, ignore_extra_samples, loaded_samples, ) - if not errors: - for sample_guid, sample_data in samples_by_guid.items(): - save_sample_data(sample_guid, sample_data) - - if mismatches: - errors = [ - f'Error in {sample_guid.split("_", 1)[-1].upper()}: mismatched entries for {", ".join(mismatch_ids)}' - for sample_guid, mismatch_ids in mismatches.items() - ] + errors - if errors: raise ErrorsWarningsException(errors) @@ -441,7 +414,7 @@ def _process_rna_errors(gene_ids, missing_required_fields, unmatched_samples, ig errors.append(f'Unknown Gene IDs: {", ".join(sorted(unknown_gene_ids))}') if unmatched_samples: - unmatched_sample_ids = ', '.join(sorted([sample_key[0] for sample_key in unmatched_samples])) + unmatched_sample_ids = ', '.join(sorted({f'{sample_key[0]} ({sample_key[1]})' for sample_key in unmatched_samples})) if ignore_extra_samples: warnings.append(f'Skipped loading for the following {len(unmatched_samples)} unmatched samples: {unmatched_sample_ids}') else: @@ -453,7 +426,7 @@ def _process_rna_errors(gene_ids, missing_required_fields, unmatched_samples, ig return errors, warnings -def _load_rna_seq(model_cls, file_path, save_data, load_saved_data, *args, user=None, create_models_before_save=False, post_process=None, **kwargs): +def _load_rna_seq(model_cls, file_path, save_data, *args, user=None, **kwargs): projects = get_internal_projects() data_source = file_path.split('/')[-1].split('_-_')[-1] @@ -472,14 +445,11 @@ def _load_rna_seq(model_cls, file_path, save_data, load_saved_data, *args, user= sample_guids_to_load = set() existing_samples_by_guid = {} samples_to_create = {} - created_samples = set() - mismatches = defaultdict(set) def update_sample_models(): - remaining_samples_to_create = [s for key, s in samples_to_create.items() if key not in created_samples] - if remaining_samples_to_create: + if samples_to_create: _create_samples( - remaining_samples_to_create, + samples_to_create.values(), user=user, data_source=data_source, sample_type=Sample.SAMPLE_TYPE_RNA, @@ -496,7 +466,7 @@ def update_sample_models(): if to_delete: model_cls.bulk_delete(user, to_delete) - Sample.bulk_update(user, {'data_source': data_source}, guid__in=existing_samples_by_guid) + Sample.bulk_update(user, {'data_source': data_source, 'is_active': False}, guid__in=existing_samples_by_guid) for guid in to_delete_sample_individuals: existing_samples_by_guid[guid]['dataSource'] = data_source @@ -504,22 +474,8 @@ def save_sample_data(sample_guid, sample_data): if not sample_data: return - if create_models_before_save: - update_sample_models() - created_samples.update(samples_to_create.keys()) - - prev_data = load_saved_data(sample_guid) or {} - new_mismatches = {k for k, v in prev_data.items() if k in sample_data and v != sample_data[k]} - if new_mismatches: - mismatches[sample_guid].update(new_mismatches) - sample_data.update(prev_data) - - if post_process: - post_process(sample_data) - sample_guids_to_load.add(sample_guid) save_data(sample_guid, sample_data) - return new_mismatches def get_matched_sample(sample_key, unmatched_samples, sample_id_to_individual_id_mapping): if sample_key in potential_samples: @@ -541,7 +497,7 @@ def get_matched_sample(sample_key, unmatched_samples, sample_id_to_individual_id warnings, not_loaded_count = _load_rna_seq_file( file_path, user, potential_loaded_samples, update_sample_models, save_sample_data, get_matched_sample, - mismatches, *args, **kwargs) + *args, **kwargs) message = f'Parsed {len(sample_guids_to_load) + not_loaded_count} RNA-seq samples' info = [message] logger.info(message, user) @@ -564,6 +520,25 @@ def get_matched_sample(sample_key, unmatched_samples, sample_id_to_individual_id return sample_guids_to_load, info, warnings +def post_process_rna_data(sample_guid, data, get_unique_key=None, post_process=None): + mismatches = set() + + data_by_key = {} + for row in data: + gene_or_unique_id = get_unique_key(row) if get_unique_key else row[GENE_ID_COL] + existing_data = data_by_key.get(gene_or_unique_id) + if existing_data and existing_data != row: + mismatches.add(gene_or_unique_id) + data_by_key[gene_or_unique_id] = row + + error = f'Error in {sample_guid.split("_", 1)[-1].upper()}: mismatched entries for {", ".join(mismatches)}' if mismatches else None + data = data_by_key.values() + if post_process and not error: + post_process(data) + + return data, error + + RNA_MODEL_DISPLAY_NAME = { RnaSeqOutlier: 'Expression Outlier', RnaSeqSpliceOutlier: 'Splice Outlier', diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index 3daba835ca..4af1fb3450 100644 --- a/seqr/views/utils/variant_utils.py +++ b/seqr/views/utils/variant_utils.py @@ -102,10 +102,13 @@ def update_project_saved_variant_json(project_id, family_guids=None, dataset_typ def saved_variants_dataset_type_filter(dataset_type): xpos_filter_key = 'xpos__gte' if dataset_type == Sample.DATASET_TYPE_MITO_CALLS else 'xpos__lt' - return { - 'alt__isnull': dataset_type == Sample.DATASET_TYPE_SV_CALLS, - xpos_filter_key: get_xpos('M', 1), - } + dataset_filter = {xpos_filter_key: get_xpos('M', 1)} + if dataset_type == Sample.DATASET_TYPE_SV_CALLS: + dataset_filter['alt__isnull'] = True + else: + # Filter out manual variants with invalid characters, such as those used for STRs + dataset_filter['alt__regex'] = '^[ACGT]$' + return dataset_filter def parse_saved_variant_json(variant_json, family): diff --git a/ui/package-lock.json b/ui/package-lock.json index 080a18222d..ab391c8066 100644 --- a/ui/package-lock.json +++ b/ui/package-lock.json @@ -8485,9 +8485,9 @@ "dev": true }, "node_modules/follow-redirects": { - "version": "1.15.4", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", - "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true, "funding": [ { @@ -25853,9 +25853,9 @@ "dev": true }, "follow-redirects": { - "version": "1.15.4", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", - "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true }, "foreach": { diff --git a/ui/pages/Report/components/VariantMetadata.jsx b/ui/pages/Report/components/VariantMetadata.jsx index 6f03815d0e..2f7799961b 100644 --- a/ui/pages/Report/components/VariantMetadata.jsx +++ b/ui/pages/Report/components/VariantMetadata.jsx @@ -1,7 +1,7 @@ import React from 'react' import LoadReportTable from 'shared/components/table/LoadReportTable' -import { VARIANT_METADATA_COLUMNS } from 'shared/utils/constants' +import { clinvarSignificance, VARIANT_METADATA_COLUMNS } from 'shared/utils/constants' const VIEW_ALL_PAGES = [ { name: 'GREGoR', downloadName: 'GREGoR', path: 'gregor' }, @@ -13,7 +13,7 @@ const COLUMNS = [ ...VARIANT_METADATA_COLUMNS.slice(0, -1), { name: 'allele_balance_or_heteroplasmy_percentage' }, { name: 'ClinGen allele ID', format: ({ clinvar }) => clinvar?.alleleId }, - { name: 'ClinVar Clinical Significance', format: ({ clinvar }) => clinvar?.clinicalSignificance }, + { name: 'ClinVar Clinical Significance', format: ({ clinvar }) => clinvarSignificance(clinvar).pathogenicity }, { name: 'ClinVar gold star', format: ({ clinvar }) => clinvar?.goldStars }, { name: 'known_condition_name' }, { name: 'condition_id' }, diff --git a/ui/shared/utils/constants.js b/ui/shared/utils/constants.js index c8d0671dc7..a7c0a96132 100644 --- a/ui/shared/utils/constants.js +++ b/ui/shared/utils/constants.js @@ -513,6 +513,7 @@ const CLINVAR_MIN_RISK_PATHOGENICITY = 'likely_risk_allele' const CLINVAR_PATHOGENICITIES = [ 'pathogenic', 'pathogenic/likely_pathogenic', + 'pathogenic/likely_pathogenic/established_risk_allele', 'pathogenic/likely_pathogenic/likely_risk_allele', 'pathogenic/likely_risk_allele', 'likely_pathogenic',