diff --git a/eva_sub_cli/etc/eva_schema.json b/eva_sub_cli/etc/eva_schema.json
index 25300a2..4b6d125 100644
--- a/eva_sub_cli/etc/eva_schema.json
+++ b/eva_sub_cli/etc/eva_schema.json
@@ -319,7 +319,7 @@
           "$ref": "eva-biosamples.json"
         },
         {
-          "$ref": "https://www.ebi.ac.uk/biosamples/schemas/certification/biosamples-minimal.json"
+          "$ref": "https://www.ebi.ac.uk/biosamples/schema/store/registry/schemas/ERC000011"
         },
         {
           "$ref": "https://www.ebi.ac.uk/biosamples/schemas/certification/plant-miappe.json"
diff --git a/eva_sub_cli/executables/cli.py b/eva_sub_cli/executables/cli.py
index 9a62683..d834d73 100755
--- a/eva_sub_cli/executables/cli.py
+++ b/eva_sub_cli/executables/cli.py
@@ -45,7 +45,7 @@ def main():
     vcf_group = argparser.add_argument_group(
         'Input VCF and assembly',
         "Specify the VCF files and associated assembly with the following options. If you used different assemblies "
-        "for different VCF files then use --vcf_file_mapping"
+        "for different VCF files then include these in the metadata file."
     )
     vcf_group.add_argument('--vcf_files', nargs='+', help="One or several vcf files to validate")
     vcf_group.add_argument('--reference_fasta',
diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf
index ca03c90..a6253a5 100644
--- a/eva_sub_cli/nextflow/validation.nf
+++ b/eva_sub_cli/nextflow/validation.nf
@@ -92,6 +92,7 @@ workflow {
     if (metadata_json) {
         // Metadata checks and concordance checks
         metadata_json_validation(metadata_json)
+        metadata_semantic_check(metadata_json)
         sample_name_concordance(metadata_json, vcf_files.collect())
         fasta_to_vcfs = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
             .splitCsv(header:true)
diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py
index c28057b..ced4365 100644
--- a/eva_sub_cli/semantic_metadata.py
+++ b/eva_sub_cli/semantic_metadata.py
@@ -64,7 +64,8 @@ def check_all_project_accessions(self):
     def check_all_taxonomy_codes(self):
         """Check that taxonomy IDs are valid according to ENA."""
         project = self.metadata[PROJECT_KEY]
-        self.check_taxonomy_code(project[TAX_ID_KEY], f'/{PROJECT_KEY}/{TAX_ID_KEY}')
+        if TAX_ID_KEY in project:
+            self.check_taxonomy_code(project[TAX_ID_KEY], f'/{PROJECT_KEY}/{TAX_ID_KEY}')
         # Check sample taxonomies for novel samples
         for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
             if BIOSAMPLE_OBJECT_KEY in sample:
@@ -145,8 +146,10 @@ def _validate_biosample_against_checklist(self, sample_data, json_path, accessio
             sample_data['characteristics']['checklist'].append({'text': self.sample_checklist})
         text = self.communicator.follows_link('samples', join_url='validate', method='POST', json=sample_data,
                                               text_only=True)
-        if text.startswith('Checklist validation failed: Sample validation failed: '):
-            for error_dict in json.loads(text[len('Checklist validation failed: Sample validation failed: '):]):
+        response = json.loads(text)
+        # Successful response returns the sample object, error response has a list of error objects.
+        if isinstance(response, list):
+            for error_dict in response:
                 if accession:
                     self.add_error(json_path, f'Existing sample {accession} {error_dict["errors"][0]}')
                 else:
diff --git a/eva_sub_cli/validators/docker_validator.py b/eva_sub_cli/validators/docker_validator.py
index aa9e542..fcf4051 100644
--- a/eva_sub_cli/validators/docker_validator.py
+++ b/eva_sub_cli/validators/docker_validator.py
@@ -12,7 +12,7 @@
 logger = logging_config.get_logger(__name__)
 
 container_image = 'ebivariation/eva-sub-cli'
-container_tag = 'v0.0.1.dev13'
+container_tag = 'v0.0.1.dev14'
 container_validation_dir = '/opt/vcf_validation'
 container_validation_output_dir = 'vcf_validation_output'
 
diff --git a/tests/test_docker_validator.py b/tests/test_docker_validator.py
index 71ddf65..5eca0e1 100644
--- a/tests/test_docker_validator.py
+++ b/tests/test_docker_validator.py
@@ -37,7 +37,7 @@ def setUp(self):
                 "parentProject": "PRJ_INVALID"
             },
             "sample": [
-                {"analysisAlias": ["AA"], "sampleInVCF": "HG00096", "bioSampleAccession": "SAME0000096"}
+                {"analysisAlias": ["AA"], "sampleInVCF": "HG00096", "bioSampleAccession": "SAME123"}
            ],
            "analysis": [
                 {"analysisAlias": "AA"}
@@ -138,6 +138,14 @@ def test_validate(self):
             metadata_val_lines = {l.strip() for l in open_file.readlines()}
             assert 'must match pattern "^PRJ(EB|NA)\\d+$"' in metadata_val_lines
 
+        # Check semantic metadata errors
+        semantic_yaml_file = os.path.join(self.validator.output_dir, 'other_validations', 'metadata_semantic_check.yml')
+        self.assertTrue(os.path.isfile(semantic_yaml_file))
+        with open(semantic_yaml_file) as open_yaml:
+            semantic_output = yaml.safe_load(open_yaml)
+        assert semantic_output[1] == {'description': 'SAME123 does not exist or is private',
+                                      'property': '/sample/0/bioSampleAccession'}
+
     def test_validate_from_excel(self):
         self.validator_from_excel.validate()
         self.assertTrue(os.path.isfile(self.validator_from_excel._sample_check_yaml))
diff --git a/tests/test_semantic_metadata.py b/tests/test_semantic_metadata.py
index 2a69af0..fec6214 100644
--- a/tests/test_semantic_metadata.py
+++ b/tests/test_semantic_metadata.py
@@ -18,17 +18,19 @@
     'characteristics': {
         'organism': [{'text': 'Viridiplantae'}],
         'collection date': [{'text': '2018'}],
-        'geo loc name': [{'text': 'France: Montferrier-sur-Lez'}]
+        'geographic location (country and/or sea)': [{'text': 'France'}]
     }
 }
 invalid_sample = {
     'accession': 'SAME00003',
     'name': 'sample3',
     'characteristics': {
-        'organism': [{'text': 'Viridiplantae'}]
+        'organism': [{'text': 'Viridiplantae'}],
+        'geographic location (country and/or sea)': [{'text': 'France: Montferrier-sur-Lez'}]
     }
 }
 
+
 class TestSemanticMetadata(TestCase):
 
     def test_check_all_project_accessions(self):
@@ -88,16 +90,18 @@ def test_check_existing_biosamples_with_checklist(self):
         with patch.object(SemanticMetadataChecker, '_get_biosample',
                           side_effect=[valid_sample, ValueError, invalid_sample]) as m_get_sample:
             checker.check_existing_biosamples()
-            self.assertEqual(checker.errors, [
-                {'property': '/sample/0/bioSampleAccession',
-                 'description': "Existing sample SAME00001 should have required property 'geographic location (country and/or sea)'"},
-                {'property': '/sample/1/bioSampleAccession',
-                 'description': 'SAME00002 does not exist or is private'},
+            self.assertEqual(
+                checker.errors[0],
+                {'property': '/sample/1/bioSampleAccession', 'description': 'SAME00002 does not exist or is private'}
+            )
+            self.assertEqual(
+                checker.errors[1],
                 {'property': '/sample/2/bioSampleAccession',
-                 'description': "Existing sample SAME00003 should have required property 'collection date'"},
-                {'property': '/sample/2/bioSampleAccession',
-                 'description': "Existing sample SAME00003 should have required property 'geographic location (country and/or sea)'"}
-            ])
+                 'description': "Existing sample SAME00003 should have required property 'collection date'"}
+            )
+            # Final error message lists all possible geographic locations
+            self.assertTrue(checker.errors[2]['description'].startswith(
+                'Existing sample SAME00003 should be equal to one of the allowed values:'))
 
     def test_check_existing_biosamples(self):
         checker = SemanticMetadataChecker(metadata, sample_checklist=None)
@@ -112,10 +116,6 @@ def test_check_existing_biosamples(self):
                 {
                     'property': '/sample/2/bioSampleAccession',
                     'description': 'Existing sample SAME00003 does not have a valid collection date'
-                },
-                {
-                    'property': '/sample/2/bioSampleAccession',
-                    'description': 'Existing sample SAME00003 does not have a valid geographic location'
                 }
             ])
 
@@ -168,4 +168,3 @@ def test_check_all_analysis_run_accessions(self):
         checker.check_all_analysis_run_accessions()
         assert checker.errors == [
             {'property': '/analysis/1/runAccessions', 'description': 'Run SRR00000000001 does not exist in ENA or is private'}]
-
diff --git a/tests/test_submit.py b/tests/test_submit.py
index 719155f..f3b9ac6 100644
--- a/tests/test_submit.py
+++ b/tests/test_submit.py
@@ -96,7 +96,7 @@ def test_submit_with_config(self):
         assert os.path.exists(self.config_file)
         # assert backup file is created
         assert os.path.exists(f"{self.config_file}.1")
-        with (open(self.config_file, 'r') as f):
+        with open(self.config_file, 'r') as f:
             sub_config_data = yaml.safe_load(f)
             assert sub_config_data[SUB_CLI_CONFIG_KEY_SUBMISSION_ID] == "mock_submission_id"
             assert sub_config_data[SUB_CLI_CONFIG_KEY_SUBMISSION_UPLOAD_URL] == "directory to use for upload"
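
Note on the semantic_metadata.py change above: the patch stops prefix-matching an error string and instead inspects the parsed JSON returned by the BioSamples validate endpoint. The standalone sketch below is illustrative only and is not part of the patch; the function name parse_biosamples_validation_response and the example payload (including the dataPath field) are hypothetical, and only the list-versus-object distinction and the error_dict['errors'][0] access are taken from the diff.

    import json

    def parse_biosamples_validation_response(text):
        """Sketch: interpret the raw text returned by a checklist validation call,
        assuming success echoes the sample object back and failure returns a JSON
        list of error objects, each with an 'errors' array of messages."""
        response = json.loads(text)
        if isinstance(response, list):
            # Error case: keep the first message of each error object, as the patch does.
            return [error_dict['errors'][0] for error_dict in response]
        # Success case: the validated sample object comes back; nothing to report.
        return []

    # Hypothetical error payload, for illustration only.
    errors = parse_biosamples_validation_response(
        '[{"dataPath": "/characteristics/collection date", '
        '"errors": ["should have required property \'collection date\'"]}]'
    )
    assert errors == ["should have required property 'collection date'"]

Branching on the parsed JSON shape is more robust than matching a fixed "Checklist validation failed" prefix, since it does not break when the service rewords its error messages.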