Merge pull request #52 from apriltuesday/EVA-3636

EVA-3636: Fixes for semantic metadata validation
EBIvariation · Aug 21, 2024 · 5879d11 · 5879d11
2 parents 9b9cae0 + 9c04ef9
commit 5879d11
Show file tree

Hide file tree

Showing 8 changed files with 35 additions and 24 deletions.
diff --git a/eva_sub_cli/etc/eva_schema.json b/eva_sub_cli/etc/eva_schema.json
@@ -319,7 +319,7 @@
                 "$ref": "eva-biosamples.json"
               },
               {
-                 "$ref": "https://www.ebi.ac.uk/biosamples/schemas/certification/biosamples-minimal.json"
+                 "$ref": "https://www.ebi.ac.uk/biosamples/schema/store/registry/schemas/ERC000011"
               },
               {
                 "$ref": "https://www.ebi.ac.uk/biosamples/schemas/certification/plant-miappe.json"

diff --git a/eva_sub_cli/executables/cli.py b/eva_sub_cli/executables/cli.py
@@ -45,7 +45,7 @@ def main():
     vcf_group = argparser.add_argument_group(
         'Input VCF and assembly',
         "Specify the VCF files and associated assembly with the following options. If you used different assemblies "
-        "for different VCF files then use --vcf_file_mapping"
+        "for different VCF files then include these in the metadata file."
     )
     vcf_group.add_argument('--vcf_files', nargs='+', help="One or several vcf files to validate")
     vcf_group.add_argument('--reference_fasta',

diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf
@@ -92,6 +92,7 @@ workflow {
     if (metadata_json) {
         // Metadata checks and concordance checks
         metadata_json_validation(metadata_json)
+        metadata_semantic_check(metadata_json)
         sample_name_concordance(metadata_json, vcf_files.collect())
         fasta_to_vcfs = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
             .splitCsv(header:true)

diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py
@@ -64,7 +64,8 @@ def check_all_project_accessions(self):
     def check_all_taxonomy_codes(self):
         """Check that taxonomy IDs are valid according to ENA."""
         project = self.metadata[PROJECT_KEY]
-        self.check_taxonomy_code(project[TAX_ID_KEY], f'/{PROJECT_KEY}/{TAX_ID_KEY}')
+        if TAX_ID_KEY in project:
+            self.check_taxonomy_code(project[TAX_ID_KEY], f'/{PROJECT_KEY}/{TAX_ID_KEY}')
         # Check sample taxonomies for novel samples
         for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
             if BIOSAMPLE_OBJECT_KEY in sample:
@@ -145,8 +146,10 @@ def _validate_biosample_against_checklist(self, sample_data, json_path, accessio
             sample_data['characteristics']['checklist'].append({'text': self.sample_checklist})
             text = self.communicator.follows_link('samples', join_url='validate', method='POST', json=sample_data,
                                                   text_only=True)
-            if text.startswith('Checklist validation failed: Sample validation failed: '):
-                for error_dict in json.loads(text[len('Checklist validation failed: Sample validation failed: '):]):
+            response = json.loads(text)
+            # Successful response returns the sample object, error response has a list of error objects.
+            if isinstance(response, list):
+                for error_dict in response:
                     if accession:
                         self.add_error(json_path, f'Existing sample {accession} {error_dict["errors"][0]}')
                     else:

diff --git a/eva_sub_cli/validators/docker_validator.py b/eva_sub_cli/validators/docker_validator.py
@@ -12,7 +12,7 @@
 logger = logging_config.get_logger(__name__)
 
 container_image = 'ebivariation/eva-sub-cli'
-container_tag = 'v0.0.1.dev13'
+container_tag = 'v0.0.1.dev14'
 container_validation_dir = '/opt/vcf_validation'
 container_validation_output_dir = 'vcf_validation_output'
 

diff --git a/tests/test_docker_validator.py b/tests/test_docker_validator.py
@@ -37,7 +37,7 @@ def setUp(self):
                 "parentProject": "PRJ_INVALID"
             },
             "sample": [
-                {"analysisAlias": ["AA"], "sampleInVCF": "HG00096", "bioSampleAccession": "SAME0000096"}
+                {"analysisAlias": ["AA"], "sampleInVCF": "HG00096", "bioSampleAccession": "SAME123"}
             ],
             "analysis": [
                 {"analysisAlias": "AA"}
@@ -138,6 +138,14 @@ def test_validate(self):
             metadata_val_lines = {l.strip() for l in open_file.readlines()}
             assert 'must match pattern "^PRJ(EB|NA)\\d+$"' in metadata_val_lines
 
+        # Check semantic metadata errors
+        semantic_yaml_file = os.path.join(self.validator.output_dir, 'other_validations', 'metadata_semantic_check.yml')
+        self.assertTrue(os.path.isfile(semantic_yaml_file))
+        with open(semantic_yaml_file) as open_yaml:
+            semantic_output = yaml.safe_load(open_yaml)
+            assert semantic_output[1] == {'description': 'SAME123 does not exist or is private',
+                                          'property': '/sample/0/bioSampleAccession'}
+
     def test_validate_from_excel(self):
         self.validator_from_excel.validate()
         self.assertTrue(os.path.isfile(self.validator_from_excel._sample_check_yaml))
diff --git a/tests/test_semantic_metadata.py b/tests/test_semantic_metadata.py
@@ -18,17 +18,19 @@
     'characteristics': {
         'organism': [{'text': 'Viridiplantae'}],
         'collection date': [{'text': '2018'}],
-        'geo loc name': [{'text': 'France: Montferrier-sur-Lez'}]
+        'geographic location (country and/or sea)': [{'text': 'France'}]
     }
 }
 invalid_sample = {
     'accession': 'SAME00003',
     'name': 'sample3',
     'characteristics': {
-        'organism': [{'text': 'Viridiplantae'}]
+        'organism': [{'text': 'Viridiplantae'}],
+        'geographic location (country and/or sea)': [{'text': 'France: Montferrier-sur-Lez'}]
     }
 }
 
+
 class TestSemanticMetadata(TestCase):
 
     def test_check_all_project_accessions(self):
@@ -88,16 +90,18 @@ def test_check_existing_biosamples_with_checklist(self):
         with patch.object(SemanticMetadataChecker, '_get_biosample',
                           side_effect=[valid_sample, ValueError, invalid_sample]) as m_get_sample:
             checker.check_existing_biosamples()
-            self.assertEqual(checker.errors, [
-                {'property': '/sample/0/bioSampleAccession',
-                 'description': "Existing sample SAME00001 should have required property 'geographic location (country and/or sea)'"},
-                {'property': '/sample/1/bioSampleAccession',
-                 'description': 'SAME00002 does not exist or is private'},
+            self.assertEqual(
+                checker.errors[0],
+                {'property': '/sample/1/bioSampleAccession', 'description': 'SAME00002 does not exist or is private'}
+            )
+            self.assertEqual(
+                checker.errors[1],
                 {'property': '/sample/2/bioSampleAccession',
-                 'description': "Existing sample SAME00003 should have required property 'collection date'"},
-                {'property': '/sample/2/bioSampleAccession',
-                 'description': "Existing sample SAME00003 should have required property 'geographic location (country and/or sea)'"}
-            ])
+                 'description': "Existing sample SAME00003 should have required property 'collection date'"}
+            )
+            # Final error message lists all possible geographic locations
+            self.assertTrue(checker.errors[2]['description'].startswith(
+                'Existing sample SAME00003 should be equal to one of the allowed values:'))
 
     def test_check_existing_biosamples(self):
         checker = SemanticMetadataChecker(metadata, sample_checklist=None)
@@ -112,10 +116,6 @@ def test_check_existing_biosamples(self):
                 {
                     'property': '/sample/2/bioSampleAccession',
                     'description': 'Existing sample SAME00003 does not have a valid collection date'
-                },
-                {
-                    'property': '/sample/2/bioSampleAccession',
-                    'description': 'Existing sample SAME00003 does not have a valid geographic location'
                 }
             ])
 
@@ -168,4 +168,3 @@ def test_check_all_analysis_run_accessions(self):
         checker.check_all_analysis_run_accessions()
         assert checker.errors == [
             {'property': '/analysis/1/runAccessions', 'description': 'Run SRR00000000001 does not exist in ENA or is private'}]
-
diff --git a/tests/test_submit.py b/tests/test_submit.py
@@ -96,7 +96,7 @@ def test_submit_with_config(self):
         assert os.path.exists(self.config_file)
         # assert backup file is created
         assert os.path.exists(f"{self.config_file}.1")
-        with (open(self.config_file, 'r') as f):
+        with open(self.config_file, 'r') as f:
             sub_config_data = yaml.safe_load(f)
             assert sub_config_data[SUB_CLI_CONFIG_KEY_SUBMISSION_ID] == "mock_submission_id"
             assert sub_config_data[SUB_CLI_CONFIG_KEY_SUBMISSION_UPLOAD_URL] == "directory to use for upload"