Merge pull request #50 from tcezard/EVA3629-Fixes_and_improvements

EVA-3628 Fixes and Improvements
EBIvariation · Aug 13, 2024 · fda2ec2 · fda2ec2
2 parents ff2ed33 + fa7c32c
commit fda2ec2
Show file tree

Hide file tree

Showing 14 changed files with 106 additions and 43 deletions.
diff --git a/eva_sub_cli/etc/EVA_Submission_template.xlsx b/eva_sub_cli/etc/EVA_Submission_template.xlsx
diff --git a/eva_sub_cli/etc/eva_schema.json b/eva_sub_cli/etc/eva_schema.json
@@ -159,7 +159,7 @@
               "Curation",
               "Genotyping by sequencing",
               "Target sequencing",
-              "transcriptomics"
+              "Transcriptomics"
             ]
           },
           "referenceGenome": {
@@ -266,7 +266,8 @@
           "runAccessions": {
             "type": "array",
             "items": {
-              "type": "string"
+              "type": "string",
+              "pattern": "^(E|D|S)RR[0-9]{6,}$"
             },
             "description": "List of run accessions linking to the raw data used in this analysis if applicable (e.g. SRR576651, SRR576652)"
           }

diff --git a/eva_sub_cli/etc/spreadsheet2json_conf.yaml b/eva_sub_cli/etc/spreadsheet2json_conf.yaml
@@ -35,7 +35,7 @@ Project:
     Tax ID: taxId
   optional:
     Publication(s): publications
-    Parent Project(s): parentProject
+    Parent Project: parentProject
     Child Project(s): childProjects
     Peer Project(s): peerProjects
     Link(s): links
@@ -44,6 +44,12 @@ Project:
     Strain: strain
     Breed: breed
     Broker: broker
+  cast:
+    Publication(s): list
+    Child Project(s): list
+    Peer Project(s): list
+    Link(s): list
+    Hold Date: date
 
 Analysis:
   required:
@@ -63,6 +69,11 @@ Analysis:
     Date: date
     Link(s): links
     Run Accession(s): runAccessions
+  cast:
+    Run Accession(s): list
+    Imputation: boolean
+    Phasing: boolean
+    Date: date
 
 Sample:
   header_row: 3
@@ -115,6 +126,7 @@ Sample:
   cast:
     Sample ID: string
     BioSample Name: string
+    collection_date: date
 
 
 Files:

diff --git a/eva_sub_cli/executables/xlsx2json.py b/eva_sub_cli/executables/xlsx2json.py
@@ -129,6 +129,14 @@ def cast_value(value, type_name):
         if type_name and value is not None:
             if type_name == 'string':
                 return str(value)
+            if type_name == 'boolean':
+                return str(value).lower() in ['true', '1', 't', 'y', 'yes']
+            if type_name == 'list':
+                # split and remove empty values
+                return [XlsxParser.trim_value(v)
+                        for v in value.split(',') if v]
+            if type_name == 'date':
+                return XlsxParser.serialize(value)
         return value
 
     @staticmethod

diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py
@@ -1,3 +1,4 @@
+import re
 import json
 from copy import deepcopy
 
@@ -22,7 +23,7 @@
 CHARACTERISTICS_KEY = 'characteristics'
 TAX_ID_KEY = 'taxId'
 ANALYSIS_ALIAS_KEY = 'analysisAlias'
-
+ANALYSIS_RUNS_KEY = 'runAccessions'
 
 def cast_list(l, type_to_cast=str):
     for e in l:
@@ -47,6 +48,7 @@ def check_all(self):
         self.check_all_project_accessions()
         self.check_all_taxonomy_codes()
         self.check_existing_biosamples()
+        self.check_all_analysis_run_accessions
         self.check_analysis_alias_coherence()
 
     def check_all_project_accessions(self):
@@ -69,12 +71,23 @@ def check_all_taxonomy_codes(self):
                 self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY][0]['text'],
                                          f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{TAX_ID_KEY}')
 
+    def check_all_analysis_run_accessions(self):
+        """Check that the Run accession are valid and exist in ENA"""
+        for idx, analysis in enumerate(self.metadata[ANALYSIS_KEY]):
+            json_path = f'/{ANALYSIS_KEY}/{idx}/{ANALYSIS_RUNS_KEY}'
+            if analysis[ANALYSIS_RUNS_KEY]:
+                for run_acc in analysis[ANALYSIS_RUNS_KEY]:
+                    self.check_accession_in_ena(run_acc, 'Run', json_path)
+
     @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
-    def check_project_accession(self, project_acc, json_path):
+    def check_accession_in_ena(self, ena_accession, accession_type, json_path):
         try:
-            download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{project_acc}')
+            res = download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{ena_accession}')
         except Exception:
-            self.add_error(json_path, f'{project_acc} does not exist or is private')
+            self.add_error(json_path, f'{accession_type} {ena_accession} does not exist in ENA or is private')
+
+    def check_project_accession(self, project_acc, json_path):
+        self.check_accession_in_ena(project_acc, 'Project', json_path)
 
     @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
     def check_taxonomy_code(self, taxonomy_code, json_path):

diff --git a/eva_sub_cli/validators/docker_validator.py b/eva_sub_cli/validators/docker_validator.py
@@ -12,7 +12,7 @@
 logger = logging_config.get_logger(__name__)
 
 container_image = 'ebivariation/eva-sub-cli'
-container_tag = 'v0.0.1.dev11'
+container_tag = 'v0.0.1.dev12'
 container_validation_dir = '/opt/vcf_validation'
 container_validation_output_dir = 'vcf_validation_output'
 

diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py
@@ -109,10 +109,12 @@ def set_up_output_dir(self):
         os.makedirs(self.output_dir, exist_ok=True)
 
     def clean_up_output_dir(self):
-        # Move intermediate validation outputs into a subdir
+        # Move intermediate validation outputs into a subdir except metadata.json
         subdir = os.path.join(self.output_dir, 'other_validations')
         os.mkdir(subdir)
         for file_name in os.listdir(self.output_dir):
+            if file_name == 'metadata.json':
+                continue
             file_path = os.path.join(self.output_dir, file_name)
             if os.path.isfile(file_path):
                 os.rename(file_path, os.path.join(subdir, file_name))
@@ -451,6 +453,9 @@ def _convert_biovalidator_validation_to_spreadsheet(self):
             self.results['metadata_check']['spreadsheet_errors'] = []
         for error in self.results['metadata_check'].get('json_errors', {}):
             sheet_json, row_json, attribute_json = self._parse_metadata_property(error['property'])
+            # There should only be one Project but adding the row back means it's easier for users to find
+            if sheet_json == 'project' and row_json is None:
+                row_json = 0
             sheet = self._convert_metadata_sheet(sheet_json, xls2json_conf)
             row = self._convert_metadata_row(sheet, row_json, xls2json_conf)
             column = self._convert_metadata_attribute(sheet, attribute_json, xls2json_conf)
@@ -460,12 +465,12 @@ def _convert_biovalidator_validation_to_spreadsheet(self):
                 if 'have required' not in error['description']:
                     new_description = error['description']
                 else:
-                    new_description = f'In sheet "{sheet}", column "{column}" is not populated'
+                    new_description = f'Column "{column}" is not populated'
             elif attribute_json and column:
                 if 'have required' not in error['description']:
                     new_description = error['description']
                 else:
-                    new_description = f'In sheet "{sheet}", row "{row}", column "{column}" is not populated'
+                    new_description = f'Column "{column}" is not populated'
             else:
                 new_description = error["description"].replace(sheet_json, sheet)
             if column is None:

diff --git a/tests/resources/EVA_Submission_test.xlsx b/tests/resources/EVA_Submission_test.xlsx
diff --git a/tests/resources/EVA_Submission_test_fails.xlsx b/tests/resources/EVA_Submission_test_fails.xlsx
diff --git a/tests/resources/validation_reports/expected_report_metadata_xlsx.html b/tests/resources/validation_reports/expected_report_metadata_xlsx.html
diff --git a/tests/test_report.py b/tests/test_report.py
@@ -143,15 +143,15 @@
     'metadata_check': {
         'spreadsheet_errors': [
             {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
-            {'sheet': 'Project', 'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'},
-            {'sheet': 'Project', 'row': '', 'column': 'Description', 'description': 'In sheet "Project", column "Description" is not populated'},
-            {'sheet': 'Project', 'row': '', 'column': 'Tax ID', 'description': 'In sheet "Project", column "Tax ID" is not populated'},
-            {'sheet': 'Project', 'row': '', 'column': 'Center', 'description': 'In sheet "Project", column "Center" is not populated'},
-            {'sheet': 'Analysis', 'row': 2, 'column': 'Analysis Title', 'description': 'In sheet "Analysis", row "2", column "Analysis Title" is not populated'},
-            {'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
-            {'sheet': 'Analysis', 'row': 2, 'column': 'Experiment Type', 'description': 'In sheet "Analysis", row "2", column "Experiment Type" is not populated'},
-            {'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
-            {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}
+            {'sheet': 'Project', 'row': 2, 'column': 'Project Title', 'description': 'Column "Project Title" is not populated'},
+            {'sheet': 'Project', 'row': 2, 'column': 'Description', 'description': 'Column "Description" is not populated'},
+            {'sheet': 'Project', 'row': 2, 'column': 'Tax ID', 'description': 'Column "Tax ID" is not populated'},
+            {'sheet': 'Project', 'row': 2, 'column': 'Center', 'description': 'Column "Center" is not populated'},
+            {'sheet': 'Analysis', 'row': 2, 'column': 'Analysis Title', 'description': 'Column "Analysis Title" is not populated'},
+            {'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'Column "Description" is not populated'},
+            {'sheet': 'Analysis', 'row': 2, 'column': 'Experiment Type', 'description': 'Column "Experiment Type" is not populated'},
+            {'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'Column "Reference" is not populated'},
+            {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'Column "Sample Accession" is not populated'}
         ],
         'spreadsheet_report_path': '/path/to/metadata/metadata_spreadsheet_validation.txt',
     }

diff --git a/tests/test_semantic_metadata.py b/tests/test_semantic_metadata.py
@@ -43,7 +43,7 @@ def test_check_all_project_accessions(self):
             m_ena_download.side_effect = [True, True, Exception('problem downloading')]
             checker.check_all_project_accessions()
             self.assertEqual(checker.errors, [
-                {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'}
+                {'property': '/project/childProjects/1', 'description': 'Project PRJEBNA does not exist in ENA or is private'}
             ])
 
     def test_check_all_taxonomy_codes(self):
@@ -152,3 +152,20 @@ def test_check_analysis_alias_coherence(self):
             {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'},
             {'property': '/sample/analysisAlias', 'description': 'alias_1,alias_2 present in Samples not in Analysis'}
         ])
+
+    def test_check_all_analysis_run_accessions(self):
+        metadata = {
+            "analysis": [
+                {'runAccessions': ['SRR000001', 'SRR000002']}
+            ]
+        }
+        checker = SemanticMetadataChecker(metadata)
+        checker.check_all_analysis_run_accessions()
+        assert checker.errors == []
+
+        metadata["analysis"].append({'runAccessions': ['SRR00000000001']})
+
+        checker.check_all_analysis_run_accessions()
+        assert checker.errors == [
+            {'property': '/analysis/1/runAccessions', 'description': 'Run SRR00000000001 does not exist in ENA or is private'}]
+
diff --git a/tests/test_validator.py b/tests/test_validator.py
@@ -82,19 +82,19 @@ def test__collect_validation_workflow_results_with_metadata_xlsx(self):
                     {'sheet': 'Project', 'row': '', 'column': 'Tax ID',
                      'description': 'Worksheet Project is missing required header Tax ID'},
                     {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
-                    {'sheet': 'Project', 'row': '', 'column': 'Project Title',
-                     'description': 'In sheet "Project", column "Project Title" is not populated'},
-                    {'sheet': 'Project', 'row': '', 'column': 'Tax ID',
-                     'description': 'In sheet "Project", column "Tax ID" is not populated'},
-                    {'sheet': 'Project', 'row': '', 'column': 'Hold Date',
+                    {'sheet': 'Project', 'row': 2, 'column': 'Project Title',
+                     'description': 'Column "Project Title" is not populated'},
+                    {'sheet': 'Project', 'row': 2, 'column': 'Tax ID',
+                     'description': 'Column "Tax ID" is not populated'},
+                    {'sheet': 'Project', 'row': 2, 'column': 'Hold Date',
                      'description': 'must match format "date"'},
                     {'sheet': 'Analysis', 'row': 2, 'column': 'Description',
-                     'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
+                     'description': 'Column "Description" is not populated'},
                     {'sheet': 'Analysis', 'row': 2, 'column': 'Reference',
-                     'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
+                     'description': 'Column "Reference" is not populated'},
                     {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession',
-                     'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'},
-                    {'sheet': 'Project', 'row': '', 'column': 'Child Project(s)',
+                     'description': 'Column "Sample Accession" is not populated'},
+                    {'sheet': 'Project', 'row': 2, 'column': 'Child Project(s)',
                      'description': 'PRJEBNA does not exist or is private'},
                     {'sheet': 'Sample', 'row': 5, 'column': 'Tax Id',
                      'description': '1234 is not a valid taxonomy code'},
@@ -244,19 +244,19 @@ def test_convert_biovalidator_validation_to_spreadsheet(self):
 
         assert self.validator.results['metadata_check']['spreadsheet_errors'] == [
             {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
-            {'sheet': 'Project', 'row': '', 'column': 'Project Title',
-             'description': 'In sheet "Project", column "Project Title" is not populated'},
-            {'sheet': 'Project', 'row': '', 'column': 'Tax ID',
-             'description': 'In sheet "Project", column "Tax ID" is not populated'},
-            {'sheet': 'Project', 'row': '', 'column': 'Hold Date',
+            {'sheet': 'Project', 'row': 2, 'column': 'Project Title',
+             'description': 'Column "Project Title" is not populated'},
+            {'sheet': 'Project', 'row': 2, 'column': 'Tax ID',
+             'description': 'Column "Tax ID" is not populated'},
+            {'sheet': 'Project', 'row': 2, 'column': 'Hold Date',
              'description': 'must match format "date"'},
             {'sheet': 'Analysis', 'row': 2, 'column': 'Description',
-             'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
+             'description': 'Column "Description" is not populated'},
             {'sheet': 'Analysis', 'row': 2, 'column': 'Reference',
-             'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
+             'description': 'Column "Reference" is not populated'},
             {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession',
-             'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'},
-            {'sheet': 'Project', 'row': '', 'column': 'Child Project(s)',
+             'description': 'Column "Sample Accession" is not populated'},
+            {'sheet': 'Project', 'row': 2, 'column': 'Child Project(s)',
              'description': 'PRJEBNA does not exist or is private'},
             {'sheet': 'Sample', 'row': 5, 'column': 'Tax Id', 'description': '1234 is not a valid taxonomy code'},
             {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias',

diff --git a/tests/test_xlsx2json.py b/tests/test_xlsx2json.py
@@ -106,7 +106,9 @@ def get_expected_json(self):
                 "description": "An example project for demonstration purposes",
                 "centre": "University of Example",
                 "taxId": 9606,
-                "holdDate": "2023-12-31T00:00:00"
+                "holdDate": "2023-12-31",
+                'parentProject': 'PRJEB00001',
+                'childProjects': ['PRJEB00002', 'PRJEB00003']
             },
             "analysis": [
                 {
@@ -116,7 +118,8 @@ def get_expected_json(self):
                     "experimentType": "Whole genome sequencing",
                     "referenceGenome": "GCA_000001405.27",
                     "referenceFasta": "GCA_000001405.27_fasta.fa",
-                    "platform": "BGISEQ-500"
+                    "platform": "BGISEQ-500",
+                    "imputation": True,
                 },
                 {
                     "analysisTitle": "Variant Detection 2",
@@ -125,7 +128,8 @@ def get_expected_json(self):
                     "experimentType": "Whole genome sequencing",
                     "referenceGenome": "GCA_000001405.27",
                     "referenceFasta": "GCA_000001405.27_fasta.fa",
-                    "platform": "BGISEQ-500"
+                    "platform": "BGISEQ-500",
+                    'phasing': True,
                 },
                 {
                     "analysisTitle": "Variant Detection 3",
@@ -179,6 +183,9 @@ def get_expected_json(self):
                             ],
                             "species": [
                                 {"text": "Lemur catta"}
+                            ],
+                            'collectionDate': [
+                                {'text': '2021-03-12'}
                             ]
                         }
                     }