EBIvariation · apriltuesday · Sep 16, 2024 · Sep 6, 2024 · Sep 10, 2024 · Sep 10, 2024
diff --git a/eva_sub_cli/executables/xlsx2json.py b/eva_sub_cli/executables/xlsx2json.py
@@ -52,14 +52,16 @@ def __init__(self, xlsx_filename, conf_filename):
         try:
             self.workbook = load_workbook(xlsx_filename, read_only=True)
         except Exception as e:
-            self.add_error(f'Error loading {xlsx_filename}: {e}')
+            self.add_error(f'Error loading {xlsx_filename}: {repr(e)}')
+            self.file_loaded = False
             return
-        self.worksheets = None
+        self.worksheets = []
         self._active_worksheet = None
         self.row_offset = {}
         self.headers = {}
-        self.valid = None
+        self.file_loaded = True
         self.errors = []
+        self.valid_worksheets()
 
     @property
     def active_worksheet(self):
@@ -77,14 +79,8 @@ def active_worksheet(self, worksheet):
 
     def valid_worksheets(self):
         """
-        Get the list of the names of worksheets which have all the configured required headers
-        :return: list of valid worksheet names in the Excel file
-        :rtype: list
+        Get the list of the names of worksheets which have the expected title and header row.
         """
-        if self.worksheets is not None:
-            return self.worksheets
-
-        self.worksheets = []
         sheet_titles = self.workbook.sheetnames
 
         for title in self.xlsx_conf[WORKSHEETS_KEY_NAME]:
@@ -97,31 +93,10 @@ def valid_worksheets(self):
             header_row = self.xlsx_conf[title].get(HEADERS_KEY_ROW, 1)
             if worksheet.max_row < header_row + 1:
                 continue
-            # Check required headers are present
+            # Store headers and worksheet title
             self.headers[title] = [cell.value if cell.value is None else cell.value.strip()
                                    for cell in worksheet[header_row]]
-            required_headers = self.xlsx_conf[title].get(REQUIRED_HEADERS_KEY_NAME, [])
-            if set(required_headers) <= set(self.headers[title]):  # issubset
-                self.worksheets.append(title)
-            else:
-                missing_headers = set(required_headers) - set(self.headers[title])
-                for header in missing_headers:
-                    self.add_error(f'Worksheet {title} is missing required header {header}',
-                                   sheet=title, column=header)
-
-        return self.worksheets
-
-    def is_valid(self):
-        """
-        Check that is all the worksheets contain required headers
-        :return: True if all the worksheets contain required headers. False otherwise
-        :rtype: bool
-        """
-        if self.valid is None:
-            self.valid = True
-            self.valid_worksheets()
-
-        return self.valid
+            self.worksheets.append(title)
 
     @staticmethod
     def cast_value(value, type_name):
@@ -219,16 +194,17 @@ def get_biosample_object(self, data):
         scientific_name = self.xlsx_conf[SAMPLE][OPTIONAL_HEADERS_KEY_NAME][SCIENTIFIC_NAME_KEY]
 
         # BioSample expects any of organism or species field
-        data[SPECIES] = data[scientific_name]
+        if scientific_name in data:
+            data[SPECIES] = data[scientific_name]
         # BioSample name goes in its own attribute, not part of characteristics
-        biosample_name = data.pop(sample_name)
-        # For all characteristics, BioSample expects value in arrays of objects
-        data = {k: [{'text': self.serialize(v)}] for k, v in data.items()}
+        biosample_name = data.pop(sample_name, None)
 
+        # For all characteristics, BioSample expects value in arrays of objects
         biosample_object = {
-            "name": biosample_name,
-            "characteristics": data
+            'characteristics': {k: [{'text': self.serialize(v)}] for k, v in data.items()}
         }
+        if biosample_name is not None:
+            biosample_object['name'] = biosample_name
 
         return biosample_object
 
@@ -263,30 +239,15 @@ def get_sample_json_data(self):
                 json_value.pop(analysis_alias)
                 json_value.pop(sample_name_in_vcf)
 
-                # Check for headers that are required only in this case
-                sample_name = self.xlsx_conf[SAMPLE][OPTIONAL_HEADERS_KEY_NAME][SAMPLE_NAME_KEY]
-                scientific_name = self.xlsx_conf[SAMPLE][OPTIONAL_HEADERS_KEY_NAME][SCIENTIFIC_NAME_KEY]
-                if sample_name not in json_value:
-                    self.add_error(f'If BioSample Accession is not provided, the {SAMPLE} worksheet should have '
-                                   f'{SAMPLE_NAME_KEY} populated',
-                                   sheet=SAMPLE, row=row_num, column=SAMPLE_NAME_KEY)
-                    return None
-                if scientific_name not in json_value:
-                    self.add_error(f'If BioSample Accession is not provided, the {SAMPLE} worksheet should have '
-                                   f'{SCIENTIFIC_NAME_KEY} populated',
-                                   sheet=SAMPLE, row=row_num, column=SCIENTIFIC_NAME_KEY)
-                    return None
-
                 biosample_obj = self.get_biosample_object(json_value)
                 sample_data.update(bioSampleObject=biosample_obj)
                 sample_json[json_key].append(sample_data)
 
         return sample_json
 
     def json(self, output_json_file):
-        # First check that all sheets present have the required headers;
-        # also guards against the case where conversion fails in init
-        if not self.is_valid():
+        # If the file could not be loaded at all, return without generating JSON.
+        if not self.file_loaded:
             return
         json_data = {}
         for title in self.xlsx_conf[WORKSHEETS_KEY_NAME]:
@@ -295,8 +256,6 @@ def json(self, output_json_file):
                 json_data.update(self.get_project_json_data())
             elif title == SAMPLE:
                 sample_data = self.get_sample_json_data()
-                if sample_data is None:  # missing conditionally required headers
-                    return
                 json_data.update(sample_data)
             else:
                 json_data[self.xlsx_conf[WORKSHEETS_KEY_NAME][title]] = []
@@ -324,7 +283,6 @@ def add_error(self, message, sheet='', row='', column=''):
         """Adds a conversion error using the same structure as other validation errors,
         and marks the spreadsheet as invalid."""
         self.errors.append({'sheet': sheet, 'row': row, 'column': column, 'description': message})
-        self.valid = False
 
     def save_errors(self, errors_yaml_file):
         with open(errors_yaml_file, 'w') as open_file:

diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py
@@ -1,12 +1,11 @@
-import re
 import json
 from copy import deepcopy
 
 import yaml
 
 from retry import retry
 from ebi_eva_common_pyutils.biosamples_communicators import NoAuthHALCommunicator
-from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena
+from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena, get_scientific_name_and_common_name
 from ebi_eva_common_pyutils.logger import AppLogger
 
 from eva_sub_cli.date_utils import check_date
@@ -22,9 +21,11 @@
 BIOSAMPLE_ACCESSION_KEY = 'bioSampleAccession'
 CHARACTERISTICS_KEY = 'characteristics'
 TAX_ID_KEY = 'taxId'
+SCI_NAME_KEYS = ['species', 'Species', 'organism', 'Organism']
 ANALYSIS_ALIAS_KEY = 'analysisAlias'
 ANALYSIS_RUNS_KEY = 'runAccessions'
 
+
 def cast_list(l, type_to_cast=str):
     for e in l:
         yield type_to_cast(e)
@@ -36,7 +37,7 @@ def __init__(self, metadata, sample_checklist='ERC000011'):
         self.sample_checklist = sample_checklist
         self.metadata = metadata
         self.errors = []
-        # Caches whether taxonomy code is valid or not
+        # Caches whether taxonomy code is valid or not, and maps to scientific name if valid
         self.taxonomy_valid = {}
         self.communicator = NoAuthHALCommunicator(bsd_url='https://www.ebi.ac.uk/biosamples')
 
@@ -47,8 +48,9 @@ def write_result_yaml(self, output_path):
     def check_all(self):
         self.check_all_project_accessions()
         self.check_all_taxonomy_codes()
+        self.check_all_scientific_names()
         self.check_existing_biosamples()
-        self.check_all_analysis_run_accessions
+        self.check_all_analysis_run_accessions()
         self.check_analysis_alias_coherence()
 
     def check_all_project_accessions(self):
@@ -68,15 +70,34 @@ def check_all_taxonomy_codes(self):
             self.check_taxonomy_code(project[TAX_ID_KEY], f'/{PROJECT_KEY}/{TAX_ID_KEY}')
         # Check sample taxonomies for novel samples
         for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
-            if BIOSAMPLE_OBJECT_KEY in sample:
+            if BIOSAMPLE_OBJECT_KEY in sample and TAX_ID_KEY in sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY]:
                 self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY][0]['text'],
                                          f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{TAX_ID_KEY}')
 
+    def check_all_scientific_names(self):
+        """Check that all scientific names are consistent with taxonomy codes."""
+        for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
+            if BIOSAMPLE_OBJECT_KEY in sample and TAX_ID_KEY in sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY]:
+                characteristics = sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY]
+                # Get the scientific name from the taxonomy (if valid)
+                tax_code = int(characteristics[TAX_ID_KEY][0]['text'])
+                sci_name_from_tax = self.taxonomy_valid[tax_code]
+                if not sci_name_from_tax:
+                    continue
+                # Check if scientific name in sample matches
+                for sci_name_key in SCI_NAME_KEYS:
+                    if sci_name_key in characteristics:
+                        sci_name = characteristics[sci_name_key][0]['text']
+                        if sci_name_from_tax.lower() != sci_name.lower():
+                            self.add_error(
+                                f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{sci_name_key}',
+                                f'Species {sci_name} does not match taxonomy {tax_code} ({sci_name_from_tax})')
+
     def check_all_analysis_run_accessions(self):
         """Check that the Run accession are valid and exist in ENA"""
         for idx, analysis in enumerate(self.metadata[ANALYSIS_KEY]):
             json_path = f'/{ANALYSIS_KEY}/{idx}/{ANALYSIS_RUNS_KEY}'
-            if analysis[ANALYSIS_RUNS_KEY]:
+            if ANALYSIS_RUNS_KEY in analysis and analysis[ANALYSIS_RUNS_KEY]:
                 for run_acc in analysis[ANALYSIS_RUNS_KEY]:
                     self.check_accession_in_ena(run_acc, 'Run', json_path)
 
@@ -98,8 +119,8 @@ def check_taxonomy_code(self, taxonomy_code, json_path):
                 self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code')
         else:
             try:
-                download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{taxonomy_code}')
-                self.taxonomy_valid[taxonomy_code] = True
+                sci_name, _ = get_scientific_name_and_common_name(taxonomy_code)
+                self.taxonomy_valid[taxonomy_code] = sci_name
             except Exception:
                 self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code')
                 self.taxonomy_valid[taxonomy_code] = False

diff --git a/eva_sub_cli/validators/docker_validator.py b/eva_sub_cli/validators/docker_validator.py
@@ -1,4 +1,3 @@
-import argparse
 import csv
 import os
 import re
@@ -12,7 +11,7 @@
 logger = logging_config.get_logger(__name__)
 
 container_image = 'ebivariation/eva-sub-cli'
-container_tag = 'v0.0.1.dev16'
+container_tag = 'v0.0.1.dev17'
 container_validation_dir = '/opt/vcf_validation'
 container_validation_output_dir = 'vcf_validation_output'
 

diff --git a/eva_sub_cli/validators/validation_results_parsers.py b/eva_sub_cli/validators/validation_results_parsers.py
@@ -134,6 +134,10 @@ def clean_read(ifile):
                 if line.startswith('Validation failed with following error(s):'):
                     collect = True
             else:
+                while line and not line.startswith('/'):
+                    # Sometimes there are multiple (possibly redundant) errors listed under a single property,
+                    # we only report the first
+                    line = clean_read(open_file)
                 line2 = clean_read(open_file)
                 if line is None or line2 is None:
                     break  # EOF
@@ -164,6 +168,9 @@ def convert_metadata_attribute(sheet, json_attribute, xls2json_conf):
     attributes_dict = {}
     attributes_dict.update(xls2json_conf[sheet].get('required', {}))
     attributes_dict.update(xls2json_conf[sheet].get('optional', {}))
+    attributes_dict['Scientific Name'] = 'species'
+    attributes_dict['BioSample Name'] = 'name'
+
     for attribute in attributes_dict:
         if attributes_dict[attribute] == json_attribute:
             return attribute
@@ -185,7 +192,12 @@ def parse_metadata_property(property_str):
 
 
 def parse_sample_metadata_property(property_str):
+    # Check characteristics
     match = re.match(r'/sample/(\d+)/bioSampleObject/characteristics/(\w+)', property_str)
     if match:
         return 'sample', match.group(1), match.group(2)
+    # Check name
+    match = re.match(r'/sample/(\d+)/bioSampleObject/name', property_str)
+    if match:
+        return 'sample', match.group(1), 'name'
     return None, None, None
diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 import csv
 import datetime
-import glob
 import json
 import logging
 import os
@@ -345,7 +344,7 @@ def _convert_biovalidator_validation_to_spreadsheet(self):
             sheet = convert_metadata_sheet(sheet_json, xls2json_conf)
             row = convert_metadata_row(sheet, row_json, xls2json_conf)
             column = convert_metadata_attribute(sheet, attribute_json, xls2json_conf)
-            if row_json is None and attribute_json is None:
+            if row_json is None and attribute_json is None and sheet is not None:
                 new_description = f'Sheet "{sheet}" is missing'
             elif row_json is None:
                 if 'have required' not in error['description']:

diff --git a/tests/resources/EVA_Submission_test_fails.xlsx b/tests/resources/EVA_Submission_test_fails.xlsx
diff --git a/...ces/validation_reports/validation_output/other_validations/metadata_conversion_errors.yml b/...ces/validation_reports/validation_output/other_validations/metadata_conversion_errors.yml
@@ -1,4 +1,4 @@
-- column: Tax ID
-  description: Worksheet Project is missing required header Tax ID
+- column: ''
+  description: 'Error loading problem.xlsx: Exception()'
   row: ''
-  sheet: Project
+  sheet: ''
diff --git a/...ources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml b/...ources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml
@@ -2,6 +2,8 @@
   property: /project/childProjects/1
 - description: 1234 is not a valid taxonomy code
   property: /sample/2/bioSampleObject/characteristics/taxId
+- description: Species sheep sapiens does not match taxonomy 9606 (Homo sapiens)
+  property: /sample/1/bioSampleObject/characteristics/Organism
 - description: alias1 present in Analysis not in Samples
   property: /sample/analysisAlias
 - description: alias_1,alias_2 present in Samples not in Analysis

diff --git a/.../resources/validation_reports/validation_output/other_validations/metadata_validation.txt b/.../resources/validation_reports/validation_output/other_validations/metadata_validation.txt
@@ -26,4 +26,19 @@
 	should have required property 'bioSampleObject'
 /sample/0
 	should match exactly one schema in oneOf
+/sample/3/bioSampleObject/name
+        must have required property 'name'
+        must have required property 'name'
+        must have required property 'name'
+/sample/3/bioSampleObject/characteristics/organism
+        must have required property 'organism'
+        must have required property 'organism'
+/sample/3/bioSampleObject/characteristics/Organism
+        must have required property 'Organism'
+/sample/3/bioSampleObject/characteristics/species
+        must have required property 'species'
+/sample/3/bioSampleObject/characteristics/Species
+        must have required property 'Species'
+/sample/3/bioSampleObject/characteristics
+        must match a schema in anyOf
  [0m
diff --git a/tests/test_semantic_metadata.py b/tests/test_semantic_metadata.py
@@ -74,9 +74,9 @@ def test_check_all_taxonomy_codes(self):
             ]
         }
         checker = SemanticMetadataChecker(metadata)
-        with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download:
+        with patch('eva_sub_cli.semantic_metadata.get_scientific_name_and_common_name') as m_get_sci_name:
             # Mock should only be called once per taxonomy code
-            m_ena_download.side_effect = [True, Exception('problem downloading')]
+            m_get_sci_name.side_effect = [('Homo sapiens', 'human'), Exception('problem downloading')]
             checker.check_all_taxonomy_codes()
             self.assertEqual(checker.errors, [
                 {
@@ -85,6 +85,47 @@ def test_check_all_taxonomy_codes(self):
                 }
             ])
 
+    def test_check_all_scientific_names(self):
+        metadata = {
+            "sample": [
+                {
+                    "bioSampleObject": {
+                        "characteristics": {
+                            "taxId": [{"text": "9606"}],
+                            "Organism": [{"text": "homo sapiens"}]
+                        }
+                    }
+                },
+                {
+                    "bioSampleObject": {
+                        "characteristics": {
+                            "taxId": [{"text": "9606"}],
+                            "Organism": [{"text": "sheep sapiens"}]
+                        }
+                    }
+                },
+                {
+                    "bioSampleObject": {
+                        "characteristics": {
+                            "taxId": [{"text": "1234"}]
+                        }
+                    }
+                }
+            ]
+        }
+        checker = SemanticMetadataChecker(metadata)
+        checker.taxonomy_valid = {
+            1234: False,
+            9606: "Homo sapiens"
+        }
+        checker.check_all_scientific_names()
+        self.assertEqual(checker.errors, [
+            {
+                'property': '/sample/1/bioSampleObject/characteristics/Organism',
+                'description': 'Species sheep sapiens does not match taxonomy 9606 (Homo sapiens)'
+            }
+        ])
+
     def test_check_existing_biosamples_with_checklist(self):
         checker = SemanticMetadataChecker(metadata)
         with patch.object(SemanticMetadataChecker, '_get_biosample',