Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EVA-3564 - Simplify metadata conversion and validation #55

Merged
merged 5 commits into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 17 additions & 59 deletions eva_sub_cli/executables/xlsx2json.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,16 @@ def __init__(self, xlsx_filename, conf_filename):
try:
self.workbook = load_workbook(xlsx_filename, read_only=True)
except Exception as e:
self.add_error(f'Error loading {xlsx_filename}: {e}')
self.add_error(f'Error loading {xlsx_filename}: {repr(e)}')
self.file_loaded = False
return
self.worksheets = None
self.worksheets = []
self._active_worksheet = None
self.row_offset = {}
self.headers = {}
self.valid = None
self.file_loaded = True
self.errors = []
self.valid_worksheets()

@property
def active_worksheet(self):
Expand All @@ -77,14 +79,8 @@ def active_worksheet(self, worksheet):

def valid_worksheets(self):
"""
Get the list of the names of worksheets which have all the configured required headers
:return: list of valid worksheet names in the Excel file
:rtype: list
Get the list of the names of worksheets which have the expected title and header row.
"""
if self.worksheets is not None:
return self.worksheets

self.worksheets = []
sheet_titles = self.workbook.sheetnames

for title in self.xlsx_conf[WORKSHEETS_KEY_NAME]:
Expand All @@ -97,31 +93,10 @@ def valid_worksheets(self):
header_row = self.xlsx_conf[title].get(HEADERS_KEY_ROW, 1)
if worksheet.max_row < header_row + 1:
continue
# Check required headers are present
# Store headers and worksheet title
self.headers[title] = [cell.value if cell.value is None else cell.value.strip()
for cell in worksheet[header_row]]
required_headers = self.xlsx_conf[title].get(REQUIRED_HEADERS_KEY_NAME, [])
if set(required_headers) <= set(self.headers[title]): # issubset
self.worksheets.append(title)
else:
missing_headers = set(required_headers) - set(self.headers[title])
for header in missing_headers:
self.add_error(f'Worksheet {title} is missing required header {header}',
sheet=title, column=header)

return self.worksheets

def is_valid(self):
"""
Check that is all the worksheets contain required headers
:return: True if all the worksheets contain required headers. False otherwise
:rtype: bool
"""
if self.valid is None:
self.valid = True
self.valid_worksheets()

return self.valid
self.worksheets.append(title)

@staticmethod
def cast_value(value, type_name):
Expand Down Expand Up @@ -219,16 +194,17 @@ def get_biosample_object(self, data):
scientific_name = self.xlsx_conf[SAMPLE][OPTIONAL_HEADERS_KEY_NAME][SCIENTIFIC_NAME_KEY]

# BioSample expects any of organism or species field
data[SPECIES] = data[scientific_name]
if scientific_name in data:
data[SPECIES] = data[scientific_name]
Comment on lines +197 to +198
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it worth having a mechanism that convert a excel spreadsheet into more than one BioSample field ?
We could then provide this in the spreadsheet2json_conf.yaml as

Sample:
  header_row: 3
  optional:
    Scientific Name: [scientificName, species]

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could do, it would only be used for this case though as far as I know - the name field goes outside of characteristics so has to be handled differently. I probably won't add it to this PR but we should keep it in mind.

# BioSample name goes in its own attribute, not part of characteristics
biosample_name = data.pop(sample_name)
# For all characteristics, BioSample expects value in arrays of objects
data = {k: [{'text': self.serialize(v)}] for k, v in data.items()}
biosample_name = data.pop(sample_name, None)

# For all characteristics, BioSample expects value in arrays of objects
biosample_object = {
"name": biosample_name,
"characteristics": data
'characteristics': {k: [{'text': self.serialize(v)}] for k, v in data.items()}
}
if biosample_name is not None:
biosample_object['name'] = biosample_name

return biosample_object

Expand Down Expand Up @@ -263,30 +239,15 @@ def get_sample_json_data(self):
json_value.pop(analysis_alias)
json_value.pop(sample_name_in_vcf)

# Check for headers that are required only in this case
sample_name = self.xlsx_conf[SAMPLE][OPTIONAL_HEADERS_KEY_NAME][SAMPLE_NAME_KEY]
scientific_name = self.xlsx_conf[SAMPLE][OPTIONAL_HEADERS_KEY_NAME][SCIENTIFIC_NAME_KEY]
if sample_name not in json_value:
self.add_error(f'If BioSample Accession is not provided, the {SAMPLE} worksheet should have '
f'{SAMPLE_NAME_KEY} populated',
sheet=SAMPLE, row=row_num, column=SAMPLE_NAME_KEY)
return None
if scientific_name not in json_value:
self.add_error(f'If BioSample Accession is not provided, the {SAMPLE} worksheet should have '
f'{SCIENTIFIC_NAME_KEY} populated',
sheet=SAMPLE, row=row_num, column=SCIENTIFIC_NAME_KEY)
return None

biosample_obj = self.get_biosample_object(json_value)
sample_data.update(bioSampleObject=biosample_obj)
sample_json[json_key].append(sample_data)

return sample_json

def json(self, output_json_file):
# First check that all sheets present have the required headers;
# also guards against the case where conversion fails in init
if not self.is_valid():
# If the file could not be loaded at all, return without generating JSON.
if not self.file_loaded:
return
json_data = {}
for title in self.xlsx_conf[WORKSHEETS_KEY_NAME]:
Expand All @@ -295,8 +256,6 @@ def json(self, output_json_file):
json_data.update(self.get_project_json_data())
elif title == SAMPLE:
sample_data = self.get_sample_json_data()
if sample_data is None: # missing conditionally required headers
return
json_data.update(sample_data)
else:
json_data[self.xlsx_conf[WORKSHEETS_KEY_NAME][title]] = []
Expand Down Expand Up @@ -324,7 +283,6 @@ def add_error(self, message, sheet='', row='', column=''):
"""Adds a conversion error using the same structure as other validation errors,
and marks the spreadsheet as invalid."""
self.errors.append({'sheet': sheet, 'row': row, 'column': column, 'description': message})
self.valid = False

def save_errors(self, errors_yaml_file):
with open(errors_yaml_file, 'w') as open_file:
Expand Down
37 changes: 29 additions & 8 deletions eva_sub_cli/semantic_metadata.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import re
import json
from copy import deepcopy

import yaml

from retry import retry
from ebi_eva_common_pyutils.biosamples_communicators import NoAuthHALCommunicator
from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena
from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena, get_scientific_name_and_common_name
from ebi_eva_common_pyutils.logger import AppLogger

from eva_sub_cli.date_utils import check_date
Expand All @@ -22,9 +21,11 @@
BIOSAMPLE_ACCESSION_KEY = 'bioSampleAccession'
CHARACTERISTICS_KEY = 'characteristics'
TAX_ID_KEY = 'taxId'
SCI_NAME_KEYS = ['species', 'Species', 'organism', 'Organism']
ANALYSIS_ALIAS_KEY = 'analysisAlias'
ANALYSIS_RUNS_KEY = 'runAccessions'


def cast_list(l, type_to_cast=str):
for e in l:
yield type_to_cast(e)
Expand All @@ -36,7 +37,7 @@ def __init__(self, metadata, sample_checklist='ERC000011'):
self.sample_checklist = sample_checklist
self.metadata = metadata
self.errors = []
# Caches whether taxonomy code is valid or not
# Caches whether taxonomy code is valid or not, and maps to scientific name if valid
self.taxonomy_valid = {}
self.communicator = NoAuthHALCommunicator(bsd_url='https://www.ebi.ac.uk/biosamples')

Expand All @@ -47,8 +48,9 @@ def write_result_yaml(self, output_path):
def check_all(self):
self.check_all_project_accessions()
self.check_all_taxonomy_codes()
self.check_all_scientific_names()
self.check_existing_biosamples()
self.check_all_analysis_run_accessions
self.check_all_analysis_run_accessions()
self.check_analysis_alias_coherence()

def check_all_project_accessions(self):
Expand All @@ -68,15 +70,34 @@ def check_all_taxonomy_codes(self):
self.check_taxonomy_code(project[TAX_ID_KEY], f'/{PROJECT_KEY}/{TAX_ID_KEY}')
# Check sample taxonomies for novel samples
for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
if BIOSAMPLE_OBJECT_KEY in sample:
if BIOSAMPLE_OBJECT_KEY in sample and TAX_ID_KEY in sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY]:
self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY][0]['text'],
f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{TAX_ID_KEY}')

def check_all_scientific_names(self):
"""Check that all scientific names are consistent with taxonomy codes."""
for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
if BIOSAMPLE_OBJECT_KEY in sample and TAX_ID_KEY in sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY]:
characteristics = sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY]
# Get the scientific name from the taxonomy (if valid)
tax_code = int(characteristics[TAX_ID_KEY][0]['text'])
sci_name_from_tax = self.taxonomy_valid[tax_code]
if not sci_name_from_tax:
continue
# Check if scientific name in sample matches
for sci_name_key in SCI_NAME_KEYS:
if sci_name_key in characteristics:
sci_name = characteristics[sci_name_key][0]['text']
if sci_name_from_tax.lower() != sci_name.lower():
self.add_error(
f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{sci_name_key}',
f'Species {sci_name} does not match taxonomy {tax_code} ({sci_name_from_tax})')

def check_all_analysis_run_accessions(self):
"""Check that the Run accession are valid and exist in ENA"""
for idx, analysis in enumerate(self.metadata[ANALYSIS_KEY]):
json_path = f'/{ANALYSIS_KEY}/{idx}/{ANALYSIS_RUNS_KEY}'
if analysis[ANALYSIS_RUNS_KEY]:
if ANALYSIS_RUNS_KEY in analysis and analysis[ANALYSIS_RUNS_KEY]:
for run_acc in analysis[ANALYSIS_RUNS_KEY]:
self.check_accession_in_ena(run_acc, 'Run', json_path)

Expand All @@ -98,8 +119,8 @@ def check_taxonomy_code(self, taxonomy_code, json_path):
self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code')
else:
try:
download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{taxonomy_code}')
self.taxonomy_valid[taxonomy_code] = True
sci_name, _ = get_scientific_name_and_common_name(taxonomy_code)
self.taxonomy_valid[taxonomy_code] = sci_name
except Exception:
self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code')
self.taxonomy_valid[taxonomy_code] = False
Expand Down
3 changes: 1 addition & 2 deletions eva_sub_cli/validators/docker_validator.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import argparse
import csv
import os
import re
Expand All @@ -12,7 +11,7 @@
logger = logging_config.get_logger(__name__)

container_image = 'ebivariation/eva-sub-cli'
container_tag = 'v0.0.1.dev16'
container_tag = 'v0.0.1.dev17'
container_validation_dir = '/opt/vcf_validation'
container_validation_output_dir = 'vcf_validation_output'

Expand Down
12 changes: 12 additions & 0 deletions eva_sub_cli/validators/validation_results_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,10 @@ def clean_read(ifile):
if line.startswith('Validation failed with following error(s):'):
collect = True
else:
while line and not line.startswith('/'):
# Sometimes there are multiple (possibly redundant) errors listed under a single property,
# we only report the first
Comment on lines +138 to +139
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We probably should report that to biovalidator.js

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, I'll make an issue for them.

line = clean_read(open_file)
line2 = clean_read(open_file)
if line is None or line2 is None:
break # EOF
Expand Down Expand Up @@ -164,6 +168,9 @@ def convert_metadata_attribute(sheet, json_attribute, xls2json_conf):
attributes_dict = {}
attributes_dict.update(xls2json_conf[sheet].get('required', {}))
attributes_dict.update(xls2json_conf[sheet].get('optional', {}))
attributes_dict['Scientific Name'] = 'species'
attributes_dict['BioSample Name'] = 'name'

for attribute in attributes_dict:
if attributes_dict[attribute] == json_attribute:
return attribute
Expand All @@ -185,7 +192,12 @@ def parse_metadata_property(property_str):


def parse_sample_metadata_property(property_str):
# Check characteristics
match = re.match(r'/sample/(\d+)/bioSampleObject/characteristics/(\w+)', property_str)
if match:
return 'sample', match.group(1), match.group(2)
# Check name
match = re.match(r'/sample/(\d+)/bioSampleObject/name', property_str)
if match:
return 'sample', match.group(1), 'name'
return None, None, None
3 changes: 1 addition & 2 deletions eva_sub_cli/validators/validator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/usr/bin/env python
import csv
import datetime
import glob
import json
import logging
import os
Expand Down Expand Up @@ -345,7 +344,7 @@ def _convert_biovalidator_validation_to_spreadsheet(self):
sheet = convert_metadata_sheet(sheet_json, xls2json_conf)
row = convert_metadata_row(sheet, row_json, xls2json_conf)
column = convert_metadata_attribute(sheet, attribute_json, xls2json_conf)
if row_json is None and attribute_json is None:
if row_json is None and attribute_json is None and sheet is not None:
new_description = f'Sheet "{sheet}" is missing'
elif row_json is None:
if 'have required' not in error['description']:
Expand Down
Binary file modified tests/resources/EVA_Submission_test_fails.xlsx
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
- column: Tax ID
description: Worksheet Project is missing required header Tax ID
- column: ''
description: 'Error loading problem.xlsx: Exception()'
row: ''
sheet: Project
sheet: ''
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
property: /project/childProjects/1
- description: 1234 is not a valid taxonomy code
property: /sample/2/bioSampleObject/characteristics/taxId
- description: Species sheep sapiens does not match taxonomy 9606 (Homo sapiens)
property: /sample/1/bioSampleObject/characteristics/Organism
- description: alias1 present in Analysis not in Samples
property: /sample/analysisAlias
- description: alias_1,alias_2 present in Samples not in Analysis
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,19 @@
should have required property 'bioSampleObject'
/sample/0
should match exactly one schema in oneOf
/sample/3/bioSampleObject/name
must have required property 'name'
must have required property 'name'
must have required property 'name'
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure what's going on here, but this is the output I get from biovalidator...

/sample/3/bioSampleObject/characteristics/organism
must have required property 'organism'
must have required property 'organism'
/sample/3/bioSampleObject/characteristics/Organism
must have required property 'Organism'
/sample/3/bioSampleObject/characteristics/species
must have required property 'species'
/sample/3/bioSampleObject/characteristics/Species
must have required property 'Species'
/sample/3/bioSampleObject/characteristics
must match a schema in anyOf

45 changes: 43 additions & 2 deletions tests/test_semantic_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,9 @@ def test_check_all_taxonomy_codes(self):
]
}
checker = SemanticMetadataChecker(metadata)
with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download:
with patch('eva_sub_cli.semantic_metadata.get_scientific_name_and_common_name') as m_get_sci_name:
# Mock should only be called once per taxonomy code
m_ena_download.side_effect = [True, Exception('problem downloading')]
m_get_sci_name.side_effect = [('Homo sapiens', 'human'), Exception('problem downloading')]
checker.check_all_taxonomy_codes()
self.assertEqual(checker.errors, [
{
Expand All @@ -85,6 +85,47 @@ def test_check_all_taxonomy_codes(self):
}
])

def test_check_all_scientific_names(self):
metadata = {
"sample": [
{
"bioSampleObject": {
"characteristics": {
"taxId": [{"text": "9606"}],
"Organism": [{"text": "homo sapiens"}]
}
}
},
{
"bioSampleObject": {
"characteristics": {
"taxId": [{"text": "9606"}],
"Organism": [{"text": "sheep sapiens"}]
}
}
},
{
"bioSampleObject": {
"characteristics": {
"taxId": [{"text": "1234"}]
}
}
}
]
}
checker = SemanticMetadataChecker(metadata)
checker.taxonomy_valid = {
1234: False,
9606: "Homo sapiens"
}
checker.check_all_scientific_names()
self.assertEqual(checker.errors, [
{
'property': '/sample/1/bioSampleObject/characteristics/Organism',
'description': 'Species sheep sapiens does not match taxonomy 9606 (Homo sapiens)'
}
])

def test_check_existing_biosamples_with_checklist(self):
checker = SemanticMetadataChecker(metadata)
with patch.object(SemanticMetadataChecker, '_get_biosample',
Expand Down
Loading
Loading