Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EVA-3628 Fixes and Improvements #50

Merged
merged 11 commits into from
Aug 13, 2024
Binary file modified eva_sub_cli/etc/EVA_Submission_template.xlsx
Binary file not shown.
5 changes: 3 additions & 2 deletions eva_sub_cli/etc/eva_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@
"Curation",
"Genotyping by sequencing",
"Target sequencing",
"transcriptomics"
"Transcriptomics"
]
},
"referenceGenome": {
Expand Down Expand Up @@ -266,7 +266,8 @@
"runAccessions": {
"type": "array",
"items": {
"type": "string"
"type": "string",
"pattern": "^(E|D|S)RR[0-9]{6,}$"
},
"description": "List of run accessions linking to the raw data used in this analysis if applicable (e.g. SRR576651, SRR576652)"
}
Expand Down
14 changes: 13 additions & 1 deletion eva_sub_cli/etc/spreadsheet2json_conf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ Project:
Tax ID: taxId
optional:
Publication(s): publications
Parent Project(s): parentProject
Parent Project: parentProject
Child Project(s): childProjects
Peer Project(s): peerProjects
Link(s): links
Expand All @@ -44,6 +44,12 @@ Project:
Strain: strain
Breed: breed
Broker: broker
cast:
Publication(s): list
Child Project(s): list
Peer Project(s): list
Link(s): list
Hold Date: date

Analysis:
required:
Expand All @@ -63,6 +69,11 @@ Analysis:
Date: date
Link(s): links
Run Accession(s): runAccessions
cast:
Run Accession(s): list
Imputation: boolean
Phasing: boolean
Date: date

Sample:
header_row: 3
Expand Down Expand Up @@ -115,6 +126,7 @@ Sample:
cast:
Sample ID: string
BioSample Name: string
collection_date: date


Files:
Expand Down
8 changes: 8 additions & 0 deletions eva_sub_cli/executables/xlsx2json.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,14 @@ def cast_value(value, type_name):
if type_name and value is not None:
if type_name == 'string':
return str(value)
if type_name == 'boolean':
return str(value).lower() in ['true', '1', 't', 'y', 'yes']
if type_name == 'list':
# split and remove empty values
return [XlsxParser.trim_value(v)
for v in value.split(',') if v]
if type_name == 'date':
return XlsxParser.serialize(value)
return value

@staticmethod
Expand Down
21 changes: 17 additions & 4 deletions eva_sub_cli/semantic_metadata.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import json
from copy import deepcopy

Expand All @@ -22,7 +23,7 @@
CHARACTERISTICS_KEY = 'characteristics'
TAX_ID_KEY = 'taxId'
ANALYSIS_ALIAS_KEY = 'analysisAlias'

ANALYSIS_RUNS_KEY = 'runAccessions'

def cast_list(l, type_to_cast=str):
for e in l:
Expand All @@ -47,6 +48,7 @@ def check_all(self):
self.check_all_project_accessions()
self.check_all_taxonomy_codes()
self.check_existing_biosamples()
self.check_all_analysis_run_accessions
self.check_analysis_alias_coherence()

def check_all_project_accessions(self):
Expand All @@ -69,12 +71,23 @@ def check_all_taxonomy_codes(self):
self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY][0]['text'],
f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{TAX_ID_KEY}')

def check_all_analysis_run_accessions(self):
"""Check that the Run accession are valid and exist in ENA"""
for idx, analysis in enumerate(self.metadata[ANALYSIS_KEY]):
json_path = f'/{ANALYSIS_KEY}/{idx}/{ANALYSIS_RUNS_KEY}'
if analysis[ANALYSIS_RUNS_KEY]:
for run_acc in analysis[ANALYSIS_RUNS_KEY]:
self.check_accession_in_ena(run_acc, 'Run', json_path)

@retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
def check_project_accession(self, project_acc, json_path):
def check_accession_in_ena(self, ena_accession, accession_type, json_path):
try:
download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{project_acc}')
res = download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{ena_accession}')
except Exception:
self.add_error(json_path, f'{project_acc} does not exist or is private')
self.add_error(json_path, f'{accession_type} {ena_accession} does not exist in ENA or is private')

def check_project_accession(self, project_acc, json_path):
self.check_accession_in_ena(project_acc, 'Project', json_path)

@retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
def check_taxonomy_code(self, taxonomy_code, json_path):
Expand Down
2 changes: 1 addition & 1 deletion eva_sub_cli/validators/docker_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
logger = logging_config.get_logger(__name__)

container_image = 'ebivariation/eva-sub-cli'
container_tag = 'v0.0.1.dev11'
container_tag = 'v0.0.1.dev12'
container_validation_dir = '/opt/vcf_validation'
container_validation_output_dir = 'vcf_validation_output'

Expand Down
11 changes: 8 additions & 3 deletions eva_sub_cli/validators/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,10 +109,12 @@ def set_up_output_dir(self):
os.makedirs(self.output_dir, exist_ok=True)

def clean_up_output_dir(self):
# Move intermediate validation outputs into a subdir
# Move intermediate validation outputs into a subdir except metadata.json
subdir = os.path.join(self.output_dir, 'other_validations')
os.mkdir(subdir)
for file_name in os.listdir(self.output_dir):
if file_name == 'metadata.json':
continue
file_path = os.path.join(self.output_dir, file_name)
if os.path.isfile(file_path):
os.rename(file_path, os.path.join(subdir, file_name))
Expand Down Expand Up @@ -451,6 +453,9 @@ def _convert_biovalidator_validation_to_spreadsheet(self):
self.results['metadata_check']['spreadsheet_errors'] = []
for error in self.results['metadata_check'].get('json_errors', {}):
sheet_json, row_json, attribute_json = self._parse_metadata_property(error['property'])
# There should only be one Project but adding the row back means it's easier for users to find
if sheet_json == 'project' and row_json is None:
row_json = 0
sheet = self._convert_metadata_sheet(sheet_json, xls2json_conf)
row = self._convert_metadata_row(sheet, row_json, xls2json_conf)
column = self._convert_metadata_attribute(sheet, attribute_json, xls2json_conf)
Expand All @@ -460,12 +465,12 @@ def _convert_biovalidator_validation_to_spreadsheet(self):
if 'have required' not in error['description']:
new_description = error['description']
else:
new_description = f'In sheet "{sheet}", column "{column}" is not populated'
new_description = f'Column "{column}" is not populated'
elif attribute_json and column:
if 'have required' not in error['description']:
new_description = error['description']
else:
new_description = f'In sheet "{sheet}", row "{row}", column "{column}" is not populated'
new_description = f'Column "{column}" is not populated'
else:
new_description = error["description"].replace(sheet_json, sheet)
if column is None:
Expand Down
Binary file modified tests/resources/EVA_Submission_test.xlsx
Binary file not shown.
Binary file modified tests/resources/EVA_Submission_test_fails.xlsx
Binary file not shown.

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions tests/test_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,15 +143,15 @@
'metadata_check': {
'spreadsheet_errors': [
{'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
{'sheet': 'Project', 'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Description', 'description': 'In sheet "Project", column "Description" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Tax ID', 'description': 'In sheet "Project", column "Tax ID" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Center', 'description': 'In sheet "Project", column "Center" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Analysis Title', 'description': 'In sheet "Analysis", row "2", column "Analysis Title" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Experiment Type', 'description': 'In sheet "Analysis", row "2", column "Experiment Type" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
{'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}
{'sheet': 'Project', 'row': 2, 'column': 'Project Title', 'description': 'Column "Project Title" is not populated'},
{'sheet': 'Project', 'row': 2, 'column': 'Description', 'description': 'Column "Description" is not populated'},
{'sheet': 'Project', 'row': 2, 'column': 'Tax ID', 'description': 'Column "Tax ID" is not populated'},
{'sheet': 'Project', 'row': 2, 'column': 'Center', 'description': 'Column "Center" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Analysis Title', 'description': 'Column "Analysis Title" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'Column "Description" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Experiment Type', 'description': 'Column "Experiment Type" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'Column "Reference" is not populated'},
{'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'Column "Sample Accession" is not populated'}
],
'spreadsheet_report_path': '/path/to/metadata/metadata_spreadsheet_validation.txt',
}
Expand Down
19 changes: 18 additions & 1 deletion tests/test_semantic_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def test_check_all_project_accessions(self):
m_ena_download.side_effect = [True, True, Exception('problem downloading')]
checker.check_all_project_accessions()
self.assertEqual(checker.errors, [
{'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'}
{'property': '/project/childProjects/1', 'description': 'Project PRJEBNA does not exist in ENA or is private'}
])

def test_check_all_taxonomy_codes(self):
Expand Down Expand Up @@ -152,3 +152,20 @@ def test_check_analysis_alias_coherence(self):
{'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'},
{'property': '/sample/analysisAlias', 'description': 'alias_1,alias_2 present in Samples not in Analysis'}
])

def test_check_all_analysis_run_accessions(self):
metadata = {
"analysis": [
{'runAccessions': ['SRR000001', 'SRR000002']}
]
}
checker = SemanticMetadataChecker(metadata)
checker.check_all_analysis_run_accessions()
assert checker.errors == []

metadata["analysis"].append({'runAccessions': ['SRR00000000001']})

checker.check_all_analysis_run_accessions()
assert checker.errors == [
{'property': '/analysis/1/runAccessions', 'description': 'Run SRR00000000001 does not exist in ENA or is private'}]

36 changes: 18 additions & 18 deletions tests/test_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,19 +82,19 @@ def test__collect_validation_workflow_results_with_metadata_xlsx(self):
{'sheet': 'Project', 'row': '', 'column': 'Tax ID',
'description': 'Worksheet Project is missing required header Tax ID'},
{'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
{'sheet': 'Project', 'row': '', 'column': 'Project Title',
'description': 'In sheet "Project", column "Project Title" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Tax ID',
'description': 'In sheet "Project", column "Tax ID" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Hold Date',
{'sheet': 'Project', 'row': 2, 'column': 'Project Title',
'description': 'Column "Project Title" is not populated'},
{'sheet': 'Project', 'row': 2, 'column': 'Tax ID',
'description': 'Column "Tax ID" is not populated'},
{'sheet': 'Project', 'row': 2, 'column': 'Hold Date',
'description': 'must match format "date"'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Description',
'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
'description': 'Column "Description" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Reference',
'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
'description': 'Column "Reference" is not populated'},
{'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession',
'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Child Project(s)',
'description': 'Column "Sample Accession" is not populated'},
{'sheet': 'Project', 'row': 2, 'column': 'Child Project(s)',
'description': 'PRJEBNA does not exist or is private'},
{'sheet': 'Sample', 'row': 5, 'column': 'Tax Id',
'description': '1234 is not a valid taxonomy code'},
Expand Down Expand Up @@ -244,19 +244,19 @@ def test_convert_biovalidator_validation_to_spreadsheet(self):

assert self.validator.results['metadata_check']['spreadsheet_errors'] == [
{'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
{'sheet': 'Project', 'row': '', 'column': 'Project Title',
'description': 'In sheet "Project", column "Project Title" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Tax ID',
'description': 'In sheet "Project", column "Tax ID" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Hold Date',
{'sheet': 'Project', 'row': 2, 'column': 'Project Title',
'description': 'Column "Project Title" is not populated'},
{'sheet': 'Project', 'row': 2, 'column': 'Tax ID',
'description': 'Column "Tax ID" is not populated'},
{'sheet': 'Project', 'row': 2, 'column': 'Hold Date',
'description': 'must match format "date"'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Description',
'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
'description': 'Column "Description" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Reference',
'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
'description': 'Column "Reference" is not populated'},
{'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession',
'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Child Project(s)',
'description': 'Column "Sample Accession" is not populated'},
{'sheet': 'Project', 'row': 2, 'column': 'Child Project(s)',
'description': 'PRJEBNA does not exist or is private'},
{'sheet': 'Sample', 'row': 5, 'column': 'Tax Id', 'description': '1234 is not a valid taxonomy code'},
{'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias',
Expand Down
13 changes: 10 additions & 3 deletions tests/test_xlsx2json.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,9 @@ def get_expected_json(self):
"description": "An example project for demonstration purposes",
"centre": "University of Example",
"taxId": 9606,
"holdDate": "2023-12-31T00:00:00"
"holdDate": "2023-12-31",
'parentProject': 'PRJEB00001',
'childProjects': ['PRJEB00002', 'PRJEB00003']
},
"analysis": [
{
Expand All @@ -116,7 +118,8 @@ def get_expected_json(self):
"experimentType": "Whole genome sequencing",
"referenceGenome": "GCA_000001405.27",
"referenceFasta": "GCA_000001405.27_fasta.fa",
"platform": "BGISEQ-500"
"platform": "BGISEQ-500",
"imputation": True,
},
{
"analysisTitle": "Variant Detection 2",
Expand All @@ -125,7 +128,8 @@ def get_expected_json(self):
"experimentType": "Whole genome sequencing",
"referenceGenome": "GCA_000001405.27",
"referenceFasta": "GCA_000001405.27_fasta.fa",
"platform": "BGISEQ-500"
"platform": "BGISEQ-500",
'phasing': True,
},
{
"analysisTitle": "Variant Detection 3",
Expand Down Expand Up @@ -179,6 +183,9 @@ def get_expected_json(self):
],
"species": [
{"text": "Lemur catta"}
],
'collectionDate': [
{'text': '2021-03-12'}
]
}
}
Expand Down
Loading