Skip to content

Commit

Permalink
Merge pull request #50 from tcezard/EVA3629-Fixes_and_improvements
Browse files Browse the repository at this point in the history
EVA-3628 Fixes and Improvements
  • Loading branch information
tcezard authored Aug 13, 2024
2 parents ff2ed33 + fa7c32c commit fda2ec2
Show file tree
Hide file tree
Showing 14 changed files with 106 additions and 43 deletions.
Binary file modified eva_sub_cli/etc/EVA_Submission_template.xlsx
Binary file not shown.
5 changes: 3 additions & 2 deletions eva_sub_cli/etc/eva_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@
"Curation",
"Genotyping by sequencing",
"Target sequencing",
"transcriptomics"
"Transcriptomics"
]
},
"referenceGenome": {
Expand Down Expand Up @@ -266,7 +266,8 @@
"runAccessions": {
"type": "array",
"items": {
"type": "string"
"type": "string",
"pattern": "^(E|D|S)RR[0-9]{6,}$"
},
"description": "List of run accessions linking to the raw data used in this analysis if applicable (e.g. SRR576651, SRR576652)"
}
Expand Down
14 changes: 13 additions & 1 deletion eva_sub_cli/etc/spreadsheet2json_conf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ Project:
Tax ID: taxId
optional:
Publication(s): publications
Parent Project(s): parentProject
Parent Project: parentProject
Child Project(s): childProjects
Peer Project(s): peerProjects
Link(s): links
Expand All @@ -44,6 +44,12 @@ Project:
Strain: strain
Breed: breed
Broker: broker
cast:
Publication(s): list
Child Project(s): list
Peer Project(s): list
Link(s): list
Hold Date: date

Analysis:
required:
Expand All @@ -63,6 +69,11 @@ Analysis:
Date: date
Link(s): links
Run Accession(s): runAccessions
cast:
Run Accession(s): list
Imputation: boolean
Phasing: boolean
Date: date

Sample:
header_row: 3
Expand Down Expand Up @@ -115,6 +126,7 @@ Sample:
cast:
Sample ID: string
BioSample Name: string
collection_date: date


Files:
Expand Down
8 changes: 8 additions & 0 deletions eva_sub_cli/executables/xlsx2json.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,14 @@ def cast_value(value, type_name):
if type_name and value is not None:
if type_name == 'string':
return str(value)
if type_name == 'boolean':
return str(value).lower() in ['true', '1', 't', 'y', 'yes']
if type_name == 'list':
# split and remove empty values
return [XlsxParser.trim_value(v)
for v in value.split(',') if v]
if type_name == 'date':
return XlsxParser.serialize(value)
return value

@staticmethod
Expand Down
21 changes: 17 additions & 4 deletions eva_sub_cli/semantic_metadata.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import json
from copy import deepcopy

Expand All @@ -22,7 +23,7 @@
CHARACTERISTICS_KEY = 'characteristics'
TAX_ID_KEY = 'taxId'
ANALYSIS_ALIAS_KEY = 'analysisAlias'

ANALYSIS_RUNS_KEY = 'runAccessions'

def cast_list(l, type_to_cast=str):
for e in l:
Expand All @@ -47,6 +48,7 @@ def check_all(self):
self.check_all_project_accessions()
self.check_all_taxonomy_codes()
self.check_existing_biosamples()
self.check_all_analysis_run_accessions
self.check_analysis_alias_coherence()

def check_all_project_accessions(self):
Expand All @@ -69,12 +71,23 @@ def check_all_taxonomy_codes(self):
self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY][0]['text'],
f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{TAX_ID_KEY}')

def check_all_analysis_run_accessions(self):
"""Check that the Run accession are valid and exist in ENA"""
for idx, analysis in enumerate(self.metadata[ANALYSIS_KEY]):
json_path = f'/{ANALYSIS_KEY}/{idx}/{ANALYSIS_RUNS_KEY}'
if analysis[ANALYSIS_RUNS_KEY]:
for run_acc in analysis[ANALYSIS_RUNS_KEY]:
self.check_accession_in_ena(run_acc, 'Run', json_path)

@retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
def check_project_accession(self, project_acc, json_path):
def check_accession_in_ena(self, ena_accession, accession_type, json_path):
try:
download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{project_acc}')
res = download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{ena_accession}')
except Exception:
self.add_error(json_path, f'{project_acc} does not exist or is private')
self.add_error(json_path, f'{accession_type} {ena_accession} does not exist in ENA or is private')

def check_project_accession(self, project_acc, json_path):
self.check_accession_in_ena(project_acc, 'Project', json_path)

@retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
def check_taxonomy_code(self, taxonomy_code, json_path):
Expand Down
2 changes: 1 addition & 1 deletion eva_sub_cli/validators/docker_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
logger = logging_config.get_logger(__name__)

container_image = 'ebivariation/eva-sub-cli'
container_tag = 'v0.0.1.dev11'
container_tag = 'v0.0.1.dev12'
container_validation_dir = '/opt/vcf_validation'
container_validation_output_dir = 'vcf_validation_output'

Expand Down
11 changes: 8 additions & 3 deletions eva_sub_cli/validators/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,10 +109,12 @@ def set_up_output_dir(self):
os.makedirs(self.output_dir, exist_ok=True)

def clean_up_output_dir(self):
# Move intermediate validation outputs into a subdir
# Move intermediate validation outputs into a subdir except metadata.json
subdir = os.path.join(self.output_dir, 'other_validations')
os.mkdir(subdir)
for file_name in os.listdir(self.output_dir):
if file_name == 'metadata.json':
continue
file_path = os.path.join(self.output_dir, file_name)
if os.path.isfile(file_path):
os.rename(file_path, os.path.join(subdir, file_name))
Expand Down Expand Up @@ -451,6 +453,9 @@ def _convert_biovalidator_validation_to_spreadsheet(self):
self.results['metadata_check']['spreadsheet_errors'] = []
for error in self.results['metadata_check'].get('json_errors', {}):
sheet_json, row_json, attribute_json = self._parse_metadata_property(error['property'])
# There should only be one Project but adding the row back means it's easier for users to find
if sheet_json == 'project' and row_json is None:
row_json = 0
sheet = self._convert_metadata_sheet(sheet_json, xls2json_conf)
row = self._convert_metadata_row(sheet, row_json, xls2json_conf)
column = self._convert_metadata_attribute(sheet, attribute_json, xls2json_conf)
Expand All @@ -460,12 +465,12 @@ def _convert_biovalidator_validation_to_spreadsheet(self):
if 'have required' not in error['description']:
new_description = error['description']
else:
new_description = f'In sheet "{sheet}", column "{column}" is not populated'
new_description = f'Column "{column}" is not populated'
elif attribute_json and column:
if 'have required' not in error['description']:
new_description = error['description']
else:
new_description = f'In sheet "{sheet}", row "{row}", column "{column}" is not populated'
new_description = f'Column "{column}" is not populated'
else:
new_description = error["description"].replace(sheet_json, sheet)
if column is None:
Expand Down
Binary file modified tests/resources/EVA_Submission_test.xlsx
Binary file not shown.
Binary file modified tests/resources/EVA_Submission_test_fails.xlsx
Binary file not shown.

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions tests/test_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,15 +143,15 @@
'metadata_check': {
'spreadsheet_errors': [
{'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
{'sheet': 'Project', 'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Description', 'description': 'In sheet "Project", column "Description" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Tax ID', 'description': 'In sheet "Project", column "Tax ID" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Center', 'description': 'In sheet "Project", column "Center" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Analysis Title', 'description': 'In sheet "Analysis", row "2", column "Analysis Title" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Experiment Type', 'description': 'In sheet "Analysis", row "2", column "Experiment Type" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
{'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}
{'sheet': 'Project', 'row': 2, 'column': 'Project Title', 'description': 'Column "Project Title" is not populated'},
{'sheet': 'Project', 'row': 2, 'column': 'Description', 'description': 'Column "Description" is not populated'},
{'sheet': 'Project', 'row': 2, 'column': 'Tax ID', 'description': 'Column "Tax ID" is not populated'},
{'sheet': 'Project', 'row': 2, 'column': 'Center', 'description': 'Column "Center" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Analysis Title', 'description': 'Column "Analysis Title" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'Column "Description" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Experiment Type', 'description': 'Column "Experiment Type" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'Column "Reference" is not populated'},
{'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'Column "Sample Accession" is not populated'}
],
'spreadsheet_report_path': '/path/to/metadata/metadata_spreadsheet_validation.txt',
}
Expand Down
19 changes: 18 additions & 1 deletion tests/test_semantic_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def test_check_all_project_accessions(self):
m_ena_download.side_effect = [True, True, Exception('problem downloading')]
checker.check_all_project_accessions()
self.assertEqual(checker.errors, [
{'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'}
{'property': '/project/childProjects/1', 'description': 'Project PRJEBNA does not exist in ENA or is private'}
])

def test_check_all_taxonomy_codes(self):
Expand Down Expand Up @@ -152,3 +152,20 @@ def test_check_analysis_alias_coherence(self):
{'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'},
{'property': '/sample/analysisAlias', 'description': 'alias_1,alias_2 present in Samples not in Analysis'}
])

def test_check_all_analysis_run_accessions(self):
metadata = {
"analysis": [
{'runAccessions': ['SRR000001', 'SRR000002']}
]
}
checker = SemanticMetadataChecker(metadata)
checker.check_all_analysis_run_accessions()
assert checker.errors == []

metadata["analysis"].append({'runAccessions': ['SRR00000000001']})

checker.check_all_analysis_run_accessions()
assert checker.errors == [
{'property': '/analysis/1/runAccessions', 'description': 'Run SRR00000000001 does not exist in ENA or is private'}]

36 changes: 18 additions & 18 deletions tests/test_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,19 +82,19 @@ def test__collect_validation_workflow_results_with_metadata_xlsx(self):
{'sheet': 'Project', 'row': '', 'column': 'Tax ID',
'description': 'Worksheet Project is missing required header Tax ID'},
{'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
{'sheet': 'Project', 'row': '', 'column': 'Project Title',
'description': 'In sheet "Project", column "Project Title" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Tax ID',
'description': 'In sheet "Project", column "Tax ID" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Hold Date',
{'sheet': 'Project', 'row': 2, 'column': 'Project Title',
'description': 'Column "Project Title" is not populated'},
{'sheet': 'Project', 'row': 2, 'column': 'Tax ID',
'description': 'Column "Tax ID" is not populated'},
{'sheet': 'Project', 'row': 2, 'column': 'Hold Date',
'description': 'must match format "date"'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Description',
'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
'description': 'Column "Description" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Reference',
'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
'description': 'Column "Reference" is not populated'},
{'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession',
'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Child Project(s)',
'description': 'Column "Sample Accession" is not populated'},
{'sheet': 'Project', 'row': 2, 'column': 'Child Project(s)',
'description': 'PRJEBNA does not exist or is private'},
{'sheet': 'Sample', 'row': 5, 'column': 'Tax Id',
'description': '1234 is not a valid taxonomy code'},
Expand Down Expand Up @@ -244,19 +244,19 @@ def test_convert_biovalidator_validation_to_spreadsheet(self):

assert self.validator.results['metadata_check']['spreadsheet_errors'] == [
{'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
{'sheet': 'Project', 'row': '', 'column': 'Project Title',
'description': 'In sheet "Project", column "Project Title" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Tax ID',
'description': 'In sheet "Project", column "Tax ID" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Hold Date',
{'sheet': 'Project', 'row': 2, 'column': 'Project Title',
'description': 'Column "Project Title" is not populated'},
{'sheet': 'Project', 'row': 2, 'column': 'Tax ID',
'description': 'Column "Tax ID" is not populated'},
{'sheet': 'Project', 'row': 2, 'column': 'Hold Date',
'description': 'must match format "date"'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Description',
'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
'description': 'Column "Description" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Reference',
'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
'description': 'Column "Reference" is not populated'},
{'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession',
'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Child Project(s)',
'description': 'Column "Sample Accession" is not populated'},
{'sheet': 'Project', 'row': 2, 'column': 'Child Project(s)',
'description': 'PRJEBNA does not exist or is private'},
{'sheet': 'Sample', 'row': 5, 'column': 'Tax Id', 'description': '1234 is not a valid taxonomy code'},
{'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias',
Expand Down
13 changes: 10 additions & 3 deletions tests/test_xlsx2json.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,9 @@ def get_expected_json(self):
"description": "An example project for demonstration purposes",
"centre": "University of Example",
"taxId": 9606,
"holdDate": "2023-12-31T00:00:00"
"holdDate": "2023-12-31",
'parentProject': 'PRJEB00001',
'childProjects': ['PRJEB00002', 'PRJEB00003']
},
"analysis": [
{
Expand All @@ -116,7 +118,8 @@ def get_expected_json(self):
"experimentType": "Whole genome sequencing",
"referenceGenome": "GCA_000001405.27",
"referenceFasta": "GCA_000001405.27_fasta.fa",
"platform": "BGISEQ-500"
"platform": "BGISEQ-500",
"imputation": True,
},
{
"analysisTitle": "Variant Detection 2",
Expand All @@ -125,7 +128,8 @@ def get_expected_json(self):
"experimentType": "Whole genome sequencing",
"referenceGenome": "GCA_000001405.27",
"referenceFasta": "GCA_000001405.27_fasta.fa",
"platform": "BGISEQ-500"
"platform": "BGISEQ-500",
'phasing': True,
},
{
"analysisTitle": "Variant Detection 3",
Expand Down Expand Up @@ -179,6 +183,9 @@ def get_expected_json(self):
],
"species": [
{"text": "Lemur catta"}
],
'collectionDate': [
{'text': '2021-03-12'}
]
}
}
Expand Down

0 comments on commit fda2ec2

Please sign in to comment.