Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EVA-3636: Fixes for semantic metadata validation #52

Merged
merged 3 commits into from
Aug 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion eva_sub_cli/etc/eva_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@
"$ref": "eva-biosamples.json"
},
{
"$ref": "https://www.ebi.ac.uk/biosamples/schemas/certification/biosamples-minimal.json"
"$ref": "https://www.ebi.ac.uk/biosamples/schema/store/registry/schemas/ERC000011"
},
{
"$ref": "https://www.ebi.ac.uk/biosamples/schemas/certification/plant-miappe.json"
Expand Down
2 changes: 1 addition & 1 deletion eva_sub_cli/executables/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def main():
vcf_group = argparser.add_argument_group(
'Input VCF and assembly',
"Specify the VCF files and associated assembly with the following options. If you used different assemblies "
"for different VCF files then use --vcf_file_mapping"
"for different VCF files then include these in the metadata file."
)
vcf_group.add_argument('--vcf_files', nargs='+', help="One or several vcf files to validate")
vcf_group.add_argument('--reference_fasta',
Expand Down
1 change: 1 addition & 0 deletions eva_sub_cli/nextflow/validation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ workflow {
if (metadata_json) {
// Metadata checks and concordance checks
metadata_json_validation(metadata_json)
metadata_semantic_check(metadata_json)
sample_name_concordance(metadata_json, vcf_files.collect())
fasta_to_vcfs = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
.splitCsv(header:true)
Expand Down
9 changes: 6 additions & 3 deletions eva_sub_cli/semantic_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ def check_all_project_accessions(self):
def check_all_taxonomy_codes(self):
"""Check that taxonomy IDs are valid according to ENA."""
project = self.metadata[PROJECT_KEY]
self.check_taxonomy_code(project[TAX_ID_KEY], f'/{PROJECT_KEY}/{TAX_ID_KEY}')
if TAX_ID_KEY in project:
self.check_taxonomy_code(project[TAX_ID_KEY], f'/{PROJECT_KEY}/{TAX_ID_KEY}')
# Check sample taxonomies for novel samples
for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
if BIOSAMPLE_OBJECT_KEY in sample:
Expand Down Expand Up @@ -145,8 +146,10 @@ def _validate_biosample_against_checklist(self, sample_data, json_path, accessio
sample_data['characteristics']['checklist'].append({'text': self.sample_checklist})
text = self.communicator.follows_link('samples', join_url='validate', method='POST', json=sample_data,
text_only=True)
if text.startswith('Checklist validation failed: Sample validation failed: '):
for error_dict in json.loads(text[len('Checklist validation failed: Sample validation failed: '):]):
response = json.loads(text)
# Successful response returns the sample object, error response has a list of error objects.
if isinstance(response, list):
for error_dict in response:
if accession:
self.add_error(json_path, f'Existing sample {accession} {error_dict["errors"][0]}')
else:
Expand Down
2 changes: 1 addition & 1 deletion eva_sub_cli/validators/docker_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
logger = logging_config.get_logger(__name__)

container_image = 'ebivariation/eva-sub-cli'
container_tag = 'v0.0.1.dev13'
container_tag = 'v0.0.1.dev14'
container_validation_dir = '/opt/vcf_validation'
container_validation_output_dir = 'vcf_validation_output'

Expand Down
10 changes: 9 additions & 1 deletion tests/test_docker_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def setUp(self):
"parentProject": "PRJ_INVALID"
},
"sample": [
{"analysisAlias": ["AA"], "sampleInVCF": "HG00096", "bioSampleAccession": "SAME0000096"}
{"analysisAlias": ["AA"], "sampleInVCF": "HG00096", "bioSampleAccession": "SAME123"}
],
"analysis": [
{"analysisAlias": "AA"}
Expand Down Expand Up @@ -138,6 +138,14 @@ def test_validate(self):
metadata_val_lines = {l.strip() for l in open_file.readlines()}
assert 'must match pattern "^PRJ(EB|NA)\\d+$"' in metadata_val_lines

# Check semantic metadata errors
semantic_yaml_file = os.path.join(self.validator.output_dir, 'other_validations', 'metadata_semantic_check.yml')
self.assertTrue(os.path.isfile(semantic_yaml_file))
with open(semantic_yaml_file) as open_yaml:
semantic_output = yaml.safe_load(open_yaml)
assert semantic_output[1] == {'description': 'SAME123 does not exist or is private',
'property': '/sample/0/bioSampleAccession'}

def test_validate_from_excel(self):
self.validator_from_excel.validate()
self.assertTrue(os.path.isfile(self.validator_from_excel._sample_check_yaml))
31 changes: 15 additions & 16 deletions tests/test_semantic_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,19 @@
'characteristics': {
'organism': [{'text': 'Viridiplantae'}],
'collection date': [{'text': '2018'}],
'geo loc name': [{'text': 'France: Montferrier-sur-Lez'}]
'geographic location (country and/or sea)': [{'text': 'France'}]
}
}
invalid_sample = {
'accession': 'SAME00003',
'name': 'sample3',
'characteristics': {
'organism': [{'text': 'Viridiplantae'}]
'organism': [{'text': 'Viridiplantae'}],
'geographic location (country and/or sea)': [{'text': 'France: Montferrier-sur-Lez'}]
}
}


class TestSemanticMetadata(TestCase):

def test_check_all_project_accessions(self):
Expand Down Expand Up @@ -88,16 +90,18 @@ def test_check_existing_biosamples_with_checklist(self):
with patch.object(SemanticMetadataChecker, '_get_biosample',
side_effect=[valid_sample, ValueError, invalid_sample]) as m_get_sample:
checker.check_existing_biosamples()
self.assertEqual(checker.errors, [
{'property': '/sample/0/bioSampleAccession',
'description': "Existing sample SAME00001 should have required property 'geographic location (country and/or sea)'"},
{'property': '/sample/1/bioSampleAccession',
'description': 'SAME00002 does not exist or is private'},
self.assertEqual(
checker.errors[0],
{'property': '/sample/1/bioSampleAccession', 'description': 'SAME00002 does not exist or is private'}
)
self.assertEqual(
checker.errors[1],
{'property': '/sample/2/bioSampleAccession',
'description': "Existing sample SAME00003 should have required property 'collection date'"},
{'property': '/sample/2/bioSampleAccession',
'description': "Existing sample SAME00003 should have required property 'geographic location (country and/or sea)'"}
])
'description': "Existing sample SAME00003 should have required property 'collection date'"}
)
# Final error message lists all possible geographic locations
self.assertTrue(checker.errors[2]['description'].startswith(
'Existing sample SAME00003 should be equal to one of the allowed values:'))

def test_check_existing_biosamples(self):
checker = SemanticMetadataChecker(metadata, sample_checklist=None)
Expand All @@ -112,10 +116,6 @@ def test_check_existing_biosamples(self):
{
'property': '/sample/2/bioSampleAccession',
'description': 'Existing sample SAME00003 does not have a valid collection date'
},
{
'property': '/sample/2/bioSampleAccession',
'description': 'Existing sample SAME00003 does not have a valid geographic location'
}
])

Expand Down Expand Up @@ -168,4 +168,3 @@ def test_check_all_analysis_run_accessions(self):
checker.check_all_analysis_run_accessions()
assert checker.errors == [
{'property': '/analysis/1/runAccessions', 'description': 'Run SRR00000000001 does not exist in ENA or is private'}]

2 changes: 1 addition & 1 deletion tests/test_submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def test_submit_with_config(self):
assert os.path.exists(self.config_file)
# assert backup file is created
assert os.path.exists(f"{self.config_file}.1")
with (open(self.config_file, 'r') as f):
with open(self.config_file, 'r') as f:
sub_config_data = yaml.safe_load(f)
assert sub_config_data[SUB_CLI_CONFIG_KEY_SUBMISSION_ID] == "mock_submission_id"
assert sub_config_data[SUB_CLI_CONFIG_KEY_SUBMISSION_UPLOAD_URL] == "directory to use for upload"
Expand Down
Loading