Skip to content

Commit

Permalink
Merge pull request #52 from apriltuesday/EVA-3636
Browse files Browse the repository at this point in the history
EVA-3636: Fixes for semantic metadata validation
  • Loading branch information
apriltuesday authored Aug 21, 2024
2 parents 9b9cae0 + 9c04ef9 commit 5879d11
Show file tree
Hide file tree
Showing 8 changed files with 35 additions and 24 deletions.
2 changes: 1 addition & 1 deletion eva_sub_cli/etc/eva_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@
"$ref": "eva-biosamples.json"
},
{
"$ref": "https://www.ebi.ac.uk/biosamples/schemas/certification/biosamples-minimal.json"
"$ref": "https://www.ebi.ac.uk/biosamples/schema/store/registry/schemas/ERC000011"
},
{
"$ref": "https://www.ebi.ac.uk/biosamples/schemas/certification/plant-miappe.json"
Expand Down
2 changes: 1 addition & 1 deletion eva_sub_cli/executables/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def main():
vcf_group = argparser.add_argument_group(
'Input VCF and assembly',
"Specify the VCF files and associated assembly with the following options. If you used different assemblies "
"for different VCF files then use --vcf_file_mapping"
"for different VCF files then include these in the metadata file."
)
vcf_group.add_argument('--vcf_files', nargs='+', help="One or several vcf files to validate")
vcf_group.add_argument('--reference_fasta',
Expand Down
1 change: 1 addition & 0 deletions eva_sub_cli/nextflow/validation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ workflow {
if (metadata_json) {
// Metadata checks and concordance checks
metadata_json_validation(metadata_json)
metadata_semantic_check(metadata_json)
sample_name_concordance(metadata_json, vcf_files.collect())
fasta_to_vcfs = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
.splitCsv(header:true)
Expand Down
9 changes: 6 additions & 3 deletions eva_sub_cli/semantic_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ def check_all_project_accessions(self):
def check_all_taxonomy_codes(self):
"""Check that taxonomy IDs are valid according to ENA."""
project = self.metadata[PROJECT_KEY]
self.check_taxonomy_code(project[TAX_ID_KEY], f'/{PROJECT_KEY}/{TAX_ID_KEY}')
if TAX_ID_KEY in project:
self.check_taxonomy_code(project[TAX_ID_KEY], f'/{PROJECT_KEY}/{TAX_ID_KEY}')
# Check sample taxonomies for novel samples
for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
if BIOSAMPLE_OBJECT_KEY in sample:
Expand Down Expand Up @@ -145,8 +146,10 @@ def _validate_biosample_against_checklist(self, sample_data, json_path, accessio
sample_data['characteristics']['checklist'].append({'text': self.sample_checklist})
text = self.communicator.follows_link('samples', join_url='validate', method='POST', json=sample_data,
text_only=True)
if text.startswith('Checklist validation failed: Sample validation failed: '):
for error_dict in json.loads(text[len('Checklist validation failed: Sample validation failed: '):]):
response = json.loads(text)
# Successful response returns the sample object, error response has a list of error objects.
if isinstance(response, list):
for error_dict in response:
if accession:
self.add_error(json_path, f'Existing sample {accession} {error_dict["errors"][0]}')
else:
Expand Down
2 changes: 1 addition & 1 deletion eva_sub_cli/validators/docker_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
logger = logging_config.get_logger(__name__)

container_image = 'ebivariation/eva-sub-cli'
container_tag = 'v0.0.1.dev13'
container_tag = 'v0.0.1.dev14'
container_validation_dir = '/opt/vcf_validation'
container_validation_output_dir = 'vcf_validation_output'

Expand Down
10 changes: 9 additions & 1 deletion tests/test_docker_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def setUp(self):
"parentProject": "PRJ_INVALID"
},
"sample": [
{"analysisAlias": ["AA"], "sampleInVCF": "HG00096", "bioSampleAccession": "SAME0000096"}
{"analysisAlias": ["AA"], "sampleInVCF": "HG00096", "bioSampleAccession": "SAME123"}
],
"analysis": [
{"analysisAlias": "AA"}
Expand Down Expand Up @@ -138,6 +138,14 @@ def test_validate(self):
metadata_val_lines = {l.strip() for l in open_file.readlines()}
assert 'must match pattern "^PRJ(EB|NA)\\d+$"' in metadata_val_lines

# Check semantic metadata errors
semantic_yaml_file = os.path.join(self.validator.output_dir, 'other_validations', 'metadata_semantic_check.yml')
self.assertTrue(os.path.isfile(semantic_yaml_file))
with open(semantic_yaml_file) as open_yaml:
semantic_output = yaml.safe_load(open_yaml)
assert semantic_output[1] == {'description': 'SAME123 does not exist or is private',
'property': '/sample/0/bioSampleAccession'}

def test_validate_from_excel(self):
self.validator_from_excel.validate()
self.assertTrue(os.path.isfile(self.validator_from_excel._sample_check_yaml))
31 changes: 15 additions & 16 deletions tests/test_semantic_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,19 @@
'characteristics': {
'organism': [{'text': 'Viridiplantae'}],
'collection date': [{'text': '2018'}],
'geo loc name': [{'text': 'France: Montferrier-sur-Lez'}]
'geographic location (country and/or sea)': [{'text': 'France'}]
}
}
invalid_sample = {
'accession': 'SAME00003',
'name': 'sample3',
'characteristics': {
'organism': [{'text': 'Viridiplantae'}]
'organism': [{'text': 'Viridiplantae'}],
'geographic location (country and/or sea)': [{'text': 'France: Montferrier-sur-Lez'}]
}
}


class TestSemanticMetadata(TestCase):

def test_check_all_project_accessions(self):
Expand Down Expand Up @@ -88,16 +90,18 @@ def test_check_existing_biosamples_with_checklist(self):
with patch.object(SemanticMetadataChecker, '_get_biosample',
side_effect=[valid_sample, ValueError, invalid_sample]) as m_get_sample:
checker.check_existing_biosamples()
self.assertEqual(checker.errors, [
{'property': '/sample/0/bioSampleAccession',
'description': "Existing sample SAME00001 should have required property 'geographic location (country and/or sea)'"},
{'property': '/sample/1/bioSampleAccession',
'description': 'SAME00002 does not exist or is private'},
self.assertEqual(
checker.errors[0],
{'property': '/sample/1/bioSampleAccession', 'description': 'SAME00002 does not exist or is private'}
)
self.assertEqual(
checker.errors[1],
{'property': '/sample/2/bioSampleAccession',
'description': "Existing sample SAME00003 should have required property 'collection date'"},
{'property': '/sample/2/bioSampleAccession',
'description': "Existing sample SAME00003 should have required property 'geographic location (country and/or sea)'"}
])
'description': "Existing sample SAME00003 should have required property 'collection date'"}
)
# Final error message lists all possible geographic locations
self.assertTrue(checker.errors[2]['description'].startswith(
'Existing sample SAME00003 should be equal to one of the allowed values:'))

def test_check_existing_biosamples(self):
checker = SemanticMetadataChecker(metadata, sample_checklist=None)
Expand All @@ -112,10 +116,6 @@ def test_check_existing_biosamples(self):
{
'property': '/sample/2/bioSampleAccession',
'description': 'Existing sample SAME00003 does not have a valid collection date'
},
{
'property': '/sample/2/bioSampleAccession',
'description': 'Existing sample SAME00003 does not have a valid geographic location'
}
])

Expand Down Expand Up @@ -168,4 +168,3 @@ def test_check_all_analysis_run_accessions(self):
checker.check_all_analysis_run_accessions()
assert checker.errors == [
{'property': '/analysis/1/runAccessions', 'description': 'Run SRR00000000001 does not exist in ENA or is private'}]

2 changes: 1 addition & 1 deletion tests/test_submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def test_submit_with_config(self):
assert os.path.exists(self.config_file)
# assert backup file is created
assert os.path.exists(f"{self.config_file}.1")
with (open(self.config_file, 'r') as f):
with open(self.config_file, 'r') as f:
sub_config_data = yaml.safe_load(f)
assert sub_config_data[SUB_CLI_CONFIG_KEY_SUBMISSION_ID] == "mock_submission_id"
assert sub_config_data[SUB_CLI_CONFIG_KEY_SUBMISSION_UPLOAD_URL] == "directory to use for upload"
Expand Down

0 comments on commit 5879d11

Please sign in to comment.