diff --git a/ena-submission/config/defaults.yaml b/ena-submission/config/defaults.yaml index db2f268ed..429de011a 100644 --- a/ena-submission/config/defaults.yaml +++ b/ena-submission/config/defaults.yaml @@ -6,3 +6,46 @@ unique_project_suffix: Loculus ena_submission_username: fake-user ena_submission_password: fake-password ena_submission_url: https://wwwdev.ebi.ac.uk/ena/submit/drop-box/submit +metadata_mapping: + 'subject exposure': + loculus_fields: [exposure_event] + 'type exposure': + loculus_fields: [exposure_event] + hospitalisation: + loculus_fields: [host_health_state] + function: match + args: [Hospital] + 'illness symptoms': + loculus_fields: [signs_and_symptoms] + 'collection date': + loculus_fields: [sample_collection_date] + 'geographic location (country and/or sea)': + loculus_fields: [geo_loc_country] + 'geographic location (region and locality)': + loculus_fields: [geo_loc_admin_1] + 'sample capture status': + loculus_fields: [purpose_of_sampling] + 'host disease outcome': + loculus_fields: [host_health_outcome] + 'host common name': + loculus_fields: [host_name_common] + 'host age': + loculus_fields: [host_age] + 'host health state': + loculus_fields: [host_health_state] + 'host sex': + loculus_fields: [host_gender] + 'host scientific name': + loculus_fields: [host_name_scientific] + 'isolate': + loculus_fields: [specimen_collector_sample_id] + 'collecting institution': + loculus_fields: [sequenced_by_organization, author_affiliations] + 'receipt date': + loculus_fields: [sample_received_date] + 'isolation source host-associated': + loculus_fields: [anatomical_material, anatomical_part, body_product] + 'isolation source non-host-associated': + loculus_fields: [environmental_site, environmental_material] + 'authors': + loculus_fields: [authors] diff --git a/ena-submission/scripts/create_sample.py b/ena-submission/scripts/create_sample.py index fcb1a96e0..963cb2348 100644 --- a/ena-submission/scripts/create_sample.py +++ 
b/ena-submission/scripts/create_sample.py @@ -1,5 +1,6 @@ import json import logging +import re from dataclasses import dataclass from typing import Dict, List @@ -59,12 +60,35 @@ class Config: def construct_sample_set_object(config, organism_metadata, sample_metadata, center_name, row): list_sample_attributes = [] for field in config.metadata_map: - loculus_metadata_field = config.metadata_map[field] - if sample_metadata.get(loculus_metadata_field, None): + loculus_metadata_field_names = config.metadata_map[field]["loculus_fields"] + loculus_metadata_field_values = [ + sample_metadata.get(metadata, None) for metadata in loculus_metadata_field_names + ] + if "function" in config.metadata_map[field] and "args" in config.metadata_map[field]: + function = config.metadata_map[field]["function"] + args = config.metadata_map[field]["args"] + if function == "match" and (len(loculus_metadata_field_names) == len(args)): + value = True + for i in range(len(loculus_metadata_field_names)): + if not re.match( + args[i], + str(sample_metadata.get(loculus_metadata_field_names[i]) or ""), + re.IGNORECASE, + ): + value = False + break + else: + logging.warning( + f"Could not calculate function {function} with args: {args} for {row['accession']}" + ) + continue + else: + value = ";".join([metadata for metadata in loculus_metadata_field_values if metadata]) + if value: list_sample_attributes.append( SampleAttribute( tag=field, - value=sample_metadata[loculus_metadata_field], + value=value, ) ) sample_type = SampleType( diff --git a/ena-submission/scripts/test_ena_submission.py b/ena-submission/scripts/test_ena_submission.py index e8dd09111..24486cffb 100644 --- a/ena-submission/scripts/test_ena_submission.py +++ b/ena-submission/scripts/test_ena_submission.py @@ -1,7 +1,7 @@ import unittest -import xmltodict from unittest import mock +import xmltodict from create_sample import construct_sample_set_object from ena_submission_helper import ( SubmissionConfig, @@ -54,18 +54,34 @@ + + 
hospitalisation + true + geographic location (country and/or sea) China - collector name - I. Kurane, M. Saijo, Q. Tang, S. Morikawa, T. Qing, Z. Xinqin + geographic location (region and locality) + Xinjiang province + + + host health state + Hospital care required isolate 66019 + + collecting institution + Special Pathogens Laboratory; 4-7-1 Gakuen, Musashimurayama, Tokyo 208-0011 + + + authors + I. Kurane, M. Saijo, Q. Tang, S. Morikawa, T. Qing, Z. Xinqin + @@ -144,20 +160,36 @@ def test_sample_set_construction(self): center_name = "Fake center name" config = mock.Mock() config.metadata_map = { - "subject exposure": "exposure_event", - "illness symptoms": "signs_and_symptoms", - "collection date": "sample_collection_date", - "geographic location (country and/or sea)": "geo_loc_country", - "host disease outcome": "host_health_outcome", - "host common name": "host_name_common", - "host age": "host_age", - "host sex": "host_gender", - "lab_host": "is_lab_host", - "host scientific name": "host_name_scientific", - "collector name": "authors", - "receipt date": "sample_received_date", - "isolate": "specimen_collector_sample_id", - "host behaviour": "host_role", + "subject exposure": {"loculus_fields": ["exposure_event"]}, + "type exposure": {"loculus_fields": ["exposure_event"]}, + "hospitalisation": { + "loculus_fields": ["host_health_state"], + "function": "match", + "args": ["Hospital"], + }, + "illness symptoms": {"loculus_fields": ["signs_and_symptoms"]}, + "collection date": {"loculus_fields": ["sample_collection_date"]}, + "geographic location (country and/or sea)": {"loculus_fields": ["geo_loc_country"]}, + "geographic location (region and locality)": {"loculus_fields": ["geo_loc_admin_1"]}, + "sample capture status": {"loculus_fields": ["purpose_of_sampling"]}, + "host disease outcome": {"loculus_fields": ["host_health_outcome"]}, + "host common name": {"loculus_fields": ["host_name_common"]}, + "host age": {"loculus_fields": ["host_age"]}, + "host health 
state": {"loculus_fields": ["host_health_state"]}, + "host sex": {"loculus_fields": ["host_gender"]}, + "host scientific name": {"loculus_fields": ["host_name_scientific"]}, + "isolate": {"loculus_fields": ["specimen_collector_sample_id"]}, + "collecting institution": { + "loculus_fields": ["sequenced_by_organization", "author_affiliations"] + }, + "receipt date": {"loculus_fields": ["received date"]}, + "isolation source host-associated": { + "loculus_fields": ["anatomical material", "anatomical part", "body product"] + }, + "isolation source non-host-associated": { + "loculus_fields": ["environmental site", "environmental material"] + }, + "authors": {"loculus_fields": ["authors"]}, } config.db_name = "Loculus" config.unique_project_suffix = "test suffix" @@ -209,7 +241,7 @@ def test_sample_set_construction(self): "collection_device": None, "collection_method": None, "depth_of_coverage": None, - "host_health_state": None, + "host_health_state": "Hospital care required", "ncbi_release_date": "2002-02-07", "sra_run_accession": None, "environmental_site": None, @@ -307,6 +339,7 @@ def test_sample_set_construction(self): sample_set = construct_sample_set_object( config, organism_metadata, sample_metadata, center_name, row ) + print(dataclass_to_xml(sample_set, root_name="SAMPLE_SET")) assert xmltodict.parse( dataclass_to_xml(sample_set, root_name="SAMPLE_SET") ) == xmltodict.parse(test_xml_request)