Skip to content

Commit

Permalink
Use correct metadata map from PHA4GE.
Browse files: browse the repository at this point in the history
  • Loading branch information
anna-parker committed Jul 23, 2024
1 parent 1b95382 commit 8b84d7a
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 21 deletions.
43 changes: 43 additions & 0 deletions ena-submission/config/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,46 @@ unique_project_suffix: Loculus
ena_submission_username: fake-user
ena_submission_password: fake-password
ena_submission_url: https://wwwdev.ebi.ac.uk/ena/submit/drop-box/submit
metadata_mapping:
'subject exposure':
loculus_fields: [exposure_event]
'type exposure':
loculus_fields: [exposure_event]
hospitalisation:
loculus_fields: [host_health_state]
function: match
args: [Hospital]
'illness symptoms':
loculus_fields: [signs_and_symptoms]
'collection date':
loculus_fields: [sample_collection_date]
'geographic location (country and/or sea)':
loculus_fields: [geo_loc_country]
'geographic location (region and locality)':
loculus_fields: [geo_loc_admin_1]
'sample capture status':
loculus_fields: [purpose_of_sampling]
'host disease outcome':
loculus_fields: [host_health_outcome]
'host common name':
loculus_fields: [host_name_common]
'host age':
loculus_fields: [host_age]
'host health state':
loculus_fields: [host_health_state]
'host sex':
loculus_fields: [host_gender]
'host scientific name':
loculus_fields: [host_name_scientific]
'isolate':
loculus_fields: [specimen_collector_sample_id]
'collecting institution':
loculus_fields: [sequenced_by_organization, author_affiliations]
'receipt date':
loculus_fields: [received date]
'isolation source host-associated':
loculus_fields: [anatomical material, anatomical part, body product]
'isolation source non-host-associated':
loculus_fields: [environmental site, environmental material]
'authors':
loculus_fields: [authors]
30 changes: 27 additions & 3 deletions ena-submission/scripts/create_sample.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import logging
import re
from dataclasses import dataclass
from typing import Dict, List

Expand Down Expand Up @@ -59,12 +60,35 @@ class Config:
def construct_sample_set_object(config, organism_metadata, sample_metadata, center_name, row):
list_sample_attributes = []
for field in config.metadata_map:
loculus_metadata_field = config.metadata_map[field]
if sample_metadata.get(loculus_metadata_field, None):
loculus_metadata_field_names = config.metadata_map[field]["loculus_fields"]
loculus_metadata_field_values = [
sample_metadata.get(metadata, None) for metadata in loculus_metadata_field_names
]
if "function" in config.metadata_map[field] and "args" in config.metadata_map[field]:
function = config.metadata_map[field]["function"]
args = config.metadata_map[field]["args"]
if function == "match" and (len(loculus_metadata_field_names) == len(args)):
value = True
for i in range(len(loculus_metadata_field_names)):
if not re.match(
args[i],
sample_metadata.get(loculus_metadata_field_names[i], None),
re.IGNORECASE,
):
value = False
break
else:
logging.warning(
f"Could not calculate function {function} with args: {args} for {row["accession"]}"
)
continue
else:
value = ";".join([metadata for metadata in loculus_metadata_field_values if metadata])
if value:
list_sample_attributes.append(
SampleAttribute(
tag=field,
value=sample_metadata[loculus_metadata_field],
value=value,
)
)
sample_type = SampleType(
Expand Down
69 changes: 51 additions & 18 deletions ena-submission/scripts/test_ena_submission.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import unittest
import xmltodict
from unittest import mock

import xmltodict
from create_sample import construct_sample_set_object
from ena_submission_helper import (
SubmissionConfig,
Expand Down Expand Up @@ -54,18 +54,34 @@
</SAMPLE_LINK>
</SAMPLE_LINKS>
<SAMPLE_ATTRIBUTES>
<SAMPLE_ATTRIBUTE>
<TAG>hospitalisation</TAG>
<VALUE>true</VALUE>
</SAMPLE_ATTRIBUTE>
<SAMPLE_ATTRIBUTE>
<TAG>geographic location (country and/or sea)</TAG>
<VALUE>China</VALUE>
</SAMPLE_ATTRIBUTE>
<SAMPLE_ATTRIBUTE>
<TAG>collector name</TAG>
<VALUE>I. Kurane, M. Saijo, Q. Tang, S. Morikawa, T. Qing, Z. Xinqin</VALUE>
<TAG>geographic location (region and locality)</TAG>
<VALUE>Xinjiang province</VALUE>
</SAMPLE_ATTRIBUTE>
<SAMPLE_ATTRIBUTE>
<TAG>host health state</TAG>
<VALUE>Hospital care required</VALUE>
</SAMPLE_ATTRIBUTE>
<SAMPLE_ATTRIBUTE>
<TAG>isolate</TAG>
<VALUE>66019</VALUE>
</SAMPLE_ATTRIBUTE>
<SAMPLE_ATTRIBUTE>
<TAG>collecting institution</TAG>
<VALUE>Special Pathogens Laboratory; 4-7-1 Gakuen, Musashimurayama, Tokyo 208-0011</VALUE>
</SAMPLE_ATTRIBUTE>
<SAMPLE_ATTRIBUTE>
<TAG>authors</TAG>
<VALUE>I. Kurane, M. Saijo, Q. Tang, S. Morikawa, T. Qing, Z. Xinqin</VALUE>
</SAMPLE_ATTRIBUTE>
</SAMPLE_ATTRIBUTES>
</SAMPLE>
</SAMPLE_SET>
Expand Down Expand Up @@ -144,20 +160,36 @@ def test_sample_set_construction(self):
center_name = "Fake center name"
config = mock.Mock()
config.metadata_map = {
"subject exposure": "exposure_event",
"illness symptoms": "signs_and_symptoms",
"collection date": "sample_collection_date",
"geographic location (country and/or sea)": "geo_loc_country",
"host disease outcome": "host_health_outcome",
"host common name": "host_name_common",
"host age": "host_age",
"host sex": "host_gender",
"lab_host": "is_lab_host",
"host scientific name": "host_name_scientific",
"collector name": "authors",
"receipt date": "sample_received_date",
"isolate": "specimen_collector_sample_id",
"host behaviour": "host_role",
"subject exposure": {"loculus_fields": ["exposure_event"]},
"type exposure": {"loculus_fields": ["exposure_event"]},
"hospitalisation": {
"loculus_fields": ["host_health_state"],
"function": "match",
"args": ["Hospital"],
},
"illness symptoms": {"loculus_fields": ["signs_and_symptoms"]},
"collection date": {"loculus_fields": ["sample_collection_date"]},
"geographic location (country and/or sea)": {"loculus_fields": ["geo_loc_country"]},
"geographic location (region and locality)": {"loculus_fields": ["geo_loc_admin_1"]},
"sample capture status": {"loculus_fields": ["purpose_of_sampling"]},
"host disease outcome": {"loculus_fields": ["host_health_outcome"]},
"host common name": {"loculus_fields": ["host_name_common"]},
"host age": {"loculus_fields": ["host_age"]},
"host health state": {"loculus_fields": ["host_health_state"]},
"host sex": {"loculus_fields": ["host_gender"]},
"host scientific name": {"loculus_fields": ["host_name_scientific"]},
"isolate": {"loculus_fields": ["specimen_collector_sample_id"]},
"collecting institution": {
"loculus_fields": ["sequenced_by_organization", "author_affiliations"]
},
"receipt date": {"loculus_fields": ["received date"]},
"isolation source host-associated": {
"loculus_fields": ["anatomical material", "anatomical part", "body product"]
},
"isolation source non-host-associated": {
"loculus_fields": ["environmental site", "environmental material"]
},
"authors": {"loculus_fields": ["authors"]},
}
config.db_name = "Loculus"
config.unique_project_suffix = "test suffix"
Expand Down Expand Up @@ -209,7 +241,7 @@ def test_sample_set_construction(self):
"collection_device": None,
"collection_method": None,
"depth_of_coverage": None,
"host_health_state": None,
"host_health_state": "Hospital care required",
"ncbi_release_date": "2002-02-07",
"sra_run_accession": None,
"environmental_site": None,
Expand Down Expand Up @@ -307,6 +339,7 @@ def test_sample_set_construction(self):
sample_set = construct_sample_set_object(
config, organism_metadata, sample_metadata, center_name, row
)
print(dataclass_to_xml(sample_set, root_name="SAMPLE_SET"))
assert xmltodict.parse(
dataclass_to_xml(sample_set, root_name="SAMPLE_SET")
) == xmltodict.parse(test_xml_request)
Expand Down

0 comments on commit 8b84d7a

Please sign in to comment.