Skip to content

Commit

Permalink
Add project template xml
Browse files Browse the repository at this point in the history
  • Loading branch information
anna-parker committed Jul 16, 2024
1 parent b336daa commit cdf77ef
Show file tree
Hide file tree
Showing 5 changed files with 157 additions and 92 deletions.
22 changes: 22 additions & 0 deletions ena-submission/resources/project_template.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<PROJECT_SET>
<PROJECT center_name="{{ Institution }}" alias="{{ group_accession }}:{{ organism }}:{{ unique_id }}">
<NAME>{{ ncbi_virus_name }}</NAME>
<TITLE>{{ ncbi_virus_name }} Genome sequencing</TITLE>
<DESCRIPTION>Automated upload of {{ ncbi_virus_name }} sequences submitted by {{ Institution }} from {{ db }}.</DESCRIPTION>
<SUBMISSION_PROJECT>
<SEQUENCING_PROJECT/>
<ORGANISM>
<TAXON_ID>{{ taxon_id }}</TAXON_ID>
<SCIENTIFIC_NAME>{{ ncbi_virus_name }}</SCIENTIFIC_NAME>
</ORGANISM>
</SUBMISSION_PROJECT>
<PROJECT_LINKS>
<PROJECT_LINK>
<XREF_LINK>
<DB>{{ db }}</DB>
<ID>{{ group_accession }}</ID>
</XREF_LINK>
</PROJECT_LINK>
</PROJECT_LINKS>
</PROJECT>
</PROJECT_SET>
40 changes: 14 additions & 26 deletions ena-submission/scripts/call_loculus.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,7 @@ def make_request(
response = requests.get(url, headers=headers, params=params)
case HTTPMethod.POST:
if files:
headers.pop(
"Content-Type"
) # Remove content-type for multipart/form-data
headers.pop("Content-Type") # Remove content-type for multipart/form-data
response = requests.post(url, headers=headers, files=files, data=params)
else:
response = requests.post(
Expand Down Expand Up @@ -131,9 +129,7 @@ def submit_external_metadata(
pre_ndjson = [x.strip() for x in file.readlines()]
data = " ".join(pre_ndjson)

response = make_request(
HTTPMethod.POST, url, config, data=data, headers=headers, params=params
)
response = make_request(HTTPMethod.POST, url, config, data=data, headers=headers, params=params)

if not response.ok:
response.raise_for_status()
Expand Down Expand Up @@ -162,30 +158,24 @@ def get_released_data(
except jsonlines.Error as err:
response_summary = response.text
if len(response_summary) > 100:
response_summary = (
response_summary[:50] + "\n[..]\n" + response_summary[-50:]
)
response_summary = response_summary[:50] + "\n[..]\n" + response_summary[-50:]
logger.error(f"Error decoding JSON from /get-released-data: {response_summary}")
raise ValueError() from err

if remove_if_has_ena_specific_metadata:
data_dict: dict[str, Any] = {}
for item in entries:
fields = [
1 if item["metadata"][field] else 0
for field in config.ena_specific_metadata
]
if sum(fields) > 0:
print(
"Discarding entry as contains ENA-specific metadata already and should not be resubmitted"
)
else:
key = item["metadata"]["accessionVersion"]
data_dict[key] = item
if item["metadata"]["dataUseTerms"] != "OPEN":
print("Discarding entry as not OPEN for release")
continue
# fields = [1 if item["metadata"][field] else 0 for field in config.ena_specific_metadata]
# if sum(fields) > 0:
# print("Discarding entry as contains ENA-specific metadata already.")
# else:
key = item["metadata"]["accessionVersion"]
data_dict[key] = item
else:
data_dict: dict[str, Any] = {
rec["metadata"]["accessionVersion"]: rec for rec in entries
}
data_dict: dict[str, Any] = {rec["metadata"]["accessionVersion"]: rec for rec in entries}

return data_dict

Expand Down Expand Up @@ -254,9 +244,7 @@ def record_factory(*args, **kwargs):

with open(config_file) as file:
full_config = yaml.safe_load(file)
relevant_config = {
key: full_config.get(key, []) for key in Config.__annotations__
}
relevant_config = {key: full_config.get(key, []) for key in Config.__annotations__}
config = Config(**relevant_config)

logger.info(f"Config: {config}")
Expand Down
124 changes: 92 additions & 32 deletions ena-submission/scripts/create_project_xml.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,92 @@
# Hard-coded example values used to render a single ENA project XML document.
group_accession = "group1"
Institution = "bla"
db = "Pathoplexus"
organism = "ebola-zaire"
ncbi_virus_name = "Zaire ebolavirus"
taxon_id = 186538
unique_id = "pathoplexus.org"  # tbd

# The document is assembled one XML line at a time; the empty first and last
# entries reproduce the leading and trailing newline of the rendered text.
_project_xml_lines = [
    "",
    "<PROJECT_SET>",
    f'<PROJECT center_name="{Institution}" alias="{group_accession}:{organism}:{unique_id}">',
    f"<NAME>{ncbi_virus_name}</NAME>",
    f"<TITLE>{ncbi_virus_name} Genome sequencing</TITLE>",
    f"<DESCRIPTION>Automated upload of {ncbi_virus_name} sequences submitted by {Institution} from {db}.</DESCRIPTION>",
    "<SUBMISSION_PROJECT>",
    "<SEQUENCING_PROJECT/>",
    "<ORGANISM>",
    f"<TAXON_ID>{taxon_id}</TAXON_ID>",
    f"<SCIENTIFIC_NAME>{ncbi_virus_name}</SCIENTIFIC_NAME>",
    "</ORGANISM>",
    "</SUBMISSION_PROJECT>",
    "<PROJECT_LINKS>",
    "<PROJECT_LINK>",
    "<XREF_LINK>",
    f"<DB>{db}</DB>",
    f"<ID>{group_accession}</ID>",
    "</XREF_LINK>",
    "</PROJECT_LINK>",
    "</PROJECT_LINKS>",
    "</PROJECT>",
    "</PROJECT_SET>",
    "",
]
final_project_xml = "\n".join(_project_xml_lines)
import json
import logging
import re
import string
from dataclasses import dataclass, field
from typing import Dict, List
from xml.sax.saxutils import escape

import click
import yaml
from submission_db import get_db_config, in_submission_table


# Set up process-wide log formatting at import time, then create the logger
# that the rest of this module writes to.
logging.basicConfig(
    format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ",
    datefmt="%H:%M:%S",
    level=logging.INFO,
    encoding="utf-8",
)
logger = logging.getLogger(__name__)


@dataclass
class Config:
    """Settings picked out of the YAML config file for this script.

    The field names double as the keys looked up in the loaded YAML (the
    command iterates over ``Config.__annotations__``), so renaming a field
    changes which config key is read.

    ``password`` and ``db_password`` are left out of the generated ``repr``:
    the whole object is written to the log, and credentials must not end up
    there. Both fields are still required and readable as normal attributes.
    """

    # NOTE(review): annotated as a list of dicts; confirm this matches the
    # shape of the ``organisms`` section in the real config file.
    organisms: List[Dict[str, str]]
    organism: str
    backend_url: str
    keycloak_token_url: str
    keycloak_client_id: str
    username: str
    password: str = field(repr=False)  # secret: hidden from repr/log output
    ena_specific_metadata: List[str]
    db_username: str
    db_password: str = field(repr=False)  # secret: hidden from repr/log output
    db_host: str


def get_project_xml(
    group_accession="group1",
    Institution="bla",
    db="Pathoplexus",
    organism="ebola-zaire",
    ncbi_virus_name="Zaire ebolavirus",
    taxon_id=186538,
    unique_id="pathoplexus.org",
    template_path="resources/project_template.xml",
):
    """Render the project XML template with the given values.

    The template may mark placeholders either as ``{{ name }}`` or in
    ``string.Template`` syntax (``$name`` / ``${name}``). ``string.Template``
    only understands the latter, so ``{{ name }}`` markers are rewritten to
    ``${name}`` before substitution; without that step they would be returned
    unrendered.

    Args:
        group_accession: Value for the ``group_accession`` placeholder.
        Institution: Value for the ``Institution`` placeholder.
        db: Value for the ``db`` placeholder.
        organism: Value for the ``organism`` placeholder.
        ncbi_virus_name: Value for the ``ncbi_virus_name`` placeholder.
        taxon_id: Value for the ``taxon_id`` placeholder.
        unique_id: Value for the ``unique_id`` placeholder.
        template_path: Location of the template file; the default is relative
            to the current working directory.

    Returns:
        The rendered XML as a string. Placeholders with names not listed above
        are left in place (``safe_substitute`` semantics).

    Raises:
        OSError: If the template file cannot be opened.
    """
    with open(template_path, encoding="utf-8") as file:
        raw_template = file.read()

    # Rewrite ``{{ name }}`` to ``${name}`` so string.Template can see it.
    normalised = re.sub(
        r"\{\{\s*([_a-zA-Z][_a-zA-Z0-9]*)\s*\}\}", r"${\1}", raw_template
    )
    template = string.Template(normalised)

    values = {
        "group_accession": group_accession,
        "Institution": Institution,
        "db": db,
        "organism": organism,
        "ncbi_virus_name": ncbi_virus_name,
        "taxon_id": taxon_id,
        "unique_id": unique_id,
    }

    # Values land in both element text and double-quoted attributes, so escape
    # &, <, > and " to keep the document well-formed for any input.
    escaped_values = {
        key: escape(str(value), {'"': "&quot;"}) for key, value in values.items()
    }

    return template.safe_substitute(escaped_values)


@click.command()
@click.option(
    "--log-level",
    default="INFO",
    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]),
)
@click.option(
    "--config-file",
    required=True,
    type=click.Path(exists=True),
)
@click.option(
    "--input-file",
    required=True,
    type=click.Path(),
)
def create_project_xml(log_level, config_file, input_file):
    """Load the config, database settings and input JSON for project creation.

    Args:
        log_level: Level applied to this module's logger.
        config_file: Path to the YAML config; only keys named after ``Config``
            fields are used, and missing keys default to an empty list.
        input_file: Path to a UTF-8 JSON file with the sequences to upload.
    """
    logger.setLevel(log_level)
    logging.getLogger("requests").setLevel(logging.WARNING)

    with open(config_file) as file:
        full_config = yaml.safe_load(file)
    relevant_config = {key: full_config.get(key, []) for key in Config.__annotations__}
    config = Config(**relevant_config)
    logger.info(f"Config: {config}")

    # Not used further yet in this revision of the script.
    db_config = get_db_config(config.db_password, config.db_username, config.db_host)

    # Use a context manager so the input file handle is closed deterministically.
    with open(input_file, encoding="utf-8") as file:
        sequences_to_upload = json.load(file)


# Without this guard, running the file as a script never invokes the command.
if __name__ == "__main__":
    create_project_xml()
40 changes: 6 additions & 34 deletions ena-submission/scripts/get_ena_submission_list.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
from call_loculus import get_released_data
from submission_db import in_submission_table, DBConfig

import os
import json
import logging
from dataclasses import dataclass
from typing import List, Dict
from pathlib import Path
from typing import Dict, List

import click
import yaml
from call_loculus import get_released_data
from submission_db import get_db_config, in_submission_table

logger = logging.getLogger(__name__)
logging.basicConfig(
Expand All @@ -35,28 +33,6 @@ class Config:
db_host: str


def get_db_config(config: Config):
    """Build the database connection settings for the submission table.

    Each setting is taken from its environment variable (``DB_PASSWORD``,
    ``DB_USERNAME``, ``DB_HOST``) when that variable is set to a non-empty
    value, and from the matching ``config`` attribute otherwise.

    Args:
        config: Object providing ``db_password``, ``db_username`` and
            ``db_host`` fallbacks.

    Returns:
        A ``DBConfig`` built from the resolved username, password and host.
    """
    # ``or`` falls through on both None (unset) and "" (set but empty).
    resolved_password = os.getenv("DB_PASSWORD") or config.db_password
    resolved_username = os.getenv("DB_USERNAME") or config.db_username
    resolved_host = os.getenv("DB_HOST") or config.db_host

    return DBConfig(
        username=resolved_username,
        password=resolved_password,
        host=resolved_host,
    )


@click.command()
@click.option(
"--log-level",
Expand All @@ -79,23 +55,19 @@ def get_ena_submission_list(log_level, config_file, output_file):

with open(config_file) as file:
full_config = yaml.safe_load(file)
relevant_config = {
key: full_config.get(key, []) for key in Config.__annotations__
}
relevant_config = {key: full_config.get(key, []) for key in Config.__annotations__}
config = Config(**relevant_config)
logger.info(f"Config: {config}")

db_config = get_db_config(config)
db_config = get_db_config(config.db_password, config.db_username, config.db_host)

entries_to_submit = {}
for organism in config.organisms:
config.ena_specific_metadata = [
value["name"] for value in config.organisms[organism]["externalMetadata"]
]
logging.info(f"Getting released sequences for organism: {organism}")
entries = get_released_data(
config, organism, remove_if_has_ena_specific_metadata=True
)
entries = get_released_data(config, organism, remove_if_has_ena_specific_metadata=True)

for key, item in entries.items():
accession, version = key.split(".")
Expand Down
23 changes: 23 additions & 0 deletions ena-submission/scripts/submission_db.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import psycopg2
from enum import Enum
from dataclasses import dataclass
Expand All @@ -10,6 +11,28 @@ class DBConfig:
host: str


def get_db_config(db_password_default: str, db_username_default: str, db_host_default: str):
    """Build the database connection settings, preferring environment variables.

    Each setting is read from its environment variable (``DB_PASSWORD``,
    ``DB_USERNAME``, ``DB_HOST``); when the variable is unset or empty the
    corresponding ``*_default`` argument is used instead.

    Args:
        db_password_default: Fallback when ``DB_PASSWORD`` is unset or empty.
        db_username_default: Fallback when ``DB_USERNAME`` is unset or empty.
        db_host_default: Fallback when ``DB_HOST`` is unset or empty.

    Returns:
        A ``DBConfig`` built from the resolved username, password and host.
    """
    # ``or`` falls through on both None (unset) and "" (set but empty).
    resolved_password = os.getenv("DB_PASSWORD") or db_password_default
    resolved_username = os.getenv("DB_USERNAME") or db_username_default
    resolved_host = os.getenv("DB_HOST") or db_host_default

    return DBConfig(
        username=resolved_username,
        password=resolved_password,
        host=resolved_host,
    )


class StatusAll(Enum):
READY_TO_SUBMIT = 0
SUBMITTING_PROJECT = 1
Expand Down

0 comments on commit cdf77ef

Please sign in to comment.