From 8db4aab85b020643e06e84c59e1e9e380a1f866c Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Tue, 23 Jul 2024 15:40:31 +0200 Subject: [PATCH] Add create_sample updates as well --- ena-submission/scripts/create_sample.py | 49 ++++++++++++------- ena-submission/scripts/test_ena_submission.py | 5 +- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/ena-submission/scripts/create_sample.py b/ena-submission/scripts/create_sample.py index 963cb2348..5e25fa6a6 100644 --- a/ena-submission/scripts/create_sample.py +++ b/ena-submission/scripts/create_sample.py @@ -41,7 +41,7 @@ @dataclass class Config: organisms: List[Dict[str, str]] - metadata_map: Dict[str, Dict[str, str]] + metadata_mapping: Dict[str, Dict[str, str]] backend_url: str keycloak_token_url: str keycloak_client_id: str @@ -57,30 +57,36 @@ class Config: ena_submission_username: str -def construct_sample_set_object(config, organism_metadata, sample_metadata, center_name, row): +def get_sample_attributes(config, sample_metadata, row): list_sample_attributes = [] - for field in config.metadata_map: - loculus_metadata_field_names = config.metadata_map[field]["loculus_fields"] + for field in config.metadata_mapping: + loculus_metadata_field_names = config.metadata_mapping[field]["loculus_fields"] loculus_metadata_field_values = [ sample_metadata.get(metadata, None) for metadata in loculus_metadata_field_names ] - if "function" in config.metadata_map[field] and "args" in config.metadata_map[field]: - function = config.metadata_map[field]["function"] - args = config.metadata_map[field]["args"] - if function == "match" and (len(loculus_metadata_field_names) == len(args)): + if ( + "function" in config.metadata_mapping[field] + and "args" in config.metadata_mapping[field] + ): + function = config.metadata_mapping[field]["function"] + args = [i for i in config.metadata_mapping[field]["args"] if i] + full_field_values = [i for i in loculus_metadata_field_values if i] + if function != "match": + logging.warning( + f"Unknown function: {function} with args: {args} for {row["accession"]}" + ) + continue + if function == "match" and (len(full_field_values) == len(args)): value = True - for i in range(len(loculus_metadata_field_names)): + for i in range(len(full_field_values)): if not re.match( args[i], - sample_metadata.get(loculus_metadata_field_names[i], None), + full_field_values[i], re.IGNORECASE, ): value = False break else: - logging.warning( - f"Could not calculate function {function} with args: {args} for {row["accession"]}" - ) continue else: value = ";".join([metadata for metadata in loculus_metadata_field_values if metadata]) @@ -91,9 +97,16 @@ def construct_sample_set_object(config, organism_metadata, sample_metadata, cent value=value, ) ) + return list_sample_attributes + + +def construct_sample_set_object( + config, organism_metadata, sample_metadata, center_name, row, organism +): + list_sample_attributes = get_sample_attributes(config, sample_metadata, row) sample_type = SampleType( center_name=XmlAttribute(center_name), - alias=XmlAttribute(f"{row["accession"]}:{row["organism"]}:{config.unique_project_suffix}"), + alias=XmlAttribute(f"{row["accession"]}:{organism}:{config.unique_project_suffix}"), title=f"{organism_metadata["scientific_name"]}: Genome sequencing", description=( f"Automated upload of {organism_metadata["scientific_name"]} sequences submitted by {center_name} from {config.db_name}", @@ -135,12 +148,12 @@ def create_sample(log_level, config_file): while True: # Check submission_table for newly added sequences - conditions = {"status_all": StatusAll.READY_TO_SUBMIT.name} + conditions = {"status_all": StatusAll.SUBMITTED_PROJECT.name} ready_to_submit = find_conditions_in_db( db_config, table_name="submission_table", conditions=conditions ) logging.debug( - f"Found {len(ready_to_submit)} entries in submission_table in status READY_TO_SUBMIT" + f"Found {len(ready_to_submit)} entries in submission_table in status SUBMITTED_PROJECT" ) for row in ready_to_submit: seq_key = {"accession": row["accession"], "version": row["version"]} @@ -230,7 +243,7 @@ def create_sample(log_level, config_file): organism_metadata = config.organisms[organism]["ingest"] sample_set = construct_sample_set_object( - config, organism_metadata, sample_metadata, center_name, row + config, organism_metadata, sample_metadata, center_name, row, organism ) update_values = {"status": Status.SUBMITTING.name} number_rows_updated = update_db_where_conditions( @@ -297,7 +310,7 @@ def create_sample(log_level, config_file): f"Found {len(entries_with_errors)} entries in sample_table in status HAS_ERRORS", f" for {time_threshold}m", ) - for row in ready_to_submit_sample: + for row in entries_with_errors: # TODO: Query ENA to check if sample has in fact been created # If created update sample_table # If not retry 3 times, then raise for manual intervention diff --git a/ena-submission/scripts/test_ena_submission.py b/ena-submission/scripts/test_ena_submission.py index e7fada95a..686e73d69 100644 --- a/ena-submission/scripts/test_ena_submission.py +++ b/ena-submission/scripts/test_ena_submission.py @@ -195,7 +195,8 @@ def test_sample_set_construction(self): config.unique_project_suffix = "test suffix" row = {} row["accession"] = "test_accession" - row["organism"] = "test organism" + organism = "test organism" + row["organism"] = organism sample_metadata = { "authors": "I. Kurane, M. Saijo, Q. Tang, S. Morikawa, T. Qing, Z. Xinqin", "host_age": None, @@ -337,7 +338,7 @@ def test_sample_set_construction(self): "dataUseTermsUrl": "https://#TODO-MVP/open", } sample_set = construct_sample_set_object( - config, organism_metadata, sample_metadata, center_name, row + config, organism_metadata, sample_metadata, center_name, row, organism ) assert xmltodict.parse( dataclass_to_xml(sample_set, root_name="SAMPLE_SET")