From abb78356f05136380b95d9e12bd487d69e27d101 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 15 Jul 2024 11:31:41 +0200 Subject: [PATCH] Add suggestions --- ena-submission/ENA_submission.md | 2 +- ena-submission/scripts/get_ena_submission_list.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ena-submission/ENA_submission.md b/ena-submission/ENA_submission.md index 352551fef..293945e81 100644 --- a/ena-submission/ENA_submission.md +++ b/ena-submission/ENA_submission.md @@ -35,7 +35,7 @@ We require the following components: - Analysis: An analysis contains secondary analysis results derived from sequence reads (e.g. a genome assembly). -At the time of writing (October 2023), in contrast to ENA, Pathoplexus has no hierarchy of study/sample/sequence: every sequence is its own study and sample. Thus, each sequence will have to be submitted to ENA as its own study and sample. Alternatively, each submitter could have exactly _one_ study pre organism (this is the approach we are currently taking). +At the time of writing (October 2023), in contrast to ENA, Pathoplexus has no hierarchy of study/sample/sequence: every sequence is its own study and sample. Therefore, we need to figure out how to map sequences to projects: each submitter could have exactly _one_ study per organism (this is the approach we are currently taking), or each sequence could be associated with its own study. 
### Mapping sequences and studies diff --git a/ena-submission/scripts/get_ena_submission_list.py b/ena-submission/scripts/get_ena_submission_list.py index 27e0de27c..85868227b 100644 --- a/ena-submission/scripts/get_ena_submission_list.py +++ b/ena-submission/scripts/get_ena_submission_list.py @@ -74,14 +74,16 @@ def get_db_config(config: Config): type=click.Path(), ) def get_ena_submission_list(log_level, config_file, output_file): + """ + Get a list of all sequences in state APPROVED_FOR_RELEASE without insdc-specific + metadata fields and not already in the ena_submission.submission_table. + """ logger.setLevel(log_level) logging.getLogger("requests").setLevel(logging.WARNING) with open(config_file) as file: full_config = yaml.safe_load(file) - relevant_config = { - key: full_config.get(key, []) for key in Config.__annotations__ - } + relevant_config = {key: full_config.get(key, []) for key in Config.__annotations__} config = Config(**relevant_config) logger.info(f"Config: {config}") @@ -93,9 +95,7 @@ def get_ena_submission_list(log_level, config_file, output_file): value["name"] for value in config.organisms[organism]["externalMetadata"] ] logging.info(f"Getting released sequences for organism: {organism}") - entries = get_released_data( - config, organism, remove_if_has_ena_specific_metadata=True - ) + entries = get_released_data(config, organism, remove_if_has_ena_specific_metadata=True) for key, item in entries.items(): accession, version = key.split(".")