diff --git a/ena-submission/.gitignore b/ena-submission/.gitignore index ecb73e18f..47734c592 100644 --- a/ena-submission/.gitignore +++ b/ena-submission/.gitignore @@ -1,3 +1,7 @@ .snakemake/ results/ -__pycache__ \ No newline at end of file +assembly/ +project/ +sample/ +__pycache__ +config/config.yaml \ No newline at end of file diff --git a/ena-submission/config/config.yaml b/ena-submission/config/config.yaml deleted file mode 100644 index 2d783b972..000000000 --- a/ena-submission/config/config.yaml +++ /dev/null @@ -1,162 +0,0 @@ -backend_url: http://localhost:8079/ -keycloak_token_url: http://localhost:8083/realms/loculus/protocol/openid-connect/token -db_username: postgres -db_password: unsecure -db_url: "jdbc:postgresql://127.0.0.1:5432/loculus" -organisms: - cchf: - ingest: - nextclade_dataset_name: nextstrain/cchfv/linked - nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/cornelius-cchfv/data_output - nucleotide_sequences: - - L - - M - - S - taxon_id: 3052518 - scientific_name: "Orthonairovirus haemorrhagiae" - organismName: "Crimean-Congo Hemorrhagic Fever Virus" - externalMetadata: - - externalMetadataUpdater: ena - name: ncbiReleaseDate - type: date - - externalMetadataUpdater: ena - name: ncbiUpdateDate_L - type: date - - externalMetadataUpdater: ena - name: ncbiUpdateDate_M - type: date - - externalMetadataUpdater: ena - name: ncbiUpdateDate_S - type: date - - externalMetadataUpdater: ena - name: ncbiSubmitterCountry - type: string - - externalMetadataUpdater: ena - name: insdcAccessionBase_L - type: string - - externalMetadataUpdater: ena - name: insdcAccessionBase_M - type: string - - externalMetadataUpdater: ena - name: insdcAccessionBase_S - type: string - - externalMetadataUpdater: ena - name: insdcVersion_L - type: int - - externalMetadataUpdater: ena - name: insdcVersion_M - type: int - - externalMetadataUpdater: ena - name: insdcVersion_S - type: int - - externalMetadataUpdater: ena - name: insdcAccessionFull_L - type: string - - externalMetadataUpdater: ena - name: insdcAccessionFull_M - type: string - - externalMetadataUpdater: ena - name: insdcAccessionFull_S - type: string - - externalMetadataUpdater: ena - name: bioprojectAccession - type: string - - externalMetadataUpdater: ena - name: biosampleAccession - type: string - - externalMetadataUpdater: ena - name: ncbiSourceDb - type: string - - externalMetadataUpdater: ena - name: ncbiVirusName - type: string - - externalMetadataUpdater: ena - name: ncbiVirusTaxId - type: int - - externalMetadataUpdater: ena - name: sraRunAccession - type: string - ebola-zaire: - ingest: - taxon_id: 186538 - scientific_name: "Orthoebolavirus zairense" - organismName: "Ebola Zaire" - externalMetadata: - - externalMetadataUpdater: ena - name: ncbiReleaseDate - type: date - - externalMetadataUpdater: ena - name: ncbiUpdateDate - type: date - - externalMetadataUpdater: ena - name: ncbiSubmitterCountry - type: string - - externalMetadataUpdater: ena - name: insdcAccessionBase - type: string - - externalMetadataUpdater: ena - name: insdcVersion - type: int - - externalMetadataUpdater: ena - name: insdcAccessionFull - type: string - - externalMetadataUpdater: ena - name: bioprojectAccession - type: string - - externalMetadataUpdater: ena - name: biosampleAccession - type: string - - externalMetadataUpdater: ena - name: ncbiSourceDb - type: string - - externalMetadataUpdater: ena - name: ncbiVirusName - type: string - - externalMetadataUpdater: ena - name: ncbiVirusTaxId - type: int - - externalMetadataUpdater: ena - name: sraRunAccession - type: string - west-nile: - ingest: - scientific_name: "West Nile virus" - taxon_id: 3048448 - organismName: "West Nile Virus" - externalMetadata: - - externalMetadataUpdater: ena - name: ncbiReleaseDate - type: date - - externalMetadataUpdater: ena - name: ncbiUpdateDate - type: date - - externalMetadataUpdater: ena - name: ncbiSubmitterCountry - type: string - - externalMetadataUpdater: ena - name: insdcAccessionBase - type: string - - externalMetadataUpdater: ena - name: insdcVersion - type: int - - externalMetadataUpdater: ena - name: insdcAccessionFull - type: string - - externalMetadataUpdater: ena - name: bioprojectAccession - type: string - - externalMetadataUpdater: ena - name: biosampleAccession - type: string - - externalMetadataUpdater: ena - name: ncbiSourceDb - type: string - - externalMetadataUpdater: ena - name: ncbiVirusName - type: string - - externalMetadataUpdater: ena - name: ncbiVirusTaxId - type: int - - externalMetadataUpdater: ena - name: sraRunAccession - type: string diff --git a/ena-submission/scripts/create_assembly.py b/ena-submission/scripts/create_assembly.py index 33de3278b..41718c4dc 100644 --- a/ena-submission/scripts/create_assembly.py +++ b/ena-submission/scripts/create_assembly.py @@ -7,6 +7,7 @@ import click import pytz import yaml +from call_loculus import get_group_info from ena_submission_helper import ( CreationResult, create_chromosome_list, @@ -67,6 +68,7 @@ class Config: slack_hook: str slack_token: str slack_channel_id: str + is_broker: bool def create_chromosome_list_object( @@ -133,15 +135,26 @@ def create_manifest_object( sample_accession = sample_table_entry["result"]["ena_sample_accession"] study_accession = project_table_entry["result"]["bioproject_accession"] + address_string = project_table_entry["center_name"] + if config.is_broker: + try: + group_info = get_group_info(config, project_table_entry["group_id"])[0]["group"] + address = group_info["address"] + address_string = (f'{address.get("line1", "")}, {address.get("line2", "")}, ' + f'{address.get("city", "")}, {address.get("state", "")}, ' + f'{address.get("postalCode", "")}, {address.get("country")}') + except Exception as e: + logger.error(f"Was unable to create address, setting address to center_name due to {e}") + metadata = submission_table_entry["metadata"] unaligned_nucleotide_sequences = submission_table_entry["unaligned_nucleotide_sequences"] - organism_metadata = config.organisms[group_key["organism"]]["ingest"] + organism_metadata = config.organisms[group_key["organism"]]["enaDeposition"] chromosome_list_object = create_chromosome_list_object(unaligned_nucleotide_sequences, seq_key) chromosome_list_file = create_chromosome_list(list_object=chromosome_list_object, dir=dir) authors = ( metadata["authors"] if metadata.get("authors") else metadata.get("submitter", "Unknown") ) - collection_date = metadata.get("collectionDate", "Unknown") + collection_date = metadata.get("sampleCollectionDate", "Unknown") country = metadata.get("geoLocCountry", "Unknown") admin1 = metadata.get("geoLocAdmin1", "") admin2 = metadata.get("geoLocAdmin2", "") @@ -203,6 +216,8 @@ def create_manifest_object( chromosome_list=chromosome_list_file, description=description, moleculetype=moleculetype, + authors=authors, + address=address_string, ) @@ -365,7 +380,7 @@ def assembly_table_create( group_key, test, ) - manifest_file = create_manifest(manifest_object) + manifest_file = create_manifest(manifest_object, is_broker=config.is_broker) except Exception as e: logger.error( f"Manifest creation failed for accession {row["accession"]} with error {e}" diff --git a/ena-submission/scripts/create_project.py b/ena-submission/scripts/create_project.py index 437d037cd..f455158eb 100644 --- a/ena-submission/scripts/create_project.py +++ b/ena-submission/scripts/create_project.py @@ -73,13 +73,13 @@ def construct_project_set_object( Construct project set object, using: - entry in project_table - group_info of corresponding group_id - - config information, such as ingest metadata for that organism + - config information, such as enaDeposition metadata for that organism If test=True add a timestamp to the alias suffix to allow for multiple submissions of the same project for testing. (ENA blocks multiple submissions with the same alias) """ - metadata_dict = config.organisms[entry["organism"]]["ingest"] + metadata_dict = config.organisms[entry["organism"]]["enaDeposition"] if test: alias = XmlAttribute( f"{entry["group_id"]}:{entry["organism"]}:{config.unique_project_suffix}:{datetime.now(tz=pytz.utc)}" diff --git a/ena-submission/scripts/create_sample.py b/ena-submission/scripts/create_sample.py index c0fec4844..c3c361d8f 100644 --- a/ena-submission/scripts/create_sample.py +++ b/ena-submission/scripts/create_sample.py @@ -132,7 +132,7 @@ def construct_sample_set_object( Construct sample set object, using: - entry in sample_table - sample_data_in_submission_table: corresponding entry in submission_table - - config information, such as ingest metadata for that organism + - config information, such as enaDeposition metadata for that organism If test=True add a timestamp to the alias suffix to allow for multiple submissions of the same project for testing. (ENA blocks multiple submissions with the same alias) @@ -140,7 +140,7 @@ def construct_sample_set_object( sample_metadata = sample_data_in_submission_table["metadata"] center_name = sample_data_in_submission_table["center_name"] organism = sample_data_in_submission_table["organism"] - organism_metadata = config.organisms[organism]["ingest"] + organism_metadata = config.organisms[organism]["enaDeposition"] if test: alias = XmlAttribute( f"{entry["accession"]}:{organism}:{config.unique_project_suffix}:{datetime.now(tz=pytz.utc)}" diff --git a/ena-submission/scripts/deposition_dry_run.py b/ena-submission/scripts/deposition_dry_run.py index 3822d734f..bb3292ccc 100644 --- a/ena-submission/scripts/deposition_dry_run.py +++ b/ena-submission/scripts/deposition_dry_run.py @@ -35,6 +35,7 @@ class Config: metadata_mapping_mandatory_field_defaults: dict[str, str] ena_checklist: str use_ena_checklist: bool + is_broker: bool @click.command() @@ -158,7 +159,7 @@ def local_ena_submission_generator( manifest_object = create_manifest_object( config, dummy_sample_dict, dummy_project_dict, entry, entry, entry, dir=directory ) - create_manifest(manifest_object, dir=directory) + create_manifest(manifest_object, is_broker=config.is_broker, dir=directory) logger.info( "You can submit the assembly to ENA using the command: \n" "java -jarwebin-cli.jar -username {ena_submission_username} " diff --git a/ena-submission/scripts/ena_submission_helper.py b/ena-submission/scripts/ena_submission_helper.py index b0a3ebae7..e09d90663 100644 --- a/ena-submission/scripts/ena_submission_helper.py +++ b/ena-submission/scripts/ena_submission_helper.py @@ -378,7 +378,9 @@ def create_fasta( return filename -def create_manifest(manifest: AssemblyManifest, dir: str | None = None) -> str: +def create_manifest( + manifest: AssemblyManifest, is_broker: bool = False, dir: str | None = None +) -> str: """ Creates a temp manifest file: https://ena-docs.readthedocs.io/en/latest/submit/assembly/genome.html#manifest-files @@ -410,6 +412,16 @@ def create_manifest(manifest: AssemblyManifest, dir: str | None = None) -> str: f.write(f"DESCRIPTION\t{manifest.description}\n") if manifest.moleculetype: f.write(f"MOLECULETYPE\t{manifest.moleculetype!s}\n") + if manifest.authors: + if not is_broker: + logger.error("Cannot set authors field for non broker") + else: + f.write(f"AUTHORS\t{manifest.authors}\n") + if manifest.address: + if not is_broker: + logger.error("Cannot set address field for non broker") + else: + f.write(f"ADDRESS\t{manifest.address}\n") return filename diff --git a/ena-submission/scripts/ena_types.py b/ena-submission/scripts/ena_types.py index fbb163d99..f41a6a376 100644 --- a/ena-submission/scripts/ena_types.py +++ b/ena-submission/scripts/ena_types.py @@ -188,6 +188,8 @@ class AssemblyManifest: moleculetype: MoleculeType | None = None description: str | None = None run_ref: list[str] | None = None + address: str | None = None + authors: str | None = None class ChromosomeType(Enum): diff --git a/ena-submission/scripts/test_ena_submission.py b/ena-submission/scripts/test_ena_submission.py index 1bee66a27..9e2c1e211 100644 --- a/ena-submission/scripts/test_ena_submission.py +++ b/ena-submission/scripts/test_ena_submission.py @@ -48,7 +48,7 @@ def mock_config(): "scientific_name": "Test scientific name", "molecule_type": "genomic RNA", } - config.organisms = {"Test organism": {"ingest": metadata_dict}} + config.organisms = {"Test organism": {"enaDeposition": metadata_dict}} config.metadata_mapping = defaults["metadata_mapping"] config.metadata_mapping_mandatory_field_defaults = defaults[ "metadata_mapping_mandatory_field_defaults" @@ -240,7 +240,10 @@ def test_create_manifest(self): study_accession = "Test Study Accession" sample_accession = "Test Sample Accession" results_in_sample_table = {"result": {"ena_sample_accession": sample_accession}} - results_in_project_table = {"result": {"bioproject_accession": study_accession}} + results_in_project_table = { + "result": {"bioproject_accession": study_accession}, + "center_name": "generic_center_name", + } manifest = create_manifest_object( config, results_in_sample_table, diff --git a/kubernetes/loculus/templates/_common-metadata.tpl b/kubernetes/loculus/templates/_common-metadata.tpl index 0b1ff517b..46f533626 100644 --- a/kubernetes/loculus/templates/_common-metadata.tpl +++ b/kubernetes/loculus/templates/_common-metadata.tpl @@ -370,12 +370,12 @@ organisms: {{ $key }}: {{- with $instance.schema }} {{- $nucleotideSequences := .nucleotideSequences | default (list "main")}} - ingest: {{- $instance.ingest.configFile | toYaml | nindent 8 }} + enaDeposition: {{- $instance.enaDeposition.configFile | toYaml | nindent 6 }} organismName: {{ quote .organismName }} externalMetadata: {{- $args := dict "metadata" (include "loculus.patchMetadataSchema" . | fromYaml).metadata "nucleotideSequences" $nucleotideSequences}} {{- $metadata := include "loculus.generateBackendExternalMetadata" $args | fromYaml }} - {{- $metadata.fields | default list | toYaml | nindent 8 }} + {{- $metadata.fields | default list | toYaml | nindent 6 }} {{- end }} {{- end }} {{- end }} diff --git a/kubernetes/loculus/templates/ena-submission-config.yaml b/kubernetes/loculus/templates/ena-submission-config.yaml index b93560229..a779c4058 100644 --- a/kubernetes/loculus/templates/ena-submission-config.yaml +++ b/kubernetes/loculus/templates/ena-submission-config.yaml @@ -4,6 +4,7 @@ {{- $submitToEnaProduction := .Values.submitToEnaProduction | default false }} {{- $enaDbName := .Values.enaDbName | default false }} {{- $enaUniqueSuffix := .Values.enaUniqueSuffix | default false }} +{{- $enaIsBroker := .Values.enaIsBroker | default false }} --- apiVersion: v1 kind: ConfigMap @@ -13,6 +14,7 @@ data: config.yaml: | submit_to_ena_prod: {{ $submitToEnaProduction }} db_name: {{ $enaDbName }} + is_broker: {{ $enaIsBroker }} unique_project_suffix: {{ $enaUniqueSuffix }} backend_url: {{ $backendHost }} keycloak_token_url: {{ $keycloakHost -}}/realms/loculus/protocol/openid-connect/token diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index ae4a0983c..fcdeb27b0 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -1087,6 +1087,9 @@ defaultOrganismConfig: &defaultOrganismConfig ingest: &ingest image: ghcr.io/loculus-project/ingest configFile: &ingestConfigFile + taxon_id: 3052462 + enaDeposition: + configFile: taxon_id: 186538 scientific_name: "Zaire ebolavirus" molecule_type: "genomic RNA" @@ -1173,6 +1176,9 @@ defaultOrganisms: <<: *ingest configFile: taxon_id: 3048448 + enaDeposition: + configFile: + taxon_id: 11082 scientific_name: "West Nile virus" molecule_type: "genomic RNA" referenceGenomes: @@ -1402,14 +1408,17 @@ defaultOrganisms: configFile: <<: *ingestConfigFile taxon_id: 3052518 - scientific_name: "Orthonairovirus haemorrhagiae" - molecule_type: "genomic RNA" nucleotide_sequences: - L - M - S nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/cornelius-cchfv/data_output nextclade_dataset_name: nextstrain/cchfv/linked + enaDeposition: + configFile: + taxon_id: 3052518 + scientific_name: "Orthonairovirus haemorrhagiae" + molecule_type: "genomic RNA" referenceGenomes: nucleotideSequences: - name: L @@ -1513,6 +1522,7 @@ registrationTermsMessage: > submitToEnaProduction: false enaDbName: Loculus enaUniqueSuffix: Loculus +enaIsBroker: False subdomainSeparator: "-" replicas: website: 1