diff --git a/.github/workflows/ingest.yml b/.github/workflows/ingest.yml
new file mode 100644
index 000000000..c20940853
--- /dev/null
+++ b/.github/workflows/ingest.yml
@@ -0,0 +1,83 @@
+name: ingest
+on:
+  push:
+  workflow_dispatch:
+    inputs:
+      build_arm:
+        type: boolean
+        description: "Build for ARM as well"
+        default: false
+        required: false
+
+env:
+  DOCKER_IMAGE_NAME: ghcr.io/loculus-project/ingest
+  # NOTE: env values are strings; any non-empty string (even 'false') is truthy in ${{ }} expressions, so always compare with == 'true' below.
+  BUILD_ARM: ${{ github.ref == 'refs/heads/main' || github.event.inputs.build_arm }}
+
+concurrency:
+  group: ci-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}-ingest
+  cancel-in-progress: true
+
+jobs:
+  dockerImage:
+    name: Build ingest Docker Image # Don't change: Referenced by .github/workflows/update-argocd-metadata.yml
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    permissions:
+      contents: read
+      packages: write
+      checks: read
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Generate files hash
+        id: files-hash
+        run: |
+          DIR_HASH=$(echo -n ${{ hashFiles('ingest/**', '.github/workflows/ingest.yml') }})
+          echo "DIR_HASH=$DIR_HASH${{ env.BUILD_ARM == 'true' && '-arm' || '' }}" >> $GITHUB_ENV
+
+      - name: Setup Docker metadata
+        id: dockerMetadata
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.DOCKER_IMAGE_NAME }}
+          tags: |
+            type=raw,value=${{ env.DIR_HASH }}
+            type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' }}
+            type=ref,event=branch
+            type=sha,prefix=commit-
+
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Check if image exists
+        id: check-image
+        run: |
+          EXISTS=$(docker manifest inspect ${{ env.DOCKER_IMAGE_NAME }}:${{ env.DIR_HASH }} > /dev/null 2>&1 && echo "true" || echo "false")
+          echo "CACHE_HIT=$EXISTS" >> $GITHUB_ENV
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build and push image if input files changed
+        if: env.CACHE_HIT == 'false'
+        uses: docker/build-push-action@v5
+        with:
+          context: ./ingest
+          push: true
+          tags: ${{ steps.dockerMetadata.outputs.tags }}
+          cache-from: type=gha,scope=ingest-${{ github.ref }}
+          cache-to: type=gha,mode=max,scope=ingest-${{ github.ref }}
+          platforms: ${{ env.BUILD_ARM == 'true' && 'linux/amd64,linux/arm64' || 'linux/amd64' }}
+
+      - name: Retag and push existing image if cache hit
+        if: env.CACHE_HIT == 'true'
+        run: |
+          TAGS=(${{ steps.dockerMetadata.outputs.tags }})
+          for TAG in "${TAGS[@]}"; do
+            docker buildx imagetools create --tag $TAG ${{ env.DOCKER_IMAGE_NAME }}:${{ env.DIR_HASH }}
+          done
diff --git a/ingest/.dockerignore b/ingest/.dockerignore
new file mode 100644
index 000000000..7f57bcabc
--- /dev/null
+++ b/ingest/.dockerignore
@@ -0,0 +1,8 @@
+.snakemake/
+.git/
+data/
+results/
+result/
+.DS_Store
+.ruff_cache
+config/config.yaml
\ No newline at end of file
diff --git a/ingest/.gitignore b/ingest/.gitignore
new file mode 100644
index 000000000..55edfdf82
--- /dev/null
+++ b/ingest/.gitignore
@@ -0,0 +1,5 @@
+.snakemake/
+data/
+results/
+.DS_Store
+.ruff_cache
\ No newline at end of file
diff --git a/ingest/.mambarc b/ingest/.mambarc
new file mode 100644
index 000000000..8809fe054
--- /dev/null
+++ b/ingest/.mambarc
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+repodata_use_zst: true
+channel_priority: strict
+download_threads: 20
\ No newline at end of file
diff --git a/ingest/Dockerfile b/ingest/Dockerfile
new file mode 100644
index 000000000..1b61b83e1
--- /dev/null
+++ b/ingest/Dockerfile
@@ -0,0 +1,15 @@
+FROM mambaorg/micromamba:1.5.7
+
+COPY --chown=$MAMBA_USER:$MAMBA_USER environment.yml /tmp/env.yaml
+COPY --chown=$MAMBA_USER:$MAMBA_USER .mambarc /tmp/.mambarc
+
+RUN micromamba config set extract_threads 1 \
+    && micromamba install -y -n base -f /tmp/env.yaml --rc-file /tmp/.mambarc \
+    && micromamba clean --all --yes
+
+# Set the environment variable to activate the conda environment
+ARG MAMBA_DOCKERFILE_ACTIVATE=1
+
+COPY
--chown=$MAMBA_USER:$MAMBA_USER . /package + +WORKDIR /package \ No newline at end of file diff --git a/ingest/README.md b/ingest/README.md new file mode 100644 index 000000000..e78bd8974 --- /dev/null +++ b/ingest/README.md @@ -0,0 +1,12 @@ +# Pipeline to ingest data from INSDC into loculus + +## Overview + +1. Download data from INSDC +2. Filtering +3. Turn into FASTA/Metadata +4. Upload to loculus + +## Deployment + +Pipeline shall be put in a docker container that takes a config file as input diff --git a/ingest/Snakefile b/ingest/Snakefile new file mode 100644 index 000000000..0bec2fd87 --- /dev/null +++ b/ingest/Snakefile @@ -0,0 +1,124 @@ +TAXON_ID = config["taxon_id"] +ALL_FIELDS = ",".join(config["all_fields"]) +COLUMN_MAPPING = config["column_mapping"] +LOG_LEVEL = config.get("log_level", "INFO") + + +def rename_columns(input_file, output_file): + with open(input_file, "r") as f: + header = f.readline().strip().split("\t") + header = [COLUMN_MAPPING.get(h, h) for h in header] + with open(output_file, "w") as g: + g.write("\t".join(header) + "\n") + for line in f: + g.write(line) + + +rule all: + input: + "data/sequences.fasta", + "data/metadata.tsv", + + +rule fetch_ncbi_dataset_package: + output: + dataset_package="results/ncbi_dataset.zip", + retries: 5 + shell: + """ + datasets download virus genome taxon {TAXON_ID} \ + --no-progressbar \ + --filename {output.dataset_package} + """ + + +rule extract_ncbi_dataset_sequences: + input: + dataset_package="results/ncbi_dataset.zip", + output: + ncbi_dataset_sequences="results/sequences.fasta", + shell: + """ + unzip -jp {input.dataset_package} \ + ncbi_dataset/data/genomic.fna \ + | seqkit seq -i -w0 \ + > {output.ncbi_dataset_sequences} + """ + + +rule format_ncbi_dataset_report: + input: + dataset_package="results/ncbi_dataset.zip", + output: + ncbi_dataset_tsv="results/metadata_post_extract.tsv", + params: + fields_to_include=ALL_FIELDS, + shell: + """ + dataformat tsv virus-genome \ + --package 
{input.dataset_package} \ + --fields {params.fields_to_include:q} \ + > {output.ncbi_dataset_tsv} + """ + + +rule rename_columns: + input: + ncbi_dataset_tsv="results/metadata_post_extract.tsv", + output: + ncbi_dataset_tsv="results/metadata_post_rename.tsv", + run: + rename_columns(input.ncbi_dataset_tsv, output.ncbi_dataset_tsv) + + +rule prepare_metadata: + input: + metadata="results/metadata_post_rename.tsv", + config="config/config.yaml", + output: + metadata="results/metadata_post_prepare.tsv", + params: + log_level=LOG_LEVEL, + shell: + """ + python scripts/prepare_metadata.py \ + --config-file {input.config} \ + --input {input.metadata} \ + --output {output.metadata} \ + --log-level {params.log_level} \ + """ + + +rule submit_to_loculus: + input: + metadata="results/metadata_post_prepare.tsv", + sequences="results/sequences.fasta", + config="config/config.yaml", + output: + submitted=touch("results/submitted"), + params: + log_level=LOG_LEVEL, + shell: + """ + python scripts/submit_to_loculus.py \ + --mode submit \ + --metadata {input.metadata} \ + --sequences {input.sequences} \ + --config-file {input.config} \ + --log-level {params.log_level} \ + """ + + +rule approve: + input: + submitted="results/submitted", + config="config/config.yaml", + params: + log_level=LOG_LEVEL, + shell: + """ + python scripts/submit_to_loculus.py \ + --mode approve \ + --config-file {input.config} \ + --log-level {params.log_level} \ + """ diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml new file mode 100644 index 000000000..ae937d5bb --- /dev/null +++ b/ingest/config/config.yaml @@ -0,0 +1,137 @@ +log_level: DEBUG +compound_country_field: ncbi_geo_location +fasta_id_field: genbank_accession +rename: + genbank_accession: insdc_accession_full + ncbi_collection_date: collection_date + ncbi_isolate_name: isolate_name + ncbi_isolate_source: isolate_source + ncbi_sra_accessions: sra_accessions + ncbi_submitter_affiliation: author_affiliation + 
ncbi_submitter_country: submitter_country + ncbi_submitter_names: authors +keep: + - division + - country + - submissionId + - insdc_accession_base + - insdc_version + - bioprojects + - biosample_accession + - ncbi_completeness + - ncbi_host_name + - ncbi_host_tax_id + - ncbi_is_lab_host + - ncbi_length + - ncbi_protein_count + - ncbi_release_date + - ncbi_update_date + - ncbi_sourcedb + - ncbi_virus_name + - ncbi_virus_tax_id +taxon_id: 186538 +all_fields: + - accession + - bioprojects + - biosample-acc + - completeness + - gene-count + - geo-location + - geo-region + - host-common-name + - host-infraspecific-breed + - host-infraspecific-cultivar + - host-infraspecific-ecotype + - host-infraspecific-isolate + - host-infraspecific-sex + - host-infraspecific-strain + - host-name + - host-pangolin + - host-tax-id + - is-annotated + - is-complete + - is-lab-host + - is-vaccine-strain + - isolate-collection-date + - isolate-lineage + - isolate-lineage-source + - lab-host + - length + - matpeptide-count + - mol-type + - nucleotide-completeness + - protein-count + - purpose-of-sampling + - release-date + - sourcedb + - sra-accs + - submitter-affiliation + - submitter-country + - submitter-names + - update-date + - virus-common-name + - virus-infraspecific-breed + - virus-infraspecific-cultivar + - virus-infraspecific-ecotype + - virus-infraspecific-isolate + - virus-infraspecific-sex + - virus-infraspecific-strain + - virus-name + - virus-pangolin + - virus-tax-id +column_mapping: + Accession: genbank_accession + BioProjects: bioprojects + BioSample accession: biosample_accession + Completeness: ncbi_completeness + Gene count: ncbi_gene_count + Geographic Location: ncbi_geo_location + Geographic Region: ncbi_geo_region + Host Common Name: ncbi_host_common_name + Host Infraspecific Names Breed: ncbi_host_breed + Host Infraspecific Names Cultivar: ncbi_host_cultivar + Host Infraspecific Names Ecotype: ncbi_host_ecotype + Host Infraspecific Names Isolate: ncbi_host_isolate 
+ Host Infraspecific Names Sex: ncbi_host_sex + Host Infraspecific Names Strain: ncbi_host_strain + Host Name: ncbi_host_name + Host Pangolin Classification: ncbi_host_pangolin + Host Taxonomic ID: ncbi_host_tax_id + Is Annotated: ncbi_is_annotated + Is Complete: ncbi_is_complete + Is Lab Host: ncbi_is_lab_host + Is Vaccine Strain: ncbi_is_vaccine_strain + Isolate Collection date: ncbi_collection_date + Isolate Lineage: ncbi_isolate_name + Isolate Lineage source: ncbi_isolate_source + Lab Host: ncbi_lab_host + Length: ncbi_length + Mature peptide count: ncbi_mature_peptide_count + Molecule type: ncbi_mol_type + Nucleotide completeness: ncbi_nucleotide_completeness + Protein count: ncbi_protein_count + Purpose of Sampling: ncbi_purpose_of_sampling + Release date: ncbi_release_date + Source database: ncbi_sourcedb + SRA Accessions: ncbi_sra_accessions + Submitter Affiliation: ncbi_submitter_affiliation + Submitter Country: ncbi_submitter_country + Submitter Names: ncbi_submitter_names + Update date: ncbi_update_date + Virus Common Name: ncbi_virus_common_name + Virus Infraspecific Names Breed: ncbi_virus_breed + Virus Infraspecific Names Cultivar: ncbi_virus_cultivar + Virus Infraspecific Names Ecotype: ncbi_virus_ecotype + Virus Infraspecific Names Isolate: ncbi_virus_isolate + Virus Infraspecific Names Sex: ncbi_virus + Virus Infraspecific Names Strain: ncbi_virus_strain + Virus Name: ncbi_virus_name + Virus Pangolin Classification: ncbi_virus_pangolin + Virus Taxonomic ID: ncbi_virus_tax_id +group_name: insdc_ingest_group +username : insdc_ingest_user +password : insdc_ingest_user +keycloak_client_id : test-cli +backend_url : https://backend-ingest.loculus.org/ +keycloak_token_url : https://authentication-ingest.loculus.org/realms/loculus/protocol/openid-connect/token +organism: ebola-zaire diff --git a/ingest/environment.yml b/ingest/environment.yml new file mode 100644 index 000000000..48f6f6b73 --- /dev/null +++ b/ingest/environment.yml @@ -0,0 +1,15 @@ +name: 
loculus-ingest +channels: + - conda-forge + - bioconda +dependencies: + - python=3.12 + - pip=24.0 + - ncbi-datasets-cli + - snakemake + - pandas + - PyYAML + - click + - requests + - seqkit + - unzip diff --git a/ingest/profiles/default/config.yaml b/ingest/profiles/default/config.yaml new file mode 100644 index 000000000..f8e8e4d7e --- /dev/null +++ b/ingest/profiles/default/config.yaml @@ -0,0 +1,4 @@ +rerun-incomplete: true +printshellcmds: true +cores: all +configfile: config/config.yaml \ No newline at end of file diff --git a/ingest/ruff.toml b/ingest/ruff.toml new file mode 100644 index 000000000..56f010e91 --- /dev/null +++ b/ingest/ruff.toml @@ -0,0 +1,5 @@ +target-version = "py311" +line-length = 100 + +[lint] +select = ["E", "F", "B"] diff --git a/ingest/scripts/prepare_metadata.py b/ingest/scripts/prepare_metadata.py new file mode 100644 index 000000000..4241130a9 --- /dev/null +++ b/ingest/scripts/prepare_metadata.py @@ -0,0 +1,64 @@ +"""Script to rename fields and transform values prior to submission to Loculus""" + +# Needs to be configurable via yaml file +# Start off with a simple mapping +# Add transformations that can be applied to certain fields +# Like separation of country into country and division + +import hashlib +import logging +from dataclasses import dataclass + +import click +import pandas as pd +import yaml + + +@dataclass +class Config: + compound_country_field: str + fasta_id_field: str + rename: dict[str, str] + keep: list[str] + + +def hash_row_with_columns(row: pd.Series) -> str: + items = sorted((f"{col}_{val}" for col, val in row.items())) + row_string = "".join(items) + return hashlib.sha256(row_string.encode()).hexdigest() + + +@click.command() +@click.option("--config-file", required=True, type=click.Path(exists=True)) +@click.option("--input", required=True, type=click.Path(exists=True)) +@click.option("--output", required=True, type=click.Path()) +@click.option("--log-level", default="INFO", type=click.Choice(["DEBUG", 
"INFO", "WARNING", "ERROR", "CRITICAL"])) +def main(config_file: str, input: str, output: str, log_level: str) -> None: + logging.basicConfig(level=log_level) + with open(config_file) as file: + full_config = yaml.safe_load(file) + relevant_config = {key: full_config[key] for key in Config.__annotations__} + config = Config(**relevant_config) + logging.debug(config) + df = pd.read_csv(input, sep="\t").sort_values(by=config.compound_country_field) + logging.debug(df.columns) + df["division"] = df[config.compound_country_field].str.split(":", n=1).str[1].str.strip() + logging.debug(df["division"].unique()) + df["country"] = df[config.compound_country_field].str.split(":", n=1).str[0].str.strip() + logging.debug(df["country"].unique()) + df["submissionId"] = df[config.fasta_id_field] + logging.debug(df["submissionId"].unique()) + df["insdc_accession_base"] = df[config.fasta_id_field].str.split(".", n=1).str[0] + logging.debug(df["insdc_accession_base"]) + df["insdc_version"] = df[config.fasta_id_field].str.split(".", n=1).str[1] + logging.debug(df["insdc_version"].unique()) + df = df.rename(columns=config.rename) + # Drop columns that are neither a value of `rename` nor in `keep` + df = df.drop(columns=set(df.columns) - set(config.rename.values()) - set(config.keep)) + # Create a metadata hash that is independent of the order of the columns + df["metadata_hash"] = df.apply(hash_row_with_columns, axis=1) + df.to_csv(output, sep="\t", index=False) + + +if __name__ == "__main__": + main() diff --git a/ingest/scripts/submit_to_loculus.py b/ingest/scripts/submit_to_loculus.py new file mode 100644 index 000000000..381cd2715 --- /dev/null +++ b/ingest/scripts/submit_to_loculus.py @@ -0,0 +1,207 @@ +import logging +from dataclasses import dataclass +from time import sleep + +import click +import requests +import yaml + +logging.basicConfig(level=logging.DEBUG) + +@dataclass +class Config: + organism: str + backend_url: str + keycloak_token_url: str + keycloak_client_id: str + 
username: str + password: str + group_name: str + + +def organism_url(config: Config): + return f"{config.backend_url.rstrip('/')}/{config.organism.strip('/')}" + + +def get_jwt(config: Config): + """ + Get a JWT token for the given username and password + """ + + data = { + "username": config.username, + "password": config.password, + "grant_type": "password", + "client_id": config.keycloak_client_id, + } + headers = {"Content-Type": "application/x-www-form-urlencoded"} + + keycloak_token_url = config.keycloak_token_url + + response = requests.post(keycloak_token_url, data=data, headers=headers) + response.raise_for_status() + + jwt_keycloak = response.json() + jwt = jwt_keycloak["access_token"] + return jwt + + +def create_group(config: Config): + # Create the ingest group + url = f"{config.backend_url.rstrip('/')}/groups" + token = get_jwt(config) + group_name = config.group_name + + headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"} + + data = { + "groupName": group_name, + "institution": "NA", + "address": { + "line1": "1234 Loculus Street", + "line2": "NA", + "city": "Dortmund", + "state": "NRW", + "postalCode": "12345", + "country": "Germany", + }, + "contactEmail": "something@loculus.org", + } + + response = requests.post(url, json=data, headers=headers) + + if response.status_code == 409: + print("Group already exists") + # raise if not 409 and not happy 2xx + elif not response.ok: + print(f"Error creating group: {response.json()}") + response.raise_for_status() + + +def submit(metadata, sequences, config: Config): + """ + Submit data to Loculus. 
+ """ + + jwt = get_jwt(config) + + # Endpoint URL + url = f"{organism_url(config)}/submit" + + # Headers with Bearer Authentication + headers = {"Authorization": f"Bearer {jwt}"} + + # Files to be uploaded + files = { + "metadataFile": open(metadata, "rb"), + "sequenceFile": open(sequences, "rb"), + } + + # Query parameters + params = { + "groupName": config.group_name, + "dataUseTermsType": "OPEN", + } + + # POST request + response = requests.post(url, headers=headers, files=files, params=params) + response.raise_for_status() + + # Closing files + files["metadataFile"].close() + files["sequenceFile"].close() + + return response.json() + + +def approve(config: Config): + """ + Get sequences that were preprocessed successfully and approve them. + 1. Get the ids of the sequences that were preprocessed successfully + /ORGANISM/get-sequences + 2. Approve the sequences + """ + jwt = get_jwt(config) + + url = f"{organism_url(config)}/get-sequences" + + # Headers with Bearer Authentication + headers = {"Authorization": f"Bearer {jwt}"} + + # POST request + response = requests.get(url, headers=headers) + response.raise_for_status() + + payload = {"scope": "ALL"} + + url = f"{organism_url(config)}/approve-processed-data" + + response = requests.post(url, headers=headers, json=payload) + response.raise_for_status() + + return response.json() + + +# %% + + +@click.command() +@click.option( + "--metadata", + required=False, + type=click.Path(exists=True), +) +@click.option( + "--sequences", + required=False, + type=click.Path(exists=True), +) +@click.option( + "--mode", + required=True, + type=click.Choice(["submit", "approve"]), +) +@click.option( + "--log-level", + default="INFO", + type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]), +) +@click.option( + "--config-file", + required=True, + type=click.Path(exists=True), +) +def submit_to_loculus(metadata, sequences, mode, log_level, config_file): + """ + Submit data to Loculus. 
+ """ + logging.basicConfig(level=log_level) + with open(config_file) as file: + full_config = yaml.safe_load(file) + relevant_config = {key: full_config[key] for key in Config.__annotations__} + config = Config(**relevant_config) + + if mode == "submit": + logging.info("Submitting to Loculus") + logging.debug(f"Config: {config}") + # Create group if it doesn't exist + logging.info(f"Creating group {config.group_name}") + create_group(config) + logging.info(f"Group {config.group_name} created") + + # Submit + logging.info("Starting submission") + response = submit(metadata, sequences, config) + logging.info("Submission complete") + + if mode == "approve": + while True: + logging.info("Approving sequences") + response = approve(config) + logging.debug(f"Approved: {response}") + sleep(10) + + + +if __name__ == "__main__": + submit_to_loculus() diff --git a/kubernetes/loculus/templates/loculus-ingest-config.yaml b/kubernetes/loculus/templates/loculus-ingest-config.yaml new file mode 100644 index 000000000..295ad89bf --- /dev/null +++ b/kubernetes/loculus/templates/loculus-ingest-config.yaml @@ -0,0 +1,23 @@ +{{ $backendHost := .Values.disableBackend | ternary + "http://host.k3d.internal:8079" + "http://loculus-backend-service:8079" +}} +{{- $keycloakHost := .Values.environment | eq "server" | ternary + (printf "https://authentication-%s" $.Values.host) + "http://loculus-keycloak-service:8083" +}} +{{- range $key, $values := (.Values.organisms | default .Values.defaultOrganisms) }} +{{- if $values.ingest }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: loculus-ingest-config-{{ $key }} +data: + config.yaml: | + {{- $values.ingest.configFile | toYaml | nindent 4 }} + organism: {{ $key }} + backend_url: {{ $backendHost }} + keycloak_token_url: {{ $keycloakHost -}}/realms/loculus/protocol/openid-connect/token +{{- end }} +{{- end }} \ No newline at end of file diff --git a/kubernetes/loculus/templates/loculus-ingest-deployment.yaml 
b/kubernetes/loculus/templates/loculus-ingest-deployment.yaml new file mode 100644 index 000000000..a5a7659d0 --- /dev/null +++ b/kubernetes/loculus/templates/loculus-ingest-deployment.yaml @@ -0,0 +1,46 @@ +{{- $dockerTag := include "loculus.dockerTag" .Values }} +{{- if not .Values.disableIngest }} +{{- range $key, $value := (.Values.organisms | default .Values.defaultOrganisms) }} +{{- if $value.ingest }} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: loculus-ingest-{{ $key }} + annotations: + argocd.argoproj.io/sync-options: Replace=true + reloader.stakater.com/auto: "true" +spec: + replicas: 1 + selector: + matchLabels: + app: loculus + component: loculus-ingest-{{ $key }} + template: + metadata: + labels: + app: loculus + component: loculus-ingest-{{ $key }} + spec: + containers: + - name: ingest-{{ $key }} + image: {{ $value.ingest.image}}:{{ $dockerTag }} + imagePullPolicy: Always + args: + {{- range $arg := $value.ingest.args }} + - "{{ $arg }}" + {{- end }} + {{- if $value.ingest.configFile }} + volumeMounts: + - name: loculus-ingest-config-volume-{{ $key }} + mountPath: /package/config + volumes: + - name: loculus-ingest-config-volume-{{ $key }} + configMap: + name: loculus-ingest-config-{{ $key }} + {{- end }} + imagePullSecrets: + - name: ghcr-secret +{{- end }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 8a004a19c..0a7fe5669 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -12,6 +12,7 @@ keycloakDatabase: disableWebsite: false disableBackend: false disablePreprocessing: false +disableIngest: false siloImportLimitSeconds: 3600 accessionPrefix: "LOC_" name: "Loculus" @@ -223,12 +224,97 @@ defaultOrganisms: autocomplete: true - name: authors type: string + - name: submitter_country + type: string + generateIndex: true + autocomplete: true + - name: division + type: string + generateIndex: true + autocomplete: true 
+ - name: insdc_accession_base + type: string + - name: insdc_version + type: string + - name: insdc_accession_full + type: string + - name: bioprojects + type: string + - name: biosample_accession + type: string + - name: ncbi_completeness + type: string + generateIndex: true + autocomplete: true + - name: ncbi_host_name + type: string + generateIndex: true + autocomplete: true + - name: ncbi_host_tax_id + type: string + generateIndex: true + autocomplete: true + - name: ncbi_is_lab_host + type: string + generateIndex: true + autocomplete: true + - name: ncbi_length + type: string + - name: ncbi_protein_count + type: string + - name: ncbi_update_date + type: date + - name: ncbi_sourcedb + type: string + generateIndex: true + autocomplete: true + - name: ncbi_virus_name + type: string + generateIndex: true + autocomplete: true + - name: ncbi_virus_tax_id + type: string + generateIndex: true + autocomplete: true + - name: isolate_source + type: string + generateIndex: true + autocomplete: true + - name: sra_accessions + type: string + - name: total_snps + type: string + - name: total_inserted_nucs + type: string + - name: total_deleted_nucs + type: string + - name: total_ambiguous_nucs + type: string + - name: total_unknown_nucs + type: string + - name: total_frame_shifts + type: string + - name: frame_shifts + type: string + - name: completeness + type: string + - name: total_stop_codons + type: string + - name: stop_codons + type: string + - name: metadata_hash + type: string website: tableColumns: - collection_date - country + - division + - submitter_country - author_affiliation - ncbi_release_date + - insdc_accession_full + - ncbi_length + - ncbi_host_name defaultOrderBy: collection_date defaultOrder: descending silo: @@ -253,6 +339,46 @@ defaultOrganisms: - L batch_size: 100 processing_spec: + total_snps: + function: identity + inputs: + input: nextclade.totalSubstitutions + total_inserted_nucs: + function: identity + inputs: + input: nextclade.totalInsertions 
+ total_deleted_nucs: + function: identity + inputs: + input: nextclade.totalDeletions + total_ambiguous_nucs: + function: identity + inputs: + input: nextclade.totalNonACGTNs + total_unknown_nucs: + function: identity + inputs: + input: nextclade.totalMissing + total_frame_shifts: + function: identity + inputs: + input: nextclade.totalFrameShifts + frame_shifts: + function: identity + inputs: + input: nextclade.frameShifts + completeness: + function: identity + inputs: + input: nextclade.coverage + total_stop_codons: + function: identity + inputs: + input: nextclade.qc.stopCodons.totalStopCodons + stop_codons: + function: identity + inputs: + input: nextclade.qc.stopCodons.stopCodons collection_date: function: process_date inputs: @@ -280,6 +406,225 @@ defaultOrganisms: function: identity inputs: input: isolate_name + submitter_country: + function: identity + inputs: + input: submitter_country + division: + function: identity + inputs: + input: division + insdc_accession_base: + function: identity + inputs: + input: insdc_accession_base + insdc_version: + function: identity + inputs: + input: insdc_version + insdc_accession_full: + function: identity + inputs: + input: insdc_accession_full + bioprojects: + function: identity + inputs: + input: bioprojects + biosample_accession: + function: identity + inputs: + input: biosample_accession + ncbi_completeness: + function: identity + inputs: + input: ncbi_completeness + ncbi_host_name: + function: identity + inputs: + input: ncbi_host_name + ncbi_host_tax_id: + function: identity + inputs: + input: ncbi_host_tax_id + ncbi_is_lab_host: + function: identity + inputs: + input: ncbi_is_lab_host + ncbi_length: + function: identity + inputs: + input: ncbi_length + ncbi_protein_count: + function: identity + inputs: + input: ncbi_protein_count + ncbi_update_date: + function: parse_timestamp + inputs: + timestamp: ncbi_update_date + ncbi_sourcedb: + function: identity + inputs: + input: ncbi_sourcedb + ncbi_virus_name: + 
function: identity + inputs: + input: ncbi_virus_name + ncbi_virus_tax_id: + function: identity + inputs: + input: ncbi_virus_tax_id + isolate_source: + function: identity + inputs: + input: isolate_source + sra_accessions: + function: identity + inputs: + input: sra_accessions + metadata_hash: + function: identity + inputs: + input: metadata_hash + ingest: + args: + - snakemake + - approve + image: ghcr.io/loculus-project/ingest + configFile: + compound_country_field: ncbi_geo_location + fasta_id_field: genbank_accession + rename: + genbank_accession: insdc_accession_full + ncbi_collection_date: collection_date + ncbi_isolate_name: isolate_name + ncbi_isolate_source: isolate_source + ncbi_sra_accessions: sra_accessions + ncbi_submitter_affiliation: author_affiliation + ncbi_submitter_country: submitter_country + ncbi_submitter_names: authors + keep: + - division + - country + - submissionId + - insdc_accession_base + - insdc_version + - bioprojects + - biosample_accession + - ncbi_completeness + - ncbi_host_name + - ncbi_host_tax_id + - ncbi_is_lab_host + - ncbi_length + - ncbi_protein_count + - ncbi_release_date + - ncbi_update_date + - ncbi_sourcedb + - ncbi_virus_name + - ncbi_virus_tax_id + taxon_id: 186538 + all_fields: + - accession + - bioprojects + - biosample-acc + - completeness + - gene-count + - geo-location + - geo-region + - host-common-name + - host-infraspecific-breed + - host-infraspecific-cultivar + - host-infraspecific-ecotype + - host-infraspecific-isolate + - host-infraspecific-sex + - host-infraspecific-strain + - host-name + - host-pangolin + - host-tax-id + - is-annotated + - is-complete + - is-lab-host + - is-vaccine-strain + - isolate-collection-date + - isolate-lineage + - isolate-lineage-source + - lab-host + - length + - matpeptide-count + - mol-type + - nucleotide-completeness + - protein-count + - purpose-of-sampling + - release-date + - sourcedb + - sra-accs + - submitter-affiliation + - submitter-country + - submitter-names + - 
update-date + - virus-common-name + - virus-infraspecific-breed + - virus-infraspecific-cultivar + - virus-infraspecific-ecotype + - virus-infraspecific-isolate + - virus-infraspecific-sex + - virus-infraspecific-strain + - virus-name + - virus-pangolin + - virus-tax-id + column_mapping: + Accession: genbank_accession + BioProjects: bioprojects + BioSample accession: biosample_accession + Completeness: ncbi_completeness + Gene count: ncbi_gene_count + Geographic Location: ncbi_geo_location + Geographic Region: ncbi_geo_region + Host Common Name: ncbi_host_common_name + Host Infraspecific Names Breed: ncbi_host_breed + Host Infraspecific Names Cultivar: ncbi_host_cultivar + Host Infraspecific Names Ecotype: ncbi_host_ecotype + Host Infraspecific Names Isolate: ncbi_host_isolate + Host Infraspecific Names Sex: ncbi_host_sex + Host Infraspecific Names Strain: ncbi_host_strain + Host Name: ncbi_host_name + Host Pangolin Classification: ncbi_host_pangolin + Host Taxonomic ID: ncbi_host_tax_id + Is Annotated: ncbi_is_annotated + Is Complete: ncbi_is_complete + Is Lab Host: ncbi_is_lab_host + Is Vaccine Strain: ncbi_is_vaccine_strain + Isolate Collection date: ncbi_collection_date + Isolate Lineage: ncbi_isolate_name + Isolate Lineage source: ncbi_isolate_source + Lab Host: ncbi_lab_host + Length: ncbi_length + Mature peptide count: ncbi_mature_peptide_count + Molecule type: ncbi_mol_type + Nucleotide completeness: ncbi_nucleotide_completeness + Protein count: ncbi_protein_count + Purpose of Sampling: ncbi_purpose_of_sampling + Release date: ncbi_release_date + Source database: ncbi_sourcedb + SRA Accessions: ncbi_sra_accessions + Submitter Affiliation: ncbi_submitter_affiliation + Submitter Country: ncbi_submitter_country + Submitter Names: ncbi_submitter_names + Update date: ncbi_update_date + Virus Common Name: ncbi_virus_common_name + Virus Infraspecific Names Breed: ncbi_virus_breed + Virus Infraspecific Names Cultivar: ncbi_virus_cultivar + Virus Infraspecific Names 
Ecotype: ncbi_virus_ecotype + Virus Infraspecific Names Isolate: ncbi_virus_isolate + Virus Infraspecific Names Sex: ncbi_virus + Virus Infraspecific Names Strain: ncbi_virus_strain + Virus Name: ncbi_virus_name + Virus Pangolin Classification: ncbi_virus_pangolin + Virus Taxonomic ID: ncbi_virus_tax_id + group_name: insdc_ingest_group + username : insdc_ingest_user + password : insdc_ingest_user + keycloak_client_id : test-cli referenceGenomes: nucleotideSequences: - name: "main"