Skip to content

Commit

Permalink
feat(ingest): Add basic ingest pipeline, deploy for ebola-zaire (#1399)
Browse files Browse the repository at this point in the history
  • Loading branch information
corneliusroemer authored Mar 21, 2024
1 parent f3ad919 commit f405c8b
Show file tree
Hide file tree
Showing 16 changed files with 1,098 additions and 0 deletions.
82 changes: 82 additions & 0 deletions .github/workflows/ingest.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Builds the ingest Docker image from ./ingest and pushes it to GHCR.
# The image is tagged with a hash of its input files; when an image with that
# tag already exists, it is re-tagged instead of being rebuilt.
name: ingest
on:
  push:
  workflow_dispatch:
    inputs:
      build_arm:
        type: boolean
        description: "Build for ARM as well"
        default: false
        required: false

env:
  DOCKER_IMAGE_NAME: ghcr.io/loculus-project/ingest
  # Always rendered as the string 'true' or 'false'.
  # `github.event.inputs.*` values are strings, so the input is compared
  # explicitly: a bare `|| github.event.inputs.build_arm` yields the non-empty
  # (and therefore truthy) string 'false' when the box is left unchecked.
  BUILD_ARM: ${{ github.ref == 'refs/heads/main' || github.event.inputs.build_arm == 'true' }}

concurrency:
  # On main every run gets its own group (run_id), so nothing is cancelled;
  # on any other ref a newer run cancels the run in progress for that ref.
  group: ci-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}-ingest
  cancel-in-progress: true

jobs:
  dockerImage:
    name: Build ingest Docker Image  # Don't change: Referenced by .github/workflows/update-argocd-metadata.yml
    runs-on: ubuntu-latest
    timeout-minutes: 15
    permissions:
      contents: read
      packages: write
      checks: read
    steps:
      - uses: actions/checkout@v4

      # DIR_HASH identifies the image contents: hash of the ingest sources and
      # of this workflow file, plus an '-arm' suffix for multi-arch builds.
      - name: Generate files hash
        id: files-hash
        run: |
          DIR_HASH=$(echo -n ${{ hashFiles('ingest/**', '.github/workflows/ingest.yml') }})
          # env values are strings, so compare against 'true' explicitly
          # (the string 'false' would otherwise count as truthy).
          echo "DIR_HASH=$DIR_HASH${{ env.BUILD_ARM == 'true' && '-arm' || '' }}" >> "$GITHUB_ENV"

      - name: Setup Docker metadata
        id: dockerMetadata
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.DOCKER_IMAGE_NAME }}
          tags: |
            type=raw,value=${{ env.DIR_HASH }}
            type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' }}
            type=ref,event=branch
            type=sha,prefix=commit-

      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # CACHE_HIT is 'true' when an image tagged with DIR_HASH is already in
      # the registry, 'false' otherwise (including when the lookup fails).
      - name: Check if image exists
        id: check-image
        run: |
          EXISTS=$(docker manifest inspect ${{ env.DOCKER_IMAGE_NAME }}:${{ env.DIR_HASH }} > /dev/null 2>&1 && echo "true" || echo "false")
          echo "CACHE_HIT=$EXISTS" >> "$GITHUB_ENV"

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Build and push image if input files changed
        if: env.CACHE_HIT == 'false'
        uses: docker/build-push-action@v5
        with:
          context: ./ingest
          push: true
          tags: ${{ steps.dockerMetadata.outputs.tags }}
          cache-from: type=gha,scope=ingest-${{ github.ref }}
          cache-to: type=gha,mode=max,scope=ingest-${{ github.ref }}
          platforms: ${{ env.BUILD_ARM == 'true' && 'linux/amd64,linux/arm64' || 'linux/amd64' }}

      # No rebuild needed: point every tag computed above at the existing
      # DIR_HASH image.
      - name: Retag and push existing image if cache hit
        if: env.CACHE_HIT == 'true'
        run: |
          TAGS=(${{ steps.dockerMetadata.outputs.tags }})
          for TAG in "${TAGS[@]}"; do
            docker buildx imagetools create --tag "$TAG" ${{ env.DOCKER_IMAGE_NAME }}:${{ env.DIR_HASH }}
          done
8 changes: 8 additions & 0 deletions ingest/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Paths excluded from the Docker build context of the ingest image.
# Tool state and version-control metadata
.snakemake/
.git/
# Local data and pipeline outputs (the Snakefile rules write to results/)
data/
results/
# NOTE(review): singular `result/` is not written by any rule in the Snakefile — confirm it is needed
result/
.DS_Store
.ruff_cache
# The config file is kept out of the image; the Snakefile rules read it from
# config/config.yaml, so it presumably has to be supplied at run time — confirm
config/config.yaml
5 changes: 5 additions & 0 deletions ingest/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Files that must not be committed from the ingest directory.
# Tool state
.snakemake/
# Local data and pipeline outputs (the Snakefile rules write to results/)
data/
results/
# OS / linter artefacts
.DS_Store
.ruff_cache
6 changes: 6 additions & 0 deletions ingest/.mambarc
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# micromamba configuration; the Dockerfile passes this file to
# `micromamba install` via --rc-file.

# Channels to install packages from.
channels:
  - conda-forge
  - bioconda
# NOTE(review): presumably enables zstd-compressed repodata — confirm.
repodata_use_zst: true
# With strict priority the order of `channels` above is significant.
channel_priority: strict
# NOTE(review): presumably the number of parallel package downloads — confirm.
download_threads: 20
15 changes: 15 additions & 0 deletions ingest/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Image for the ingest pipeline, based on the micromamba image.
FROM mambaorg/micromamba:1.5.7

# Copy only the dependency spec and the mamba configuration first; the install
# step below needs nothing else from the build context.
COPY --chown=$MAMBA_USER:$MAMBA_USER environment.yml /tmp/env.yaml
COPY --chown=$MAMBA_USER:$MAMBA_USER .mambarc /tmp/.mambarc

# Install the dependencies into the `base` environment using the copied rc
# file, then clean the package caches within the same layer.
# NOTE(review): extract_threads is limited to 1 — presumably a workaround for
# emulated (ARM) builds; confirm.
RUN micromamba config set extract_threads 1 \
    && micromamba install -y -n base -f /tmp/env.yaml --rc-file /tmp/.mambarc \
    && micromamba clean --all --yes

# This is a build argument, not a runtime environment variable.
# NOTE(review): in the micromamba base image it is understood to activate the
# environment for subsequent RUN instructions; no RUN follows it here, so
# confirm whether it is still required.
ARG MAMBA_DOCKERFILE_ACTIVATE=1

# Copy the rest of the build context (minus .dockerignore entries) and make it
# the working directory.
COPY --chown=$MAMBA_USER:$MAMBA_USER . /package

WORKDIR /package
12 changes: 12 additions & 0 deletions ingest/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Pipeline to ingest data from INSDC into loculus

## Overview

1. Download data from INSDC
2. Filter the data
3. Turn into FASTA/Metadata
4. Upload to loculus

## Deployment

The pipeline is packaged as a Docker image; the container takes a config file as input.
124 changes: 124 additions & 0 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# Workflow-wide settings read from the Snakemake `config` mapping.
# `taxon_id`, `all_fields` and `column_mapping` are looked up with [] and are
# therefore required; `log_level` is optional.

# Taxon passed to `datasets download virus genome taxon`.
TAXON_ID = config["taxon_id"]
# Field names joined into the comma-separated value given to `dataformat --fields`.
ALL_FIELDS = ",".join(config["all_fields"])
# Header renames applied by rename_columns(): original name -> new name.
COLUMN_MAPPING = config["column_mapping"]
# Forwarded to the Python scripts as --log-level; defaults to "INFO".
LOG_LEVEL = config.get("log_level", "INFO")


def rename_columns(input_file, output_file):
    """Copy a TSV file, renaming its header columns via COLUMN_MAPPING.

    Only the first line is treated as the header. Header names without an
    entry in COLUMN_MAPPING are kept unchanged, and every following line is
    copied verbatim.

    Args:
        input_file: Path of the tab-separated file to read.
        output_file: Path of the file to write; overwritten if it exists.
    """
    with open(input_file, "r") as src, open(output_file, "w") as dst:
        # Remove only the line terminator. A bare strip() would also remove
        # leading/trailing tabs and thereby drop empty edge columns, leaving
        # the header with fewer fields than the data rows.
        header = src.readline().rstrip("\r\n").split("\t")
        renamed = [COLUMN_MAPPING.get(name, name) for name in header]
        dst.write("\t".join(renamed) + "\n")
        # The remaining lines already carry their own newlines.
        dst.writelines(src)


# Default target: the locally prepared sequences and metadata.
# The targets must be files that some rule produces; every rule in this
# Snakefile writes under results/, so the previous data/... targets could not
# be built. Submission and approval are not part of the default target and
# have to be requested explicitly.
rule all:
    input:
        "results/sequences.fasta",
        "results/metadata_post_prepare.tsv",


# Download the NCBI virus genome dataset package for TAXON_ID as a zip archive.
# The job is retried up to 5 times on failure.
rule fetch_ncbi_dataset_package:
    output:
        dataset_package="results/ncbi_dataset.zip",
    retries: 5
    shell:
        """
        datasets download virus genome taxon {TAXON_ID} \
            --no-progressbar \
            --filename {output.dataset_package}
        """


# Stream ncbi_dataset/data/genomic.fna out of the dataset zip and pipe it
# through `seqkit seq -i -w0` into results/sequences.fasta.
# NOTE(review): -i / -w0 presumably reduce headers to the sequence ID and
# disable line wrapping — confirm against the seqkit documentation.
rule extract_ncbi_dataset_sequences:
    input:
        dataset_package="results/ncbi_dataset.zip",
    output:
        ncbi_dataset_sequences="results/sequences.fasta",
    shell:
        """
        unzip -jp {input.dataset_package} \
            ncbi_dataset/data/genomic.fna \
        | seqkit seq -i -w0 \
        > {output.ncbi_dataset_sequences}
        """


# Produce a metadata TSV from the dataset package with `dataformat`, limited
# to the fields listed in ALL_FIELDS.
rule format_ncbi_dataset_report:
    input:
        dataset_package="results/ncbi_dataset.zip",
    output:
        ncbi_dataset_tsv="results/metadata_post_extract.tsv",
    params:
        fields_to_include=ALL_FIELDS,
    shell:
        # `:q` shell-quotes the comma-separated field list.
        """
        dataformat tsv virus-genome \
            --package {input.dataset_package} \
            --fields {params.fields_to_include:q} \
            > {output.ncbi_dataset_tsv}
        """


# Rewrite the header of the extracted metadata TSV by calling the
# rename_columns() helper, which applies COLUMN_MAPPING.
rule rename_columns:
    input:
        ncbi_dataset_tsv="results/metadata_post_extract.tsv",
    output:
        ncbi_dataset_tsv="results/metadata_post_rename.tsv",
    run:
        rename_columns(input.ncbi_dataset_tsv, output.ncbi_dataset_tsv)


# Run scripts/prepare_metadata.py on the renamed metadata, passing it the
# config file and log level, to produce results/metadata_post_prepare.tsv.
rule prepare_metadata:
    input:
        metadata="results/metadata_post_rename.tsv",
        config="config/config.yaml",
    output:
        metadata="results/metadata_post_prepare.tsv",
    params:
        log_level=LOG_LEVEL,
    shell:
        """
        python scripts/prepare_metadata.py \
            --config-file {input.config} \
            --input {input.metadata} \
            --output {output.metadata} \
            --log-level {params.log_level} \
        """


# Run scripts/submit_to_loculus.py in `submit` mode with the prepared metadata
# and the extracted sequences. The only output is the touched marker file
# results/submitted.
rule submit_to_loculus:
    input:
        metadata="results/metadata_post_prepare.tsv",
        sequences="results/sequences.fasta",
        config="config/config.yaml",
    output:
        submitted=touch("results/submitted"),
    params:
        log_level=LOG_LEVEL,
    shell:
        """
        python scripts/submit_to_loculus.py \
            --mode submit \
            --metadata {input.metadata} \
            --sequences {input.sequences} \
            --config-file {input.config} \
            --log-level {params.log_level} \
        """


# Run scripts/submit_to_loculus.py in `approve` mode. Requires the
# results/submitted marker as input and declares no output files.
rule approve:
    input:
        submitted="results/submitted",
        config="config/config.yaml",
    params:
        log_level=LOG_LEVEL,
    shell:
        """
        python scripts/submit_to_loculus.py \
            --mode approve \
            --config-file {input.config} \
            --log-level {params.log_level} \
        """
Loading

0 comments on commit f405c8b

Please sign in to comment.