Skip to content

Commit

Permalink
Merge branch 'ray-torchtrainer-integration' into hpo-tutorial
Browse files Browse the repository at this point in the history
  • Loading branch information
annaelisalappe committed Nov 26, 2024
2 parents 617082d + 052d648 commit 9d77f6d
Show file tree
Hide file tree
Showing 183 changed files with 6,987 additions and 3,158 deletions.
1 change: 0 additions & 1 deletion .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ pl-training.yml
# Project folders/files
# use-cases
workflows
tests
CHANGELOG

# Docs
Expand Down
12 changes: 11 additions & 1 deletion .github/linters/.hadolint.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
# --------------------------------------------------------------------------------------
# Part of the interTwin Project: https://www.intertwin.eu/
#
# Created by: Matteo Bunino
#
# Credit:
# - Matteo Bunino <[email protected]> - CERN
# --------------------------------------------------------------------------------------

# Hadolint configuration: findings at "warning" severity or above fail the lint run.
failure-threshold: warning
# Rules that are deliberately not enforced. Each rule must be listed only once.
ignored:
  - DL3008  # Pin versions in apt get install.
  - DL3013  # Pin versions in pip. TODO: remove.
  - DL4001  # Either use Wget or Curl but not both
  - DL3003  # Use WORKDIR to switch to a directory
  - DL3006  # Always tag the version of an image explicitly: https://github.com/hadolint/hadolint/issues/339
77 changes: 77 additions & 0 deletions .github/workflows/container-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# --------------------------------------------------------------------------------------
# Part of the interTwin Project: https://www.intertwin.eu/
#
# Created by: Matteo Bunino
#
# Credit:
# - Matteo Bunino <[email protected]> - CERN
# --------------------------------------------------------------------------------------

# Builds the torch container image, tests it and publishes it through the
# Dagger pipeline located in the `ci` directory. Runs on pushes to `main`.
name: Container CI

on:
  push:
    branches: [main]
  # pull_request:

jobs:
  build:
    name: build
    runs-on: ubuntu-latest
    steps:
      - name: Maximize build disk space
        uses: easimon/maximize-build-space@v10
        with:
          # Reserve space on root for docker cache
          root-reserve-mb: 35000
          overprovision-lvm: true
          swap-size-mb: 1024
          remove-dotnet: true
          remove-android: true
          remove-haskell: true
          remove-codeql: true

      - name: Checkout
        uses: actions/checkout@v4

      - name: Get Repo Owner
        id: get_repo_owner
        # Step outputs are written to $GITHUB_OUTPUT as `key=value` lines.
        # The lowercased owner is exposed as
        # steps.get_repo_owner.outputs.repo_owner.
        run: |
          echo "repo_owner=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')" >> "$GITHUB_OUTPUT"

      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Delete huge unnecessary tools folder and large packages
        # The cleanup script is fetched at a fixed commit so its content
        # cannot change between runs.
        run: |
          rm -rf /opt/hostedtoolcache &&
          curl -fsSL https://raw.githubusercontent.com/apache/flink/02d30ace69dc18555a5085eccf70ee884e73a16e/tools/azure-pipelines/free_disk_space.sh | bash

      - name: Prepare Environment Variables
        # Values appended to $GITHUB_ENV become `env.*` in the following steps.
        run: |
          echo "COMMIT_HASH=$(git rev-parse --verify HEAD)" >> "$GITHUB_ENV"
          BASE_IMG_NAME=nvcr.io/nvidia/pytorch:24.05-py3
          echo "BASE_IMG_NAME=$BASE_IMG_NAME" >> "$GITHUB_ENV"
          BASE_IMG_DIGEST=$(docker pull "$BASE_IMG_NAME" > /dev/null 2>&1 && docker inspect "$BASE_IMG_NAME" --format='{{index .RepoDigests 0}}')
          echo "BASE_IMG_DIGEST=$BASE_IMG_DIGEST" >> "$GITHUB_ENV"
          # The base image was pulled only to read its digest: free the space again
          docker system prune -af

      - name: Integration Test
        uses: dagger/dagger-for-github@v7
        with:
          workdir: ci
          verb: call
          # Folded scalar: the lines below are joined with single spaces.
          # The tag template is single-quoted so that its ${...} placeholders
          # are passed on literally.
          args: >-
            --name="${{ env.COMMIT_HASH }}-torch"
            build-container --context=.. --dockerfile=../env-files/torch/Dockerfile
            --build-args="COMMIT_HASH=${{ env.COMMIT_HASH }},BASE_IMG_NAME=${{ env.BASE_IMG_NAME }},BASE_IMG_DIGEST=${{ env.BASE_IMG_DIGEST }}"
            test-n-publish --kubeconfig=env:KUBECONFIG_STR --stage=DEV --framework=TORCH
            --tag-template='${itwinai_version}-torch${framework_version}-${os_version}'
          cloud-token: ${{ secrets.DAGGER_CLOUD_TOKEN }}
          # NOTE(review): ci/dagger.json declares engineVersion v0.13.6 while
          # this floats on "latest" -- confirm whether it should be pinned.
          version: "latest"
        env:
          KUBECONFIG_STR: ${{ secrets.KUBECONFIG_INFN }}

1 change: 1 addition & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ jobs:

VALIDATE_PYTHON: true
VALIDATE_PYTHON_BLACK: false
VALIDATE_ISORT: false
VALIDATE_PYTHON_MYPY: false
VALIDATE_PYTHON_PYLINT: false
VALIDATE_HTML: false
Expand Down
5 changes: 2 additions & 3 deletions .vscode/extensions.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@
"davidanson.vscode-markdownlint",
"ms-python.vscode-pylance",
"ms-python.python",
"ms-python.autopep8",
"bierner.markdown-mermaid",
"github.vscode-github-actions",
"tamasfe.even-better-toml",
"ms-python.isort"
"charliermarsh.ruff",
"github.vscode-github-actions"
]
}
8 changes: 6 additions & 2 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
{
"editor.formatOnSave": true,
"editor.defaultFormatter": null,
"editor.rulers": [
95
],
"cSpell.ignoreWords": [
"itwinpreproc",
"typer"
Expand Down Expand Up @@ -53,7 +56,7 @@
}
},
"[python]": {
"editor.defaultFormatter": "ms-python.autopep8"
"editor.defaultFormatter": "charliermarsh.ruff"
},
"python.testing.pytestArgs": [
"tests"
Expand All @@ -62,5 +65,6 @@
"python.testing.pytestEnabled": true,
"python.analysis.extraPaths": [
"./src/itwinai"
]
],
"makefile.configureOnOpen": false
}
30 changes: 26 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ environment for PyTorch:

```bash
ml --force purge
ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
ml Python/3.11.5-GCCcore-13.2.0 CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
ml GCCcore/11.3.0 NCCL cuDNN/8.9.7.29-CUDA-12.3.0 UCX-CUDA/1.15.0-GCCcore-13.2.0-CUDA-12.3.0
```

Expand All @@ -80,7 +80,7 @@ environment for TensorFlow:

```bash
ml --force purge
ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
ml Python/3.11.5-GCCcore-13.2.0 CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
ml GCCcore/11.3.0 NCCL cuDNN/8.9.7.29-CUDA-12.3.0 UCX-CUDA/1.15.0-GCCcore-13.2.0-CUDA-12.3.0
```

Expand Down Expand Up @@ -227,7 +227,7 @@ Commands to be executed before activating the python virtual environment:

```bash
ml --force purge
ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
ml Python/3.11.5-GCCcore-13.2.0 CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
ml GCCcore/11.3.0 NCCL cuDNN/8.9.7.29-CUDA-12.3.0 UCX-CUDA/1.15.0-GCCcore-13.2.0-CUDA-12.3.0
```

Expand Down Expand Up @@ -261,7 +261,7 @@ Commands to be executed before activating the python virtual environment:

```bash
ml --force purge
ml Python CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
ml Python/3.11.5-GCCcore-13.2.0 CMake/3.24.3-GCCcore-11.3.0 mpi4py OpenMPI CUDA/12.3
ml GCCcore/11.3.0 NCCL cuDNN/8.9.7.29-CUDA-12.3.0 UCX-CUDA/1.15.0-GCCcore-13.2.0-CUDA-12.3.0
```

Expand Down Expand Up @@ -371,3 +371,25 @@ hashes.
> It is very important to keep the number of tags for `itwinai-cvmfs` as low
> as possible. Tags should only be created under this namespace when strictly
> necessary. Otherwise, this could cause issues for the Unpacker.

### Building a new container

Our docker manifests support labels to record provenance information, which can later be
accessed with `docker inspect IMAGE_NAME:TAG`.

A full example below:

```bash
# Image named in the last FROM instruction of the Dockerfile being built
export BASE_IMG_NAME="what goes after the last FROM"
# Name and tag to assign to the new image
export IMAGE_FULL_NAME="IMAGE_NAME:TAG"
# Each --build-arg passes one piece of provenance information to the build
docker build \
    -t "$IMAGE_FULL_NAME" \
    -f path/to/Dockerfile \
    --build-arg COMMIT_HASH="$(git rev-parse --verify HEAD)" \
    --build-arg BASE_IMG_NAME="$BASE_IMG_NAME" \
    --build-arg BASE_IMG_DIGEST="$(docker pull "$BASE_IMG_NAME" > /dev/null 2>&1 && docker inspect "$BASE_IMG_NAME" --format='{{index .RepoDigests 0}}')" \
    --build-arg ITWINAI_VERSION="$(grep -Po '(?<=^version = ")[^"]*' pyproject.toml)" \
    --build-arg CREATION_DATE="$(date +"%Y-%m-%dT%H:%M:%S%:z")" \
    --build-arg IMAGE_FULL_NAME="$IMAGE_FULL_NAME" \
    .
```
1 change: 1 addition & 0 deletions ci/.gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/sdk/** linguist-generated
3 changes: 3 additions & 0 deletions ci/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/sdk
/.venv
/**/__pycache__
6 changes: 6 additions & 0 deletions ci/dagger.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"name": "itwinai",
"sdk": "python",
"source": ".",
"engineVersion": "v0.13.6"
}
66 changes: 66 additions & 0 deletions ci/examples.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/bin/bash

# --------------------------------------------------------------------------------------
# Part of the interTwin Project: https://www.intertwin.eu/
#
# Created by: Matteo Bunino
#
# Credit:
# - Matteo Bunino <[email protected]> - CERN
# --------------------------------------------------------------------------------------

# Example of running dagger pipelines -- this script is mostly a scratchpad

# Build and run local tests (no HPC required)
dagger call \
    build-container --context=.. --dockerfile=../env-files/torch/Dockerfile \
    test-local

# Build container with additional requirements
dagger call \
    build-container --context=.. --dockerfile=../env-files/torch/Dockerfile \
    --build-args="REQUIREMENTS=env-files/torch/requirements/cmcc-requirements.txt" \
    test-local

# Build and publish
dagger call --name="$(git rev-parse --verify HEAD)" \
    build-container --context=.. --dockerfile=../env-files/torch/Dockerfile \
    publish

# Pipeline method: build, test local, push, test remote, and push (publish)
# Variables are assigned first and exported afterwards: `export VAR=$(cmd)`
# would hide the exit status of cmd behind the one of `export`.
COMMIT_HASH="$(git rev-parse --verify HEAD)"
BASE_IMG_NAME="nvcr.io/nvidia/pytorch:24.05-py3"
# Keep only what follows '@' in the first RepoDigests entry of the base image
BASE_IMG_DIGEST="$(docker pull "$BASE_IMG_NAME" > /dev/null 2>&1 && docker inspect "$BASE_IMG_NAME" --format='{{index .RepoDigests 0}}' | awk -F'@' '{print $2}')"
export COMMIT_HASH BASE_IMG_NAME BASE_IMG_DIGEST
# Single quotes keep the ${...} placeholders of the tag template literal,
# so the shell does not expand them.
dagger call --name="${COMMIT_HASH}-torch" \
    build-container --context=.. --dockerfile=../env-files/torch/Dockerfile \
    --build-args="COMMIT_HASH=$COMMIT_HASH,BASE_IMG_NAME=$BASE_IMG_NAME,BASE_IMG_DIGEST=$BASE_IMG_DIGEST" \
    test-n-publish --kubeconfig=env:KUBECONFIG_STR --stage=DEV --framework=TORCH \
    --tag-template='${itwinai_version}-torch${framework_version}-${os_version}'

# Open terminal in newly created container
dagger call \
    build-container --context=.. --dockerfile=../env-files/torch/Dockerfile \
    terminal


############## SLIM ###############
# Build container
dagger call --name="$(git rev-parse --verify HEAD)" \
    build-container --context=.. --dockerfile=../env-files/torch/slim.Dockerfile \
    test-local

# Test on HPC and publish
COMMIT_HASH="$(git rev-parse --verify HEAD)"
BASE_IMG_NAME="python:3.10-slim"
# Keep only what follows '@' in the first RepoDigests entry of the base image
BASE_IMG_DIGEST="$(docker pull "$BASE_IMG_NAME" > /dev/null 2>&1 && docker inspect "$BASE_IMG_NAME" --format='{{index .RepoDigests 0}}' | awk -F'@' '{print $2}')"
export COMMIT_HASH BASE_IMG_NAME BASE_IMG_DIGEST
dagger call --name="${COMMIT_HASH}-torch-slim" \
    build-container --context=.. --dockerfile=../env-files/torch/slim.Dockerfile \
    --build-args="COMMIT_HASH=$COMMIT_HASH,BASE_IMG_NAME=$BASE_IMG_NAME,BASE_IMG_DIGEST=$BASE_IMG_DIGEST" \
    test-n-publish --kubeconfig=env:KUBECONFIG_STR --stage=DEV --framework=TORCH \
    --tag-template='${itwinai_version}-slim-torch${framework_version}-${os_version}'


# Convert to singularity
# Reuses the COMMIT_HASH exported in the section above
dagger call --name="${COMMIT_HASH}-torch-slim" \
    build-container --context=.. --dockerfile=../env-files/torch/slim.Dockerfile \
    singularity --src-container "python:3.12" \
    export --path my_container.sif
38 changes: 38 additions & 0 deletions ci/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# --------------------------------------------------------------------------------------
# Part of the interTwin Project: https://www.intertwin.eu/
#
# Created by: Matteo Bunino
#
# Credit:
# - Matteo Bunino <[email protected]> - CERN
# --------------------------------------------------------------------------------------

# Metadata of the Python project that lives in the ci directory
[project]
name = "main"
version = "0.1.0"
maintainers = [{ name = "Matteo Bunino", email = "[email protected]" }]
authors = [{ name = "Matteo Bunino", email = "[email protected]" }]
requires-python = ">=3.12"
# dagger-io carries no version constraint here: its source is set in [tool.uv.sources]
dependencies = ["dagger-io", "kubernetes>=31.0.0", "ruff>=0.7.3"]

# Take dagger-io from the local "sdk" directory, installed in editable mode
[tool.uv.sources]
dagger-io = { path = "sdk", editable = true }

# Build backend, pinned to one exact hatchling release
[build-system]
requires = ["hatchling==1.25.0"]
build-backend = "hatchling.build"

# Ruff configuration: https://docs.astral.sh/ruff/configuration/
[tool.ruff]
# Same width as the editor ruler configured in .vscode/settings.json
line-length = 95

# Lint rule selection: the E, F, I and W rule groups are enabled, E203 is excluded
[tool.ruff.lint]
select = ["E", "F", "I", "W"]
ignore = ["E203"]
fixable = ["ALL"]

# Formatter output: double quotes, space indentation
[tool.ruff.format]
quote-style = "double"
indent-style = "space"
skip-magic-trailing-comma = false
line-ending = "auto"
Loading

0 comments on commit 9d77f6d

Please sign in to comment.