Skip to content

Commit

Permalink
CI move to ALPS (daint-gpu -> alps_gh200) (#1225)
Browse files Browse the repository at this point in the history
  • Loading branch information
rasolca authored Feb 11, 2025
1 parent 28bc43f commit b14ffba
Show file tree
Hide file tree
Showing 29 changed files with 367 additions and 193 deletions.
10 changes: 5 additions & 5 deletions ci/.gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ include:
- local: 'ci/cpu/gcc12_release_cxx20.yml'
- local: 'ci/cpu/gcc13_codecov.yml'
- local: 'ci/cpu/gcc13_release.yml'
- local: 'ci/cuda/gcc11_release.yml'
- local: 'ci/cuda/gcc11_release_scalapack.yml'
- local: 'ci/cuda/gcc11_codecov.yml'
- local: 'ci/cuda/gcc11_debug_scalapack.yml'
- local: 'ci/cuda/gcc13_release_stdexec.yml'
- local: 'ci/cuda/gcc13_release.yml'
- local: 'ci/cuda/gcc13_release_scalapack.yml'
- local: 'ci/cuda/gcc13_debug.yml'
- local: 'ci/cuda/gcc13_debug_scalapack.yml'
# - local: 'ci/cuda/gcc13_release_stdexec.yml'
- local: 'ci/rocm/clang14_release.yml'
- local: 'ci/rocm/clang14_release_stdexec.yml'
- local: 'ci/rocm/clang15_release_stdexec.yml'
64 changes: 64 additions & 0 deletions ci/base-images/gh200-cray-mpich/HOWTO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Modified base image to allow building cray-mpich

## Preparation steps

```
mkdir lib64
cp -a /usr/lib64/libcuda.* lib64/
cp -a /usr/lib64/libxpmem.* lib64/
git clone https://github.com/eth-cscs/alps-cluster-config.git
cp alps-cluster-config/daint/packages.yaml packages.yaml
```

## Edit cluster config files

Modify `packages.yaml`:
```
xpmem:
buildable: false
externals:
- spec: xpmem@<version>  # version lost to e-mail obfuscation in this page capture; use the installed xpmem version
prefix: /usr
libfabric:
- buildable: false
- externals:
- - spec: libfabric@1.15.2.0
- prefix: /opt/cray/libfabric/1.15.2.0/
+ require: "@1.15.2.0"
slurm:
buildable: false
externals:
- spec: slurm@23-11-7
prefix: /usr
```
Note: The container engine (CE) replaces libfabric with the system library when running the container.
Make sure the libfabric version required here matches the version installed on the system.


Modify `alps-cluster-config/site/repo/packages/cray-gtl/package.py`:
```
patchelf("--force-rpath", "--set-rpath", rpath, f, fail_on_error=False)
# The C compiler wrapper can fail because libmpi_gtl_cuda refers to the symbol
# __gxx_personality_v0 but wasn't linked against libstdc++.
- if "libmpi_gtl_cuda.so" in str(f):
- patchelf("--add-needed", "libstdc++.so", f, fail_on_error=False)
if "@8.1.27+cuda" in self.spec:
patchelf("--add-needed", "libcudart.so", f, fail_on_error=False)
patchelf("--add-needed", "libcuda.so", f, fail_on_error=False)
```
Note: starting from version 8.1.23, `libmpi_gtl_cuda.so` already links against `libstdc++.so`. All the available aarch64
libraries already link against it, therefore the `--add-needed libstdc++.so` patch can safely be removed for gh200.

## Build and push

```
export TAG="v1.4"
CSCS_REGISTRY="jfrog.svc.cscs.ch/docker-ci-ext/4700071344751697"
podman login jfrog.svc.cscs.ch
podman build -f build.Dockerfile -t $CSCS_REGISTRY/base-images/cuda_12.6.1-devel-ubuntu24.04:$TAG
podman push $CSCS_REGISTRY/base-images/cuda_12.6.1-devel-ubuntu24.04:$TAG
```
5 changes: 5 additions & 0 deletions ci/base-images/gh200-cray-mpich/build.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
FROM docker.io/nvidia/cuda:12.6.1-devel-ubuntu24.04

COPY lib64 /usr/lib
COPY packages.yaml /root/.spack/packages.yaml
COPY alps-cluster-config/site /root/site
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
# dlaf-no-license-check

CSCS_REGISTRY="jfrog.svc.cscs.ch/contbuild/testing/anfink/4700071344751697"
CSCS_REGISTRY="jfrog.svc.cscs.ch/docker-ci-ext/4700071344751697"
docker build -t $CSCS_REGISTRY/rocm-patched:5.3.3 -f build.Dockerfile .
docker push $CSCS_REGISTRY/rocm-patched:5.3.3
File renamed without changes.
8 changes: 8 additions & 0 deletions ci/ci-ext-custom.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
include:
- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'

# Need custom definition as remote adds SLURM_MPI_TYPE: pmi2
.dlaf-container-runner-daint-gh200:
extends: .container-runner-daint-gh200
variables:
SLURM_MPI_TYPE: cray_shasta
68 changes: 46 additions & 22 deletions ci/common-ci.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
include:
- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'
- local: 'ci/ci-ext-custom.yml'

stages:
- build_deps
Expand All @@ -9,23 +10,20 @@ stages:
variables:
FF_TIMESTAMPS: true

##
## BUILDS
##
## BUILD DEPS

.build_deps_common:
extends: .container-builder
.build_deps_common_base:
stage: build_deps
timeout: 6 hours
before_script:
- echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME --password-stdin
- TAG_IMAGE=`echo ${BASE_IMAGE##*/} | sed 's/[:]//g'`
- TAG_APTGET=`echo ${EXTRA_APTGET} | sha256sum - | head -c 6`
- TAG_COMPILER=`echo ${COMPILER}_CXX${CXXSTD} | sed 's/[@]//g'`
- TAG_DOCKERFILE=`sha256sum $DOCKERFILE | head -c 16`
- TAG_SPACK=`echo $SPACK_SHA`
- TAG_REPO=`find $SPACK_DLAF_REPO -type f -exec sha256sum {} \; | sha256sum - | head -c 16`
- TAG_ENVIRONMENT=`cat $SPACK_ENVIRONMENT $COMMON_SPACK_ENVIRONMENT | sha256sum | head -c 16`
- TAG_DOCKERFILE=`sha256sum $DOCKERFILE | head -c 12`
- TAG_SPACK=`echo $SPACK_SHA | sed "s/develop-//g" | head -c 16`
- TAG_REPO=`find $SPACK_DLAF_REPO -type f -exec sha256sum {} \; | sha256sum - | head -c 12`
- TAG_ENVIRONMENT=`cat $SPACK_ENVIRONMENT $COMMON_SPACK_ENVIRONMENT | sha256sum | head -c 12`
- TAG=${TAG_IMAGE}-${TAG_APTGET}-${TAG_COMPILER}-MKL${USE_MKL}-${TAG_DOCKERFILE}-${TAG_SPACK}-${TAG_REPO}-${TAG_ENVIRONMENT}
- export PERSIST_IMAGE_NAME=$DEPS_IMAGE:$TAG
- echo "DEPS_IMAGE=$PERSIST_IMAGE_NAME" > build.env
Expand Down Expand Up @@ -58,11 +56,25 @@ variables:
EXTRA_APTGET: ""
CXXSTD: 17
USE_MKL: "OFF"
COMMON_SPACK_ENVIRONMENT: ci/docker/common.yaml
USE_CODECOV: "false"

.build_common:
extends: .container-builder
.build_deps_common:
extends:
- .container-builder-cscs-zen2
- .build_deps_common_base
variables:
COMMON_SPACK_ENVIRONMENT: ci/docker/common.yaml

.build_deps_common_gh200:
extends:
- .container-builder-cscs-gh200
- .build_deps_common_base
variables:
COMMON_SPACK_ENVIRONMENT: ci/docker/common-gh200.yaml

## BUILD DLAF

.build_common_base:
stage: build
timeout: 2 hours
before_script:
Expand All @@ -74,6 +86,7 @@ variables:
PERSIST_IMAGE_NAME: $DLAF_IMAGE
DOCKER_BUILD_ARGS: '[
"DEPS_IMAGE",
"DLAF_LD_PRELOAD",
"PIP_OPTS",
"NUM_PROCS=$NUM_CORES_BUILD_DLAF"
]'
Expand All @@ -85,19 +98,20 @@ variables:
paths:
- pipeline.yml

.build_for_daint-mc:
.build_common:
extends:
- .container-builder-cscs-zen2
- .build_common_base
variables:
RUNNER: ".container-runner-daint"
SLURM_CONSTRAINT: mc
THREADS_MAX_PER_TASK: 72
THREADS_PER_NODE: 72
DLAF_LD_PRELOAD: "/lib/x86_64-linux-gnu/libSegFault.so"


.build_for_daint-gpu:
.build_common_gh200:
extends:
- .container-builder-cscs-gh200
- .build_common_base
variables:
RUNNER: ".container-runner-daint"
SLURM_CONSTRAINT: gpu
THREADS_MAX_PER_TASK: 24
THREADS_PER_NODE: 24
DLAF_LD_PRELOAD: "/lib/aarch64-linux-gnu/libSegFault.so"

.build_for_eiger:
variables:
Expand All @@ -106,6 +120,16 @@ variables:
THREADS_MAX_PER_TASK: 32
THREADS_PER_NODE: 256

.build_for_alps_gh200:
variables:
RUNNER: ".dlaf-container-runner-daint-gh200"
SLURM_CONSTRAINT: gpu
# 64 / 2 to avoid ranks on multiple sockets for RANK6
THREADS_MAX_PER_TASK: 32
THREADS_PER_NODE: 256

## RUN

.run_common:
stage: test
trigger:
Expand Down
2 changes: 1 addition & 1 deletion ci/cpu/asan_ubsan_lsan.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ cpu asan ubsan lsan test:
ASAN_OPTIONS: "fast_unwind_on_malloc=0:strict_string_checks=1:detect_leaks=1:detect_stack_use_after_return=1:check_initialization_order=1:strict_init_order=1"
UBSAN_OPTIONS: "halt_on_error=1:print_stacktrace=1"
# Override use of libSegFault, not necessary with sanitizers
LD_PRELOAD: ""
DLAF_LD_PRELOAD: ""
trigger:
include:
- artifact: pipeline.yml
Expand Down
6 changes: 3 additions & 3 deletions ci/ctest_to_gitlab.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@ ARTIFACTS="
"
fi

# CRAY_CUDA_MPS set to 0 to avoid test hanging on daint (See PR #1197)
BASE_TEMPLATE="
include:
- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'
- local: 'ci/ci-ext-custom.yml'
image: $IMAGE
Expand All @@ -70,7 +70,7 @@ variables:
SLURM_EXCLUSIVE: ''
SLURM_EXACT: ''
SLURM_CONSTRAINT: $SLURM_CONSTRAINT
CRAY_CUDA_MPS: 0
CRAY_CUDA_MPS: 1
MPICH_MAX_THREAD_SAFETY: multiple
{{JOBS}}
Expand Down Expand Up @@ -104,7 +104,7 @@ for rank_label in `ctest --print-labels | egrep -o "RANK_[1-9][0-9]?"`; do
N=`echo "$rank_label" | sed "s/RANK_//"`
C=$(( THREADS_PER_NODE / N ))
if [ $C -gt $THREADS_MAX_PER_TASK ]; then
C=$THREADS_MAX_PER_TASK
C=$THREADS_MAX_PER_TASK
fi

# Skip label combinations that match no tests
Expand Down
32 changes: 0 additions & 32 deletions ci/cuda/gcc11_codecov.yml

This file was deleted.

30 changes: 0 additions & 30 deletions ci/cuda/gcc11_debug_scalapack.yml

This file was deleted.

31 changes: 0 additions & 31 deletions ci/cuda/gcc11_release.yml

This file was deleted.

31 changes: 0 additions & 31 deletions ci/cuda/gcc11_release_scalapack.yml

This file was deleted.

Loading

0 comments on commit b14ffba

Please sign in to comment.