Skip to content

Commit

Permalink
ADLR/megatron-lm!2223 - ci(refactor): Facelift gitlab-ci
Browse files Browse the repository at this point in the history
  • Loading branch information
ko3n1g committed Oct 18, 2024
1 parent 0d89fc4 commit 33d2f45
Show file tree
Hide file tree
Showing 18 changed files with 231 additions and 2,047 deletions.
10 changes: 5 additions & 5 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,23 @@ workflow:
- if: $CI_MERGE_REQUEST_LABELS =~ /Run tests/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
UNIT_TEST_REPEAT: 5
UNIT_TEST_TIMEOUT: 50
UNIT_TEST_TIMEOUT: 75
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: mr
FUNCTIONAL_TEST_CLUSTER_A100: ""
FUNCTIONAL_TEST_CLUSTER_H100: ""
- if: $CI_MERGE_REQUEST_LABELS =~ /Run nightly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
UNIT_TEST_REPEAT: 5
UNIT_TEST_TIMEOUT: 50
UNIT_TEST_TIMEOUT: 75
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: nightly
FUNCTIONAL_TEST_CLUSTER_A100: ""
FUNCTIONAL_TEST_CLUSTER_H100: ""
- if: $CI_MERGE_REQUEST_LABELS =~ /Run weekly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
UNIT_TEST_REPEAT: 5
UNIT_TEST_TIMEOUT: 50
UNIT_TEST_TIMEOUT: 75
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: weekly
FUNCTIONAL_TEST_CLUSTER_A100: ""
Expand Down Expand Up @@ -95,7 +95,7 @@ variables:
description: Type of publish (freeze or final release)

# CI wide variables
CI_MCORE_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci
CI_MCORE_LTS_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_lts
CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev
CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci
LINTING_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_linting
Expand All @@ -104,6 +104,6 @@ variables:

include:
- .gitlab/stages/00.pre.yml
- .gitlab/stages/01.tests.yml
- .gitlab/stages/01.test.yml
- .gitlab/stages/02.functional-tests.yml
- .gitlab/stages/03.publish.yml
6 changes: 4 additions & 2 deletions .gitlab/labeler-config.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
CI:
- .gitlab-ci.yml
- Dockerfile.ci
- jet-tests.yml
- Dockerfile.ci.lts
- Dockerfile.ci.dev
- .github/**
- .gitlab/**

Datasets:
- megatron/core/datasets/**
Expand Down
40 changes: 22 additions & 18 deletions .gitlab/stages/00.pre.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
include:
- template: Security/Secret-Detection.gitlab-ci.yml

.pre_mr_rules:
.pre_rules:
rules:
- if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
allow_failure: true
Expand All @@ -10,7 +10,16 @@ include:
- when: never
stage: .pre

mirror_to_github:
.dind_rules:
image: docker:26.1.4-dind
variables:
DOCKER_HOST: unix:///var/run/docker.sock
before_script:
- docker system prune -a --filter "until=36h" -f || true
- echo "$NGC_API_KEY" | docker login nvcr.io -u '$oauthtoken' --password-stdin
- echo "$CI_REGISTRY_PASSWORD" | docker login $CI_REGISTRY -u $CI_REGISTRY_USER --password-stdin

pre:mirror_to_github:
rules:
- if: '$CI_COMMIT_REF_PROTECTED == "true" && $CI_PIPELINE_SOURCE == "push"'
- when: never
Expand All @@ -24,7 +33,7 @@ mirror_to_github:
- git remote add github https://ko3n1g:$GH_TOKEN@github.com/NVIDIA/Megatron-LM.git || true
- git push -u github $CI_COMMIT_BRANCH

create_ci_branches:
pre:create_ci_branches:
rules:
- if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $CI_PIPELINE_SOURCE == "push"'
- when: never
Expand All @@ -47,8 +56,8 @@ create_ci_branches:
- git switch --force-create $branch
- git push --force -u origin $branch

label_merge_request:
extends: [.pre_mr_rules]
pre:label_merge_request:
extends: [.pre_rules]
image: golang:1.22
tags:
- mcore-docker-node-small
Expand All @@ -67,21 +76,17 @@ label_merge_request:
source labels
curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" --data-urlencode "add_labels=$LABELS" -X PUT
clean_docker_node:
extends: [.pre_mr_rules]
image: docker:26.1.4-dind
pre:clean_docker_node:
extends: [.pre_rules, .dind_rules]
tags:
- ${node}
parallel:
matrix:
- node: 8xL40S
- node: mcore-docker-node-small
- node: mcore-docker-node-jet
script:
- export DOCKER_HOST='unix:///var/run/docker.sock'
- docker system prune -a --filter "until=36h" -f || true
- node: mcore-docker-node-large
script: ':'

maybe_cherry_pick_commit:
pre:maybe_cherry_pick_commit:
rules:
- if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $CI_PIPELINE_SOURCE == "push"'
- when: never
Expand Down Expand Up @@ -168,11 +173,10 @@ maybe_cherry_pick_commit:
done
interruptible: false

check_milestone:
extends: [.pre_mr_rules]
pre:check_milestone:
extends: [.pre_rules]
image: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci:buildcache
tags:
- mcore-docker-node-small
tags: [mcore-docker-node-small]
script:
- env
- |
Expand Down
158 changes: 86 additions & 72 deletions .gitlab/stages/01.tests.yml → .gitlab/stages/01.test.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.test_mr_rules:
.test_rules:
rules:
- if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
allow_failure: true
Expand All @@ -9,33 +9,29 @@
include:
- template: Security/Secret-Detection.gitlab-ci.yml

build_image:
extends: [.test_mr_rules]
test:build_image:
extends: [.test_rules, .dind_rules]
tags:
- ${TAG}
image: docker:26.1.4-dind
timeout: 45m
parallel:
matrix:
- IMAGE: CI_MCORE_IMAGE
FILE: Dockerfile.ci
- IMAGE: CI_MCORE_LTS_IMAGE
FILE: Dockerfile.ci.lts
BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3
TAG: mcore-docker-node-large
- IMAGE: CI_MCORE_DEV_IMAGE
FILE: Dockerfile.ci.dev
BASE_IMAGE: nvcr.io/nvidia/pytorch:24.07-py3
TAG: mcore-docker-node-large
- IMAGE: CI_NEMO_IMAGE
FILE: Dockerfile.ci
FILE: Dockerfile.ci.lts
BASE_IMAGE: nvcr.io/nvidian/nemo:nightly
TAG: mcore-docker-node-large
- IMAGE: LINTING_IMAGE
FILE: Dockerfile.linting
BASE_IMAGE: python:3.10
TAG: mcore-docker-node-small
before_script:
- echo "$NGC_API_KEY" | docker login nvcr.io -u '$oauthtoken' --password-stdin
- echo "$CI_REGISTRY_PASSWORD" | docker login $CI_REGISTRY -u $CI_REGISTRY_USER --password-stdin
variables:
STAGE: main
script:
Expand All @@ -45,8 +41,6 @@ build_image:
set -x
env
eval "IMAGE=\$$IMAGE"
docker system prune -a --filter "until=24h" -f || true
docker buildx create --name container --driver=docker-container
Expand All @@ -61,13 +55,22 @@ build_image:
ADDITIONAL_PARAMS+=("-t ${IMAGE}:nightly")
fi
if [[ "$CI_PIPELINE_SOURCE" == "merge_request_event" ]]; then
MCORE_REF=$(echo ${CI_MERGE_REQUEST_REF_PATH} | sed 's/head$/merge/')
else
MCORE_REF=$CI_COMMIT_SHA
fi
DOCKER_BUILDKIT=1 docker build \
--secret id=JET_INDEX_URLS \
--target $STAGE \
-f $FILE \
-t ${IMAGE}:${CI_PIPELINE_ID} \
--builder=container \
--build-arg CACHEBUST=$(cat /proc/sys/kernel/random/uuid) \
--build-arg MCORE_REPO=${CI_REPOSITORY_URL} \
--build-arg MCORE_REF=${MCORE_REF} \
--build-arg MCORE_BACKWARDS_REF="core_r0.9.0" \
--cache-to type=registry,ref=${IMAGE}-buildcache:${CI_PIPELINE_ID} \
--cache-to type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID:-noop} \
--cache-from type=registry,ref=${IMAGE}-buildcache:main \
Expand All @@ -80,55 +83,37 @@ build_image:
retry:
max: 2

unit_tests:
# This job runs both the test suite of ToT and that of a historic ref
# against the current code. This is a form of backwards-compatibility
# testing and helps in providing stable interfaces.
extends: [.test_mr_rules]
image: ${IMAGE}:${CI_PIPELINE_ID}
needs: [build_image]
.unit_tests:
extends: [.test_rules, .dind_rules]
needs: [test:build_image]
timeout: 180m
parallel:
matrix:
- TAG: latest
IMAGE: ${CI_MCORE_IMAGE}
- TAG: latest
IMAGE: ${CI_MCORE_DEV_IMAGE}
- TAG: core_r0.9.0
IMAGE: ${CI_MCORE_IMAGE}
- TAG: core_r0.9.0
IMAGE: ${CI_MCORE_DEV_IMAGE}
tags: [8xL40S]
variables:
GIT_STRATEGY: clone
GIT_DEPTH: 0
before_script:
- |
if [[ $TAG != latest ]]; then
git checkout $TAG
rm -rf /opt/megatron-lm/tests
cp -r tests/ /opt/megatron-lm
fi
GIT_STRATEGY: none
script:
- if [ $UNIT_TEST_REPEAT -eq 0 ]; then exit 0; fi;
- docker run --name mcore_ci_${CI_PIPELINE_ID} -d --rm -e TAG -e UNIT_TEST_REPEAT -e UNIT_TEST_TIMEOUT --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 ${IMAGE}:${CI_PIPELINE_ID} bash -c "sleep $(( ${UNIT_TEST_TIMEOUT} * 60 + 60 ))"
- |
export NVTE_FLASH_ATTN=0
export NVTE_FUSED_ATTN=0
cd /opt/megatron-lm
if [[ $UNIT_TEST_REPEAT -eq 0 ]]; then
exit 0
fi
for i in $(seq $UNIT_TEST_REPEAT); do
SEED=$((RANDOM % 9000 + 1000));
ARGS=()
if [[ $TAG != latest ]]; then
ARGS+=(-m "not internal and not flaky and not flaky_in_dev")
else
ARGS+=(-m "not flaky and not flaky_in_dev")
fi
timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest --random-order --random-order-seed ${SEED} -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${ARGS[@]}" tests/unit_tests
done
docker exec mcore_ci_${CI_PIPELINE_ID} bash -c '
set -e
MCORE_DIR=$([[ "$TAG" == "latest" ]] && echo "" || echo "-$TAG/")
cd /opt/megatron-lm$MCORE_DIR;
for i in $(seq $UNIT_TEST_REPEAT); do
SEED=$((RANDOM % 9000 + 1000));
ARGS=()
if [[ $TAG != latest ]]; then
ARGS+=(-m "not internal and not flaky and not flaky_in_dev")
else
ARGS+=(-m "not flaky and not flaky_in_dev")
fi
timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest --random-order --random-order-seed ${SEED} -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${ARGS[@]}" tests/unit_tests
done
'
after_script:
- docker container stop mcore_ci_${CI_PIPELINE_ID} || true
artifacts:
paths:
- coverage
Expand All @@ -138,10 +123,38 @@ unit_tests:
when: always
- when: always

unit-tests-results-notify:
extends: [.test_mr_rules]
image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
needs: [unit_tests]
test:pyt(LTS)_mcore(latest):
extends: [.unit_tests]
variables:
TAG: latest
IMAGE: ${CI_MCORE_LTS_IMAGE}

test:pyt(LTS)_mcore(0.9.0):
extends: [.unit_tests]
variables:
TAG: core_r0.9.0
IMAGE: ${CI_MCORE_LTS_IMAGE}

test:pyt(DEV)_mcore(latest):
extends: [.unit_tests]
variables:
TAG: latest
IMAGE: ${CI_MCORE_DEV_IMAGE}

test:pyt(DEV)_mcore(0.9.0):
extends: [.unit_tests]
variables:
TAG: core_r0.9.0
IMAGE: ${CI_MCORE_DEV_IMAGE}

test:notify:
extends: [.test_rules]
image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID}
needs:
- test:pyt(LTS)_mcore(latest)
- test:pyt(DEV)_mcore(latest)
- test:pyt(LTS)_mcore(0.9.0)
- test:pyt(DEV)_mcore(0.9.0)
tags:
- mcore-docker-node-small
script:
Expand All @@ -160,39 +173,40 @@ unit-tests-results-notify:
when: always
- when: never

docs_build_test:
extends: [.test_mr_rules]
image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
test:docs_build:
extends: [.test_rules]
image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID}
tags: [mcore-docker-node-small]
needs: [build_image]
needs: [test:build_image]
script:
- cd ..
- rm -rf documentation && git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/nemo-megatron-core-tme/documentation.git
- mv megatron-lm/ documentation/
- cd documentation/
- ./repo docs

formatting:
extends: [.test_mr_rules]
test:formatting:
extends: [.test_rules]
image: ${LINTING_IMAGE}:${CI_PIPELINE_ID}
tags: [mcore-docker-node-small]
needs: [build_image]
needs: [test:build_image]
script:
- env
- git fetch origin main
- BASE_REF="$CI_MERGE_REQUEST_TARGET_BRANCH_NAME" CHECK_ONLY=true SKIP_DOCS=$([[ "$CI_MERGE_REQUEST_LABELS" == *"Skip docs"* ]] && echo "true" || echo "false") bash tools/autoformat.sh

copyright:
extends: [.test_mr_rules]
test:copyright:
extends: [.test_rules]
tags: [mcore-docker-node-small]
image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
needs: [build_image]
image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID}
needs: [test:build_image]
script:
- git fetch origin main
- bash tools/copyright.sh

secret_detection:
test:secret_detection:
tags: [mcore-docker-node-small]
extends: ".secret-analyzer"
variables:
GIT_DEPTH: 0
SECRET_DETECTION_LOG_OPTIONS: ${CI_MERGE_REQUEST_DIFF_BASE_SHA}..${CI_COMMIT_SHA}
Expand Down
Loading

0 comments on commit 33d2f45

Please sign in to comment.