Skip to content

Commit

Permalink
Merge branch 'ko3n1g/ci/jet-fleet' into 'main'
Browse files Browse the repository at this point in the history
ci: JET improvements

See merge request ADLR/megatron-lm!2365
  • Loading branch information
ko3n1g committed Nov 19, 2024
2 parents f214627 + a231b87 commit b6866ae
Show file tree
Hide file tree
Showing 7 changed files with 150 additions and 118 deletions.
110 changes: 55 additions & 55 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,51 +10,51 @@ workflow:
- if: $CI_PIPELINE_SOURCE == "web"
- if: $CI_COMMIT_REF_PROTECTED == "true"
variables:
FUNCTIONAL_TEST: "no"
FUNCTIONAL_TEST: 'no'
- if: $CI_MERGE_REQUEST_LABELS =~ /Run tests/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 10
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST: 'yes'
FUNCTIONAL_TEST_SCOPE: mr
FUNCTIONAL_TEST_REPEAT: 5
FUNCTIONAL_TEST_TIME_LIMIT: 2700
FUNCTIONAL_TEST_CLUSTER_A100: ""
FUNCTIONAL_TEST_CLUSTER_H100: ""
PUBLISH: "no"
FUNCTIONAL_TEST_CLUSTER_A100: ''
FUNCTIONAL_TEST_CLUSTER_H100: ''
PUBLISH: 'no'
- if: $CI_MERGE_REQUEST_LABELS =~ /Run nightly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 10
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST: 'yes'
FUNCTIONAL_TEST_SCOPE: nightly
FUNCTIONAL_TEST_REPEAT: 5
FUNCTIONAL_TEST_TIME_LIMIT: 2700
FUNCTIONAL_TEST_CLUSTER_A100: ""
FUNCTIONAL_TEST_CLUSTER_H100: ""
PUBLISH: "no"
FUNCTIONAL_TEST_CLUSTER_A100: ''
FUNCTIONAL_TEST_CLUSTER_H100: ''
PUBLISH: 'no'
- if: $CI_MERGE_REQUEST_LABELS =~ /Run weekly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 10
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST: 'yes'
FUNCTIONAL_TEST_SCOPE: weekly
FUNCTIONAL_TEST_REPEAT: 1
FUNCTIONAL_TEST_TIME_LIMIT: 9000
FUNCTIONAL_TEST_CLUSTER_A100: ""
FUNCTIONAL_TEST_CLUSTER_H100: ""
PUBLISH: "no"
FUNCTIONAL_TEST_CLUSTER_A100: ''
FUNCTIONAL_TEST_CLUSTER_H100: ''
PUBLISH: 'no'
- if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
variables:
FUNCTIONAL_TEST: "no"
PUBLISH: "no"
FUNCTIONAL_TEST: 'no'
PUBLISH: 'no'
- when: never
auto_cancel:
on_new_commit: interruptible
# on_job_failure: all

stages:
- test
- test
- functional_tests
- publish

Expand All @@ -63,73 +63,73 @@ default:

variables:
UNIT_TEST:
value: "yes"
value: 'yes'
options:
- "yes"
- "no"
- 'yes'
- 'no'
description: To run the unit test suite
UNIT_TEST_REPEAT:
value: "1"
description: "Number of repetitions"
UNIT_TEST_TIMEOUT:
value: "10"
value: '1'
description: 'Number of repetitions'
UNIT_TEST_TIMEOUT:
value: '10'
description: Timeout (minutes) for Unit tests (all repeats)
FUNCTIONAL_TEST:
value: "yes"
FUNCTIONAL_TEST:
value: 'yes'
options:
- "yes"
- "no"
- 'yes'
- 'no'
description: To run the functional test suite
FUNCTIONAL_TEST_SCOPE:
value: "mr"
value: 'mr'
options:
- "mr"
- "nightly"
- "weekly"
- "pre-release"
- "release"
description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)"
- 'mr'
- 'nightly'
- 'weekly'
- 'pre-release'
- 'release'
description: 'Testsuite to run (only for FUNCTIONAL_TEST=yes)'
FUNCTIONAL_TEST_REPEAT:
value: "5"
description: "Number of repetitions per test"
value: '5'
description: 'Number of repetitions per test'
FUNCTIONAL_TEST_TIME_LIMIT:
value: "2700"
description: "Timeout in seconds per test"
value: '2700'
description: 'Timeout in seconds per test'
FUNCTIONAL_TEST_CASES:
value: "all"
value: 'all'
description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite."
FUNCTIONAL_TEST_CLUSTER_A100:
value: "dgxa100_dracooci"
value: 'dgxa100_dracooci'
options:
- "dgxa100_dracooci"
- "dgxa100_dracooci-ord"
- 'dgxa100_dracooci'
- 'dgxa100_dracooci-ord'
description: 'Cluster for A100 workloads'
FUNCTIONAL_TEST_CLUSTER_H100:
value: "dgxh100_eos"
value: 'dgxh100_eos'
options:
- "dgxh100_coreweave"
- "dgxh100_eos"
- 'dgxh100_coreweave'
- 'dgxh100_eos'
description: 'Cluster for H100 workloads'
FUNCTIONAL_TEST_NAME:
description: "Name of functional test run (only for pre-release and release)"
PUBLISH:
value: "no"
options:
- "yes"
- "no"
description: 'Name of functional test run (only for pre-release and release)'
PUBLISH:
value: 'no'
options:
- 'yes'
- 'no'
description: Build and publish a wheel to PyPI
PUBLISH_SCOPE:
value: "code-freeze"
value: 'code-freeze'
options:
- "code-freeze"
- "release"
- 'code-freeze'
- 'release'
description: Type of publish (freeze or final release)

# CI wide variables
CI_MCORE_LTS_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_lts
CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev
CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci
LINTING_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_linting
UTILITY_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_utility

include:
- .gitlab/stages/00.pre.yml
Expand Down
21 changes: 9 additions & 12 deletions .gitlab/stages/00.pre.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ pre:mirror_to_github:
stage: .pre
image: python:3.10
variables:
GIT_STRATEGY: "clone"
GIT_STRATEGY: 'clone'
script:
- git checkout $CI_COMMIT_BRANCH
- git remote add github https://ko3n1g:[email protected]/NVIDIA/Megatron-LM.git || true
Expand All @@ -49,7 +49,7 @@ pre:create_ci_branches:
stage: .pre
image: python:3.10
variables:
GIT_STRATEGY: "clone"
GIT_STRATEGY: 'clone'
script:
- git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/adlr/megatron-lm.git"
- git switch --force-create $branch
Expand Down Expand Up @@ -81,17 +81,15 @@ pre:maybe_cherry_pick_commit:
- when: never
tags: [mcore-docker-node-small]
stage: .pre
image:
name: registry.gitlab.com/gitlab-ci-utils/curl-jq
entrypoint: [""]
image: badouralix/curl-jq
variables:
GIT_STRATEGY: "clone"
script:
GIT_STRATEGY: 'clone'
script:
- set -x
- set +e
- SHA=$(git rev-list --no-merges -n 1 HEAD)
- MESSAGE=$(git log -n 1 --pretty=format:%s $SHA)
- MR_ID=$(echo $MESSAGE | awk -F'!' '{print $2}' | awk '{print $1}' )
- MR_ID=$(echo $MESSAGE | awk -F'!' '{print $2}' | awk '{print $1}' )
- git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git"
- git config --global user.email "[email protected]"
- git config --global user.name "Mcore Bot"
Expand All @@ -109,10 +107,10 @@ pre:maybe_cherry_pick_commit:
echo Nothing to cherry pick
exit 0
fi
echo $TARGET_BRANCHES | while read -r RELEASE_BRANCH ; do
TARGET_BRANCH_EXISTS_OK=$([[ "$(git ls-remote --heads origin refs/heads/$RELEASE_BRANCH)" != "" ]] && echo true || echo false)
if [[ "$TARGET_BRANCH_EXISTS_OK" == "false" ]]; then
echo Release branch does not yet exist, will not cherry-pick
continue
Expand Down Expand Up @@ -164,7 +162,7 @@ pre:maybe_cherry_pick_commit:

pre:check_milestone:
extends: [.pre_rules]
image: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci:buildcache
image: badouralix/curl-jq
tags: [mcore-docker-node-small]
script:
- env
Expand All @@ -175,4 +173,3 @@ pre:check_milestone:
echo Please assign a Milestone to this MR!
exit 1
fi
Loading

0 comments on commit b6866ae

Please sign in to comment.