From 5e0810dc094a7332808e7b30d88295cc53464b3d Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Fri, 15 Mar 2024 15:55:55 -0400 Subject: [PATCH] test cloud ci --- .github/workflows/cleanup.yml | 23 ----- .github/workflows/cloud-ci.yml | 101 +++++++++++++++++++++ .github/workflows/docker-run.yml | 50 ----------- .github/workflows/docker.yml | 91 ------------------- .github/workflows/docs.yml | 33 ------- .github/workflows/integration.yml | 77 ---------------- .github/workflows/report_container.yml | 60 ------------- .github/workflows/show_config.yml | 26 ------ .github/workflows/tests.yml | 113 ------------------------ config/examples/ec2-system.yaml | 3 +- milabench/cli/cloud.py | 18 ++-- milabench/cli/covalent/__main__.py | 10 +-- milabench/cli/covalent/requirements.txt | 2 +- 13 files changed, 118 insertions(+), 489 deletions(-) delete mode 100644 .github/workflows/cleanup.yml create mode 100644 .github/workflows/cloud-ci.yml delete mode 100644 .github/workflows/docker-run.yml delete mode 100644 .github/workflows/docker.yml delete mode 100644 .github/workflows/docs.yml delete mode 100644 .github/workflows/integration.yml delete mode 100644 .github/workflows/report_container.yml delete mode 100644 .github/workflows/show_config.yml delete mode 100644 .github/workflows/tests.yml diff --git a/.github/workflows/cleanup.yml b/.github/workflows/cleanup.yml deleted file mode 100644 index e205cb001..000000000 --- a/.github/workflows/cleanup.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Clean space on the CI node - -on: - # Allow manual runs - workflow_dispatch: - -# define build arguments - -jobs: - clean: - strategy: - matrix: - include: - - arch: cuda - - arch: rocm - - runs-on: [self-hosted, "${{ matrix.arch }}"] - - steps: - - name: Get an overview of available space - run: | - df -h - docker image ls diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml new file mode 100644 index 000000000..593235bc9 --- /dev/null +++ b/.github/workflows/cloud-ci.yml @@ -0,0 +1,101 @@ +name: tests + +on: + # Runs for pull requests + pull_request: + branches: + - master + +jobs: + tests: + strategy: + fail-fast: true + matrix: + include: + - arch: cpu + exclude: "no-cuda" + run_on: ec2 + # - arch: rocm + # exclude : "no-rocm" + + runs-on: ubuntu-latest + environment: test-cloud-ci + + # Cancel previous jobs if a new version was pushed + concurrency: + group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.run_on }}" + cancel-in-progress: true + + defaults: + run: + shell: bash -el {0} + + env: + MILABENCH_CONFIG: "config/test.yaml" + MILABENCH_SYSTEM: "config/examples/${{ matrix.run_on }}-system.yaml" + MILABENCH_BASE: "output" + MILABENCH_ARGS: "" + MILABENCH_GPU_ARCH: "${{ matrix.arch }}" + MILABENCH_DASH: "no" + + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v2 + with: + python-version: 3.9 + + - name: dependencies + run: | + python -m pip install -U pip + python -m pip install -U poetry + poetry lock --no-update + poetry install + + - name: cloud + run: | + mkdir -p ~/.aws + mkdir -p ~/.ssh/covalent + echo "${{ secrets.COVALENT_EC2_EXECUTOR_KEYPAIR }}" >~/.ssh/covalent/covalent-ec2-executor-keypair.pem + echo "[mb_aws_cloud_ci]" >~/.aws/credentials + echo "aws_access_key_id = ${{ secrets.AWS_ACCESS_KEY_ID }}" >>~/.aws/credentials + echo "aws_secret_access_key = ${{ secrets.AWS_SECRET_ACCESS_KEY }}" >>~/.aws/credentials + chmod a-rwx,o+rwX ~/.aws ~/.ssh + + _system=$( + poetry run milabench cloud \ + --setup \ + --run-on ${{ matrix.run_on }} + ) + { read _hash ; }< <( + echo -n "$_system" | while read l + do + if [[ "$l" == "# hash::>"* ]] + then + echo -n "${l#*::>}" + fi + done + ) + if [[ -z "${_hash}" ]] + then + >&2 echo "Failed to fetch system config hash" + exit 1 + fi + echo -n "$_system" >$MILABENCH_SYSTEM.$_hash + export MILABENCH_SYSTEM=$MILABENCH_SYSTEM.$_hash + + - name: install benchmarks + run: | + milabench install + + - name: prepare benchmarks + run: | + milabench prepare + + - name: run benchmarks + run: | + milabench run + + - name: Summary + run: | + milabench report --push diff --git a/.github/workflows/docker-run.yml b/.github/workflows/docker-run.yml deleted file mode 100644 index 35c72fe59..000000000 --- a/.github/workflows/docker-run.yml +++ /dev/null @@ -1,50 +0,0 @@ -# Run Milabench using nightly docker images -name: docker-run - -on: - # Only works on manual runs - workflow_dispatch: - -env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }} - IMAGE_PATH: ghcr.io/mila-iqia/milabench:cuda-nightly - -jobs: - build-image: - strategy: - matrix: - - arch: [cuda, rocm] - - runs-on: [self-hosted, "${{ matrix.arch }}"] - - permissions: - contents: read - - env: - IMAGE_NAME: "ghcr.io/mila-iqia/milabench:${{ matrix.arch }}-nightly" - - steps: - - name: pull - run: | - docker pull $IMAGE_NAME - - - name: run - run: | - OUTPUT="$(pwd)/../results" - mkdir -p $OUTPUT - docker run --rm --shm-size=256M \ - --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all \ - -v $OUTPUT:/milabench/envs/runs \ - $IMAGE_NAME milabench run - - - name: Check out the repo - uses: actions/checkout@v3 - - - name: summary - run: | - python -m pip install -U pip - python -m pip install -U poetry - poetry lock --no-update - poetry install - milabench summary $OUTPUT diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml deleted file mode 100644 index fb8c75f26..000000000 --- a/.github/workflows/docker.yml +++ /dev/null @@ -1,91 +0,0 @@ -name: Publish Docker image - -on: - # Allow manual runs - workflow_dispatch: - - # Only run for push on the main branch or for tagged version - push: - branches: - - master - tags: - - v*.*.* - -env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }} - - -permissions: - packages: write - - -# define build arguments - -jobs: - build-image: - strategy: - fail-fast: false - matrix: - include: - - arch: cuda - - arch: rocm - - runs-on: [self-hosted, "${{ matrix.arch }}"] - - permissions: - contents: read - packages: write - - steps: - - name: Show all images - run: | - docker image ls - - - name: Prune - run: | - # Prune all images older than 2 weeks - # The images are still on github registry - docker image prune -f -a --filter "until=336h" - docker system prune -f - - - name: Check out the repo - uses: actions/checkout@v3 - - - name: Get Image Tag Name - env: - GITHUB_REF_NAME_ENV: ${{ github.ref_name }} - run: | - REGEX="(.*)v(.*)\.(.*)\.(.*)" - IMAGE_TAG="nightly" - if [[ "${GITHUB_REF_NAME_ENV}" =~ $REGEX ]]; then - IMAGE_TAG="${GITHUB_REF_NAME##*/}" - fi - echo "IMAGE_TAG=$IMAGE_TAG" >> $GITHUB_ENV - - - name: Log in to the registry - uses: docker/login-action@v2 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata (tags, labels) for the image - id: meta - uses: docker/metadata-action@v4 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - tags: | - type=raw,value=${{ matrix.arch }}-${{ env.IMAGE_TAG }} - - - name: Build and push the image - uses: docker/build-push-action@v3 - with: - context: . - push: true - file: docker/Dockerfile-${{ matrix.arch }} - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - build-args: | - ARCH=${{ matrix.arch }} - CONFIG=standard.yaml diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml deleted file mode 100644 index 99ac4253d..000000000 --- a/.github/workflows/docs.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: docs - -on: - push: - branches: - - master - -jobs: - build: - runs-on: ubuntu-latest - steps: - - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - - uses: actions/checkout@master - with: - # otherwise, you will failed to push refs to dest repo - fetch-depth: 0 - - - name: Install Dependencies - run: | - pip install -e . - pip install sphinx sphinx-rtd-theme - - - name: Build and Commit - uses: sphinx-notes/pages@v2 - - - name: Push changes - uses: ad-m/github-push-action@master - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - branch: gh-pages diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml deleted file mode 100644 index 717ca7e6d..000000000 --- a/.github/workflows/integration.yml +++ /dev/null @@ -1,77 +0,0 @@ -name: integration - -on: - # Runs every sunday - schedule: - - cron: '0 0 * * SUN' - - # Runs for pull requests - pull_request: - branches: - - master - - # Runs on publish - release: - types: - [published] - - # Allow manual triggers - workflow_dispatch: - -jobs: - # Label of the container job - postgresql: - runs-on: ubuntu-latest - - concurrency: - group: "${{ github.ref }}" - cancel-in-progress: true - - services: - # The hostname of the PostgreSQL service is the label - postgres: - image: postgres - env: - POSTGRES_PASSWORD: password - POSTGRES_USER: username - POSTGRES_DB: milabench - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - # Maps tcp port 5432 on service container to the host - - 5432:5432 - - steps: - - uses: actions/setup-python@v4 - with: - python-version: '3.9' - - - name: Check out repository code - uses: actions/checkout@v3 - - - name: dependencies - run: | - if [[ ! -d "~/.cargo/bin" ]]; then - wget --no-check-certificate --secure-protocol=TLSv1_2 -qO- https://sh.rustup.rs | sh -s -- -y - fi - export PATH="~/.cargo/bin:${PATH}" - python -m pip install -U pip - python -m pip install -U poetry - - - name: install - run: | - pip install pytest - poetry lock --no-update - pip install -e . - - - name: tests - env: - POSTGRES_USER: username - POSTGRES_PSWD: password - POSTGRES_DB: milabench - POSTGRES_HOST: localhost - POSTGRES_PORT: 5432 - run: pytest tests/integration diff --git a/.github/workflows/report_container.yml b/.github/workflows/report_container.yml deleted file mode 100644 index 1d48daedd..000000000 --- a/.github/workflows/report_container.yml +++ /dev/null @@ -1,60 +0,0 @@ -name: Publish Docker image for reports - -on: - # Allow manual runs - workflow_dispatch: - -env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }} - -permissions: - packages: write - -# define build arguments -jobs: - build-image: - runs-on: ubuntu-22.04 - - strategy: - fail-fast: false - - permissions: - contents: read - packages: write - - steps: - - name: Check out the repo - uses: actions/checkout@v3 - - - name: Get Image Tag Name - env: - GITHUB_REF_NAME_ENV: ${{ github.ref_name }} - run: | - echo "IMAGE_TAG=$GITHUB_REF_NAME_ENV" >> $GITHUB_ENV - - - name: Log in to the registry - uses: docker/login-action@v2 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata (tags, labels) for the image - id: meta - uses: docker/metadata-action@v4 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - tags: | - type=raw,value=report-${{ env.IMAGE_TAG }} - - - name: Build and push the image - uses: docker/build-push-action@v3 - with: - context: . - push: true - file: docker/Dockerfile-report - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - build-args: | - CONFIG=standard.yaml diff --git a/.github/workflows/show_config.yml b/.github/workflows/show_config.yml deleted file mode 100644 index 1d2dd8096..000000000 --- a/.github/workflows/show_config.yml +++ /dev/null @@ -1,26 +0,0 @@ -# Run Milabench using nightly docker images -name: show-config - -on: - # Only works on manual runs - workflow_dispatch: - -jobs: - execute: - runs-on: [self-hosted, rocm] - - steps: - - name: Set up Python 3.9 - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - - name: Install Dependencies - run: | - python -m pip install --upgrade pip - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.4.2 - - - name: Show Pytorch Config - run: | - python -c "import torch; print(torch.__config__.show())" - diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml deleted file mode 100644 index 7d456f9bb..000000000 --- a/.github/workflows/tests.yml +++ /dev/null @@ -1,113 +0,0 @@ -name: tests - -on: - # Runs every sunday - schedule: - - cron: '0 0 * * SUN' - - # Runs for pull requests - pull_request: - branches: - - master - - # Runs on publish - release: - types: - [published] - - # Allow manual triggers - workflow_dispatch: - - -jobs: - tests: - strategy: - fail-fast: false - matrix: - include: - - arch: cuda - exclude : "unsupported-cuda" - # - arch: rocm - # exclude : "unsupported-rocm" - - runs-on: [self-hosted, "${{ matrix.arch }}"] - - # Cancel previous jobs if a new version was pushed - concurrency: - group: "${{ github.ref }}-${{ matrix.arch }}" - cancel-in-progress: true - - defaults: - run: - shell: bash -el {0} - - env: - MILABENCH_CONFIG: "config/ci.yaml" - MILABENCH_BASE: "output" - MILABENCH_ARGS: "" - MILABENCH_GPU_ARCH: "${{ matrix.arch }}" - MILABENCH_DASH: "no" - MILABENCH_EXCLUDE: "${{ matrix.exclude }}" - - steps: - - uses: actions/checkout@v3 - - - uses: conda-incubator/setup-miniconda@v2 - with: - auto-activate-base: false - python-version: 3.9 - miniconda-version: "latest" - activate-environment: test - - - name: Pytorch Sanity - run: | - if [[ "${MILABENCH_GPU_ARCH}" == "rocm" ]]; then - groups - /opt/rocm/bin/rocminfo - fi - - - name: dependencies - run: | - if [[ ! -d "~/.cargo/bin" ]]; then - wget --no-check-certificate --secure-protocol=TLSv1_2 -qO- https://sh.rustup.rs | sh -s -- -y - fi - export PATH="~/.cargo/bin:${PATH}" - python -m pip install -U pip - python -m pip install -U poetry - poetry lock --no-update - # poetry v1.7 has a bug where it can't find pip during the first - # install attempt: - # Output: - # [...]/.venv/bin/python: can't open file - # '[...]/lib/python3.9/site-packages/virtualenv/seed/wheels/embed/pip-23.3.1-py3-none-any.whl/pip': - # [Errno 2] No such file or directory - ! poetry install - poetry install - - - name: pin - run: | - MILABENCH_GPU_ARCH=cuda poetry run milabench pin -c constraints/cuda.txt --config config/standard.yaml - MILABENCH_GPU_ARCH=rocm poetry run milabench pin -c constraints/rocm.txt --config config/standard.yaml - git diff --stat - - - name: tests - run: | - export PATH="/opt/rocm/bin:$PATH" - pytest --ignore=tests/integration tests/ - - - name: install benchmarks - run: | - milabench install --exclude "${MILABENCH_EXCLUDE}" - - - name: prepare benchmarks - run: | - milabench prepare --exclude "${MILABENCH_EXCLUDE}" - - - name: run benchmarks - run: | - export PATH="/opt/rocm/bin:$PATH" - milabench run --validations all --exclude "${MILABENCH_EXCLUDE}" - - - name: Summary - run: | - milabench summary $MILABENCH_BASE/runs/ diff --git a/config/examples/ec2-system.yaml b/config/examples/ec2-system.yaml index a81c09bfc..04cf6fc41 100644 --- a/config/examples/ec2-system.yaml +++ b/config/examples/ec2-system.yaml @@ -13,8 +13,9 @@ system: # Cloud instances profiles cloud_profiles: ec2: - profile: mb_test_sog_3 + # profile: mb_aws_cloud_ci username: ubuntu instance_type: t2.micro volume_size: 8 region: us-east-2 + state_id: ced1ea75ce796ece05d53f5655ebc0f8 diff --git a/milabench/cli/cloud.py b/milabench/cli/cloud.py index c0f9c9bcb..35f961d57 100644 --- a/milabench/cli/cloud.py +++ b/milabench/cli/cloud.py @@ -47,15 +47,15 @@ def manage_cloud(pack, packs, run_on, action="setup"): import milabench.cli.covalent as cv - subprocess.run( - [ - sys.executable, - "-m", cv.__name__, - "serve", "start" - ] - , stdout=sys.stderr - , check=True - ) + # subprocess.run( + # [ + # sys.executable, + # "-m", cv.__name__, + # "serve", "start" + # ] + # , stdout=sys.stderr + # , check=True + # ) cmd = [ sys.executable, diff --git a/milabench/cli/covalent/__main__.py b/milabench/cli/covalent/__main__.py index ed722d422..8d678ee1c 100644 --- a/milabench/cli/covalent/__main__.py +++ b/milabench/cli/covalent/__main__.py @@ -125,13 +125,13 @@ def lattice(argv=(), deps_bash = None): deps_bash = ct.DepsBash(deps_bash) argv = ["conda", "env", "list"] - if argv: - dispatch_id = ct.dispatch(lattice, disable_run=False)(argv, deps_bash=deps_bash) - result = ct.get_result(dispatch_id=dispatch_id, wait=True) - return_code, stdout, _ = result.result if result.result is not None else (1, "", "") + # if argv: + # dispatch_id = ct.dispatch(lattice, disable_run=False)(argv, deps_bash=deps_bash) + # result = ct.get_result(dispatch_id=dispatch_id, wait=True) + # return_code, stdout, _ = result.result if result.result is not None else (1, "", "") if return_code == 0 and args.setup: - assert any([l for l in stdout.split("\n") if l.startswith("milabench ")]) + # assert any([l for l in stdout.split("\n") if l.startswith("milabench ")]) _executor:ct.executor.BaseExecutor = executor_cls( **{ **_get_executor_kwargs(args), diff --git a/milabench/cli/covalent/requirements.txt b/milabench/cli/covalent/requirements.txt index f810e6eaf..e8cb6af4c 100644 --- a/milabench/cli/covalent/requirements.txt +++ b/milabench/cli/covalent/requirements.txt @@ -1,2 +1,2 @@ covalent -covalent-ec2-plugin @ git+https://github.com/satyaog/covalent-ec2-plugin.git@feature/milabench \ No newline at end of file +covalent-ec2-plugin @ git+https://github.com/satyaog/covalent-ec2-plugin.git@feature/milabench-cloud-ci \ No newline at end of file