From b5474c13f961481114041ee25bd8f2cebc30a52a Mon Sep 17 00:00:00 2001
From: Yukio Siraichi
Date: Tue, 21 Jan 2025 09:38:03 -0300
Subject: [PATCH] Bump CUDA version to 12.3.

---
# Reviewer notes (text between the "---" separator and the first "diff --git"
# line is not part of the commit message or of the change itself):
#
# * Every dev-image tag touched here moves from 3.10_cuda_12.1 to
#   3.10_cuda_12.3.
# * Beyond the version bump named in the subject, this patch uncomments four
#   jobs and drops their "Disable due to
#   https://github.com/pytorch/xla/issues/8199" markers:
#     - build-torch-with-cuda
#     - build-cuda-plugin
#     - test-cuda
#     - test-cuda-with-pytorch-cuda-enabled
# * NOTE(review): the leading indentation inside the hunks below was rebuilt
#   from a whitespace-collapsed copy of this patch, assuming the usual 2-space
#   workflow layout -- confirm against build_and_test.yml (or apply with
#   --ignore-whitespace) before relying on it.
# * NOTE(review): the From: header carries no e-mail address in this copy --
#   confirm authorship metadata before running `git am`.
 .github/workflows/build_and_test.yml | 82 +++++++++++++---------------
 1 file changed, 39 insertions(+), 43 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index ce5e86db0623..a4f5a8750b43 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -40,25 +40,23 @@ jobs:
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
-  # Disable due to https://github.com/pytorch/xla/issues/8199
-  # build-torch-with-cuda:
-  #   name: "Build PyTorch with CUDA"
-  #   uses: ./.github/workflows/_build_torch_with_cuda.yml
-  #   needs: get-torch-commit
-  #   with:
-  #     # note that to build a torch wheel with CUDA enabled, we do not need a GPU runner.
-  #     dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
-  #     torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
-  #     runner: linux.24xlarge
+  build-torch-with-cuda:
+    name: "Build PyTorch with CUDA"
+    uses: ./.github/workflows/_build_torch_with_cuda.yml
+    needs: get-torch-commit
+    with:
+      # note that to build a torch wheel with CUDA enabled, we do not need a GPU runner.
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3
+      torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
+      runner: linux.24xlarge
 
-  # Disable due to https://github.com/pytorch/xla/issues/8199
-  # build-cuda-plugin:
-  #   name: "Build XLA CUDA plugin"
-  #   uses: ./.github/workflows/_build_plugin.yml
-  #   with:
-  #     dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
-  #   secrets:
-  #     gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
+  build-cuda-plugin:
+    name: "Build XLA CUDA plugin"
+    uses: ./.github/workflows/_build_plugin.yml
+    with:
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3
+    secrets:
+      gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
   test-python-cpu:
     name: "CPU tests"
@@ -72,32 +70,30 @@ jobs:
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
-  # Disable due to https://github.com/pytorch/xla/issues/8199
-  # test-cuda:
-  #   name: "GPU tests"
-  #   uses: ./.github/workflows/_test.yml
-  #   needs: [build-torch-xla, build-cuda-plugin, get-torch-commit]
-  #   with:
-  #     dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
-  #     runner: linux.8xlarge.nvidia.gpu
-  #     timeout-minutes: 300
-  #     collect-coverage: false
-  #     install-cuda-plugin: true
-  #     torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
-  #   secrets:
-  #     gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
+  test-cuda:
+    name: "GPU tests"
+    uses: ./.github/workflows/_test.yml
+    needs: [build-torch-xla, build-cuda-plugin, get-torch-commit]
+    with:
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3
+      runner: linux.8xlarge.nvidia.gpu
+      timeout-minutes: 300
+      collect-coverage: false
+      install-cuda-plugin: true
+      torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
+    secrets:
+      gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
-  # Disable due to https://github.com/pytorch/xla/issues/8199
-  # test-cuda-with-pytorch-cuda-enabled:
-  #   name: "GPU tests requiring torch CUDA"
-  #   uses: ./.github/workflows/_test_requiring_torch_cuda.yml
-  #   needs: [build-torch-with-cuda, build-torch-xla, build-cuda-plugin, get-torch-commit]
-  #   with:
-  #     dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
-  #     runner: linux.8xlarge.nvidia.gpu
-  #     timeout-minutes: 300
-  #     collect-coverage: false
-  #     torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
+  test-cuda-with-pytorch-cuda-enabled:
+    name: "GPU tests requiring torch CUDA"
+    uses: ./.github/workflows/_test_requiring_torch_cuda.yml
+    needs: [build-torch-with-cuda, build-torch-xla, build-cuda-plugin, get-torch-commit]
+    with:
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3
+      runner: linux.8xlarge.nvidia.gpu
+      timeout-minutes: 300
+      collect-coverage: false
+      torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
 
   test-tpu:
     name: "TPU tests"