Fix CUDA plugin CI. #8593

Open · wants to merge 15 commits into base: master
12 changes: 4 additions & 8 deletions .github/workflows/_build_torch_with_cuda.yml
@@ -22,6 +22,9 @@ jobs:
image: ${{ inputs.dev-image }}
env:
_GLIBCXX_USE_CXX11_ABI: 0
+   TORCH_CUDA_ARCH_LIST: "7.0;7.5;8.0;9.0"
+   USE_CUDA: 1
+   MAX_JOBS: 24
steps:
- name: Checkout actions
uses: actions/checkout@v4
@@ -34,18 +37,11 @@ jobs:
with:
torch-commit: ${{ inputs.torch-commit }}
cuda: true
-   - name: Checkout PyTorch Repo
-     uses: actions/checkout@v4
-     with:
-       repository: pytorch/pytorch
-       path: pytorch
-       ref: ${{ inputs.torch-commit }}
-       submodules: recursive
- name: Build PyTorch with CUDA enabled
shell: bash
run: |
cd pytorch
-   TORCH_CUDA_ARCH_LIST="5.2;8.6" USE_CUDA=1 MAX_JOBS="$(nproc --ignore=4)" python setup.py bdist_wheel
+   python setup.py bdist_wheel
- name: Upload wheel
uses: actions/upload-artifact@v4
with:
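For reference, this is roughly how the rewritten build job reads once the hunks above are applied (a sketch: unchanged keys and steps are elided, indentation is approximate, and the note that the setup action's torch-commit / cuda inputs now provide the pytorch checkout is my inference, since the dedicated "Checkout PyTorch Repo" step is removed):

    env:
      _GLIBCXX_USE_CXX11_ABI: 0
      TORCH_CUDA_ARCH_LIST: "7.0;7.5;8.0;9.0"   # previously passed inline as "5.2;8.6"
      USE_CUDA: 1
      MAX_JOBS: 24                              # previously $(nproc --ignore=4) on the command line
    steps:
      # ... checkout and setup steps unchanged; no separate pytorch checkout step anymore
      - name: Build PyTorch with CUDA enabled
        shell: bash
        run: |
          cd pytorch
          python setup.py bdist_wheel           # CUDA build flags now come from the env block above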
8 changes: 6 additions & 2 deletions .github/workflows/_test_requiring_torch_cuda.yml
@@ -94,8 +94,12 @@ jobs:
pip install -U --pre jaxlib -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
pip install -U --pre jax-cuda12-pjrt jax-cuda12-plugin -f https://storage.googleapis.com/jax-releases/jax_cuda_plugin_nightly_releases.html
pip install -U --pre jax -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
-   pip install --no-deps triton==2.3.0
if: ${{ matrix.run_triton_tests }}
+   - name: Install Triton
+     shell: bash
+     run: |
+       cd pytorch
+       make triton
- name: Python Tests
shell: bash
run: |
@@ -106,5 +110,5 @@
- name: Triton Tests
shell: bash
run: |
-   PJRT_DEVICE=CUDA TRITON_PTXAS_PATH=/usr/local/cuda-12.1/bin/ptxas python pytorch/xla/test/test_triton.py
+   PJRT_DEVICE=CUDA TRITON_PTXAS_PATH=/usr/local/cuda-12.3/bin/ptxas python pytorch/xla/test/test_triton.py
if: ${{ matrix.run_triton_tests }}
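Net effect of this file's changes: the hard-coded "pip install --no-deps triton==2.3.0" is dropped in favor of a dedicated "Install Triton" step that runs PyTorch's make triton target from the pytorch checkout, so the installed Triton should track whatever the checked-out torch commit pins (my reading of the upstream Makefile target, not something stated in this diff), and the Triton tests now point ptxas at the CUDA 12.3 toolkit. A sketch of the resulting steps:

    - name: Install Triton
      shell: bash
      run: |
        cd pytorch      # relies on the pytorch checkout set up in earlier steps
        make triton     # version selection is delegated to the checked-out torch commit
    # ...
    - name: Triton Tests
      shell: bash
      run: |
        PJRT_DEVICE=CUDA TRITON_PTXAS_PATH=/usr/local/cuda-12.3/bin/ptxas python pytorch/xla/test/test_triton.py
      if: ${{ matrix.run_triton_tests }}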
82 changes: 39 additions & 43 deletions .github/workflows/build_and_test.yml
@@ -42,25 +42,23 @@ jobs:
secrets:
gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}

-   # Disable due to https://github.com/pytorch/xla/issues/8199
-   # build-torch-with-cuda:
-   #   name: "Build PyTorch with CUDA"
-   #   uses: ./.github/workflows/_build_torch_with_cuda.yml
-   #   needs: get-torch-commit
-   #   with:
-   #     # note that to build a torch wheel with CUDA enabled, we do not need a GPU runner.
-   #     dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
-   #     torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
-   #     runner: linux.24xlarge
+   build-torch-with-cuda:
+     name: "Build PyTorch with CUDA"
+     uses: ./.github/workflows/_build_torch_with_cuda.yml
Review comment (Collaborator):
I didn't know that we also need to build PyTorch with CUDA. Does building PyTorch with CPU not work for some reason? If we need to build PyTorch with CUDA, do you know if 12.3 is a supported CUDA version to build PyTorch with? From pytorch/pytorch#138609 it looks like upstream picked 12.4 or 12.6

+     needs: get-torch-commit
+     with:
+       # note that to build a torch wheel with CUDA enabled, we do not need a GPU runner.
+       dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3
+       torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
+       runner: linux.24xlarge

-   # Disable due to https://github.com/pytorch/xla/issues/8199
-   # build-cuda-plugin:
-   #   name: "Build XLA CUDA plugin"
-   #   uses: ./.github/workflows/_build_plugin.yml
-   #   with:
-   #     dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
-   #   secrets:
-   #     gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
+   build-cuda-plugin:
+     name: "Build XLA CUDA plugin"
+     uses: ./.github/workflows/_build_plugin.yml
+     with:
+       dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3
+     secrets:
+       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}

test-python-cpu:
name: "CPU tests"
@@ -74,32 +72,30 @@ jobs:
secrets:
gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}

-   # Disable due to https://github.com/pytorch/xla/issues/8199
-   # test-cuda:
-   #   name: "GPU tests"
-   #   uses: ./.github/workflows/_test.yml
-   #   needs: [build-torch-xla, build-cuda-plugin, get-torch-commit]
-   #   with:
-   #     dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
-   #     runner: linux.8xlarge.nvidia.gpu
-   #     timeout-minutes: 300
-   #     collect-coverage: false
-   #     install-cuda-plugin: true
-   #     torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
-   #   secrets:
-   #     gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
+   test-cuda:
+     name: "GPU tests"
+     uses: ./.github/workflows/_test.yml
+     needs: [build-torch-xla, build-cuda-plugin, get-torch-commit]
+     with:
+       dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3
+       runner: linux.g4dn.12xlarge.nvidia.gpu
+       timeout-minutes: 300
+       collect-coverage: false
+       install-cuda-plugin: true
+       torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
+     secrets:
+       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}

-   # Disable due to https://github.com/pytorch/xla/issues/8199
-   # test-cuda-with-pytorch-cuda-enabled:
-   #   name: "GPU tests requiring torch CUDA"
-   #   uses: ./.github/workflows/_test_requiring_torch_cuda.yml
-   #   needs: [build-torch-with-cuda, build-torch-xla, build-cuda-plugin, get-torch-commit]
-   #   with:
-   #     dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
-   #     runner: linux.8xlarge.nvidia.gpu
-   #     timeout-minutes: 300
-   #     collect-coverage: false
-   #     torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
+   test-cuda-with-pytorch-cuda-enabled:
+     name: "GPU tests requiring torch CUDA"
+     uses: ./.github/workflows/_test_requiring_torch_cuda.yml
+     needs: [build-torch-with-cuda, build-torch-xla, build-cuda-plugin, get-torch-commit]
+     with:
+       dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3
+       runner: linux.8xlarge.nvidia.gpu
+       timeout-minutes: 300
+       collect-coverage: false
+       torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}

test-tpu:
name: "TPU tests"
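Taken together, this file re-enables the four CUDA jobs that were commented out under pytorch/xla#8199 and points them at the 3.10_cuda_12.3 dev image (test-cuda also moves from linux.8xlarge.nvidia.gpu to linux.g4dn.12xlarge.nvidia.gpu). Their wiring, summarized from the needs: lists above (build-cuda-plugin declares no needs: in this diff):

    build-torch-with-cuda:
      needs: get-torch-commit
    test-cuda:
      needs: [build-torch-xla, build-cuda-plugin, get-torch-commit]
    test-cuda-with-pytorch-cuda-enabled:
      needs: [build-torch-with-cuda, build-torch-xla, build-cuda-plugin, get-torch-commit]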
4 changes: 2 additions & 2 deletions .github/workflows/setup/action.yml
@@ -29,8 +29,8 @@ runs:
- name: Setup CUDA environment
shell: bash
run: |
-   echo "PATH=$PATH:/usr/local/cuda-12.1/bin" >> $GITHUB_ENV
-   echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/lib64" >> $GITHUB_ENV
+   echo "PATH=$PATH:/usr/local/cuda-12.3/bin" >> $GITHUB_ENV
+   echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.3/lib64" >> $GITHUB_ENV
if: ${{ inputs.cuda }}
- name: Setup gcloud
shell: bash
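Not part of this diff, but since these paths are exported through $GITHUB_ENV (which GitHub Actions appends to the environment of all subsequent steps in the job), a hypothetical follow-up step like the sketch below could confirm that later steps really see the CUDA 12.3 toolchain. The step name is invented for illustration, and it assumes nvcc ships in the cuda_12.3 dev image:

    - name: Verify CUDA toolchain (hypothetical, not in this PR)
      shell: bash
      run: |
        command -v nvcc    # expected to resolve to /usr/local/cuda-12.3/bin/nvcc
        nvcc --version     # expected to report CUDA release 12.3
      if: ${{ inputs.cuda }}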