TP/SP examples device generalization (CUDA, XPU, etc.) #1354

Open · wants to merge 8 commits into main
4 changes: 2 additions & 2 deletions .github/workflows/main_distributed.yaml
@@ -17,10 +17,10 @@ jobs:

     steps:
     - uses: actions/checkout@v2
-    - name: Set up Python 3.8
+    - name: Set up Python 3.9
      uses: actions/setup-python@v2
      with:
-        python-version: 3.8
+        python-version: 3.9
    - name: Install PyTorch
      uses: astral-sh/setup-uv@v6
    - name: Run Tests
2 changes: 1 addition & 1 deletion distributed/tensor_parallelism/README.md
@@ -12,5 +12,5 @@ https://pytorch.org/docs/stable/distributed.tensor.parallel.html

 ```
 pip install -r requirements.txt
-python example.py
+torchrun --nnodes 1 --nproc-per-node 4 tensor_parallel_example.py
 ```
6 changes: 3 additions & 3 deletions distributed/tensor_parallelism/log_utils.py
@@ -17,6 +17,6 @@ def rank_log(_rank, logger, msg):

 def verify_min_gpu_count(min_gpus: int = 2) -> bool:
     """ verification that we have at least 2 gpus to run dist examples """
-    has_cuda = torch.cuda.is_available()
-    gpu_count = torch.cuda.device_count()
-    return has_cuda and gpu_count >= min_gpus
+    has_gpu = torch.accelerator.is_available()
+    gpu_count = torch.accelerator.device_count()
+    return has_gpu and gpu_count >= min_gpus
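
For context, here is a minimal standalone sketch of the device-agnostic check this helper now relies on. It is an illustration only, not code from the PR; it assumes a PyTorch build that ships the `torch.accelerator` API (roughly >= 2.6/2.7), and the helper name `pick_device` is hypothetical.

```python
# Sketch (assumes torch.accelerator is available): pick whichever accelerator
# backend is present (CUDA, XPU, ...) and fall back to CPU when none is found.
import torch


def pick_device(min_devices: int = 2) -> torch.device:
    # torch.accelerator abstracts over CUDA, XPU and other device backends.
    if torch.accelerator.is_available() and torch.accelerator.device_count() >= min_devices:
        return torch.accelerator.current_accelerator()
    return torch.device("cpu")


if __name__ == "__main__":
    print(f"selected device: {pick_device()}")
```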
5 changes: 1 addition & 4 deletions distributed/tensor_parallelism/requirements.txt
@@ -1,6 +1,3 @@
 # Python dependencies required for running the example

---pre
---extra-index-url https://download.pytorch.org/whl/nightly/cu118
---extra-index-url https://download.pytorch.org/whl/nightly/cu121
-torch >= 2.3.0.dev0; sys_platform == "linux"
+torch >= 2.7.1; sys_platform == "linux"
Contributor:

Why do we need all these index URLs here? Can we just have:

torch >= 2.7.1; sys_platform == "linux"

Author (@githubsgi, Jun 30, 2025):

I can make those changes, although I do not think that is affecting any of the tests. A maintainer needs to kick off the 3 tests mentioned below.

Author:

Still seeing Python 3.8 in the log. Is this file used? It mentions Python 3.8 specifically.

Author:

Can a maintainer please kick off the 3 workflows?

Contributor:

> Is this file used?

Yes, you can always check which workflow file was used here: https://github.com/pytorch/examples/actions/runs/15981567115/workflow

So changing this file is the way to go, I think.

9 changes: 6 additions & 3 deletions distributed/tensor_parallelism/sequence_parallel_example.py
@@ -1,3 +1,5 @@
+# The following is an example command to run this code
+# torchrun --nnodes 1 --nproc-per-node 4 sequence_parallel_example.py
 import os
 import sys
 import torch
@@ -63,9 +65,10 @@ def forward(self, x):
 """
 logger = get_logger()

+device_type = torch.accelerator.current_accelerator().type
 # create a device mesh based on the given world_size.
 device_mesh = init_device_mesh(
-    device_type="cuda", mesh_shape=(int(os.environ["WORLD_SIZE"]),)
+    device_type=device_type, mesh_shape=(int(os.environ["WORLD_SIZE"]),)
 )

 _rank = device_mesh.get_rank()
@@ -75,7 +78,7 @@ def forward(self, x):
 rank_log(_rank, logger, f"Device Mesh created: {device_mesh=}")

 # create model and move it to GPU. Init_device_mesh has already assigned gpu ids...
-model = ToyModel().to("cuda")
+model = ToyModel().to(device_type)

 # Custom parallelization plan for the model
 sp_model = parallelize_module(
@@ -100,7 +103,7 @@ def forward(self, x):

 for i in range(num_iters):
     # For SP, input can be different across all ranks.
-    inp = torch.rand(20, 10, device="cuda")
+    inp = torch.rand(20, 10, device=device_type)
     output = sp_model(inp)
     output.sum().backward()
     optimizer.step()
12 changes: 7 additions & 5 deletions distributed/tensor_parallelism/tensor_parallel_example.py
@@ -1,3 +1,5 @@
+# The following is an example command to run this code
+# torchrun --nnodes 1 --nproc-per-node 4 tensor_parallel_example.py
 import os
 import sys
 import torch
@@ -76,8 +78,8 @@ def forward(self, x):

 # create a device mesh based on the given world_size.
 _world_size = int(os.environ["WORLD_SIZE"])

-device_mesh = init_device_mesh(device_type="cuda", mesh_shape=(_world_size,))
+device_type = torch.accelerator.current_accelerator().type
+device_mesh = init_device_mesh(device_type=device_type, mesh_shape=(_world_size,))
 _rank = device_mesh.get_rank()


@@ -88,8 +90,8 @@ def forward(self, x):

 rank_log(_rank, logger, f"Device Mesh created: {device_mesh=}")

-# create model and move it to GPU - init"cuda"_mesh has already mapped GPU ids.
-tp_model = ToyModel().to("cuda")
+# create model and move it to GPU - init_device_mesh has already mapped GPU ids.
+tp_model = ToyModel().to(device_type)


 # Custom parallelization plan for the model
@@ -116,7 +118,7 @@ def forward(self, x):
     # For TP, input needs to be same across all TP ranks.
     # Setting the random seed is to mimic the behavior of dataloader.
     torch.manual_seed(i)
-    inp = torch.rand(20, 10, device="cuda")
+    inp = torch.rand(20, 10, device=device_type)
     output = tp_model(inp)
     output.sum().backward()
     optimizer.step()
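
Taken together, the two examples now share the same device-agnostic setup. Below is a condensed sketch of that pattern, for illustration only (it is not code from the PR): it assumes a torchrun launch so that WORLD_SIZE is set, a PyTorch build with the `torch.accelerator` API and at least one accelerator present, and it substitutes a plain `torch.nn.Linear` for the examples' ToyModel.

```python
# Sketch of the shared pattern: derive the device type from the current
# accelerator (cuda, xpu, ...) instead of hard-coding "cuda".
import os
import torch
from torch.distributed.device_mesh import init_device_mesh

device_type = torch.accelerator.current_accelerator().type  # e.g. "cuda" or "xpu"
world_size = int(os.environ["WORLD_SIZE"])  # set by torchrun

# 1D mesh over all ranks; init_device_mesh maps each rank to a device.
mesh = init_device_mesh(device_type=device_type, mesh_shape=(world_size,))

# Model and inputs follow the detected device type rather than "cuda".
model = torch.nn.Linear(10, 32).to(device_type)
inp = torch.rand(20, 10, device=device_type)
out = model(inp)
```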
2 changes: 1 addition & 1 deletion runtime.txt
@@ -1 +1 @@
-3.8
+3.9