update DTensor usage with upstream (#9079)

zpcore · Pei Zhang · web-flow · commit 02a1ba6f1a47 · 2025-05-02T16:13:25.000-07:00
Co-authored-by: Pei Zhang <piz@t1v-n-15c505fb-w-0.us-central2-b.c.tpu-pytorch.internal>
diff --git a/docs/source/perf/spmd_advanced.md b/docs/source/perf/spmd_advanced.md
@@ -80,14 +80,14 @@ The main use case for `XLAShardedTensor` [[RFC](https://github.com/pytorch/xla/i
 There is also an ongoing effort to integrate <code>XLAShardedTensor</code> into <code>DistributedTensor</code> API to support XLA backend [[RFC](https://github.com/pytorch/pytorch/issues/92909)].
 
 ### DTensor Integration
-PyTorch has prototype-released [DTensor](https://github.com/pytorch/pytorch/blob/main/torch/distributed/_tensor/README.md) in 2.1.
+[DTensor](https://github.com/pytorch/pytorch/blob/main/torch/distributed/tensor/README.md) has been available in PyTorch as a prototype release since 2.1.
 We are integrating PyTorch/XLA SPMD into DTensor API [RFC](https://github.com/pytorch/pytorch/issues/92909). We have a proof-of-concept integration for `distribute_tensor`, which calls `mark_sharding` annotation API to shard a tensor and its computation using XLA:
 ```python
 import torch
-from torch.distributed import DeviceMesh, Shard, distribute_tensor
+from torch.distributed.tensor import init_device_mesh, Shard, distribute_tensor
 
 # distribute_tensor now works with `xla` backend using PyTorch/XLA SPMD.
-mesh = DeviceMesh("xla", list(range(world_size)))
+mesh = init_device_mesh("xla", mesh_shape=(world_size,))
 big_tensor = torch.randn(100000, 88)
 my_dtensor = distribute_tensor(big_tensor, mesh, [Shard(0)])
 ```
@@ -152,15 +152,15 @@ PyTorch/XLA auto-sharding can be enabled by one of the following:
 import torch_xla.runtime as xr
 xr.use_spmd(auto=True)
 ```
-- Calling `pytorch.distributed._tensor.distribute_module` with `auto-policy` and `xla`:
+- Calling `torch.distributed.tensor.distribute_module` with `auto_policy` and `xla`:
 
 ```python
 import torch_xla.runtime as xr
-from torch.distributed._tensor import DeviceMesh, distribute_module
+from torch.distributed.tensor import init_device_mesh, distribute_module
 from torch_xla.distributed.spmd import auto_policy
 
 device_count = xr.global_runtime_device_count()
-device_mesh = DeviceMesh("xla", list(range(device_count)))
+device_mesh = init_device_mesh("xla", mesh_shape=(device_count,))
 
 # Currently, model should be loaded to xla device via distribute_module.
 model = MyModule()  # nn.module
diff --git a/test/spmd/test_dtensor_integration.py b/test/spmd/test_dtensor_integration.py
@@ -4,8 +4,8 @@
 import torch
 from torch import nn
 import torch.optim as optim
-from torch.distributed._tensor import (DeviceMesh, Shard, distribute_tensor,
-                                       distribute_module)
+from torch.distributed.tensor import init_device_mesh, Shard, distribute_tensor, distribute_module
+
 import torch_xla
 import torch_xla.debug.metrics as met
 import torch_xla.runtime as xr
@@ -25,7 +25,7 @@ def setUpClass(cls):
 
   def test_xla_distribute_tensor(self):
     device_count = xr.global_runtime_device_count()
-    device_mesh = DeviceMesh("xla", list(range(device_count)))
+    device_mesh = init_device_mesh("xla", mesh_shape=(device_count,))
     shard_spec = [Shard(0)]
 
     for requires_grad in [True, False]:
@@ -53,7 +53,7 @@ def test_optimizer_step_with_sharding(self):
 
     # Running the same mark_sharding test with xla_distribute_tensor instead
     device_count = xr.global_runtime_device_count()
-    device_mesh = DeviceMesh("xla", list(range(device_count)))
+    device_mesh = init_device_mesh("xla", mesh_shape=(device_count,))
     shard_spec = [Shard(0)]
     distribute_tensor(model.fc1.weight, device_mesh, shard_spec)
     sharding_spec = torch_xla._XLAC._get_xla_sharding_spec(model.fc1.weight)
@@ -79,7 +79,7 @@ def test_xla_distribute_module(self):
     model = self.SimpleLinear().to(xm.xla_device())
 
     device_count = xr.global_runtime_device_count()
-    device_mesh = DeviceMesh("xla", list(range(device_count)))
+    device_mesh = init_device_mesh("xla", mesh_shape=(device_count,))
 
     def shard_params(mod_name, mod, mesh):
       shard_spec = [Shard(0)]
diff --git a/test/spmd/test_dtensor_integration2.py b/test/spmd/test_dtensor_integration2.py
@@ -4,8 +4,8 @@
 import torch
 from torch import nn
 import torch.optim as optim
-from torch.distributed._tensor import (DeviceMesh, Shard, distribute_tensor,
-                                       distribute_module)
+from torch.distributed.tensor import (DeviceMesh, Shard, distribute_tensor,
+                                      distribute_module)
 import torch_xla
 import torch_xla.debug.metrics as met
 import torch_xla.runtime as xr
diff --git a/torch_xla/distributed/spmd/api.py b/torch_xla/distributed/spmd/api.py
@@ -8,8 +8,8 @@
 import torch
 
 import torch.nn as nn
-from torch.distributed._tensor.device_mesh import DeviceMesh
-from torch.distributed._tensor.placement_types import Placement, Replicate
+from torch.distributed import DeviceMesh
+from torch.distributed.tensor.placement_types import Placement, Replicate
 
 import torch_xla.core.xla_model as xm  # type:ignore[import]  # noqa: F401
 import torch_xla.runtime as xr  # type:ignore[import]