
Commit 9aac5a1

Support FP8 in op flip, index_put, and index.Tensor (#2190)
To solve #2207, this change extends support for float8 data types across the XPU tensor indexing and transformation kernels, ensuring these operations are compatible with the new types. It also adds a regression test for flipping float8 tensors and removes the skip for the float8 indexing tests.

**Float8 type support:**

* Updated dispatch macros in `XPUScalar.cpp` and `Indexing.cpp` to include `AT_FLOAT8_TYPES`, enabling float8 support in scalar extraction, indexing, index_put, and deterministic index_put kernels (a sketch of this macro migration follows below).
* Modified `flip_kernel` in `TensorTransformationsKernels.cpp` to support float8 and barebones unsigned types, updating the dispatch mechanism accordingly.
* Included the new dispatch header `Dispatch_v2.h` for the updated dispatch macros.

**Testing improvements:**

* Added a regression test for flipping float8 tensors in `test_index_and_index_put.py` to verify correctness of the operation on XPU.
* Removed the skip for float8 tests in `test_indexing_xpu.py`, re-enabling these tests now that support is implemented.

---------

Co-authored-by: Cui, Yifeng <[email protected]>
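For readers unfamiliar with the macro migration, here is a minimal sketch of the before/after dispatch pattern this commit applies. The function name `my_kernel` and the lambda body are illustrative placeholders, not code from the commit; the macro arguments mirror the hunks below. The legacy `AT_DISPATCH_ALL_TYPES_AND_COMPLEX_ANDn` macros are fixed-arity, so their extra-dtype slots fill up quickly, whereas `AT_DISPATCH_V2` takes an open-ended trailing dtype list that can splice in whole type-list macros via `AT_EXPAND`:

```cpp
// Sketch only: my_kernel is a placeholder; the macro usage mirrors this commit.
#include <ATen/Dispatch.h>
#include <ATen/Dispatch_v2.h>
#include <ATen/native/TensorIterator.h>

void my_kernel(at::TensorIteratorBase& iter) {
  // Before (fixed arity -- AND4 allows exactly four extra dtypes):
  //
  //   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
  //       at::ScalarType::ComplexHalf, at::ScalarType::BFloat16,
  //       at::ScalarType::Half, at::ScalarType::Bool,
  //       iter.dtype(), "my_kernel", [&] { /* uses scalar_t */ });
  //
  // After: the trailing arguments form an open-ended dtype list.
  AT_DISPATCH_V2(
      iter.dtype(),
      "my_kernel",
      AT_WRAP([&] {
        // scalar_t is bound to the dispatched dtype inside this lambda.
        (void)sizeof(scalar_t);
      }),
      AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
      AT_EXPAND(AT_FLOAT8_TYPES),
      kComplexHalf,
      kHalf,
      kBool,
      kBFloat16);
}
```

`AT_WRAP` protects the lambda from the preprocessor: without it, commas inside the lambda body would be parsed as macro argument separators.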
1 parent a3efbb3 commit 9aac5a1

5 files changed, 74 insertions(+), 40 deletions(-)

src/ATen/native/xpu/XPUScalar.cpp

Lines changed: 1 addition & 0 deletions
```diff
@@ -32,6 +32,7 @@ Scalar _local_scalar_dense_xpu(const Tensor& self) {
         r = Scalar(*value.const_data_ptr<scalar_t>());
       }),
       AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
+      AT_EXPAND(AT_FLOAT8_TYPES),
       kComplexHalf,
       kHalf,
       kBool,
```
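A note on scope: `_local_scalar_dense` is the op behind `Tensor.item()`, so this one-line addition is what lets `.item()` succeed on float8 XPU tensors instead of failing dtype dispatch.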

src/ATen/native/xpu/sycl/Indexing.cpp

Lines changed: 46 additions & 28 deletions
```diff
@@ -43,14 +43,10 @@ void index_kernel(
     TensorIteratorBase& iter,
     IntArrayRef index_size,
     IntArrayRef index_stride) {
-  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
-      at::ScalarType::ComplexHalf,
-      at::ScalarType::BFloat16,
-      at::ScalarType::Half,
-      at::ScalarType::Bool,
+  AT_DISPATCH_V2(
       iter.dtype(),
       "index_xpu",
-      [&] {
+      AT_WRAP([&] {
         using dtype = OpaqueType<sizeof(scalar_t)>;
         IndexFunctor<dtype> f;
         _index_kernel(
@@ -61,7 +57,13 @@ void index_kernel(
             IntArrayRef{},
             f,
             true);
-      });
+      }),
+      AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
+      AT_EXPAND(AT_FLOAT8_TYPES),
+      kComplexHalf,
+      kHalf,
+      kBool,
+      kBFloat16);
 }
 
 template <typename ValType>
@@ -588,14 +590,10 @@ void index_put_kernel(
             false);
       });
   } else {
-    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
-        at::ScalarType::ComplexHalf,
-        at::ScalarType::BFloat16,
-        at::ScalarType::Half,
-        at::ScalarType::Bool,
+    AT_DISPATCH_V2(
         iter.dtype(),
         "index_put_xpu",
-        [&] {
+        AT_WRAP([&] {
           using dtype = OpaqueType<sizeof(scalar_t)>;
           IndexPutFunctor<dtype> f;
           _index_kernel(
@@ -606,7 +604,13 @@ void index_put_kernel(
               IntArrayRef{},
               f,
               false);
-        });
+        }),
+        AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
+        AT_EXPAND(AT_FLOAT8_TYPES),
+        kComplexHalf,
+        kHalf,
+        kBool,
+        kBFloat16);
   }
 }
 
@@ -693,14 +697,10 @@ void index_put_deterministic_kernel(
       expandedValue.numel());
 
   if (sliceSize > SIMD) {
-    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
-        at::ScalarType::ComplexHalf,
-        at::ScalarType::BFloat16,
-        at::ScalarType::Half,
-        at::ScalarType::Bool,
+    AT_DISPATCH_V2(
         expandedValue.scalar_type(),
         "index_put_deterministic_kernel",
-        [&] {
+        AT_WRAP([&] {
           launch_index_put_deterministic_kernel<scalar_t, scalar_t>(
               sorted_indices.mutable_data_ptr<int64_t>(),
               orig_indices.mutable_data_ptr<int64_t>(),
@@ -711,17 +711,24 @@ void index_put_deterministic_kernel(
               strideBefore,
               nElemBefore,
               accumulate);
-        });
+        }),
+        AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
+        // TODO: Enable AT_FLOAT8_DTYPES after accumulation behavior is
+        // cleared for float8 dtypes.
+        kFloat8_e4m3fn,
+        kFloat8_e5m2,
+        kFloat8_e4m3fnuz,
+        kFloat8_e5m2fnuz,
+        kComplexHalf,
+        kHalf,
+        kBool,
+        kBFloat16);
   } else {
     // Align acc type with CUDA
-    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
-        at::ScalarType::ComplexHalf,
-        at::ScalarType::BFloat16,
-        at::ScalarType::Half,
-        at::ScalarType::Bool,
+    AT_DISPATCH_V2(
         expandedValue.scalar_type(),
         "index_put_deterministic_kernel",
-        [&] {
+        AT_WRAP([&] {
           using accscalar_t = at::opmath_type<scalar_t>;
           launch_index_put_deterministic_kernel<scalar_t, accscalar_t>(
               sorted_indices.mutable_data_ptr<int64_t>(),
@@ -733,7 +740,18 @@ void index_put_deterministic_kernel(
               strideBefore,
               nElemBefore,
               accumulate);
-        });
+        }),
+        AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
+        // TODO: Enable AT_FLOAT8_DTYPES after accumulation behavior is
+        // cleared for float8 dtypes.
+        kFloat8_e4m3fn,
+        kFloat8_e5m2,
+        kFloat8_e4m3fnuz,
+        kFloat8_e5m2fnuz,
+        kComplexHalf,
+        kHalf,
+        kBool,
+        kBFloat16);
   }
 
   if (permuted)
```
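Note the asymmetry in the deterministic kernel: the four float8 dtypes are spelled out individually rather than pulled in with `AT_EXPAND(AT_FLOAT8_TYPES)`, per the TODO, because accumulation (`accumulate=True`) semantics for float8 are not settled yet. The accumulator type is the crux. A minimal sketch of what `at::opmath_type` resolves to (the mappings shown are the standard Half/BFloat16/float ones; the float8 mappings are deliberately omitted here since that is exactly the open question):

```cpp
// Sketch: at::opmath_type<T> names the wider "op math" type a kernel should
// accumulate in, which is why the accumulate branch above dispatches
// launch_index_put_deterministic_kernel<scalar_t, accscalar_t>.
#include <type_traits>
#include <ATen/OpMathType.h>

static_assert(std::is_same_v<at::opmath_type<at::Half>, float>);
static_assert(std::is_same_v<at::opmath_type<at::BFloat16>, float>);
static_assert(std::is_same_v<at::opmath_type<float>, float>);
static_assert(std::is_same_v<at::opmath_type<double>, double>);
```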

src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp

Lines changed: 11 additions & 6 deletions
```diff
@@ -1,5 +1,6 @@
 // #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/Dispatch.h>
+#include <ATen/Dispatch_v2.h>
 #include <ATen/WrapDimUtilsMulti.h>
 #include <ATen/native/xpu/sycl/MemoryAccess.h>
 #include <ATen/native/xpu/sycl/OffsetCalculator.h>
@@ -129,16 +130,20 @@ void flip_kernel(TensorIterator& iter, bool quantized) {
   if (quantized) {
     TORCH_CHECK(false, "XPU current does not flip for quantized tensor");
   }
-  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
-      at::ScalarType::Half,
-      at::ScalarType::Bool,
-      at::ScalarType::BFloat16,
+  AT_DISPATCH_V2(
       iter.dtype(),
       "flip_xpu",
-      [&] {
+      AT_WRAP([&] {
         using dtype = OpaqueType<sizeof(scalar_t)>;
         flip_kernel_impl<dtype>(iter);
-      });
+      }),
+      AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
+      AT_EXPAND(AT_FLOAT8_TYPES),
+      AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES),
+      kComplexHalf,
+      kHalf,
+      kBool,
+      kBFloat16);
 }
 
 template <typename scalar_t>
```
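It is worth spelling out why extending `flip` (and the indexing kernels above) to new dtypes is low risk: these ops only relocate elements, never doing arithmetic on them, and the `OpaqueType<sizeof(scalar_t)>` alias in each lambda makes that explicit. A sketch of the pattern (simplified; an assumption about the exact in-tree definition):

```cpp
// Sketch of the OpaqueType pattern (simplified; the in-tree definition may
// differ in detail). Dispatching on element *size* instead of dtype collapses
// all same-width dtypes onto one kernel instantiation, since the kernel only
// copies bytes around.
#include <cstddef>

template <std::size_t N>
struct alignas(N) OpaqueType {
  char data[N]; // opaque payload; no arithmetic is ever performed on it
};

// Every 1-byte dtype -- int8, uint8, bool, and the float8 family -- lands on
// OpaqueType<1>, so float8 support here adds no new template instantiations.
```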

test/regressions/test_index_and_index_put.py

Lines changed: 15 additions & 0 deletions
```diff
@@ -96,3 +96,18 @@ def test_index_put_with_zero_shape_dim(self, dtype=torch.bfloat16):
         b = torch.randn([5, 0], dtype=dtype, device=torch.device("xpu"))
         a[:5, :] = a[:5, :] * 2 + b
         torch.use_deterministic_algorithms(False)
+
+    def test_flip_float8(self):
+        FLOAT8_DTYPES = (
+            torch.float8_e4m3fn,
+            torch.float8_e4m3fnuz,
+            torch.float8_e5m2,
+            torch.float8_e5m2fnuz,
+            torch.float8_e8m0fnu,
+        )
+        for dtype in FLOAT8_DTYPES:
+            a_cpu = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=dtype)
+            a_xpu = a_cpu.to("xpu")
+            b_cpu = torch.flip(a_cpu, [0]).to(torch.float32)
+            b_xpu = torch.flip(a_xpu, [0]).cpu().to(torch.float32)
+            self.assertEqual(b_cpu, b_xpu)
```
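As a usage note (the command is an assumption, not from the commit): `pytest test/regressions/test_index_and_index_put.py -k test_flip_float8` should select just the new test on a machine with a working XPU build. The `.to(torch.float32)` upcast on both sides presumably keeps the comparison in a dtype with full equality support; the flip itself is still performed in float8.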

test/xpu/skip_list_common.py

Lines changed: 1 addition & 6 deletions
```diff
@@ -281,12 +281,7 @@
         # x_cuda = x.clone().detach().to("cuda").requires_grad_(): Torch not compiled with CUDA enabled
         "test_layer_norm_backwards_eps",
     ),
-    "test_indexing_xpu.py": (
-        # XPU implementation doesn't claimn FP8 now
-        # https://github.com/intel/torch-xpu-ops/issues/461
-        # https://github.com/intel/torch-xpu-ops/issues/1975
-        "float8",
-    ),
+    "test_indexing_xpu.py": None,
     "nn/test_pooling_xpu.py": None,
     "nn/test_dropout_xpu.py": None,
     "test_dataloader_xpu.py": None,
```
