From b6786e31c36b31bb2cc18e2325451a3198832cb8 Mon Sep 17 00:00:00 2001
From: min-jean-cho
Date: Sat, 25 Jan 2025 18:45:24 -0800
Subject: [PATCH] Add aten::_nested_tensor_softmax_with_shape (#1323)

Part of https://github.com/intel/torch-xpu-ops/issues/1141.
Depends on https://github.com/pytorch/pytorch/pull/145467.

- `_nested_tensor_softmax_with_shape`
---
 src/ATen/CMakeLists.txt                   |  2 +-
 .../NestedTensorTransformerFunctions.cpp  | 20 +++++++++++++++++++
 yaml/native/native_functions.yaml         |  5 +++++
 3 files changed, 26 insertions(+), 1 deletion(-)
 create mode 100644 src/ATen/native/nested/NestedTensorTransformerFunctions.cpp

diff --git a/src/ATen/CMakeLists.txt b/src/ATen/CMakeLists.txt
index ddc969d86..22e060111 100644
--- a/src/ATen/CMakeLists.txt
+++ b/src/ATen/CMakeLists.txt
@@ -3,7 +3,7 @@
 file(GLOB xpu_h "xpu/*.h")
 file(GLOB xpu_cpp "xpu/*.cpp")
 file(GLOB xpu_mkl "native/xpu/mkl/*.cpp")
-file(GLOB xpu_native_cpp "native/xpu/*.cpp" "native/sparse/*.cpp" "native/sparse/xpu/*.cpp" "native/nested/xpu/*.cpp" "native/transformers/*.cpp" "native/quantized/*.cpp")
+file(GLOB xpu_native_cpp "native/xpu/*.cpp" "native/sparse/*.cpp" "native/sparse/xpu/*.cpp" "native/nested/*.cpp" "native/nested/xpu/*.cpp" "native/transformers/*.cpp" "native/quantized/*.cpp")
 file(GLOB xpu_sycl "native/xpu/sycl/*.cpp" "native/sparse/xpu/sycl/*.cpp" "native/nested/xpu/sycl/*.cpp" "native/transformers/sycl/*.cpp" "native/quantized/sycl/*.cpp")
 
 list(APPEND ATen_XPU_CPP_SRCS ${xpu_cpp})
diff --git a/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp b/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp
new file mode 100644
index 000000000..31ae5ab39
--- /dev/null
+++ b/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp
@@ -0,0 +1,20 @@
+#include <ATen/ATen.h>
+#include <ATen/native/nested/NestedTensorTransformerFunctions.h>
+
+namespace at::native {
+
+Tensor NestedTensor_softmax_dropout_xpu(
+    const Tensor& self,
+    const Tensor& query) {
+  std::optional<Tensor> attn_mask;
+
+  attn_mask = NestedTensor_to_mask(query, 2, self.size(2));
+  attn_mask = attn_mask->to(query.device(), /*non-blocking=*/true);
+  return _masked_softmax(
+      self,
+      *attn_mask,
+      self.dim() - 1,
+      /*mask type */ 1); // NestedTensor_to_mask produces a BxT mask
+}
+
+} // namespace at::native
\ No newline at end of file
diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml
index 3d64f3eba..07c4f7967 100644
--- a/yaml/native/native_functions.yaml
+++ b/yaml/native/native_functions.yaml
@@ -4437,6 +4437,11 @@
     XPU: nested_from_padded_xpu
   autogen: _nested_from_padded.out
 
+- func: _nested_tensor_softmax_with_shape(Tensor self, Tensor query) -> Tensor
+  dispatch:
+    NestedTensorXPU: NestedTensor_softmax_dropout_xpu
+  tags: nondeterministic_seeded
+
 - func: avg_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
   structured: True
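
Usage sketch (PR note, not part of the patch): a minimal smoke test for the new `NestedTensorXPU` dispatch entry. It assumes an XPU-enabled PyTorch build that includes this patch and pytorch/pytorch#145467, and it uses equal-length sequences so the nested sizes stay regular (the kernel reads `self.size(2)` to build the padding mask). The shapes follow the internal caller, `native_multi_head_attention`: `scores` holds per-sequence attention logits of shape `(heads, T, T)` and `query` supplies the per-sequence lengths.

```python
import torch

device = "xpu"  # assumes an available XPU device

# Two sequences, both of length 3, embedding dim 4; the kernel derives
# the key-padding mask from these per-sequence lengths.
query = torch.nested.nested_tensor(
    [torch.randn(3, 4), torch.randn(3, 4)], device=device
)

# Per-sequence attention logits with 2 heads; softmax runs over the last dim.
scores = torch.nested.nested_tensor(
    [torch.randn(2, 3, 3), torch.randn(2, 3, 3)], device=device
)

out = torch.ops.aten._nested_tensor_softmax_with_shape(scores, query)
print(out.is_nested)  # True: the result stays a nested tensor
```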