
Commit ca790ff

[Feature][XPU] support MTP

1 parent 667dc4a commit ca790ff

File tree

20 files changed: +1031 −494 lines

custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc

Lines changed: 3 additions & 0 deletions

```diff
@@ -23,6 +23,9 @@
 #define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
 #endif
 
+#define GET_OUTPUT_DEBUG
+#define SAVE_WITH_OUTPUT_DEBUG
+
 #include "speculate_msg.h"
 
 void SpeculateSaveWithOutputMsg(const paddle::Tensor& accept_tokens,
```
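The two new defines read like compile-time debug switches: defining GET_OUTPUT_DEBUG and SAVE_WITH_OUTPUT_DEBUG before including speculate_msg.h turns on whatever logging that header guards behind them, for every build. A minimal sketch of the guard pattern this presumably enables (only the macro names come from the diff; the logging body is an assumption):

```cpp
#include <cstdio>

// Hypothetical guard of the kind speculate_msg.h presumably keys off these
// macros; illustrative only, the real header's logging may differ.
#ifdef SAVE_WITH_OUTPUT_DEBUG
#define SAVE_DEBUG_LOG(fmt, ...) \
  std::fprintf(stderr, "[speculate_save] " fmt "\n", ##__VA_ARGS__)
#else
#define SAVE_DEBUG_LOG(fmt, ...) ((void)0)
#endif

int main() {
  SAVE_DEBUG_LOG("accept_num = %d", 3);  // compiled in only when the flag is defined
}
```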

custom_ops/xpu_ops/src/ops/block_attn.cc

Lines changed: 1 addition & 1 deletion

```diff
@@ -623,7 +623,7 @@ std::vector<paddle::Tensor> BlockAttnKernel(
             : quant_v_scale_inv,
         nullptr,          // o_maxptr
         param.head_dim);  // vo_head_dim
-    PD_CHECK(0, "speculative_attention unimplemented");
+    // PD_CHECK(0, "speculative_attention unimplemented");
     PD_CHECK(ret == api::SUCCESS,
              "xfa::speculative_attention_decoder failed.");
     if (!Eq_len) {
```
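Note: this one-line change is what turns the speculative-attention decode path on for XPU. The unconditional PD_CHECK(0, ...) aborted every call after xfa::speculative_attention_decoder returned; the remaining PD_CHECK(ret == api::SUCCESS, ...) still surfaces genuine kernel failures.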
Lines changed: 111 additions & 73 deletions
```diff
@@ -1,4 +1,4 @@
-// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,97 +12,135 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <paddle/phi/backends/xpu/xpu_context.h>
+#include <xft/xdnn_plugin.h>
 #include "paddle/extension.h"
 #include "xpu/plugin.h"
-#include <paddle/phi/backends/xpu/xpu_context.h>
-std::vector<paddle::Tensor>
-GatherNextToken(const paddle::Tensor &tmp_out,     // [token_num, dim_embed]
-                const paddle::Tensor &cum_offsets, // [bsz, 1]
-                const paddle::Tensor &encoder_seq_lod,
-                const paddle::Tensor &encoder_batch_map,
-                const paddle::Tensor &decoder_batch_map,
-                const paddle::Tensor &encoder_seq_lod_cpu,
-                const paddle::Tensor &encoder_batch_map_cpu,
-                const paddle::Tensor &decoder_batch_map_cpu,
-                const paddle::Tensor &enc_batch_tensor,
-                const paddle::Tensor &dec_batch_tensor,
-                const paddle::optional<paddle::Tensor> &output_padding_offset,
-                int max_input_length) {
-  phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
-  auto dev_ctx =
-      paddle::experimental::DeviceContextPool::Instance().Get(place);
-  auto xpu_ctx = static_cast<const phi::XPUContext *>(dev_ctx);
-  using XPUType =
-      typename XPUTypeTrait<bfloat16>::Type; // only support bfloat16
-  typedef paddle::bfloat16 data_t;
-  const int dim = tmp_out.dims()[1];
-  const int bsz = cum_offsets.shape()[0];
-  int enc_batch = enc_batch_tensor.data<int32_t>()[0];
-  int dec_batch = dec_batch_tensor.data<int32_t>()[0];
+std::vector<paddle::Tensor> GatherNextToken(
+    const paddle::Tensor& x,            // [token_num, dim_embed]
+    const paddle::Tensor& cum_offsets,  // [bsz, 1]
+    const paddle::Tensor& encoder_seq_lod,
+    const paddle::Tensor& encoder_batch_map,
+    const paddle::Tensor& decoder_batch_map,
+    const paddle::Tensor& encoder_seq_lod_cpu,
+    const paddle::Tensor& encoder_batch_map_cpu,
+    const paddle::Tensor& decoder_batch_map_cpu,
+    const paddle::Tensor& len_info_cpu,
+    const paddle::optional<paddle::Tensor>& output_padding_offset) {
+  phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
+  auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
+  auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
+  using XPUType =
+      typename XPUTypeTrait<bfloat16>::Type;  // only support bfloat16
+  typedef paddle::bfloat16 data_t;
+  const int dim = x.dims()[1];
+  const int token_num = x.shape()[0];
+  const int bsz = cum_offsets.shape()[0];
+  int enc_batch = len_info_cpu.data<int32_t>()[0];
+  int dec_batch = len_info_cpu.data<int32_t>()[1];
 
-  baidu::xpu::api::VectorParam<int32_t> encoder_seqs_lods_vp{
-      const_cast<int32_t *>(encoder_seq_lod_cpu.data<int32_t>()),
-      enc_batch + 1, const_cast<int32_t *>(encoder_seq_lod.data<int32_t>())};
-  baidu::xpu::api::VectorParam<int32_t> encoder_batch_map_vp{
-      const_cast<int32_t *>(encoder_batch_map_cpu.data<int32_t>()), enc_batch,
-      const_cast<int32_t *>(encoder_batch_map.data<int32_t>())};
-  baidu::xpu::api::VectorParam<int32_t> decoder_batch_map_vp{
-      const_cast<int32_t *>(decoder_batch_map_cpu.data<int32_t>()), dec_batch,
-      const_cast<int32_t *>(decoder_batch_map.data<int32_t>())};
+  baidu::xpu::api::VectorParam<int32_t> encoder_seqs_lods_vp{
+      const_cast<int32_t*>(encoder_seq_lod_cpu.data<int32_t>()),
+      enc_batch + 1,
+      const_cast<int32_t*>(encoder_seq_lod.data<int32_t>())};
+  baidu::xpu::api::VectorParam<int32_t> encoder_batch_map_vp{
+      const_cast<int32_t*>(encoder_batch_map_cpu.data<int32_t>()),
+      enc_batch,
+      const_cast<int32_t*>(encoder_batch_map.data<int32_t>())};
+  baidu::xpu::api::VectorParam<int32_t> decoder_batch_map_vp{
+      const_cast<int32_t*>(decoder_batch_map_cpu.data<int32_t>()),
+      dec_batch,
+      const_cast<int32_t*>(decoder_batch_map.data<int32_t>())};
 
-  auto out = paddle::full({bsz, dim}, -2, tmp_out.type(), tmp_out.place());
+  paddle::Tensor out;
+  std::vector<int> encode_iota_lod_cpu(enc_batch);
+  if (output_padding_offset) {
+    int need_delete_token_num = 0;
+    if (enc_batch > 0) {
+      need_delete_token_num =
+          encoder_seq_lod_cpu.data<int32_t>()[enc_batch] - enc_batch;
+      std::iota(encode_iota_lod_cpu.begin(), encode_iota_lod_cpu.end(), 0);
+      encoder_batch_map_vp.cpu =
+          const_cast<const int32_t*>(encode_iota_lod_cpu.data());
+      encoder_batch_map_vp.len = enc_batch;
+      encoder_batch_map_vp.xpu = nullptr;
+    }
+    out = paddle::empty(
+        {token_num - need_delete_token_num, dim}, x.type(), x.place());
+  } else {
+    out = paddle::empty({bsz, dim}, x.type(), x.place());
+  }
+  if (x.shape()[0] == 0) {
+    return {out};
+  }
+
+  if (output_padding_offset && enc_batch <= 0) {
+    out = x.copy_to(x.place(), false);
+  } else {
     int r = baidu::xpu::api::plugin::eb_gather_next_token<XPUType, XPUType>(
         xpu_ctx->x_context(),
-        reinterpret_cast<const XPUType *>(tmp_out.data<data_t>()),
-        reinterpret_cast<XPUType *>(out.data<data_t>()), encoder_seqs_lods_vp,
-        encoder_batch_map_vp, decoder_batch_map_vp, dim);
-  return {out};
+        reinterpret_cast<const XPUType*>(x.data<data_t>()),
+        reinterpret_cast<XPUType*>(out.data<data_t>()),
+        encoder_seqs_lods_vp,
+        encoder_batch_map_vp,
+        decoder_batch_map_vp,
+        dim);
+    PD_CHECK(r == 0, "xpu::plugin::gather_next_token failed.");
+  }
+  return {out};
 }
 
 std::vector<std::vector<int64_t>> GatherNextTokenInferShape(
-    const std::vector<int64_t> &tmp_out_shape,
-    const std::vector<int64_t> &cum_offsets_shape,
-    const std::vector<int64_t> &encoder_seq_lod_shape,
-    const std::vector<int64_t> &encoder_batch_map_shape,
-    const std::vector<int64_t> &decoder_batch_map_shape,
-    const std::vector<int64_t> &encoder_seq_lod_cpu_shape,
-    const std::vector<int64_t> &encoder_batch_map_cpu_shape,
-    const std::vector<int64_t> &decoder_batch_map_cpu_shape,
-    const std::vector<int64_t> &enc_batch_tensor_shape,
-    const std::vector<int64_t> &dec_batch_tensor_shape,
-    const paddle::optional<std::vector<int64_t>> &output_padding_offset_shape) {
-  if (output_padding_offset_shape) {
-    PD_THROW("speculative decoding is not supported in XPU.");
-  }
+    const std::vector<int64_t>& x_shape,
+    const std::vector<int64_t>& cum_offsets_shape,
+    const std::vector<int64_t>& encoder_seq_lod_shape,
+    const std::vector<int64_t>& encoder_batch_map_shape,
+    const std::vector<int64_t>& decoder_batch_map_shape,
+    const std::vector<int64_t>& encoder_seq_lod_cpu_shape,
+    const std::vector<int64_t>& encoder_batch_map_cpu_shape,
+    const std::vector<int64_t>& decoder_batch_map_cpu_shape,
+    const std::vector<int64_t>& len_info_cpu_shape,
+    const paddle::optional<std::vector<int64_t>>& output_padding_offset_shape) {
+  // if (output_padding_offset_shape) {
+  //   PD_THROW("speculative decoding is not supported in XPU.");
+  // }
+  int64_t bsz = cum_offsets_shape[0];
+  int64_t dim_embed = x_shape[1];
+  if (output_padding_offset_shape) {
+    return {{-1, dim_embed}};
+  } else {
     int64_t bsz = cum_offsets_shape[0];
-  int64_t dim_embed = tmp_out_shape[1];
     return {{bsz, dim_embed}};
+  }
 }
 
 std::vector<paddle::DataType> GatherNextTokenInferDtype(
-    const paddle::DataType &tmp_out_dtype,
-    const paddle::DataType &cum_offsets_dtype,
-    const paddle::DataType &encoder_seq_lod_dtype,
-    const paddle::DataType &encoder_batch_map_dtype,
-    const paddle::DataType &decoder_batch_map_dtype,
-    const paddle::DataType &encoder_seq_lod_cpu_dtype,
-    const paddle::DataType &encoder_batch_map_cpu_dtype,
-    const paddle::DataType &decoder_batch_map_cpu_dtype,
-    const paddle::DataType &enc_batch_tensor_dtype,
-    const paddle::DataType &dec_batch_tensor_dtype,
-    const paddle::optional<paddle::DataType> &output_padding_offset_dtype) {
-  return {tmp_out_dtype};
+    const paddle::DataType& x_dtype,
+    const paddle::DataType& cum_offsets_dtype,
+    const paddle::DataType& encoder_seq_lod_dtype,
+    const paddle::DataType& encoder_batch_map_dtype,
+    const paddle::DataType& decoder_batch_map_dtype,
+    const paddle::DataType& encoder_seq_lod_cpu_dtype,
+    const paddle::DataType& encoder_batch_map_cpu_dtype,
+    const paddle::DataType& decoder_batch_map_cpu_dtype,
+    const paddle::DataType& len_info_cpu_dtype,
+    const paddle::optional<paddle::DataType>& output_padding_offset_dtype) {
+  return {x_dtype};
 }
 
 PD_BUILD_OP(gather_next_token)
-    .Inputs({"tmp_out", "cum_offsets", "encoder_seq_lod", "encoder_batch_map",
-             "decoder_batch_map", "encoder_seq_lod_cpu",
-             "encoder_batch_map_cpu", "decoder_batch_map_cpu",
-             "enc_batch_tensor", "dec_batch_tensor",
+    .Inputs({"x",
+             "cum_offsets",
+             "encoder_seq_lod",
+             "encoder_batch_map",
+             "decoder_batch_map",
+             "encoder_seq_lod_cpu",
+             "encoder_batch_map_cpu",
+             "decoder_batch_map_cpu",
+             "len_info_cpu",
              paddle::Optional("output_padding_offset")})
     .Outputs({"out"})
-    .Attrs({"max_input_length: int"})
     .SetKernelFn(PD_KERNEL(GatherNextToken))
     .SetInferShapeFn(PD_INFER_SHAPE(GatherNextTokenInferShape))
-    .SetInferDtypeFn(PD_INFER_DTYPE(GatherNextTokenInferDtype));
+    .SetInferDtypeFn(PD_INFER_DTYPE(GatherNextTokenInferDtype));
```
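Two things change in this kernel besides formatting. First, the per-batch counts now arrive packed in one host tensor: enc_batch = len_info_cpu[0] and dec_batch = len_info_cpu[1], replacing the separate enc_batch_tensor/dec_batch_tensor inputs and the unused max_input_length attribute. Second, the output_padding_offset path, which InferShape previously rejected with PD_THROW, is now the speculative-decoding path: each encoder sequence keeps one gathered token, so lod[enc_batch] - enc_batch rows are dropped from the output and the encoder batch map is rewritten to an iota over 0..enc_batch-1. A standalone sketch of that sizing arithmetic (the lod values are made up, and it assumes each decoder sequence contributes one token here):

```cpp
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  // encoder_seq_lod_cpu for enc_batch = 2 prefill sequences of lengths 3 and 4:
  std::vector<int> encoder_seq_lod_cpu = {0, 3, 7};
  int enc_batch = 2;
  int dec_batch = 3;                                       // one token per decode step
  int token_num = encoder_seq_lod_cpu[enc_batch] + dec_batch;  // 10 packed rows in x

  // Drop all but the last position of every encoder sequence:
  int need_delete_token_num =
      encoder_seq_lod_cpu[enc_batch] - enc_batch;          // 7 - 2 = 5
  int out_rows = token_num - need_delete_token_num;        // 10 - 5 = 5

  // The rewritten batch map sends encoder outputs to slots 0..enc_batch-1:
  std::vector<int> encode_iota_lod_cpu(enc_batch);
  std::iota(encode_iota_lod_cpu.begin(), encode_iota_lod_cpu.end(), 0);  // {0, 1}

  std::printf("out shape: {%d, dim}\n", out_rows);
  return 0;
}
```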

custom_ops/xpu_ops/src/ops/mtp/draft_model_preprocess_v2.cc

Lines changed: 5 additions & 1 deletion
```diff
@@ -17,6 +17,10 @@
 #include "paddle/phi/core/enforce.h"
 #include "xpu/plugin.h"
 
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
+
 namespace api = baidu::xpu::api;
 void DraftModelPreprocessV2(const paddle::Tensor& draft_tokens,
                             const paddle::Tensor& input_ids,
@@ -99,7 +103,7 @@ void DraftModelPreprocessV2(const paddle::Tensor& draft_tokens,
   not_need_stop_data[0] = not_need_stop_cpu.data<bool>()[0];
 }
 
-PD_BUILD_OP(draft_model_preprocess_v2)
+PD_BUILD_STATIC_OP(draft_model_preprocess_v2)
     .Inputs({"draft_tokens",
              "input_ids",
              "stop_flags",
```
Lines changed: 131 additions & 0 deletions (new file)

```cpp
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <paddle/phi/backends/xpu/xpu_context.h>
#include "paddle/extension.h"
#include "xpu/plugin.h"

#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif

std::vector<paddle::Tensor> SpeculateGetPaddingOffsetV2(
    const paddle::Tensor& input_ids,
    const paddle::Tensor& draft_tokens,
    const paddle::Tensor& cum_offsets,
    const paddle::Tensor& token_num,
    const paddle::Tensor& seq_len,
    const paddle::Tensor& seq_lens_encoder) {
  phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
  auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
  auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);

  std::vector<int64_t> input_ids_shape = input_ids.shape();
  const int bsz = seq_len.shape()[0];
  const int seq_length = input_ids_shape[1];
  const int max_draft_tokens = draft_tokens.shape()[1];
  auto cum_offsets_out = cum_offsets.copy_to(cum_offsets.place(), false);
  auto cpu_token_num = token_num.copy_to(paddle::CPUPlace(), false);

  const int token_num_data = cpu_token_num.data<int64_t>()[0];
  auto x_remove_padding = paddle::empty(
      {token_num_data}, paddle::DataType::INT64, input_ids.place());
  auto padding_offset = paddle::empty(
      {token_num_data}, paddle::DataType::INT32, input_ids.place());
  auto batch_id_per_token = paddle::empty(
      {token_num_data}, paddle::DataType::INT32, input_ids.place());
  auto cu_seqlens_q =
      paddle::empty({bsz + 1}, paddle::DataType::INT32, input_ids.place());
  auto cu_seqlens_k =
      paddle::empty({bsz + 1}, paddle::DataType::INT32, input_ids.place());

  PD_CHECK(input_ids.is_contiguous(), "Input ids tensor must be contiguous");
  PD_CHECK(draft_tokens.is_contiguous(),
           "Draft tokens tensor must be contiguous");
  PD_CHECK(cum_offsets.is_contiguous(),
           "Cum offsets tensor must be contiguous");
  PD_CHECK(seq_len.is_contiguous(), "Seq lens tensor must be contiguous");

  int r = baidu::xpu::api::plugin::speculate_get_padding_offset_v2(
      xpu_ctx->x_context(),
      batch_id_per_token.data<int>(),
      cum_offsets_out.data<int>(),
      cu_seqlens_q.data<int>(),
      cu_seqlens_k.data<int>(),
      cum_offsets.data<int>(),
      seq_len.data<int>(),
      seq_length,
      bsz);
  PD_CHECK(r == 0, "XPU speculate_get_padding_offset_v2 failed");

  r = baidu::xpu::api::plugin::speculate_remove_padding<int64_t>(
      xpu_ctx->x_context(),
      x_remove_padding.data<int64_t>(),
      input_ids.data<int64_t>(),
      draft_tokens.data<int64_t>(),
      seq_len.data<int>(),
      seq_lens_encoder.data<int>(),
      cum_offsets_out.data<int>(),
      seq_length,
      max_draft_tokens,
      bsz,
      token_num_data);
  PD_CHECK(r == 0, "XPU speculate_remove_padding failed");

  return {x_remove_padding,
          batch_id_per_token,
          cu_seqlens_q,
          cu_seqlens_k};  // , enc_token_num, dec_token_num};
}

std::vector<std::vector<int64_t>> SpeculateGetPaddingOffsetV2InferShape(
    const std::vector<int64_t>& input_ids_shape,
    const std::vector<int64_t>& draft_tokens_shape,
    const std::vector<int64_t>& cum_offsets_shape,
    const std::vector<int64_t>& token_num_shape,
    const std::vector<int64_t>& seq_len_shape,
    const std::vector<int64_t>& seq_lens_encoder_shape) {
  int64_t bsz = seq_len_shape[0];
  int64_t seq_len = input_ids_shape[1];
  return {{-1}, {bsz}, {-1}, {bsz + 1}, {bsz + 1}};
}

std::vector<paddle::DataType> SpeculateGetPaddingOffsetV2InferDtype(
    const paddle::DataType& input_ids_dtype,
    const paddle::DataType& draft_tokens_dtype,
    const paddle::DataType& cum_offsets_dtype,
    const paddle::DataType& token_num_dtype,
    const paddle::DataType& seq_len_dtype,
    const paddle::DataType& seq_lens_encoder_dtype) {
  return {input_ids_dtype,
          seq_len_dtype,
          seq_len_dtype,
          seq_len_dtype,
          seq_len_dtype};
}

PD_BUILD_STATIC_OP(speculate_get_padding_offset_v2)
    .Inputs({"input_ids",
             "draft_tokens",
             "cum_offsets",
             "token_num",
             "seq_len",
             "seq_lens_encoder"})
    .Outputs({"x_remove_padding",
              "batch_id_per_token",
              "cu_seqlens_q",
              "cu_seqlens_k"})
    .SetKernelFn(PD_KERNEL(SpeculateGetPaddingOffsetV2))
    .SetInferShapeFn(PD_INFER_SHAPE(SpeculateGetPaddingOffsetV2InferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(SpeculateGetPaddingOffsetV2InferDtype));
```
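This op flattens the padded [bsz, seq_length] input_ids (plus [bsz, max_draft_tokens] draft_tokens for in-decode requests) into a dense [token_num] buffer and emits the cu_seqlens_q/cu_seqlens_k prefix sums that attention kernels consume. Note that the infer-shape and infer-dtype functions still return five entries while only four outputs are registered, apparently a leftover of the commented-out enc_token_num/dec_token_num in the kernel's return. A rough CPU reference for what speculate_remove_padding presumably computes, inferred from its argument list rather than from the XPU plugin source:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// CPU sketch of speculate_remove_padding's presumed contract: for each batch
// entry, copy seq_len[i] tokens into a dense buffer, taking them from
// input_ids while the request is in the encoder (prefill) phase and from
// draft_tokens otherwise. Assumption inferred from the kernel's arguments;
// the real XPU plugin may differ in details.
std::vector<int64_t> remove_padding_ref(const std::vector<int64_t>& input_ids,
                                        const std::vector<int64_t>& draft_tokens,
                                        const std::vector<int>& seq_len,
                                        const std::vector<int>& seq_lens_encoder,
                                        int seq_length, int max_draft_tokens) {
  std::vector<int64_t> out;
  for (std::size_t i = 0; i < seq_len.size(); ++i) {
    const bool is_encoder = seq_lens_encoder[i] > 0;  // prefill vs. decode
    const int64_t* src = is_encoder ? &input_ids[i * seq_length]
                                    : &draft_tokens[i * max_draft_tokens];
    out.insert(out.end(), src, src + seq_len[i]);  // drop the padding tail
  }
  return out;
}
```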

custom_ops/xpu_ops/src/ops/mtp/speculate_save_output.cc

Lines changed: 1 addition & 2 deletions
```diff
@@ -35,8 +35,7 @@ void SpeculateSaveWithOutputMsg(const paddle::Tensor& accept_tokens,
                                 const paddle::Tensor& not_need_stop,
                                 int64_t rank_id,
                                 int msg_queue_id,
-                                int save_each_rank) {
-  // printf("enter save output");
+                                bool save_each_rank) {
   if (!save_each_rank && rank_id > 0) {
     return;
   }
```
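Note: save_each_rank becomes a real bool rather than an int, matching how the guard below it uses the value, and a stale commented-out debug printf is dropped. With save_each_rank false, only rank 0 writes the accepted tokens to the message queue.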
