Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions custom_ops/xpu_ops/setup_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,8 @@ def xpu_setup_ops():
if file.endswith(".cc"):
ops.append(os.path.join(root, file))

print(ops)

include_dirs = [
os.path.join(base_dir, "./"),
os.path.join(base_dir, "./plugin/include"),
Expand Down
144 changes: 144 additions & 0 deletions custom_ops/xpu_ops/src/ops/mtp/draft_model_preprocess_v2.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <memory>

#include <paddle/phi/backends/xpu/xpu_context.h>

#include "paddle/extension.h"
#include "paddle/phi/core/enforce.h"
#include "xpu/plugin.h"

namespace api = baidu::xpu::api;
// Host-side wrapper for the XPU draft-model preprocess (v2) plugin kernel
// used in multi-token-prediction (MTP) speculative decoding.
//
// Dispatches baidu::xpu::api::plugin::draft_model_preprocess_v2 on the
// current XPU (or a CPU context when the inputs live on CPU), then copies
// the updated `not_need_stop` flag back to its original (host) tensor.
// Most tensor arguments are updated in place by the kernel; see the
// SetInplaceMap in the corresponding PD_BUILD_OP registration.
void DraftModelPreprocessV2(const paddle::Tensor& draft_tokens,
                            const paddle::Tensor& input_ids,
                            const paddle::Tensor& stop_flags,
                            const paddle::Tensor& seq_lens_this_time,
                            const paddle::Tensor& seq_lens_encoder,
                            const paddle::Tensor& seq_lens_decoder,
                            const paddle::Tensor& step_idx,
                            const paddle::Tensor& not_need_stop,
                            const paddle::Tensor& is_block_step,
                            const paddle::Tensor& batch_drop,
                            const paddle::Tensor& pre_ids,
                            const paddle::Tensor& accept_tokens,
                            const paddle::Tensor& accept_num,
                            const paddle::Tensor& base_model_seq_lens_this_time,
                            const paddle::Tensor& base_model_seq_lens_encoder,
                            const paddle::Tensor& base_model_seq_lens_decoder,
                            const paddle::Tensor& base_model_step_idx,
                            const paddle::Tensor& base_model_stop_flags,
                            const paddle::Tensor& base_model_is_block_step,
                            const paddle::Tensor& base_model_draft_tokens,
                            const int num_model_step,
                            const bool truncate_first_token,
                            const bool splitwise_prefill,
                            const bool kvcache_scheduler_v1) {
  phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
  auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
  api::Context* ctx = static_cast<const phi::XPUContext*>(dev_ctx)->x_context();
  // CPU fallback: own the temporary context so it is released on every
  // return path (the previous bare `new` leaked it on each call).
  std::unique_ptr<api::Context> cpu_ctx;
  if (draft_tokens.is_cpu()) {
    cpu_ctx = std::make_unique<api::Context>(api::kCPU);
    ctx = cpu_ctx.get();
  }
  // Runtime batch size comes from the leading dim of seq_lens_this_time;
  // the remaining lengths are the padded trailing dims of their tensors.
  int real_bsz = seq_lens_this_time.shape()[0];
  int accept_tokens_len = accept_tokens.shape()[1];
  int input_ids_len = input_ids.shape()[1];
  int draft_tokens_len = draft_tokens.shape()[1];
  int pre_ids_len = pre_ids.shape()[1];
  int base_model_draft_tokens_len = base_model_draft_tokens.shape()[1];
  // not_need_stop lives on the host; stage a device-side copy the kernel
  // can write, then copy the result back after the launch.
  auto not_need_stop_gpu =
      not_need_stop.copy_to(seq_lens_this_time.place(), false);

  int r = baidu::xpu::api::plugin::draft_model_preprocess_v2(
      ctx,
      const_cast<int64_t*>(draft_tokens.data<int64_t>()),
      const_cast<int64_t*>(input_ids.data<int64_t>()),
      const_cast<bool*>(stop_flags.data<bool>()),
      const_cast<int*>(seq_lens_this_time.data<int>()),
      const_cast<int*>(seq_lens_encoder.data<int>()),
      const_cast<int*>(seq_lens_decoder.data<int>()),
      const_cast<int64_t*>(step_idx.data<int64_t>()),
      const_cast<bool*>(not_need_stop_gpu.data<bool>()),
      const_cast<bool*>(is_block_step.data<bool>()),
      const_cast<bool*>(batch_drop.data<bool>()),
      const_cast<int64_t*>(pre_ids.data<int64_t>()),
      accept_tokens.data<int64_t>(),
      accept_num.data<int>(),
      base_model_seq_lens_this_time.data<int>(),
      base_model_seq_lens_encoder.data<int>(),
      base_model_seq_lens_decoder.data<int>(),
      base_model_step_idx.data<int64_t>(),
      base_model_stop_flags.data<bool>(),
      base_model_is_block_step.data<bool>(),
      const_cast<int64_t*>(base_model_draft_tokens.data<int64_t>()),
      real_bsz,
      num_model_step,
      accept_tokens_len,
      draft_tokens_len,
      input_ids_len,
      base_model_draft_tokens_len,
      pre_ids_len,
      truncate_first_token,
      splitwise_prefill,
      kvcache_scheduler_v1);

  // Fixed message: this is the v2 entry point (was "draft_model_preprocess").
  PD_CHECK(r == 0, "xpu::plugin::draft_model_preprocess_v2 failed.");
  // Propagate the kernel's verdict back into the caller-visible host tensor.
  auto not_need_stop_cpu =
      not_need_stop_gpu.copy_to(not_need_stop.place(), false);
  bool* not_need_stop_data = const_cast<bool*>(not_need_stop.data<bool>());
  not_need_stop_data[0] = not_need_stop_cpu.data<bool>()[0];
}

// Paddle custom-op registration for draft_model_preprocess_v2.
// Inputs/Outputs are paired through SetInplaceMap: each listed "*_out"
// aliases its input tensor, so the kernel mutates those tensors in place
// and no new storage is allocated for them.
PD_BUILD_OP(draft_model_preprocess_v2)
    .Inputs({"draft_tokens",
             "input_ids",
             "stop_flags",
             "seq_lens_this_time",
             "seq_lens_encoder",
             "seq_lens_decoder",
             "step_idx",
             "not_need_stop",
             "is_block_step",
             "batch_drop",
             "pre_ids",
             "accept_tokens",
             "accept_num",
             "base_model_seq_lens_this_time",
             "base_model_seq_lens_encoder",
             "base_model_seq_lens_decoder",
             "base_model_step_idx",
             "base_model_stop_flags",
             "base_model_is_block_step",
             "base_model_draft_tokens"})
    // Note: is_block_step and base_model_draft_tokens are also written by the
    // kernel but are not declared as outputs here.
    .Outputs({"draft_tokens_out",
              "input_ids_out",
              "stop_flags_out",
              "seq_lens_this_time_out",
              "seq_lens_encoder_out",
              "seq_lens_decoder_out",
              "step_idx_out",
              "not_need_stop_out",
              "batch_drop_out",
              "pre_ids_out"})
    // Compile-time attributes; they arrive as trailing scalar arguments of
    // the kernel function, in this declared order.
    .Attrs({"num_model_step: int", "truncate_first_token: bool", "splitwise_prefill: bool", "kvcache_scheduler_v1: bool"})
    .SetInplaceMap({{"draft_tokens", "draft_tokens_out"},
                    {"input_ids", "input_ids_out"},
                    {"stop_flags", "stop_flags_out"},
                    {"seq_lens_this_time", "seq_lens_this_time_out"},
                    {"seq_lens_encoder", "seq_lens_encoder_out"},
                    {"seq_lens_decoder", "seq_lens_decoder_out"},
                    {"step_idx", "step_idx_out"},
                    {"not_need_stop", "not_need_stop_out"},
                    {"batch_drop", "batch_drop_out"},
                    {"pre_ids", "pre_ids_out"}})
    .SetKernelFn(PD_KERNEL(DraftModelPreprocessV2));
20 changes: 14 additions & 6 deletions custom_ops/xpu_ops/src/ops/mtp/speculate_verify.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ void SpeculateVerify(const paddle::Tensor &accept_tokens,
const paddle::Tensor &topp,
int max_seq_len,
int verify_window,
bool enable_topp) {
bool enable_topp,
bool benchmark_mode) {
auto bsz = accept_tokens.shape()[0];
int real_bsz = seq_lens_this_time.shape()[0];
auto max_draft_tokens = draft_tokens.shape()[1];
Expand Down Expand Up @@ -133,7 +134,8 @@ void SpeculateVerify(const paddle::Tensor &accept_tokens,
max_seq_len,
max_candidate_len,
verify_window,
prefill_one_step_stop);
prefill_one_step_stop,
benchmark_mode);
} else {
baidu::xpu::api::plugin::speculate_verify<false, true>(
ctx,
Expand Down Expand Up @@ -161,7 +163,8 @@ void SpeculateVerify(const paddle::Tensor &accept_tokens,
max_seq_len,
max_candidate_len,
verify_window,
prefill_one_step_stop);
prefill_one_step_stop,
benchmark_mode);
}
} else {
if (enable_topp) {
Expand Down Expand Up @@ -191,7 +194,8 @@ void SpeculateVerify(const paddle::Tensor &accept_tokens,
max_seq_len,
max_candidate_len,
verify_window,
prefill_one_step_stop);
prefill_one_step_stop,
benchmark_mode);
} else {
baidu::xpu::api::plugin::speculate_verify<false, false>(
ctx,
Expand Down Expand Up @@ -219,7 +223,8 @@ void SpeculateVerify(const paddle::Tensor &accept_tokens,
max_seq_len,
max_candidate_len,
verify_window,
prefill_one_step_stop);
prefill_one_step_stop,
benchmark_mode);
}
}
}
Expand All @@ -246,7 +251,10 @@ PD_BUILD_STATIC_OP(speculate_verify)
"accept_num_out",
"step_idx_out",
"stop_flags_out"})
.Attrs({"max_seq_len: int", "verify_window: int", "enable_topp: bool"})
.Attrs({"max_seq_len: int",
"verify_window: int",
"enable_topp: bool",
"benchmark_mode: bool"})
.SetInplaceMap({{"accept_tokens", "accept_tokens_out"},
{"accept_num", "accept_num_out"},
{"step_idx", "step_idx_out"},
Expand Down
4 changes: 3 additions & 1 deletion custom_ops/xpu_ops/src/ops/pybind/pybind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,8 @@ void SpeculateVerify(const paddle::Tensor& accept_tokens,
const paddle::Tensor& topp,
int max_seq_len,
int verify_window,
bool enable_topp);
bool enable_topp,
bool benchmark_mode);

void SpeculateClearAcceptNums(const paddle::Tensor& accept_num,
const paddle::Tensor& seq_lens_decoder);
Expand Down Expand Up @@ -552,6 +553,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
py::arg("max_seq_len"),
py::arg("verify_window"),
py::arg("enable_topp"),
py::arg("benchmark_mode"),
"Perform speculative verification for decoding");

m.def("speculate_clear_accept_nums",
Expand Down
35 changes: 34 additions & 1 deletion custom_ops/xpu_ops/src/plugin/include/xpu/plugin.h
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,8 @@ DLL_EXPORT int speculate_verify(Context* ctx,
const int max_seq_len,
const int max_candidate_len,
const int verify_window,
const bool prefill_one_step_stop);
const bool prefill_one_step_stop,
const bool benchmark_mode);

DLL_EXPORT int speculate_clear_accept_nums(Context* ctx,
int* accept_num,
Expand Down Expand Up @@ -457,6 +458,38 @@ DLL_EXPORT int rebuild_self_hidden_states(api::Context* ctx,
T* output,
int dim_embed,
int elem_cnt);
// Draft-model preprocess (v2) for MTP speculative decoding.
// Non-const pointers are updated in place; const pointers are read-only
// state from the base (target) model. The trailing `*_len` ints are the
// padded trailing dimensions of the corresponding [bsz, len] buffers.
// Returns 0 on success, non-zero on failure (checked by the caller).
// NOTE(review): exact per-flag semantics (truncate_first_token,
// splitwise_prefill, kvcache_scheduler_v1) live in the kernel
// implementation — confirm there before relying on them.
DLL_EXPORT int draft_model_preprocess_v2(api::Context* ctx,
                                         int64_t* draft_tokens,
                                         int64_t* input_ids,
                                         bool* stop_flags,
                                         int* seq_lens_this_time,
                                         int* seq_lens_encoder,
                                         int* seq_lens_decoder,
                                         int64_t* step_idx,
                                         bool* not_need_stop,
                                         bool* is_block_step,
                                         bool* batch_drop,
                                         int64_t* pre_ids,
                                         const int64_t* accept_tokens,
                                         const int* accept_num,
                                         const int* base_model_seq_lens_this_time,
                                         const int* base_model_seq_lens_encoder,
                                         const int* base_model_seq_lens_decoder,
                                         const int64_t* base_model_step_idx,
                                         const bool* base_model_stop_flags,
                                         const bool* base_model_is_block_step,
                                         int64_t* base_model_draft_tokens,
                                         const int bsz,
                                         const int num_model_step,
                                         const int accept_tokens_len,
                                         const int draft_tokens_len,
                                         const int input_ids_len,
                                         const int base_model_draft_tokens_len,
                                         const int pre_ids_len,
                                         const bool truncate_first_token,
                                         const bool splitwise_prefill,
                                         const bool kvcache_scheduler_v1);

/*--------------------------------------- MTP end --------------------------------------------*/

} // namespace plugin
Expand Down
Loading