diff --git a/1 b/1
new file mode 100644
index 0000000000..92e7e15074
--- /dev/null
+++ b/1
@@ -0,0 +1,209 @@
+WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.
+
+[notice] A new release of pip is available: 24.2 -> 25.2
+[notice] To update, run: python -m pip install --upgrade pip
+XCCL /usr/local/lib/python3.10/dist-packages/paddle/base/../libs/libbkcl.so loaded
+/usr/local/lib/python3.10/dist-packages/paddle/utils/cpp_extension/extension_utils.py:717: UserWarning: No ccache found. Please be aware that recompiling all source files may be required. You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md
+  warnings.warn(warning_message)
+XCCL /usr/local/lib/python3.10/dist-packages/paddle/base/../libs/libbkcl.so loaded
+/usr/local/lib/python3.10/dist-packages/paddle/utils/cpp_extension/extension_utils.py:717: UserWarning: No ccache found. Please be aware that recompiling all source files may be required. You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md
+  warnings.warn(warning_message)
+XCCL /usr/local/lib/python3.10/dist-packages/paddle/base/../libs/libbkcl.so loaded
+/usr/local/lib/python3.10/dist-packages/paddle/utils/cpp_extension/extension_utils.py:717: UserWarning: No ccache found. Please be aware that recompiling all source files may be required. You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md
+  warnings.warn(warning_message)
+XCCL /usr/local/lib/python3.10/dist-packages/paddle/base/../libs/libbkcl.so loaded
+/usr/local/lib/python3.10/dist-packages/paddle/utils/cpp_extension/extension_utils.py:717: UserWarning: No ccache found. Please be aware that recompiling all source files may be required. You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md
+  warnings.warn(warning_message)
+CMake Warning:
+  Manually-specified variables were not used by the project:
+
+    BUILD_STANDALONE
+
+
+/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/speculate_set_value_by_flags.xpu:26:17: note: unsupported dynamic stack alloca
+__global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all,
+                ^
+/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/speculate_set_value_by_flags.xpu:26:17: note: unsupported dynamic stack alloca
+[2025-09-18 08:43:05,095] [ INFO] dist.py:970 - running install
+/usr/local/lib/python3.10/dist-packages/setuptools/_distutils/cmd.py:66: SetuptoolsDeprecationWarning: setup.py install is deprecated.
+!!
+
+        ********************************************************************************
+        Please avoid running ``setup.py`` directly.
+        Instead, use pypa/build, pypa/installer or other
+        standards-based tools.
+
+        See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
+        ********************************************************************************
+
+!!
+  self.initialize_options()
+/usr/local/lib/python3.10/dist-packages/setuptools/_distutils/cmd.py:66: EasyInstallDeprecationWarning: easy_install command is deprecated.
+!!
+
+        ********************************************************************************
+        Please avoid running ``setup.py`` and ``easy_install``.
+        Instead, use pypa/build, pypa/installer or other
+        standards-based tools.
+
+        See https://github.com/pypa/setuptools/issues/917 for details.
+        ********************************************************************************
+
+!!
+  self.initialize_options()
+[2025-09-18 08:43:05,153] [ INFO] easy_install.py:573 - Checking .pth file support in /home/chenhuan09/FD/FastDeploy/custom_ops/tmp/
+[2025-09-18 08:43:05,153] [ INFO] spawn.py:60 - /usr/bin/python -E -c pass
+[2025-09-18 08:43:05,168] [ WARNING] easy_install.py:627 - TEST FAILED: /home/chenhuan09/FD/FastDeploy/custom_ops/tmp/ does NOT support .pth files
+[2025-09-18 08:43:05,168] [ WARNING] easy_install.py:504 - bad install directory or PYTHONPATH
+
+You are attempting to install a package to a directory that is not
+on PYTHONPATH and which Python does not read ".pth" files from. The
+installation directory you specified (via --install-dir, --prefix, or
+the distutils default setting) was:
+
+    /home/chenhuan09/FD/FastDeploy/custom_ops/tmp/
+
+and your PYTHONPATH environment variable currently contains:
+
+    '/opt/source/PaddleNLP/llm/server/server:/opt/source/PaddleNLP'
+
+Here are some of your options for correcting the problem:
+
+* You can choose a different installation directory, i.e., one that is
+  on PYTHONPATH or supports .pth files
+
+* You can add the installation directory to the PYTHONPATH environment
+  variable. (It must then also be on PYTHONPATH whenever you run
+  Python and want to use the package(s) you are installing.)
+
+* You can set up the installation directory to support ".pth" files by
+  using one of the approaches described here:
+
+  https://setuptools.pypa.io/en/latest/deprecated/easy_install.html#custom-installation-locations
+
+
+Please make the appropriate changes for your system and try again.
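[Editor's note: the second option listed in the setuptools message above (putting the install directory on PYTHONPATH) is easy to sanity-check before re-running the build. A minimal sketch, not part of the log; the paths are taken from the log output, while the names INSTALL_DIR and is_on_sys_path are ours for illustration only.]

    import os
    import sys

    # Directory the log complains about ("does NOT support .pth files").
    INSTALL_DIR = "/home/chenhuan09/FD/FastDeploy/custom_ops/tmp/"

    def is_on_sys_path(directory: str) -> bool:
        # Python can import packages from `directory` only if it appears on
        # sys.path (e.g. via the PYTHONPATH environment variable) or is a
        # site directory whose .pth files are processed at startup.
        target = os.path.realpath(directory)
        return any(os.path.realpath(p) == target for p in sys.path if p)

    if __name__ == "__main__":
        print("PYTHONPATH =", os.environ.get("PYTHONPATH", ""))
        print(INSTALL_DIR, "on sys.path:", is_on_sys_path(INSTALL_DIR))
        # The fix the message suggests, expressed as a shell export to run
        # before re-invoking the build (illustrative):
        #   export PYTHONPATH=/home/chenhuan09/FD/FastDeploy/custom_ops/tmp/:$PYTHONPATH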
+[2025-09-18 08:43:05,215] [ INFO] dist.py:970 - running bdist_egg
+[2025-09-18 08:43:05,231] [ INFO] dist.py:970 - running egg_info
+[2025-09-18 08:43:05,231] [ INFO] dir_util.py:58 - creating fastdeploy_ops.egg-info
+[2025-09-18 08:43:05,237] [ INFO] egg_info.py:648 - writing fastdeploy_ops.egg-info/PKG-INFO
+[2025-09-18 08:43:05,237] [ INFO] egg_info.py:282 - writing dependency_links to fastdeploy_ops.egg-info/dependency_links.txt
+[2025-09-18 08:43:05,237] [ INFO] egg_info.py:282 - writing top-level names to fastdeploy_ops.egg-info/top_level.txt
+[2025-09-18 08:43:05,237] [ INFO] util.py:324 - writing manifest file 'fastdeploy_ops.egg-info/SOURCES.txt'
+[2025-09-18 08:43:05,362] [ INFO] util.py:324 - writing manifest file 'fastdeploy_ops.egg-info/SOURCES.txt'
+[2025-09-18 08:43:05,362] [ INFO] bdist_egg.py:162 - installing library code to build/fastdeploy_ops/bdist.linux-x86_64/egg
+[2025-09-18 08:43:05,362] [ INFO] dist.py:970 - running install_lib
+[2025-09-18 08:43:05,362] [ INFO] dist.py:970 - running build_ext
+[2025-09-18 08:43:05,383] [ INFO] build_ext.py:530 - building 'fastdeploy_ops' extension
+[2025-09-18 08:43:05,384] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/adjust_batch.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/adjust_batch.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17
+[2025-09-18 08:43:05,385] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/block_attn.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/block_attn.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17
+[2025-09-18 08:43:05,385] [ INFO] spawn.py:60 - 
x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/device/get_context_gm_max_mem_demand.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/get_context_gm_max_mem_demand.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,386] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/device/get_free_global_memory.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/get_free_global_memory.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,386] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/device/get_total_global_memory.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/get_total_global_memory.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL 
-DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,387] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/device/get_used_global_memory.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/get_used_global_memory.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,388] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/fused_rms_norm.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/fused_rms_norm.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,388] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/gather_next_token.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/gather_next_token.o -D_GLIBCXX_USE_CXX11_ABI=1 
-DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,389] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/get_infer_param.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/get_infer_param.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,390] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/get_output.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/get_output.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,391] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/get_padding_offset.cc -o 
/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/get_padding_offset.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,391] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/get_token_penalty_multi_scores.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/get_token_penalty_multi_scores.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,392] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/moe_ep_combine.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/moe_ep_combine.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,392] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c 
/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/moe_ep_dispatch.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/moe_ep_dispatch.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,393] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/moe_expert_ffn.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/moe_expert_ffn.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,394] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/moe_layer.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/moe_layer.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,395] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 
-I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/moe_redundant_topk_select.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/moe_redundant_topk_select.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,396] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/moe_topk_select.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/moe_topk_select.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,396] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/draft_model_postprocess.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/draft_model_postprocess.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,397] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include 
-I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/draft_model_preprocess.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/draft_model_preprocess.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,397] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/draft_model_preprocess_v2.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/draft_model_preprocess_v2.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,397] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/draft_model_update.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/draft_model_update.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,398] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include 
-I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/eagle_get_hidden_states.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/eagle_get_hidden_states.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,399] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/eagle_get_self_hidden_states.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/eagle_get_self_hidden_states.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,400] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/mtp_save_first_token.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/mtp_save_first_token.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,400] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu 
-I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/mtp_step_paddle.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/mtp_step_paddle.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,401] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_clear_accept_nums.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_clear_accept_nums.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,402] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_get_output.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_get_output.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,402] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ 
-I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_get_output_padding_offset.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_get_output_padding_offset.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,403] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_get_padding_offset.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_get_padding_offset.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,403] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_get_seq_lens_output.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_get_seq_lens_output.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,404] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 
-Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_rebuild_append_padding.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_rebuild_append_padding.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,405] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_save_output.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_save_output.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,405] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_set_stop_value_multi_seqs.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_set_stop_value_multi_seqs.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 
-std=c++17 +[2025-09-18 08:43:05,406] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_set_value_by_flags.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_set_value_by_flags.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,406] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_step_reschedule.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_step_reschedule.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,407] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_token_penalty_multi_scores.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_token_penalty_multi_scores.o 
-D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17
+[2025-09-18 08:43:05,408] [    INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_update_input_ids_cpu.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_update_input_ids_cpu.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17
+[2025-09-18 08:43:05,408 - 08:43:05,458] [    INFO] spawn.py:60 - [26 further identical g++ invocations, same flags and -I paths, differing only in the source/object file, condensed here: speculate_update_v3.cc, speculate_verify.cc, top_p_candidates.cc, open_shm_and_get_meta_signal.cc, pybind/alloc_cache_pinned.cc, pybind/cachekv_signal_thread_worker.cc, pybind/get_peermem_addr.cc, pybind/profiler.cc, pybind/pybind.cc, read_data_ipc.cc, recover_decode_task.cc, remote_cache_kv_ipc.cc, save_with_output_msg.cc, set_data_ipc.cc, set_value_by_flags_and_idx.cc, share_external_data.cc, step.cc, stop_generation_multi_ends.cc, swap_cache_batch.cc, update_inputs.cc, update_inputs_v1.cc, utility/debug.cc, utility/env.cc, utility/logging.cc, weight_only_linear.cc, weight_quantize_xpu.cc]
+[2025-09-18 08:43:18,268] [    INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -shared -Wl,-O1 -Wl,-Bsymbolic-functions [all ~60 object files under build/fastdeploy_ops/lib.linux-x86_64-cpython-310/, including speculate_verify.o, condensed here] /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/build/libxpuplugin.a /usr/local/lib/python3.10/dist-packages/paddle/libs/libbkcl.so /usr/local/lib/python3.10/dist-packages/paddle/libs/libxpucuda.so /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/so/libapiinfer.so /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/so/libxft_blocks.so -L/usr/local/lib/python3.10/dist-packages/paddle/libs -L/usr/local/lib/python3.10/dist-packages/paddle/base -L/usr/lib/x86_64-linux-gnu -Wl,--enable-new-dtags,-rpath,/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/so -Wl,--enable-new-dtags,-rpath,/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/so -Wl,--enable-new-dtags,-rpath,/usr/local/lib/python3.10/dist-packages/paddle/libs -Wl,--enable-new-dtags,-rpath,/usr/local/lib/python3.10/dist-packages/paddle/base -o build/fastdeploy_ops/lib.linux-x86_64-cpython-310/fastdeploy_ops.so -l:libpaddle.so
+[2025-09-18 08:43:20,462] [    INFO] dir_util.py:58 - creating build/fastdeploy_ops/bdist.linux-x86_64/egg
+[2025-09-18 08:43:20,463] [    INFO] file_util.py:130 - copying build/fastdeploy_ops/lib.linux-x86_64-cpython-310/version.txt -> build/fastdeploy_ops/bdist.linux-x86_64/egg
+[2025-09-18 08:43:20,463] [    INFO] file_util.py:130 - copying build/fastdeploy_ops/lib.linux-x86_64-cpython-310/fastdeploy_ops.so -> build/fastdeploy_ops/bdist.linux-x86_64/egg
+[2025-09-18 08:43:20,558] [    INFO] bdist_egg.py:178 - creating stub loader for fastdeploy_ops.so
+Traceback (most recent call last):
+  File "/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/setup_ops.py", line 182, in <module>
+    xpu_setup_ops()
+  File "/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/setup_ops.py", line 163, in xpu_setup_ops
+    setup(
+  File "/usr/local/lib/python3.10/dist-packages/paddle/utils/cpp_extension/cpp_extension.py", line 248, in setup
+    setuptools.setup(**attr)
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/__init__.py", line 117, in setup
+    return distutils.core.setup(**attrs)
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/_distutils/core.py", line 183, in setup
+    return run_commands(dist)
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/_distutils/core.py", line 199, in run_commands
+    dist.run_commands()
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/_distutils/dist.py", line 954, in run_commands
+    self.run_command(cmd)
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/dist.py", line 950, in run_command
+    super().run_command(command)
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/_distutils/dist.py", line 973, in run_command
+    cmd_obj.run()
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/command/install.py", line 97, in run
+    self.do_egg_install()
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/command/install.py", line 149, in do_egg_install
+    self.run_command('bdist_egg')
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/_distutils/cmd.py", line 316, in run_command
+    self.distribution.run_command(command)
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/dist.py", line 950, in run_command
+    super().run_command(command)
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/_distutils/dist.py", line 973, in run_command
+    cmd_obj.run()
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/command/bdist_egg.py", line 180, in run
+    write_stub(os.path.basename(ext_name), pyfile)
+  File "/usr/local/lib/python3.10/dist-packages/paddle/utils/cpp_extension/extension_utils.py", line 229, in custom_write_stub
+    new_custom_ops = load_op_meta_info_and_register_op(so_path)
+  File "/usr/local/lib/python3.10/dist-packages/paddle/utils/cpp_extension/extension_utils.py", line 167, in load_op_meta_info_and_register_op
+    core.load_op_meta_info_and_register_op(lib_filename)
+RuntimeError: (PreconditionNotMet) The third-party dynamic library (/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/fastdeploy_ops.so) that Paddle depends on is not configured correctly. (error code is /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/fastdeploy_ops.so: undefined symbol: _Z15SpeculateVerifyRKN6paddle6TensorES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_iib)
+  Suggestions:
+  1. Check if the third-party dynamic library (e.g. CUDA, CUDNN) is installed correctly and its version is matched with paddlepaddle you installed.
+  2. Configure third-party dynamic library environment variables as follows:
+  - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`
+  - Windows: set PATH by `set PATH=XXX;%PATH%`
+  - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` [Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is impossible unless System Integrity Protection (SIP) is disabled.]
+  (at /home/chenhuan09/Paddle/paddle/phi/backends/dynload/dynamic_loader.cc:351)
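Note: the undefined symbol in the error above is just the mangled C++ name of SpeculateVerify. A few lines of standalone C++ (using the GCC/Clang ABI helper from <cxxabi.h>; the symbol string is copied verbatim from the log) recover the signature the dynamic loader was looking for:

#include <cstdio>
#include <cstdlib>
#include <cxxabi.h>

int main() {
    // Mangled name copied from the RuntimeError above.
    const char *sym =
        "_Z15SpeculateVerifyRKN6paddle6TensorES2_S2_S2_S2_S2_S2_S2_S2_"
        "S2_S2_S2_S2_S2_S2_S2_S2_iib";
    int status = 0;
    // abi::__cxa_demangle returns a malloc()'d human-readable string.
    char *readable = abi::__cxa_demangle(sym, nullptr, nullptr, &status);
    if (status == 0) {
        std::printf("%s\n", readable);
        std::free(readable);
    } else {
        std::printf("demangle failed: %d\n", status);
    }
    return 0;
}

It prints a SpeculateVerify overload taking seventeen const paddle::Tensor& parameters followed by int, int, bool. The freshly linked fastdeploy_ops.so still references that declaration, but none of the objects linked into it (including speculate_verify.o) defines it, which is why load_op_meta_info_and_register_op fails. Most likely the declaration used at the registration/call site and the definition in speculate_verify.cc have drifted apart in their parameter lists.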
+
diff --git a/ch_build.sh b/ch_build.sh
new file mode 100644
index 0000000000..760ced3175
--- /dev/null
+++ b/ch_build.sh
@@ -0,0 +1,6 @@
+bash custom_ops/xpu_ops/download_dependencies.sh develop
+
+export CLANG_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xtdk
+export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xvllm
+
+bash build.sh
\ No newline at end of file
diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc b/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc
index 6d34b9736b..b1c42e04f4 100644
--- a/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc
+++ b/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc
@@ -23,6 +23,9 @@
 #define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
 #endif
 
+#define GET_OUTPUT_DEBUG
+#define SAVE_WITH_OUTPUT_DEBUG
+
 #include "speculate_msg.h"
 
 void SpeculateSaveWithOutputMsg(const paddle::Tensor& accept_tokens,
diff --git a/custom_ops/xpu_ops/setup_ops.py b/custom_ops/xpu_ops/setup_ops.py
index fa31f5a8e3..ed917d6d6f 100755
--- a/custom_ops/xpu_ops/setup_ops.py
+++ b/custom_ops/xpu_ops/setup_ops.py
@@ -152,6 +152,8 @@ def xpu_setup_ops():
             if file.endswith(".cc"):
                 ops.append(os.path.join(root, file))
 
+    print(ops)
+
     include_dirs = [
         os.path.join(base_dir, "./"),
         os.path.join(base_dir, "./plugin/include"),
diff --git a/custom_ops/xpu_ops/src/ops/block_attn.cc b/custom_ops/xpu_ops/src/ops/block_attn.cc
index 72ae247491..abdac19475 100644
--- a/custom_ops/xpu_ops/src/ops/block_attn.cc
+++ b/custom_ops/xpu_ops/src/ops/block_attn.cc
@@ -639,7 +639,7 @@ std::vector<paddle::Tensor> BlockAttnKernel(
                 : quant_v_scale_inv,
             nullptr,          // o_maxptr
             param.head_dim);  // vo_head_dim
-        PD_CHECK(0, "speculative_attention unimplemented");
+        // PD_CHECK(0, "speculative_attention unimplemented");
         PD_CHECK(ret == api::SUCCESS,
                  "xfa::speculative_attention_decoder failed.");
         if (!Eq_len) {
diff --git a/custom_ops/xpu_ops/src/ops/gather_next_token.cc b/custom_ops/xpu_ops/src/ops/gather_next_token.cc
index 8d9aedcee8..2fc9da16bb 100644
--- a/custom_ops/xpu_ops/src/ops/gather_next_token.cc
+++ b/custom_ops/xpu_ops/src/ops/gather_next_token.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,97 +12,135 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <numeric>
+#include <vector>
 #include "paddle/extension.h"
 #include "xpu/plugin.h"
-#include
 
-std::vector<paddle::Tensor>
-GatherNextToken(const paddle::Tensor &tmp_out,     // [token_num, dim_embed]
-                const paddle::Tensor &cum_offsets, // [bsz, 1]
-                const paddle::Tensor &encoder_seq_lod,
-                const paddle::Tensor &encoder_batch_map,
-                const paddle::Tensor &decoder_batch_map,
-                const paddle::Tensor &encoder_seq_lod_cpu,
-                const paddle::Tensor &encoder_batch_map_cpu,
-                const paddle::Tensor &decoder_batch_map_cpu,
-                const paddle::Tensor &enc_batch_tensor,
-                const paddle::Tensor &dec_batch_tensor,
-                const paddle::optional<paddle::Tensor> &output_padding_offset,
-                int max_input_length) {
-    phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
-    auto dev_ctx =
-        paddle::experimental::DeviceContextPool::Instance().Get(place);
-    auto xpu_ctx = static_cast<const phi::XPUContext *>(dev_ctx);
-    using XPUType =
-        typename XPUTypeTrait<paddle::bfloat16>::Type; // only support bfloat16
-    typedef paddle::bfloat16 data_t;
-    const int dim = tmp_out.dims()[1];
-    const int bsz = cum_offsets.shape()[0];
-    int enc_batch = enc_batch_tensor.data<int32_t>()[0];
-    int dec_batch = dec_batch_tensor.data<int32_t>()[0];
-    baidu::xpu::api::VectorParam<int32_t> encoder_seqs_lods_vp{
-        const_cast<int32_t *>(encoder_seq_lod_cpu.data<int32_t>()),
-        enc_batch + 1, const_cast<int32_t *>(encoder_seq_lod.data<int32_t>())};
-    baidu::xpu::api::VectorParam<int32_t> encoder_batch_map_vp{
-        const_cast<int32_t *>(encoder_batch_map_cpu.data<int32_t>()), enc_batch,
-        const_cast<int32_t *>(encoder_batch_map.data<int32_t>())};
-    baidu::xpu::api::VectorParam<int32_t> decoder_batch_map_vp{
-        const_cast<int32_t *>(decoder_batch_map_cpu.data<int32_t>()), dec_batch,
-        const_cast<int32_t *>(decoder_batch_map.data<int32_t>())};
+std::vector<paddle::Tensor> GatherNextToken(
+    const paddle::Tensor& x,            // [token_num, dim_embed]
+    const paddle::Tensor& cum_offsets,  // [bsz, 1]
+    const paddle::Tensor& encoder_seq_lod,
+    const paddle::Tensor& encoder_batch_map,
+    const paddle::Tensor& decoder_batch_map,
+    const paddle::Tensor& encoder_seq_lod_cpu,
+    const paddle::Tensor& encoder_batch_map_cpu,
+    const paddle::Tensor& decoder_batch_map_cpu,
+    const paddle::Tensor& len_info_cpu,
+    const paddle::optional<paddle::Tensor>& output_padding_offset) {
+  phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
+  auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
+  auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
+  using XPUType =
+      typename XPUTypeTrait<paddle::bfloat16>::Type;  // only support bfloat16
+  typedef paddle::bfloat16 data_t;
+  const int dim = x.dims()[1];
+  const int token_num = x.shape()[0];
+  const int bsz = cum_offsets.shape()[0];
+  int enc_batch = len_info_cpu.data<int32_t>()[0];
+  int dec_batch = len_info_cpu.data<int32_t>()[1];
 
-    auto out = paddle::full({bsz, dim}, -2, tmp_out.type(), tmp_out.place());
+  baidu::xpu::api::VectorParam<int32_t> encoder_seqs_lods_vp{
+      const_cast<int32_t*>(encoder_seq_lod_cpu.data<int32_t>()),
+      enc_batch + 1,
+      const_cast<int32_t*>(encoder_seq_lod.data<int32_t>())};
+  baidu::xpu::api::VectorParam<int32_t> encoder_batch_map_vp{
+      const_cast<int32_t*>(encoder_batch_map_cpu.data<int32_t>()),
+      enc_batch,
+      const_cast<int32_t*>(encoder_batch_map.data<int32_t>())};
+  baidu::xpu::api::VectorParam<int32_t> decoder_batch_map_vp{
+      const_cast<int32_t*>(decoder_batch_map_cpu.data<int32_t>()),
+      dec_batch,
+      const_cast<int32_t*>(decoder_batch_map.data<int32_t>())};
+  paddle::Tensor out;
+  std::vector<int32_t> encode_iota_lod_cpu(enc_batch);
+  if (output_padding_offset) {
+    int need_delete_token_num = 0;
+    if (enc_batch > 0) {
+      need_delete_token_num =
+          encoder_seq_lod_cpu.data<int32_t>()[enc_batch] - enc_batch;
+      std::iota(encode_iota_lod_cpu.begin(), encode_iota_lod_cpu.end(), 0);
+      encoder_batch_map_vp.cpu =
+          const_cast<int32_t*>(encode_iota_lod_cpu.data());
+      encoder_batch_map_vp.len = enc_batch;
+      encoder_batch_map_vp.xpu = nullptr;
+    }
+    out = paddle::empty(
+        {token_num - need_delete_token_num, dim}, x.type(), x.place());
+  } else {
+    out = paddle::empty({bsz, dim}, x.type(), x.place());
+  }
+  if (x.shape()[0] == 0) {
+    return {out};
+  }
+
+  if (output_padding_offset && enc_batch <= 0) {
+    out = x.copy_to(x.place(), false);
+  } else {
     int r = baidu::xpu::api::plugin::eb_gather_next_token(
         xpu_ctx->x_context(),
-        reinterpret_cast<const XPUType *>(tmp_out.data<data_t>()),
-        reinterpret_cast<XPUType *>(out.data<data_t>()), encoder_seqs_lods_vp,
-        encoder_batch_map_vp, decoder_batch_map_vp, dim);
-    return {out};
+        reinterpret_cast<const XPUType*>(x.data<data_t>()),
+        reinterpret_cast<XPUType*>(out.data<data_t>()),
+        encoder_seqs_lods_vp,
+        encoder_batch_map_vp,
+        decoder_batch_map_vp,
+        dim);
+    PD_CHECK(r == 0, "xpu::plugin::gather_next_token failed.");
+  }
+  return {out};
 }
 
 std::vector<std::vector<int64_t>> GatherNextTokenInferShape(
-    const std::vector<int64_t> &tmp_out_shape,
-    const std::vector<int64_t> &cum_offsets_shape,
-    const std::vector<int64_t> &encoder_seq_lod_shape,
-    const std::vector<int64_t> &encoder_batch_map_shape,
-    const std::vector<int64_t> &decoder_batch_map_shape,
-    const std::vector<int64_t> &encoder_seq_lod_cpu_shape,
-    const std::vector<int64_t> &encoder_batch_map_cpu_shape,
-    const std::vector<int64_t> &decoder_batch_map_cpu_shape,
-    const std::vector<int64_t> &enc_batch_tensor_shape,
-    const std::vector<int64_t> &dec_batch_tensor_shape,
-    const paddle::optional<std::vector<int64_t>> &output_padding_offset_shape) {
-    if (output_padding_offset_shape) {
-        PD_THROW("speculative decoding is not supported in XPU.");
-    }
+    const std::vector<int64_t>& x_shape,
+    const std::vector<int64_t>& cum_offsets_shape,
+    const std::vector<int64_t>& encoder_seq_lod_shape,
+    const std::vector<int64_t>& encoder_batch_map_shape,
+    const std::vector<int64_t>& decoder_batch_map_shape,
+    const std::vector<int64_t>& encoder_seq_lod_cpu_shape,
+    const std::vector<int64_t>& encoder_batch_map_cpu_shape,
+    const std::vector<int64_t>& decoder_batch_map_cpu_shape,
+    const std::vector<int64_t>& len_info_cpu_shape,
+    const paddle::optional<std::vector<int64_t>>& output_padding_offset_shape) {
+  // if (output_padding_offset_shape) {
+  //   PD_THROW("speculative decoding is not supported in XPU.");
+  // }
+  int64_t bsz = cum_offsets_shape[0];
+  int64_t dim_embed = x_shape[1];
+  if (output_padding_offset_shape) {
+    return {{-1, dim_embed}};
+  } else {
     int64_t bsz = cum_offsets_shape[0];
-    int64_t dim_embed = tmp_out_shape[1];
     return {{bsz, dim_embed}};
+  }
 }
 
 std::vector<paddle::DataType> GatherNextTokenInferDtype(
-    const paddle::DataType &tmp_out_dtype,
-    const paddle::DataType &cum_offsets_dtype,
-    const paddle::DataType &encoder_seq_lod_dtype,
-    const paddle::DataType &encoder_batch_map_dtype,
-    const paddle::DataType &decoder_batch_map_dtype,
-    const paddle::DataType &encoder_seq_lod_cpu_dtype,
-    const paddle::DataType &encoder_batch_map_cpu_dtype,
-    const paddle::DataType &decoder_batch_map_cpu_dtype,
-    const paddle::DataType &enc_batch_tensor_dtype,
-    const paddle::DataType &dec_batch_tensor_dtype,
-    const paddle::optional<paddle::DataType> &output_padding_offset_dtype) {
-    return {tmp_out_dtype};
+    const paddle::DataType& x_dtype,
+    const paddle::DataType& cum_offsets_dtype,
+    const paddle::DataType& encoder_seq_lod_dtype,
+    const paddle::DataType& encoder_batch_map_dtype,
+    const paddle::DataType& decoder_batch_map_dtype,
+    const paddle::DataType& encoder_seq_lod_cpu_dtype,
+    const paddle::DataType& encoder_batch_map_cpu_dtype,
+    const paddle::DataType& decoder_batch_map_cpu_dtype,
+    const paddle::DataType& len_info_cpu_dtype,
+    const paddle::optional<paddle::DataType>& output_padding_offset_dtype) {
+  return {x_dtype};
 }
 
 PD_BUILD_OP(gather_next_token)
-    .Inputs({"tmp_out", "cum_offsets", "encoder_seq_lod", "encoder_batch_map",
-             "decoder_batch_map", "encoder_seq_lod_cpu",
-             "encoder_batch_map_cpu", "decoder_batch_map_cpu",
-             "enc_batch_tensor", "dec_batch_tensor",
+    .Inputs({"x",
+             "cum_offsets",
+             "encoder_seq_lod",
+             "encoder_batch_map",
+             "decoder_batch_map",
+             "encoder_seq_lod_cpu",
+             "encoder_batch_map_cpu",
+             "decoder_batch_map_cpu",
+             "len_info_cpu",
              paddle::Optional("output_padding_offset")})
     .Outputs({"out"})
-    .Attrs({"max_input_length: int"})
     .SetKernelFn(PD_KERNEL(GatherNextToken))
     .SetInferShapeFn(PD_INFER_SHAPE(GatherNextTokenInferShape))
-    .SetInferDtypeFn(PD_INFER_DTYPE(GatherNextTokenInferDtype));
+    .SetInferDtypeFn(PD_INFER_DTYPE(GatherNextTokenInferDtype));
\ No newline at end of file
+
+#include "paddle/extension.h"
+#include "xpu/plugin.h"
+#include <vector>
+
+std::vector<paddle::Tensor>
+GetInferParam(const paddle::Tensor &seq_lens_encoder,
+              const paddle::Tensor &seq_lens_decoder) {
+    phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
+    auto dev_ctx =
+        paddle::experimental::DeviceContextPool::Instance().Get(place);
+    auto xpu_ctx = static_cast<const phi::XPUContext *>(dev_ctx);
+    const int bsz = seq_lens_encoder.dims()[0];
+    // partition the batch into encoder (prefill) and decoder slots
+    std::vector<int32_t> seq_lens_encoder_vec(bsz, 0);  // input
+    std::vector<int32_t> seq_lens_decoder_vec(bsz, 0);  // input
+
+    std::vector<int32_t> encoder_batch_map_vec(bsz, 0);
+    std::vector<int32_t> decoder_batch_map_vec(bsz, 0);
+    std::vector<int32_t> encoder_batch_idx_vec(bsz, 0);  // batch map with empty slots squeezed out
+    std::vector<int32_t> decoder_batch_idx_vec(bsz, 0);  // batch map with empty slots squeezed out
+    std::vector<int32_t> encoder_seq_lod_vec(bsz + 1, 0);
+    std::vector<int32_t> decoder_context_len_vec(bsz, 0);
+    std::vector<int32_t> decoder_context_len_cache_vec(bsz, 0);
+    xpu_wait(xpu_ctx->x_context()->xpu_stream);  // TODO: confirm whether this wait is required
+    int r = xpu_memcpy(seq_lens_encoder_vec.data(),
+                       seq_lens_encoder.data<int32_t>(), sizeof(int32_t) * bsz,
+                       XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+    r = xpu_memcpy(seq_lens_decoder_vec.data(),
+                   seq_lens_decoder.data<int32_t>(), sizeof(int32_t) * bsz,
+                   XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+
+    int enc_batch = 0, dec_batch = 0;
+    int total_enc_len = 0;
+    int batch_offset = 0;
+    for (int i = 0; i < bsz; ++i) {
+        if (seq_lens_encoder_vec[i] > 0) {
+            enc_batch++;
+            total_enc_len += seq_lens_encoder_vec[i];
+            encoder_batch_map_vec[enc_batch - 1] = i;
+            encoder_batch_idx_vec[enc_batch - 1] = i - batch_offset;
+            encoder_seq_lod_vec[enc_batch] =
+                seq_lens_encoder_vec[i] + encoder_seq_lod_vec[enc_batch - 1];
+        } else if (seq_lens_decoder_vec[i] > 0) {
+            dec_batch++;
+            decoder_batch_map_vec[dec_batch - 1] = i;
+            decoder_batch_idx_vec[dec_batch - 1] = i - batch_offset;
+            decoder_context_len_vec[dec_batch - 1] =
+                seq_lens_decoder_vec[i] + 1;
+            decoder_context_len_cache_vec[dec_batch - 1] =
+                seq_lens_decoder_vec[i];
+        } else {
+            batch_offset++;
+        }
+    }
+
+    auto encoder_batch_map_xpu =
+        paddle::full({encoder_batch_map_vec.size()}, 0, seq_lens_encoder.type(),
+                     seq_lens_encoder.place());
+    auto decoder_batch_map_xpu =
+        paddle::full({decoder_batch_map_vec.size()}, 0, seq_lens_encoder.type(),
+                     seq_lens_encoder.place());
+    auto encoder_batch_idx_xpu =
+        paddle::full({encoder_batch_idx_vec.size()}, 0, seq_lens_encoder.type(),
+                     seq_lens_encoder.place());
+    auto decoder_batch_idx_xpu =
+        paddle::full({decoder_batch_idx_vec.size()}, 0, seq_lens_encoder.type(),
+                     seq_lens_encoder.place());
+    auto encoder_seq_lod_xpu =
+        paddle::full({encoder_seq_lod_vec.size()}, 0, seq_lens_encoder.type(),
+                     seq_lens_encoder.place());
+    auto decoder_context_len_xpu =
+        paddle::full({decoder_context_len_vec.size()}, 0,
+                     seq_lens_encoder.type(), seq_lens_encoder.place());
+    auto decoder_context_len_cache_xpu =
+        paddle::full({decoder_context_len_cache_vec.size()}, 0,
+                     seq_lens_encoder.type(), seq_lens_encoder.place());
+
+    auto encoder_batch_map_cpu =
+        paddle::full({encoder_batch_map_vec.size()}, 0, seq_lens_encoder.type(),
+                     paddle::CPUPlace());
+    auto decoder_batch_map_cpu =
+        paddle::full({decoder_batch_map_vec.size()}, 0, seq_lens_encoder.type(),
+                     paddle::CPUPlace());
+    auto encoder_batch_idx_cpu =
+        paddle::full({encoder_batch_idx_vec.size()}, 0, seq_lens_encoder.type(),
+                     paddle::CPUPlace());
+    auto decoder_batch_idx_cpu =
+        paddle::full({decoder_batch_idx_vec.size()}, 0, seq_lens_encoder.type(),
+                     paddle::CPUPlace());
+    auto encoder_seq_lod_cpu =
+        paddle::full({encoder_seq_lod_vec.size()}, 0, seq_lens_encoder.type(),
+                     paddle::CPUPlace());
+    auto decoder_context_len_cpu =
+        paddle::full({decoder_context_len_vec.size()}, 0,
+                     seq_lens_encoder.type(), paddle::CPUPlace());
+    auto decoder_context_len_cache_cpu =
+        paddle::full({decoder_context_len_cache_vec.size()}, 0,
+                     seq_lens_encoder.type(), paddle::CPUPlace());
+
+    int ret = 0;
+    ret = xpu_memcpy(reinterpret_cast<void *>(const_cast<int32_t *>(
+                         encoder_batch_map_xpu.data<int32_t>())),
+                     encoder_batch_map_vec.data(),
+                     sizeof(int32_t) * encoder_batch_map_vec.size(),
+                     XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+    ret = xpu_memcpy(reinterpret_cast<void *>(const_cast<int32_t *>(
+                         decoder_batch_map_xpu.data<int32_t>())),
+                     decoder_batch_map_vec.data(),
+                     sizeof(int32_t) * decoder_batch_map_vec.size(),
+                     XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+    ret = xpu_memcpy(reinterpret_cast<void *>(const_cast<int32_t *>(
+                         encoder_batch_idx_xpu.data<int32_t>())),
+                     encoder_batch_idx_vec.data(),
+                     sizeof(int32_t) * encoder_batch_idx_vec.size(),
+                     XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+    ret = xpu_memcpy(reinterpret_cast<void *>(const_cast<int32_t *>(
+                         decoder_batch_idx_xpu.data<int32_t>())),
+                     decoder_batch_idx_vec.data(),
+                     sizeof(int32_t) * decoder_batch_idx_vec.size(),
+                     XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+    ret = xpu_memcpy(reinterpret_cast<void *>(const_cast<int32_t *>(
+                         encoder_seq_lod_xpu.data<int32_t>())),
+                     encoder_seq_lod_vec.data(),
+                     sizeof(int32_t) * encoder_seq_lod_vec.size(),
+                     XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+    ret = xpu_memcpy(reinterpret_cast<void *>(const_cast<int32_t *>(
+                         decoder_context_len_xpu.data<int32_t>())),
+                     decoder_context_len_vec.data(),
+                     sizeof(int32_t) * decoder_context_len_vec.size(),
+                     XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+    ret = xpu_memcpy(reinterpret_cast<void *>(const_cast<int32_t *>(
+                         decoder_context_len_cache_xpu.data<int32_t>())),
+                     decoder_context_len_cache_vec.data(),
+                     sizeof(int32_t) * decoder_context_len_cache_vec.size(),
+                     XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+
+    std::memcpy(encoder_batch_map_cpu.data<int32_t>(),
+                encoder_batch_map_vec.data(),
+                sizeof(int32_t) * encoder_batch_map_vec.size());
+    std::memcpy(decoder_batch_map_cpu.data<int32_t>(),
+                decoder_batch_map_vec.data(),
+                sizeof(int32_t) * decoder_batch_map_vec.size());
+    std::memcpy(encoder_batch_idx_cpu.data<int32_t>(),
+                encoder_batch_idx_vec.data(),
+                sizeof(int32_t) * encoder_batch_idx_vec.size());
+    std::memcpy(decoder_batch_idx_cpu.data<int32_t>(),
+                decoder_batch_idx_vec.data(),
+                sizeof(int32_t) * decoder_batch_idx_vec.size());
+    std::memcpy(encoder_seq_lod_cpu.data<int32_t>(), encoder_seq_lod_vec.data(),
+                sizeof(int32_t) * encoder_seq_lod_vec.size());
+    std::memcpy(decoder_context_len_cpu.data<int32_t>(),
+                decoder_context_len_vec.data(),
+                sizeof(int32_t) * decoder_context_len_vec.size());
+    std::memcpy(decoder_context_len_cache_cpu.data<int32_t>(),
+                decoder_context_len_cache_vec.data(),
+                sizeof(int32_t) * decoder_context_len_cache_vec.size());
+
+    auto enc_batch_tensor = paddle::full(
+        {1}, enc_batch, seq_lens_encoder.type(), paddle::CPUPlace());
+    auto dec_batch_tensor = paddle::full(
+        {1}, dec_batch, seq_lens_encoder.type(), paddle::CPUPlace());
+    auto total_enc_len_tensor = paddle::full(
+        {1}, total_enc_len, seq_lens_encoder.type(), paddle::CPUPlace());
+
+    return {encoder_batch_map_xpu,
+            decoder_batch_map_xpu,
+            encoder_batch_idx_xpu,
+            decoder_batch_idx_xpu,
+            encoder_seq_lod_xpu,
+            decoder_context_len_xpu,
+            decoder_context_len_cache_xpu,
+            encoder_batch_map_cpu,
+            decoder_batch_map_cpu,
+            encoder_batch_idx_cpu,
+            decoder_batch_idx_cpu,
+            encoder_seq_lod_cpu,
+            decoder_context_len_cpu,
+            decoder_context_len_cache_cpu,
+            enc_batch_tensor,
+            dec_batch_tensor,
+            total_enc_len_tensor};
+}
+
+std::vector<std::vector<int64_t>>
+GetInferParamInferShape(const std::vector<int64_t> &seq_lens_encoder_shape,
+                        const std::vector<int64_t> &seq_lens_decoder_shape) {
+    return {seq_lens_encoder_shape,
+            seq_lens_encoder_shape,
+            seq_lens_encoder_shape,
+            seq_lens_encoder_shape,
+            {seq_lens_encoder_shape[0] + 1},
+            seq_lens_encoder_shape,
+            seq_lens_encoder_shape,
+            seq_lens_encoder_shape,
+            seq_lens_encoder_shape,
+            seq_lens_encoder_shape,
+            seq_lens_encoder_shape,
+            {seq_lens_encoder_shape[0] + 1},  // encoder_seq_lod_cpu also has bsz + 1 entries
+            seq_lens_encoder_shape,
+            seq_lens_encoder_shape,
+            {1},
+            {1},
+            {1}};
+}
+
+std::vector<paddle::DataType>
+GetInferParamInferDtype(const paddle::DataType &seq_lens_encoder_dtype,
+                        const paddle::DataType &seq_lens_decoder_dtype) {
+    return {
+        seq_lens_encoder_dtype, seq_lens_encoder_dtype, seq_lens_encoder_dtype,
+        seq_lens_encoder_dtype, seq_lens_encoder_dtype, seq_lens_encoder_dtype,
+        seq_lens_encoder_dtype, seq_lens_encoder_dtype, seq_lens_encoder_dtype,
+        seq_lens_encoder_dtype, seq_lens_encoder_dtype, seq_lens_encoder_dtype,
+        seq_lens_encoder_dtype, seq_lens_encoder_dtype, seq_lens_encoder_dtype,
+        seq_lens_encoder_dtype, seq_lens_encoder_dtype};
+}
+
+PD_BUILD_OP(get_infer_param_old)
+    .Inputs({"seq_lens_encoder", "seq_lens_decoder"})
+    .Outputs({"encoder_batch_map_xpu", "decoder_batch_map_xpu",
+              "encoder_batch_idx_xpu", "decoder_batch_idx_xpu",
+              "encoder_seq_lod_xpu", "decoder_context_len_xpu",
+              "decoder_context_len_cache_xpu", "encoder_batch_map_cpu",
+              "decoder_batch_map_cpu", "encoder_batch_idx_cpu",
+              "decoder_batch_idx_cpu", "encoder_seq_lod_cpu",
+              "decoder_context_len_cpu", "decoder_context_len_cache_cpu",
+              "enc_batch_tensor", "dec_batch_tensor", "total_enc_len_tensor"})
+    .SetKernelFn(PD_KERNEL(GetInferParam))
+    .SetInferShapeFn(PD_INFER_SHAPE(GetInferParamInferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(GetInferParamInferDtype));
\ No newline at end of file
diff --git a/custom_ops/xpu_ops/src/ops/mtp/draft_model_preprocess_v2.cc b/custom_ops/xpu_ops/src/ops/mtp/draft_model_preprocess_v2.cc
new file mode 100644
index 0000000000..d97e28f68f
--- /dev/null
+++ b/custom_ops/xpu_ops/src/ops/mtp/draft_model_preprocess_v2.cc
@@ -0,0 +1,148 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
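For orientation before the next new file: everything get_infer_param.cc above does on the host is one partitioning pass over the batch. A pure-Python sketch of that pass (illustrative only; the names mirror the C++ vectors, this is not a FastDeploy API, and the *_batch_idx gap-compacted variants are omitted for brevity):

    def get_infer_param(seq_lens_encoder, seq_lens_decoder):
        # one slot per request: encoder len > 0 -> prefill slot,
        # decoder len > 0 -> decode slot, both zero -> finished/empty slot
        enc_map, dec_map = [], []          # original slot index per sub-batch
        enc_lod = [0]                      # prefix sums of encoder lengths
        dec_ctx_len, dec_ctx_len_cache = [], []
        for i, (enc, dec) in enumerate(zip(seq_lens_encoder, seq_lens_decoder)):
            if enc > 0:
                enc_map.append(i)
                enc_lod.append(enc_lod[-1] + enc)
            elif dec > 0:
                dec_map.append(i)
                dec_ctx_len.append(dec + 1)    # context includes the new token
                dec_ctx_len_cache.append(dec)
        return enc_map, enc_lod, dec_map, dec_ctx_len, dec_ctx_len_cache

    # e.g. encoder lens [5, 0, 0, 3], decoder lens [0, 7, 0, 0]
    # -> enc_map [0, 3], enc_lod [0, 5, 8], dec_map [1], dec_ctx_len [8]

The lod arrays are plain prefix sums; the attention kernels consume them in place of per-slot lengths.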
+
+#include <paddle/phi/backends/xpu/xpu_context.h>
+#include "paddle/extension.h"
+#include "paddle/phi/core/enforce.h"
+#include "xpu/plugin.h"
+
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
+
+namespace api = baidu::xpu::api;
+void DraftModelPreprocessV2(const paddle::Tensor& draft_tokens,
+                            const paddle::Tensor& input_ids,
+                            const paddle::Tensor& stop_flags,
+                            const paddle::Tensor& seq_lens_this_time,
+                            const paddle::Tensor& seq_lens_encoder,
+                            const paddle::Tensor& seq_lens_decoder,
+                            const paddle::Tensor& step_idx,
+                            const paddle::Tensor& not_need_stop,
+                            const paddle::Tensor& is_block_step,
+                            const paddle::Tensor& batch_drop,
+                            const paddle::Tensor& pre_ids,
+                            const paddle::Tensor& accept_tokens,
+                            const paddle::Tensor& accept_num,
+                            const paddle::Tensor& base_model_seq_lens_this_time,
+                            const paddle::Tensor& base_model_seq_lens_encoder,
+                            const paddle::Tensor& base_model_seq_lens_decoder,
+                            const paddle::Tensor& base_model_step_idx,
+                            const paddle::Tensor& base_model_stop_flags,
+                            const paddle::Tensor& base_model_is_block_step,
+                            const paddle::Tensor& base_model_draft_tokens,
+                            const int num_model_step,
+                            const bool truncate_first_token,
+                            const bool splitwise_prefill,
+                            const bool kvcache_scheduler_v1) {
+
+    phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
+    auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
+    api::Context* ctx = static_cast<const phi::XPUContext*>(dev_ctx)->x_context();
+    if (draft_tokens.is_cpu()) {
+        ctx = new api::Context(api::kCPU);
+    }
+    int real_bsz = seq_lens_this_time.shape()[0];
+    int accept_tokens_len = accept_tokens.shape()[1];
+    int input_ids_len = input_ids.shape()[1];
+    int draft_tokens_len = draft_tokens.shape()[1];
+    int pre_ids_len = pre_ids.shape()[1];
+    constexpr int BlockSize = 512;
+    int base_model_draft_tokens_len = base_model_draft_tokens.shape()[1];
+    auto not_need_stop_gpu =
+        not_need_stop.copy_to(seq_lens_this_time.place(), false);
+
+    int r = baidu::xpu::api::plugin::draft_model_preprocess_v2(
+        ctx,
+        const_cast<int64_t*>(draft_tokens.data<int64_t>()),
+        const_cast<int64_t*>(input_ids.data<int64_t>()),
+        const_cast<bool*>(stop_flags.data<bool>()),
+        const_cast<int*>(seq_lens_this_time.data<int>()),
+        const_cast<int*>(seq_lens_encoder.data<int>()),
+        const_cast<int*>(seq_lens_decoder.data<int>()),
+        const_cast<int64_t*>(step_idx.data<int64_t>()),
+        const_cast<bool*>(not_need_stop_gpu.data<bool>()),
+        const_cast<bool*>(is_block_step.data<bool>()),
+        const_cast<bool*>(batch_drop.data<bool>()),
+        const_cast<int64_t*>(pre_ids.data<int64_t>()),
+        accept_tokens.data<int64_t>(),
+        accept_num.data<int>(),
+        base_model_seq_lens_this_time.data<int>(),
+        base_model_seq_lens_encoder.data<int>(),
+        base_model_seq_lens_decoder.data<int>(),
+        base_model_step_idx.data<int64_t>(),
+        base_model_stop_flags.data<bool>(),
+        base_model_is_block_step.data<bool>(),
+        const_cast<int64_t*>(base_model_draft_tokens.data<int64_t>()),
+        real_bsz,
+        num_model_step,
+        accept_tokens_len,
+        draft_tokens_len,
+        input_ids_len,
+        base_model_draft_tokens_len,
+        pre_ids_len,
+        truncate_first_token,
+        splitwise_prefill,
+        kvcache_scheduler_v1);
+
+    PD_CHECK(r == 0, "xpu::plugin::draft_model_preprocess_v2 failed.");
+    auto not_need_stop_cpu =
+        not_need_stop_gpu.copy_to(not_need_stop.place(), false);
+    bool* not_need_stop_data = const_cast<bool*>(not_need_stop.data<bool>());
+    not_need_stop_data[0] = not_need_stop_cpu.data<bool>()[0];
+}
+
+PD_BUILD_STATIC_OP(draft_model_preprocess_v2)
+    .Inputs({"draft_tokens",
+             "input_ids",
+             "stop_flags",
+             "seq_lens_this_time",
+             "seq_lens_encoder",
+             "seq_lens_decoder",
+             "step_idx",
+             "not_need_stop",
+             "is_block_step",
+             "batch_drop",
+             "pre_ids",
+             "accept_tokens",
+             "accept_num",
+             "base_model_seq_lens_this_time",
+
"base_model_seq_lens_encoder", + "base_model_seq_lens_decoder", + "base_model_step_idx", + "base_model_stop_flags", + "base_model_is_block_step", + "base_model_draft_tokens"}) + .Outputs({"draft_tokens_out", + "input_ids_out", + "stop_flags_out", + "seq_lens_this_time_out", + "seq_lens_encoder_out", + "seq_lens_decoder_out", + "step_idx_out", + "not_need_stop_out", + "batch_drop_out", + "pre_ids_out"}) + .Attrs({"num_model_step: int", "truncate_first_token: bool", "splitwise_prefill: bool", "kvcache_scheduler_v1: bool"}) + .SetInplaceMap({{"draft_tokens", "draft_tokens_out"}, + {"input_ids", "input_ids_out"}, + {"stop_flags", "stop_flags_out"}, + {"seq_lens_this_time", "seq_lens_this_time_out"}, + {"seq_lens_encoder", "seq_lens_encoder_out"}, + {"seq_lens_decoder", "seq_lens_decoder_out"}, + {"step_idx", "step_idx_out"}, + {"not_need_stop", "not_need_stop_out"}, + {"batch_drop", "batch_drop_out"}, + {"pre_ids", "pre_ids_out"}}) + .SetKernelFn(PD_KERNEL(DraftModelPreprocessV2)); diff --git a/custom_ops/xpu_ops/src/ops/mtp/speculate_get_padding_offset_v2.cc b/custom_ops/xpu_ops/src/ops/mtp/speculate_get_padding_offset_v2.cc new file mode 100644 index 0000000000..3740703b2f --- /dev/null +++ b/custom_ops/xpu_ops/src/ops/mtp/speculate_get_padding_offset_v2.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include <paddle/phi/backends/xpu/xpu_context.h>
+#include "paddle/extension.h"
+#include "xpu/plugin.h"
+
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
+
+std::vector<paddle::Tensor> SpeculateGetPaddingOffsetV2(
+    const paddle::Tensor& input_ids,
+    const paddle::Tensor& draft_tokens,
+    const paddle::Tensor& cum_offsets,
+    const paddle::Tensor& token_num,
+    const paddle::Tensor& seq_len,
+    const paddle::Tensor& seq_lens_encoder) {
+    phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
+    auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
+    auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
+
+    std::vector<int64_t> input_ids_shape = input_ids.shape();
+    const int bsz = seq_len.shape()[0];
+    const int seq_length = input_ids_shape[1];
+    const int max_draft_tokens = draft_tokens.shape()[1];
+    auto cum_offsets_out = cum_offsets.copy_to(cum_offsets.place(), false);
+    auto cpu_token_num = token_num.copy_to(paddle::CPUPlace(), false);
+
+    const int token_num_data = cpu_token_num.data<int64_t>()[0];
+    auto x_remove_padding = paddle::empty(
+        {token_num_data}, paddle::DataType::INT64, input_ids.place());
+    auto padding_offset = paddle::empty(
+        {token_num_data}, paddle::DataType::INT32, input_ids.place());
+    auto batch_id_per_token = paddle::empty(
+        {token_num_data}, paddle::DataType::INT32, input_ids.place());
+    auto cu_seqlens_q =
+        paddle::empty({bsz + 1}, paddle::DataType::INT32, input_ids.place());
+    auto cu_seqlens_k =
+        paddle::empty({bsz + 1}, paddle::DataType::INT32, input_ids.place());
+
+    PD_CHECK(input_ids.is_contiguous(), "Input ids tensor must be contiguous");
+    PD_CHECK(draft_tokens.is_contiguous(),
+             "Draft tokens tensor must be contiguous");
+    PD_CHECK(cum_offsets.is_contiguous(),
+             "Cum offsets tensor must be contiguous");
+    PD_CHECK(seq_len.is_contiguous(), "Seq lens tensor must be contiguous");
+
+    int r = baidu::xpu::api::plugin::speculate_get_padding_offset_v2(
+        xpu_ctx->x_context(),
+        batch_id_per_token.data<int>(),
+        cum_offsets_out.data<int>(),
+        cu_seqlens_q.data<int>(),
+        cu_seqlens_k.data<int>(),
+        cum_offsets.data<int>(),
+        seq_len.data<int>(),
+        seq_length,
+        bsz);
+    PD_CHECK(r == 0, "XPU speculate_get_padding_offset_v2 failed");
+
+    r = baidu::xpu::api::plugin::speculate_remove_padding(
+        xpu_ctx->x_context(),
+        x_remove_padding.data<int64_t>(),
+        input_ids.data<int64_t>(),
+        draft_tokens.data<int64_t>(),
+        seq_len.data<int>(),
+        seq_lens_encoder.data<int>(),
+        cum_offsets_out.data<int>(),
+        seq_length,
+        max_draft_tokens,
+        bsz,
+        token_num_data);
+    PD_CHECK(r == 0, "XPU speculate_remove_padding failed");
+
+    return {x_remove_padding,
+            batch_id_per_token,
+            cu_seqlens_q,
+            cu_seqlens_k};  // , enc_token_num, dec_token_num};
+}
+
+std::vector<std::vector<int64_t>> SpeculateGetPaddingOffsetV2InferShape(
+    const std::vector<int64_t>& input_ids_shape,
+    const std::vector<int64_t>& draft_tokens_shape,
+    const std::vector<int64_t>& cum_offsets_shape,
+    const std::vector<int64_t>& token_num_shape,
+    const std::vector<int64_t>& seq_len_shape,
+    const std::vector<int64_t>& seq_lens_encoder_shape) {
+    int64_t bsz = seq_len_shape[0];
+    // four outputs: x_remove_padding, batch_id_per_token,
+    // cu_seqlens_q, cu_seqlens_k
+    return {{-1}, {-1}, {bsz + 1}, {bsz + 1}};
+}
+
+std::vector<paddle::DataType> SpeculateGetPaddingOffsetV2InferDtype(
+    const paddle::DataType& input_ids_dtype,
+    const paddle::DataType& draft_tokens_dtype,
+    const paddle::DataType& cum_offsets_dtype,
+    const paddle::DataType& token_num_dtype,
+    const paddle::DataType& seq_len_dtype,
+    const paddle::DataType& seq_lens_encoder_dtype) {
+    return {input_ids_dtype,
+            seq_len_dtype,
+            seq_len_dtype,
+            seq_len_dtype};
+}
+
+PD_BUILD_STATIC_OP(speculate_get_padding_offset_v2)
+
.Inputs({"input_ids", + "draft_tokens", + "cum_offsets", + "token_num", + "seq_len", + "seq_lens_encoder"}) + .Outputs({"x_remove_padding", + "batch_id_per_token", + "cu_seqlens_q", + "cu_seqlens_k"}) + .SetKernelFn(PD_KERNEL(SpeculateGetPaddingOffsetV2)) + .SetInferShapeFn(PD_INFER_SHAPE(SpeculateGetPaddingOffsetV2InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(SpeculateGetPaddingOffsetV2InferDtype)); diff --git a/custom_ops/xpu_ops/src/ops/mtp/speculate_save_output.cc b/custom_ops/xpu_ops/src/ops/mtp/speculate_save_output.cc index 60764b26a5..5bb6f744c4 100644 --- a/custom_ops/xpu_ops/src/ops/mtp/speculate_save_output.cc +++ b/custom_ops/xpu_ops/src/ops/mtp/speculate_save_output.cc @@ -35,8 +35,7 @@ void SpeculateSaveWithOutputMsg(const paddle::Tensor& accept_tokens, const paddle::Tensor& not_need_stop, int64_t rank_id, int msg_queue_id, - int save_each_rank) { - // printf("enter save output"); + bool save_each_rank) { if (!save_each_rank && rank_id > 0) { return; } diff --git a/custom_ops/xpu_ops/src/ops/mtp/speculate_verify.cc b/custom_ops/xpu_ops/src/ops/mtp/speculate_verify.cc index 53b5b90dc3..4b9e99944f 100644 --- a/custom_ops/xpu_ops/src/ops/mtp/speculate_verify.cc +++ b/custom_ops/xpu_ops/src/ops/mtp/speculate_verify.cc @@ -45,7 +45,8 @@ void SpeculateVerify(const paddle::Tensor &accept_tokens, const paddle::Tensor &topp, int max_seq_len, int verify_window, - bool enable_topp) { + bool enable_topp, + bool benchmark_mode) { auto bsz = accept_tokens.shape()[0]; int real_bsz = seq_lens_this_time.shape()[0]; auto max_draft_tokens = draft_tokens.shape()[1]; @@ -133,7 +134,8 @@ void SpeculateVerify(const paddle::Tensor &accept_tokens, max_seq_len, max_candidate_len, verify_window, - prefill_one_step_stop); + prefill_one_step_stop, + benchmark_mode); } else { baidu::xpu::api::plugin::speculate_verify( ctx, @@ -161,7 +163,8 @@ void SpeculateVerify(const paddle::Tensor &accept_tokens, max_seq_len, max_candidate_len, verify_window, - prefill_one_step_stop); + prefill_one_step_stop, + benchmark_mode); } } else { if (enable_topp) { @@ -191,7 +194,8 @@ void SpeculateVerify(const paddle::Tensor &accept_tokens, max_seq_len, max_candidate_len, verify_window, - prefill_one_step_stop); + prefill_one_step_stop, + benchmark_mode); } else { baidu::xpu::api::plugin::speculate_verify( ctx, @@ -219,7 +223,8 @@ void SpeculateVerify(const paddle::Tensor &accept_tokens, max_seq_len, max_candidate_len, verify_window, - prefill_one_step_stop); + prefill_one_step_stop, + benchmark_mode); } } } @@ -246,7 +251,10 @@ PD_BUILD_STATIC_OP(speculate_verify) "accept_num_out", "step_idx_out", "stop_flags_out"}) - .Attrs({"max_seq_len: int", "verify_window: int", "enable_topp: bool"}) + .Attrs({"max_seq_len: int", + "verify_window: int", + "enable_topp: bool", + "benchmark_mode: bool"}) .SetInplaceMap({{"accept_tokens", "accept_tokens_out"}, {"accept_num", "accept_num_out"}, {"step_idx", "step_idx_out"}, diff --git a/custom_ops/xpu_ops/src/ops/pybind/pybind.cc b/custom_ops/xpu_ops/src/ops/pybind/pybind.cc index e8eea990b2..194e54a682 100644 --- a/custom_ops/xpu_ops/src/ops/pybind/pybind.cc +++ b/custom_ops/xpu_ops/src/ops/pybind/pybind.cc @@ -205,7 +205,8 @@ void SpeculateVerify(const paddle::Tensor& accept_tokens, const paddle::Tensor& topp, int max_seq_len, int verify_window, - bool enable_topp); + bool enable_topp, + bool benchmark_mode); void SpeculateClearAcceptNums(const paddle::Tensor& accept_num, const paddle::Tensor& seq_lens_decoder); @@ -242,6 +243,32 @@ void DraftModelPreprocess(const paddle::Tensor& 
draft_tokens,
                           const bool truncate_first_token,
                           const bool splitwise_prefill);
 
+void DraftModelPreprocessV2(const paddle::Tensor& draft_tokens,
+                            const paddle::Tensor& input_ids,
+                            const paddle::Tensor& stop_flags,
+                            const paddle::Tensor& seq_lens_this_time,
+                            const paddle::Tensor& seq_lens_encoder,
+                            const paddle::Tensor& seq_lens_decoder,
+                            const paddle::Tensor& step_idx,
+                            const paddle::Tensor& not_need_stop,
+                            const paddle::Tensor& is_block_step,
+                            const paddle::Tensor& batch_drop,
+                            const paddle::Tensor& pre_ids,
+                            const paddle::Tensor& accept_tokens,
+                            const paddle::Tensor& accept_num,
+                            const paddle::Tensor& base_model_seq_lens_this_time,
+                            const paddle::Tensor& base_model_seq_lens_encoder,
+                            const paddle::Tensor& base_model_seq_lens_decoder,
+                            const paddle::Tensor& base_model_step_idx,
+                            const paddle::Tensor& base_model_stop_flags,
+                            const paddle::Tensor& base_model_is_block_step,
+                            const paddle::Tensor& base_model_draft_tokens,
+                            const int num_model_step,
+                            const bool truncate_first_token,
+                            const bool splitwise_prefill,
+                            const bool kvcache_scheduler_v1);
+
+
 void DraftModelPostprocess(const paddle::Tensor& base_model_draft_tokens,
                            const paddle::Tensor& base_model_seq_lens_this_time,
                            const paddle::Tensor& base_model_seq_lens_encoder,
@@ -278,6 +305,14 @@ std::vector<paddle::Tensor> SpeculateGetPaddingOffset(
     const paddle::Tensor& seq_len,
     const paddle::Tensor& seq_lens_encoder);
 
+std::vector<paddle::Tensor> SpeculateGetPaddingOffsetV2(
+    const paddle::Tensor& input_ids,
+    const paddle::Tensor& draft_tokens,
+    const paddle::Tensor& cum_offsets,
+    const paddle::Tensor& token_num,
+    const paddle::Tensor& seq_len,
+    const paddle::Tensor& seq_lens_encoder);
+
 void MTPStepPaddle(
     const paddle::Tensor& base_model_stop_flags,
     const paddle::Tensor& stop_flags,
@@ -552,6 +587,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
           py::arg("max_seq_len"),
           py::arg("verify_window"),
           py::arg("enable_topp"),
+          py::arg("benchmark_mode"),
           "Perform speculative verification for decoding");
 
     m.def("speculate_clear_accept_nums",
@@ -598,6 +634,35 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
           py::arg("splitwise_prefill"),
           "Preprocess data for draft model in speculative decoding");
 
+    m.def("draft_model_preprocess_v2",
+          &DraftModelPreprocessV2,
+          py::arg("draft_tokens"),
+          py::arg("input_ids"),
+          py::arg("stop_flags"),
+          py::arg("seq_lens_this_time"),
+          py::arg("seq_lens_encoder"),
+          py::arg("seq_lens_decoder"),
+          py::arg("step_idx"),
+          py::arg("not_need_stop"),
+          py::arg("is_block_step"),
+          py::arg("batch_drop"),
+          py::arg("pre_ids"),
+          py::arg("accept_tokens"),
+          py::arg("accept_num"),
+          py::arg("base_model_seq_lens_this_time"),
+          py::arg("base_model_seq_lens_encoder"),
+          py::arg("base_model_seq_lens_decoder"),
+          py::arg("base_model_step_idx"),
+          py::arg("base_model_stop_flags"),
+          py::arg("base_model_is_block_step"),
+          py::arg("base_model_draft_tokens"),
+          py::arg("num_model_step"),
+          py::arg("truncate_first_token"),
+          py::arg("splitwise_prefill"),
+          py::arg("kvcache_scheduler_v1"),
+          "Preprocess data for draft model in speculative decoding");
+
+
     m.def("draft_model_postprocess",
           &DraftModelPostprocess,
           py::arg("base_model_draft_tokens"),
@@ -645,6 +710,16 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
           py::arg("seq_lens_encoder"),
           "Get padding offset");
 
+    m.def("speculate_get_padding_offset_v2",
+          &SpeculateGetPaddingOffsetV2,
+          py::arg("input_ids"),
+          py::arg("draft_tokens"),
+          py::arg("cum_offsets"),
+          py::arg("token_num"),
+          py::arg("seq_len"),
+          py::arg("seq_lens_encoder"),
+          "Get padding offset v2");
+
     m.def("mtp_step_paddle",
           &MTPStepPaddle,
py::arg("base_model_stop_flags"), diff --git a/custom_ops/xpu_ops/src/plugin/include/xpu/plugin.h b/custom_ops/xpu_ops/src/plugin/include/xpu/plugin.h index 5ce2559564..557401d31b 100644 --- a/custom_ops/xpu_ops/src/plugin/include/xpu/plugin.h +++ b/custom_ops/xpu_ops/src/plugin/include/xpu/plugin.h @@ -227,7 +227,8 @@ DLL_EXPORT int speculate_verify(Context* ctx, const int max_seq_len, const int max_candidate_len, const int verify_window, - const bool prefill_one_step_stop); + const bool prefill_one_step_stop, + const bool benchmark_mode); DLL_EXPORT int speculate_clear_accept_nums(Context* ctx, int* accept_num, @@ -343,6 +344,17 @@ DLL_EXPORT int speculate_get_padding_offset(Context* ctx, const int max_seq_len, int bsz); + +DLL_EXPORT int speculate_get_padding_offset_v2(Context* ctx, + int* batch_id_per_token, + int* cum_offsets_out, + int* cu_seqlens_q, + int* cu_seqlens_k, + const int* cum_offsets, + const int* seq_lens, + const int max_seq_len, + int bsz); + DLL_EXPORT int compute_self_order(api::Context* ctx, const int* last_seq_lens_this_time, const int* seq_lens_this_time, @@ -457,6 +469,38 @@ DLL_EXPORT int rebuild_self_hidden_states(api::Context* ctx, T* output, int dim_embed, int elem_cnt); +DLL_EXPORT int draft_model_preprocess_v2(api::Context* ctx, + int64_t* draft_tokens, + int64_t* input_ids, + bool* stop_flags, + int* seq_lens_this_time, + int* seq_lens_encoder, + int* seq_lens_decoder, + int64_t* step_idx, + bool* not_need_stop, + bool* is_block_step, + bool* batch_drop, + int64_t* pre_ids, + const int64_t* accept_tokens, + const int* accept_num, + const int* base_model_seq_lens_this_time, + const int* base_model_seq_lens_encoder, + const int* base_model_seq_lens_decoder, + const int64_t* base_model_step_idx, + const bool* base_model_stop_flags, + const bool* base_model_is_block_step, + int64_t* base_model_draft_tokens, + const int bsz, + const int num_model_step, + const int accept_tokens_len, + const int draft_tokens_len, + const int input_ids_len, + const int base_model_draft_tokens_len, + const int pre_ids_len, + const bool truncate_first_token, + const bool splitwise_prefill, + const bool kvcache_scheduler_v1); + /*--------------------------------------- MTP end --------------------------------------------*/ } // namespace plugin diff --git a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/compute_order.xpu b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/compute_order.xpu index 7cd399d09c..b5bfa6ba91 100644 --- a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/compute_order.xpu +++ b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/compute_order.xpu @@ -20,6 +20,7 @@ __global__ void ComputeOrderKernel(const int* seq_lens_this_time, return; } + // 256 * int char lm[6 * 1024]; int buf_size = 6 * 1024 / (6 * sizeof(int)); int* lm_base_model_seq_lens_this_time = (int*)lm; @@ -68,10 +69,7 @@ __global__ void ComputeOrderKernel(const int* seq_lens_this_time, in_offset += write_size; } mfence_lm(); - // 2. base model encoder. Base step=0 - } else if (cur_base_model_seq_lens_encoder != 0) { - // nothing happens - // 3. New end + // 2. Base model stop at last verify-step. 
} else if (cur_base_model_seq_lens_this_time != 0 && cur_seq_lens_this_time == 0) { in_offset += cur_base_model_seq_lens_this_time; @@ -80,27 +78,16 @@ __global__ void ComputeOrderKernel(const int* seq_lens_this_time, cur_seq_lens_this_time == 0) { // nothing happens } else { - if (accept_num <= actual_draft_token_num) { - int position_map_val = out_offset; - LM2GM(&position_map_val, - position_map + in_offset + accept_num - 1, - sizeof(int)); - out_offset++; - in_offset += cur_base_model_seq_lens_this_time; - } else { - int position_map_val_1 = out_offset; - LM2GM(&position_map_val_1, - position_map + in_offset + accept_num - 2, - sizeof(int)); - out_offset++; - int position_map_val_2 = out_offset; - LM2GM(&position_map_val_2, - position_map + in_offset + accept_num - 1, - sizeof(int)); - out_offset++; - in_offset += cur_base_model_seq_lens_this_time; + // accept_num << buf_size, so do not need split + for (int i = 0; i < accept_num; i++) { + lm_position_map[i] = out_offset++; } mfence_lm(); + LM2GM(lm_position_map, + position_map + in_offset, + accept_num * sizeof(int)); + in_offset += cur_base_model_seq_lens_this_time - accept_num; + mfence_lm(); } } } diff --git a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/draft_model_preprocess_v2.xpu b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/draft_model_preprocess_v2.xpu new file mode 100644 index 0000000000..052de1b7fc --- /dev/null +++ b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/draft_model_preprocess_v2.xpu @@ -0,0 +1,250 @@ +#include "xpu/kernel/cluster.h" +#include "xpu/kernel/cluster_debug.h" +#include "xpu/kernel/cluster_partition.h" +#include "xpu/kernel/cluster_primitive.h" +#include "xpu/kernel/cluster_simd.h" + +namespace xpu3 { +namespace plugin { +__global__ void draft_model_preprocess_v2( + int64_t* draft_tokens, + int64_t* input_ids, + bool* stop_flags, + int* seq_lens_this_time, + int* seq_lens_encoder, + int* seq_lens_decoder, + int64_t* step_idx, + bool* not_need_stop, + bool* is_block_step, + bool* batch_drop, + int64_t* pre_ids, + const int64_t* accept_tokens, + const int* accept_num, + const int* base_model_seq_lens_this_time, + const int* base_model_seq_lens_encoder, + const int* base_model_seq_lens_decoder, + const int64_t* base_model_step_idx, + const bool* base_model_stop_flags, + const bool* base_model_is_block_step, + int64_t* base_model_draft_tokens, + const int bsz, + const int num_model_step, + const int accept_tokens_len, + const int draft_tokens_len, + const int input_ids_len, + const int base_model_draft_tokens_len, + const int pre_ids_len, + const bool truncate_first_token, + const bool splitwise_prefill, + const bool kvcache_scheduler_v1) { + int cid = core_id(); + int ncores = core_num(); + int clusterid = cluster_id(); + int nclusters = cluster_num(); + int tid = clusterid * ncores + cid; + __shared__ int not_stop_flag_sm[64]; + not_stop_flag_sm[cid] = 0; + int64_t accept_tokens_now[128]; + + int value_zero = 0; + int64_t value_fu = -1; + + if (splitwise_prefill) { + for (; tid < bsz; tid += ncores * nclusters) { + int64_t base_model_step_idx_now = 0; + int seq_lens_encoder_now = 0; + int seq_lens_this_time_now = 0; + bool stop_flags_now = false; + int64_t base_model_first_token; + int seq_lens_encoder_record_now = 0; + int64_t input_ids_now = 0; + + GM2LM_ASYNC(base_model_step_idx + tid, + &base_model_step_idx_now, + sizeof(int64_t)); + GM2LM_ASYNC( + seq_lens_encoder + tid, &seq_lens_encoder_now, sizeof(int)); + GM2LM(accept_tokens + tid * 
accept_tokens_len,
+                  &base_model_first_token,
+                  sizeof(int64_t));
+            if (seq_lens_encoder_now > 0) {
+                not_stop_flag_sm[cid] += 1;
+                stop_flags_now = false;
+                int position = seq_lens_encoder_now;
+                if (truncate_first_token) {
+                    position = position - 1;
+                    input_ids_now = base_model_first_token;
+                    seq_lens_this_time_now = seq_lens_encoder_now;
+                } else {
+                    input_ids_now = base_model_first_token;
+                    seq_lens_this_time_now = seq_lens_encoder_now + 1;
+                }
+                LM2GM_ASYNC(&input_ids_now,
+                            input_ids + tid * input_ids_len + position,
+                            sizeof(int64_t));
+            } else {
+                stop_flags_now = true;
+                seq_lens_this_time_now = 0;
+                seq_lens_encoder_now = 0;
+                not_stop_flag_sm[cid] += 0;
+                LM2GM_ASYNC(&value_zero, seq_lens_decoder + tid, sizeof(int));
+            }
+            LM2GM_ASYNC(
+                &seq_lens_encoder_now, seq_lens_encoder + tid, sizeof(int));
+            LM2GM_ASYNC(&stop_flags_now, stop_flags + tid, sizeof(bool));
+            LM2GM(
+                &seq_lens_this_time_now, seq_lens_this_time + tid, sizeof(int));
+        }
+    } else {
+        for (; tid < bsz; tid += ncores * nclusters) {
+            bool base_model_stop_flags_now = false;
+            bool base_model_is_block_step_now = false;
+            bool batch_drop_now = false;
+            bool stop_flags_now = false;
+            bool is_block_step_now = false;
+            int seq_lens_this_time_now = 0;
+            int seq_lens_encoder_now = 0;
+            int seq_lens_decoder_new = 0;
+            int accept_num_now = 0;
+            int base_model_seq_lens_decoder_now = 0;
+            int base_model_seq_lens_this_time_now = 0;
+            int64_t step_id_now = 0;
+            int64_t base_model_step_idx_now;
+            int64_t pre_ids_now;
+            mfence();
+            GM2LM_ASYNC(is_block_step + tid, &is_block_step_now, sizeof(bool));
+            GM2LM_ASYNC(base_model_stop_flags + tid,
+                        &base_model_stop_flags_now,
+                        sizeof(bool));
+            GM2LM_ASYNC(base_model_is_block_step + tid,
+                        &base_model_is_block_step_now,
+                        sizeof(bool));
+            GM2LM_ASYNC(batch_drop + tid, &batch_drop_now, sizeof(bool));
+            GM2LM_ASYNC(stop_flags + tid, &stop_flags_now, sizeof(bool));
+            GM2LM_ASYNC(
+                seq_lens_encoder + tid, &seq_lens_encoder_now, sizeof(int));
+            GM2LM_ASYNC(
+                seq_lens_decoder + tid, &seq_lens_decoder_new, sizeof(int));
+
+            GM2LM_ASYNC(accept_tokens + tid * accept_tokens_len,
+                        accept_tokens_now,
+                        accept_tokens_len * sizeof(int64_t));
+            GM2LM_ASYNC(accept_num + tid, &accept_num_now, sizeof(int));
+
+            GM2LM_ASYNC(base_model_seq_lens_this_time + tid,
+                        &base_model_seq_lens_this_time_now,
+                        sizeof(int));
+            GM2LM_ASYNC(base_model_seq_lens_decoder + tid,
+                        &base_model_seq_lens_decoder_now,
+                        sizeof(int));
+            GM2LM_ASYNC(step_idx + tid, &step_id_now, sizeof(int64_t));
+            GM2LM(base_model_step_idx + tid,
+                  &base_model_step_idx_now,
+                  sizeof(int64_t));
+
+            for (int i = 1; i < base_model_draft_tokens_len; i++) {
+                // int64 slots: write all 8 bytes of the -1 sentinel
+                LM2GM_ASYNC(&value_fu,
+                            base_model_draft_tokens +
+                                tid * base_model_draft_tokens_len + i,
+                            sizeof(int64_t));
+            }
+            if (kvcache_scheduler_v1) {
+                if (base_model_stop_flags_now && base_model_is_block_step_now) {
+                    stop_flags_now = true;
+                    is_block_step_now = true;
+                }
+            } else {
+                if (base_model_stop_flags_now && base_model_is_block_step_now) {
+                    batch_drop_now = true;
+                    stop_flags_now = true;
+                }
+            }
+
+            if (!(base_model_stop_flags_now || batch_drop_now)) {
+                not_stop_flag_sm[cid] += 1;
+                if (seq_lens_encoder_now > 0) {
+                    int seq_len_encoder = seq_lens_encoder_now;
+                    stop_flags_now = false;
+                    int64_t base_model_first_token = accept_tokens_now[0];
+                    LM2GM(&base_model_first_token,
+                          pre_ids + tid * pre_ids_len,
+                          sizeof(int64_t));
+                    int position = seq_len_encoder;
+                    if (truncate_first_token) {
+                        LM2GM(&base_model_first_token,
+                              input_ids + tid * input_ids_len + position - 1,
+
sizeof(int64_t)); + seq_lens_this_time_now = seq_len_encoder; + } else { + LM2GM(&base_model_first_token, + input_ids + tid * input_ids_len + position, + sizeof(int64_t)); + seq_lens_this_time_now = seq_len_encoder + 1; + } + } else { + if (kvcache_scheduler_v1) { + if (!base_model_is_block_step_now && + is_block_step_now) { + is_block_step_now = false; + } + } + if (stop_flags_now) { + stop_flags_now = false; + seq_lens_decoder_new = base_model_seq_lens_decoder_now - + base_model_seq_lens_this_time_now; + step_id_now = base_model_step_idx_now - + base_model_seq_lens_this_time_now; + + } else { + seq_lens_decoder_new -= num_model_step - 1; + step_id_now -= num_model_step - 1; + } + for (int i = 0; i < accept_num_now; i++) { + const int pre_id_pos = + base_model_step_idx_now - (accept_num_now - i); + LM2GM(accept_tokens_now + i, + draft_tokens + tid * draft_tokens_len + i, + sizeof(int64_t)); + LM2GM(accept_tokens_now + i, + pre_ids + tid * pre_ids_len + pre_id_pos, + sizeof(int64_t)); + } + seq_lens_this_time_now = accept_num_now; + } + + } else { + stop_flags_now = true; + seq_lens_this_time_now = 0; + seq_lens_encoder_now = 0; + seq_lens_decoder_new = 0; + } + LM2GM_ASYNC(&stop_flags_now, stop_flags + tid, sizeof(bool)); + LM2GM_ASYNC(&batch_drop_now, batch_drop + tid, sizeof(bool)); + LM2GM_ASYNC(&is_block_step_now, is_block_step + tid, sizeof(bool)); + LM2GM_ASYNC( + &seq_lens_decoder_new, seq_lens_decoder + tid, sizeof(int)); + LM2GM_ASYNC( + &seq_lens_this_time_now, seq_lens_this_time + tid, sizeof(int)); + LM2GM_ASYNC( + &seq_lens_encoder_now, seq_lens_encoder + tid, sizeof(int)); + LM2GM_ASYNC(&step_id_now, step_idx + tid, sizeof(int64_t)); + } + } + mfence(); + sync_cluster(); + bool value_true = true; + bool value_false = false; + if (cid == 0) { + for (int i = 0; i < ncores; i++) { + not_stop_flag_sm[0] += not_stop_flag_sm[i]; + } + if (not_stop_flag_sm[0] > 0) { + LM2GM(&value_true, not_need_stop, sizeof(bool)); + } else { + LM2GM(&value_false, not_need_stop, sizeof(bool)); + } + } +} + +} // namespace plugin +} // namespace xpu3 diff --git a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/draft_model_update.xpu b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/draft_model_update.xpu index 0334995f9d..50ba31d61b 100644 --- a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/draft_model_update.xpu +++ b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/draft_model_update.xpu @@ -60,10 +60,8 @@ __global__ void draft_model_update(const int64_t* inter_next_tokens, token_this_time = next_tokens_start[seq_len_this_time - 1]; draft_token_now[0] = next_tokens_start[seq_len_this_time - 1]; base_model_draft_tokens_now[substep + 1] = token_this_time; - for (int i = 0; i < seq_len_this_time; ++i) { - pre_ids_now[step_idx[tid] + 1 + i] = next_tokens_start[i]; - } step_idx[tid] += seq_len_this_time; + pre_ids_now[step_idx[tid]] = token_this_time; } else { token_this_time = next_tokens_start[0]; seq_lens_decoder[tid] = seq_len_encoder + seq_len_decoder; diff --git a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/speculate_get_padding_offset.xpu b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/speculate_get_padding_offset.xpu index c08d756d7c..4af74b8f62 100644 --- a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/speculate_get_padding_offset.xpu +++ b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/speculate_get_padding_offset.xpu @@ -101,6 +101,43 @@ __global__ void 
speculate_get_padding_offset(int* padding_offset,
     }
 }
 
+
+__global__ void speculate_get_padding_offset_v2(int* batch_id_per_token,
+                                                int* cum_offsets_out,
+                                                int* cu_seqlens_q,
+                                                int* cu_seqlens_k,
+                                                const int* cum_offsets,
+                                                const int* seq_lens,
+                                                const int max_seq_len,
+                                                int bsz) {
+    int bid = cluster_id();
+    int tid = core_id();
+    int ncores = core_num();
+    int nclusters = cluster_num();
+    int seq_lens_now = 0;
+    int cum_offsets_now = 0;
+    int cum_offsets_now_ind = 0;
+    for (int bi = bid; bi < bsz; bi += nclusters) {
+        GM2LM(seq_lens + bi, &seq_lens_now, sizeof(int));
+        if (bi == 0) {
+            cum_offsets_now = 0;
+        } else {
+            GM2LM(cum_offsets + bi - 1, &cum_offsets_now, sizeof(int));
+        }
+        GM2LM(cum_offsets + bi, &cum_offsets_now_ind, sizeof(int));
+
+        for (int i = tid; i < seq_lens_now; i += ncores) {
+            LM2GM(&bi,
+                  batch_id_per_token + bi * max_seq_len - cum_offsets_now + i,
+                  sizeof(int));
+        }
+        LM2GM(&cum_offsets_now, cum_offsets_out + bi, sizeof(int));
+        int cum_seq_len = (bi + 1) * max_seq_len - cum_offsets_now_ind;
+        LM2GM(&cum_seq_len, cu_seqlens_q + bi + 1, sizeof(int));
+        LM2GM(&cum_seq_len, cu_seqlens_k + bi + 1, sizeof(int));
+    }
+}
+
 #define _XPU_DEF_SPECULATE_KERNELS_(T)                          \
     template __global__ void speculate_remove_padding(T*,       \
                                                       const T*, \
diff --git a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/speculate_verify.xpu b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/speculate_verify.xpu
index 68eb2bd606..4287c3e7d8 100644
--- a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/speculate_verify.xpu
+++ b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/speculate_verify.xpu
@@ -138,7 +138,8 @@ __global__ void speculate_verify(
     const int max_candidate_len,  // scalar: maximum number of candidates per
                                   // verify token (for verification/sampling)
     const int verify_window,  // scalar: top-k verify window (number of
                               // consecutive top-1 matches allowed)
-    const bool prefill_one_step_stop) {
+    const bool prefill_one_step_stop,
+    const bool benchmark_mode) {
     const int cid = core_id();
     const int64_t tid = cluster_id() * core_num() + core_id();
     const int64_t nthreads = cluster_num() * core_num();
@@ -161,6 +162,9 @@ __global__ void speculate_verify(
         // printf("seq_lens_this_time[%d]-1: %d \n",bid,
         // seq_lens_this_time[bid]-1);
         for (; i < seq_lens_this_time[bid] - 1; i++) {
+            if (benchmark_mode) {
+                break;
+            }
             if (seq_lens_encoder[bid] != 0) {
                 break;
             }
@@ -300,33 +304,34 @@ __global__ void speculate_verify(
         }
     }
 }
-#define SPECULATE_VERIFY_INSTANTIATE(ENABLE_TOPP, USE_TOPK)            \
-    template __global__ void speculate_verify<ENABLE_TOPP, USE_TOPK>( \
-        int64_t * accept_tokens,                                       \
-        int *accept_num,                                               \
-        int64_t *step_idx,                                             \
-        bool *stop_flags,                                              \
-        const int *seq_lens_encoder,                                   \
-        const int *seq_lens_decoder,                                   \
-        const int64_t *draft_tokens,                                   \
-        const int *actual_draft_token_nums,                            \
-        const float *dev_curand_states,                                \
-        const float *topp,                                             \
-        const int *seq_lens_this_time,                                 \
-        const int64_t *verify_tokens,                                  \
-        const float *verify_scores,                                    \
-        const int64_t *max_dec_len,                                    \
-        const int64_t *end_tokens,                                     \
-        const bool *is_block_step,                                     \
-        const int *output_cum_offsets,                                 \
-        const int *actual_candidate_len,                               \
-        int real_bsz,                                                  \
-        int max_draft_tokens,                                          \
-        int end_length,                                                \
-        int max_seq_len,                                               \
-        int max_candidate_len,                                         \
-        int verify_window,                                             \
-        bool prefill_one_step_stop);
+#define SPECULATE_VERIFY_INSTANTIATE(ENABLE_TOPP, USE_TOPK)            \
+    template __global__ void speculate_verify<ENABLE_TOPP, USE_TOPK>( \
+        int64_t * accept_tokens,                                       \
+        int *accept_num,                                               \
+        int64_t *step_idx,                                             \
+        bool *stop_flags,                                              \
+        const int *seq_lens_encoder,                                   \
+        const int *seq_lens_decoder,                                   \
+        const int64_t *draft_tokens,                                   \
+        const int *actual_draft_token_nums,                            \
+        const float *dev_curand_states,                                \
+        const float *topp,                                             \
+        const int *seq_lens_this_time,                                 \
+        const int64_t *verify_tokens,                                  \
+        const float *verify_scores,                                    \
+        const int64_t *max_dec_len,                                    \
+        const int64_t *end_tokens,                                     \
+        const bool *is_block_step,                                     \
+        const int *output_cum_offsets,                                 \
+        const int *actual_candidate_len,                               \
+        int real_bsz,                                                  \
+        int max_draft_tokens,                                          \
+        int end_length,                                                \
+        int max_seq_len,                                               \
+        int max_candidate_len,                                         \
+        int verify_window,                                             \
+        bool prefill_one_step_stop,                                    \
+        bool benchmark_mode);
 SPECULATE_VERIFY_INSTANTIATE(true, true)
 SPECULATE_VERIFY_INSTANTIATE(true, false)
 SPECULATE_VERIFY_INSTANTIATE(false, true)
diff --git a/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/draft_model_preprocess_v2.cpp b/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/draft_model_preprocess_v2.cpp
new file mode 100644
index 0000000000..3eedc4e67f
--- /dev/null
+++ b/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/draft_model_preprocess_v2.cpp
@@ -0,0 +1,425 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "xpu/plugin.h"
+#include "xpu/refactor/impl/launch_strategy.h"
+#include "xpu/refactor/impl_public/wrapper_check.h"
+#include "xpu/xdnn.h"
+
+namespace xpu3 {
+namespace plugin {
+__attribute__((global)) void draft_model_preprocess_v2(
+    int64_t* draft_tokens,
+    int64_t* input_ids,
+    bool* stop_flags,
+    int* seq_lens_this_time,
+    int* seq_lens_encoder,
+    int* seq_lens_decoder,
+    int64_t* step_idx,
+    bool* not_need_stop,
+    bool* is_block_step,
+    bool* batch_drop,
+    int64_t* pre_ids,
+    const int64_t* accept_tokens,
+    const int* accept_num,
+    const int* base_model_seq_lens_this_time,
+    const int* base_model_seq_lens_encoder,
+    const int* base_model_seq_lens_decoder,
+    const int64_t* base_model_step_idx,
+    const bool* base_model_stop_flags,
+    const bool* base_model_is_block_step,
+    int64_t* base_model_draft_tokens,
+    const int bsz,
+    const int num_model_step,
+    const int accept_tokens_len,
+    const int draft_tokens_len,
+    const int input_ids_len,
+    const int base_model_draft_tokens_len,
+    const int pre_ids_len,
+    const bool truncate_first_token,
+    const bool splitwise_prefill,
+    const bool kvcache_scheduler_v1);
+}  // namespace plugin
+}  // namespace xpu3
+
+namespace xpu2 {
+namespace plugin {}  // namespace plugin
+}  // namespace xpu2
+
+namespace baidu {
+namespace xpu {
+namespace api {
+namespace plugin {
+
+static int cpu_wrapper(api::Context* ctx,
+                       int64_t* draft_tokens,
+                       int64_t* input_ids,
+                       bool* stop_flags,
+                       int* seq_lens_this_time,
+                       int* seq_lens_encoder,
+                       int* seq_lens_decoder,
+                       int64_t* step_idx,
+                       bool* not_need_stop,
+                       bool* is_block_step,
+                       bool* batch_drop,
+                       int64_t* pre_ids,
+                       const int64_t* accept_tokens,
+                       const int* accept_num,
+                       const int* base_model_seq_lens_this_time,
+                       const int* base_model_seq_lens_encoder,
+                       const int* base_model_seq_lens_decoder,
+                       const int64_t* base_model_step_idx,
+                       const
bool* base_model_stop_flags, + const bool* base_model_is_block_step, + int64_t* base_model_draft_tokens, + const int bsz, + const int num_model_step, + const int accept_tokens_len, + const int draft_tokens_len, + const int input_ids_len, + const int base_model_draft_tokens_len, + const int pre_ids_len, + const bool truncate_first_token, + const bool splitwise_prefill, + const bool kvcache_scheduler_v1) { + int64_t not_stop_flag_sum = 0; + int64_t not_stop_flag = 0; + for (int tid = 0; tid < bsz; tid++) { + if (splitwise_prefill) { + auto* input_ids_now = input_ids + tid * input_ids_len; + auto* accept_tokens_now = accept_tokens + tid * accept_tokens_len; + if (seq_lens_encoder[tid] > 0) { + not_stop_flag = 1; + int seq_len_encoder = seq_lens_encoder[tid]; + stop_flags[tid] = false; + int64_t base_model_first_token = accept_tokens_now[0]; + int position = seq_len_encoder; + if (truncate_first_token) { + input_ids_now[position - 1] = base_model_first_token; + seq_lens_this_time[tid] = seq_len_encoder; + } else { + input_ids_now[position] = base_model_first_token; + seq_lens_this_time[tid] = seq_len_encoder + 1; + } + } else { + stop_flags[tid] = true; + seq_lens_this_time[tid] = 0; + seq_lens_decoder[tid] = 0; + seq_lens_encoder[tid] = 0; + not_stop_flag = 0; + } + not_stop_flag_sum += not_stop_flag; + } else { + auto* accept_tokens_now = accept_tokens + tid * accept_tokens_len; + auto* draft_tokens_now = draft_tokens + tid * draft_tokens_len; + auto accept_num_now = accept_num[tid]; + auto* input_ids_now = input_ids + tid * input_ids_len; + auto* base_model_draft_tokens_now = + base_model_draft_tokens + tid * base_model_draft_tokens_len; + auto base_model_seq_len_decoder = base_model_seq_lens_decoder[tid]; + const int32_t base_model_seq_len_this_time = + base_model_seq_lens_this_time[tid]; + auto* pre_ids_now = pre_ids + tid * pre_ids_len; + for (int i = 1; i < base_model_draft_tokens_len; i++) { + base_model_draft_tokens_now[i] = -1; + } + if(kvcache_scheduler_v1) { + if (base_model_stop_flags[tid] && + base_model_is_block_step[tid]) { + stop_flags[tid] = true; + is_block_step[tid] = true; + // Need to continue infer + } + } else { + if (base_model_stop_flags[tid] && + base_model_is_block_step[tid]) { + batch_drop[tid] = true; + stop_flags[tid] = true; + } + } + + if (!(base_model_stop_flags[tid] || batch_drop[tid])) { + not_stop_flag = 1; + // prefill generation + if (seq_lens_encoder[tid] > 0) { + // Can be extended to first few tokens + int seq_len_encoder = seq_lens_encoder[tid]; + stop_flags[tid] = false; + int64_t base_model_first_token = accept_tokens_now[0]; + pre_ids_now[0] = base_model_first_token; + int position = seq_len_encoder; + if (truncate_first_token) { + input_ids_now[position - 1] = base_model_first_token; + seq_lens_this_time[tid] = seq_len_encoder; + } else { + input_ids_now[position] = base_model_first_token; + seq_lens_this_time[tid] = seq_len_encoder + 1; + } + } else { // decode generation + if(kvcache_scheduler_v1) { + // 3. 
try to recover mtp infer in V1 mode
+                    if (!base_model_is_block_step[tid] &&
+                        is_block_step[tid]) {
+                        is_block_step[tid] = false;
+                    }
+                }
+                if (stop_flags[tid]) {
+                    stop_flags[tid] = false;
+                    // TODO: check
+                    seq_lens_decoder[tid] = base_model_seq_len_decoder -
+                                            base_model_seq_len_this_time;
+                    step_idx[tid] = base_model_step_idx[tid] -
+                                    base_model_seq_len_this_time;
+                } else {
+                    // 2: Last base model generated token and first MTP
+                    // token
+                    seq_lens_decoder[tid] -= num_model_step - 1;
+                    step_idx[tid] -= num_model_step - 1;
+                }
+                for (int i = 0; i < accept_num_now; i++) {
+                    draft_tokens_now[i] = accept_tokens_now[i];
+                    const int pre_id_pos =
+                        base_model_step_idx[tid] - (accept_num_now - i);
+                    const int64_t accept_token = accept_tokens_now[i];
+                    pre_ids_now[pre_id_pos] = accept_token;
+                }
+                seq_lens_this_time[tid] = accept_num_now;
+            }
+
+            } else {
+                stop_flags[tid] = true;
+                seq_lens_this_time[tid] = 0;
+                seq_lens_decoder[tid] = 0;
+                seq_lens_encoder[tid] = 0;
+            }
+            not_stop_flag_sum += not_stop_flag;
+        }
+    }
+    not_need_stop[0] = not_stop_flag_sum > 0;
+    return api::SUCCESS;
+}
+
+static int xpu3_wrapper(api::Context* ctx,
+                        int64_t* draft_tokens,
+                        int64_t* input_ids,
+                        bool* stop_flags,
+                        int* seq_lens_this_time,
+                        int* seq_lens_encoder,
+                        int* seq_lens_decoder,
+                        int64_t* step_idx,
+                        bool* not_need_stop,
+                        bool* is_block_step,
+                        bool* batch_drop,
+                        int64_t* pre_ids,
+                        const int64_t* accept_tokens,
+                        const int* accept_num,
+                        const int* base_model_seq_lens_this_time,
+                        const int* base_model_seq_lens_encoder,
+                        const int* base_model_seq_lens_decoder,
+                        const int64_t* base_model_step_idx,
+                        const bool* base_model_stop_flags,
+                        const bool* base_model_is_block_step,
+                        int64_t* base_model_draft_tokens,
+                        const int bsz,
+                        const int num_model_step,
+                        const int accept_tokens_len,
+                        const int draft_tokens_len,
+                        const int input_ids_len,
+                        const int base_model_draft_tokens_len,
+                        const int pre_ids_len,
+                        const bool truncate_first_token,
+                        const bool splitwise_prefill,
+                        const bool kvcache_scheduler_v1) {
+    using XPU_INT64 = typename XPUIndexType<int64_t>::type;
+
+    // NOTE: keep a single cluster here; the kernel reduces
+    // not_stop_flag_sm in cluster shared memory (gsm)
+    xpu3::plugin::draft_model_preprocess_v2<<<1, 64, ctx->xpu_stream>>>(
+        reinterpret_cast<XPU_INT64*>(draft_tokens),
+        reinterpret_cast<XPU_INT64*>(input_ids),
+        stop_flags,
+        seq_lens_this_time,
+        seq_lens_encoder,
+        seq_lens_decoder,
+        reinterpret_cast<XPU_INT64*>(step_idx),
+        not_need_stop,
+        is_block_step,
+        batch_drop,
+        reinterpret_cast<XPU_INT64*>(pre_ids),
+        reinterpret_cast<const XPU_INT64*>(accept_tokens),
+        accept_num,
+        base_model_seq_lens_this_time,
+        base_model_seq_lens_encoder,
+        base_model_seq_lens_decoder,
+        reinterpret_cast<const XPU_INT64*>(base_model_step_idx),
+        base_model_stop_flags,
+        base_model_is_block_step,
+        reinterpret_cast<XPU_INT64*>(base_model_draft_tokens),
+        bsz,
+        num_model_step,
+        accept_tokens_len,
+        draft_tokens_len,
+        input_ids_len,
+        base_model_draft_tokens_len,
+        pre_ids_len,
+        truncate_first_token,
+        splitwise_prefill,
+        kvcache_scheduler_v1);
+    return api::SUCCESS;
+}
+
+int draft_model_preprocess_v2(api::Context* ctx,
+                              int64_t* draft_tokens,
+                              int64_t* input_ids,
+                              bool* stop_flags,
+                              int* seq_lens_this_time,
+                              int* seq_lens_encoder,
+                              int* seq_lens_decoder,
+                              int64_t* step_idx,
+                              bool* not_need_stop,
+                              bool* is_block_step,
+                              bool* batch_drop,
+                              int64_t* pre_ids,
+                              const int64_t* accept_tokens,
+                              const int* accept_num,
+                              const int* base_model_seq_lens_this_time,
+                              const int* base_model_seq_lens_encoder,
+                              const int* base_model_seq_lens_decoder,
+                              const int64_t* base_model_step_idx,
+                              const bool* base_model_stop_flags,
+                              const bool*
base_model_is_block_step, + int64_t* base_model_draft_tokens, + const int bsz, + const int num_model_step, + const int accept_tokens_len, + const int draft_tokens_len, + const int input_ids_len, + const int base_model_draft_tokens_len, + const int pre_ids_len, + const bool truncate_first_token, + const bool splitwise_prefill, + const bool kvcache_scheduler_v1) { + WRAPPER_CHECK_CTX(ctx); + WRAPPER_DUMP_FUNCTION_T1(ctx, "draft_model_preprocess_v2", int64_t); + WRAPPER_DUMP_PARAM6(ctx, + draft_tokens, + input_ids, + stop_flags, + seq_lens_this_time, + seq_lens_encoder, + seq_lens_decoder); + WRAPPER_DUMP_PARAM5( + ctx, step_idx, not_need_stop, is_block_step, batch_drop, pre_ids); + WRAPPER_DUMP_PARAM3( + ctx, accept_tokens, accept_num, base_model_seq_lens_encoder); + WRAPPER_DUMP_PARAM4(ctx, + base_model_seq_lens_encoder, + base_model_seq_lens_decoder, + base_model_step_idx, + base_model_stop_flags); + WRAPPER_DUMP_PARAM3( + ctx, base_model_is_block_step, base_model_draft_tokens, bsz); + WRAPPER_DUMP_PARAM3( + ctx, num_model_step, accept_tokens_len, draft_tokens_len); + WRAPPER_DUMP_PARAM4(ctx, + input_ids_len, + base_model_draft_tokens_len, + pre_ids_len, + truncate_first_token); + WRAPPER_DUMP_PARAM2(ctx, splitwise_prefill, kvcache_scheduler_v1); + WRAPPER_DUMP(ctx); + + WRAPPER_CHECK_PTR(ctx, int, bsz, seq_lens_this_time); + WRAPPER_CHECK_PTR(ctx, int64_t, bsz * accept_tokens_len, accept_tokens); + WRAPPER_CHECK_PTR(ctx, int64_t, bsz * input_ids_len, input_ids); + WRAPPER_CHECK_PTR(ctx, int64_t, bsz * draft_tokens_len, draft_tokens); + WRAPPER_CHECK_PTR(ctx, + int64_t, + bsz * base_model_draft_tokens_len, + base_model_draft_tokens); + + WRAPPER_ASSERT_GT(ctx, bsz, 0); + WRAPPER_ASSERT_LT(ctx, accept_tokens_len, 128); + + if (ctx->dev().type() == api::kCPU) { + return cpu_wrapper(ctx, + draft_tokens, + input_ids, + stop_flags, + seq_lens_this_time, + seq_lens_encoder, + seq_lens_decoder, + step_idx, + not_need_stop, + is_block_step, + batch_drop, + pre_ids, + accept_tokens, + accept_num, + base_model_seq_lens_this_time, + base_model_seq_lens_encoder, + base_model_seq_lens_decoder, + base_model_step_idx, + base_model_stop_flags, + base_model_is_block_step, + base_model_draft_tokens, + bsz, + num_model_step, + accept_tokens_len, + draft_tokens_len, + input_ids_len, + base_model_draft_tokens_len, + pre_ids_len, + truncate_first_token, + splitwise_prefill, + kvcache_scheduler_v1); + } + if (ctx->dev().type() == api::kXPU3) { + return xpu3_wrapper(ctx, + draft_tokens, + input_ids, + stop_flags, + seq_lens_this_time, + seq_lens_encoder, + seq_lens_decoder, + step_idx, + not_need_stop, + is_block_step, + batch_drop, + pre_ids, + accept_tokens, + accept_num, + base_model_seq_lens_this_time, + base_model_seq_lens_encoder, + base_model_seq_lens_decoder, + base_model_step_idx, + base_model_stop_flags, + base_model_is_block_step, + base_model_draft_tokens, + bsz, + num_model_step, + accept_tokens_len, + draft_tokens_len, + input_ids_len, + base_model_draft_tokens_len, + pre_ids_len, + truncate_first_token, + splitwise_prefill, + kvcache_scheduler_v1); + } + WRAPPER_UNIMPLEMENTED(ctx); +} + +} // namespace plugin +} // namespace api +} // namespace xpu +} // namespace baidu diff --git a/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/speculate_get_padding_offset.cpp b/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/speculate_get_padding_offset.cpp index a0066e4557..0886a0196a 100644 --- a/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/speculate_get_padding_offset.cpp +++ 
b/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/speculate_get_padding_offset.cpp
@@ -42,6 +42,16 @@ __attribute__((global)) void speculate_get_padding_offset(
     const int max_seq_len,
     int bsz);
 
+__attribute__((global)) void speculate_get_padding_offset_v2(
+    int* batch_id_per_token,
+    int* cum_offsets_out,
+    int* cu_seqlens_q,
+    int* cu_seqlens_k,
+    const int* cum_offsets,
+    const int* seq_lens,
+    const int max_seq_len,
+    int bsz);
+
 }  // namespace plugin
 }  // namespace xpu3
 
@@ -99,6 +109,29 @@ static int cpu_wrapper_get_padding_offset(Context* ctx,
     return api::SUCCESS;
 }
 
+
+static int cpu_wrapper_get_padding_offset_v2(Context* ctx,
+                                             int* batch_id_per_token,
+                                             int* cum_offsets_out,
+                                             int* cu_seqlens_q,
+                                             int* cu_seqlens_k,
+                                             const int* cum_offsets,
+                                             const int* seq_lens,
+                                             const int max_seq_len,
+                                             int bsz) {
+    for (int bi = 0; bi < bsz; ++bi) {
+        int cum_offset = bi == 0 ? 0 : cum_offsets[bi - 1];
+        for (int i = 0; i < seq_lens[bi]; i++) {
+            batch_id_per_token[bi * max_seq_len - cum_offset + i] = bi;
+        }
+        cum_offsets_out[bi] = cum_offset;
+        int cum_seq_len = (bi + 1) * max_seq_len - cum_offsets[bi];
+        cu_seqlens_q[bi + 1] = cum_seq_len;
+        cu_seqlens_k[bi + 1] = cum_seq_len;
+    }
+    return api::SUCCESS;
+}
+
 template <typename T>
 static int xpu3_wrapper_remove_padding(Context* ctx,
                                        T* output_data,
@@ -150,6 +183,29 @@ static int xpu3_wrapper_get_padding_offset(Context* ctx,
     return api::SUCCESS;
 }
 
+static int xpu3_wrapper_get_padding_offset_v2(Context* ctx,
+                                              int* batch_id_per_token,
+                                              int* cum_offsets_out,
+                                              int* cu_seqlens_q,
+                                              int* cu_seqlens_k,
+                                              const int* cum_offsets,
+                                              const int* seq_lens,
+                                              const int max_seq_len,
+                                              int bsz) {
+    xpu3::plugin::speculate_get_padding_offset_v2<<<ctx->ncluster(),
+                                                    64,
+                                                    ctx->xpu_stream>>>(
+        batch_id_per_token,
+        cum_offsets_out,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        cum_offsets,
+        seq_lens,
+        max_seq_len,
+        bsz);
+    return api::SUCCESS;
+}
+
+
 template <typename T>
 int speculate_remove_padding(Context* ctx,
                              T* x_remove_padding,
@@ -271,6 +327,63 @@ int speculate_get_padding_offset(Context* ctx,
     WRAPPER_UNIMPLEMENTED(ctx);
 }
 
+int speculate_get_padding_offset_v2(Context* ctx,
+                                    int* batch_id_per_token,
+                                    int* cum_offsets_out,
+                                    int* cu_seqlens_q,
+                                    int* cu_seqlens_k,
+                                    const int* cum_offsets,
+                                    const int* seq_lens,
+                                    const int max_seq_len,
+                                    int bsz) {
+    WRAPPER_CHECK_CTX(ctx);
+
+    WRAPPER_DUMP_FUNCTION_T1(ctx, "speculate_get_padding_offset_v2", float);
+    WRAPPER_DUMP_PARAM6(ctx,
+                        batch_id_per_token,
+                        cum_offsets_out,
+                        cu_seqlens_q,
+                        cu_seqlens_k,
+                        cum_offsets,
+                        seq_lens);
+    WRAPPER_DUMP_PARAM2(ctx, max_seq_len, bsz);
+    WRAPPER_DUMP(ctx);
+
+    WRAPPER_CHECK_PTR(ctx, int, bsz, cum_offsets);
+    WRAPPER_CHECK_PTR(ctx, int, bsz, seq_lens);
+    WRAPPER_CHECK_PTR(ctx, int, bsz, cum_offsets_out);
+    WRAPPER_CHECK_PTR(ctx, int, bsz + 1, cu_seqlens_q);
+    WRAPPER_CHECK_PTR(ctx, int, bsz + 1, cu_seqlens_k);
+
+    WRAPPER_ASSERT_GT(ctx, bsz, 0);
+    WRAPPER_ASSERT_GT(ctx, max_seq_len, 0);
+
+    if (ctx->dev().type() == api::kCPU) {
+        return cpu_wrapper_get_padding_offset_v2(ctx,
+                                                 batch_id_per_token,
+                                                 cum_offsets_out,
+                                                 cu_seqlens_q,
+                                                 cu_seqlens_k,
+                                                 cum_offsets,
+                                                 seq_lens,
+                                                 max_seq_len,
+                                                 bsz);
+    }
+    if (ctx->dev().type() == api::kXPU3) {
+        return xpu3_wrapper_get_padding_offset_v2(ctx,
+                                                  batch_id_per_token,
+                                                  cum_offsets_out,
+                                                  cu_seqlens_q,
+                                                  cu_seqlens_k,
+                                                  cum_offsets,
+                                                  seq_lens,
+                                                  max_seq_len,
+                                                  bsz);
+    }
+
+    WRAPPER_UNIMPLEMENTED(ctx);
+}
+
 #define INSTANTIATION_SPECULATE_REMOVE_PADDING(T)               \
     template int speculate_remove_padding(Context * ctx,        \
                                           T * x_remove_padding, \
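The CPU reference path just added (cpu_wrapper_get_padding_offset_v2) pins down the index math that the XPU kernel reproduces per cluster. The same arithmetic as a self-contained Python sketch (`get_padding_offset_v2` is a hypothetical stand-in for illustration, not the shipped binding):

    def get_padding_offset_v2(seq_lens, cum_offsets, max_seq_len):
        # cum_offsets[bi] = total padding slots in requests 0..bi
        bsz = len(seq_lens)
        batch_id_per_token = [0] * sum(seq_lens)
        cum_offsets_out = [0] * bsz
        cu_seqlens = [0] * (bsz + 1)  # q and k variants are identical here
        for bi in range(bsz):
            cum_offset = 0 if bi == 0 else cum_offsets[bi - 1]
            for i in range(seq_lens[bi]):
                # flattened (padding-removed) position of token i of request bi
                batch_id_per_token[bi * max_seq_len - cum_offset + i] = bi
            cum_offsets_out[bi] = cum_offset
            cu_seqlens[bi + 1] = (bi + 1) * max_seq_len - cum_offsets[bi]
        return batch_id_per_token, cum_offsets_out, cu_seqlens

Worked example: max_seq_len = 4, seq_lens = [2, 3], cum_offsets = [2, 3] gives cu_seqlens [0, 2, 5], i.e. request 0 owns flattened tokens 0-1 and request 1 owns tokens 2-4.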
a/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/speculate_verify.cpp b/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/speculate_verify.cpp index c5e3e425b7..c9571bd513 100644 --- a/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/speculate_verify.cpp +++ b/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/speculate_verify.cpp @@ -48,7 +48,8 @@ __attribute__((global)) void speculate_verify( const int max_seq_len, const int max_candidate_len, const int verify_window, - const bool prefill_one_step_stop); + const bool prefill_one_step_stop, + const bool benchmark_mode); } // namespace plugin } // namespace xpu3 @@ -136,14 +137,15 @@ static int cpu_wrapper(Context *ctx, const int max_seq_len, const int max_candidate_len, const int verify_window, - const bool prefill_one_step_stop) { + const bool prefill_one_step_stop, + const bool benchmark_mode) { for (int bid = 0; bid < real_bsz; ++bid) { - const int start_token_id = bid * max_seq_len - output_cum_offsets[bid]; // verify and set stop flags int accept_num_now = 1; int stop_flag_now_int = 0; if (!(is_block_step[bid] || bid >= real_bsz)) { + const int start_token_id = bid * max_seq_len - output_cum_offsets[bid]; // printf("debug cpu bid:%d,start_token_id:%d\n",bid, start_token_id); // printf("bid %d\n", bid); @@ -160,6 +162,9 @@ static int cpu_wrapper(Context *ctx, // printf("seq_lens_this_time[%d]-1: %d \n",bid, // seq_lens_this_time[bid]-1); for (; i < seq_lens_this_time[bid] - 1; i++) { + if(benchmark_mode){ + break; + } if (seq_lens_encoder[bid] != 0) { break; } @@ -326,7 +331,8 @@ static int xpu3_wrapper(Context *ctx, const int max_seq_len, const int max_candidate_len, const int verify_window, - const bool prefill_one_step_stop) { + const bool prefill_one_step_stop, + const bool benchmark_mode) { using XPU_INT64 = typename XPUIndexType::type; xpu3::plugin::speculate_verify <<<1, 64, ctx->xpu_stream>>>( @@ -354,7 +360,8 @@ static int xpu3_wrapper(Context *ctx, max_seq_len, max_candidate_len, verify_window, - prefill_one_step_stop); + prefill_one_step_stop, + benchmark_mode); return api::SUCCESS; } template @@ -383,7 +390,8 @@ int speculate_verify(Context *ctx, const int max_seq_len, const int max_candidate_len, const int verify_window, - const bool prefill_one_step_stop) { + const bool prefill_one_step_stop, + const bool benchmark_mode) { WRAPPER_CHECK_CTX(ctx); WRAPPER_DUMP_FUNCTION_T1(ctx, "speculate_verify", int64_t); WRAPPER_DUMP_PARAM3(ctx, accept_tokens, accept_num, step_idx); @@ -406,12 +414,13 @@ int speculate_verify(Context *ctx, actual_candidate_len, real_bsz, max_draft_tokens); - WRAPPER_DUMP_PARAM5(ctx, + WRAPPER_DUMP_PARAM6(ctx, end_length, max_seq_len, max_candidate_len, verify_window, - prefill_one_step_stop); + prefill_one_step_stop, + benchmark_mode); WRAPPER_DUMP(ctx); WRAPPER_CHECK_PTR(ctx, int64_t, real_bsz * max_draft_tokens, accept_tokens); WRAPPER_CHECK_PTR(ctx, int, real_bsz, accept_num); @@ -469,7 +478,8 @@ int speculate_verify(Context *ctx, max_seq_len, max_candidate_len, verify_window, - prefill_one_step_stop); + prefill_one_step_stop, + benchmark_mode); } if (ctx->dev().type() == api::kXPU3) { return xpu3_wrapper(ctx, @@ -497,40 +507,42 @@ int speculate_verify(Context *ctx, max_seq_len, max_candidate_len, verify_window, - prefill_one_step_stop); + prefill_one_step_stop, + benchmark_mode); } WRAPPER_UNIMPLEMENTED(ctx); } -#define INSTANTIATE_SPECULATE_VERIFY(ENABLE_TOPP, USE_TOPK) \ - template int \ - baidu::xpu::api::plugin::speculate_verify( \ - baidu::xpu::api::Context *, /* xpu_ctx */ \ 
-      int64_t *, /* accept_tokens */ \
-      int *, /* accept_num */ \
-      int64_t *, /* step_idx */ \
-      bool *, /* stop_flags */ \
-      const int *, /* seq_lens_encoder */ \
-      const int *, /* seq_lens_decoder */ \
-      const int64_t *, /* draft_tokens */ \
-      const int *, /* actual_draft_token_nums */ \
-      const float *, /* dev_curand_states or topp */ \
-      const float *, /* topp or nullptr */ \
-      const int *, /* seq_lens_this_time */ \
-      const int64_t *, /* verify_tokens */ \
-      const float *, /* verify_scores */ \
-      const int64_t *, /* max_dec_len */ \
-      const int64_t *, /* end_tokens */ \
-      const bool *, /* is_block_step */ \
-      const int *, /* output_cum_offsets */ \
-      const int *, /* actual_candidate_len */ \
-      int, /* real_bsz */ \
-      int, /* max_draft_tokens */ \
-      int, /* end_length */ \
-      int, /* max_seq_len */ \
-      int, /* max_candidate_len */ \
-      int, /* verify_window */ \
-      bool); /* prefill_one_step_stop */
+#define INSTANTIATE_SPECULATE_VERIFY(ENABLE_TOPP, USE_TOPK) \
+  template int \
+  baidu::xpu::api::plugin::speculate_verify<ENABLE_TOPP, USE_TOPK>( \
+      baidu::xpu::api::Context *, /* xpu_ctx */ \
+      int64_t *, /* accept_tokens */ \
+      int *, /* accept_num */ \
+      int64_t *, /* step_idx */ \
+      bool *, /* stop_flags */ \
+      const int *, /* seq_lens_encoder */ \
+      const int *, /* seq_lens_decoder */ \
+      const int64_t *, /* draft_tokens */ \
+      const int *, /* actual_draft_token_nums */ \
+      const float *, /* dev_curand_states or topp */ \
+      const float *, /* topp or nullptr */ \
+      const int *, /* seq_lens_this_time */ \
+      const int64_t *, /* verify_tokens */ \
+      const float *, /* verify_scores */ \
+      const int64_t *, /* max_dec_len */ \
+      const int64_t *, /* end_tokens */ \
+      const bool *, /* is_block_step */ \
+      const int *, /* output_cum_offsets */ \
+      const int *, /* actual_candidate_len */ \
+      int, /* real_bsz */ \
+      int, /* max_draft_tokens */ \
+      int, /* end_length */ \
+      int, /* max_seq_len */ \
+      int, /* max_candidate_len */ \
+      int, /* verify_window */ \
+      bool, /* prefill_one_step_stop */ \
+      bool); /* benchmark_mode */
 INSTANTIATE_SPECULATE_VERIFY(false, false)
 INSTANTIATE_SPECULATE_VERIFY(false, true)
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index bde7d2e9a7..1fa36d875e 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -959,6 +959,7 @@ def from_cli_args(cls, args: FlexibleArgumentParser) -> "EngineArgs":
     def create_speculative_config(self) -> SpeculativeConfig:
         """ """
         speculative_args = asdict(self)
+        print("ch -- debug ", speculative_args)
         if self.speculative_config is not None:
             for k, v in self.speculative_config.items():
                 speculative_args[k] = v
diff --git a/fastdeploy/model_executor/forward_meta.py b/fastdeploy/model_executor/forward_meta.py
index 0114bb53f9..7d3d9044e3 100644
--- a/fastdeploy/model_executor/forward_meta.py
+++ b/fastdeploy/model_executor/forward_meta.py
@@ -246,7 +246,8 @@ class XPUForwardMeta(ForwardMeta):
     total_enc_len: Optional[paddle.Tensor] = None
     # position embedding type in rope, supports 'NORMAL' or 'HALF_HEAD_DIM'
     pos_emb_type: Optional[str] = "NORMAL"
-
+    # valid batch size, kept on CPU
+    valid_bs_cpu = None
 
 @dataclass
 class DCUForwardMeta(ForwardMeta):
diff --git a/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py b/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py
index 3e48ab81f5..8401fd0d62 100644
--- a/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py
@@ -72,6 +72,8 @@ def __init__(
         kv_num_heads: int,
         num_heads: int,
         head_dim: int,
+        encoder_block_shape_q: int = -1,  # not used yet
+        decoder_block_shape_q: int = -1,  # not used yet
     ):
         """
         XPUAttentionBackend __init__
@@ -175,7 +177,7 @@ def forward_mixed(
             qkv,
             forward_meta.caches[2 * layer.layer_id],
             forward_meta.caches[2 * layer.layer_id + 1],
-            forward_meta.cum_offsets,
+            forward_meta.seq_lens_this_time if forward_meta.seq_lens_this_time is not None else forward_meta.cum_offsets,
             metadata.rotary_embs,
             metadata.block_tables,
             forward_meta.prefix_block_tables,
diff --git a/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py b/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py
index 04a8ab1024..4e9a99d6ab 100644
--- a/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py
+++ b/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py
@@ -182,24 +182,27 @@ def apply_speculative_penalty_multi_scores(
         from fastdeploy.model_executor.ops.gpu import (
             speculate_get_token_penalty_multi_scores,
         )
-
-        speculate_get_token_penalty_multi_scores(
-            pre_token_ids,
-            logits,
-            repetition_penalties,
-            frequency_penalties,
-            presence_penalties,
-            temperature,
-            bad_words_token_ids,
-            step_idx,
-            min_dec_lens,
-            eos_token_ids,
-            seq_lens_this_time,
-            output_padding_offset,
-            output_cum_offsets,
-            max_len,
+    elif current_platform.is_xpu():
+        from fastdeploy.model_executor.ops.xpu import (
+            speculate_get_token_penalty_multi_scores,
         )
     else:
         raise NotImplementedError
+    speculate_get_token_penalty_multi_scores(
+        pre_token_ids,
+        logits,
+        repetition_penalties,
+        frequency_penalties,
+        presence_penalties,
+        temperature,
+        bad_words_token_ids,
+        step_idx,
+        min_dec_lens,
+        eos_token_ids,
+        seq_lens_this_time,
+        output_padding_offset,
+        output_cum_offsets,
+        max_len,
+    )
     # inplace
     return logits
diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py
index b5e22d466e..f8f7af2d8b 100644
--- a/fastdeploy/model_executor/layers/sample/sampler.py
+++ b/fastdeploy/model_executor/layers/sample/sampler.py
@@ -434,9 +434,19 @@ def __init__(self, fd_config: FDConfig):
         """ """
         super().__init__()
         if current_platform.is_cuda():
-            self.forward = self.forward_cuda
+            from fastdeploy.model_executor.ops.gpu import (
+                speculate_verify,
+                top_p_candidates,
+            )
+        elif current_platform.is_xpu():
+            from fastdeploy.model_executor.ops.xpu import (
+                speculate_verify,
+                top_p_candidates,
+            )
         else:
             raise NotImplementedError
+        self.top_p_candidates = top_p_candidates  # newly added
+        self.speculate_verify = speculate_verify  # newly added
         self.speculative_verify_window = fd_config.speculative_config.verify_window
         self.speculative_max_candidate_len = fd_config.speculative_config.max_candidate_len
         self.speculative_benchmark_mode = fd_config.speculative_config.benchmark_mode
@@ -559,8 +569,6 @@ def forward_cuda(
     ) -> paddle.Tensor:
         """ """
-        from fastdeploy.model_executor.ops.gpu import speculate_verify, top_p_candidates
-
         logits = apply_speculative_penalty_multi_scores(
             sampling_metadata.pre_token_ids,
             logits,
@@ -580,7 +588,7 @@
         probs = F.softmax(logits)
 
-        verify_scores, verify_tokens, actual_candidate_len = top_p_candidates(
+        verify_scores, verify_tokens, actual_candidate_len = self.top_p_candidates(
             probs,
             sampling_metadata.top_p,
             share_inputs["output_padding_offset"],
@@ -588,7 +596,7 @@
             max_model_len,
         )
 
-        speculate_verify(
+        self.speculate_verify(
             share_inputs["accept_tokens"],
             share_inputs["accept_num"],
             share_inputs["step_idx"],
@@ -612,7 +620,7 @@ def forward_cuda(
             self.speculative_verify_window,
             True,  # enable_topp
             self.speculative_benchmark_mode,
-            accept_all_drafts,
+            # accept_all_drafts,
         )
 
         num_logprobs = sampling_metadata.max_num_logprobs
@@ -673,8 +681,8 @@ class MTPSampler(nn.Layer):
     def __init__(self, fd_config: FDConfig):
         """ """
         super().__init__()
-        if current_platform.is_cuda():
-            self.forward = self.forward_cuda
+        if current_platform.is_cuda() or current_platform.is_xpu():
+            self.forward = self.forward_cuda
         else:
             raise NotImplementedError
diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py
index 78e2700918..8254495253 100644
--- a/fastdeploy/model_executor/pre_and_post_process.py
+++ b/fastdeploy/model_executor/pre_and_post_process.py
@@ -23,6 +23,7 @@
 from fastdeploy import envs
 from fastdeploy.config import SpeculativeConfig
 from fastdeploy.platforms import current_platform
+from fastdeploy.model_executor.forward_meta import ForwardMeta, XPUForwardMeta
 
 if current_platform.is_iluvatar():
     from fastdeploy.model_executor.ops.iluvatar import (
@@ -56,6 +57,25 @@
         update_inputs,
         update_inputs_v1,
     )
+elif current_platform.is_xpu():
+    from fastdeploy.model_executor.ops.xpu import (
+        get_padding_offset,
+        save_output,
+        adjust_batch,
+        get_infer_param,
+        set_stop_value_multi_ends,
+        speculate_get_output_padding_offset,
+        speculate_get_padding_offset,
+        speculate_get_padding_offset_v2,
+        speculate_clear_accept_nums,
+        speculate_get_seq_lens_output,
+        speculate_save_output,
+        speculate_set_value_by_flags_and_idx,
+        step_paddle,
+        update_inputs,
+        update_inputs_v1,
+        speculate_update_v3,
+    )
 elif current_platform.is_intel_hpu():
     pass
 else:
@@ -88,6 +108,7 @@
 
 DISABLE_RECOVER = envs.FD_DISABLED_RECOVER == "1"
 
+
 def pre_process(
     input_ids: paddle.Tensor,
     seq_lens_this_time: int,
@@ -123,7 +144,7 @@ def pre_process(
             batch_id_per_token,
             cu_seqlens_q,
             cu_seqlens_k,
-        ) = speculate_get_padding_offset(
+        ) = speculate_get_padding_offset_v2(
             input_ids,
             draft_tokens,
             cum_offsets_now,
@@ -163,7 +184,378 @@
     )
 
-def _build_stream_transfer_data(output_tokens: np.ndarray):
+def xpu_pre_process(
+    input_ids: paddle.Tensor,
+    seq_lens_this_time: int,
+    share_inputs: Dict,
+    use_speculate_method: bool,
+    block_size: int,
+    draft_tokens: Optional[paddle.Tensor] = None,
+    seq_lens_encoder: Optional[paddle.Tensor] = None,
+    seq_lens_decoder: Optional[paddle.Tensor] = None,
+    forward_meta=None,
+) -> XPUForwardMeta:
+    """ """
+    max_len = input_ids.shape[1]
+    cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time, dtype="int32")
+    token_num = paddle.sum(seq_lens_this_time)
+
+    if use_speculate_method:
+        (
+            ids_remove_padding,
+            batch_id_per_token,
+            cu_seqlens_q,
+            cu_seqlens_k,
+        ) = speculate_get_padding_offset_v2(
+            input_ids,
+            draft_tokens,
+            cum_offsets_now,
+            token_num,
+            seq_lens_this_time,
+            seq_lens_encoder,
+        )
+        seq_lens_output = speculate_get_seq_lens_output(
+            seq_lens_this_time,
+            seq_lens_encoder,
+            seq_lens_decoder,
+        )
+        if isinstance(seq_lens_output, list):
+            seq_lens_output = seq_lens_output[0]
+        output_token_num = paddle.sum(seq_lens_output)
+        output_cum_offsets_tmp = paddle.cumsum(max_len - seq_lens_output, dtype="int32")
+        output_padding_offset, output_cum_offsets = speculate_get_output_padding_offset(
+            output_cum_offsets_tmp,
+            output_token_num,
+            seq_lens_output,
+            max_len,
+        )
+        share_inputs["output_cum_offsets"].copy_(output_cum_offsets, False)
+        share_inputs["output_padding_offset"].copy_(output_padding_offset, False)
+    else:
+        (
ids_remove_padding, + cum_offsets, + batch_id_per_token, + cu_seqlens_q, + cu_seqlens_k, + ) = get_padding_offset(input_ids, cum_offsets_now, token_num, seq_lens_this_time) + share_inputs["cum_offsets"] = cum_offsets + + + share_inputs["ids_remove_padding"] = None # set this after adjust batch + share_inputs["batch_id_per_token"] = batch_id_per_token + share_inputs["cu_seqlens_q"].copy_(cu_seqlens_q, False) + share_inputs["cu_seqlens_k"].copy_(cu_seqlens_k, False) + + xpu_forward_meta = None + if not use_speculate_method: + xpu_forward_meta = XPUForwardMeta( + input_ids=share_inputs["input_ids"], + ids_remove_padding=share_inputs["ids_remove_padding"], + rotary_embs=share_inputs["rope_emb"], + attn_backend=None, + seq_lens_encoder=share_inputs["seq_lens_encoder"], + seq_lens_decoder=share_inputs["seq_lens_decoder"], + seq_lens_this_time=share_inputs["seq_lens_this_time"], + cum_offsets=None if use_speculate_method else share_inputs["cum_offsets"], + batch_id_per_token=share_inputs["batch_id_per_token"], + cu_seqlens_q=share_inputs["cu_seqlens_q"], + cu_seqlens_k=share_inputs["cu_seqlens_k"], + block_tables=share_inputs["block_tables"], + caches=share_inputs["caches"], + ) + else: + xpu_forward_meta = ForwardMeta( + input_ids=share_inputs["input_ids"], + ids_remove_padding=share_inputs["ids_remove_padding"], + rotary_embs=share_inputs["rope_emb"], + attn_backend=None, + decoder_batch_ids=share_inputs["decoder_batch_ids"], + decoder_tile_ids_per_batch=share_inputs["decoder_tile_ids_per_batch"], + decoder_num_blocks_cpu=share_inputs["decoder_num_blocks_cpu"], + decoder_num_blocks_device=share_inputs["decoder_num_blocks_device"], + decoder_chunk_size_device=share_inputs["decoder_chunk_size_device"], + max_len_tensor_cpu=share_inputs["max_len_tensor_cpu"], + seq_lens_encoder=share_inputs["seq_lens_encoder"], + seq_lens_decoder=share_inputs["seq_lens_decoder"], + seq_lens_this_time=share_inputs["seq_lens_this_time"], + batch_id_per_token=share_inputs["batch_id_per_token"], + cu_seqlens_q=share_inputs["cu_seqlens_q"], + cu_seqlens_k=share_inputs["cu_seqlens_k"], + block_tables=share_inputs["block_tables"], + caches=share_inputs["caches"], + encoder_batch_ids=share_inputs["encoder_batch_ids"], + encoder_tile_ids_per_batch=share_inputs["encoder_tile_ids_per_batch"], + encoder_num_blocks_x_cpu=share_inputs["encoder_num_blocks_x_cpu"], + kv_batch_ids=share_inputs["kv_batch_ids"], + kv_tile_ids_per_batch=share_inputs["kv_tile_ids_per_batch"], + kv_num_blocks_x_cpu=share_inputs["kv_num_blocks_x_cpu"], + max_len_kv_cpu=share_inputs["max_len_kv_cpu"], + ) + + ( + xpu_forward_meta.encoder_batch_map, + xpu_forward_meta.decoder_batch_map, + xpu_forward_meta.encoder_batch_idx, + xpu_forward_meta.decoder_batch_idx, + xpu_forward_meta.encoder_seq_lod, + xpu_forward_meta.decoder_seq_lod, + xpu_forward_meta.encoder_kv_lod, + xpu_forward_meta.prefix_len, + xpu_forward_meta.decoder_context_len, + xpu_forward_meta.decoder_context_len_cache, + xpu_forward_meta.prefix_block_tables, + xpu_forward_meta.encoder_batch_map_cpu, + xpu_forward_meta.decoder_batch_map_cpu, + xpu_forward_meta.encoder_batch_idx_cpu, + xpu_forward_meta.decoder_batch_idx_cpu, + xpu_forward_meta.encoder_seq_lod_cpu, + xpu_forward_meta.decoder_seq_lod_cpu, + xpu_forward_meta.encoder_kv_lod_cpu, + xpu_forward_meta.prefix_len_cpu, + xpu_forward_meta.decoder_context_len_cpu, + xpu_forward_meta.decoder_context_len_cache_cpu, + xpu_forward_meta.len_info_cpu, + ) = get_infer_param( + seq_lens_encoder, seq_lens_decoder, seq_lens_this_time, 
xpu_forward_meta.block_tables, block_size + ) + xpu_forward_meta.enc_batch = xpu_forward_meta.len_info_cpu[0] + xpu_forward_meta.dec_batch = xpu_forward_meta.len_info_cpu[1] + xpu_forward_meta.total_enc_len = xpu_forward_meta.len_info_cpu[2] + + # TODO(chenhuan09):support MTP + if not use_speculate_method: + adjusted_input = adjust_batch( + ids_remove_padding.reshape([-1, 1]), + cum_offsets if not use_speculate_method else seq_lens_this_time, + xpu_forward_meta.encoder_seq_lod, + xpu_forward_meta.encoder_batch_idx, + xpu_forward_meta.decoder_batch_idx, + xpu_forward_meta.encoder_seq_lod_cpu, + xpu_forward_meta.encoder_batch_idx_cpu, + xpu_forward_meta.decoder_batch_idx_cpu, + xpu_forward_meta.enc_batch, + xpu_forward_meta.dec_batch, + None, # output_padding_offset + -1, # max_input_length + ) + adjusted_input = adjusted_input.squeeze(1) + + share_inputs["ids_remove_padding"] = adjusted_input + xpu_forward_meta.ids_remove_padding = adjusted_input + + else: + share_inputs["ids_remove_padding"] = ids_remove_padding + xpu_forward_meta.ids_remove_padding = ids_remove_padding + return xpu_forward_meta + +def xpu_process_output( + forward_output, + cum_offsets: paddle.Tensor, + xpu_forward_meta: XPUForwardMeta, + share_inputs, +) -> paddle.Tensor: + """ """ + from fastdeploy.model_executor.ops.xpu import gather_next_token + output_padding_offset = share_inputs.get("output_padding_offset", None) + hiddden_states = gather_next_token( + forward_output, + cum_offsets if cum_offsets is not None else xpu_forward_meta.seq_lens_this_time, + xpu_forward_meta.encoder_seq_lod, + xpu_forward_meta.encoder_batch_map, + xpu_forward_meta.decoder_batch_map, + xpu_forward_meta.encoder_seq_lod_cpu, + xpu_forward_meta.encoder_batch_map_cpu, + xpu_forward_meta.decoder_batch_map_cpu, + xpu_forward_meta.len_info_cpu, + output_padding_offset, # output_padding_offset + ) + return hiddden_states + + +def xpu_post_process_normal( + sampled_token_ids: paddle.Tensor, + model_output: ModelOutputData, + share_inputs: Dict[str, paddle.Tensor], + save_each_rank: bool = False, + block_size: int = 64, + skip_save_output: bool = False, +) -> None: + """ """ + from fastdeploy.model_executor.ops.xpu import ( + save_output, + set_stop_value_multi_ends, + update_inputs, + ) + + # handle vl: + if model_output.enable_thinking: + exists_think_end = sampled_token_ids == model_output.think_end_id + paddle.assign( + paddle.where( + exists_think_end, + model_output.need_think_end - 1, + model_output.need_think_end, + ), + model_output.need_think_end, + ) + + paddle.assign( + paddle.where( + model_output.need_think_end.cast("bool"), + model_output.reasoning_index - 1, + model_output.reasoning_index, + ), + model_output.reasoning_index, + ) + + stop_wo_think = ( + (sampled_token_ids == model_output.eos_token_id.T).any(axis=1, keepdim=True) + | (model_output.reasoning_index == 0) + ) & (model_output.need_think_end > 0) + sampled_token_ids = paddle.where( + stop_wo_think, + model_output.think_end_id, + sampled_token_ids, + ) + paddle.assign( + paddle.where( + stop_wo_think, + model_output.need_think_end - 1, + model_output.need_think_end, + ), + model_output.need_think_end, + ) + + # 1. 
Set stop value
+    paddle.assign(
+        paddle.where(
+            model_output.stop_flags,
+            model_output.step_idx,
+            model_output.step_idx + 1,
+        ),
+        model_output.step_idx,
+    )
+    length_cond = paddle.greater_equal(model_output.step_idx, model_output.max_dec_len)
+    paddle.assign(
+        paddle.logical_or(model_output.stop_flags, length_cond),
+        model_output.stop_flags,
+    )
+    set_stop_value_multi_ends(
+        sampled_token_ids,
+        model_output.stop_flags,
+        model_output.seq_lens_this_time,
+        model_output.eos_token_id,
+        model_output.next_tokens,
+        False,
+    )  # multi ends
+
+    # 2. Update the input buffer of the model
+    with paddle.framework._no_check_dy2st_diff():
+        # paddle.device.xpu.set_debug_level(0xa1)
+        if envs.ENABLE_V1_KVCACHE_SCHEDULER and not skip_save_output:
+            update_inputs_v1(
+                model_output.stop_flags,
+                model_output.not_need_stop,
+                model_output.seq_lens_this_time,
+                model_output.seq_lens_encoder,
+                model_output.seq_lens_decoder,
+                share_inputs["step_seq_lens_decoder"],
+                share_inputs["prompt_lens"],
+                sampled_token_ids,
+                model_output.input_ids,
+                share_inputs["block_tables"],
+                model_output.stop_nums,
+                model_output.next_tokens,
+                model_output.is_block_step,
+                block_size,
+            )
+        else:
+            update_inputs(
+                model_output.stop_flags,
+                model_output.not_need_stop,
+                model_output.seq_lens_this_time,
+                model_output.seq_lens_encoder,
+                model_output.seq_lens_decoder,
+                model_output.input_ids,
+                model_output.stop_nums,
+                sampled_token_ids,
+                model_output.is_block_step,
+            )
+        # paddle.device.xpu.set_debug_level(0)
+
+    # 3. Transmit the model's output and stop generation signal via message queue.
+    # In the future, we will abandon this approach.
+    if not skip_save_output:
+        save_output(
+            sampled_token_ids,
+            model_output.not_need_stop,
+            model_output.mp_rank,
+            False,  # use_ep
+        )
+
+
+def xpu_post_process_speculate(
+    model_output: ModelOutputData, save_each_rank: bool = False, skip_save_output: bool = False
+):
+    """ """
+    speculate_update_v3(
+        model_output.seq_lens_encoder,
+        model_output.seq_lens_decoder,
+        model_output.not_need_stop,
+        model_output.draft_tokens,
+        model_output.actual_draft_token_num,
+        model_output.accept_tokens,
+        model_output.accept_num,
+        model_output.stop_flags,
+        model_output.seq_lens_this_time,
+        model_output.is_block_step,
+        model_output.stop_nums,
+    )
+    if not skip_save_output:
+        speculate_save_output(
+            model_output.accept_tokens,
+            model_output.accept_num,
+            model_output.not_need_stop,
+            model_output.mp_rank,
+            save_each_rank,  # False
+        )
+
+    speculate_clear_accept_nums(model_output.accept_num, model_output.seq_lens_decoder)
+
+    # Update pre_ids through accept tokens
+    speculate_set_value_by_flags_and_idx(
+        model_output.pre_ids,
+        model_output.accept_tokens,
+        model_output.accept_num,
+        model_output.stop_flags,
+        model_output.seq_lens_this_time,
+        model_output.seq_lens_encoder,
+        model_output.seq_lens_decoder,
+        model_output.step_idx,
+    )
+
+
+def xpu_post_process(
+    sampler_output: SamplerOutput,
+    model_output: ModelOutputData,
+    share_inputs: Dict[str, paddle.Tensor],
+    block_size: int = 64,
+    speculative_decoding: bool = False,
+    skip_save_output: bool = False,
+) -> None:
+    """Post-processing steps after completing a single token generation."""
+    # save_each_rank is not supported on XPU, so default it to False
+    if speculative_decoding:
+        xpu_post_process_speculate(model_output, False, skip_save_output)
+    else:
+        xpu_post_process_normal(
+            sampler_output.sampled_token_ids, model_output, share_inputs, False, block_size, skip_save_output
+        )
+
+
+def 
_build_stream_transfer_data(zmq_client: ZmqIpcClient, output_tokens: np.ndarray, save_each_rank: bool, mp_rank: int): """Split output_tokens and output""" output_tokens = output_tokens.reshape([-1]).numpy() output_tokens_lists = np.split(output_tokens, output_tokens.shape[0]) diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py index 7d87033e88..33f552d759 100644 --- a/fastdeploy/output/token_processor.py +++ b/fastdeploy/output/token_processor.py @@ -299,7 +299,11 @@ def process_sampling_results(self): """ if current_platform.is_xpu(): - from fastdeploy.model_executor.ops.xpu import get_output, get_output_ep + from fastdeploy.model_executor.ops.xpu import ( + get_output, + get_output_ep, + speculate_get_output, + ) elif current_platform.is_iluvatar(): from fastdeploy.model_executor.ops.iluvatar import get_output elif current_platform.is_gcu(): diff --git a/fastdeploy/spec_decode/__init__.py b/fastdeploy/spec_decode/__init__.py index 824d5da56a..86b0456182 100644 --- a/fastdeploy/spec_decode/__init__.py +++ b/fastdeploy/spec_decode/__init__.py @@ -17,6 +17,6 @@ from .base import Proposer from .mtp import MTPProposer -from .ngram import NgramProposer +# from .ngram import NgramProposer __all__ = ["Proposer", "MTPProposer", "NgramProposer"] diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py index ae496d7101..3e7d108e6d 100644 --- a/fastdeploy/spec_decode/mtp.py +++ b/fastdeploy/spec_decode/mtp.py @@ -24,7 +24,7 @@ from fastdeploy import envs from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request, RequestType -from fastdeploy.model_executor.forward_meta import ForwardMeta +from fastdeploy.model_executor.forward_meta import ForwardMeta, XPUForwardMeta from fastdeploy.model_executor.layers.attention import get_attention_backend from fastdeploy.model_executor.layers.attention.base_attention_backend import ( AttentionBackend, @@ -34,20 +34,35 @@ from fastdeploy.model_executor.layers.sample.sampler import MTPSampler from fastdeploy.model_executor.model_loader import get_model_loader from fastdeploy.model_executor.models import ModelForCasualLM -from fastdeploy.model_executor.ops.gpu import ( - draft_model_postprocess, - draft_model_preprocess, - draft_model_update, - eagle_get_hidden_states, - eagle_get_self_hidden_states, - hybrid_mtp_ngram, - mtp_save_first_token, - mtp_step_paddle, - share_external_data, - speculate_get_logits, - speculate_save_output_topk, -) +if paddle.is_compiled_with_xpu(): + from fastdeploy.model_executor.ops.xpu import ( + draft_model_postprocess, + draft_model_preprocess_v2, + draft_model_update, + eagle_get_hidden_states, + eagle_get_self_hidden_states, + # hybrid_mtp_ngram, + mtp_save_first_token, + mtp_step_paddle, + share_external_data, + ) +else: + from fastdeploy.model_executor.ops.gpu import ( + draft_model_postprocess, + draft_model_preprocess, + draft_model_update, + eagle_get_hidden_states, + eagle_get_self_hidden_states, + hybrid_mtp_ngram, + mtp_save_first_token, + mtp_step_paddle, + share_external_data, + speculate_get_logits, + speculate_save_output_topk, + ) + from fastdeploy.model_executor.pre_and_post_process import pre_process, rebuild_padding +from fastdeploy.model_executor.pre_and_post_process import xpu_pre_process, xpu_process_output from .base import Proposer @@ -90,7 +105,7 @@ def __init__( self._initialize_attn_backend() # Forward meta store the global meta information of the forward - self.forward_meta: ForwardMeta = None + self.forward_meta = None def 
_update_mtp_config(self, main_model): """ @@ -162,13 +177,10 @@ def initialize_kv_cache(self, main_model_num_blocks, profile: bool = False): and hasattr(self.quant_config, "kv_cache_quant_type") and self.quant_config.kv_cache_quant_type is not None ): - cache_type = "uint8" - kv_cache_quant_type = self.quant_config.kv_cache_quant_type + cache_type = "int8" # Get kv cache shape - kv_cache_shape = self.attn_backends[0].get_kv_cache_shape( - max_num_blocks=self.num_gpu_blocks, kv_cache_quant_type=kv_cache_quant_type - ) + kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(max_num_blocks=self.num_gpu_blocks) if not profile and ( self.cache_config.enable_prefix_caching or self.scheduler_config.splitwise_role != "mixed" ): @@ -202,7 +214,7 @@ def initialize_kv_cache(self, main_model_num_blocks, profile: bool = False): self.model_inputs["caches"] = list(self.cache_kvs.values()) for value in self.cache_kvs.values(): del value - paddle.device.cuda.empty_cache() + paddle.device.xpu.empty_cache() def _initialize_attn_backend( self, @@ -229,7 +241,8 @@ def _initialize_attn_backend( ) self.model_inputs["decoder_num_blocks_cpu"] = paddle.zeros_like( self.target_model_inputs["decoder_num_blocks_cpu"] - ).pin_memory() + ).cpu() + # ).pin_memory() self.model_inputs["decoder_num_blocks_device"] = paddle.zeros_like( self.target_model_inputs["decoder_num_blocks_device"] ) @@ -612,6 +625,51 @@ def _initialize_forward_meta(self, step_use_cudagraph: bool = False): self.forward_meta.step_use_cudagraph = step_use_cudagraph + def _initialize_forward_meta_xpu(self): + + # self.forward_meta = XPUForwardMeta( + # input_ids=self.model_inputs["input_ids"], + # ids_remove_padding=self.model_inputs["ids_remove_padding"], + # rotary_embs=self.model_inputs["rope_emb"], + # attn_backend=self.attn_backends[0], + # decoder_batch_ids=self.model_inputs["decoder_batch_ids"], + # decoder_tile_ids_per_batch=self.model_inputs["decoder_tile_ids_per_batch"], + # decoder_num_blocks_cpu=self.model_inputs["decoder_num_blocks_cpu"], + # decoder_num_blocks_device=self.model_inputs["decoder_num_blocks_device"], + # decoder_chunk_size_device=self.model_inputs["decoder_chunk_size_device"], + # max_len_tensor_cpu=self.model_inputs["max_len_tensor_cpu"], + # seq_lens_encoder=self.model_inputs["seq_lens_encoder"], + # seq_lens_decoder=self.model_inputs["seq_lens_decoder"], + # seq_lens_this_time=self.model_inputs["seq_lens_this_time"], + # batch_id_per_token=self.model_inputs["batch_id_per_token"], + # cu_seqlens_q=self.model_inputs["cu_seqlens_q"], + # cu_seqlens_k=self.model_inputs["cu_seqlens_k"], + # block_tables=self.model_inputs["block_tables"], + # caches=self.model_inputs["caches"], + # encoder_batch_ids=self.model_inputs["encoder_batch_ids"], + # encoder_tile_ids_per_batch=self.model_inputs["encoder_tile_ids_per_batch"], + # encoder_num_blocks_x_cpu=self.model_inputs["encoder_num_blocks_x_cpu"], + # kv_batch_ids=self.model_inputs["kv_batch_ids"], + # kv_tile_ids_per_batch=self.model_inputs["kv_tile_ids_per_batch"], + # kv_num_blocks_x_cpu=self.model_inputs["kv_num_blocks_x_cpu"], + # max_len_kv_cpu=self.model_inputs["max_len_kv_cpu"], + # ) + self.forward_meta.attn_backend = self.attn_backends[0] + + # Initialzie attention meta data + for attn_backend in self.attn_backends: + attn_backend.init_attention_metadata(self.forward_meta) + + # Mix ep in single node + if self.fd_config.parallel_config.use_ep and self.fd_config.scheduler_config.splitwise_role == "mixed": + only_decode_batch_list = [] + prefill_exists = 
self.exist_prefill() + paddle.distributed.all_gather_object(only_decode_batch_list, not prefill_exists) + only_decode_batch = all(only_decode_batch_list) + self.fd_config.model_config.moe_phase.phase = "decode" if only_decode_batch else "prefill" + + # TODO: support cudagraph + def exist_prefill(self): """ check whether prefill stage exist @@ -625,8 +683,9 @@ def _prepare_inputs(self, full_hidden_states): """ Prepare MTP inputs """ - use_v1_cache_scheduler = envs.ENABLE_V1_KVCACHE_SCHEDULER - draft_model_preprocess( + use_v1_cache_scheduler = bool(envs.ENABLE_V1_KVCACHE_SCHEDULER) + # draft_model_preprocess( + draft_model_preprocess_v2( self.model_inputs["draft_tokens"], self.model_inputs["input_ids"], self.model_inputs["stop_flags"], @@ -688,6 +747,7 @@ def _post_process(self, sampled_token_ids): self.max_model_len, self.model_inputs["substep"], ) + if self.role == "prefill": mtp_save_first_token( self.model_inputs["base_model_draft_tokens"], @@ -740,6 +800,17 @@ def _propose(self, step_use_cudagraph: bool = False): # Padding inputs for cuda graph self.padding_cudagraph_inputs() + self.forward_meta = xpu_pre_process( + self.model_inputs["input_ids"], + self.model_inputs["seq_lens_this_time"], + self.model_inputs, + True, + self.cache_config.block_size, + self.model_inputs["draft_tokens"], + self.model_inputs["seq_lens_encoder"], + self.model_inputs["seq_lens_decoder"], + ) + self._initialize_forward_meta_xpu() # Get sampling metadata self.sampling_metadata = SamplingMetadata( temperature=self.model_inputs["temperature"], @@ -767,38 +838,10 @@ def _propose(self, step_use_cudagraph: bool = False): previous_hidden_states=self.model_inputs["target_hidden_states"], forward_meta=self.forward_meta, ) - if self.forward_meta.step_use_cudagraph: - model_output = model_output[: self.real_token_num] - hidden_states = rebuild_padding( - model_output, - self.model_inputs["cu_seqlens_q"], - self.model_inputs["seq_lens_this_time"], - self.model_inputs["seq_lens_decoder"], - self.model_inputs["seq_lens_encoder"], - self.model_inputs["output_padding_offset"], - self.model_config.max_model_len, - self.model_inputs["first_token_hidden_states"], - self.enable_logprob if substep == 0 else False, - ) - + hidden_states = xpu_process_output(model_output, None, self.forward_meta, self.model_inputs) # 4. 
Compute logits, Sample logits = self.model.compute_logits(hidden_states) - if self.enable_logprob and substep == 0: - first_token_logits = self.model.compute_logits(self.model_inputs["first_token_hidden_states"]) - - speculate_get_logits( - self.model_inputs["draft_logits"], - self.model_inputs["next_token_num"], - self.model_inputs["batch_token_num"], - self.model_inputs["cu_next_token_offset"], - self.model_inputs["cu_batch_token_offset"], - logits, - first_token_logits, - self.model_inputs["seq_lens_this_time"], - self.model_inputs["seq_lens_encoder"], - ) - - sampled_token_ids, sampler_output = self.sampler( + sampled_token_ids = self.sampler( logits, self.sampling_metadata, self.max_model_len, diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index 5b1a886c05..db466dd0b8 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -38,264 +38,31 @@ ) from fastdeploy.model_executor.layers.rotary_embedding import get_rope, get_rope_3d from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata -from fastdeploy.model_executor.layers.sample.sampler import Sampler +from fastdeploy.model_executor.layers.sample.sampler import Sampler, SpeculativeSampler from fastdeploy.model_executor.model_loader import get_model_loader from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import ScatterOp +from fastdeploy.model_executor.pre_and_post_process import xpu_pre_process, xpu_post_process, xpu_process_output + from fastdeploy.model_executor.ops.xpu import ( - adjust_batch, - get_infer_param, - get_padding_offset, recover_decode_task, set_data_ipc, share_external_data, + speculate_clear_accept_nums, + speculate_get_output_padding_offset, + speculate_get_padding_offset, + speculate_get_seq_lens_output, + speculate_save_output, + speculate_set_value_by_flags_and_idx, update_inputs_v1, ) +from fastdeploy.spec_decode import MTPProposer from fastdeploy.utils import get_logger from fastdeploy.worker.model_runner_base import ModelRunnerBase -from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput +from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput, SamplerOutput logger = get_logger("xpu_model_runner", "xpu_model_runner.log") -def xpu_pre_process( - input_ids: paddle.Tensor, - seq_lens_this_time: int, - share_inputs: Dict, - use_speculate_method: bool, - block_size: int, - draft_tokens: Optional[paddle.Tensor] = None, - seq_lens_encoder: Optional[paddle.Tensor] = None, - seq_lens_decoder: Optional[paddle.Tensor] = None, -) -> XPUForwardMeta: - """ """ - max_len = input_ids.shape[1] - cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time, dtype="int32") - token_num = paddle.sum(seq_lens_this_time) - - ( - ids_remove_padding, - cum_offsets, - batch_id_per_token, - cu_seqlens_q, - cu_seqlens_k, - ) = get_padding_offset(input_ids, cum_offsets_now, token_num, seq_lens_this_time) - - share_inputs["ids_remove_padding"] = None # set this after adjust batch - share_inputs["cum_offsets"] = cum_offsets - share_inputs["batch_id_per_token"] = batch_id_per_token - share_inputs["cu_seqlens_q"] = cu_seqlens_q - share_inputs["cu_seqlens_k"] = cu_seqlens_k - - xpu_forward_meta = XPUForwardMeta( - input_ids=share_inputs["input_ids"], - ids_remove_padding=share_inputs["ids_remove_padding"], - rotary_embs=share_inputs["rope_emb"], - attn_backend=None, - seq_lens_encoder=share_inputs["seq_lens_encoder"], - seq_lens_decoder=share_inputs["seq_lens_decoder"], - 
seq_lens_this_time=share_inputs["seq_lens_this_time"], - cum_offsets=share_inputs["cum_offsets"], - batch_id_per_token=share_inputs["batch_id_per_token"], - cu_seqlens_q=share_inputs["cu_seqlens_q"], - cu_seqlens_k=share_inputs["cu_seqlens_k"], - block_tables=share_inputs["block_tables"], - caches=share_inputs["caches"], - ) - - ( - xpu_forward_meta.encoder_batch_map, - xpu_forward_meta.decoder_batch_map, - xpu_forward_meta.encoder_batch_idx, - xpu_forward_meta.decoder_batch_idx, - xpu_forward_meta.encoder_seq_lod, - xpu_forward_meta.decoder_seq_lod, - xpu_forward_meta.encoder_kv_lod, - xpu_forward_meta.prefix_len, - xpu_forward_meta.decoder_context_len, - xpu_forward_meta.decoder_context_len_cache, - xpu_forward_meta.prefix_block_tables, - xpu_forward_meta.encoder_batch_map_cpu, - xpu_forward_meta.decoder_batch_map_cpu, - xpu_forward_meta.encoder_batch_idx_cpu, - xpu_forward_meta.decoder_batch_idx_cpu, - xpu_forward_meta.encoder_seq_lod_cpu, - xpu_forward_meta.decoder_seq_lod_cpu, - xpu_forward_meta.encoder_kv_lod_cpu, - xpu_forward_meta.prefix_len_cpu, - xpu_forward_meta.decoder_context_len_cpu, - xpu_forward_meta.decoder_context_len_cache_cpu, - xpu_forward_meta.len_info_cpu, - ) = get_infer_param( - seq_lens_encoder, seq_lens_decoder, seq_lens_this_time, xpu_forward_meta.block_tables, block_size - ) - xpu_forward_meta.enc_batch = xpu_forward_meta.len_info_cpu[0] - xpu_forward_meta.dec_batch = xpu_forward_meta.len_info_cpu[1] - xpu_forward_meta.total_enc_len = xpu_forward_meta.len_info_cpu[2] - - adjusted_input = adjust_batch( - ids_remove_padding.reshape([-1, 1]), - cum_offsets, - xpu_forward_meta.encoder_seq_lod, - xpu_forward_meta.encoder_batch_idx, - xpu_forward_meta.decoder_batch_idx, - xpu_forward_meta.encoder_seq_lod_cpu, - xpu_forward_meta.encoder_batch_idx_cpu, - xpu_forward_meta.decoder_batch_idx_cpu, - xpu_forward_meta.enc_batch, - xpu_forward_meta.dec_batch, - None, # output_padding_offset - -1, # max_input_length - ) - - adjusted_input = adjusted_input.squeeze(1) - - share_inputs["ids_remove_padding"] = adjusted_input - xpu_forward_meta.ids_remove_padding = adjusted_input - return xpu_forward_meta - - -def xpu_process_output( - forward_output, - cum_offsets: paddle.Tensor, - xpu_forward_meta: XPUForwardMeta, -) -> paddle.Tensor: - """ """ - from fastdeploy.model_executor.ops.xpu import gather_next_token - - hiddden_states = gather_next_token( - forward_output, - cum_offsets, - xpu_forward_meta.encoder_seq_lod, - xpu_forward_meta.encoder_batch_map, - xpu_forward_meta.decoder_batch_map, - xpu_forward_meta.encoder_seq_lod_cpu, - xpu_forward_meta.encoder_batch_map_cpu, - xpu_forward_meta.decoder_batch_map_cpu, - xpu_forward_meta.enc_batch, - xpu_forward_meta.dec_batch, - None, # output_padding_offset - -1, # max_input_length - ) - return hiddden_states - - -def xpu_post_process( - sampled_token_ids: paddle.Tensor, - model_output: ModelOutputData, - share_inputs: Dict[str, paddle.Tensor], - block_size: int = 64, - skip_save_output: bool = False, -) -> None: - """ """ - from fastdeploy.model_executor.ops.xpu import ( - save_output, - set_stop_value_multi_ends, - update_inputs, - ) - - # handle vl: - if model_output.enable_thinking: - exists_think_end = sampled_token_ids == model_output.think_end_id - paddle.assign( - paddle.where( - exists_think_end, - model_output.need_think_end - 1, - model_output.need_think_end, - ), - model_output.need_think_end, - ) - - paddle.assign( - paddle.where( - model_output.need_think_end.cast("bool"), - model_output.reasoning_index - 1, - 
model_output.reasoning_index, - ), - model_output.reasoning_index, - ) - - stop_wo_think = ( - (sampled_token_ids == model_output.eos_token_id.T).any(axis=1, keepdim=True) - | (model_output.reasoning_index == 0) - ) & (model_output.need_think_end > 0) - sampled_token_ids = paddle.where( - stop_wo_think, - model_output.think_end_id, - sampled_token_ids, - ) - paddle.assign( - paddle.where( - stop_wo_think, - model_output.need_think_end - 1, - model_output.need_think_end, - ), - model_output.need_think_end, - ) - - # 1. Set stop value - paddle.assign( - paddle.where( - model_output.stop_flags, - model_output.step_idx, - model_output.step_idx + 1, - ), - model_output.step_idx, - ) - length_cond = paddle.greater_equal(model_output.step_idx, model_output.max_dec_len) - paddle.assign( - paddle.logical_or(model_output.stop_flags, length_cond), - model_output.stop_flags, - ) - set_stop_value_multi_ends( - sampled_token_ids, - model_output.stop_flags, - model_output.seq_lens_this_time, - model_output.eos_token_id, - model_output.next_tokens, - False, - ) # multi ends - - # 2. Update the input buffer of the model - with paddle.framework._no_check_dy2st_diff(): - if envs.ENABLE_V1_KVCACHE_SCHEDULER and not skip_save_output: - update_inputs_v1( - model_output.stop_flags, - model_output.not_need_stop, - model_output.seq_lens_this_time, - model_output.seq_lens_encoder, - model_output.seq_lens_decoder, - share_inputs["step_seq_lens_decoder"], - share_inputs["prompt_lens"], - sampled_token_ids, - model_output.input_ids, - share_inputs["block_tables"], - model_output.stop_nums, - model_output.next_tokens, - model_output.is_block_step, - block_size, - ) - else: - update_inputs( - model_output.stop_flags, - model_output.not_need_stop, - model_output.seq_lens_this_time, - model_output.seq_lens_encoder, - model_output.seq_lens_decoder, - model_output.input_ids, - model_output.stop_nums, - sampled_token_ids, - model_output.is_block_step, - ) - # 3. Transmit the model's output and stop generation signal via message queue. - # In the future, we will abandon this approach. 
- if not skip_save_output: - save_output( - sampled_token_ids, - model_output.not_need_stop, - model_output.mp_rank, - False, # use_ep - ) - def step_paddle( share_inputs: Dict[str, paddle.Tensor], @@ -375,9 +142,20 @@ def __init__( "fused_gemm_epilogue", ] + self.device_id = device_id + self.speculative_method = self.fd_config.speculative_config.method + self.speculative_decoding = self.speculative_method is not None + + # used by SamplingMetadata + self.enable_logprob = fd_config.model_config.enable_logprob + self.enable_early_stop = self.fd_config.early_stop_config.enable_early_stop + # Sampler - # TODU(lilujia): sync with GPU - self.sampler = Sampler(fd_config) + print("ch -- debug self.speculative_decoding:", self.speculative_decoding) + if not self.speculative_decoding: + self.sampler = Sampler(fd_config) + else: + self.sampler = SpeculativeSampler(fd_config) # Lazy initialize kv cache after model loading # self.kv_caches: list[paddle.Tensor] = [] @@ -386,7 +164,7 @@ def __init__( self.graph_opt_level = self.graph_opt_config.graph_opt_level self.use_cudagraph = False self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes - self.input_ids = paddle.zeros(self.scheduler_config.max_num_seqs, dtype="int32") + # self.input_ids = paddle.zeros(self.scheduler_config.max_num_seqs, dtype="int32") # Initialize share inputs self._init_share_inputs(self.fd_config.scheduler_config.max_num_seqs) @@ -395,6 +173,8 @@ def __init__( fill_value=4, dtype="int64", ).cpu() + # do not support chunk prefill + # self.restore_chunked_prefill_request = dict() # Initialize attention Backend # NOTE(gonshaotian): Currently, all attention layers share one attention backend instance. @@ -405,16 +185,11 @@ def __init__( # Forward meta store the global meta information of the forward self.forward_meta: ForwardMeta = None - def exist_prefill(self): - """ - check whether prefill stage exist - """ - if int(paddle.max(self.share_inputs["seq_lens_encoder"])) != 0: - return 1 - else: - return 0 + # # Postprocess Env params + # os.environ["INFERENCE_MSG_QUEUE_ID"] = str(self.parallel_config.engine_worker_queue_port) + # logger.info(f"queue id is {str(self.parallel_config.engine_worker_queue_port)}") - def insert_tasks_v1(self, req_dicts: List[Request]): + def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests): """ Process scheduler output tasks, used when ENABLE_V1_KVCACHE_SCHEDULER=1 req_dict: A list of Request dict @@ -495,7 +270,7 @@ def insert_tasks_v1(self, req_dicts: List[Request]): has_decode_task = True self.share_inputs["stop_flags"][idx : idx + 1] = False self.share_inputs["seq_lens_decoder"][idx : idx + 1] = prefill_start_index - self.share_inputs["seq_lens_this_time"][idx : idx + 1] = length + self.seq_lens_this_time_buffer[idx : idx + 1] = length self.share_inputs["seq_lens_encoder"][idx : idx + 1] = length self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = 0 self.share_inputs["prompt_lens"][idx : idx + 1] = len(input_ids) @@ -518,7 +293,7 @@ def insert_tasks_v1(self, req_dicts: List[Request]): logger.debug(f"Handle preempted request {request} at idx {idx}") self.share_inputs["block_tables"][idx : idx + 1, :] = -1 self.share_inputs["stop_flags"][idx : idx + 1] = True - self.share_inputs["seq_lens_this_time"][idx : idx + 1] = 0 + self.seq_lens_this_time_buffer[idx : idx + 1] = 0 self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0 self.share_inputs["seq_lens_encoder"][idx : idx + 1] = 0 self.share_inputs["is_block_step"][idx : idx + 1] = False @@ -528,6 +303,7 @@ def 
insert_tasks_v1(self, req_dicts: List[Request]): self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1) self.share_inputs["top_p"][idx : idx + 1] = request.get("top_p", 0.7) + # self.share_inputs["top_p"][idx : idx + 1] = 0 self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0) self.share_inputs["top_k_list"][idx] = request.get("top_k", 0) self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0) @@ -536,6 +312,10 @@ def insert_tasks_v1(self, req_dicts: List[Request]): self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0) self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0) self.share_inputs["presence_score"][idx : idx + 1] = request.get("presence_penalty", 0.0) + # self.share_inputs["temp_scaled_logprobs"][idx : idx + 1] = request.get("temp_scaled_logprobs", False) + # self.share_inputs["top_p_normalized_logprobs"][idx : idx + 1] = request.get( + # "top_p_normalized_logprobs", False + # ) self.share_inputs["min_dec_len"][idx : idx + 1] = request.get("min_tokens", 1) self.share_inputs["max_dec_len"][idx : idx + 1] = request.get( @@ -574,7 +354,12 @@ def insert_tasks_v1(self, req_dicts: List[Request]): if has_prefill_task or has_decode_task: self.share_inputs["not_need_stop"][0] = True - def insert_prefill_inputs(self, req_dicts: List[Request]): + # self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer[:num_running_requests] + self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer + if self.speculative_method in ["mtp"]: + self.proposer.insert_tasks_v1(req_dicts, num_running_requests) + + def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests): """Process inputs for prefill tasks and update share_inputs buffer""" req_len = len(req_dicts) for i in range(req_len): @@ -599,7 +384,7 @@ def insert_prefill_inputs(self, req_dicts: List[Request]): else: self.share_inputs["seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0) self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0) - self.share_inputs["seq_lens_this_time"][idx : idx + 1] = length + self.seq_lens_this_time_buffer[idx : idx + 1] = length self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = length self.share_inputs["seq_lens_encoder"][idx : idx + 1] = length self.share_inputs["prompt_lens"][idx : idx + 1] = length @@ -625,6 +410,7 @@ def get_attr_from_request(request, attr, default_value=None): assert len(request.eos_token_ids) == self.model_config.eos_tokens_lens self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1) self.share_inputs["top_p"][idx : idx + 1] = get_attr_from_request(request, "top_p", 0.7) + # self.share_inputs["top_p"][idx : idx + 1] = 0 self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0) self.share_inputs["top_k_list"][idx] = request.get("top_k", 0) self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0) @@ -640,6 +426,13 @@ def get_attr_from_request(request, attr, default_value=None): self.share_inputs["presence_score"][idx : idx + 1] = get_attr_from_request( request, "presence_penalty", 0.0 ) + # self.share_inputs["temp_scaled_logprobs"][idx : idx + 1] = get_attr_from_request( + # request, "temp_scaled_logprobs", False + # ) + # self.share_inputs["top_p_normalized_logprobs"][idx : idx + 1] = get_attr_from_request( + # request, "top_p_normalized_logprobs", False + # ) + 
self.share_inputs["min_dec_len"][idx : idx + 1] = request.get("min_tokens", 1) self.share_inputs["max_dec_len"][idx : idx + 1] = request.get( "max_tokens", self.model_config.max_model_len @@ -680,8 +473,13 @@ def get_attr_from_request(request, attr, default_value=None): ] = np.array(request.get("stop_token_ids"), dtype="int64") else: self.share_inputs["stop_seqs_len"][idx : idx + 1, :] = 0 - + # self.sampler.apply_logits_processor(idx, request.get("logits_processor"), prefill_tokens) self.share_inputs["not_need_stop"][0] = True + # self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer[:num_running_requests] + self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer + if self.speculative_method in ["mtp"]: + self.proposer.insert_prefill_inputs(req_dicts, num_running_requests) + def _init_share_inputs(self, max_num_seqs: int): """Initialize all share buffers for model inputs. @@ -734,7 +532,8 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["max_length"] = paddle.full( [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" ) - self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, 0, dtype="int32") + # self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, 0, dtype="int32") + self.seq_lens_this_time_buffer = paddle.full(max_num_seqs, 0, dtype="int32") self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["seq_lens_decoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["step_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") @@ -764,6 +563,29 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["ori_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["system_lens"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int32") + + self.share_inputs["ids_remove_padding"] = paddle.full( + [max_num_seqs * self.parallel_config.max_model_len], + 0, + dtype="int64", + ) + self.share_inputs["batch_id_per_token"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["cu_seqlens_q"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["cu_seqlens_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + # Declare AttentionBackend buffers + self.share_inputs["decoder_batch_ids"] = None + self.share_inputs["decoder_tile_ids_per_batch"] = None + self.share_inputs["decoder_num_blocks_cpu"] = None # Pinning Memory + self.share_inputs["decoder_num_blocks_device"] = None + self.share_inputs["decoder_chunk_size_device"] = None + self.share_inputs["max_len_tensor_cpu"] = None # CPU + self.share_inputs["encoder_batch_ids"] = None + self.share_inputs["encoder_tile_ids_per_batch"] = None + self.share_inputs["encoder_num_blocks_x_cpu"] = None # CPU + self.share_inputs["kv_batch_ids"] = None + self.share_inputs["kv_tile_ids_per_batch"] = None + self.share_inputs["kv_num_blocks_x_cpu"] = None # CPU + self.share_inputs["max_len_kv_cpu"] = None # CPU # Initialize rotary position embedding tmp_position_ids = paddle.arange(self.model_config.max_model_len).reshape((1, -1)) @@ -827,6 +649,46 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") self.share_inputs["enable_thinking"] = paddle.full(shape=[1], fill_value=True, dtype="bool") self.share_inputs["reasoning_index"] = 
paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") + + if self.speculative_decoding: + max_draft_token_num = self.speculative_config.num_speculative_tokens + self.share_inputs["input_ids_cpu"] = paddle.full( + shape=[max_num_seqs, self.parallel_config.max_model_len], + fill_value=1, + dtype="int64", + ).cpu() + self.share_inputs["accept_tokens"] = paddle.full( + shape=[max_num_seqs, max_draft_token_num + 1], + fill_value=0, + dtype="int64", + ) + self.share_inputs["accept_num"] = paddle.full(shape=[max_num_seqs], fill_value=0, dtype="int32") + self.share_inputs["draft_tokens"] = paddle.full( + shape=[max_num_seqs, max_draft_token_num + 1], + fill_value=0, + dtype="int64", + ) + + self.share_inputs["actual_draft_token_num"] = paddle.full( + shape=[max_num_seqs], + fill_value=max_draft_token_num, + dtype="int32", + ) + self.share_inputs["output_cum_offsets"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") + self.share_inputs["output_padding_offset"] = paddle.full( + shape=[max_num_seqs * (max_draft_token_num + 1)], + fill_value=0, + dtype="int32", + ) + # For V1_KVCACHE_SCHEDULER + self.share_inputs["step_draft_tokens"] = paddle.full( + shape=[max_num_seqs, max_draft_token_num + 1], + fill_value=0, + dtype="int64", + ) + self.share_inputs["step_seq_lens_this_time"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + + self.max_num_seqs = max_num_seqs def _prepare_inputs(self, is_dummy_run=False) -> None: """Prepare the model inputs""" @@ -841,22 +703,23 @@ def _prepare_inputs(self, is_dummy_run=False) -> None: self.share_inputs["is_block_step"], self.cache_config.block_size, ) + self.forward_meta = xpu_pre_process( self.share_inputs["input_ids"], self.share_inputs["seq_lens_this_time"], self.share_inputs, - use_speculate_method=False, - block_size=self.cache_config.block_size, - draft_tokens=None, + use_speculate_method=self.speculative_decoding, + block_size=self.parallel_config.block_size, + draft_tokens=self.share_inputs["draft_tokens"] if self.speculative_decoding else None, seq_lens_encoder=self.share_inputs["seq_lens_encoder"], seq_lens_decoder=self.share_inputs["seq_lens_decoder"], - ) + ) + # Update bad tokens len max_bad_tokens_len = paddle.max(self.share_inputs["bad_tokens_len"]) if self.enable_mm: # pos_emb_type is different in EB and VL self.forward_meta.pos_emb_type = "HALF_HEAD_DIM" - self.forward_meta.attn_backend = self.attn_backends[0] self.initialize_attention_backend() # Get sampling metadata @@ -893,6 +756,7 @@ def load_model(self) -> None: # 2. Load lora model # 3. 
Load drafter model(for speculative decoding) + self._init_speculative_proposer() def get_model(self) -> nn.Layer: """Get current model""" @@ -902,6 +766,7 @@ def initialize_attention_backend(self): """ Initialize attention meta data """ + self.forward_meta.attn_backend = self.attn_backends[0] # Initialzie attention meta data for attn_backend in self.attn_backends: attn_backend.init_attention_metadata(self.forward_meta) @@ -989,6 +854,39 @@ def initialize_attn_backend(self) -> None: int(self.model_config.num_key_value_heads) // self.parallel_config.tensor_parallel_size ) head_dim = self.model_config.head_dim + + # Initialize AttentionBackend buffers + encoder_block_shape_q = 64 + decoder_block_shape_q = 16 + decoder_step_token_num = self.speculative_config.num_speculative_tokens + 1 + decode_max_tile_size = self.max_num_seqs * np.ceil( + (decoder_step_token_num * np.ceil(num_heads / self.model_config.kv_num_heads)) / decoder_block_shape_q + ) + + group_size = np.ceil(num_heads / self.model_config.kv_num_heads) + encode_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil( + (self.model_config.max_model_len * group_size) / encoder_block_shape_q + ) + kv_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil( + self.model_config.max_model_len / self.fd_config.cache_config.block_size + ) + self.share_inputs["decoder_batch_ids"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32") + self.share_inputs["decoder_tile_ids_per_batch"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32") + self.share_inputs["decoder_num_blocks_cpu"] = paddle.full([1], 0, dtype="int32").cpu() + # NOTE: (changwenbin) MLA kernel only needs decoder_num_blocks_device in place of GPU tensor, + # adapted to cudagraph. + self.share_inputs["decoder_num_blocks_device"] = paddle.full([1], 0, dtype="int32") + self.share_inputs["decoder_chunk_size_device"] = paddle.full([1], 64, dtype="int32") + self.share_inputs["max_len_tensor_cpu"] = paddle.full([8], 0, dtype="int32").cpu() + + self.share_inputs["encoder_batch_ids"] = paddle.full([int(encode_max_tile_size)], 0, dtype="int32") + self.share_inputs["encoder_tile_ids_per_batch"] = paddle.full([int(encode_max_tile_size)], 0, dtype="int32") + self.share_inputs["encoder_num_blocks_x_cpu"] = paddle.full([1], 0, dtype="int32").cpu() + + self.share_inputs["kv_batch_ids"] = paddle.full([int(kv_max_tile_size)], 0, dtype="int32") + self.share_inputs["kv_tile_ids_per_batch"] = paddle.full([int(kv_max_tile_size)], 0, dtype="int32") + self.share_inputs["kv_num_blocks_x_cpu"] = paddle.full([1], 0, dtype="int32").cpu() + self.share_inputs["max_len_kv_cpu"] = paddle.full([1], 0, dtype="int32").cpu() # Get the attention backend attn_cls = get_attention_backend() @@ -1004,6 +902,82 @@ def initialize_attn_backend(self) -> None: ) self.attn_backends.append(attn_backend) + def _init_speculative_proposer(self): + """ + Init speculative proposer + """ + if self.speculative_method == "ngram": + # xpu not support ngram proposer now + # self.proposer = NgramProposer(self.fd_config) + self.proposer = None + elif self.speculative_method == "mtp": + self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer + # print("ch -- debug self.share_inputs, before init MTP Proposer:", self.share_inputs) + self.proposer = MTPProposer( + self.fd_config, + self.get_model(), + self.local_rank, + self.device_id, + self.share_inputs, + ) + else: + self.proposer = None + + def _init_logits_processor(self, request): + """ + init logits processor for guided decoding + """ + assert 
self.guided_backend is not None, ( + "guided_backend is None, use " "--guided-decoding-backend to specify the backend at server startup." + ) + + if request.guided_json is not None: + schemata_key = ("json", request.guided_json) + elif request.guided_regex is not None: + schemata_key = ("regex", request.guided_regex) + elif request.guided_grammar is not None: + schemata_key = ("grammar", request.guided_grammar) + elif request.structural_tag is not None: + schemata_key = ("structural_tag", request.structural_tag) + + enable_thinking = request.get("enable_thinking", True) + enable_thinking = enable_thinking if enable_thinking is not None else True + + return ( + self.guided_backend.get_logits_processor( + schemata_key=schemata_key, + enable_thinking=enable_thinking, + ), + schemata_key, + ) + + def capture_model(self) -> None: + """ + Trigger CUDA Graph capture for all shapes in 'CudaGraphConfig.cudagraph_capture_sizes' + """ + logger.warn("XPU not support cuda graph currently") + pass + + @sot_warmup_guard(True) + def sot_warmup(self) -> None: + start_time = time.perf_counter() + for batch_size in self.sot_warmup_sizes: + self._dummy_run( + num_tokens=self.scheduler_config.max_num_batched_tokens, + batch_size=batch_size, + ) + logger.info(f"SOT warmup the model with the batch size:{batch_size}") + logger.info(f"SOT warmup took {time.perf_counter() - start_time} seconds") + + def exist_prefill(self): + """ + check whether prefill stage exist + """ + if int(paddle.max(self.share_inputs["seq_lens_encoder"])) != 0: + return 1 + else: + return 0 + def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int): """Set dummy prefill inputs to share_inputs""" full_length = min(num_tokens // batch_size, self.model_config.max_model_len - 10) @@ -1017,7 +991,7 @@ def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int): self.share_inputs["input_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length) self.share_inputs["prompt_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length) self.share_inputs["eos_token_id"][:] = np.array([2], dtype="int64").reshape(-1, 1) - self.share_inputs["seq_lens_this_time"][idx : idx + 1] = input_length + self.seq_lens_this_time_buffer[idx : idx + 1] = input_length self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = input_length self.share_inputs["seq_lens_encoder"][idx : idx + 1] = input_length @@ -1034,6 +1008,7 @@ def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int): self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange( idx * block_num, (idx + 1) * block_num, 1 ) + self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer def _dummy_run( self, @@ -1047,12 +1022,20 @@ def _dummy_run( num_tokens: Expected number of tokens generated """ self._dummy_prefill_inputs(num_tokens, batch_size) - + if self.speculative_method in ["mtp"]: + self.proposer.dummy_prefill_inputs( + num_tokens=num_tokens, + batch_size=batch_size, + expected_decode_len=1, + ) + print("ch -- debug after _dummy_prefill_inputs") while True: self.execute_model(is_dummy_run=True) if int((self.share_inputs["seq_lens_this_time"] > 0).sum()) == 0: break + print("ch -- debug _dummy_run finished.") + print("=========================================================") def _set_debug_level( self, debug_level: int = 0x1, model_forward_batch: Optional[List[Request]] = None, is_dummy_run: bool = False @@ -1112,7 +1095,7 @@ class at the server level, which is too granular for ModelRunner. # 1. Prepare inputs of model and decoder. 
@@ -1112,7 +1095,7 @@ class at the server level, which is too granular for ModelRunner.
         # 1. Prepare inputs of model and decoder.
         self._prepare_inputs(is_dummy_run=is_dummy_run)

         # 2. Padding inputs for CUDA graph

         # 3. Execute model
@@ -1121,17 +1104,26 @@ class at the server level, which is too granular for ModelRunner.
                 self.share_inputs["ids_remove_padding"], self.share_inputs["image_features"], self.forward_meta
             )
         else:
             model_output = self.model(
                 ids_remove_padding=self.share_inputs["ids_remove_padding"],
                 forward_meta=self.forward_meta,
             )
-
-        hidden_states = xpu_process_output(model_output, self.share_inputs["cum_offsets"], self.forward_meta)
+        hidden_states = xpu_process_output(model_output, None, self.forward_meta, self.share_inputs)

         # 4. Compute logits, Sample
         logits = self.model.compute_logits(hidden_states)
-        sampler_output = self.sampler(logits, self.sampling_metadata)
+        sampler_output = None
+        if not self.speculative_decoding:
+            sampler_output = self.sampler(logits, self.sampling_metadata)
+        else:
+            # The speculative sampler writes its results into share_inputs
+            # (draft_tokens, accept_tokens, ...) instead of returning them.
+            self.sampler(
+                logits,
+                self.sampling_metadata,
+                self.parallel_config.max_model_len,
+                self.share_inputs,
+            )
+        # TODO(chenhuan09): support tp/dp

         # 5. Speculative decode
         # 6. Post Process
@@ -1149,15 +1141,16 @@ class at the server level, which is too granular for ModelRunner.
             seq_lens_encoder=self.share_inputs["seq_lens_encoder"],
             seq_lens_decoder=self.share_inputs["seq_lens_decoder"],
             is_block_step=self.share_inputs["is_block_step"],
-            # speculative decoding
-            full_hidden_states=None,
+            full_hidden_states=model_output,  # speculative decoding
             msg_queue_id=self.parallel_config.msg_queue_id,
             mp_rank=self.local_rank,
             use_ep=self.parallel_config.use_ep,
-            draft_tokens=None,
-            actual_draft_token_num=None,
-            accept_tokens=None,
-            accept_num=None,
+            draft_tokens=(self.share_inputs["draft_tokens"] if self.speculative_decoding else None),
+            actual_draft_token_num=(
+                self.share_inputs["actual_draft_token_num"] if self.speculative_decoding else None
+            ),
+            accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None),
+            accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None),
             enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None),
             think_end_id=(self.model_config.think_end_id if self.enable_mm else -1),
             need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None),
@@ -1165,13 +1158,26 @@ class at the server level, which is too granular for ModelRunner.
             stop_token_ids=self.share_inputs["stop_seqs"],
             stop_seqs_len=self.share_inputs["stop_seqs_len"],
         )
+        # Skip saving outputs for dummy runs, and for the prefill role when
+        # MTP speculative decoding runs in splitwise mode.
+        skip_save_output = is_dummy_run
+        if self.speculative_config.method in ["mtp"] and self.scheduler_config.splitwise_role == "prefill":
+            skip_save_output = True
+        # TODO(chenhuan09): need check, support mtp
         xpu_post_process(
-            sampled_token_ids=sampler_output.sampled_token_ids,
+            sampler_output=sampler_output,
             model_output=model_output_data,
             share_inputs=self.share_inputs,
-            block_size=self.cache_config.block_size,
-            skip_save_output=is_dummy_run,
+            block_size=self.parallel_config.block_size,
+            speculative_decoding=self.speculative_decoding,
+            skip_save_output=skip_save_output,
         )
+        if self.speculative_decoding and self.speculative_method == "mtp":
+            self.proposer.run(full_hidden_states=model_output)
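Restating the corrected skip_save_output decision above as a standalone predicate (a sketch of the intended logic rather than code from the patch; the rationale in the comment is an assumption):

    def should_skip_save_output(is_dummy_run: bool, spec_method, splitwise_role: str) -> bool:
        # Outputs are not persisted for warmup/profiling dummy runs, nor on a
        # prefill-only instance when MTP speculative decoding runs splitwise
        # (presumably the decode instance is the one that reports tokens).
        return is_dummy_run or (spec_method == "mtp" and splitwise_role == "prefill")

    assert should_skip_save_output(True, None, "mixed")
    assert should_skip_save_output(False, "mtp", "prefill")
    assert not should_skip_save_output(False, "mtp", "decode")
    assert not should_skip_save_output(False, None, "prefill")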
         # 7. Update 'infer_seed' and step_paddle()
         self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
@@ -1182,14 +1188,23 @@ class at the server level, which is too granular for ModelRunner.
             self.cache_config.enc_dec_block_num,
         )

+        self.seq_lens_this_time_buffer.copy_(self.share_inputs["seq_lens_this_time"], False)
         return None

     @profile_run_guard(True)
     def profile_run(self) -> None:
         """Execute a forward pass with dummy inputs to profile the memory usage of the model"""
-        self.num_gpu_blocks = self.cache_config.total_block_num
-        self.initialize_kv_cache(profile=True)
+        self.num_gpu_blocks = self.parallel_config.total_block_num
+        self.initialize_kv_cache()
+        if self.speculative_method in ["mtp"]:
+            self.proposer.initialize_kv_cache(main_model_num_blocks=self.num_gpu_blocks, profile=True)

         self._dummy_run(
             num_tokens=int(self.scheduler_config.max_num_batched_tokens),
diff --git a/fastdeploy/worker/xpu_worker.py b/fastdeploy/worker/xpu_worker.py
index 0f84b1db08..626689dd13 100644
--- a/fastdeploy/worker/xpu_worker.py
+++ b/fastdeploy/worker/xpu_worker.py
@@ -75,6 +75,7 @@ def init_device(self):
         self.model_runner: XPUModelRunner = XPUModelRunner(
             fd_config=self.fd_config,
             device=self.device,
+            device_id=self.device_ids[self.local_rank % len(self.device_ids)],
             rank=self.rank,
-            device_id=self.device_id,
             local_rank=self.local_rank,
@@ -165,9 +166,9 @@ def preprocess_new_task(self, req_dicts: List[Request], num_running_requests: int
         and workers and modelrunners should not perceive it.
         """
         if envs.ENABLE_V1_KVCACHE_SCHEDULER:
-            self.model_runner.insert_tasks_v1(req_dicts=req_dicts)
+            self.model_runner.insert_tasks_v1(req_dicts=req_dicts, num_running_requests=num_running_requests)
         else:
-            self.model_runner.insert_prefill_inputs(req_dicts=req_dicts)
+            self.model_runner.insert_prefill_inputs(req_dicts=req_dicts, num_running_requests=num_running_requests)

     def graph_optimize_and_warm_up_model(self) -> None:
         """