diff --git a/1 b/1
new file mode 100644
index 0000000000..92e7e15074
--- /dev/null
+++ b/1
@@ -0,0 +1,209 @@
+WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.
+
+[notice] A new release of pip is available: 24.2 -> 25.2
+[notice] To update, run: python -m pip install --upgrade pip
+XCCL /usr/local/lib/python3.10/dist-packages/paddle/base/../libs/libbkcl.so loaded
+/usr/local/lib/python3.10/dist-packages/paddle/utils/cpp_extension/extension_utils.py:717: UserWarning: No ccache found. Please be aware that recompiling all source files may be required. You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md
+  warnings.warn(warning_message)
+XCCL /usr/local/lib/python3.10/dist-packages/paddle/base/../libs/libbkcl.so loaded
+/usr/local/lib/python3.10/dist-packages/paddle/utils/cpp_extension/extension_utils.py:717: UserWarning: No ccache found. Please be aware that recompiling all source files may be required. You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md
+  warnings.warn(warning_message)
+XCCL /usr/local/lib/python3.10/dist-packages/paddle/base/../libs/libbkcl.so loaded
+/usr/local/lib/python3.10/dist-packages/paddle/utils/cpp_extension/extension_utils.py:717: UserWarning: No ccache found. Please be aware that recompiling all source files may be required. You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md
+  warnings.warn(warning_message)
+XCCL /usr/local/lib/python3.10/dist-packages/paddle/base/../libs/libbkcl.so loaded
+/usr/local/lib/python3.10/dist-packages/paddle/utils/cpp_extension/extension_utils.py:717: UserWarning: No ccache found. Please be aware that recompiling all source files may be required. You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md
+  warnings.warn(warning_message)
+CMake Warning:
+  Manually-specified variables were not used by the project:
+
+    BUILD_STANDALONE
+
+
+/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/speculate_set_value_by_flags.xpu:26:17: note: unsupported dynamic stack alloca
+__global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all,
+                ^
+/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/speculate_set_value_by_flags.xpu:26:17: note: unsupported dynamic stack alloca
+[2025-09-18 08:43:05,095] [ INFO] dist.py:970 - running install
+/usr/local/lib/python3.10/dist-packages/setuptools/_distutils/cmd.py:66: SetuptoolsDeprecationWarning: setup.py install is deprecated.
+!!
+
+        ********************************************************************************
+        Please avoid running ``setup.py`` directly.
+        Instead, use pypa/build, pypa/installer or other
+        standards-based tools.
+
+        See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
+        ********************************************************************************
+
+!!
+  self.initialize_options()
+/usr/local/lib/python3.10/dist-packages/setuptools/_distutils/cmd.py:66: EasyInstallDeprecationWarning: easy_install command is deprecated.
+!!
+
+        ********************************************************************************
+        Please avoid running ``setup.py`` and ``easy_install``.
+        Instead, use pypa/build, pypa/installer or other
+        standards-based tools.
+
+        See https://github.com/pypa/setuptools/issues/917 for details.
+        ********************************************************************************
+
+!!
+  self.initialize_options()
+[2025-09-18 08:43:05,153] [ INFO] easy_install.py:573 - Checking .pth file support in /home/chenhuan09/FD/FastDeploy/custom_ops/tmp/
+[2025-09-18 08:43:05,153] [ INFO] spawn.py:60 - /usr/bin/python -E -c pass
+[2025-09-18 08:43:05,168] [ WARNING] easy_install.py:627 - TEST FAILED: /home/chenhuan09/FD/FastDeploy/custom_ops/tmp/ does NOT support .pth files
+[2025-09-18 08:43:05,168] [ WARNING] easy_install.py:504 - bad install directory or PYTHONPATH
+
+You are attempting to install a package to a directory that is not
+on PYTHONPATH and which Python does not read ".pth" files from. The
+installation directory you specified (via --install-dir, --prefix, or
+the distutils default setting) was:
+
+    /home/chenhuan09/FD/FastDeploy/custom_ops/tmp/
+
+and your PYTHONPATH environment variable currently contains:
+
+    '/opt/source/PaddleNLP/llm/server/server:/opt/source/PaddleNLP'
+
+Here are some of your options for correcting the problem:
+
+* You can choose a different installation directory, i.e., one that is
+  on PYTHONPATH or supports .pth files
+
+* You can add the installation directory to the PYTHONPATH environment
+  variable. (It must then also be on PYTHONPATH whenever you run
+  Python and want to use the package(s) you are installing.)
+
+* You can set up the installation directory to support ".pth" files by
+  using one of the approaches described here:
+
+  https://setuptools.pypa.io/en/latest/deprecated/easy_install.html#custom-installation-locations
+
+
+Please make the appropriate changes for your system and try again.
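[Editor's note: the second option listed in the setuptools message above (putting the install directory on PYTHONPATH) is easy to sanity-check before re-running the build. A minimal sketch, not part of the log; the paths are taken from the log output, while the names INSTALL_DIR and is_on_sys_path are ours for illustration only.]

    import os
    import sys

    # Directory the log complains about ("does NOT support .pth files").
    INSTALL_DIR = "/home/chenhuan09/FD/FastDeploy/custom_ops/tmp/"

    def is_on_sys_path(directory: str) -> bool:
        # Python can import packages from `directory` only if it appears on
        # sys.path (e.g. via the PYTHONPATH environment variable) or is a
        # site directory whose .pth files are processed at startup.
        target = os.path.realpath(directory)
        return any(os.path.realpath(p) == target for p in sys.path if p)

    if __name__ == "__main__":
        print("PYTHONPATH =", os.environ.get("PYTHONPATH", ""))
        print(INSTALL_DIR, "on sys.path:", is_on_sys_path(INSTALL_DIR))
        # The fix the message suggests, expressed as a shell export to run
        # before re-invoking the build (illustrative):
        #   export PYTHONPATH=/home/chenhuan09/FD/FastDeploy/custom_ops/tmp/:$PYTHONPATH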
+[2025-09-18 08:43:05,215] [ INFO] dist.py:970 - running bdist_egg
+[2025-09-18 08:43:05,231] [ INFO] dist.py:970 - running egg_info
+[2025-09-18 08:43:05,231] [ INFO] dir_util.py:58 - creating fastdeploy_ops.egg-info
+[2025-09-18 08:43:05,237] [ INFO] egg_info.py:648 - writing fastdeploy_ops.egg-info/PKG-INFO
+[2025-09-18 08:43:05,237] [ INFO] egg_info.py:282 - writing dependency_links to fastdeploy_ops.egg-info/dependency_links.txt
+[2025-09-18 08:43:05,237] [ INFO] egg_info.py:282 - writing top-level names to fastdeploy_ops.egg-info/top_level.txt
+[2025-09-18 08:43:05,237] [ INFO] util.py:324 - writing manifest file 'fastdeploy_ops.egg-info/SOURCES.txt'
+[2025-09-18 08:43:05,362] [ INFO] util.py:324 - writing manifest file 'fastdeploy_ops.egg-info/SOURCES.txt'
+[2025-09-18 08:43:05,362] [ INFO] bdist_egg.py:162 - installing library code to build/fastdeploy_ops/bdist.linux-x86_64/egg
+[2025-09-18 08:43:05,362] [ INFO] dist.py:970 - running install_lib
+[2025-09-18 08:43:05,362] [ INFO] dist.py:970 - running build_ext
+[2025-09-18 08:43:05,383] [ INFO] build_ext.py:530 - building 'fastdeploy_ops' extension
+[2025-09-18 08:43:05,384] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/adjust_batch.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/adjust_batch.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17
+[2025-09-18 08:43:05,385] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/block_attn.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/block_attn.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17
+[2025-09-18 08:43:05,385] [ INFO] spawn.py:60 - 
x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/device/get_context_gm_max_mem_demand.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/get_context_gm_max_mem_demand.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,386] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/device/get_free_global_memory.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/get_free_global_memory.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,386] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/device/get_total_global_memory.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/get_total_global_memory.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL 
-DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,387] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/device/get_used_global_memory.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/get_used_global_memory.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,388] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/fused_rms_norm.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/fused_rms_norm.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,388] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/gather_next_token.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/gather_next_token.o -D_GLIBCXX_USE_CXX11_ABI=1 
-DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,389] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/get_infer_param.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/get_infer_param.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,390] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/get_output.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/get_output.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,391] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/get_padding_offset.cc -o 
/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/get_padding_offset.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,391] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/get_token_penalty_multi_scores.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/get_token_penalty_multi_scores.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,392] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/moe_ep_combine.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/moe_ep_combine.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,392] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c 
/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/moe_ep_dispatch.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/moe_ep_dispatch.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,393] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/moe_expert_ffn.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/moe_expert_ffn.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,394] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/moe_layer.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/moe_layer.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,395] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 
-I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/moe_redundant_topk_select.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/moe_redundant_topk_select.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,396] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/moe_topk_select.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/moe_topk_select.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,396] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/draft_model_postprocess.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/draft_model_postprocess.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,397] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include 
-I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/draft_model_preprocess.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/draft_model_preprocess.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,397] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/draft_model_preprocess_v2.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/draft_model_preprocess_v2.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,397] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/draft_model_update.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/draft_model_update.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,398] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include 
-I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/eagle_get_hidden_states.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/eagle_get_hidden_states.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,399] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/eagle_get_self_hidden_states.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/eagle_get_self_hidden_states.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,400] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/mtp_save_first_token.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/mtp_save_first_token.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,400] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu 
-I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/mtp_step_paddle.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/mtp_step_paddle.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,401] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_clear_accept_nums.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_clear_accept_nums.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,402] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_get_output.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_get_output.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,402] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ 
-I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_get_output_padding_offset.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_get_output_padding_offset.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,403] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_get_padding_offset.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_get_padding_offset.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,403] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_get_seq_lens_output.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_get_seq_lens_output.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,404] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 
-Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_rebuild_append_padding.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_rebuild_append_padding.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,405] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_save_output.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_save_output.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,405] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_set_stop_value_multi_seqs.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_set_stop_value_multi_seqs.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 
-std=c++17 +[2025-09-18 08:43:05,406] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_set_value_by_flags.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_set_value_by_flags.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,406] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_step_reschedule.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_step_reschedule.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17 +[2025-09-18 08:43:05,407] [ INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_token_penalty_multi_scores.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_token_penalty_multi_scores.o 
-D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17
+[2025-09-18 08:43:05,408] [    INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./ -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/xpu -I/usr/local/lib/python3.10/dist-packages/paddle/include/xre -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/include -I/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/include -I/usr/local/lib/python3.10/dist-packages/paddle/include -I/usr/local/lib/python3.10/dist-packages/paddle/include/third_party -I/usr/include/python3.10 -I/usr/include/python3.10 -c /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/ops/mtp/speculate_update_input_ids_cpu.cc -o /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/speculate_update_input_ids_cpu.o -D_GLIBCXX_USE_CXX11_ABI=1 -DPADDLE_WITH_XPU -DBUILD_MULTI_XPU -w -DPADDLE_WITH_CUSTOM_KERNEL -DPADDLE_EXTENSION_NAME=fastdeploy_ops -D_GLIBCXX_USE_CXX11_ABI=1 -std=c++17
+[2025-09-18 08:43:05,408 - 08:43:05,458] [    INFO] spawn.py:60 - [26 further identical g++ invocations, same flags and -I paths, differing only in the source/object file, condensed here: speculate_update_v3.cc, speculate_verify.cc, top_p_candidates.cc, open_shm_and_get_meta_signal.cc, pybind/alloc_cache_pinned.cc, pybind/cachekv_signal_thread_worker.cc, pybind/get_peermem_addr.cc, pybind/profiler.cc, pybind/pybind.cc, read_data_ipc.cc, recover_decode_task.cc, remote_cache_kv_ipc.cc, save_with_output_msg.cc, set_data_ipc.cc, set_value_by_flags_and_idx.cc, share_external_data.cc, step.cc, stop_generation_multi_ends.cc, swap_cache_batch.cc, update_inputs.cc, update_inputs_v1.cc, utility/debug.cc, utility/env.cc, utility/logging.cc, weight_only_linear.cc, weight_quantize_xpu.cc]
+[2025-09-18 08:43:18,268] [    INFO] spawn.py:60 - x86_64-linux-gnu-g++ -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -shared -Wl,-O1 -Wl,-Bsymbolic-functions [all ~60 object files under build/fastdeploy_ops/lib.linux-x86_64-cpython-310/, including speculate_verify.o, condensed here] /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/src/./plugin/build/libxpuplugin.a /usr/local/lib/python3.10/dist-packages/paddle/libs/libbkcl.so /usr/local/lib/python3.10/dist-packages/paddle/libs/libxpucuda.so /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/so/libapiinfer.so /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/so/libxft_blocks.so -L/usr/local/lib/python3.10/dist-packages/paddle/libs -L/usr/local/lib/python3.10/dist-packages/paddle/base -L/usr/lib/x86_64-linux-gnu -Wl,--enable-new-dtags,-rpath,/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/infer_ops/so -Wl,--enable-new-dtags,-rpath,/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/third_party/xvllm/xft_blocks/so -Wl,--enable-new-dtags,-rpath,/usr/local/lib/python3.10/dist-packages/paddle/libs -Wl,--enable-new-dtags,-rpath,/usr/local/lib/python3.10/dist-packages/paddle/base -o build/fastdeploy_ops/lib.linux-x86_64-cpython-310/fastdeploy_ops.so -l:libpaddle.so
+[2025-09-18 08:43:20,462] [    INFO] dir_util.py:58 - creating build/fastdeploy_ops/bdist.linux-x86_64/egg
+[2025-09-18 08:43:20,463] [    INFO] file_util.py:130 - copying build/fastdeploy_ops/lib.linux-x86_64-cpython-310/version.txt -> build/fastdeploy_ops/bdist.linux-x86_64/egg
+[2025-09-18 08:43:20,463] [    INFO] file_util.py:130 - copying build/fastdeploy_ops/lib.linux-x86_64-cpython-310/fastdeploy_ops.so -> build/fastdeploy_ops/bdist.linux-x86_64/egg
+[2025-09-18 08:43:20,558] [    INFO] bdist_egg.py:178 - creating stub loader for fastdeploy_ops.so
+Traceback (most recent call last):
+  File "/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/setup_ops.py", line 182, in <module>
+    xpu_setup_ops()
+  File "/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/setup_ops.py", line 163, in xpu_setup_ops
+    setup(
+  File "/usr/local/lib/python3.10/dist-packages/paddle/utils/cpp_extension/cpp_extension.py", line 248, in setup
+    setuptools.setup(**attr)
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/__init__.py", line 117, in setup
+    return distutils.core.setup(**attrs)
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/_distutils/core.py", line 183, in setup
+    return run_commands(dist)
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/_distutils/core.py", line 199, in run_commands
+    dist.run_commands()
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/_distutils/dist.py", line 954, in run_commands
+    self.run_command(cmd)
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/dist.py", line 950, in run_command
+    super().run_command(command)
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/_distutils/dist.py", line 973, in run_command
+    cmd_obj.run()
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/command/install.py", line 97, in run
+    self.do_egg_install()
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/command/install.py", line 149, in do_egg_install
+    self.run_command('bdist_egg')
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/_distutils/cmd.py", line 316, in run_command
+    self.distribution.run_command(command)
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/dist.py", line 950, in run_command
+    super().run_command(command)
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/_distutils/dist.py", line 973, in run_command
+    cmd_obj.run()
+  File "/usr/local/lib/python3.10/dist-packages/setuptools/command/bdist_egg.py", line 180, in run
+    write_stub(os.path.basename(ext_name), pyfile)
+  File "/usr/local/lib/python3.10/dist-packages/paddle/utils/cpp_extension/extension_utils.py", line 229, in custom_write_stub
+    new_custom_ops = load_op_meta_info_and_register_op(so_path)
+  File "/usr/local/lib/python3.10/dist-packages/paddle/utils/cpp_extension/extension_utils.py", line 167, in load_op_meta_info_and_register_op
+    core.load_op_meta_info_and_register_op(lib_filename)
+RuntimeError: (PreconditionNotMet) The third-party dynamic library (/home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/fastdeploy_ops.so) that Paddle depends on is not configured correctly. (error code is /home/chenhuan09/FD/FastDeploy/custom_ops/xpu_ops/build/fastdeploy_ops/lib.linux-x86_64-cpython-310/fastdeploy_ops.so: undefined symbol: _Z15SpeculateVerifyRKN6paddle6TensorES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_iib)
+  Suggestions:
+  1. Check if the third-party dynamic library (e.g. CUDA, CUDNN) is installed correctly and its version is matched with paddlepaddle you installed.
+  2. Configure third-party dynamic library environment variables as follows:
+  - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`
+  - Windows: set PATH by `set PATH=XXX;%PATH%`
+  - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` [Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is impossible unless System Integrity Protection (SIP) is disabled.]
+  (at /home/chenhuan09/Paddle/paddle/phi/backends/dynload/dynamic_loader.cc:351)
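Note: the undefined symbol in the error above is just the mangled C++ name of SpeculateVerify. A few lines of standalone C++ (using the GCC/Clang ABI helper from <cxxabi.h>; the symbol string is copied verbatim from the log) recover the signature the dynamic loader was looking for:

#include <cstdio>
#include <cstdlib>
#include <cxxabi.h>

int main() {
    // Mangled name copied from the RuntimeError above.
    const char *sym =
        "_Z15SpeculateVerifyRKN6paddle6TensorES2_S2_S2_S2_S2_S2_S2_S2_"
        "S2_S2_S2_S2_S2_S2_S2_S2_iib";
    int status = 0;
    // abi::__cxa_demangle returns a malloc()'d human-readable string.
    char *readable = abi::__cxa_demangle(sym, nullptr, nullptr, &status);
    if (status == 0) {
        std::printf("%s\n", readable);
        std::free(readable);
    } else {
        std::printf("demangle failed: %d\n", status);
    }
    return 0;
}

It prints a SpeculateVerify overload taking seventeen const paddle::Tensor& parameters followed by int, int, bool. The freshly linked fastdeploy_ops.so still references that declaration, but none of the objects linked into it (including speculate_verify.o) defines it, which is why load_op_meta_info_and_register_op fails. Most likely the declaration used at the registration/call site and the definition in speculate_verify.cc have drifted apart in their parameter lists.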
+
diff --git a/ch_build.sh b/ch_build.sh
new file mode 100644
index 0000000000..760ced3175
--- /dev/null
+++ b/ch_build.sh
@@ -0,0 +1,6 @@
+bash custom_ops/xpu_ops/download_dependencies.sh develop
+
+export CLANG_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xtdk
+export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xvllm
+
+bash build.sh
\ No newline at end of file
diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc b/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc
index 6d34b9736b..b1c42e04f4 100644
--- a/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc
+++ b/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc
@@ -23,6 +23,9 @@
 #define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
 #endif
 
+#define GET_OUTPUT_DEBUG
+#define SAVE_WITH_OUTPUT_DEBUG
+
 #include "speculate_msg.h"
 
 void SpeculateSaveWithOutputMsg(const paddle::Tensor& accept_tokens,
diff --git a/custom_ops/xpu_ops/setup_ops.py b/custom_ops/xpu_ops/setup_ops.py
index fa31f5a8e3..ed917d6d6f 100755
--- a/custom_ops/xpu_ops/setup_ops.py
+++ b/custom_ops/xpu_ops/setup_ops.py
@@ -152,6 +152,8 @@ def xpu_setup_ops():
             if file.endswith(".cc"):
                 ops.append(os.path.join(root, file))
 
+    print(ops)
+
     include_dirs = [
         os.path.join(base_dir, "./"),
         os.path.join(base_dir, "./plugin/include"),
diff --git a/custom_ops/xpu_ops/src/ops/block_attn.cc b/custom_ops/xpu_ops/src/ops/block_attn.cc
index 72ae247491..abdac19475 100644
--- a/custom_ops/xpu_ops/src/ops/block_attn.cc
+++ b/custom_ops/xpu_ops/src/ops/block_attn.cc
@@ -639,7 +639,7 @@ std::vector<paddle::Tensor> BlockAttnKernel(
                 : quant_v_scale_inv,
             nullptr,          // o_maxptr
             param.head_dim);  // vo_head_dim
-        PD_CHECK(0, "speculative_attention unimplemented");
+        // PD_CHECK(0, "speculative_attention unimplemented");
         PD_CHECK(ret == api::SUCCESS,
                  "xfa::speculative_attention_decoder failed.");
         if (!Eq_len) {
diff --git a/custom_ops/xpu_ops/src/ops/gather_next_token.cc b/custom_ops/xpu_ops/src/ops/gather_next_token.cc
index 8d9aedcee8..2fc9da16bb 100644
--- a/custom_ops/xpu_ops/src/ops/gather_next_token.cc
+++ b/custom_ops/xpu_ops/src/ops/gather_next_token.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,97 +12,135 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <numeric>
+#include <vector>
 #include "paddle/extension.h"
 #include "xpu/plugin.h"
-#include
 
-std::vector<paddle::Tensor>
-GatherNextToken(const paddle::Tensor &tmp_out,     // [token_num, dim_embed]
-                const paddle::Tensor &cum_offsets, // [bsz, 1]
-                const paddle::Tensor &encoder_seq_lod,
-                const paddle::Tensor &encoder_batch_map,
-                const paddle::Tensor &decoder_batch_map,
-                const paddle::Tensor &encoder_seq_lod_cpu,
-                const paddle::Tensor &encoder_batch_map_cpu,
-                const paddle::Tensor &decoder_batch_map_cpu,
-                const paddle::Tensor &enc_batch_tensor,
-                const paddle::Tensor &dec_batch_tensor,
-                const paddle::optional<paddle::Tensor> &output_padding_offset,
-                int max_input_length) {
-    phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
-    auto dev_ctx =
-        paddle::experimental::DeviceContextPool::Instance().Get(place);
-    auto xpu_ctx = static_cast<const phi::XPUContext *>(dev_ctx);
-    using XPUType =
-        typename XPUTypeTrait<paddle::bfloat16>::Type; // only support bfloat16
-    typedef paddle::bfloat16 data_t;
-    const int dim = tmp_out.dims()[1];
-    const int bsz = cum_offsets.shape()[0];
-    int enc_batch = enc_batch_tensor.data<int32_t>()[0];
-    int dec_batch = dec_batch_tensor.data<int32_t>()[0];
-    baidu::xpu::api::VectorParam<int32_t> encoder_seqs_lods_vp{
-        const_cast<int32_t *>(encoder_seq_lod_cpu.data<int32_t>()),
-        enc_batch + 1, const_cast<int32_t *>(encoder_seq_lod.data<int32_t>())};
-    baidu::xpu::api::VectorParam<int32_t> encoder_batch_map_vp{
-        const_cast<int32_t *>(encoder_batch_map_cpu.data<int32_t>()), enc_batch,
-        const_cast<int32_t *>(encoder_batch_map.data<int32_t>())};
-    baidu::xpu::api::VectorParam<int32_t> decoder_batch_map_vp{
-        const_cast<int32_t *>(decoder_batch_map_cpu.data<int32_t>()), dec_batch,
-        const_cast<int32_t *>(decoder_batch_map.data<int32_t>())};
+std::vector<paddle::Tensor> GatherNextToken(
+    const paddle::Tensor& x,            // [token_num, dim_embed]
+    const paddle::Tensor& cum_offsets,  // [bsz, 1]
+    const paddle::Tensor& encoder_seq_lod,
+    const paddle::Tensor& encoder_batch_map,
+    const paddle::Tensor& decoder_batch_map,
+    const paddle::Tensor& encoder_seq_lod_cpu,
+    const paddle::Tensor& encoder_batch_map_cpu,
+    const paddle::Tensor& decoder_batch_map_cpu,
+    const paddle::Tensor& len_info_cpu,
+    const paddle::optional<paddle::Tensor>& output_padding_offset) {
+  phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
+  auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
+  auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
+  using XPUType =
+      typename XPUTypeTrait<paddle::bfloat16>::Type;  // only support bfloat16
+  typedef paddle::bfloat16 data_t;
+  const int dim = x.dims()[1];
+  const int token_num = x.shape()[0];
+  const int bsz = cum_offsets.shape()[0];
+  int enc_batch = len_info_cpu.data<int32_t>()[0];
+  int dec_batch = len_info_cpu.data<int32_t>()[1];
 
-    auto out = paddle::full({bsz, dim}, -2, tmp_out.type(), tmp_out.place());
+  baidu::xpu::api::VectorParam<int32_t> encoder_seqs_lods_vp{
+      const_cast<int32_t*>(encoder_seq_lod_cpu.data<int32_t>()),
+      enc_batch + 1,
+      const_cast<int32_t*>(encoder_seq_lod.data<int32_t>())};
+  baidu::xpu::api::VectorParam<int32_t> encoder_batch_map_vp{
+      const_cast<int32_t*>(encoder_batch_map_cpu.data<int32_t>()),
+      enc_batch,
+      const_cast<int32_t*>(encoder_batch_map.data<int32_t>())};
+  baidu::xpu::api::VectorParam<int32_t> decoder_batch_map_vp{
+      const_cast<int32_t*>(decoder_batch_map_cpu.data<int32_t>()),
+      dec_batch,
+      const_cast<int32_t*>(decoder_batch_map.data<int32_t>())};
+  paddle::Tensor out;
+  std::vector<int32_t> encode_iota_lod_cpu(enc_batch);
+  if (output_padding_offset) {
+    int need_delete_token_num = 0;
+    if (enc_batch > 0) {
+      need_delete_token_num =
+          encoder_seq_lod_cpu.data<int32_t>()[enc_batch] - enc_batch;
+      std::iota(encode_iota_lod_cpu.begin(), encode_iota_lod_cpu.end(), 0);
+      encoder_batch_map_vp.cpu =
+          const_cast<int32_t*>(encode_iota_lod_cpu.data());
+      encoder_batch_map_vp.len = enc_batch;
+      encoder_batch_map_vp.xpu = nullptr;
+    }
+    out = paddle::empty(
+        {token_num - need_delete_token_num, dim}, x.type(), x.place());
+  } else {
+    out = paddle::empty({bsz, dim}, x.type(), x.place());
+  }
+  if (x.shape()[0] == 0) {
+    return {out};
+  }
+
+  if (output_padding_offset && enc_batch <= 0) {
+    out = x.copy_to(x.place(), false);
+  } else {
     int r = baidu::xpu::api::plugin::eb_gather_next_token(
         xpu_ctx->x_context(),
-        reinterpret_cast<const XPUType *>(tmp_out.data<data_t>()),
-        reinterpret_cast<XPUType *>(out.data<data_t>()), encoder_seqs_lods_vp,
-        encoder_batch_map_vp, decoder_batch_map_vp, dim);
-    return {out};
+        reinterpret_cast<const XPUType*>(x.data<data_t>()),
+        reinterpret_cast<XPUType*>(out.data<data_t>()),
+        encoder_seqs_lods_vp,
+        encoder_batch_map_vp,
+        decoder_batch_map_vp,
+        dim);
+    PD_CHECK(r == 0, "xpu::plugin::gather_next_token failed.");
+  }
+  return {out};
 }
 
 std::vector<std::vector<int64_t>> GatherNextTokenInferShape(
-    const std::vector<int64_t> &tmp_out_shape,
-    const std::vector<int64_t> &cum_offsets_shape,
-    const std::vector<int64_t> &encoder_seq_lod_shape,
-    const std::vector<int64_t> &encoder_batch_map_shape,
-    const std::vector<int64_t> &decoder_batch_map_shape,
-    const std::vector<int64_t> &encoder_seq_lod_cpu_shape,
-    const std::vector<int64_t> &encoder_batch_map_cpu_shape,
-    const std::vector<int64_t> &decoder_batch_map_cpu_shape,
-    const std::vector<int64_t> &enc_batch_tensor_shape,
-    const std::vector<int64_t> &dec_batch_tensor_shape,
-    const paddle::optional<std::vector<int64_t>> &output_padding_offset_shape) {
-    if (output_padding_offset_shape) {
-        PD_THROW("speculative decoding is not supported in XPU.");
-    }
+    const std::vector<int64_t>& x_shape,
+    const std::vector<int64_t>& cum_offsets_shape,
+    const std::vector<int64_t>& encoder_seq_lod_shape,
+    const std::vector<int64_t>& encoder_batch_map_shape,
+    const std::vector<int64_t>& decoder_batch_map_shape,
+    const std::vector<int64_t>& encoder_seq_lod_cpu_shape,
+    const std::vector<int64_t>& encoder_batch_map_cpu_shape,
+    const std::vector<int64_t>& decoder_batch_map_cpu_shape,
+    const std::vector<int64_t>& len_info_cpu_shape,
+    const paddle::optional<std::vector<int64_t>>& output_padding_offset_shape) {
+  // if (output_padding_offset_shape) {
+  //   PD_THROW("speculative decoding is not supported in XPU.");
+  // }
+  int64_t bsz = cum_offsets_shape[0];
+  int64_t dim_embed = x_shape[1];
+  if (output_padding_offset_shape) {
+    return {{-1, dim_embed}};
+  } else {
     int64_t bsz = cum_offsets_shape[0];
-    int64_t dim_embed = tmp_out_shape[1];
     return {{bsz, dim_embed}};
+  }
 }
 
 std::vector<paddle::DataType> GatherNextTokenInferDtype(
-    const paddle::DataType &tmp_out_dtype,
-    const paddle::DataType &cum_offsets_dtype,
-    const paddle::DataType &encoder_seq_lod_dtype,
-    const paddle::DataType &encoder_batch_map_dtype,
-    const paddle::DataType &decoder_batch_map_dtype,
-    const paddle::DataType &encoder_seq_lod_cpu_dtype,
-    const paddle::DataType &encoder_batch_map_cpu_dtype,
-    const paddle::DataType &decoder_batch_map_cpu_dtype,
-    const paddle::DataType &enc_batch_tensor_dtype,
-    const paddle::DataType &dec_batch_tensor_dtype,
-    const paddle::optional<paddle::DataType> &output_padding_offset_dtype) {
-    return {tmp_out_dtype};
+    const paddle::DataType& x_dtype,
+    const paddle::DataType& cum_offsets_dtype,
+    const paddle::DataType& encoder_seq_lod_dtype,
+    const paddle::DataType& encoder_batch_map_dtype,
+    const paddle::DataType& decoder_batch_map_dtype,
+    const paddle::DataType& encoder_seq_lod_cpu_dtype,
+    const paddle::DataType& encoder_batch_map_cpu_dtype,
+    const paddle::DataType& decoder_batch_map_cpu_dtype,
+    const paddle::DataType& len_info_cpu_dtype,
+    const paddle::optional<paddle::DataType>& output_padding_offset_dtype) {
+  return {x_dtype};
 }
 
 PD_BUILD_OP(gather_next_token)
-    .Inputs({"tmp_out", "cum_offsets", "encoder_seq_lod", "encoder_batch_map",
-             "decoder_batch_map", "encoder_seq_lod_cpu",
-             "encoder_batch_map_cpu", "decoder_batch_map_cpu",
-             "enc_batch_tensor", "dec_batch_tensor",
+    .Inputs({"x",
+             "cum_offsets",
+             "encoder_seq_lod",
+             "encoder_batch_map",
+             "decoder_batch_map",
+             "encoder_seq_lod_cpu",
+             "encoder_batch_map_cpu",
+             "decoder_batch_map_cpu",
+             "len_info_cpu",
              paddle::Optional("output_padding_offset")})
     .Outputs({"out"})
-    .Attrs({"max_input_length: int"})
     .SetKernelFn(PD_KERNEL(GatherNextToken))
     .SetInferShapeFn(PD_INFER_SHAPE(GatherNextTokenInferShape))
-    .SetInferDtypeFn(PD_INFER_DTYPE(GatherNextTokenInferDtype));
+    .SetInferDtypeFn(PD_INFER_DTYPE(GatherNextTokenInferDtype));
\ No newline at end of file
+
+#include "paddle/extension.h"
+#include "xpu/plugin.h"
+#include <vector>
+
+std::vector<paddle::Tensor>
+GetInferParam(const paddle::Tensor &seq_lens_encoder,
+              const paddle::Tensor &seq_lens_decoder) {
+    phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
+    auto dev_ctx =
+        paddle::experimental::DeviceContextPool::Instance().Get(place);
+    auto xpu_ctx = static_cast<const phi::XPUContext *>(dev_ctx);
+    const int bsz = seq_lens_encoder.dims()[0];
+    // partition the batch into encoder (prefill) and decoder slots
+    std::vector<int32_t> seq_lens_encoder_vec(bsz, 0);  // input
+    std::vector<int32_t> seq_lens_decoder_vec(bsz, 0);  // input
+
+    std::vector<int32_t> encoder_batch_map_vec(bsz, 0);
+    std::vector<int32_t> decoder_batch_map_vec(bsz, 0);
+    std::vector<int32_t> encoder_batch_idx_vec(bsz, 0);  // batch map with empty slots squeezed out
+    std::vector<int32_t> decoder_batch_idx_vec(bsz, 0);  // batch map with empty slots squeezed out
+    std::vector<int32_t> encoder_seq_lod_vec(bsz + 1, 0);
+    std::vector<int32_t> decoder_context_len_vec(bsz, 0);
+    std::vector<int32_t> decoder_context_len_cache_vec(bsz, 0);
+    xpu_wait(xpu_ctx->x_context()->xpu_stream);  // TODO: confirm whether this wait is required
+    int r = xpu_memcpy(seq_lens_encoder_vec.data(),
+                       seq_lens_encoder.data<int32_t>(), sizeof(int32_t) * bsz,
+                       XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+    r = xpu_memcpy(seq_lens_decoder_vec.data(),
+                   seq_lens_decoder.data<int32_t>(), sizeof(int32_t) * bsz,
+                   XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+
+    int enc_batch = 0, dec_batch = 0;
+    int total_enc_len = 0;
+    int batch_offset = 0;
+    for (int i = 0; i < bsz; ++i) {
+        if (seq_lens_encoder_vec[i] > 0) {
+            enc_batch++;
+            total_enc_len += seq_lens_encoder_vec[i];
+            encoder_batch_map_vec[enc_batch - 1] = i;
+            encoder_batch_idx_vec[enc_batch - 1] = i - batch_offset;
+            encoder_seq_lod_vec[enc_batch] =
+                seq_lens_encoder_vec[i] + encoder_seq_lod_vec[enc_batch - 1];
+        } else if (seq_lens_decoder_vec[i] > 0) {
+            dec_batch++;
+            decoder_batch_map_vec[dec_batch - 1] = i;
+            decoder_batch_idx_vec[dec_batch - 1] = i - batch_offset;
+            decoder_context_len_vec[dec_batch - 1] =
+                seq_lens_decoder_vec[i] + 1;
+            decoder_context_len_cache_vec[dec_batch - 1] =
+                seq_lens_decoder_vec[i];
+        } else {
+            batch_offset++;
+        }
+    }
+
+    auto encoder_batch_map_xpu =
+        paddle::full({encoder_batch_map_vec.size()}, 0, seq_lens_encoder.type(),
+                     seq_lens_encoder.place());
+    auto decoder_batch_map_xpu =
+        paddle::full({decoder_batch_map_vec.size()}, 0, seq_lens_encoder.type(),
+                     seq_lens_encoder.place());
+    auto encoder_batch_idx_xpu =
+        paddle::full({encoder_batch_idx_vec.size()}, 0, seq_lens_encoder.type(),
+                     seq_lens_encoder.place());
+    auto decoder_batch_idx_xpu =
+        paddle::full({decoder_batch_idx_vec.size()}, 0, seq_lens_encoder.type(),
+                     seq_lens_encoder.place());
+    auto encoder_seq_lod_xpu =
+        paddle::full({encoder_seq_lod_vec.size()}, 0, seq_lens_encoder.type(),
+                     seq_lens_encoder.place());
+    auto decoder_context_len_xpu =
+        paddle::full({decoder_context_len_vec.size()}, 0,
+                     seq_lens_encoder.type(), seq_lens_encoder.place());
+    auto decoder_context_len_cache_xpu =
+        paddle::full({decoder_context_len_cache_vec.size()}, 0,
+                     seq_lens_encoder.type(), seq_lens_encoder.place());
+
+    auto encoder_batch_map_cpu =
+        paddle::full({encoder_batch_map_vec.size()}, 0, seq_lens_encoder.type(),
+                     paddle::CPUPlace());
+    auto decoder_batch_map_cpu =
+        paddle::full({decoder_batch_map_vec.size()}, 0, seq_lens_encoder.type(),
+                     paddle::CPUPlace());
+    auto encoder_batch_idx_cpu =
+        paddle::full({encoder_batch_idx_vec.size()}, 0, seq_lens_encoder.type(),
+                     paddle::CPUPlace());
+    auto decoder_batch_idx_cpu =
+        paddle::full({decoder_batch_idx_vec.size()}, 0, seq_lens_encoder.type(),
+                     paddle::CPUPlace());
+    auto encoder_seq_lod_cpu =
+        paddle::full({encoder_seq_lod_vec.size()}, 0, seq_lens_encoder.type(),
+                     paddle::CPUPlace());
+    auto decoder_context_len_cpu =
+        paddle::full({decoder_context_len_vec.size()}, 0,
+                     seq_lens_encoder.type(), paddle::CPUPlace());
+    auto decoder_context_len_cache_cpu =
+        paddle::full({decoder_context_len_cache_vec.size()}, 0,
+                     seq_lens_encoder.type(), paddle::CPUPlace());
+
+    int ret = 0;
+    ret = xpu_memcpy(reinterpret_cast<void *>(const_cast<int32_t *>(
+                         encoder_batch_map_xpu.data<int32_t>())),
+                     encoder_batch_map_vec.data(),
+                     sizeof(int32_t) * encoder_batch_map_vec.size(),
+                     XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+    ret = xpu_memcpy(reinterpret_cast<void *>(const_cast<int32_t *>(
+                         decoder_batch_map_xpu.data<int32_t>())),
+                     decoder_batch_map_vec.data(),
+                     sizeof(int32_t) * decoder_batch_map_vec.size(),
+                     XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+    ret = xpu_memcpy(reinterpret_cast<void *>(const_cast<int32_t *>(
+                         encoder_batch_idx_xpu.data<int32_t>())),
+                     encoder_batch_idx_vec.data(),
+                     sizeof(int32_t) * encoder_batch_idx_vec.size(),
+                     XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+    ret = xpu_memcpy(reinterpret_cast<void *>(const_cast<int32_t *>(
+                         decoder_batch_idx_xpu.data<int32_t>())),
+                     decoder_batch_idx_vec.data(),
+                     sizeof(int32_t) * decoder_batch_idx_vec.size(),
+                     XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+    ret = xpu_memcpy(reinterpret_cast<void *>(const_cast<int32_t *>(
+                         encoder_seq_lod_xpu.data<int32_t>())),
+                     encoder_seq_lod_vec.data(),
+                     sizeof(int32_t) * encoder_seq_lod_vec.size(),
+                     XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+    ret = xpu_memcpy(reinterpret_cast<void *>(const_cast<int32_t *>(
+                         decoder_context_len_xpu.data<int32_t>())),
+                     decoder_context_len_vec.data(),
+                     sizeof(int32_t) * decoder_context_len_vec.size(),
+                     XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+    ret = xpu_memcpy(reinterpret_cast<void *>(const_cast<int32_t *>(
+                         decoder_context_len_cache_xpu.data<int32_t>())),
+                     decoder_context_len_cache_vec.data(),
+                     sizeof(int32_t) * decoder_context_len_cache_vec.size(),
+                     XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+
+    std::memcpy(encoder_batch_map_cpu.data<int32_t>(),
+                encoder_batch_map_vec.data(),
+                sizeof(int32_t) * encoder_batch_map_vec.size());
+    std::memcpy(decoder_batch_map_cpu.data<int32_t>(),
+                decoder_batch_map_vec.data(),
+                sizeof(int32_t) * decoder_batch_map_vec.size());
+    std::memcpy(encoder_batch_idx_cpu.data<int32_t>(),
+                encoder_batch_idx_vec.data(),
+                sizeof(int32_t) * encoder_batch_idx_vec.size());
+    std::memcpy(decoder_batch_idx_cpu.data<int32_t>(),
+                decoder_batch_idx_vec.data(),
+                sizeof(int32_t) * decoder_batch_idx_vec.size());
+    std::memcpy(encoder_seq_lod_cpu.data<int32_t>(), encoder_seq_lod_vec.data(),
+                sizeof(int32_t) * encoder_seq_lod_vec.size());
+    std::memcpy(decoder_context_len_cpu.data<int32_t>(),
+                decoder_context_len_vec.data(),
+                sizeof(int32_t) * decoder_context_len_vec.size());
+    std::memcpy(decoder_context_len_cache_cpu.data<int32_t>(),
+                decoder_context_len_cache_vec.data(),
+                sizeof(int32_t) * decoder_context_len_cache_vec.size());
+
+    auto enc_batch_tensor = paddle::full(
+        {1}, enc_batch, seq_lens_encoder.type(), paddle::CPUPlace());
+    auto dec_batch_tensor = paddle::full(
+        {1}, dec_batch, seq_lens_encoder.type(), paddle::CPUPlace());
+    auto total_enc_len_tensor = paddle::full(
+        {1}, total_enc_len, seq_lens_encoder.type(), paddle::CPUPlace());
+
+    return {encoder_batch_map_xpu,
+            decoder_batch_map_xpu,
+            encoder_batch_idx_xpu,
+            decoder_batch_idx_xpu,
+            encoder_seq_lod_xpu,
+            decoder_context_len_xpu,
+            decoder_context_len_cache_xpu,
+            encoder_batch_map_cpu,
+            decoder_batch_map_cpu,
+            encoder_batch_idx_cpu,
+            decoder_batch_idx_cpu,
+            encoder_seq_lod_cpu,
+            decoder_context_len_cpu,
+            decoder_context_len_cache_cpu,
+            enc_batch_tensor,
+            dec_batch_tensor,
+            total_enc_len_tensor};
+}
+
+std::vector<std::vector<int64_t>>
+GetInferParamInferShape(const std::vector<int64_t> &seq_lens_encoder_shape,
+                        const std::vector<int64_t> &seq_lens_decoder_shape) {
+    return {seq_lens_encoder_shape,
+            seq_lens_encoder_shape,
+            seq_lens_encoder_shape,
+            seq_lens_encoder_shape,
+            {seq_lens_encoder_shape[0] + 1},
+            seq_lens_encoder_shape,
+            seq_lens_encoder_shape,
+            seq_lens_encoder_shape,
+            seq_lens_encoder_shape,
+            seq_lens_encoder_shape,
+            seq_lens_encoder_shape,
+            {seq_lens_encoder_shape[0] + 1},  // encoder_seq_lod_cpu also has bsz + 1 entries
+            seq_lens_encoder_shape,
+            seq_lens_encoder_shape,
+            {1},
+            {1},
+            {1}};
+}
+
+std::vector<paddle::DataType>
+GetInferParamInferDtype(const paddle::DataType &seq_lens_encoder_dtype,
+                        const paddle::DataType &seq_lens_decoder_dtype) {
+    return {
+        seq_lens_encoder_dtype, seq_lens_encoder_dtype, seq_lens_encoder_dtype,
+        seq_lens_encoder_dtype, seq_lens_encoder_dtype, seq_lens_encoder_dtype,
+        seq_lens_encoder_dtype, seq_lens_encoder_dtype, seq_lens_encoder_dtype,
+        seq_lens_encoder_dtype, seq_lens_encoder_dtype, seq_lens_encoder_dtype,
+        seq_lens_encoder_dtype, seq_lens_encoder_dtype, seq_lens_encoder_dtype,
+        seq_lens_encoder_dtype, seq_lens_encoder_dtype};
+}
+
+PD_BUILD_OP(get_infer_param_old)
+    .Inputs({"seq_lens_encoder", "seq_lens_decoder"})
+    .Outputs({"encoder_batch_map_xpu", "decoder_batch_map_xpu",
+              "encoder_batch_idx_xpu", "decoder_batch_idx_xpu",
+              "encoder_seq_lod_xpu", "decoder_context_len_xpu",
+              "decoder_context_len_cache_xpu", "encoder_batch_map_cpu",
+              "decoder_batch_map_cpu", "encoder_batch_idx_cpu",
+              "decoder_batch_idx_cpu", "encoder_seq_lod_cpu",
+              "decoder_context_len_cpu", "decoder_context_len_cache_cpu",
+              "enc_batch_tensor", "dec_batch_tensor", "total_enc_len_tensor"})
+    .SetKernelFn(PD_KERNEL(GetInferParam))
+    .SetInferShapeFn(PD_INFER_SHAPE(GetInferParamInferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(GetInferParamInferDtype));
\ No newline at end of file
diff --git a/custom_ops/xpu_ops/src/ops/mtp/draft_model_preprocess_v2.cc b/custom_ops/xpu_ops/src/ops/mtp/draft_model_preprocess_v2.cc
new file mode 100644
index 0000000000..d97e28f68f
--- /dev/null
+++ b/custom_ops/xpu_ops/src/ops/mtp/draft_model_preprocess_v2.cc
@@ -0,0 +1,148 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
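For orientation before the next new file: everything get_infer_param.cc above does on the host is one partitioning pass over the batch. A pure-Python sketch of that pass (illustrative only; the names mirror the C++ vectors, this is not a FastDeploy API, and the *_batch_idx gap-compacted variants are omitted for brevity):

    def get_infer_param(seq_lens_encoder, seq_lens_decoder):
        # one slot per request: encoder len > 0 -> prefill slot,
        # decoder len > 0 -> decode slot, both zero -> finished/empty slot
        enc_map, dec_map = [], []          # original slot index per sub-batch
        enc_lod = [0]                      # prefix sums of encoder lengths
        dec_ctx_len, dec_ctx_len_cache = [], []
        for i, (enc, dec) in enumerate(zip(seq_lens_encoder, seq_lens_decoder)):
            if enc > 0:
                enc_map.append(i)
                enc_lod.append(enc_lod[-1] + enc)
            elif dec > 0:
                dec_map.append(i)
                dec_ctx_len.append(dec + 1)    # context includes the new token
                dec_ctx_len_cache.append(dec)
        return enc_map, enc_lod, dec_map, dec_ctx_len, dec_ctx_len_cache

    # e.g. encoder lens [5, 0, 0, 3], decoder lens [0, 7, 0, 0]
    # -> enc_map [0, 3], enc_lod [0, 5, 8], dec_map [1], dec_ctx_len [8]

The lod arrays are plain prefix sums; the attention kernels consume them in place of per-slot lengths.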
+
+#include <paddle/phi/backends/xpu/xpu_context.h>
+#include "paddle/extension.h"
+#include "paddle/phi/core/enforce.h"
+#include "xpu/plugin.h"
+
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
+
+namespace api = baidu::xpu::api;
+void DraftModelPreprocessV2(const paddle::Tensor& draft_tokens,
+                            const paddle::Tensor& input_ids,
+                            const paddle::Tensor& stop_flags,
+                            const paddle::Tensor& seq_lens_this_time,
+                            const paddle::Tensor& seq_lens_encoder,
+                            const paddle::Tensor& seq_lens_decoder,
+                            const paddle::Tensor& step_idx,
+                            const paddle::Tensor& not_need_stop,
+                            const paddle::Tensor& is_block_step,
+                            const paddle::Tensor& batch_drop,
+                            const paddle::Tensor& pre_ids,
+                            const paddle::Tensor& accept_tokens,
+                            const paddle::Tensor& accept_num,
+                            const paddle::Tensor& base_model_seq_lens_this_time,
+                            const paddle::Tensor& base_model_seq_lens_encoder,
+                            const paddle::Tensor& base_model_seq_lens_decoder,
+                            const paddle::Tensor& base_model_step_idx,
+                            const paddle::Tensor& base_model_stop_flags,
+                            const paddle::Tensor& base_model_is_block_step,
+                            const paddle::Tensor& base_model_draft_tokens,
+                            const int num_model_step,
+                            const bool truncate_first_token,
+                            const bool splitwise_prefill,
+                            const bool kvcache_scheduler_v1) {
+
+    phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
+    auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
+    api::Context* ctx = static_cast<const phi::XPUContext*>(dev_ctx)->x_context();
+    if (draft_tokens.is_cpu()) {
+        ctx = new api::Context(api::kCPU);
+    }
+    int real_bsz = seq_lens_this_time.shape()[0];
+    int accept_tokens_len = accept_tokens.shape()[1];
+    int input_ids_len = input_ids.shape()[1];
+    int draft_tokens_len = draft_tokens.shape()[1];
+    int pre_ids_len = pre_ids.shape()[1];
+    constexpr int BlockSize = 512;
+    int base_model_draft_tokens_len = base_model_draft_tokens.shape()[1];
+    auto not_need_stop_gpu =
+        not_need_stop.copy_to(seq_lens_this_time.place(), false);
+
+    int r = baidu::xpu::api::plugin::draft_model_preprocess_v2(
+        ctx,
+        const_cast<int64_t*>(draft_tokens.data<int64_t>()),
+        const_cast<int64_t*>(input_ids.data<int64_t>()),
+        const_cast<bool*>(stop_flags.data<bool>()),
+        const_cast<int*>(seq_lens_this_time.data<int>()),
+        const_cast<int*>(seq_lens_encoder.data<int>()),
+        const_cast<int*>(seq_lens_decoder.data<int>()),
+        const_cast<int64_t*>(step_idx.data<int64_t>()),
+        const_cast<bool*>(not_need_stop_gpu.data<bool>()),
+        const_cast<bool*>(is_block_step.data<bool>()),
+        const_cast<bool*>(batch_drop.data<bool>()),
+        const_cast<int64_t*>(pre_ids.data<int64_t>()),
+        accept_tokens.data<int64_t>(),
+        accept_num.data<int>(),
+        base_model_seq_lens_this_time.data<int>(),
+        base_model_seq_lens_encoder.data<int>(),
+        base_model_seq_lens_decoder.data<int>(),
+        base_model_step_idx.data<int64_t>(),
+        base_model_stop_flags.data<bool>(),
+        base_model_is_block_step.data<bool>(),
+        const_cast<int64_t*>(base_model_draft_tokens.data<int64_t>()),
+        real_bsz,
+        num_model_step,
+        accept_tokens_len,
+        draft_tokens_len,
+        input_ids_len,
+        base_model_draft_tokens_len,
+        pre_ids_len,
+        truncate_first_token,
+        splitwise_prefill,
+        kvcache_scheduler_v1);
+
+    PD_CHECK(r == 0, "xpu::plugin::draft_model_preprocess_v2 failed.");
+    auto not_need_stop_cpu =
+        not_need_stop_gpu.copy_to(not_need_stop.place(), false);
+    bool* not_need_stop_data = const_cast<bool*>(not_need_stop.data<bool>());
+    not_need_stop_data[0] = not_need_stop_cpu.data<bool>()[0];
+}
+
+PD_BUILD_STATIC_OP(draft_model_preprocess_v2)
+    .Inputs({"draft_tokens",
+             "input_ids",
+             "stop_flags",
+             "seq_lens_this_time",
+             "seq_lens_encoder",
+             "seq_lens_decoder",
+             "step_idx",
+             "not_need_stop",
+             "is_block_step",
+             "batch_drop",
+             "pre_ids",
+             "accept_tokens",
+             "accept_num",
+             "base_model_seq_lens_this_time",
+
"base_model_seq_lens_encoder", + "base_model_seq_lens_decoder", + "base_model_step_idx", + "base_model_stop_flags", + "base_model_is_block_step", + "base_model_draft_tokens"}) + .Outputs({"draft_tokens_out", + "input_ids_out", + "stop_flags_out", + "seq_lens_this_time_out", + "seq_lens_encoder_out", + "seq_lens_decoder_out", + "step_idx_out", + "not_need_stop_out", + "batch_drop_out", + "pre_ids_out"}) + .Attrs({"num_model_step: int", "truncate_first_token: bool", "splitwise_prefill: bool", "kvcache_scheduler_v1: bool"}) + .SetInplaceMap({{"draft_tokens", "draft_tokens_out"}, + {"input_ids", "input_ids_out"}, + {"stop_flags", "stop_flags_out"}, + {"seq_lens_this_time", "seq_lens_this_time_out"}, + {"seq_lens_encoder", "seq_lens_encoder_out"}, + {"seq_lens_decoder", "seq_lens_decoder_out"}, + {"step_idx", "step_idx_out"}, + {"not_need_stop", "not_need_stop_out"}, + {"batch_drop", "batch_drop_out"}, + {"pre_ids", "pre_ids_out"}}) + .SetKernelFn(PD_KERNEL(DraftModelPreprocessV2)); diff --git a/custom_ops/xpu_ops/src/ops/mtp/speculate_get_padding_offset_v2.cc b/custom_ops/xpu_ops/src/ops/mtp/speculate_get_padding_offset_v2.cc new file mode 100644 index 0000000000..3740703b2f --- /dev/null +++ b/custom_ops/xpu_ops/src/ops/mtp/speculate_get_padding_offset_v2.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include <paddle/phi/backends/xpu/xpu_context.h>
+#include "paddle/extension.h"
+#include "xpu/plugin.h"
+
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
+
+std::vector<paddle::Tensor> SpeculateGetPaddingOffsetV2(
+    const paddle::Tensor& input_ids,
+    const paddle::Tensor& draft_tokens,
+    const paddle::Tensor& cum_offsets,
+    const paddle::Tensor& token_num,
+    const paddle::Tensor& seq_len,
+    const paddle::Tensor& seq_lens_encoder) {
+    phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
+    auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
+    auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
+
+    std::vector<int64_t> input_ids_shape = input_ids.shape();
+    const int bsz = seq_len.shape()[0];
+    const int seq_length = input_ids_shape[1];
+    const int max_draft_tokens = draft_tokens.shape()[1];
+    auto cum_offsets_out = cum_offsets.copy_to(cum_offsets.place(), false);
+    auto cpu_token_num = token_num.copy_to(paddle::CPUPlace(), false);
+
+    const int token_num_data = cpu_token_num.data<int64_t>()[0];
+    auto x_remove_padding = paddle::empty(
+        {token_num_data}, paddle::DataType::INT64, input_ids.place());
+    auto padding_offset = paddle::empty(
+        {token_num_data}, paddle::DataType::INT32, input_ids.place());
+    auto batch_id_per_token = paddle::empty(
+        {token_num_data}, paddle::DataType::INT32, input_ids.place());
+    auto cu_seqlens_q =
+        paddle::empty({bsz + 1}, paddle::DataType::INT32, input_ids.place());
+    auto cu_seqlens_k =
+        paddle::empty({bsz + 1}, paddle::DataType::INT32, input_ids.place());
+
+    PD_CHECK(input_ids.is_contiguous(), "Input ids tensor must be contiguous");
+    PD_CHECK(draft_tokens.is_contiguous(),
+             "Draft tokens tensor must be contiguous");
+    PD_CHECK(cum_offsets.is_contiguous(),
+             "Cum offsets tensor must be contiguous");
+    PD_CHECK(seq_len.is_contiguous(), "Seq lens tensor must be contiguous");
+
+    int r = baidu::xpu::api::plugin::speculate_get_padding_offset_v2(
+        xpu_ctx->x_context(),
+        batch_id_per_token.data<int>(),
+        cum_offsets_out.data<int>(),
+        cu_seqlens_q.data<int>(),
+        cu_seqlens_k.data<int>(),
+        cum_offsets.data<int>(),
+        seq_len.data<int>(),
+        seq_length,
+        bsz);
+    PD_CHECK(r == 0, "XPU speculate_get_padding_offset_v2 failed");
+
+    r = baidu::xpu::api::plugin::speculate_remove_padding(
+        xpu_ctx->x_context(),
+        x_remove_padding.data<int64_t>(),
+        input_ids.data<int64_t>(),
+        draft_tokens.data<int64_t>(),
+        seq_len.data<int>(),
+        seq_lens_encoder.data<int>(),
+        cum_offsets_out.data<int>(),
+        seq_length,
+        max_draft_tokens,
+        bsz,
+        token_num_data);
+    PD_CHECK(r == 0, "XPU speculate_remove_padding failed");
+
+    return {x_remove_padding,
+            batch_id_per_token,
+            cu_seqlens_q,
+            cu_seqlens_k};  // , enc_token_num, dec_token_num};
+}
+
+std::vector<std::vector<int64_t>> SpeculateGetPaddingOffsetV2InferShape(
+    const std::vector<int64_t>& input_ids_shape,
+    const std::vector<int64_t>& draft_tokens_shape,
+    const std::vector<int64_t>& cum_offsets_shape,
+    const std::vector<int64_t>& token_num_shape,
+    const std::vector<int64_t>& seq_len_shape,
+    const std::vector<int64_t>& seq_lens_encoder_shape) {
+    int64_t bsz = seq_len_shape[0];
+    // four outputs: x_remove_padding, batch_id_per_token,
+    // cu_seqlens_q, cu_seqlens_k
+    return {{-1}, {-1}, {bsz + 1}, {bsz + 1}};
+}
+
+std::vector<paddle::DataType> SpeculateGetPaddingOffsetV2InferDtype(
+    const paddle::DataType& input_ids_dtype,
+    const paddle::DataType& draft_tokens_dtype,
+    const paddle::DataType& cum_offsets_dtype,
+    const paddle::DataType& token_num_dtype,
+    const paddle::DataType& seq_len_dtype,
+    const paddle::DataType& seq_lens_encoder_dtype) {
+    return {input_ids_dtype,
+            seq_len_dtype,
+            seq_len_dtype,
+            seq_len_dtype};
+}
+
+PD_BUILD_STATIC_OP(speculate_get_padding_offset_v2)
+
.Inputs({"input_ids", + "draft_tokens", + "cum_offsets", + "token_num", + "seq_len", + "seq_lens_encoder"}) + .Outputs({"x_remove_padding", + "batch_id_per_token", + "cu_seqlens_q", + "cu_seqlens_k"}) + .SetKernelFn(PD_KERNEL(SpeculateGetPaddingOffsetV2)) + .SetInferShapeFn(PD_INFER_SHAPE(SpeculateGetPaddingOffsetV2InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(SpeculateGetPaddingOffsetV2InferDtype)); diff --git a/custom_ops/xpu_ops/src/ops/mtp/speculate_save_output.cc b/custom_ops/xpu_ops/src/ops/mtp/speculate_save_output.cc index 60764b26a5..5bb6f744c4 100644 --- a/custom_ops/xpu_ops/src/ops/mtp/speculate_save_output.cc +++ b/custom_ops/xpu_ops/src/ops/mtp/speculate_save_output.cc @@ -35,8 +35,7 @@ void SpeculateSaveWithOutputMsg(const paddle::Tensor& accept_tokens, const paddle::Tensor& not_need_stop, int64_t rank_id, int msg_queue_id, - int save_each_rank) { - // printf("enter save output"); + bool save_each_rank) { if (!save_each_rank && rank_id > 0) { return; } diff --git a/custom_ops/xpu_ops/src/ops/mtp/speculate_verify.cc b/custom_ops/xpu_ops/src/ops/mtp/speculate_verify.cc index 53b5b90dc3..4b9e99944f 100644 --- a/custom_ops/xpu_ops/src/ops/mtp/speculate_verify.cc +++ b/custom_ops/xpu_ops/src/ops/mtp/speculate_verify.cc @@ -45,7 +45,8 @@ void SpeculateVerify(const paddle::Tensor &accept_tokens, const paddle::Tensor &topp, int max_seq_len, int verify_window, - bool enable_topp) { + bool enable_topp, + bool benchmark_mode) { auto bsz = accept_tokens.shape()[0]; int real_bsz = seq_lens_this_time.shape()[0]; auto max_draft_tokens = draft_tokens.shape()[1]; @@ -133,7 +134,8 @@ void SpeculateVerify(const paddle::Tensor &accept_tokens, max_seq_len, max_candidate_len, verify_window, - prefill_one_step_stop); + prefill_one_step_stop, + benchmark_mode); } else { baidu::xpu::api::plugin::speculate_verify( ctx, @@ -161,7 +163,8 @@ void SpeculateVerify(const paddle::Tensor &accept_tokens, max_seq_len, max_candidate_len, verify_window, - prefill_one_step_stop); + prefill_one_step_stop, + benchmark_mode); } } else { if (enable_topp) { @@ -191,7 +194,8 @@ void SpeculateVerify(const paddle::Tensor &accept_tokens, max_seq_len, max_candidate_len, verify_window, - prefill_one_step_stop); + prefill_one_step_stop, + benchmark_mode); } else { baidu::xpu::api::plugin::speculate_verify( ctx, @@ -219,7 +223,8 @@ void SpeculateVerify(const paddle::Tensor &accept_tokens, max_seq_len, max_candidate_len, verify_window, - prefill_one_step_stop); + prefill_one_step_stop, + benchmark_mode); } } } @@ -246,7 +251,10 @@ PD_BUILD_STATIC_OP(speculate_verify) "accept_num_out", "step_idx_out", "stop_flags_out"}) - .Attrs({"max_seq_len: int", "verify_window: int", "enable_topp: bool"}) + .Attrs({"max_seq_len: int", + "verify_window: int", + "enable_topp: bool", + "benchmark_mode: bool"}) .SetInplaceMap({{"accept_tokens", "accept_tokens_out"}, {"accept_num", "accept_num_out"}, {"step_idx", "step_idx_out"}, diff --git a/custom_ops/xpu_ops/src/ops/pybind/pybind.cc b/custom_ops/xpu_ops/src/ops/pybind/pybind.cc index e8eea990b2..194e54a682 100644 --- a/custom_ops/xpu_ops/src/ops/pybind/pybind.cc +++ b/custom_ops/xpu_ops/src/ops/pybind/pybind.cc @@ -205,7 +205,8 @@ void SpeculateVerify(const paddle::Tensor& accept_tokens, const paddle::Tensor& topp, int max_seq_len, int verify_window, - bool enable_topp); + bool enable_topp, + bool benchmark_mode); void SpeculateClearAcceptNums(const paddle::Tensor& accept_num, const paddle::Tensor& seq_lens_decoder); @@ -242,6 +243,32 @@ void DraftModelPreprocess(const paddle::Tensor& 
draft_tokens,
                           const bool truncate_first_token,
                           const bool splitwise_prefill);
 
+void DraftModelPreprocessV2(const paddle::Tensor& draft_tokens,
+                            const paddle::Tensor& input_ids,
+                            const paddle::Tensor& stop_flags,
+                            const paddle::Tensor& seq_lens_this_time,
+                            const paddle::Tensor& seq_lens_encoder,
+                            const paddle::Tensor& seq_lens_decoder,
+                            const paddle::Tensor& step_idx,
+                            const paddle::Tensor& not_need_stop,
+                            const paddle::Tensor& is_block_step,
+                            const paddle::Tensor& batch_drop,
+                            const paddle::Tensor& pre_ids,
+                            const paddle::Tensor& accept_tokens,
+                            const paddle::Tensor& accept_num,
+                            const paddle::Tensor& base_model_seq_lens_this_time,
+                            const paddle::Tensor& base_model_seq_lens_encoder,
+                            const paddle::Tensor& base_model_seq_lens_decoder,
+                            const paddle::Tensor& base_model_step_idx,
+                            const paddle::Tensor& base_model_stop_flags,
+                            const paddle::Tensor& base_model_is_block_step,
+                            const paddle::Tensor& base_model_draft_tokens,
+                            const int num_model_step,
+                            const bool truncate_first_token,
+                            const bool splitwise_prefill,
+                            const bool kvcache_scheduler_v1);
+
+
 void DraftModelPostprocess(const paddle::Tensor& base_model_draft_tokens,
                            const paddle::Tensor& base_model_seq_lens_this_time,
                            const paddle::Tensor& base_model_seq_lens_encoder,
@@ -278,6 +305,14 @@ std::vector<paddle::Tensor> SpeculateGetPaddingOffset(
     const paddle::Tensor& seq_len,
     const paddle::Tensor& seq_lens_encoder);
 
+std::vector<paddle::Tensor> SpeculateGetPaddingOffsetV2(
+    const paddle::Tensor& input_ids,
+    const paddle::Tensor& draft_tokens,
+    const paddle::Tensor& cum_offsets,
+    const paddle::Tensor& token_num,
+    const paddle::Tensor& seq_len,
+    const paddle::Tensor& seq_lens_encoder);
+
 void MTPStepPaddle(
     const paddle::Tensor& base_model_stop_flags,
     const paddle::Tensor& stop_flags,
@@ -552,6 +587,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
           py::arg("max_seq_len"),
           py::arg("verify_window"),
           py::arg("enable_topp"),
+          py::arg("benchmark_mode"),
           "Perform speculative verification for decoding");
 
     m.def("speculate_clear_accept_nums",
@@ -598,6 +634,35 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
           py::arg("splitwise_prefill"),
           "Preprocess data for draft model in speculative decoding");
 
+    m.def("draft_model_preprocess_v2",
+          &DraftModelPreprocessV2,
+          py::arg("draft_tokens"),
+          py::arg("input_ids"),
+          py::arg("stop_flags"),
+          py::arg("seq_lens_this_time"),
+          py::arg("seq_lens_encoder"),
+          py::arg("seq_lens_decoder"),
+          py::arg("step_idx"),
+          py::arg("not_need_stop"),
+          py::arg("is_block_step"),
+          py::arg("batch_drop"),
+          py::arg("pre_ids"),
+          py::arg("accept_tokens"),
+          py::arg("accept_num"),
+          py::arg("base_model_seq_lens_this_time"),
+          py::arg("base_model_seq_lens_encoder"),
+          py::arg("base_model_seq_lens_decoder"),
+          py::arg("base_model_step_idx"),
+          py::arg("base_model_stop_flags"),
+          py::arg("base_model_is_block_step"),
+          py::arg("base_model_draft_tokens"),
+          py::arg("num_model_step"),
+          py::arg("truncate_first_token"),
+          py::arg("splitwise_prefill"),
+          py::arg("kvcache_scheduler_v1"),
+          "Preprocess data for draft model in speculative decoding");
+
+
     m.def("draft_model_postprocess",
           &DraftModelPostprocess,
           py::arg("base_model_draft_tokens"),
@@ -645,6 +710,16 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
           py::arg("seq_lens_encoder"),
           "Get padding offset");
 
+    m.def("speculate_get_padding_offset_v2",
+          &SpeculateGetPaddingOffsetV2,
+          py::arg("input_ids"),
+          py::arg("draft_tokens"),
+          py::arg("cum_offsets"),
+          py::arg("token_num"),
+          py::arg("seq_len"),
+          py::arg("seq_lens_encoder"),
+          "Get padding offset v2");
+
     m.def("mtp_step_paddle",
           &MTPStepPaddle,
py::arg("base_model_stop_flags"), diff --git a/custom_ops/xpu_ops/src/plugin/include/xpu/plugin.h b/custom_ops/xpu_ops/src/plugin/include/xpu/plugin.h index 5ce2559564..557401d31b 100644 --- a/custom_ops/xpu_ops/src/plugin/include/xpu/plugin.h +++ b/custom_ops/xpu_ops/src/plugin/include/xpu/plugin.h @@ -227,7 +227,8 @@ DLL_EXPORT int speculate_verify(Context* ctx, const int max_seq_len, const int max_candidate_len, const int verify_window, - const bool prefill_one_step_stop); + const bool prefill_one_step_stop, + const bool benchmark_mode); DLL_EXPORT int speculate_clear_accept_nums(Context* ctx, int* accept_num, @@ -343,6 +344,17 @@ DLL_EXPORT int speculate_get_padding_offset(Context* ctx, const int max_seq_len, int bsz); + +DLL_EXPORT int speculate_get_padding_offset_v2(Context* ctx, + int* batch_id_per_token, + int* cum_offsets_out, + int* cu_seqlens_q, + int* cu_seqlens_k, + const int* cum_offsets, + const int* seq_lens, + const int max_seq_len, + int bsz); + DLL_EXPORT int compute_self_order(api::Context* ctx, const int* last_seq_lens_this_time, const int* seq_lens_this_time, @@ -457,6 +469,38 @@ DLL_EXPORT int rebuild_self_hidden_states(api::Context* ctx, T* output, int dim_embed, int elem_cnt); +DLL_EXPORT int draft_model_preprocess_v2(api::Context* ctx, + int64_t* draft_tokens, + int64_t* input_ids, + bool* stop_flags, + int* seq_lens_this_time, + int* seq_lens_encoder, + int* seq_lens_decoder, + int64_t* step_idx, + bool* not_need_stop, + bool* is_block_step, + bool* batch_drop, + int64_t* pre_ids, + const int64_t* accept_tokens, + const int* accept_num, + const int* base_model_seq_lens_this_time, + const int* base_model_seq_lens_encoder, + const int* base_model_seq_lens_decoder, + const int64_t* base_model_step_idx, + const bool* base_model_stop_flags, + const bool* base_model_is_block_step, + int64_t* base_model_draft_tokens, + const int bsz, + const int num_model_step, + const int accept_tokens_len, + const int draft_tokens_len, + const int input_ids_len, + const int base_model_draft_tokens_len, + const int pre_ids_len, + const bool truncate_first_token, + const bool splitwise_prefill, + const bool kvcache_scheduler_v1); + /*--------------------------------------- MTP end --------------------------------------------*/ } // namespace plugin diff --git a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/compute_order.xpu b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/compute_order.xpu index 7cd399d09c..b5bfa6ba91 100644 --- a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/compute_order.xpu +++ b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/compute_order.xpu @@ -20,6 +20,7 @@ __global__ void ComputeOrderKernel(const int* seq_lens_this_time, return; } + // 256 * int char lm[6 * 1024]; int buf_size = 6 * 1024 / (6 * sizeof(int)); int* lm_base_model_seq_lens_this_time = (int*)lm; @@ -68,10 +69,7 @@ __global__ void ComputeOrderKernel(const int* seq_lens_this_time, in_offset += write_size; } mfence_lm(); - // 2. base model encoder. Base step=0 - } else if (cur_base_model_seq_lens_encoder != 0) { - // nothing happens - // 3. New end + // 2. Base model stop at last verify-step. 
} else if (cur_base_model_seq_lens_this_time != 0 && cur_seq_lens_this_time == 0) { in_offset += cur_base_model_seq_lens_this_time; @@ -80,27 +78,16 @@ __global__ void ComputeOrderKernel(const int* seq_lens_this_time, cur_seq_lens_this_time == 0) { // nothing happens } else { - if (accept_num <= actual_draft_token_num) { - int position_map_val = out_offset; - LM2GM(&position_map_val, - position_map + in_offset + accept_num - 1, - sizeof(int)); - out_offset++; - in_offset += cur_base_model_seq_lens_this_time; - } else { - int position_map_val_1 = out_offset; - LM2GM(&position_map_val_1, - position_map + in_offset + accept_num - 2, - sizeof(int)); - out_offset++; - int position_map_val_2 = out_offset; - LM2GM(&position_map_val_2, - position_map + in_offset + accept_num - 1, - sizeof(int)); - out_offset++; - in_offset += cur_base_model_seq_lens_this_time; + // accept_num << buf_size, so do not need split + for (int i = 0; i < accept_num; i++) { + lm_position_map[i] = out_offset++; } mfence_lm(); + LM2GM(lm_position_map, + position_map + in_offset, + accept_num * sizeof(int)); + in_offset += cur_base_model_seq_lens_this_time - accept_num; + mfence_lm(); } } } diff --git a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/draft_model_preprocess_v2.xpu b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/draft_model_preprocess_v2.xpu new file mode 100644 index 0000000000..052de1b7fc --- /dev/null +++ b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/draft_model_preprocess_v2.xpu @@ -0,0 +1,250 @@ +#include "xpu/kernel/cluster.h" +#include "xpu/kernel/cluster_debug.h" +#include "xpu/kernel/cluster_partition.h" +#include "xpu/kernel/cluster_primitive.h" +#include "xpu/kernel/cluster_simd.h" + +namespace xpu3 { +namespace plugin { +__global__ void draft_model_preprocess_v2( + int64_t* draft_tokens, + int64_t* input_ids, + bool* stop_flags, + int* seq_lens_this_time, + int* seq_lens_encoder, + int* seq_lens_decoder, + int64_t* step_idx, + bool* not_need_stop, + bool* is_block_step, + bool* batch_drop, + int64_t* pre_ids, + const int64_t* accept_tokens, + const int* accept_num, + const int* base_model_seq_lens_this_time, + const int* base_model_seq_lens_encoder, + const int* base_model_seq_lens_decoder, + const int64_t* base_model_step_idx, + const bool* base_model_stop_flags, + const bool* base_model_is_block_step, + int64_t* base_model_draft_tokens, + const int bsz, + const int num_model_step, + const int accept_tokens_len, + const int draft_tokens_len, + const int input_ids_len, + const int base_model_draft_tokens_len, + const int pre_ids_len, + const bool truncate_first_token, + const bool splitwise_prefill, + const bool kvcache_scheduler_v1) { + int cid = core_id(); + int ncores = core_num(); + int clusterid = cluster_id(); + int nclusters = cluster_num(); + int tid = clusterid * ncores + cid; + __shared__ int not_stop_flag_sm[64]; + not_stop_flag_sm[cid] = 0; + int64_t accept_tokens_now[128]; + + int value_zero = 0; + int64_t value_fu = -1; + + if (splitwise_prefill) { + for (; tid < bsz; tid += ncores * nclusters) { + int64_t base_model_step_idx_now = 0; + int seq_lens_encoder_now = 0; + int seq_lens_this_time_now = 0; + bool stop_flags_now = false; + int64_t base_model_first_token; + int seq_lens_encoder_record_now = 0; + int64_t input_ids_now = 0; + + GM2LM_ASYNC(base_model_step_idx + tid, + &base_model_step_idx_now, + sizeof(int64_t)); + GM2LM_ASYNC( + seq_lens_encoder + tid, &seq_lens_encoder_now, sizeof(int)); + GM2LM(accept_tokens + tid * 
accept_tokens_len,
+                  &base_model_first_token,
+                  sizeof(int64_t));
+            if (seq_lens_encoder_now > 0) {
+                not_stop_flag_sm[cid] += 1;
+                stop_flags_now = false;
+                int position = seq_lens_encoder_now;
+                if (truncate_first_token) {
+                    position = position - 1;
+                    input_ids_now = base_model_first_token;
+                    seq_lens_this_time_now = seq_lens_encoder_now;
+                } else {
+                    input_ids_now = base_model_first_token;
+                    seq_lens_this_time_now = seq_lens_encoder_now + 1;
+                }
+                LM2GM_ASYNC(&input_ids_now,
+                            input_ids + tid * input_ids_len + position,
+                            sizeof(int64_t));
+            } else {
+                stop_flags_now = true;
+                seq_lens_this_time_now = 0;
+                seq_lens_encoder_now = 0;
+                not_stop_flag_sm[cid] += 0;
+                LM2GM_ASYNC(&value_zero, seq_lens_decoder + tid, sizeof(int));
+            }
+            LM2GM_ASYNC(
+                &seq_lens_encoder_now, seq_lens_encoder + tid, sizeof(int));
+            LM2GM_ASYNC(&stop_flags_now, stop_flags + tid, sizeof(bool));
+            LM2GM(
+                &seq_lens_this_time_now, seq_lens_this_time + tid, sizeof(int));
+        }
+    } else {
+        for (; tid < bsz; tid += ncores * nclusters) {
+            bool base_model_stop_flags_now = false;
+            bool base_model_is_block_step_now = false;
+            bool batch_drop_now = false;
+            bool stop_flags_now = false;
+            bool is_block_step_now = false;
+            int seq_lens_this_time_now = 0;
+            int seq_lens_encoder_now = 0;
+            int seq_lens_decoder_new = 0;
+            int accept_num_now = 0;
+            int base_model_seq_lens_decoder_now = 0;
+            int base_model_seq_lens_this_time_now = 0;
+            int64_t step_id_now = 0;
+            int64_t base_model_step_idx_now;
+            int64_t pre_ids_now;
+            mfence();
+            GM2LM_ASYNC(is_block_step + tid, &is_block_step_now, sizeof(bool));
+            GM2LM_ASYNC(base_model_stop_flags + tid,
+                        &base_model_stop_flags_now,
+                        sizeof(bool));
+            GM2LM_ASYNC(base_model_is_block_step + tid,
+                        &base_model_is_block_step_now,
+                        sizeof(bool));
+            GM2LM_ASYNC(batch_drop + tid, &batch_drop_now, sizeof(bool));
+            GM2LM_ASYNC(stop_flags + tid, &stop_flags_now, sizeof(bool));
+            GM2LM_ASYNC(
+                seq_lens_encoder + tid, &seq_lens_encoder_now, sizeof(int));
+            GM2LM_ASYNC(
+                seq_lens_decoder + tid, &seq_lens_decoder_new, sizeof(int));
+
+            GM2LM_ASYNC(accept_tokens + tid * accept_tokens_len,
+                        accept_tokens_now,
+                        accept_tokens_len * sizeof(int64_t));
+            GM2LM_ASYNC(accept_num + tid, &accept_num_now, sizeof(int));
+
+            GM2LM_ASYNC(base_model_seq_lens_this_time + tid,
+                        &base_model_seq_lens_this_time_now,
+                        sizeof(int));
+            GM2LM_ASYNC(base_model_seq_lens_decoder + tid,
+                        &base_model_seq_lens_decoder_now,
+                        sizeof(int));
+            GM2LM_ASYNC(step_idx + tid, &step_id_now, sizeof(int64_t));
+            GM2LM(base_model_step_idx + tid,
+                  &base_model_step_idx_now,
+                  sizeof(int64_t));
+
+            for (int i = 1; i < base_model_draft_tokens_len; i++) {
+                // int64 slots: write all 8 bytes of the -1 sentinel
+                LM2GM_ASYNC(&value_fu,
+                            base_model_draft_tokens +
+                                tid * base_model_draft_tokens_len + i,
+                            sizeof(int64_t));
+            }
+            if (kvcache_scheduler_v1) {
+                if (base_model_stop_flags_now && base_model_is_block_step_now) {
+                    stop_flags_now = true;
+                    is_block_step_now = true;
+                }
+            } else {
+                if (base_model_stop_flags_now && base_model_is_block_step_now) {
+                    batch_drop_now = true;
+                    stop_flags_now = true;
+                }
+            }
+
+            if (!(base_model_stop_flags_now || batch_drop_now)) {
+                not_stop_flag_sm[cid] += 1;
+                if (seq_lens_encoder_now > 0) {
+                    int seq_len_encoder = seq_lens_encoder_now;
+                    stop_flags_now = false;
+                    int64_t base_model_first_token = accept_tokens_now[0];
+                    LM2GM(&base_model_first_token,
+                          pre_ids + tid * pre_ids_len,
+                          sizeof(int64_t));
+                    int position = seq_len_encoder;
+                    if (truncate_first_token) {
+                        LM2GM(&base_model_first_token,
+                              input_ids + tid * input_ids_len + position - 1,
+
sizeof(int64_t)); + seq_lens_this_time_now = seq_len_encoder; + } else { + LM2GM(&base_model_first_token, + input_ids + tid * input_ids_len + position, + sizeof(int64_t)); + seq_lens_this_time_now = seq_len_encoder + 1; + } + } else { + if (kvcache_scheduler_v1) { + if (!base_model_is_block_step_now && + is_block_step_now) { + is_block_step_now = false; + } + } + if (stop_flags_now) { + stop_flags_now = false; + seq_lens_decoder_new = base_model_seq_lens_decoder_now - + base_model_seq_lens_this_time_now; + step_id_now = base_model_step_idx_now - + base_model_seq_lens_this_time_now; + + } else { + seq_lens_decoder_new -= num_model_step - 1; + step_id_now -= num_model_step - 1; + } + for (int i = 0; i < accept_num_now; i++) { + const int pre_id_pos = + base_model_step_idx_now - (accept_num_now - i); + LM2GM(accept_tokens_now + i, + draft_tokens + tid * draft_tokens_len + i, + sizeof(int64_t)); + LM2GM(accept_tokens_now + i, + pre_ids + tid * pre_ids_len + pre_id_pos, + sizeof(int64_t)); + } + seq_lens_this_time_now = accept_num_now; + } + + } else { + stop_flags_now = true; + seq_lens_this_time_now = 0; + seq_lens_encoder_now = 0; + seq_lens_decoder_new = 0; + } + LM2GM_ASYNC(&stop_flags_now, stop_flags + tid, sizeof(bool)); + LM2GM_ASYNC(&batch_drop_now, batch_drop + tid, sizeof(bool)); + LM2GM_ASYNC(&is_block_step_now, is_block_step + tid, sizeof(bool)); + LM2GM_ASYNC( + &seq_lens_decoder_new, seq_lens_decoder + tid, sizeof(int)); + LM2GM_ASYNC( + &seq_lens_this_time_now, seq_lens_this_time + tid, sizeof(int)); + LM2GM_ASYNC( + &seq_lens_encoder_now, seq_lens_encoder + tid, sizeof(int)); + LM2GM_ASYNC(&step_id_now, step_idx + tid, sizeof(int64_t)); + } + } + mfence(); + sync_cluster(); + bool value_true = true; + bool value_false = false; + if (cid == 0) { + for (int i = 0; i < ncores; i++) { + not_stop_flag_sm[0] += not_stop_flag_sm[i]; + } + if (not_stop_flag_sm[0] > 0) { + LM2GM(&value_true, not_need_stop, sizeof(bool)); + } else { + LM2GM(&value_false, not_need_stop, sizeof(bool)); + } + } +} + +} // namespace plugin +} // namespace xpu3 diff --git a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/draft_model_update.xpu b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/draft_model_update.xpu index 0334995f9d..50ba31d61b 100644 --- a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/draft_model_update.xpu +++ b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/draft_model_update.xpu @@ -60,10 +60,8 @@ __global__ void draft_model_update(const int64_t* inter_next_tokens, token_this_time = next_tokens_start[seq_len_this_time - 1]; draft_token_now[0] = next_tokens_start[seq_len_this_time - 1]; base_model_draft_tokens_now[substep + 1] = token_this_time; - for (int i = 0; i < seq_len_this_time; ++i) { - pre_ids_now[step_idx[tid] + 1 + i] = next_tokens_start[i]; - } step_idx[tid] += seq_len_this_time; + pre_ids_now[step_idx[tid]] = token_this_time; } else { token_this_time = next_tokens_start[0]; seq_lens_decoder[tid] = seq_len_encoder + seq_len_decoder; diff --git a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/speculate_get_padding_offset.xpu b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/speculate_get_padding_offset.xpu index c08d756d7c..4af74b8f62 100644 --- a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/speculate_get_padding_offset.xpu +++ b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/speculate_get_padding_offset.xpu @@ -101,6 +101,43 @@ __global__ void 
speculate_get_padding_offset(int* padding_offset,
     }
 }
 
+
+__global__ void speculate_get_padding_offset_v2(int* batch_id_per_token,
+                                                int* cum_offsets_out,
+                                                int* cu_seqlens_q,
+                                                int* cu_seqlens_k,
+                                                const int* cum_offsets,
+                                                const int* seq_lens,
+                                                const int max_seq_len,
+                                                int bsz) {
+    int bid = cluster_id();
+    int tid = core_id();
+    int ncores = core_num();
+    int nclusters = cluster_num();
+    int seq_lens_now = 0;
+    int cum_offsets_now = 0;
+    int cum_offsets_now_ind = 0;
+    for (int bi = bid; bi < bsz; bi += nclusters) {
+        GM2LM(seq_lens + bi, &seq_lens_now, sizeof(int));
+        if (bi == 0) {
+            cum_offsets_now = 0;
+        } else {
+            GM2LM(cum_offsets + bi - 1, &cum_offsets_now, sizeof(int));
+        }
+        GM2LM(cum_offsets + bi, &cum_offsets_now_ind, sizeof(int));
+
+        for (int i = tid; i < seq_lens_now; i += ncores) {
+            LM2GM(&bi,
+                  batch_id_per_token + bi * max_seq_len - cum_offsets_now + i,
+                  sizeof(int));
+        }
+        LM2GM(&cum_offsets_now, cum_offsets_out + bi, sizeof(int));
+        int cum_seq_len = (bi + 1) * max_seq_len - cum_offsets_now_ind;
+        LM2GM(&cum_seq_len, cu_seqlens_q + bi + 1, sizeof(int));
+        LM2GM(&cum_seq_len, cu_seqlens_k + bi + 1, sizeof(int));
+    }
+}
+
 #define _XPU_DEF_SPECULATE_KERNELS_(T)                          \
     template __global__ void speculate_remove_padding(T*,       \
                                                       const T*, \
diff --git a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/speculate_verify.xpu b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/speculate_verify.xpu
index 68eb2bd606..4287c3e7d8 100644
--- a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/speculate_verify.xpu
+++ b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/mtp_kernel/speculate_verify.xpu
@@ -138,7 +138,8 @@ __global__ void speculate_verify(
     const int max_candidate_len,  // scalar: maximum number of candidates per
                                   // verify token (for verification/sampling)
     const int verify_window,  // scalar: top-k verify window (number of
                               // consecutive top-1 matches allowed)
-    const bool prefill_one_step_stop) {
+    const bool prefill_one_step_stop,
+    const bool benchmark_mode) {
     const int cid = core_id();
     const int64_t tid = cluster_id() * core_num() + core_id();
     const int64_t nthreads = cluster_num() * core_num();
@@ -161,6 +162,9 @@ __global__ void speculate_verify(
         // printf("seq_lens_this_time[%d]-1: %d \n",bid,
         // seq_lens_this_time[bid]-1);
         for (; i < seq_lens_this_time[bid] - 1; i++) {
+            if (benchmark_mode) {
+                break;
+            }
             if (seq_lens_encoder[bid] != 0) {
                 break;
             }
@@ -300,33 +304,34 @@ __global__ void speculate_verify(
         }
     }
 }
-#define SPECULATE_VERIFY_INSTANTIATE(ENABLE_TOPP, USE_TOPK)            \
-    template __global__ void speculate_verify<ENABLE_TOPP, USE_TOPK>( \
-        int64_t * accept_tokens,                                       \
-        int *accept_num,                                               \
-        int64_t *step_idx,                                             \
-        bool *stop_flags,                                              \
-        const int *seq_lens_encoder,                                   \
-        const int *seq_lens_decoder,                                   \
-        const int64_t *draft_tokens,                                   \
-        const int *actual_draft_token_nums,                            \
-        const float *dev_curand_states,                                \
-        const float *topp,                                             \
-        const int *seq_lens_this_time,                                 \
-        const int64_t *verify_tokens,                                  \
-        const float *verify_scores,                                    \
-        const int64_t *max_dec_len,                                    \
-        const int64_t *end_tokens,                                     \
-        const bool *is_block_step,                                     \
-        const int *output_cum_offsets,                                 \
-        const int *actual_candidate_len,                               \
-        int real_bsz,                                                  \
-        int max_draft_tokens,                                          \
-        int end_length,                                                \
-        int max_seq_len,                                               \
-        int max_candidate_len,                                         \
-        int verify_window,                                             \
-        bool prefill_one_step_stop);
+#define SPECULATE_VERIFY_INSTANTIATE(ENABLE_TOPP, USE_TOPK)            \
+    template __global__ void speculate_verify<ENABLE_TOPP, USE_TOPK>( \
+        int64_t * accept_tokens,                                       \
+        int *accept_num,                                               \
+        int64_t *step_idx,                                             \
+        bool *stop_flags,                                              \
+        const int *seq_lens_encoder,                                   \
+        const int *seq_lens_decoder,                                   \
+        const int64_t *draft_tokens,                                   \
+        const int *actual_draft_token_nums,                            \
+        const float *dev_curand_states,                                \
+        const float *topp,                                             \
+        const int *seq_lens_this_time,                                 \
+        const int64_t *verify_tokens,                                  \
+        const float *verify_scores,                                    \
+        const int64_t *max_dec_len,                                    \
+        const int64_t *end_tokens,                                     \
+        const bool *is_block_step,                                     \
+        const int *output_cum_offsets,                                 \
+        const int *actual_candidate_len,                               \
+        int real_bsz,                                                  \
+        int max_draft_tokens,                                          \
+        int end_length,                                                \
+        int max_seq_len,                                               \
+        int max_candidate_len,                                         \
+        int verify_window,                                             \
+        bool prefill_one_step_stop,                                    \
+        bool benchmark_mode);
 SPECULATE_VERIFY_INSTANTIATE(true, true)
 SPECULATE_VERIFY_INSTANTIATE(true, false)
 SPECULATE_VERIFY_INSTANTIATE(false, true)
diff --git a/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/draft_model_preprocess_v2.cpp b/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/draft_model_preprocess_v2.cpp
new file mode 100644
index 0000000000..3eedc4e67f
--- /dev/null
+++ b/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/draft_model_preprocess_v2.cpp
@@ -0,0 +1,425 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "xpu/plugin.h"
+#include "xpu/refactor/impl/launch_strategy.h"
+#include "xpu/refactor/impl_public/wrapper_check.h"
+#include "xpu/xdnn.h"
+
+namespace xpu3 {
+namespace plugin {
+__attribute__((global)) void draft_model_preprocess_v2(
+    int64_t* draft_tokens,
+    int64_t* input_ids,
+    bool* stop_flags,
+    int* seq_lens_this_time,
+    int* seq_lens_encoder,
+    int* seq_lens_decoder,
+    int64_t* step_idx,
+    bool* not_need_stop,
+    bool* is_block_step,
+    bool* batch_drop,
+    int64_t* pre_ids,
+    const int64_t* accept_tokens,
+    const int* accept_num,
+    const int* base_model_seq_lens_this_time,
+    const int* base_model_seq_lens_encoder,
+    const int* base_model_seq_lens_decoder,
+    const int64_t* base_model_step_idx,
+    const bool* base_model_stop_flags,
+    const bool* base_model_is_block_step,
+    int64_t* base_model_draft_tokens,
+    const int bsz,
+    const int num_model_step,
+    const int accept_tokens_len,
+    const int draft_tokens_len,
+    const int input_ids_len,
+    const int base_model_draft_tokens_len,
+    const int pre_ids_len,
+    const bool truncate_first_token,
+    const bool splitwise_prefill,
+    const bool kvcache_scheduler_v1);
+}  // namespace plugin
+}  // namespace xpu3
+
+namespace xpu2 {
+namespace plugin {}  // namespace plugin
+}  // namespace xpu2
+
+namespace baidu {
+namespace xpu {
+namespace api {
+namespace plugin {
+
+static int cpu_wrapper(api::Context* ctx,
+                       int64_t* draft_tokens,
+                       int64_t* input_ids,
+                       bool* stop_flags,
+                       int* seq_lens_this_time,
+                       int* seq_lens_encoder,
+                       int* seq_lens_decoder,
+                       int64_t* step_idx,
+                       bool* not_need_stop,
+                       bool* is_block_step,
+                       bool* batch_drop,
+                       int64_t* pre_ids,
+                       const int64_t* accept_tokens,
+                       const int* accept_num,
+                       const int* base_model_seq_lens_this_time,
+                       const int* base_model_seq_lens_encoder,
+                       const int* base_model_seq_lens_decoder,
+                       const int64_t* base_model_step_idx,
+                       const
bool* base_model_stop_flags, + const bool* base_model_is_block_step, + int64_t* base_model_draft_tokens, + const int bsz, + const int num_model_step, + const int accept_tokens_len, + const int draft_tokens_len, + const int input_ids_len, + const int base_model_draft_tokens_len, + const int pre_ids_len, + const bool truncate_first_token, + const bool splitwise_prefill, + const bool kvcache_scheduler_v1) { + int64_t not_stop_flag_sum = 0; + int64_t not_stop_flag = 0; + for (int tid = 0; tid < bsz; tid++) { + if (splitwise_prefill) { + auto* input_ids_now = input_ids + tid * input_ids_len; + auto* accept_tokens_now = accept_tokens + tid * accept_tokens_len; + if (seq_lens_encoder[tid] > 0) { + not_stop_flag = 1; + int seq_len_encoder = seq_lens_encoder[tid]; + stop_flags[tid] = false; + int64_t base_model_first_token = accept_tokens_now[0]; + int position = seq_len_encoder; + if (truncate_first_token) { + input_ids_now[position - 1] = base_model_first_token; + seq_lens_this_time[tid] = seq_len_encoder; + } else { + input_ids_now[position] = base_model_first_token; + seq_lens_this_time[tid] = seq_len_encoder + 1; + } + } else { + stop_flags[tid] = true; + seq_lens_this_time[tid] = 0; + seq_lens_decoder[tid] = 0; + seq_lens_encoder[tid] = 0; + not_stop_flag = 0; + } + not_stop_flag_sum += not_stop_flag; + } else { + auto* accept_tokens_now = accept_tokens + tid * accept_tokens_len; + auto* draft_tokens_now = draft_tokens + tid * draft_tokens_len; + auto accept_num_now = accept_num[tid]; + auto* input_ids_now = input_ids + tid * input_ids_len; + auto* base_model_draft_tokens_now = + base_model_draft_tokens + tid * base_model_draft_tokens_len; + auto base_model_seq_len_decoder = base_model_seq_lens_decoder[tid]; + const int32_t base_model_seq_len_this_time = + base_model_seq_lens_this_time[tid]; + auto* pre_ids_now = pre_ids + tid * pre_ids_len; + for (int i = 1; i < base_model_draft_tokens_len; i++) { + base_model_draft_tokens_now[i] = -1; + } + if(kvcache_scheduler_v1) { + if (base_model_stop_flags[tid] && + base_model_is_block_step[tid]) { + stop_flags[tid] = true; + is_block_step[tid] = true; + // Need to continue infer + } + } else { + if (base_model_stop_flags[tid] && + base_model_is_block_step[tid]) { + batch_drop[tid] = true; + stop_flags[tid] = true; + } + } + + if (!(base_model_stop_flags[tid] || batch_drop[tid])) { + not_stop_flag = 1; + // prefill generation + if (seq_lens_encoder[tid] > 0) { + // Can be extended to first few tokens + int seq_len_encoder = seq_lens_encoder[tid]; + stop_flags[tid] = false; + int64_t base_model_first_token = accept_tokens_now[0]; + pre_ids_now[0] = base_model_first_token; + int position = seq_len_encoder; + if (truncate_first_token) { + input_ids_now[position - 1] = base_model_first_token; + seq_lens_this_time[tid] = seq_len_encoder; + } else { + input_ids_now[position] = base_model_first_token; + seq_lens_this_time[tid] = seq_len_encoder + 1; + } + } else { // decode generation + if(kvcache_scheduler_v1) { + // 3. 
try to recover mtp infer in V1 mode
+                    if (!base_model_is_block_step[tid] &&
+                        is_block_step[tid]) {
+                        is_block_step[tid] = false;
+                    }
+                }
+                if (stop_flags[tid]) {
+                    stop_flags[tid] = false;
+                    // TODO: check
+                    seq_lens_decoder[tid] = base_model_seq_len_decoder -
+                                            base_model_seq_len_this_time;
+                    step_idx[tid] = base_model_step_idx[tid] -
+                                    base_model_seq_len_this_time;
+                } else {
+                    // 2: Last base model generated token and first MTP
+                    // token
+                    seq_lens_decoder[tid] -= num_model_step - 1;
+                    step_idx[tid] -= num_model_step - 1;
+                }
+                for (int i = 0; i < accept_num_now; i++) {
+                    draft_tokens_now[i] = accept_tokens_now[i];
+                    const int pre_id_pos =
+                        base_model_step_idx[tid] - (accept_num_now - i);
+                    const int64_t accept_token = accept_tokens_now[i];
+                    pre_ids_now[pre_id_pos] = accept_token;
+                }
+                seq_lens_this_time[tid] = accept_num_now;
+            }
+
+            } else {
+                stop_flags[tid] = true;
+                seq_lens_this_time[tid] = 0;
+                seq_lens_decoder[tid] = 0;
+                seq_lens_encoder[tid] = 0;
+            }
+            not_stop_flag_sum += not_stop_flag;
+        }
+    }
+    not_need_stop[0] = not_stop_flag_sum > 0;
+    return api::SUCCESS;
+}
+
+static int xpu3_wrapper(api::Context* ctx,
+                        int64_t* draft_tokens,
+                        int64_t* input_ids,
+                        bool* stop_flags,
+                        int* seq_lens_this_time,
+                        int* seq_lens_encoder,
+                        int* seq_lens_decoder,
+                        int64_t* step_idx,
+                        bool* not_need_stop,
+                        bool* is_block_step,
+                        bool* batch_drop,
+                        int64_t* pre_ids,
+                        const int64_t* accept_tokens,
+                        const int* accept_num,
+                        const int* base_model_seq_lens_this_time,
+                        const int* base_model_seq_lens_encoder,
+                        const int* base_model_seq_lens_decoder,
+                        const int64_t* base_model_step_idx,
+                        const bool* base_model_stop_flags,
+                        const bool* base_model_is_block_step,
+                        int64_t* base_model_draft_tokens,
+                        const int bsz,
+                        const int num_model_step,
+                        const int accept_tokens_len,
+                        const int draft_tokens_len,
+                        const int input_ids_len,
+                        const int base_model_draft_tokens_len,
+                        const int pre_ids_len,
+                        const bool truncate_first_token,
+                        const bool splitwise_prefill,
+                        const bool kvcache_scheduler_v1) {
+    using XPU_INT64 = typename XPUIndexType<int64_t>::type;
+
+    // NOTE: keep a single cluster here; the kernel reduces
+    // not_stop_flag_sm in cluster shared memory (gsm)
+    xpu3::plugin::draft_model_preprocess_v2<<<1, 64, ctx->xpu_stream>>>(
+        reinterpret_cast<XPU_INT64*>(draft_tokens),
+        reinterpret_cast<XPU_INT64*>(input_ids),
+        stop_flags,
+        seq_lens_this_time,
+        seq_lens_encoder,
+        seq_lens_decoder,
+        reinterpret_cast<XPU_INT64*>(step_idx),
+        not_need_stop,
+        is_block_step,
+        batch_drop,
+        reinterpret_cast<XPU_INT64*>(pre_ids),
+        reinterpret_cast<const XPU_INT64*>(accept_tokens),
+        accept_num,
+        base_model_seq_lens_this_time,
+        base_model_seq_lens_encoder,
+        base_model_seq_lens_decoder,
+        reinterpret_cast<const XPU_INT64*>(base_model_step_idx),
+        base_model_stop_flags,
+        base_model_is_block_step,
+        reinterpret_cast<XPU_INT64*>(base_model_draft_tokens),
+        bsz,
+        num_model_step,
+        accept_tokens_len,
+        draft_tokens_len,
+        input_ids_len,
+        base_model_draft_tokens_len,
+        pre_ids_len,
+        truncate_first_token,
+        splitwise_prefill,
+        kvcache_scheduler_v1);
+    return api::SUCCESS;
+}
+
+int draft_model_preprocess_v2(api::Context* ctx,
+                              int64_t* draft_tokens,
+                              int64_t* input_ids,
+                              bool* stop_flags,
+                              int* seq_lens_this_time,
+                              int* seq_lens_encoder,
+                              int* seq_lens_decoder,
+                              int64_t* step_idx,
+                              bool* not_need_stop,
+                              bool* is_block_step,
+                              bool* batch_drop,
+                              int64_t* pre_ids,
+                              const int64_t* accept_tokens,
+                              const int* accept_num,
+                              const int* base_model_seq_lens_this_time,
+                              const int* base_model_seq_lens_encoder,
+                              const int* base_model_seq_lens_decoder,
+                              const int64_t* base_model_step_idx,
+                              const bool* base_model_stop_flags,
+                              const bool*
base_model_is_block_step, + int64_t* base_model_draft_tokens, + const int bsz, + const int num_model_step, + const int accept_tokens_len, + const int draft_tokens_len, + const int input_ids_len, + const int base_model_draft_tokens_len, + const int pre_ids_len, + const bool truncate_first_token, + const bool splitwise_prefill, + const bool kvcache_scheduler_v1) { + WRAPPER_CHECK_CTX(ctx); + WRAPPER_DUMP_FUNCTION_T1(ctx, "draft_model_preprocess_v2", int64_t); + WRAPPER_DUMP_PARAM6(ctx, + draft_tokens, + input_ids, + stop_flags, + seq_lens_this_time, + seq_lens_encoder, + seq_lens_decoder); + WRAPPER_DUMP_PARAM5( + ctx, step_idx, not_need_stop, is_block_step, batch_drop, pre_ids); + WRAPPER_DUMP_PARAM3( + ctx, accept_tokens, accept_num, base_model_seq_lens_encoder); + WRAPPER_DUMP_PARAM4(ctx, + base_model_seq_lens_encoder, + base_model_seq_lens_decoder, + base_model_step_idx, + base_model_stop_flags); + WRAPPER_DUMP_PARAM3( + ctx, base_model_is_block_step, base_model_draft_tokens, bsz); + WRAPPER_DUMP_PARAM3( + ctx, num_model_step, accept_tokens_len, draft_tokens_len); + WRAPPER_DUMP_PARAM4(ctx, + input_ids_len, + base_model_draft_tokens_len, + pre_ids_len, + truncate_first_token); + WRAPPER_DUMP_PARAM2(ctx, splitwise_prefill, kvcache_scheduler_v1); + WRAPPER_DUMP(ctx); + + WRAPPER_CHECK_PTR(ctx, int, bsz, seq_lens_this_time); + WRAPPER_CHECK_PTR(ctx, int64_t, bsz * accept_tokens_len, accept_tokens); + WRAPPER_CHECK_PTR(ctx, int64_t, bsz * input_ids_len, input_ids); + WRAPPER_CHECK_PTR(ctx, int64_t, bsz * draft_tokens_len, draft_tokens); + WRAPPER_CHECK_PTR(ctx, + int64_t, + bsz * base_model_draft_tokens_len, + base_model_draft_tokens); + + WRAPPER_ASSERT_GT(ctx, bsz, 0); + WRAPPER_ASSERT_LT(ctx, accept_tokens_len, 128); + + if (ctx->dev().type() == api::kCPU) { + return cpu_wrapper(ctx, + draft_tokens, + input_ids, + stop_flags, + seq_lens_this_time, + seq_lens_encoder, + seq_lens_decoder, + step_idx, + not_need_stop, + is_block_step, + batch_drop, + pre_ids, + accept_tokens, + accept_num, + base_model_seq_lens_this_time, + base_model_seq_lens_encoder, + base_model_seq_lens_decoder, + base_model_step_idx, + base_model_stop_flags, + base_model_is_block_step, + base_model_draft_tokens, + bsz, + num_model_step, + accept_tokens_len, + draft_tokens_len, + input_ids_len, + base_model_draft_tokens_len, + pre_ids_len, + truncate_first_token, + splitwise_prefill, + kvcache_scheduler_v1); + } + if (ctx->dev().type() == api::kXPU3) { + return xpu3_wrapper(ctx, + draft_tokens, + input_ids, + stop_flags, + seq_lens_this_time, + seq_lens_encoder, + seq_lens_decoder, + step_idx, + not_need_stop, + is_block_step, + batch_drop, + pre_ids, + accept_tokens, + accept_num, + base_model_seq_lens_this_time, + base_model_seq_lens_encoder, + base_model_seq_lens_decoder, + base_model_step_idx, + base_model_stop_flags, + base_model_is_block_step, + base_model_draft_tokens, + bsz, + num_model_step, + accept_tokens_len, + draft_tokens_len, + input_ids_len, + base_model_draft_tokens_len, + pre_ids_len, + truncate_first_token, + splitwise_prefill, + kvcache_scheduler_v1); + } + WRAPPER_UNIMPLEMENTED(ctx); +} + +} // namespace plugin +} // namespace api +} // namespace xpu +} // namespace baidu diff --git a/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/speculate_get_padding_offset.cpp b/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/speculate_get_padding_offset.cpp index a0066e4557..0886a0196a 100644 --- a/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/speculate_get_padding_offset.cpp +++ 
b/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/speculate_get_padding_offset.cpp
@@ -42,6 +42,16 @@ __attribute__((global)) void speculate_get_padding_offset(
     const int max_seq_len,
     int bsz);
 
+__attribute__((global)) void speculate_get_padding_offset_v2(
+    int* batch_id_per_token,
+    int* cum_offsets_out,
+    int* cu_seqlens_q,
+    int* cu_seqlens_k,
+    const int* cum_offsets,
+    const int* seq_lens,
+    const int max_seq_len,
+    int bsz);
+
 }  // namespace plugin
 }  // namespace xpu3
 
@@ -99,6 +109,29 @@ static int cpu_wrapper_get_padding_offset(Context* ctx,
     return api::SUCCESS;
 }
 
+
+static int cpu_wrapper_get_padding_offset_v2(Context* ctx,
+                                             int* batch_id_per_token,
+                                             int* cum_offsets_out,
+                                             int* cu_seqlens_q,
+                                             int* cu_seqlens_k,
+                                             const int* cum_offsets,
+                                             const int* seq_lens,
+                                             const int max_seq_len,
+                                             int bsz) {
+    for (int bi = 0; bi < bsz; ++bi) {
+        int cum_offset = bi == 0 ? 0 : cum_offsets[bi - 1];
+        for (int i = 0; i < seq_lens[bi]; i++) {
+            batch_id_per_token[bi * max_seq_len - cum_offset + i] = bi;
+        }
+        cum_offsets_out[bi] = cum_offset;
+        int cum_seq_len = (bi + 1) * max_seq_len - cum_offsets[bi];
+        cu_seqlens_q[bi + 1] = cum_seq_len;
+        cu_seqlens_k[bi + 1] = cum_seq_len;
+    }
+    return api::SUCCESS;
+}
+
 template <typename T>
 static int xpu3_wrapper_remove_padding(Context* ctx,
                                        T* output_data,
@@ -150,6 +183,29 @@ static int xpu3_wrapper_get_padding_offset(Context* ctx,
     return api::SUCCESS;
 }
 
+static int xpu3_wrapper_get_padding_offset_v2(Context* ctx,
+                                              int* batch_id_per_token,
+                                              int* cum_offsets_out,
+                                              int* cu_seqlens_q,
+                                              int* cu_seqlens_k,
+                                              const int* cum_offsets,
+                                              const int* seq_lens,
+                                              const int max_seq_len,
+                                              int bsz) {
+    xpu3::plugin::speculate_get_padding_offset_v2<<<ctx->ncluster(),
+                                                    64,
+                                                    ctx->xpu_stream>>>(
+        batch_id_per_token,
+        cum_offsets_out,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        cum_offsets,
+        seq_lens,
+        max_seq_len,
+        bsz);
+    return api::SUCCESS;
+}
+
+
 template <typename T>
 int speculate_remove_padding(Context* ctx,
                              T* x_remove_padding,
@@ -271,6 +327,63 @@ int speculate_get_padding_offset(Context* ctx,
     WRAPPER_UNIMPLEMENTED(ctx);
 }
 
+int speculate_get_padding_offset_v2(Context* ctx,
+                                    int* batch_id_per_token,
+                                    int* cum_offsets_out,
+                                    int* cu_seqlens_q,
+                                    int* cu_seqlens_k,
+                                    const int* cum_offsets,
+                                    const int* seq_lens,
+                                    const int max_seq_len,
+                                    int bsz) {
+    WRAPPER_CHECK_CTX(ctx);
+
+    WRAPPER_DUMP_FUNCTION_T1(ctx, "speculate_get_padding_offset_v2", float);
+    WRAPPER_DUMP_PARAM6(ctx,
+                        batch_id_per_token,
+                        cum_offsets_out,
+                        cu_seqlens_q,
+                        cu_seqlens_k,
+                        cum_offsets,
+                        seq_lens);
+    WRAPPER_DUMP_PARAM2(ctx, max_seq_len, bsz);
+    WRAPPER_DUMP(ctx);
+
+    WRAPPER_CHECK_PTR(ctx, int, bsz, cum_offsets);
+    WRAPPER_CHECK_PTR(ctx, int, bsz, seq_lens);
+    WRAPPER_CHECK_PTR(ctx, int, bsz, cum_offsets_out);
+    WRAPPER_CHECK_PTR(ctx, int, bsz + 1, cu_seqlens_q);
+    WRAPPER_CHECK_PTR(ctx, int, bsz + 1, cu_seqlens_k);
+
+    WRAPPER_ASSERT_GT(ctx, bsz, 0);
+    WRAPPER_ASSERT_GT(ctx, max_seq_len, 0);
+
+    if (ctx->dev().type() == api::kCPU) {
+        return cpu_wrapper_get_padding_offset_v2(ctx,
+                                                 batch_id_per_token,
+                                                 cum_offsets_out,
+                                                 cu_seqlens_q,
+                                                 cu_seqlens_k,
+                                                 cum_offsets,
+                                                 seq_lens,
+                                                 max_seq_len,
+                                                 bsz);
+    }
+    if (ctx->dev().type() == api::kXPU3) {
+        return xpu3_wrapper_get_padding_offset_v2(ctx,
+                                                  batch_id_per_token,
+                                                  cum_offsets_out,
+                                                  cu_seqlens_q,
+                                                  cu_seqlens_k,
+                                                  cum_offsets,
+                                                  seq_lens,
+                                                  max_seq_len,
+                                                  bsz);
+    }
+
+    WRAPPER_UNIMPLEMENTED(ctx);
+}
+
 #define INSTANTIATION_SPECULATE_REMOVE_PADDING(T)               \
     template int speculate_remove_padding(Context * ctx,        \
                                           T * x_remove_padding, \
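The CPU reference path just added (cpu_wrapper_get_padding_offset_v2) pins down the index math that the XPU kernel reproduces per cluster. The same arithmetic as a self-contained Python sketch (`get_padding_offset_v2` is a hypothetical stand-in for illustration, not the shipped binding):

    def get_padding_offset_v2(seq_lens, cum_offsets, max_seq_len):
        # cum_offsets[bi] = total padding slots in requests 0..bi
        bsz = len(seq_lens)
        batch_id_per_token = [0] * sum(seq_lens)
        cum_offsets_out = [0] * bsz
        cu_seqlens = [0] * (bsz + 1)  # q and k variants are identical here
        for bi in range(bsz):
            cum_offset = 0 if bi == 0 else cum_offsets[bi - 1]
            for i in range(seq_lens[bi]):
                # flattened (padding-removed) position of token i of request bi
                batch_id_per_token[bi * max_seq_len - cum_offset + i] = bi
            cum_offsets_out[bi] = cum_offset
            cu_seqlens[bi + 1] = (bi + 1) * max_seq_len - cum_offsets[bi]
        return batch_id_per_token, cum_offsets_out, cu_seqlens

Worked example: max_seq_len = 4, seq_lens = [2, 3], cum_offsets = [2, 3] gives cu_seqlens [0, 2, 5], i.e. request 0 owns flattened tokens 0-1 and request 1 owns tokens 2-4.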
a/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/speculate_verify.cpp b/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/speculate_verify.cpp index c5e3e425b7..c9571bd513 100644 --- a/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/speculate_verify.cpp +++ b/custom_ops/xpu_ops/src/plugin/src/wrapper/mtp_wrapper/speculate_verify.cpp @@ -48,7 +48,8 @@ __attribute__((global)) void speculate_verify( const int max_seq_len, const int max_candidate_len, const int verify_window, - const bool prefill_one_step_stop); + const bool prefill_one_step_stop, + const bool benchmark_mode); } // namespace plugin } // namespace xpu3 @@ -136,14 +137,15 @@ static int cpu_wrapper(Context *ctx, const int max_seq_len, const int max_candidate_len, const int verify_window, - const bool prefill_one_step_stop) { + const bool prefill_one_step_stop, + const bool benchmark_mode) { for (int bid = 0; bid < real_bsz; ++bid) { - const int start_token_id = bid * max_seq_len - output_cum_offsets[bid]; // verify and set stop flags int accept_num_now = 1; int stop_flag_now_int = 0; if (!(is_block_step[bid] || bid >= real_bsz)) { + const int start_token_id = bid * max_seq_len - output_cum_offsets[bid]; // printf("debug cpu bid:%d,start_token_id:%d\n",bid, start_token_id); // printf("bid %d\n", bid); @@ -160,6 +162,9 @@ static int cpu_wrapper(Context *ctx, // printf("seq_lens_this_time[%d]-1: %d \n",bid, // seq_lens_this_time[bid]-1); for (; i < seq_lens_this_time[bid] - 1; i++) { + if(benchmark_mode){ + break; + } if (seq_lens_encoder[bid] != 0) { break; } @@ -326,7 +331,8 @@ static int xpu3_wrapper(Context *ctx, const int max_seq_len, const int max_candidate_len, const int verify_window, - const bool prefill_one_step_stop) { + const bool prefill_one_step_stop, + const bool benchmark_mode) { using XPU_INT64 = typename XPUIndexType::type; xpu3::plugin::speculate_verify <<<1, 64, ctx->xpu_stream>>>( @@ -354,7 +360,8 @@ static int xpu3_wrapper(Context *ctx, max_seq_len, max_candidate_len, verify_window, - prefill_one_step_stop); + prefill_one_step_stop, + benchmark_mode); return api::SUCCESS; } template @@ -383,7 +390,8 @@ int speculate_verify(Context *ctx, const int max_seq_len, const int max_candidate_len, const int verify_window, - const bool prefill_one_step_stop) { + const bool prefill_one_step_stop, + const bool benchmark_mode) { WRAPPER_CHECK_CTX(ctx); WRAPPER_DUMP_FUNCTION_T1(ctx, "speculate_verify", int64_t); WRAPPER_DUMP_PARAM3(ctx, accept_tokens, accept_num, step_idx); @@ -406,12 +414,13 @@ int speculate_verify(Context *ctx, actual_candidate_len, real_bsz, max_draft_tokens); - WRAPPER_DUMP_PARAM5(ctx, + WRAPPER_DUMP_PARAM6(ctx, end_length, max_seq_len, max_candidate_len, verify_window, - prefill_one_step_stop); + prefill_one_step_stop, + benchmark_mode); WRAPPER_DUMP(ctx); WRAPPER_CHECK_PTR(ctx, int64_t, real_bsz * max_draft_tokens, accept_tokens); WRAPPER_CHECK_PTR(ctx, int, real_bsz, accept_num); @@ -469,7 +478,8 @@ int speculate_verify(Context *ctx, max_seq_len, max_candidate_len, verify_window, - prefill_one_step_stop); + prefill_one_step_stop, + benchmark_mode); } if (ctx->dev().type() == api::kXPU3) { return xpu3_wrapper(ctx, @@ -497,40 +507,42 @@ int speculate_verify(Context *ctx, max_seq_len, max_candidate_len, verify_window, - prefill_one_step_stop); + prefill_one_step_stop, + benchmark_mode); } WRAPPER_UNIMPLEMENTED(ctx); } -#define INSTANTIATE_SPECULATE_VERIFY(ENABLE_TOPP, USE_TOPK) \ - template int \ - baidu::xpu::api::plugin::speculate_verify( \ - baidu::xpu::api::Context *, /* xpu_ctx */ \ 
-      int64_t *, /* accept_tokens */ \
-      int *, /* accept_num */ \
-      int64_t *, /* step_idx */ \
-      bool *, /* stop_flags */ \
-      const int *, /* seq_lens_encoder */ \
-      const int *, /* seq_lens_decoder */ \
-      const int64_t *, /* draft_tokens */ \
-      const int *, /* actual_draft_token_nums */ \
-      const float *, /* dev_curand_states or topp */ \
-      const float *, /* topp or nullptr */ \
-      const int *, /* seq_lens_this_time */ \
-      const int64_t *, /* verify_tokens */ \
-      const float *, /* verify_scores */ \
-      const int64_t *, /* max_dec_len */ \
-      const int64_t *, /* end_tokens */ \
-      const bool *, /* is_block_step */ \
-      const int *, /* output_cum_offsets */ \
-      const int *, /* actual_candidate_len */ \
-      int, /* real_bsz */ \
-      int, /* max_draft_tokens */ \
-      int, /* end_length */ \
-      int, /* max_seq_len */ \
-      int, /* max_candidate_len */ \
-      int, /* verify_window */ \
-      bool); /* prefill_one_step_stop */
+#define INSTANTIATE_SPECULATE_VERIFY(ENABLE_TOPP, USE_TOPK) \
+  template int \
+  baidu::xpu::api::plugin::speculate_verify<ENABLE_TOPP, USE_TOPK>( \
+      baidu::xpu::api::Context *, /* xpu_ctx */ \
+      int64_t *, /* accept_tokens */ \
+      int *, /* accept_num */ \
+      int64_t *, /* step_idx */ \
+      bool *, /* stop_flags */ \
+      const int *, /* seq_lens_encoder */ \
+      const int *, /* seq_lens_decoder */ \
+      const int64_t *, /* draft_tokens */ \
+      const int *, /* actual_draft_token_nums */ \
+      const float *, /* dev_curand_states or topp */ \
+      const float *, /* topp or nullptr */ \
+      const int *, /* seq_lens_this_time */ \
+      const int64_t *, /* verify_tokens */ \
+      const float *, /* verify_scores */ \
+      const int64_t *, /* max_dec_len */ \
+      const int64_t *, /* end_tokens */ \
+      const bool *, /* is_block_step */ \
+      const int *, /* output_cum_offsets */ \
+      const int *, /* actual_candidate_len */ \
+      int, /* real_bsz */ \
+      int, /* max_draft_tokens */ \
+      int, /* end_length */ \
+      int, /* max_seq_len */ \
+      int, /* max_candidate_len */ \
+      int, /* verify_window */ \
+      bool, /* prefill_one_step_stop */ \
+      bool); /* benchmark_mode */
 INSTANTIATE_SPECULATE_VERIFY(false, false)
 INSTANTIATE_SPECULATE_VERIFY(false, true)
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index bde7d2e9a7..1fa36d875e 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -959,6 +959,7 @@ def from_cli_args(cls, args: FlexibleArgumentParser) -> "EngineArgs":
     def create_speculative_config(self) -> SpeculativeConfig:
         """ """
         speculative_args = asdict(self)
+        print("ch -- debug ", speculative_args)
         if self.speculative_config is not None:
             for k, v in self.speculative_config.items():
                 speculative_args[k] = v
diff --git a/fastdeploy/model_executor/forward_meta.py b/fastdeploy/model_executor/forward_meta.py
index 0114bb53f9..7d3d9044e3 100644
--- a/fastdeploy/model_executor/forward_meta.py
+++ b/fastdeploy/model_executor/forward_meta.py
@@ -246,7 +246,8 @@ class XPUForwardMeta(ForwardMeta):
     total_enc_len: Optional[paddle.Tensor] = None
     # position embedding type in rope, supports 'NORMAL' or 'HALF_HEAD_DIM'
     pos_emb_type: Optional[str] = "NORMAL"
-
+    # valid batch size, kept on CPU
+    valid_bs_cpu = None
 
 @dataclass
 class DCUForwardMeta(ForwardMeta):
diff --git a/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py b/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py
index 3e48ab81f5..8401fd0d62 100644
--- a/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py
@@ -72,6 +72,8 @@ def __init__(
         kv_num_heads: int,
         num_heads: int,
         head_dim: int,
+        encoder_block_shape_q: int = -1,  # not used yet
+        decoder_block_shape_q: int = -1,  # not used yet
     ):
         """
         XPUAttentionBackend __init__
@@ -175,7 +177,7 @@ def forward_mixed(
             qkv,
             forward_meta.caches[2 * layer.layer_id],
             forward_meta.caches[2 * layer.layer_id + 1],
-            forward_meta.cum_offsets,
+            forward_meta.seq_lens_this_time if forward_meta.seq_lens_this_time is not None else forward_meta.cum_offsets,
             metadata.rotary_embs,
             metadata.block_tables,
             forward_meta.prefix_block_tables,
diff --git a/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py b/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py
index 04a8ab1024..4e9a99d6ab 100644
--- a/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py
+++ b/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py
@@ -182,24 +182,27 @@ def apply_speculative_penalty_multi_scores(
         from fastdeploy.model_executor.ops.gpu import (
             speculate_get_token_penalty_multi_scores,
         )
-
-        speculate_get_token_penalty_multi_scores(
-            pre_token_ids,
-            logits,
-            repetition_penalties,
-            frequency_penalties,
-            presence_penalties,
-            temperature,
-            bad_words_token_ids,
-            step_idx,
-            min_dec_lens,
-            eos_token_ids,
-            seq_lens_this_time,
-            output_padding_offset,
-            output_cum_offsets,
-            max_len,
+    elif current_platform.is_xpu():
+        from fastdeploy.model_executor.ops.xpu import (
+            speculate_get_token_penalty_multi_scores,
         )
     else:
         raise NotImplementedError
+    speculate_get_token_penalty_multi_scores(
+        pre_token_ids,
+        logits,
+        repetition_penalties,
+        frequency_penalties,
+        presence_penalties,
+        temperature,
+        bad_words_token_ids,
+        step_idx,
+        min_dec_lens,
+        eos_token_ids,
+        seq_lens_this_time,
+        output_padding_offset,
+        output_cum_offsets,
+        max_len,
+    )
     # inplace
     return logits
diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py
index b5e22d466e..f8f7af2d8b 100644
--- a/fastdeploy/model_executor/layers/sample/sampler.py
+++ b/fastdeploy/model_executor/layers/sample/sampler.py
@@ -434,9 +434,19 @@ def __init__(self, fd_config: FDConfig):
         """ """
         super().__init__()
         if current_platform.is_cuda():
-            self.forward = self.forward_cuda
+            from fastdeploy.model_executor.ops.gpu import (
+                speculate_verify,
+                top_p_candidates,
+            )
+        elif current_platform.is_xpu():
+            from fastdeploy.model_executor.ops.xpu import (
+                speculate_verify,
+                top_p_candidates,
+            )
         else:
             raise NotImplementedError
+        self.top_p_candidates = top_p_candidates  # newly added
+        self.speculate_verify = speculate_verify  # newly added
         self.speculative_verify_window = fd_config.speculative_config.verify_window
         self.speculative_max_candidate_len = fd_config.speculative_config.max_candidate_len
         self.speculative_benchmark_mode = fd_config.speculative_config.benchmark_mode
@@ -559,8 +569,6 @@ def forward_cuda(
     ) -> paddle.Tensor:
         """ """
-        from fastdeploy.model_executor.ops.gpu import speculate_verify, top_p_candidates
-
         logits = apply_speculative_penalty_multi_scores(
             sampling_metadata.pre_token_ids,
             logits,
@@ -580,7 +588,7 @@
         probs = F.softmax(logits)
 
-        verify_scores, verify_tokens, actual_candidate_len = top_p_candidates(
+        verify_scores, verify_tokens, actual_candidate_len = self.top_p_candidates(
             probs,
             sampling_metadata.top_p,
             share_inputs["output_padding_offset"],
@@ -588,7 +596,7 @@
             max_model_len,
         )
 
-        speculate_verify(
+        self.speculate_verify(
             share_inputs["accept_tokens"],
             share_inputs["accept_num"],
             share_inputs["step_idx"],
@@ -612,7 +620,7 @@ def forward_cuda(
             self.speculative_verify_window,
             True,  # enable_topp
             self.speculative_benchmark_mode,
-            accept_all_drafts,
+            # accept_all_drafts,
         )
 
         num_logprobs = sampling_metadata.max_num_logprobs
@@ -673,8 +681,8 @@ class MTPSampler(nn.Layer):
     def __init__(self, fd_config: FDConfig):
         """ """
         super().__init__()
-        if current_platform.is_cuda():
-            self.forward = self.forward_cuda
+        if current_platform.is_cuda() or current_platform.is_xpu():
+            self.forward = self.forward_cuda
         else:
             raise NotImplementedError
diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py
index 78e2700918..8254495253 100644
--- a/fastdeploy/model_executor/pre_and_post_process.py
+++ b/fastdeploy/model_executor/pre_and_post_process.py
@@ -23,6 +23,7 @@
 from fastdeploy import envs
 from fastdeploy.config import SpeculativeConfig
 from fastdeploy.platforms import current_platform
+from fastdeploy.model_executor.forward_meta import ForwardMeta, XPUForwardMeta
 
 if current_platform.is_iluvatar():
     from fastdeploy.model_executor.ops.iluvatar import (
@@ -56,6 +57,25 @@
         update_inputs,
         update_inputs_v1,
     )
+elif current_platform.is_xpu():
+    from fastdeploy.model_executor.ops.xpu import (
+        get_padding_offset,
+        save_output,
+        adjust_batch,
+        get_infer_param,
+        set_stop_value_multi_ends,
+        speculate_get_output_padding_offset,
+        speculate_get_padding_offset,
+        speculate_get_padding_offset_v2,
+        speculate_clear_accept_nums,
+        speculate_get_seq_lens_output,
+        speculate_save_output,
+        speculate_set_value_by_flags_and_idx,
+        step_paddle,
+        update_inputs,
+        update_inputs_v1,
+        speculate_update_v3,
+    )
 elif current_platform.is_intel_hpu():
     pass
 else:
@@ -88,6 +108,7 @@
 
 DISABLE_RECOVER = envs.FD_DISABLED_RECOVER == "1"
 
+
 def pre_process(
     input_ids: paddle.Tensor,
     seq_lens_this_time: int,
@@ -123,7 +144,7 @@ def pre_process(
             batch_id_per_token,
             cu_seqlens_q,
             cu_seqlens_k,
-        ) = speculate_get_padding_offset(
+        ) = speculate_get_padding_offset_v2(
             input_ids,
             draft_tokens,
             cum_offsets_now,
@@ -163,7 +184,378 @@
     )
 
-def _build_stream_transfer_data(output_tokens: np.ndarray):
+def xpu_pre_process(
+    input_ids: paddle.Tensor,
+    seq_lens_this_time: int,
+    share_inputs: Dict,
+    use_speculate_method: bool,
+    block_size: int,
+    draft_tokens: Optional[paddle.Tensor] = None,
+    seq_lens_encoder: Optional[paddle.Tensor] = None,
+    seq_lens_decoder: Optional[paddle.Tensor] = None,
+    forward_meta=None,
+) -> XPUForwardMeta:
+    """ """
+    max_len = input_ids.shape[1]
+    cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time, dtype="int32")
+    token_num = paddle.sum(seq_lens_this_time)
+
+    if use_speculate_method:
+        (
+            ids_remove_padding,
+            batch_id_per_token,
+            cu_seqlens_q,
+            cu_seqlens_k,
+        ) = speculate_get_padding_offset_v2(
+            input_ids,
+            draft_tokens,
+            cum_offsets_now,
+            token_num,
+            seq_lens_this_time,
+            seq_lens_encoder,
+        )
+        seq_lens_output = speculate_get_seq_lens_output(
+            seq_lens_this_time,
+            seq_lens_encoder,
+            seq_lens_decoder,
+        )
+        if isinstance(seq_lens_output, list):
+            seq_lens_output = seq_lens_output[0]
+        output_token_num = paddle.sum(seq_lens_output)
+        output_cum_offsets_tmp = paddle.cumsum(max_len - seq_lens_output, dtype="int32")
+        output_padding_offset, output_cum_offsets = speculate_get_output_padding_offset(
+            output_cum_offsets_tmp,
+            output_token_num,
+            seq_lens_output,
+            max_len,
+        )
+        share_inputs["output_cum_offsets"].copy_(output_cum_offsets, False)
+        share_inputs["output_padding_offset"].copy_(output_padding_offset, False)
+    else:
+        (
ids_remove_padding, + cum_offsets, + batch_id_per_token, + cu_seqlens_q, + cu_seqlens_k, + ) = get_padding_offset(input_ids, cum_offsets_now, token_num, seq_lens_this_time) + share_inputs["cum_offsets"] = cum_offsets + + + share_inputs["ids_remove_padding"] = None # set this after adjust batch + share_inputs["batch_id_per_token"] = batch_id_per_token + share_inputs["cu_seqlens_q"].copy_(cu_seqlens_q, False) + share_inputs["cu_seqlens_k"].copy_(cu_seqlens_k, False) + + xpu_forward_meta = None + if not use_speculate_method: + xpu_forward_meta = XPUForwardMeta( + input_ids=share_inputs["input_ids"], + ids_remove_padding=share_inputs["ids_remove_padding"], + rotary_embs=share_inputs["rope_emb"], + attn_backend=None, + seq_lens_encoder=share_inputs["seq_lens_encoder"], + seq_lens_decoder=share_inputs["seq_lens_decoder"], + seq_lens_this_time=share_inputs["seq_lens_this_time"], + cum_offsets=None if use_speculate_method else share_inputs["cum_offsets"], + batch_id_per_token=share_inputs["batch_id_per_token"], + cu_seqlens_q=share_inputs["cu_seqlens_q"], + cu_seqlens_k=share_inputs["cu_seqlens_k"], + block_tables=share_inputs["block_tables"], + caches=share_inputs["caches"], + ) + else: + xpu_forward_meta = ForwardMeta( + input_ids=share_inputs["input_ids"], + ids_remove_padding=share_inputs["ids_remove_padding"], + rotary_embs=share_inputs["rope_emb"], + attn_backend=None, + decoder_batch_ids=share_inputs["decoder_batch_ids"], + decoder_tile_ids_per_batch=share_inputs["decoder_tile_ids_per_batch"], + decoder_num_blocks_cpu=share_inputs["decoder_num_blocks_cpu"], + decoder_num_blocks_device=share_inputs["decoder_num_blocks_device"], + decoder_chunk_size_device=share_inputs["decoder_chunk_size_device"], + max_len_tensor_cpu=share_inputs["max_len_tensor_cpu"], + seq_lens_encoder=share_inputs["seq_lens_encoder"], + seq_lens_decoder=share_inputs["seq_lens_decoder"], + seq_lens_this_time=share_inputs["seq_lens_this_time"], + batch_id_per_token=share_inputs["batch_id_per_token"], + cu_seqlens_q=share_inputs["cu_seqlens_q"], + cu_seqlens_k=share_inputs["cu_seqlens_k"], + block_tables=share_inputs["block_tables"], + caches=share_inputs["caches"], + encoder_batch_ids=share_inputs["encoder_batch_ids"], + encoder_tile_ids_per_batch=share_inputs["encoder_tile_ids_per_batch"], + encoder_num_blocks_x_cpu=share_inputs["encoder_num_blocks_x_cpu"], + kv_batch_ids=share_inputs["kv_batch_ids"], + kv_tile_ids_per_batch=share_inputs["kv_tile_ids_per_batch"], + kv_num_blocks_x_cpu=share_inputs["kv_num_blocks_x_cpu"], + max_len_kv_cpu=share_inputs["max_len_kv_cpu"], + ) + + ( + xpu_forward_meta.encoder_batch_map, + xpu_forward_meta.decoder_batch_map, + xpu_forward_meta.encoder_batch_idx, + xpu_forward_meta.decoder_batch_idx, + xpu_forward_meta.encoder_seq_lod, + xpu_forward_meta.decoder_seq_lod, + xpu_forward_meta.encoder_kv_lod, + xpu_forward_meta.prefix_len, + xpu_forward_meta.decoder_context_len, + xpu_forward_meta.decoder_context_len_cache, + xpu_forward_meta.prefix_block_tables, + xpu_forward_meta.encoder_batch_map_cpu, + xpu_forward_meta.decoder_batch_map_cpu, + xpu_forward_meta.encoder_batch_idx_cpu, + xpu_forward_meta.decoder_batch_idx_cpu, + xpu_forward_meta.encoder_seq_lod_cpu, + xpu_forward_meta.decoder_seq_lod_cpu, + xpu_forward_meta.encoder_kv_lod_cpu, + xpu_forward_meta.prefix_len_cpu, + xpu_forward_meta.decoder_context_len_cpu, + xpu_forward_meta.decoder_context_len_cache_cpu, + xpu_forward_meta.len_info_cpu, + ) = get_infer_param( + seq_lens_encoder, seq_lens_decoder, seq_lens_this_time, 
xpu_forward_meta.block_tables, block_size + ) + xpu_forward_meta.enc_batch = xpu_forward_meta.len_info_cpu[0] + xpu_forward_meta.dec_batch = xpu_forward_meta.len_info_cpu[1] + xpu_forward_meta.total_enc_len = xpu_forward_meta.len_info_cpu[2] + + # TODO(chenhuan09):support MTP + if not use_speculate_method: + adjusted_input = adjust_batch( + ids_remove_padding.reshape([-1, 1]), + cum_offsets if not use_speculate_method else seq_lens_this_time, + xpu_forward_meta.encoder_seq_lod, + xpu_forward_meta.encoder_batch_idx, + xpu_forward_meta.decoder_batch_idx, + xpu_forward_meta.encoder_seq_lod_cpu, + xpu_forward_meta.encoder_batch_idx_cpu, + xpu_forward_meta.decoder_batch_idx_cpu, + xpu_forward_meta.enc_batch, + xpu_forward_meta.dec_batch, + None, # output_padding_offset + -1, # max_input_length + ) + adjusted_input = adjusted_input.squeeze(1) + + share_inputs["ids_remove_padding"] = adjusted_input + xpu_forward_meta.ids_remove_padding = adjusted_input + + else: + share_inputs["ids_remove_padding"] = ids_remove_padding + xpu_forward_meta.ids_remove_padding = ids_remove_padding + return xpu_forward_meta + +def xpu_process_output( + forward_output, + cum_offsets: paddle.Tensor, + xpu_forward_meta: XPUForwardMeta, + share_inputs, +) -> paddle.Tensor: + """ """ + from fastdeploy.model_executor.ops.xpu import gather_next_token + output_padding_offset = share_inputs.get("output_padding_offset", None) + hiddden_states = gather_next_token( + forward_output, + cum_offsets if cum_offsets is not None else xpu_forward_meta.seq_lens_this_time, + xpu_forward_meta.encoder_seq_lod, + xpu_forward_meta.encoder_batch_map, + xpu_forward_meta.decoder_batch_map, + xpu_forward_meta.encoder_seq_lod_cpu, + xpu_forward_meta.encoder_batch_map_cpu, + xpu_forward_meta.decoder_batch_map_cpu, + xpu_forward_meta.len_info_cpu, + output_padding_offset, # output_padding_offset + ) + return hiddden_states + + +def xpu_post_process_normal( + sampled_token_ids: paddle.Tensor, + model_output: ModelOutputData, + share_inputs: Dict[str, paddle.Tensor], + save_each_rank: bool = False, + block_size: int = 64, + skip_save_output: bool = False, +) -> None: + """ """ + from fastdeploy.model_executor.ops.xpu import ( + save_output, + set_stop_value_multi_ends, + update_inputs, + ) + + # handle vl: + if model_output.enable_thinking: + exists_think_end = sampled_token_ids == model_output.think_end_id + paddle.assign( + paddle.where( + exists_think_end, + model_output.need_think_end - 1, + model_output.need_think_end, + ), + model_output.need_think_end, + ) + + paddle.assign( + paddle.where( + model_output.need_think_end.cast("bool"), + model_output.reasoning_index - 1, + model_output.reasoning_index, + ), + model_output.reasoning_index, + ) + + stop_wo_think = ( + (sampled_token_ids == model_output.eos_token_id.T).any(axis=1, keepdim=True) + | (model_output.reasoning_index == 0) + ) & (model_output.need_think_end > 0) + sampled_token_ids = paddle.where( + stop_wo_think, + model_output.think_end_id, + sampled_token_ids, + ) + paddle.assign( + paddle.where( + stop_wo_think, + model_output.need_think_end - 1, + model_output.need_think_end, + ), + model_output.need_think_end, + ) + + # 1. 
Set stop value
+    paddle.assign(
+        paddle.where(
+            model_output.stop_flags,
+            model_output.step_idx,
+            model_output.step_idx + 1,
+        ),
+        model_output.step_idx,
+    )
+    length_cond = paddle.greater_equal(model_output.step_idx, model_output.max_dec_len)
+    paddle.assign(
+        paddle.logical_or(model_output.stop_flags, length_cond),
+        model_output.stop_flags,
+    )
+    set_stop_value_multi_ends(
+        sampled_token_ids,
+        model_output.stop_flags,
+        model_output.seq_lens_this_time,
+        model_output.eos_token_id,
+        model_output.next_tokens,
+        False,
+    )  # multi ends
+
+    # 2. Update the input buffer of the model
+    with paddle.framework._no_check_dy2st_diff():
+        # paddle.device.xpu.set_debug_level(0xa1)
+        if envs.ENABLE_V1_KVCACHE_SCHEDULER and not skip_save_output:
+            update_inputs_v1(
+                model_output.stop_flags,
+                model_output.not_need_stop,
+                model_output.seq_lens_this_time,
+                model_output.seq_lens_encoder,
+                model_output.seq_lens_decoder,
+                share_inputs["step_seq_lens_decoder"],
+                share_inputs["prompt_lens"],
+                sampled_token_ids,
+                model_output.input_ids,
+                share_inputs["block_tables"],
+                model_output.stop_nums,
+                model_output.next_tokens,
+                model_output.is_block_step,
+                block_size,
+            )
+        else:
+            update_inputs(
+                model_output.stop_flags,
+                model_output.not_need_stop,
+                model_output.seq_lens_this_time,
+                model_output.seq_lens_encoder,
+                model_output.seq_lens_decoder,
+                model_output.input_ids,
+                model_output.stop_nums,
+                sampled_token_ids,
+                model_output.is_block_step,
+            )
+        # paddle.device.xpu.set_debug_level(0)
+
+    # 3. Transmit the model's output and stop generation signal via message queue.
+    # In the future, we will abandon this approach.
+    if not skip_save_output:
+        save_output(
+            sampled_token_ids,
+            model_output.not_need_stop,
+            model_output.mp_rank,
+            False,  # use_ep
+        )
+
+
+def xpu_post_process_speculate(
+    model_output: ModelOutputData, save_each_rank: bool = False, skip_save_output: bool = False
+):
+    """ """
+    speculate_update_v3(
+        model_output.seq_lens_encoder,
+        model_output.seq_lens_decoder,
+        model_output.not_need_stop,
+        model_output.draft_tokens,
+        model_output.actual_draft_token_num,
+        model_output.accept_tokens,
+        model_output.accept_num,
+        model_output.stop_flags,
+        model_output.seq_lens_this_time,
+        model_output.is_block_step,
+        model_output.stop_nums,
+    )
+    if not skip_save_output:
+        speculate_save_output(
+            model_output.accept_tokens,
+            model_output.accept_num,
+            model_output.not_need_stop,
+            model_output.mp_rank,
+            save_each_rank,  # False
+        )
+
+    speculate_clear_accept_nums(model_output.accept_num, model_output.seq_lens_decoder)
+
+    # Update pre_ids through accept tokens
+    speculate_set_value_by_flags_and_idx(
+        model_output.pre_ids,
+        model_output.accept_tokens,
+        model_output.accept_num,
+        model_output.stop_flags,
+        model_output.seq_lens_this_time,
+        model_output.seq_lens_encoder,
+        model_output.seq_lens_decoder,
+        model_output.step_idx,
+    )
+
+
+def xpu_post_process(
+    sampler_output: SamplerOutput,
+    model_output: ModelOutputData,
+    share_inputs: Dict[str, paddle.Tensor],
+    block_size: int = 64,
+    speculative_decoding: bool = False,
+    skip_save_output: bool = False,
+) -> None:
+    """Post-processing steps after completing a single token generation."""
+    # save_each_rank is not supported on XPU, so default it to False
+    if speculative_decoding:
+        xpu_post_process_speculate(model_output, False, skip_save_output)
+    else:
+        xpu_post_process_normal(
+            sampler_output.sampled_token_ids, model_output, share_inputs, False, block_size, skip_save_output
+        )
+
+
+def 
_build_stream_transfer_data(zmq_client: ZmqIpcClient, output_tokens: np.ndarray, save_each_rank: bool, mp_rank: int): """Split output_tokens and output""" output_tokens = output_tokens.reshape([-1]).numpy() output_tokens_lists = np.split(output_tokens, output_tokens.shape[0]) diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py index 7d87033e88..33f552d759 100644 --- a/fastdeploy/output/token_processor.py +++ b/fastdeploy/output/token_processor.py @@ -299,7 +299,11 @@ def process_sampling_results(self): """ if current_platform.is_xpu(): - from fastdeploy.model_executor.ops.xpu import get_output, get_output_ep + from fastdeploy.model_executor.ops.xpu import ( + get_output, + get_output_ep, + speculate_get_output, + ) elif current_platform.is_iluvatar(): from fastdeploy.model_executor.ops.iluvatar import get_output elif current_platform.is_gcu(): diff --git a/fastdeploy/spec_decode/__init__.py b/fastdeploy/spec_decode/__init__.py index 824d5da56a..86b0456182 100644 --- a/fastdeploy/spec_decode/__init__.py +++ b/fastdeploy/spec_decode/__init__.py @@ -17,6 +17,6 @@ from .base import Proposer from .mtp import MTPProposer -from .ngram import NgramProposer +# from .ngram import NgramProposer __all__ = ["Proposer", "MTPProposer", "NgramProposer"] diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py index ae496d7101..3e7d108e6d 100644 --- a/fastdeploy/spec_decode/mtp.py +++ b/fastdeploy/spec_decode/mtp.py @@ -24,7 +24,7 @@ from fastdeploy import envs from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request, RequestType -from fastdeploy.model_executor.forward_meta import ForwardMeta +from fastdeploy.model_executor.forward_meta import ForwardMeta, XPUForwardMeta from fastdeploy.model_executor.layers.attention import get_attention_backend from fastdeploy.model_executor.layers.attention.base_attention_backend import ( AttentionBackend, @@ -34,20 +34,35 @@ from fastdeploy.model_executor.layers.sample.sampler import MTPSampler from fastdeploy.model_executor.model_loader import get_model_loader from fastdeploy.model_executor.models import ModelForCasualLM -from fastdeploy.model_executor.ops.gpu import ( - draft_model_postprocess, - draft_model_preprocess, - draft_model_update, - eagle_get_hidden_states, - eagle_get_self_hidden_states, - hybrid_mtp_ngram, - mtp_save_first_token, - mtp_step_paddle, - share_external_data, - speculate_get_logits, - speculate_save_output_topk, -) +if paddle.is_compiled_with_xpu(): + from fastdeploy.model_executor.ops.xpu import ( + draft_model_postprocess, + draft_model_preprocess_v2, + draft_model_update, + eagle_get_hidden_states, + eagle_get_self_hidden_states, + # hybrid_mtp_ngram, + mtp_save_first_token, + mtp_step_paddle, + share_external_data, + ) +else: + from fastdeploy.model_executor.ops.gpu import ( + draft_model_postprocess, + draft_model_preprocess, + draft_model_update, + eagle_get_hidden_states, + eagle_get_self_hidden_states, + hybrid_mtp_ngram, + mtp_save_first_token, + mtp_step_paddle, + share_external_data, + speculate_get_logits, + speculate_save_output_topk, + ) + from fastdeploy.model_executor.pre_and_post_process import pre_process, rebuild_padding +from fastdeploy.model_executor.pre_and_post_process import xpu_pre_process, xpu_process_output from .base import Proposer @@ -90,7 +105,7 @@ def __init__( self._initialize_attn_backend() # Forward meta store the global meta information of the forward - self.forward_meta: ForwardMeta = None + self.forward_meta = None def 
_update_mtp_config(self, main_model): """ @@ -162,13 +177,10 @@ def initialize_kv_cache(self, main_model_num_blocks, profile: bool = False): and hasattr(self.quant_config, "kv_cache_quant_type") and self.quant_config.kv_cache_quant_type is not None ): - cache_type = "uint8" - kv_cache_quant_type = self.quant_config.kv_cache_quant_type + cache_type = "int8" # Get kv cache shape - kv_cache_shape = self.attn_backends[0].get_kv_cache_shape( - max_num_blocks=self.num_gpu_blocks, kv_cache_quant_type=kv_cache_quant_type - ) + kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(max_num_blocks=self.num_gpu_blocks) if not profile and ( self.cache_config.enable_prefix_caching or self.scheduler_config.splitwise_role != "mixed" ): @@ -202,7 +214,7 @@ def initialize_kv_cache(self, main_model_num_blocks, profile: bool = False): self.model_inputs["caches"] = list(self.cache_kvs.values()) for value in self.cache_kvs.values(): del value - paddle.device.cuda.empty_cache() + paddle.device.xpu.empty_cache() def _initialize_attn_backend( self, @@ -229,7 +241,8 @@ def _initialize_attn_backend( ) self.model_inputs["decoder_num_blocks_cpu"] = paddle.zeros_like( self.target_model_inputs["decoder_num_blocks_cpu"] - ).pin_memory() + ).cpu() + # ).pin_memory() self.model_inputs["decoder_num_blocks_device"] = paddle.zeros_like( self.target_model_inputs["decoder_num_blocks_device"] ) @@ -612,6 +625,51 @@ def _initialize_forward_meta(self, step_use_cudagraph: bool = False): self.forward_meta.step_use_cudagraph = step_use_cudagraph + def _initialize_forward_meta_xpu(self): + + # self.forward_meta = XPUForwardMeta( + # input_ids=self.model_inputs["input_ids"], + # ids_remove_padding=self.model_inputs["ids_remove_padding"], + # rotary_embs=self.model_inputs["rope_emb"], + # attn_backend=self.attn_backends[0], + # decoder_batch_ids=self.model_inputs["decoder_batch_ids"], + # decoder_tile_ids_per_batch=self.model_inputs["decoder_tile_ids_per_batch"], + # decoder_num_blocks_cpu=self.model_inputs["decoder_num_blocks_cpu"], + # decoder_num_blocks_device=self.model_inputs["decoder_num_blocks_device"], + # decoder_chunk_size_device=self.model_inputs["decoder_chunk_size_device"], + # max_len_tensor_cpu=self.model_inputs["max_len_tensor_cpu"], + # seq_lens_encoder=self.model_inputs["seq_lens_encoder"], + # seq_lens_decoder=self.model_inputs["seq_lens_decoder"], + # seq_lens_this_time=self.model_inputs["seq_lens_this_time"], + # batch_id_per_token=self.model_inputs["batch_id_per_token"], + # cu_seqlens_q=self.model_inputs["cu_seqlens_q"], + # cu_seqlens_k=self.model_inputs["cu_seqlens_k"], + # block_tables=self.model_inputs["block_tables"], + # caches=self.model_inputs["caches"], + # encoder_batch_ids=self.model_inputs["encoder_batch_ids"], + # encoder_tile_ids_per_batch=self.model_inputs["encoder_tile_ids_per_batch"], + # encoder_num_blocks_x_cpu=self.model_inputs["encoder_num_blocks_x_cpu"], + # kv_batch_ids=self.model_inputs["kv_batch_ids"], + # kv_tile_ids_per_batch=self.model_inputs["kv_tile_ids_per_batch"], + # kv_num_blocks_x_cpu=self.model_inputs["kv_num_blocks_x_cpu"], + # max_len_kv_cpu=self.model_inputs["max_len_kv_cpu"], + # ) + self.forward_meta.attn_backend = self.attn_backends[0] + + # Initialzie attention meta data + for attn_backend in self.attn_backends: + attn_backend.init_attention_metadata(self.forward_meta) + + # Mix ep in single node + if self.fd_config.parallel_config.use_ep and self.fd_config.scheduler_config.splitwise_role == "mixed": + only_decode_batch_list = [] + prefill_exists = 
self.exist_prefill() + paddle.distributed.all_gather_object(only_decode_batch_list, not prefill_exists) + only_decode_batch = all(only_decode_batch_list) + self.fd_config.model_config.moe_phase.phase = "decode" if only_decode_batch else "prefill" + + # TODO: support cudagraph + def exist_prefill(self): """ check whether prefill stage exist @@ -625,8 +683,9 @@ def _prepare_inputs(self, full_hidden_states): """ Prepare MTP inputs """ - use_v1_cache_scheduler = envs.ENABLE_V1_KVCACHE_SCHEDULER - draft_model_preprocess( + use_v1_cache_scheduler = bool(envs.ENABLE_V1_KVCACHE_SCHEDULER) + # draft_model_preprocess( + draft_model_preprocess_v2( self.model_inputs["draft_tokens"], self.model_inputs["input_ids"], self.model_inputs["stop_flags"], @@ -688,6 +747,7 @@ def _post_process(self, sampled_token_ids): self.max_model_len, self.model_inputs["substep"], ) + if self.role == "prefill": mtp_save_first_token( self.model_inputs["base_model_draft_tokens"], @@ -740,6 +800,17 @@ def _propose(self, step_use_cudagraph: bool = False): # Padding inputs for cuda graph self.padding_cudagraph_inputs() + self.forward_meta = xpu_pre_process( + self.model_inputs["input_ids"], + self.model_inputs["seq_lens_this_time"], + self.model_inputs, + True, + self.cache_config.block_size, + self.model_inputs["draft_tokens"], + self.model_inputs["seq_lens_encoder"], + self.model_inputs["seq_lens_decoder"], + ) + self._initialize_forward_meta_xpu() # Get sampling metadata self.sampling_metadata = SamplingMetadata( temperature=self.model_inputs["temperature"], @@ -767,38 +838,10 @@ def _propose(self, step_use_cudagraph: bool = False): previous_hidden_states=self.model_inputs["target_hidden_states"], forward_meta=self.forward_meta, ) - if self.forward_meta.step_use_cudagraph: - model_output = model_output[: self.real_token_num] - hidden_states = rebuild_padding( - model_output, - self.model_inputs["cu_seqlens_q"], - self.model_inputs["seq_lens_this_time"], - self.model_inputs["seq_lens_decoder"], - self.model_inputs["seq_lens_encoder"], - self.model_inputs["output_padding_offset"], - self.model_config.max_model_len, - self.model_inputs["first_token_hidden_states"], - self.enable_logprob if substep == 0 else False, - ) - + hidden_states = xpu_process_output(model_output, None, self.forward_meta, self.model_inputs) # 4. 
Compute logits, Sample logits = self.model.compute_logits(hidden_states) - if self.enable_logprob and substep == 0: - first_token_logits = self.model.compute_logits(self.model_inputs["first_token_hidden_states"]) - - speculate_get_logits( - self.model_inputs["draft_logits"], - self.model_inputs["next_token_num"], - self.model_inputs["batch_token_num"], - self.model_inputs["cu_next_token_offset"], - self.model_inputs["cu_batch_token_offset"], - logits, - first_token_logits, - self.model_inputs["seq_lens_this_time"], - self.model_inputs["seq_lens_encoder"], - ) - - sampled_token_ids, sampler_output = self.sampler( + sampled_token_ids = self.sampler( logits, self.sampling_metadata, self.max_model_len, diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index 5b1a886c05..db466dd0b8 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -38,264 +38,31 @@ ) from fastdeploy.model_executor.layers.rotary_embedding import get_rope, get_rope_3d from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata -from fastdeploy.model_executor.layers.sample.sampler import Sampler +from fastdeploy.model_executor.layers.sample.sampler import Sampler, SpeculativeSampler from fastdeploy.model_executor.model_loader import get_model_loader from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import ScatterOp +from fastdeploy.model_executor.pre_and_post_process import xpu_pre_process, xpu_post_process, xpu_process_output + from fastdeploy.model_executor.ops.xpu import ( - adjust_batch, - get_infer_param, - get_padding_offset, recover_decode_task, set_data_ipc, share_external_data, + speculate_clear_accept_nums, + speculate_get_output_padding_offset, + speculate_get_padding_offset, + speculate_get_seq_lens_output, + speculate_save_output, + speculate_set_value_by_flags_and_idx, update_inputs_v1, ) +from fastdeploy.spec_decode import MTPProposer from fastdeploy.utils import get_logger from fastdeploy.worker.model_runner_base import ModelRunnerBase -from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput +from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput, SamplerOutput logger = get_logger("xpu_model_runner", "xpu_model_runner.log") -def xpu_pre_process( - input_ids: paddle.Tensor, - seq_lens_this_time: int, - share_inputs: Dict, - use_speculate_method: bool, - block_size: int, - draft_tokens: Optional[paddle.Tensor] = None, - seq_lens_encoder: Optional[paddle.Tensor] = None, - seq_lens_decoder: Optional[paddle.Tensor] = None, -) -> XPUForwardMeta: - """ """ - max_len = input_ids.shape[1] - cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time, dtype="int32") - token_num = paddle.sum(seq_lens_this_time) - - ( - ids_remove_padding, - cum_offsets, - batch_id_per_token, - cu_seqlens_q, - cu_seqlens_k, - ) = get_padding_offset(input_ids, cum_offsets_now, token_num, seq_lens_this_time) - - share_inputs["ids_remove_padding"] = None # set this after adjust batch - share_inputs["cum_offsets"] = cum_offsets - share_inputs["batch_id_per_token"] = batch_id_per_token - share_inputs["cu_seqlens_q"] = cu_seqlens_q - share_inputs["cu_seqlens_k"] = cu_seqlens_k - - xpu_forward_meta = XPUForwardMeta( - input_ids=share_inputs["input_ids"], - ids_remove_padding=share_inputs["ids_remove_padding"], - rotary_embs=share_inputs["rope_emb"], - attn_backend=None, - seq_lens_encoder=share_inputs["seq_lens_encoder"], - seq_lens_decoder=share_inputs["seq_lens_decoder"], - 
seq_lens_this_time=share_inputs["seq_lens_this_time"], - cum_offsets=share_inputs["cum_offsets"], - batch_id_per_token=share_inputs["batch_id_per_token"], - cu_seqlens_q=share_inputs["cu_seqlens_q"], - cu_seqlens_k=share_inputs["cu_seqlens_k"], - block_tables=share_inputs["block_tables"], - caches=share_inputs["caches"], - ) - - ( - xpu_forward_meta.encoder_batch_map, - xpu_forward_meta.decoder_batch_map, - xpu_forward_meta.encoder_batch_idx, - xpu_forward_meta.decoder_batch_idx, - xpu_forward_meta.encoder_seq_lod, - xpu_forward_meta.decoder_seq_lod, - xpu_forward_meta.encoder_kv_lod, - xpu_forward_meta.prefix_len, - xpu_forward_meta.decoder_context_len, - xpu_forward_meta.decoder_context_len_cache, - xpu_forward_meta.prefix_block_tables, - xpu_forward_meta.encoder_batch_map_cpu, - xpu_forward_meta.decoder_batch_map_cpu, - xpu_forward_meta.encoder_batch_idx_cpu, - xpu_forward_meta.decoder_batch_idx_cpu, - xpu_forward_meta.encoder_seq_lod_cpu, - xpu_forward_meta.decoder_seq_lod_cpu, - xpu_forward_meta.encoder_kv_lod_cpu, - xpu_forward_meta.prefix_len_cpu, - xpu_forward_meta.decoder_context_len_cpu, - xpu_forward_meta.decoder_context_len_cache_cpu, - xpu_forward_meta.len_info_cpu, - ) = get_infer_param( - seq_lens_encoder, seq_lens_decoder, seq_lens_this_time, xpu_forward_meta.block_tables, block_size - ) - xpu_forward_meta.enc_batch = xpu_forward_meta.len_info_cpu[0] - xpu_forward_meta.dec_batch = xpu_forward_meta.len_info_cpu[1] - xpu_forward_meta.total_enc_len = xpu_forward_meta.len_info_cpu[2] - - adjusted_input = adjust_batch( - ids_remove_padding.reshape([-1, 1]), - cum_offsets, - xpu_forward_meta.encoder_seq_lod, - xpu_forward_meta.encoder_batch_idx, - xpu_forward_meta.decoder_batch_idx, - xpu_forward_meta.encoder_seq_lod_cpu, - xpu_forward_meta.encoder_batch_idx_cpu, - xpu_forward_meta.decoder_batch_idx_cpu, - xpu_forward_meta.enc_batch, - xpu_forward_meta.dec_batch, - None, # output_padding_offset - -1, # max_input_length - ) - - adjusted_input = adjusted_input.squeeze(1) - - share_inputs["ids_remove_padding"] = adjusted_input - xpu_forward_meta.ids_remove_padding = adjusted_input - return xpu_forward_meta - - -def xpu_process_output( - forward_output, - cum_offsets: paddle.Tensor, - xpu_forward_meta: XPUForwardMeta, -) -> paddle.Tensor: - """ """ - from fastdeploy.model_executor.ops.xpu import gather_next_token - - hiddden_states = gather_next_token( - forward_output, - cum_offsets, - xpu_forward_meta.encoder_seq_lod, - xpu_forward_meta.encoder_batch_map, - xpu_forward_meta.decoder_batch_map, - xpu_forward_meta.encoder_seq_lod_cpu, - xpu_forward_meta.encoder_batch_map_cpu, - xpu_forward_meta.decoder_batch_map_cpu, - xpu_forward_meta.enc_batch, - xpu_forward_meta.dec_batch, - None, # output_padding_offset - -1, # max_input_length - ) - return hiddden_states - - -def xpu_post_process( - sampled_token_ids: paddle.Tensor, - model_output: ModelOutputData, - share_inputs: Dict[str, paddle.Tensor], - block_size: int = 64, - skip_save_output: bool = False, -) -> None: - """ """ - from fastdeploy.model_executor.ops.xpu import ( - save_output, - set_stop_value_multi_ends, - update_inputs, - ) - - # handle vl: - if model_output.enable_thinking: - exists_think_end = sampled_token_ids == model_output.think_end_id - paddle.assign( - paddle.where( - exists_think_end, - model_output.need_think_end - 1, - model_output.need_think_end, - ), - model_output.need_think_end, - ) - - paddle.assign( - paddle.where( - model_output.need_think_end.cast("bool"), - model_output.reasoning_index - 1, - 
model_output.reasoning_index, - ), - model_output.reasoning_index, - ) - - stop_wo_think = ( - (sampled_token_ids == model_output.eos_token_id.T).any(axis=1, keepdim=True) - | (model_output.reasoning_index == 0) - ) & (model_output.need_think_end > 0) - sampled_token_ids = paddle.where( - stop_wo_think, - model_output.think_end_id, - sampled_token_ids, - ) - paddle.assign( - paddle.where( - stop_wo_think, - model_output.need_think_end - 1, - model_output.need_think_end, - ), - model_output.need_think_end, - ) - - # 1. Set stop value - paddle.assign( - paddle.where( - model_output.stop_flags, - model_output.step_idx, - model_output.step_idx + 1, - ), - model_output.step_idx, - ) - length_cond = paddle.greater_equal(model_output.step_idx, model_output.max_dec_len) - paddle.assign( - paddle.logical_or(model_output.stop_flags, length_cond), - model_output.stop_flags, - ) - set_stop_value_multi_ends( - sampled_token_ids, - model_output.stop_flags, - model_output.seq_lens_this_time, - model_output.eos_token_id, - model_output.next_tokens, - False, - ) # multi ends - - # 2. Update the input buffer of the model - with paddle.framework._no_check_dy2st_diff(): - if envs.ENABLE_V1_KVCACHE_SCHEDULER and not skip_save_output: - update_inputs_v1( - model_output.stop_flags, - model_output.not_need_stop, - model_output.seq_lens_this_time, - model_output.seq_lens_encoder, - model_output.seq_lens_decoder, - share_inputs["step_seq_lens_decoder"], - share_inputs["prompt_lens"], - sampled_token_ids, - model_output.input_ids, - share_inputs["block_tables"], - model_output.stop_nums, - model_output.next_tokens, - model_output.is_block_step, - block_size, - ) - else: - update_inputs( - model_output.stop_flags, - model_output.not_need_stop, - model_output.seq_lens_this_time, - model_output.seq_lens_encoder, - model_output.seq_lens_decoder, - model_output.input_ids, - model_output.stop_nums, - sampled_token_ids, - model_output.is_block_step, - ) - # 3. Transmit the model's output and stop generation signal via message queue. - # In the future, we will abandon this approach. 
- if not skip_save_output: - save_output( - sampled_token_ids, - model_output.not_need_stop, - model_output.mp_rank, - False, # use_ep - ) - def step_paddle( share_inputs: Dict[str, paddle.Tensor], @@ -375,9 +142,20 @@ def __init__( "fused_gemm_epilogue", ] + self.device_id = device_id + self.speculative_method = self.fd_config.speculative_config.method + self.speculative_decoding = self.speculative_method is not None + + # used by SamplingMetadata + self.enable_logprob = fd_config.model_config.enable_logprob + self.enable_early_stop = self.fd_config.early_stop_config.enable_early_stop + # Sampler - # TODU(lilujia): sync with GPU - self.sampler = Sampler(fd_config) + print("ch -- debug self.speculative_decoding:", self.speculative_decoding) + if not self.speculative_decoding: + self.sampler = Sampler(fd_config) + else: + self.sampler = SpeculativeSampler(fd_config) # Lazy initialize kv cache after model loading # self.kv_caches: list[paddle.Tensor] = [] @@ -386,7 +164,7 @@ def __init__( self.graph_opt_level = self.graph_opt_config.graph_opt_level self.use_cudagraph = False self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes - self.input_ids = paddle.zeros(self.scheduler_config.max_num_seqs, dtype="int32") + # self.input_ids = paddle.zeros(self.scheduler_config.max_num_seqs, dtype="int32") # Initialize share inputs self._init_share_inputs(self.fd_config.scheduler_config.max_num_seqs) @@ -395,6 +173,8 @@ def __init__( fill_value=4, dtype="int64", ).cpu() + # do not support chunk prefill + # self.restore_chunked_prefill_request = dict() # Initialize attention Backend # NOTE(gonshaotian): Currently, all attention layers share one attention backend instance. @@ -405,16 +185,11 @@ def __init__( # Forward meta store the global meta information of the forward self.forward_meta: ForwardMeta = None - def exist_prefill(self): - """ - check whether prefill stage exist - """ - if int(paddle.max(self.share_inputs["seq_lens_encoder"])) != 0: - return 1 - else: - return 0 + # # Postprocess Env params + # os.environ["INFERENCE_MSG_QUEUE_ID"] = str(self.parallel_config.engine_worker_queue_port) + # logger.info(f"queue id is {str(self.parallel_config.engine_worker_queue_port)}") - def insert_tasks_v1(self, req_dicts: List[Request]): + def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests): """ Process scheduler output tasks, used when ENABLE_V1_KVCACHE_SCHEDULER=1 req_dict: A list of Request dict @@ -495,7 +270,7 @@ def insert_tasks_v1(self, req_dicts: List[Request]): has_decode_task = True self.share_inputs["stop_flags"][idx : idx + 1] = False self.share_inputs["seq_lens_decoder"][idx : idx + 1] = prefill_start_index - self.share_inputs["seq_lens_this_time"][idx : idx + 1] = length + self.seq_lens_this_time_buffer[idx : idx + 1] = length self.share_inputs["seq_lens_encoder"][idx : idx + 1] = length self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = 0 self.share_inputs["prompt_lens"][idx : idx + 1] = len(input_ids) @@ -518,7 +293,7 @@ def insert_tasks_v1(self, req_dicts: List[Request]): logger.debug(f"Handle preempted request {request} at idx {idx}") self.share_inputs["block_tables"][idx : idx + 1, :] = -1 self.share_inputs["stop_flags"][idx : idx + 1] = True - self.share_inputs["seq_lens_this_time"][idx : idx + 1] = 0 + self.seq_lens_this_time_buffer[idx : idx + 1] = 0 self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0 self.share_inputs["seq_lens_encoder"][idx : idx + 1] = 0 self.share_inputs["is_block_step"][idx : idx + 1] = False @@ -528,6 +303,7 @@ def 
insert_tasks_v1(self, req_dicts: List[Request]): self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1) self.share_inputs["top_p"][idx : idx + 1] = request.get("top_p", 0.7) + # self.share_inputs["top_p"][idx : idx + 1] = 0 self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0) self.share_inputs["top_k_list"][idx] = request.get("top_k", 0) self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0) @@ -536,6 +312,10 @@ def insert_tasks_v1(self, req_dicts: List[Request]): self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0) self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0) self.share_inputs["presence_score"][idx : idx + 1] = request.get("presence_penalty", 0.0) + # self.share_inputs["temp_scaled_logprobs"][idx : idx + 1] = request.get("temp_scaled_logprobs", False) + # self.share_inputs["top_p_normalized_logprobs"][idx : idx + 1] = request.get( + # "top_p_normalized_logprobs", False + # ) self.share_inputs["min_dec_len"][idx : idx + 1] = request.get("min_tokens", 1) self.share_inputs["max_dec_len"][idx : idx + 1] = request.get( @@ -574,7 +354,12 @@ def insert_tasks_v1(self, req_dicts: List[Request]): if has_prefill_task or has_decode_task: self.share_inputs["not_need_stop"][0] = True - def insert_prefill_inputs(self, req_dicts: List[Request]): + # self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer[:num_running_requests] + self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer + if self.speculative_method in ["mtp"]: + self.proposer.insert_tasks_v1(req_dicts, num_running_requests) + + def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests): """Process inputs for prefill tasks and update share_inputs buffer""" req_len = len(req_dicts) for i in range(req_len): @@ -599,7 +384,7 @@ def insert_prefill_inputs(self, req_dicts: List[Request]): else: self.share_inputs["seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0) self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0) - self.share_inputs["seq_lens_this_time"][idx : idx + 1] = length + self.seq_lens_this_time_buffer[idx : idx + 1] = length self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = length self.share_inputs["seq_lens_encoder"][idx : idx + 1] = length self.share_inputs["prompt_lens"][idx : idx + 1] = length @@ -625,6 +410,7 @@ def get_attr_from_request(request, attr, default_value=None): assert len(request.eos_token_ids) == self.model_config.eos_tokens_lens self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1) self.share_inputs["top_p"][idx : idx + 1] = get_attr_from_request(request, "top_p", 0.7) + # self.share_inputs["top_p"][idx : idx + 1] = 0 self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0) self.share_inputs["top_k_list"][idx] = request.get("top_k", 0) self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0) @@ -640,6 +426,13 @@ def get_attr_from_request(request, attr, default_value=None): self.share_inputs["presence_score"][idx : idx + 1] = get_attr_from_request( request, "presence_penalty", 0.0 ) + # self.share_inputs["temp_scaled_logprobs"][idx : idx + 1] = get_attr_from_request( + # request, "temp_scaled_logprobs", False + # ) + # self.share_inputs["top_p_normalized_logprobs"][idx : idx + 1] = get_attr_from_request( + # request, "top_p_normalized_logprobs", False + # ) + 
self.share_inputs["min_dec_len"][idx : idx + 1] = request.get("min_tokens", 1) self.share_inputs["max_dec_len"][idx : idx + 1] = request.get( "max_tokens", self.model_config.max_model_len @@ -680,8 +473,13 @@ def get_attr_from_request(request, attr, default_value=None): ] = np.array(request.get("stop_token_ids"), dtype="int64") else: self.share_inputs["stop_seqs_len"][idx : idx + 1, :] = 0 - + # self.sampler.apply_logits_processor(idx, request.get("logits_processor"), prefill_tokens) self.share_inputs["not_need_stop"][0] = True + # self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer[:num_running_requests] + self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer + if self.speculative_method in ["mtp"]: + self.proposer.insert_prefill_inputs(req_dicts, num_running_requests) + def _init_share_inputs(self, max_num_seqs: int): """Initialize all share buffers for model inputs. @@ -734,7 +532,8 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["max_length"] = paddle.full( [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" ) - self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, 0, dtype="int32") + # self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, 0, dtype="int32") + self.seq_lens_this_time_buffer = paddle.full(max_num_seqs, 0, dtype="int32") self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["seq_lens_decoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["step_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") @@ -764,6 +563,29 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["ori_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["system_lens"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int32") + + self.share_inputs["ids_remove_padding"] = paddle.full( + [max_num_seqs * self.parallel_config.max_model_len], + 0, + dtype="int64", + ) + self.share_inputs["batch_id_per_token"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["cu_seqlens_q"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["cu_seqlens_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + # Declare AttentionBackend buffers + self.share_inputs["decoder_batch_ids"] = None + self.share_inputs["decoder_tile_ids_per_batch"] = None + self.share_inputs["decoder_num_blocks_cpu"] = None # Pinning Memory + self.share_inputs["decoder_num_blocks_device"] = None + self.share_inputs["decoder_chunk_size_device"] = None + self.share_inputs["max_len_tensor_cpu"] = None # CPU + self.share_inputs["encoder_batch_ids"] = None + self.share_inputs["encoder_tile_ids_per_batch"] = None + self.share_inputs["encoder_num_blocks_x_cpu"] = None # CPU + self.share_inputs["kv_batch_ids"] = None + self.share_inputs["kv_tile_ids_per_batch"] = None + self.share_inputs["kv_num_blocks_x_cpu"] = None # CPU + self.share_inputs["max_len_kv_cpu"] = None # CPU # Initialize rotary position embedding tmp_position_ids = paddle.arange(self.model_config.max_model_len).reshape((1, -1)) @@ -827,6 +649,46 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") self.share_inputs["enable_thinking"] = paddle.full(shape=[1], fill_value=True, dtype="bool") self.share_inputs["reasoning_index"] = 
paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") + + if self.speculative_decoding: + max_draft_token_num = self.speculative_config.num_speculative_tokens + self.share_inputs["input_ids_cpu"] = paddle.full( + shape=[max_num_seqs, self.parallel_config.max_model_len], + fill_value=1, + dtype="int64", + ).cpu() + self.share_inputs["accept_tokens"] = paddle.full( + shape=[max_num_seqs, max_draft_token_num + 1], + fill_value=0, + dtype="int64", + ) + self.share_inputs["accept_num"] = paddle.full(shape=[max_num_seqs], fill_value=0, dtype="int32") + self.share_inputs["draft_tokens"] = paddle.full( + shape=[max_num_seqs, max_draft_token_num + 1], + fill_value=0, + dtype="int64", + ) + + self.share_inputs["actual_draft_token_num"] = paddle.full( + shape=[max_num_seqs], + fill_value=max_draft_token_num, + dtype="int32", + ) + self.share_inputs["output_cum_offsets"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") + self.share_inputs["output_padding_offset"] = paddle.full( + shape=[max_num_seqs * (max_draft_token_num + 1)], + fill_value=0, + dtype="int32", + ) + # For V1_KVCACHE_SCHEDULER + self.share_inputs["step_draft_tokens"] = paddle.full( + shape=[max_num_seqs, max_draft_token_num + 1], + fill_value=0, + dtype="int64", + ) + self.share_inputs["step_seq_lens_this_time"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + + self.max_num_seqs = max_num_seqs def _prepare_inputs(self, is_dummy_run=False) -> None: """Prepare the model inputs""" @@ -841,22 +703,23 @@ def _prepare_inputs(self, is_dummy_run=False) -> None: self.share_inputs["is_block_step"], self.cache_config.block_size, ) + self.forward_meta = xpu_pre_process( self.share_inputs["input_ids"], self.share_inputs["seq_lens_this_time"], self.share_inputs, - use_speculate_method=False, - block_size=self.cache_config.block_size, - draft_tokens=None, + use_speculate_method=self.speculative_decoding, + block_size=self.parallel_config.block_size, + draft_tokens=self.share_inputs["draft_tokens"] if self.speculative_decoding else None, seq_lens_encoder=self.share_inputs["seq_lens_encoder"], seq_lens_decoder=self.share_inputs["seq_lens_decoder"], - ) + ) + # Update bad tokens len max_bad_tokens_len = paddle.max(self.share_inputs["bad_tokens_len"]) if self.enable_mm: # pos_emb_type is different in EB and VL self.forward_meta.pos_emb_type = "HALF_HEAD_DIM" - self.forward_meta.attn_backend = self.attn_backends[0] self.initialize_attention_backend() # Get sampling metadata @@ -893,6 +756,7 @@ def load_model(self) -> None: # 2. Load lora model # 3. 
Load drafter model(for speculative decoding) + self._init_speculative_proposer() def get_model(self) -> nn.Layer: """Get current model""" @@ -902,6 +766,7 @@ def initialize_attention_backend(self): """ Initialize attention meta data """ + self.forward_meta.attn_backend = self.attn_backends[0] # Initialzie attention meta data for attn_backend in self.attn_backends: attn_backend.init_attention_metadata(self.forward_meta) @@ -989,6 +854,39 @@ def initialize_attn_backend(self) -> None: int(self.model_config.num_key_value_heads) // self.parallel_config.tensor_parallel_size ) head_dim = self.model_config.head_dim + + # Initialize AttentionBackend buffers + encoder_block_shape_q = 64 + decoder_block_shape_q = 16 + decoder_step_token_num = self.speculative_config.num_speculative_tokens + 1 + decode_max_tile_size = self.max_num_seqs * np.ceil( + (decoder_step_token_num * np.ceil(num_heads / self.model_config.kv_num_heads)) / decoder_block_shape_q + ) + + group_size = np.ceil(num_heads / self.model_config.kv_num_heads) + encode_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil( + (self.model_config.max_model_len * group_size) / encoder_block_shape_q + ) + kv_max_tile_size = self.scheduler_config.max_num_seqs * np.ceil( + self.model_config.max_model_len / self.fd_config.cache_config.block_size + ) + self.share_inputs["decoder_batch_ids"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32") + self.share_inputs["decoder_tile_ids_per_batch"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32") + self.share_inputs["decoder_num_blocks_cpu"] = paddle.full([1], 0, dtype="int32").cpu() + # NOTE: (changwenbin) MLA kernel only needs decoder_num_blocks_device in place of GPU tensor, + # adapted to cudagraph. + self.share_inputs["decoder_num_blocks_device"] = paddle.full([1], 0, dtype="int32") + self.share_inputs["decoder_chunk_size_device"] = paddle.full([1], 64, dtype="int32") + self.share_inputs["max_len_tensor_cpu"] = paddle.full([8], 0, dtype="int32").cpu() + + self.share_inputs["encoder_batch_ids"] = paddle.full([int(encode_max_tile_size)], 0, dtype="int32") + self.share_inputs["encoder_tile_ids_per_batch"] = paddle.full([int(encode_max_tile_size)], 0, dtype="int32") + self.share_inputs["encoder_num_blocks_x_cpu"] = paddle.full([1], 0, dtype="int32").cpu() + + self.share_inputs["kv_batch_ids"] = paddle.full([int(kv_max_tile_size)], 0, dtype="int32") + self.share_inputs["kv_tile_ids_per_batch"] = paddle.full([int(kv_max_tile_size)], 0, dtype="int32") + self.share_inputs["kv_num_blocks_x_cpu"] = paddle.full([1], 0, dtype="int32").cpu() + self.share_inputs["max_len_kv_cpu"] = paddle.full([1], 0, dtype="int32").cpu() # Get the attention backend attn_cls = get_attention_backend() @@ -1004,6 +902,82 @@ def initialize_attn_backend(self) -> None: ) self.attn_backends.append(attn_backend) + def _init_speculative_proposer(self): + """ + Init speculative proposer + """ + if self.speculative_method == "ngram": + # xpu not support ngram proposer now + # self.proposer = NgramProposer(self.fd_config) + self.proposer = None + elif self.speculative_method == "mtp": + self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer + # print("ch -- debug self.share_inputs, before init MTP Proposer:", self.share_inputs) + self.proposer = MTPProposer( + self.fd_config, + self.get_model(), + self.local_rank, + self.device_id, + self.share_inputs, + ) + else: + self.proposer = None + + def _init_logits_processor(self, request): + """ + init logits processor for guided decoding + """ + assert 
self.guided_backend is not None, ( + "guided_backend is None, use " "--guided-decoding-backend to specify the backend at server startup." + ) + + if request.guided_json is not None: + schemata_key = ("json", request.guided_json) + elif request.guided_regex is not None: + schemata_key = ("regex", request.guided_regex) + elif request.guided_grammar is not None: + schemata_key = ("grammar", request.guided_grammar) + elif request.structural_tag is not None: + schemata_key = ("structural_tag", request.structural_tag) + + enable_thinking = request.get("enable_thinking", True) + enable_thinking = enable_thinking if enable_thinking is not None else True + + return ( + self.guided_backend.get_logits_processor( + schemata_key=schemata_key, + enable_thinking=enable_thinking, + ), + schemata_key, + ) + + def capture_model(self) -> None: + """ + Trigger CUDA Graph capture for all shapes in 'CudaGraphConfig.cudagraph_capture_sizes' + """ + logger.warn("XPU not support cuda graph currently") + pass + + @sot_warmup_guard(True) + def sot_warmup(self) -> None: + start_time = time.perf_counter() + for batch_size in self.sot_warmup_sizes: + self._dummy_run( + num_tokens=self.scheduler_config.max_num_batched_tokens, + batch_size=batch_size, + ) + logger.info(f"SOT warmup the model with the batch size:{batch_size}") + logger.info(f"SOT warmup took {time.perf_counter() - start_time} seconds") + + def exist_prefill(self): + """ + check whether prefill stage exist + """ + if int(paddle.max(self.share_inputs["seq_lens_encoder"])) != 0: + return 1 + else: + return 0 + def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int): """Set dummy prefill inputs to share_inputs""" full_length = min(num_tokens // batch_size, self.model_config.max_model_len - 10) @@ -1017,7 +991,7 @@ def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int): self.share_inputs["input_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length) self.share_inputs["prompt_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length) self.share_inputs["eos_token_id"][:] = np.array([2], dtype="int64").reshape(-1, 1) - self.share_inputs["seq_lens_this_time"][idx : idx + 1] = input_length + self.seq_lens_this_time_buffer[idx : idx + 1] = input_length self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = input_length self.share_inputs["seq_lens_encoder"][idx : idx + 1] = input_length @@ -1034,6 +1008,7 @@ def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int): self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange( idx * block_num, (idx + 1) * block_num, 1 ) + self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer def _dummy_run( self, @@ -1047,12 +1022,20 @@ def _dummy_run( num_tokens: Expected number of tokens generated """ self._dummy_prefill_inputs(num_tokens, batch_size) - + if self.speculative_method in ["mtp"]: + self.proposer.dummy_prefill_inputs( + num_tokens=num_tokens, + batch_size=batch_size, + expected_decode_len=1, + ) + print("ch -- debug after _dummy_prefill_inputs") while True: self.execute_model(is_dummy_run=True) if int((self.share_inputs["seq_lens_this_time"] > 0).sum()) == 0: break + print("ch -- debug _dummy_run finished.") + print("=========================================================") def _set_debug_level( self, debug_level: int = 0x1, model_forward_batch: Optional[List[Request]] = None, is_dummy_run: bool = False @@ -1112,7 +1095,7 @@ class at the server level, which is too granular for ModelRunner. # 1. Prepare inputs of model and decoder. 
@@ -1112,7 +1095,7 @@ class at the server level, which is too granular for ModelRunner.
         # 1. Prepare inputs of model and decoder.
         self._prepare_inputs(is_dummy_run=is_dummy_run)

         # 2. Padding inputs for CUDA graph

         # 3. Execute model
@@ -1121,17 +1104,26 @@ class at the server level, which is too granular for ModelRunner.
                 self.share_inputs["ids_remove_padding"], self.share_inputs["image_features"], self.forward_meta
             )
         else:
             model_output = self.model(
                 ids_remove_padding=self.share_inputs["ids_remove_padding"],
                 forward_meta=self.forward_meta,
             )
-
-        hidden_states = xpu_process_output(model_output, self.share_inputs["cum_offsets"], self.forward_meta)
+        hidden_states = xpu_process_output(model_output, None, self.forward_meta, self.share_inputs)

         # 4. Compute logits, Sample
         logits = self.model.compute_logits(hidden_states)
-        sampler_output = self.sampler(logits, self.sampling_metadata)
+        sampler_output = None
+        if not self.speculative_decoding:
+            sampler_output = self.sampler(logits, self.sampling_metadata)
+        else:
+            # The speculative sampler writes its results into share_inputs
+            # (draft_tokens, accept_tokens, ...) instead of returning them.
+            self.sampler(
+                logits,
+                self.sampling_metadata,
+                self.parallel_config.max_model_len,
+                self.share_inputs,
+            )
+        # TODO(chenhuan09): support tp/dp

         # 5. Speculative decode
         # 6. Post Process
@@ -1149,15 +1141,16 @@ class at the server level, which is too granular for ModelRunner.
             seq_lens_encoder=self.share_inputs["seq_lens_encoder"],
             seq_lens_decoder=self.share_inputs["seq_lens_decoder"],
             is_block_step=self.share_inputs["is_block_step"],
-            # speculative decoding
-            full_hidden_states=None,
+            full_hidden_states=model_output,  # speculative decoding
             msg_queue_id=self.parallel_config.msg_queue_id,
             mp_rank=self.local_rank,
             use_ep=self.parallel_config.use_ep,
-            draft_tokens=None,
-            actual_draft_token_num=None,
-            accept_tokens=None,
-            accept_num=None,
+            draft_tokens=(self.share_inputs["draft_tokens"] if self.speculative_decoding else None),
+            actual_draft_token_num=(
+                self.share_inputs["actual_draft_token_num"] if self.speculative_decoding else None
+            ),
+            accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None),
+            accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None),
             enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None),
             think_end_id=(self.model_config.think_end_id if self.enable_mm else -1),
             need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None),
@@ -1165,13 +1158,26 @@ class at the server level, which is too granular for ModelRunner.
             stop_token_ids=self.share_inputs["stop_seqs"],
             stop_seqs_len=self.share_inputs["stop_seqs_len"],
         )
+        # Skip saving outputs for dummy runs, and for the prefill role when
+        # MTP speculative decoding runs in splitwise mode.
+        skip_save_output = is_dummy_run
+        if self.speculative_config.method in ["mtp"] and self.scheduler_config.splitwise_role == "prefill":
+            skip_save_output = True
+        # TODO(chenhuan09): need check, support mtp
         xpu_post_process(
-            sampled_token_ids=sampler_output.sampled_token_ids,
+            sampler_output=sampler_output,
             model_output=model_output_data,
             share_inputs=self.share_inputs,
-            block_size=self.cache_config.block_size,
-            skip_save_output=is_dummy_run,
+            block_size=self.parallel_config.block_size,
+            speculative_decoding=self.speculative_decoding,
+            skip_save_output=skip_save_output,
         )
+        if self.speculative_decoding and self.speculative_method == "mtp":
+            self.proposer.run(full_hidden_states=model_output)
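Restating the corrected skip_save_output decision above as a standalone predicate (a sketch of the intended logic rather than code from the patch; the rationale in the comment is an assumption):

    def should_skip_save_output(is_dummy_run: bool, spec_method, splitwise_role: str) -> bool:
        # Outputs are not persisted for warmup/profiling dummy runs, nor on a
        # prefill-only instance when MTP speculative decoding runs splitwise
        # (presumably the decode instance is the one that reports tokens).
        return is_dummy_run or (spec_method == "mtp" and splitwise_role == "prefill")

    assert should_skip_save_output(True, None, "mixed")
    assert should_skip_save_output(False, "mtp", "prefill")
    assert not should_skip_save_output(False, "mtp", "decode")
    assert not should_skip_save_output(False, None, "prefill")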
         # 7. Update 'infer_seed' and step_paddle()
         self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
@@ -1182,14 +1188,23 @@ class at the server level, which is too granular for ModelRunner.
             self.cache_config.enc_dec_block_num,
         )

+        self.seq_lens_this_time_buffer.copy_(self.share_inputs["seq_lens_this_time"], False)
         return None

     @profile_run_guard(True)
     def profile_run(self) -> None:
         """Execute a forward pass with dummy inputs to profile the memory usage of the model"""
-        self.num_gpu_blocks = self.cache_config.total_block_num
-        self.initialize_kv_cache(profile=True)
+        self.num_gpu_blocks = self.parallel_config.total_block_num
+        self.initialize_kv_cache()
+        if self.speculative_method in ["mtp"]:
+            self.proposer.initialize_kv_cache(main_model_num_blocks=self.num_gpu_blocks, profile=True)

         self._dummy_run(
             num_tokens=int(self.scheduler_config.max_num_batched_tokens),
diff --git a/fastdeploy/worker/xpu_worker.py b/fastdeploy/worker/xpu_worker.py
index 0f84b1db08..626689dd13 100644
--- a/fastdeploy/worker/xpu_worker.py
+++ b/fastdeploy/worker/xpu_worker.py
@@ -75,6 +75,7 @@ def init_device(self):
         self.model_runner: XPUModelRunner = XPUModelRunner(
             fd_config=self.fd_config,
             device=self.device,
+            device_id=self.device_ids[self.local_rank % len(self.device_ids)],
             rank=self.rank,
-            device_id=self.device_id,
             local_rank=self.local_rank,
@@ -165,9 +166,9 @@ def preprocess_new_task(self, req_dicts: List[Request], num_running_requests: int
         and workers and modelrunners should not perceive it.
         """
         if envs.ENABLE_V1_KVCACHE_SCHEDULER:
-            self.model_runner.insert_tasks_v1(req_dicts=req_dicts)
+            self.model_runner.insert_tasks_v1(req_dicts=req_dicts, num_running_requests=num_running_requests)
         else:
-            self.model_runner.insert_prefill_inputs(req_dicts=req_dicts)
+            self.model_runner.insert_prefill_inputs(req_dicts=req_dicts, num_running_requests=num_running_requests)

     def graph_optimize_and_warm_up_model(self) -> None:
         """