Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions custom_ops/xpu_ops/setup_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,8 @@ def xpu_setup_ops():
if file.endswith(".cc"):
ops.append(os.path.join(root, file))

print(ops)

include_dirs = [
os.path.join(base_dir, "./"),
os.path.join(base_dir, "./plugin/include"),
Expand Down
144 changes: 144 additions & 0 deletions custom_ops/xpu_ops/src/ops/mtp/draft_model_preprocess_v2.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <memory>

#include <paddle/phi/backends/xpu/xpu_context.h>

#include "paddle/extension.h"
#include "paddle/phi/core/enforce.h"
#include "xpu/plugin.h"

namespace api = baidu::xpu::api;
// Host-side wrapper for the XPU draft-model preprocess (v2) plugin kernel
// used in multi-token-prediction (MTP) speculative decoding.
//
// Dispatches baidu::xpu::api::plugin::draft_model_preprocess_v2 on the
// current XPU (or a CPU context when the inputs live on CPU), then copies
// the updated `not_need_stop` flag back to its original (host) tensor.
// Most tensor arguments are updated in place by the kernel; see the
// SetInplaceMap in the corresponding PD_BUILD_OP registration.
void DraftModelPreprocessV2(const paddle::Tensor& draft_tokens,
                            const paddle::Tensor& input_ids,
                            const paddle::Tensor& stop_flags,
                            const paddle::Tensor& seq_lens_this_time,
                            const paddle::Tensor& seq_lens_encoder,
                            const paddle::Tensor& seq_lens_decoder,
                            const paddle::Tensor& step_idx,
                            const paddle::Tensor& not_need_stop,
                            const paddle::Tensor& is_block_step,
                            const paddle::Tensor& batch_drop,
                            const paddle::Tensor& pre_ids,
                            const paddle::Tensor& accept_tokens,
                            const paddle::Tensor& accept_num,
                            const paddle::Tensor& base_model_seq_lens_this_time,
                            const paddle::Tensor& base_model_seq_lens_encoder,
                            const paddle::Tensor& base_model_seq_lens_decoder,
                            const paddle::Tensor& base_model_step_idx,
                            const paddle::Tensor& base_model_stop_flags,
                            const paddle::Tensor& base_model_is_block_step,
                            const paddle::Tensor& base_model_draft_tokens,
                            const int num_model_step,
                            const bool truncate_first_token,
                            const bool splitwise_prefill,
                            const bool kvcache_scheduler_v1) {
  phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
  auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
  api::Context* ctx = static_cast<const phi::XPUContext*>(dev_ctx)->x_context();
  // CPU fallback: own the temporary context so it is released on every
  // return path (the previous bare `new` leaked it on each call).
  std::unique_ptr<api::Context> cpu_ctx;
  if (draft_tokens.is_cpu()) {
    cpu_ctx = std::make_unique<api::Context>(api::kCPU);
    ctx = cpu_ctx.get();
  }
  // Runtime batch size comes from the leading dim of seq_lens_this_time;
  // the remaining lengths are the padded trailing dims of their tensors.
  int real_bsz = seq_lens_this_time.shape()[0];
  int accept_tokens_len = accept_tokens.shape()[1];
  int input_ids_len = input_ids.shape()[1];
  int draft_tokens_len = draft_tokens.shape()[1];
  int pre_ids_len = pre_ids.shape()[1];
  int base_model_draft_tokens_len = base_model_draft_tokens.shape()[1];
  // not_need_stop lives on the host; stage a device-side copy the kernel
  // can write, then copy the result back after the launch.
  auto not_need_stop_gpu =
      not_need_stop.copy_to(seq_lens_this_time.place(), false);

  int r = baidu::xpu::api::plugin::draft_model_preprocess_v2(
      ctx,
      const_cast<int64_t*>(draft_tokens.data<int64_t>()),
      const_cast<int64_t*>(input_ids.data<int64_t>()),
      const_cast<bool*>(stop_flags.data<bool>()),
      const_cast<int*>(seq_lens_this_time.data<int>()),
      const_cast<int*>(seq_lens_encoder.data<int>()),
      const_cast<int*>(seq_lens_decoder.data<int>()),
      const_cast<int64_t*>(step_idx.data<int64_t>()),
      const_cast<bool*>(not_need_stop_gpu.data<bool>()),
      const_cast<bool*>(is_block_step.data<bool>()),
      const_cast<bool*>(batch_drop.data<bool>()),
      const_cast<int64_t*>(pre_ids.data<int64_t>()),
      accept_tokens.data<int64_t>(),
      accept_num.data<int>(),
      base_model_seq_lens_this_time.data<int>(),
      base_model_seq_lens_encoder.data<int>(),
      base_model_seq_lens_decoder.data<int>(),
      base_model_step_idx.data<int64_t>(),
      base_model_stop_flags.data<bool>(),
      base_model_is_block_step.data<bool>(),
      const_cast<int64_t*>(base_model_draft_tokens.data<int64_t>()),
      real_bsz,
      num_model_step,
      accept_tokens_len,
      draft_tokens_len,
      input_ids_len,
      base_model_draft_tokens_len,
      pre_ids_len,
      truncate_first_token,
      splitwise_prefill,
      kvcache_scheduler_v1);

  // Fixed message: this is the v2 entry point (was "draft_model_preprocess").
  PD_CHECK(r == 0, "xpu::plugin::draft_model_preprocess_v2 failed.");
  // Propagate the kernel's verdict back into the caller-visible host tensor.
  auto not_need_stop_cpu =
      not_need_stop_gpu.copy_to(not_need_stop.place(), false);
  bool* not_need_stop_data = const_cast<bool*>(not_need_stop.data<bool>());
  not_need_stop_data[0] = not_need_stop_cpu.data<bool>()[0];
}

// Paddle custom-op registration for draft_model_preprocess_v2.
// Inputs/Outputs are paired through SetInplaceMap: each listed "*_out"
// aliases its input tensor, so the kernel mutates those tensors in place
// and no new storage is allocated for them.
PD_BUILD_OP(draft_model_preprocess_v2)
    .Inputs({"draft_tokens",
             "input_ids",
             "stop_flags",
             "seq_lens_this_time",
             "seq_lens_encoder",
             "seq_lens_decoder",
             "step_idx",
             "not_need_stop",
             "is_block_step",
             "batch_drop",
             "pre_ids",
             "accept_tokens",
             "accept_num",
             "base_model_seq_lens_this_time",
             "base_model_seq_lens_encoder",
             "base_model_seq_lens_decoder",
             "base_model_step_idx",
             "base_model_stop_flags",
             "base_model_is_block_step",
             "base_model_draft_tokens"})
    // Note: is_block_step and base_model_draft_tokens are also written by the
    // kernel but are not declared as outputs here.
    .Outputs({"draft_tokens_out",
              "input_ids_out",
              "stop_flags_out",
              "seq_lens_this_time_out",
              "seq_lens_encoder_out",
              "seq_lens_decoder_out",
              "step_idx_out",
              "not_need_stop_out",
              "batch_drop_out",
              "pre_ids_out"})
    // Compile-time attributes; they arrive as trailing scalar arguments of
    // the kernel function, in this declared order.
    .Attrs({"num_model_step: int", "truncate_first_token: bool", "splitwise_prefill: bool", "kvcache_scheduler_v1: bool"})
    .SetInplaceMap({{"draft_tokens", "draft_tokens_out"},
                    {"input_ids", "input_ids_out"},
                    {"stop_flags", "stop_flags_out"},
                    {"seq_lens_this_time", "seq_lens_this_time_out"},
                    {"seq_lens_encoder", "seq_lens_encoder_out"},
                    {"seq_lens_decoder", "seq_lens_decoder_out"},
                    {"step_idx", "step_idx_out"},
                    {"not_need_stop", "not_need_stop_out"},
                    {"batch_drop", "batch_drop_out"},
                    {"pre_ids", "pre_ids_out"}})
    .SetKernelFn(PD_KERNEL(DraftModelPreprocessV2));
20 changes: 14 additions & 6 deletions custom_ops/xpu_ops/src/ops/mtp/speculate_verify.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ void SpeculateVerify(const paddle::Tensor &accept_tokens,
const paddle::Tensor &topp,
int max_seq_len,
int verify_window,
bool enable_topp) {
bool enable_topp,
bool benchmark_mode) {
auto bsz = accept_tokens.shape()[0];
int real_bsz = seq_lens_this_time.shape()[0];
auto max_draft_tokens = draft_tokens.shape()[1];
Expand Down Expand Up @@ -133,7 +134,8 @@ void SpeculateVerify(const paddle::Tensor &accept_tokens,
max_seq_len,
max_candidate_len,
verify_window,
prefill_one_step_stop);
prefill_one_step_stop,
benchmark_mode);
} else {
baidu::xpu::api::plugin::speculate_verify<false, true>(
ctx,
Expand Down Expand Up @@ -161,7 +163,8 @@ void SpeculateVerify(const paddle::Tensor &accept_tokens,
max_seq_len,
max_candidate_len,
verify_window,
prefill_one_step_stop);
prefill_one_step_stop,
benchmark_mode);
}
} else {
if (enable_topp) {
Expand Down Expand Up @@ -191,7 +194,8 @@ void SpeculateVerify(const paddle::Tensor &accept_tokens,
max_seq_len,
max_candidate_len,
verify_window,
prefill_one_step_stop);
prefill_one_step_stop,
benchmark_mode);
} else {
baidu::xpu::api::plugin::speculate_verify<false, false>(
ctx,
Expand Down Expand Up @@ -219,7 +223,8 @@ void SpeculateVerify(const paddle::Tensor &accept_tokens,
max_seq_len,
max_candidate_len,
verify_window,
prefill_one_step_stop);
prefill_one_step_stop,
benchmark_mode);
}
}
}
Expand All @@ -246,7 +251,10 @@ PD_BUILD_STATIC_OP(speculate_verify)
"accept_num_out",
"step_idx_out",
"stop_flags_out"})
.Attrs({"max_seq_len: int", "verify_window: int", "enable_topp: bool"})
.Attrs({"max_seq_len: int",
"verify_window: int",
"enable_topp: bool",
"benchmark_mode: bool"})
.SetInplaceMap({{"accept_tokens", "accept_tokens_out"},
{"accept_num", "accept_num_out"},
{"step_idx", "step_idx_out"},
Expand Down
4 changes: 3 additions & 1 deletion custom_ops/xpu_ops/src/ops/pybind/pybind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,8 @@ void SpeculateVerify(const paddle::Tensor& accept_tokens,
const paddle::Tensor& topp,
int max_seq_len,
int verify_window,
bool enable_topp);
bool enable_topp,
bool benchmark_mode);

void SpeculateClearAcceptNums(const paddle::Tensor& accept_num,
const paddle::Tensor& seq_lens_decoder);
Expand Down Expand Up @@ -552,6 +553,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
py::arg("max_seq_len"),
py::arg("verify_window"),
py::arg("enable_topp"),
py::arg("benchmark_mode"),
"Perform speculative verification for decoding");

m.def("speculate_clear_accept_nums",
Expand Down
35 changes: 34 additions & 1 deletion custom_ops/xpu_ops/src/plugin/include/xpu/plugin.h
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,8 @@ DLL_EXPORT int speculate_verify(Context* ctx,
const int max_seq_len,
const int max_candidate_len,
const int verify_window,
const bool prefill_one_step_stop);
const bool prefill_one_step_stop,
const bool benchmark_mode);

DLL_EXPORT int speculate_clear_accept_nums(Context* ctx,
int* accept_num,
Expand Down Expand Up @@ -457,6 +458,38 @@ DLL_EXPORT int rebuild_self_hidden_states(api::Context* ctx,
T* output,
int dim_embed,
int elem_cnt);
// Draft-model preprocess (v2) for MTP speculative decoding.
// Non-const pointers are updated in place; const pointers are read-only
// state from the base (target) model. The trailing `*_len` ints are the
// padded trailing dimensions of the corresponding [bsz, len] buffers.
// Returns 0 on success, non-zero on failure (checked by the caller).
// NOTE(review): exact per-flag semantics (truncate_first_token,
// splitwise_prefill, kvcache_scheduler_v1) live in the kernel
// implementation — confirm there before relying on them.
DLL_EXPORT int draft_model_preprocess_v2(api::Context* ctx,
                                         int64_t* draft_tokens,
                                         int64_t* input_ids,
                                         bool* stop_flags,
                                         int* seq_lens_this_time,
                                         int* seq_lens_encoder,
                                         int* seq_lens_decoder,
                                         int64_t* step_idx,
                                         bool* not_need_stop,
                                         bool* is_block_step,
                                         bool* batch_drop,
                                         int64_t* pre_ids,
                                         const int64_t* accept_tokens,
                                         const int* accept_num,
                                         const int* base_model_seq_lens_this_time,
                                         const int* base_model_seq_lens_encoder,
                                         const int* base_model_seq_lens_decoder,
                                         const int64_t* base_model_step_idx,
                                         const bool* base_model_stop_flags,
                                         const bool* base_model_is_block_step,
                                         int64_t* base_model_draft_tokens,
                                         const int bsz,
                                         const int num_model_step,
                                         const int accept_tokens_len,
                                         const int draft_tokens_len,
                                         const int input_ids_len,
                                         const int base_model_draft_tokens_len,
                                         const int pre_ids_len,
                                         const bool truncate_first_token,
                                         const bool splitwise_prefill,
                                         const bool kvcache_scheduler_v1);

/*--------------------------------------- MTP end --------------------------------------------*/

} // namespace plugin
Expand Down
Loading