[NPU] Update npu-llama model #9784

Open · wants to merge 1 commit into base: develop
2 changes: 1 addition & 1 deletion llm/predict/export_model.py
@@ -83,7 +83,7 @@ def main():
         export_args.output_path = os.path.join(export_args.output_path, f"rank_{tensor_parallel_rank}")

     if predictor_args.device == "npu":
-        from npu.llama.export_utils import process_params
+        from llm.npu.llama.export_utils import process_params

         process_params(os.path.join(export_args.output_path, predictor_args.model_prefix))

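For context, the post-export parameter rewrite only runs for NPU exports. Below is a minimal sketch of that dispatch using the corrected import path from this change; the wrapper name maybe_postprocess_npu_export is illustrative (not part of the PR), and the import assumes the repository root is on sys.path.

import os


def maybe_postprocess_npu_export(device: str, output_path: str, model_prefix: str) -> None:
    # Exports for devices other than NPU need no extra handling.
    if device != "npu":
        return
    # process_params is resolved from the llm package root, matching the corrected import in this PR.
    from llm.npu.llama.export_utils import process_params

    process_params(os.path.join(output_path, model_prefix))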
25 changes: 17 additions & 8 deletions paddlenlp/experimental/transformers/fused_transformer_layers.py
@@ -2290,14 +2290,23 @@
         seq_lens_decoder = kwargs.get("seq_lens_decoder", None)
         max_input_length = kwargs.get("max_input_length", -1)
         output_padding_offset = kwargs.get("output_padding_offset", None)  # only used in speculative decoding
-        out = rebuild_padding_v2(
-            multi_block_output,
-            cum_offsets,
-            seq_lens_decoder,
-            seq_lens_encoder,
-            output_padding_offset,
-            max_input_length,
-        )
+        if paddle.is_compiled_with_custom_device("npu"):
+            out = rebuild_padding_v2(
+                multi_block_output,
+                cum_offsets,
+                seq_lens_decoder,
+                seq_lens_encoder,
+                max_input_length,
+            )
+        else:
+            out = rebuild_padding_v2(
+                multi_block_output,
+                cum_offsets,
+                seq_lens_decoder,
+                seq_lens_encoder,
+                output_padding_offset,
+                max_input_length,
+            )

         return out

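On NPU the rebuild_padding_v2 call omits output_padding_offset (which is only used for speculative decoding), so the call is dispatched on the compiled device. A minimal sketch of that dispatch as a standalone helper, assuming rebuild_padding_v2 is importable from paddlenlp_ops as it is elsewhere in this file; _rebuild_padding is illustrative, not part of the PR.

import paddle
from paddlenlp_ops import rebuild_padding_v2


def _rebuild_padding(multi_block_output, cum_offsets, seq_lens_decoder,
                     seq_lens_encoder, output_padding_offset, max_input_length):
    if paddle.is_compiled_with_custom_device("npu"):
        # NPU path: the op is called without output_padding_offset.
        return rebuild_padding_v2(
            multi_block_output, cum_offsets, seq_lens_decoder,
            seq_lens_encoder, max_input_length,
        )
    # Default path keeps the original argument list, including output_padding_offset.
    return rebuild_padding_v2(
        multi_block_output, cum_offsets, seq_lens_decoder,
        seq_lens_encoder, output_padding_offset, max_input_length,
    )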
129 changes: 94 additions & 35 deletions paddlenlp/experimental/transformers/generation_utils.py
@@ -692,26 +692,53 @@
         ):
             step_idx = model_kwargs["step_idx"]
             logits = paddle.cast(outputs, paddle.float32)
-            from paddlenlp_ops import set_preids_token_penalty_multi_scores
-
-            set_preids_token_penalty_multi_scores(
-                model_kwargs["pre_ids"],
-                model_kwargs["input_ids"],
-                model_kwargs["seq_lens_encoder"],
-                model_kwargs["seq_lens_decoder"],
-                step_idx,
-                model_kwargs["stop_flags"],
-                logits,
-                penalty_score,
-                frequency_score,
-                presence_score,
-                temperature,
-                model_kwargs["bad_tokens"],
-                step_idx,
-                model_kwargs["min_dec_len"],
-                eos_token_id,
-            )
+            if paddle.is_compiled_with_custom_device("npu"):
+                from paddlenlp_ops import (
+                    get_token_penalty_multi_scores_v2,
+                    set_value_by_flags_and_idx_v2,
+                )
+
+                set_value_by_flags_and_idx_v2(
+                    model_kwargs["pre_ids"],
+                    model_kwargs["input_ids"],
+                    model_kwargs["seq_lens_this_time"],
+                    model_kwargs["seq_lens_encoder"],
+                    model_kwargs["seq_lens_decoder"],
+                    step_idx,
+                    model_kwargs["stop_flags"],
+                )
+                logits = get_token_penalty_multi_scores_v2(
+                    model_kwargs["pre_ids"],
+                    logits,
+                    penalty_score,
+                    frequency_score,
+                    presence_score,
+                    temperature,
+                    model_kwargs["bad_tokens"],
+                    step_idx,
+                    model_kwargs["min_dec_len"],
+                    eos_token_id,
+                )
+            else:
+                from paddlenlp_ops import set_preids_token_penalty_multi_scores
+
+                set_preids_token_penalty_multi_scores(
+                    model_kwargs["pre_ids"],
+                    model_kwargs["input_ids"],
+                    model_kwargs["seq_lens_encoder"],
+                    model_kwargs["seq_lens_decoder"],
+                    step_idx,
+                    model_kwargs["stop_flags"],
+                    logits,
+                    penalty_score,
+                    frequency_score,
+                    presence_score,
+                    temperature,
+                    model_kwargs["bad_tokens"],
+                    step_idx,
+                    model_kwargs["min_dec_len"],
+                    eos_token_id,
+                )

             # sample
             probs = F.softmax(logits)
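On NPU the fused set_preids_token_penalty_multi_scores op is replaced by two separate custom ops: set_value_by_flags_and_idx_v2 first records the tokens generated so far into pre_ids, and get_token_penalty_multi_scores_v2 then rescales the logits. A compact sketch of that two-step path, factored into an illustrative helper (penalize_logits_npu is not part of the PR) and reusing the argument order shown in the hunk above:

from paddlenlp_ops import (
    get_token_penalty_multi_scores_v2,
    set_value_by_flags_and_idx_v2,
)


def penalize_logits_npu(logits, model_kwargs, step_idx, penalty_score,
                        frequency_score, presence_score, temperature, eos_token_id):
    # Step 1: write the tokens generated this step into pre_ids, guarded by stop_flags.
    set_value_by_flags_and_idx_v2(
        model_kwargs["pre_ids"],
        model_kwargs["input_ids"],
        model_kwargs["seq_lens_this_time"],
        model_kwargs["seq_lens_encoder"],
        model_kwargs["seq_lens_decoder"],
        step_idx,
        model_kwargs["stop_flags"],
    )
    # Step 2: apply repetition/frequency/presence penalties, temperature,
    # bad-token masking and min-length constraints to the logits.
    return get_token_penalty_multi_scores_v2(
        model_kwargs["pre_ids"],
        logits,
        penalty_score,
        frequency_score,
        presence_score,
        temperature,
        model_kwargs["bad_tokens"],
        step_idx,
        model_kwargs["min_dec_len"],
        eos_token_id,
    )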
@@ -727,23 +754,55 @@
             if self.config.tensor_parallel_degree > 1:
                 paddle.distributed.broadcast(next_tokens, 0)

-            from paddlenlp_ops import update_inputs_v2
-
-            update_inputs_v2(
-                model_kwargs["stop_flags"],
-                model_kwargs["step_idx"],
-                model_kwargs["not_need_stop"],
-                model_kwargs["seq_lens_this_time"],
-                model_kwargs["seq_lens_encoder"],
-                model_kwargs["seq_lens_decoder"],
-                model_kwargs["max_dec_len"],
-                model_kwargs["input_ids"],
-                model_kwargs["stop_nums"],
-                next_tokens,
-                model_kwargs["is_block_step"],
-                eos_token_id,
-                model_kwargs["next_tokens"],
-            )
+            if paddle.is_compiled_with_custom_device("npu"):
+                step_idx = paddle.where(
+                    model_kwargs["stop_flags"], model_kwargs["step_idx"], model_kwargs["step_idx"] + 1
+                )
+                paddle.assign(step_idx, model_kwargs["step_idx"])
+                length_cond = paddle.greater_equal(step_idx, model_kwargs["max_dec_len"])
+                stop_flags = paddle.logical_or(model_kwargs["stop_flags"], length_cond)
+                from paddlenlp_ops import set_stop_value_multi_ends_v2
+
+                set_stop_value_multi_ends_v2(
+                    next_tokens,
+                    stop_flags,
+                    model_kwargs["seq_lens_this_time"],
+                    eos_token_id,
+                    model_kwargs["next_tokens"],
+                )  # multi ends
+                paddle.assign(stop_flags, model_kwargs["stop_flags"])
+                # update inputs
+                from paddlenlp_ops import update_inputs
+
+                update_inputs(
+                    stop_flags,
+                    model_kwargs["not_need_stop"],
+                    model_kwargs["seq_lens_this_time"],
+                    model_kwargs["seq_lens_encoder"],
+                    model_kwargs["seq_lens_decoder"],
+                    model_kwargs["input_ids"],
+                    model_kwargs["stop_nums"],
+                    next_tokens,
+                    model_kwargs["is_block_step"],
+                )
+            else:
+                from paddlenlp_ops import update_inputs_v2
+
+                update_inputs_v2(
+                    model_kwargs["stop_flags"],
+                    model_kwargs["step_idx"],
+                    model_kwargs["not_need_stop"],
+                    model_kwargs["seq_lens_this_time"],
+                    model_kwargs["seq_lens_encoder"],
+                    model_kwargs["seq_lens_decoder"],
+                    model_kwargs["max_dec_len"],
+                    model_kwargs["input_ids"],
+                    model_kwargs["stop_nums"],
+                    next_tokens,
+                    model_kwargs["is_block_step"],
+                    eos_token_id,
+                    model_kwargs["next_tokens"],
+                )

             from paddlenlp_ops import save_output

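On NPU the per-sequence step counter and the length-based stop condition are maintained with plain Paddle ops before set_stop_value_multi_ends_v2 and update_inputs run, rather than inside the fused update_inputs_v2. A minimal sketch of just that bookkeeping, using the same tensors as in the hunk above:

import paddle


def advance_step_and_stop_flags(step_idx, stop_flags, max_dec_len):
    # Sequences that have already stopped keep their step index;
    # all others advance by one token.
    new_step_idx = paddle.where(stop_flags, step_idx, step_idx + 1)
    paddle.assign(new_step_idx, step_idx)
    # A sequence also stops once it reaches its maximum decode length.
    length_cond = paddle.greater_equal(new_step_idx, max_dec_len)
    new_stop_flags = paddle.logical_or(stop_flags, length_cond)
    return new_step_idx, new_stop_flags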
12 changes: 9 additions & 3 deletions paddlenlp/experimental/transformers/llama/modeling.py
@@ -1390,9 +1390,15 @@
         token_num = paddle.sum(seq_lens_this_time)
         from paddlenlp_ops import get_padding_offset_v2

-        ids_remove_padding, cum_offsets, padding_offset, cu_seqlens_q, cu_seqlens_k = get_padding_offset_v2(
-            input_ids, cum_offsets_now, token_num, seq_lens_this_time, draft_tokens, seq_lens_encoder
-        )
+        # whether speculative decoding or not
+        if draft_tokens is None and seq_lens_encoder is None:
+            ids_remove_padding, cum_offsets, padding_offset, cu_seqlens_q, cu_seqlens_k = get_padding_offset_v2(
+                input_ids, cum_offsets_now, token_num, seq_lens_this_time
+            )
+        else:
+            ids_remove_padding, cum_offsets, padding_offset, cu_seqlens_q, cu_seqlens_k = get_padding_offset_v2(
+                input_ids, cum_offsets_now, token_num, seq_lens_this_time, draft_tokens, seq_lens_encoder
+            )
         return ids_remove_padding, padding_offset, cum_offsets, cu_seqlens_q, cu_seqlens_k

     def forward(
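The padding-offset computation now picks the kernel arity from its inputs: the plain decoding path (no draft tokens) uses the four-argument call, while speculative decoding also passes draft_tokens and seq_lens_encoder. A minimal sketch of the same dispatch as a standalone helper; compute_padding_offsets is illustrative, not part of the PR, and the op import mirrors the one in the hunk above.

from paddlenlp_ops import get_padding_offset_v2


def compute_padding_offsets(input_ids, cum_offsets_now, token_num, seq_lens_this_time,
                            draft_tokens=None, seq_lens_encoder=None):
    # Plain decoding: the four-argument kernel is sufficient.
    if draft_tokens is None and seq_lens_encoder is None:
        return get_padding_offset_v2(input_ids, cum_offsets_now, token_num, seq_lens_this_time)
    # Speculative decoding: draft tokens and encoder lengths are needed as well.
    return get_padding_offset_v2(
        input_ids, cum_offsets_now, token_num, seq_lens_this_time, draft_tokens, seq_lens_encoder
    )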