From 6507bab0b96f32e88caf28d5d4bb37be244b32a2 Mon Sep 17 00:00:00 2001
From: Peng Guanwen
Date: Tue, 16 Jul 2024 23:13:25 +0800
Subject: [PATCH] [Core] Use numpy to speed up padded token processing (#6442)

---
 vllm/model_executor/sampling_metadata.py | 38 +++++++++++-------------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py
index ad5fb13176edc..c346cd0562867 100644
--- a/vllm/model_executor/sampling_metadata.py
+++ b/vllm/model_executor/sampling_metadata.py
@@ -2,6 +2,7 @@
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
 
+import numpy as np
 import torch
 
 from vllm.model_executor.layers.ops.sample import get_num_triton_sampler_splits
@@ -457,16 +458,20 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float],
         if do_penalties:
             prompt_max_len = max([len(tokens) for tokens in prompt_tokens],
                                  default=0)
-            prompt_padded_tokens = [
-                tokens + [vocab_size] * (prompt_max_len - len(tokens))
-                for tokens in prompt_tokens
-            ]
+            prompt_padded_tokens = np.full(
+                (len(prompt_tokens), prompt_max_len),
+                vocab_size,
+                dtype=np.int64)
+            for i, tokens in enumerate(prompt_tokens):
+                prompt_padded_tokens[i, :len(tokens)] = tokens
             output_max_len = max([len(tokens) for tokens in output_tokens],
                                  default=0)
-            output_padded_tokens = [
-                tokens + [vocab_size] * (output_max_len - len(tokens))
-                for tokens in output_tokens
-            ]
+            output_padded_tokens = np.full(
+                (len(output_tokens), output_max_len),
+                vocab_size,
+                dtype=np.int64)
+            for i, tokens in enumerate(output_tokens):
+                output_padded_tokens[i, :len(tokens)] = tokens
 
         temperatures_t = torch.tensor(
             temperatures,
@@ -517,18 +522,11 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float],
             pin_memory=pin_memory,
         )
         if do_penalties:
-            prompt_tensor = torch.tensor(
-                prompt_padded_tokens,
-                device="cpu",
-                dtype=torch.long,
-                pin_memory=pin_memory,
-            )
-            output_tensor = torch.tensor(
-                output_padded_tokens,
-                device="cpu",
-                dtype=torch.long,
-                pin_memory=pin_memory,
-            )
+            prompt_tensor = torch.from_numpy(prompt_padded_tokens)
+            output_tensor = torch.from_numpy(output_padded_tokens)
+            if pin_memory:
+                prompt_tensor = prompt_tensor.pin_memory()
+                output_tensor = output_tensor.pin_memory()
         else:
             prompt_tensor = None
             output_tensor = None
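
Reviewer note (not part of the patch): the snippet below is a minimal, self-contained sketch of the technique this change adopts, using made-up token lists and a made-up vocab_size. Handing nested Python lists to torch.tensor makes the constructor walk every Python int; pre-filling a numpy int64 buffer, copying each row with a slice assignment, and wrapping it with torch.from_numpy avoids that per-element traversal (from_numpy shares the buffer instead of copying), with pinning applied afterwards only when requested.

    import numpy as np
    import torch

    # Made-up inputs purely for illustration.
    vocab_size = 32000
    prompt_tokens = [[11, 7, 42], [3, 5], [8]]
    prompt_max_len = max([len(tokens) for tokens in prompt_tokens], default=0)

    # Old approach: pad with Python lists, then let torch.tensor copy
    # the values element by element.
    padded_lists = [
        tokens + [vocab_size] * (prompt_max_len - len(tokens))
        for tokens in prompt_tokens
    ]
    old_tensor = torch.tensor(padded_lists, device="cpu", dtype=torch.long)

    # New approach: pre-fill a numpy buffer, copy one row per sequence,
    # then wrap it as a tensor without an extra copy.
    padded_np = np.full((len(prompt_tokens), prompt_max_len),
                        vocab_size,
                        dtype=np.int64)
    for i, tokens in enumerate(prompt_tokens):
        padded_np[i, :len(tokens)] = tokens
    new_tensor = torch.from_numpy(padded_np)

    # Both strategies produce the same values.
    assert torch.equal(old_tensor, new_tensor)

The speedup named in the subject comes from skipping that Python-level traversal on what can be a large (num_seqs x max_len) padded matrix in from_lists.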