Support of AutoModel #192

Merged
merged 17 commits on Dec 18, 2024
3 changes: 2 additions & 1 deletion QEfficient/__init__.py
@@ -8,7 +8,7 @@
from QEfficient.base import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader
from QEfficient.compile.compile_helper import compile
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
-from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
+from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_embed, cloud_ai_100_exec_kv
from QEfficient.peft import QEffAutoPeftModelForCausalLM
from QEfficient.transformers.transform import transform

@@ -21,6 +21,7 @@
"export",
"compile",
"cloud_ai_100_exec_kv",
"cloud_ai_100_exec_embed",
"QEffAutoModel",
"QEFFAutoModelForCausalLM",
"QEffAutoPeftModelForCausalLM",
131 changes: 130 additions & 1 deletion QEfficient/generation/text_generation_inference.py
@@ -17,7 +17,7 @@
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

from QEfficient.generation.cloud_infer import QAICInferenceSession
-from QEfficient.utils import padding_check_and_fix
+from QEfficient.utils import constants, padding_check_and_fix
from QEfficient.utils.logging_utils import logger


@@ -347,6 +347,68 @@ def cloud_ai_100_exec_kv(
return exec_info


def cloud_ai_100_exec_embed(
tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
qpc_path: str,
prompts: List[str],
device_id: List[int] = [0],
enable_debug_logs: bool = False,
) -> dict:
"""
This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards.
This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed.
If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped.

``Mandatory`` Args:
:tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Model tokenizer.
:qpc_path (str): Path to the saved generated binary file after compilation.
:prompt (str): Sample prompt for the model text generation.
``Optional`` Args:
:device_id (List[int]): Device IDs to be used for execution. If ``len(device_id) > 1``, it enables multiple card setup. If ``None``, auto-device-picker will be used. ``Defaults to None``.

Returns:
:dict: Output from the ``AI_100`` runtime.
"""
generate_feature = FeatureGeneration(
tokenizer=tokenizer,
qpc_path=qpc_path,
device_id=device_id,
enable_debug_logs=enable_debug_logs,
)

return generate_feature.generate(prompts=prompts)

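For reference, a minimal usage sketch of this API; the checkpoint name and QPC path below are placeholders, and it assumes the QPC was compiled for the same embedding model the tokenizer belongs to:

    from transformers import AutoTokenizer

    from QEfficient import cloud_ai_100_exec_embed

    # Placeholder checkpoint and path; substitute your own model and compiled QPC.
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    outputs = cloud_ai_100_exec_embed(
        tokenizer=tokenizer,
        qpc_path="/path/to/qpcs",
        prompts=["My name is", "Hello world"],
    )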

def pytorch_feature_generate(
model,
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
prompts: List[str],
seq_len: int = constants.Constants.CTX_LEN,
):
"""
Generates features from a list of text prompts using a PyTorch model and tokenizer.

``Mandatory`` Args:
model: The PyTorch model used for generating features.
tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): The tokenizer used to preprocess the prompts.
prompts (List[str]): A list of text prompts to be tokenized and processed.
``Optional`` Args:
seq_len (int, optional): The maximum sequence length for tokenization. Defaults to constants.Constants.CTX_LEN.

Returns:
List[torch.Tensor]: A list of output features generated by the model for each prompt.
"""

    outputs = []
    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len)
        outputs.append(model(**inputs))
    return outputs

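A short usage sketch for the helper above, assuming a Hugging Face ``AutoModel`` checkpoint (the "gpt2" name is illustrative) and that the helper is imported from this module:

    import torch
    from transformers import AutoModel, AutoTokenizer

    from QEfficient.generation.text_generation_inference import pytorch_feature_generate

    model = AutoModel.from_pretrained("gpt2")  # illustrative checkpoint
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token  # gpt2 ships without a pad token

    with torch.no_grad():  # inference only; no gradients needed
        features = pytorch_feature_generate(model, tokenizer, ["My name is"], seq_len=32)
    print(features[0].last_hidden_state.shape)  # -> torch.Size([1, 32, 768])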

class QEffTextGenerationBase:
def __init__(
self,
@@ -1070,3 +1132,70 @@ def generate(
perf_metrics=perf_metrics,
)
return latency_stats


class QEffFeatureGenerationBase:
def __init__(
self,
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
qpc_path: str,
ctx_len: Optional[int] = None,
device_id: Optional[List[int]] = None,
enable_debug_logs: bool = False,
) -> None:
self.ctx_len = ctx_len

# Load QPC
self._session = QAICInferenceSession(qpc_path, device_id, enable_debug_logs=enable_debug_logs)

        # Infer batch size and sequence length from the first input binding of the compiled QPC
        self._batch_size = self._session.bindings[0].dims[0]
        self._seq_len = self._session.bindings[0].dims[1]

self.tokenizer = tokenizer
self._set_tokenizer_params() # set tokenizer params

def _set_tokenizer_params(self):
"""
Sets the tokenizer parameters for the model.
"""
if self.tokenizer.padding_side != "right":
logger.warning("Please use padding_side='right' while initializing the tokenizer")
self.tokenizer.padding_side = "right"
if self.tokenizer.pad_token_id is None:
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

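The padding setup above matters because the compiled QPC expects fixed-shape inputs; a small standalone sketch of the equivalent tokenizer configuration (the "gpt2" checkpoint is illustrative, chosen because it ships without a pad token):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative checkpoint
    tokenizer.padding_side = "right"
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    batch = tokenizer(["My name is"], padding="max_length", max_length=8, return_tensors="np")
    print(batch["attention_mask"])  # padding zeros appear on the right, after the real tokens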

class FeatureGeneration:
    """
    Wrapper around ``QEffFeatureGenerationBase`` that runs feature generation for a
    list of prompts on a compiled QPC.
    """

def __init__(
self,
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
qpc_path: str,
seq_len: Optional[int] = None,
device_id: Optional[List[int]] = None,
enable_debug_logs: bool = False,
) -> None:
self._qaic_model = QEffFeatureGenerationBase(tokenizer, qpc_path, seq_len, device_id, enable_debug_logs)
self._batch_size = self._qaic_model._batch_size
self._tokenizer = self._qaic_model.tokenizer
self._seq_len = self._qaic_model._seq_len
self._session = self._qaic_model._session

def generate(self, prompts: List[str]):
outputs = []

for prompt in prompts:
inputs = self._tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=self._seq_len)

inputs = dict(
input_ids=inputs["input_ids"].numpy(),
attention_mask=inputs["attention_mask"].numpy(),
)
            # Pre-allocate the output buffer with the expected shape
            # (batch_size, seq_len, hidden_size); its contents are overwritten by session.run.
            output = {
                "output": np.random.randn(self._batch_size, self._seq_len, self._session.bindings[2].dims[2]).astype(
                    np.float32
                ),
            }
self._session.set_buffers(output)
output = self._session.run(inputs)
outputs.append(output)
return outputs
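The outputs returned above are token-level features of shape (batch_size, seq_len, hidden_size). A hedged sketch of mean-pooling them into one embedding per prompt, assuming the runtime exposes the result under the "output" key as in ``generate`` above:

    import numpy as np

    def mean_pool(output: dict, attention_mask: np.ndarray) -> np.ndarray:
        # Zero out padded positions, then average over the sequence axis.
        hidden = output["output"]  # (batch_size, seq_len, hidden_size)
        mask = attention_mask[..., None].astype(hidden.dtype)
        return (hidden * mask).sum(axis=1) / np.clip(mask.sum(axis=1), 1e-9, None)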