Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support of AutoModel #192

Merged
merged 17 commits into from
Dec 18, 2024
4 changes: 2 additions & 2 deletions QEfficient/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#
# -----------------------------------------------------------------------------

from QEfficient.base import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader
from QEfficient.base import QEFFAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader
from QEfficient.compile.compile_helper import compile
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
Expand All @@ -21,7 +21,7 @@
"export",
"compile",
"cloud_ai_100_exec_kv",
"QEffAutoModel",
"QEFFAutoModel",
"QEFFAutoModelForCausalLM",
"QEffAutoPeftModelForCausalLM",
"QEFFCommonLoader",
Expand Down
2 changes: 1 addition & 1 deletion QEfficient/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
# -----------------------------------------------------------------------------

from QEfficient.base.common import QEFFCommonLoader # noqa: F401
from QEfficient.transformers.models.modeling_auto import QEffAutoModel, QEFFAutoModelForCausalLM # noqa: F401
from QEfficient.transformers.models.modeling_auto import QEFFAutoModel, QEFFAutoModelForCausalLM # noqa: F401
2 changes: 1 addition & 1 deletion QEfficient/base/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def __init__(self, *args: Any, **kwds: Any) -> None:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> QEFFBaseModel:
"""
Downloads HuggingFace model if already doesn't exist locally, returns QEffAutoModel object based on type of model.
Downloads HuggingFace model if already doesn't exist locally, returns QEFFAutoModel object based on type of model.
"""
if not os.path.isdir(pretrained_model_name_or_path):
pretrained_model_name_or_path = login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs)
Expand Down
4 changes: 3 additions & 1 deletion QEfficient/base/modeling_qeff.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,9 @@ def _compile(

# Check if already compiled
compile_hash = compile_hash.hexdigest()[:16]
qpc_path = qpc_path.with_name(qpc_path.name + "-" + compile_hash)
compile_dir = qpc_path.with_name(qpc_path.name + "-" + compile_hash)
qpc_path = compile_dir / "qpc"
qpc_path.mkdir(parents=True, exist_ok=True)
if qpc_path.is_dir():
if (qpc_path / "programqpc.bin").is_file():
self.qpc_path = qpc_path
Expand Down
323 changes: 277 additions & 46 deletions QEfficient/transformers/models/modeling_auto.py

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion docs/source/hl_api.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@
:member-order: bysource
:members:
```

## `QEFFAutoModel`
```{eval-rst}
.. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModel
:member-order: bysource
:members:
```
## `QEffAutoPeftModelForCausalLM`
```{eval-rst}
.. autoclass:: QEfficient.peft.auto.QEffAutoPeftModelForCausalLM
Expand Down
12 changes: 6 additions & 6 deletions scripts/Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ pipeline {
steps {
sh '''
. ~/.bashrc
docker run --privileged -dit --name ${BUILD_TAG} -v ./:/efficient-transformers -v ${HF_PATH}:${DOCKER_HF_PATH} ${DOCKER_LATEST}:master_latest
docker exec ${BUILD_TAG} bash -c "
sudo docker run --privileged -dit --name ${BUILD_TAG} -v ./:/efficient-transformers -v ${HF_PATH}:${DOCKER_HF_PATH} ${DOCKER_LATEST}:master_latest
sudo docker exec ${BUILD_TAG} bash -c "
cd /efficient-transformers &&
apt update &&
apt install -y python3.10-venv &&
Expand All @@ -34,7 +34,7 @@ pipeline {
steps {
timeout(time: 10, unit: 'MINUTES') {
sh '''
docker exec ${BUILD_TAG} bash -c "
sudo docker exec ${BUILD_TAG} bash -c "
cd /efficient-transformers &&
. preflight_qeff/bin/activate &&
mkdir -p $PWD/Non_cli_qaic &&
Expand All @@ -50,7 +50,7 @@ pipeline {
steps {
timeout(time: 60, unit: 'MINUTES') {
sh '''
docker exec ${BUILD_TAG} bash -c "
sudo docker exec ${BUILD_TAG} bash -c "
cd /efficient-transformers &&
. preflight_qeff/bin/activate &&
mkdir -p $PWD/Non_qaic &&
Expand All @@ -68,7 +68,7 @@ pipeline {
steps {
timeout(time: 15, unit: 'MINUTES') {
sh '''
docker exec ${BUILD_TAG} bash -c "
sudo docker exec ${BUILD_TAG} bash -c "
cd /efficient-transformers &&
. preflight_qeff/bin/activate &&
mkdir -p $PWD/cli &&
Expand All @@ -88,7 +88,7 @@ pipeline {
script {
try {
sh '''
docker rm -f ${BUILD_TAG}
sudo docker rm -f ${BUILD_TAG}
sudo chown -R ubuntu .
'''
} catch (error) {
Expand Down
88 changes: 88 additions & 0 deletions tests/transformers/models/test_embedding_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------


import numpy as np
import onnxruntime as ort
import pytest
from transformers import AutoModel, AutoTokenizer

from QEfficient.transformers.models.modeling_auto import QEFFAutoModel
from QEfficient.utils.constants import Constants

embed_test_models = [
# model_name, architecture
"sentence-transformers/multi-qa-mpnet-base-cos-v1", # MPNetForMaskedLM
"BAAI/bge-reranker-v2-m3", # XLMRobertaForSequenceClassification
"BAAI/bge-small-en-v1.5", # BertModel
]


def check_embed_pytorch_vs_ort_vs_ai100(
    model_name: str,
    seq_len: int = Constants.CTX_LEN,
    n_layer: int = 1,
):
    """
    Validate that an embedding model produces matching outputs across runtimes:

    1. original PyTorch vs QEfficient-transformed PyTorch (must be identical),
    2. transformed PyTorch vs exported ONNX (tolerance 1e-5),
    3. ONNX vs compiled AI 100 binary (tolerance 1e-3).

    :param model_name: HuggingFace model card name to load.
    :param seq_len: Requested sequence length.
        NOTE(review): currently unused in this helper — kept for interface
        compatibility; confirm whether it should be forwarded to export/compile.
    :param n_layer: Number of hidden layers to instantiate (kept small for speed).
    :raises AssertionError: if any runtime pair diverges beyond its tolerance.
    """
    # Prepare input
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    inputs = tokenizer("My name is", return_tensors="pt")

    # Original PyTorch model, truncated to n_layer layers with eager attention
    # so the comparison stays fast and deterministic.
    pt_model = AutoModel.from_pretrained(
        model_name,
        num_hidden_layers=n_layer,
        attn_implementation="eager",
        trust_remote_code=True,
    )

    pt_outputs = pt_model(**inputs)
    pt_embeddings = pt_outputs[0][0].detach().numpy()

    # PyTorch transformed model: the transform must not change numerics at all.
    # mad is a mean of absolute differences (non-negative), so `<= 0` enforces
    # exact equality.
    qeff_model = QEFFAutoModel(pt_model)
    qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False)
    qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy()
    mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings))
    print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad)
    # Fixed error message: this comparison is PyTorch vs transformed PyTorch,
    # not ONNX vs PyTorch.
    assert mad <= 0, f"MAD is too high for PyTorch and transformed PyTorch model: {mad}"

    onnx_model = qeff_model.export()
    ort_session = ort.InferenceSession(str(onnx_model))

    # Prepare the inputs for ONNX Runtime
    onnx_inputs = {
        "input_ids": np.array(inputs["input_ids"]),
        "attention_mask": np.array(inputs["attention_mask"]),
    }
    # Run inference
    onnx_outputs = ort_session.run(None, onnx_inputs)

    # Compare transformed PyTorch and ONNX outputs.
    # pt_embeddings was already computed above; no need to recompute it.
    onnx_embeddings = onnx_outputs[0]
    mad = np.mean(np.abs(pt_embeddings - onnx_embeddings))
    print("Mad for onnx and PyTorch is ", mad)
    assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}"

    # Compile for AI 100 and compare against the ONNX output.
    qeff_model.compile(
        num_cores=14,
    )
    ai100_output = qeff_model.generate(inputs=inputs)

    # Fixed error message: this comparison is ONNX vs AI 100 output.
    mad = np.mean(np.abs(ai100_output - onnx_embeddings))
    print("Mad for onnx and AI 100 output is ", mad)
    assert mad <= 10**-3, f"MAD is too high for onnx and AI 100: {mad}"


# Runs only on hosts with Cloud AI 100 hardware (selected via the `on_qaic`
# pytest marker); parametrized over the embedding model cards listed in
# `embed_test_models` above.
@pytest.mark.on_qaic
@pytest.mark.parametrize("model_name", embed_test_models)
def test_embed_model_pytorch_vs_onnx_vs_ai100(model_name):
    """
    Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output.

    Delegates all checking to ``check_embed_pytorch_vs_ort_vs_ai100``.
    NOTE(review): ``seq_len=32`` is passed but the helper does not currently
    read it — confirm whether it should be wired through.
    """
    check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1)
Loading