From 8b5db45507fb876ae0475cf7d30c0ac60daacb34 Mon Sep 17 00:00:00 2001
From: Co Tran
Date: Wed, 9 Oct 2024 11:42:05 -0700
Subject: [PATCH] Fix gpu dependency and only leverage onnx when GPU is
 available (#157)

* replacing appending instead of write

* fix eetq dependency

* gpu guard required eetq

* fix bug when gpu is available

* fix for gpu device

* reverse

* fix

* replace gpu -> cuda
---
 arch/src/consts.rs                       |  4 ++--
 model_server/app/guard_model_config.yaml |  2 +-
 model_server/app/load_models.py          | 26 ++++++++++++++----------
 3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/arch/src/consts.rs b/arch/src/consts.rs
index 48dd5494..07d38cf8 100644
--- a/arch/src/consts.rs
+++ b/arch/src/consts.rs
@@ -1,5 +1,5 @@
-pub const DEFAULT_EMBEDDING_MODEL: &str = "katanemo/bge-large-en-v1.5-onnx";
-pub const DEFAULT_INTENT_MODEL: &str = "katanemo/deberta-base-nli-onnx";
+pub const DEFAULT_EMBEDDING_MODEL: &str = "katanemo/bge-large-en-v1.5";
+pub const DEFAULT_INTENT_MODEL: &str = "katanemo/deberta-base-nli";
 pub const DEFAULT_PROMPT_TARGET_THRESHOLD: f64 = 0.8;
 pub const DEFAULT_HALLUCINATED_THRESHOLD: f64 = 0.1;
 pub const RATELIMIT_SELECTOR_HEADER_KEY: &str = "x-arch-ratelimit-selector";
diff --git a/model_server/app/guard_model_config.yaml b/model_server/app/guard_model_config.yaml
index f86c7083..d1b9ffa5 100644
--- a/model_server/app/guard_model_config.yaml
+++ b/model_server/app/guard_model_config.yaml
@@ -1,3 +1,3 @@
 jailbreak:
   cpu: "katanemo/Arch-Guard-cpu"
-  gpu: "katanemo/Arch-Guard-gpu"
+  gpu: "katanemo/Arch-Guard"
diff --git a/model_server/app/load_models.py b/model_server/app/load_models.py
index a13578ff..bdb50e1d 100644
--- a/model_server/app/load_models.py
+++ b/model_server/app/load_models.py
@@ -1,6 +1,6 @@
 import os
 import sentence_transformers
-from transformers import AutoTokenizer, pipeline
+from transformers import AutoTokenizer, AutoModel, pipeline
 import sqlite3
 import torch
 from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTModelForSequenceClassification  # type: ignore
@@ -18,16 +18,17 @@ def get_device():
     return device
 
 
-def load_transformers(
-    model_name=os.getenv("MODELS", "katanemo/bge-large-en-v1.5-onnx")
-):
+def load_transformers(model_name=os.getenv("MODELS", "katanemo/bge-large-en-v1.5")):
     print("Loading Embedding Model")
     transformers = {}
     device = get_device()
     transformers["tokenizer"] = AutoTokenizer.from_pretrained(model_name)
-    transformers["model"] = ORTModelForFeatureExtraction.from_pretrained(
-        model_name, device_map=device
-    )
+    if device != "cuda":
+        transformers["model"] = ORTModelForFeatureExtraction.from_pretrained(
+            model_name, file_name="onnx/model.onnx"
+        )
+    else:
+        transformers["model"] = AutoModel.from_pretrained(model_name, device_map=device)
     transformers["model_name"] = model_name
 
     return transformers
@@ -64,13 +65,16 @@ def load_guard_model(
 
 
 def load_zero_shot_models(
-    model_name=os.getenv("ZERO_SHOT_MODELS", "katanemo/deberta-base-nli-onnx")
+    model_name=os.getenv("ZERO_SHOT_MODELS", "katanemo/deberta-base-nli")
 ):
     zero_shot_model = {}
     device = get_device()
-    zero_shot_model["model"] = ORTModelForSequenceClassification.from_pretrained(
-        model_name
-    )
+    if device != "cuda":
+        zero_shot_model["model"] = ORTModelForSequenceClassification.from_pretrained(
+            model_name, file_name="onnx/model.onnx"
+        )
+    else:
+        zero_shot_model["model"] = AutoModel.from_pretrained(model_name)
     zero_shot_model["tokenizer"] = AutoTokenizer.from_pretrained(model_name)
 
     # create pipeline
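
For reference, below is a minimal, self-contained sketch of the CPU/CUDA gating this patch introduces: ONNX Runtime for inference when no GPU is present, plain PyTorch weights when CUDA is available. It is not part of the commit and makes two assumptions beyond what the diff shows: that the katanemo model repos ship their ONNX export at onnx/model.onnx (implied by the file_name argument above), and that the CUDA path should load AutoModelForSequenceClassification rather than the bare AutoModel used in load_zero_shot_models, since a zero-shot-classification pipeline needs the NLI classification head to produce entailment logits.

# Sketch only -- not part of the commit. Assumes the model repo ships an
# ONNX export at onnx/model.onnx, and swaps the bare AutoModel for
# AutoModelForSequenceClassification on the CUDA path so the zero-shot
# pipeline gets entailment logits from the NLI head.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForSequenceClassification

model_name = "katanemo/deberta-base-nli"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
if device != "cuda":
    # No GPU: run the pre-exported ONNX graph through onnxruntime.
    model = ORTModelForSequenceClassification.from_pretrained(
        model_name, file_name="onnx/model.onnx"
    )
else:
    # GPU: load the PyTorch weights directly onto the CUDA device.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, device_map=device
    )

classifier = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer)
print(classifier("show me the weather", candidate_labels=["weather", "smalltalk"]))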