diff --git a/.gitignore b/.gitignore
index e44df7d..c7f22ad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,4 +160,5 @@ cython_debug/
 
 #.idea/
 /.vscode/
-/logs/
\ No newline at end of file
+/logs/
+/mteb_results/
\ No newline at end of file
diff --git a/benchmark/bench_mteb.py b/benchmark/bench_mteb.py
new file mode 100644
index 0000000..11ca124
--- /dev/null
+++ b/benchmark/bench_mteb.py
@@ -0,0 +1,40 @@
+from typing import Any
+import mteb
+import numpy as np
+import torch
+from tqdm import tqdm
+
+# Define the sentence-transformers model name
+model_name = "gte-Qwen2-7B-instruct"
+
+import openai
+
+client = openai.OpenAI(
+    base_url = "http://127.0.0.1:8000/v1",
+    api_key="",
+)
+
+def batched(data, batch_size):
+    for i in range(0, len(data), batch_size):
+        yield data[i:i+batch_size]
+
+class EmbeddingModel():
+    def encode(
+        self, sentences: list[str], **kwargs: Any
+    ) -> torch.Tensor | np.ndarray:
+        ret = []
+        for sent in tqdm(batched(sentences, 16), total=len(sentences)//16):
+            response = client.embeddings.create(
+                model=model_name,
+                input=sent,
+                encoding_format="float",
+            )
+            for embed_data in response.data:
+                embed_final = embed_data.embedding
+                ret.append(embed_final)
+        return ret
+
+model = EmbeddingModel()
+tasks = mteb.get_tasks(tasks=["Banking77Classification"])
+evaluation = mteb.MTEB(tasks=tasks)
+results = evaluation.run(model, output_folder=f"mteb_results/{model_name}")
\ No newline at end of file
diff --git a/langport/model/executor/embedding/huggingface.py b/langport/model/executor/embedding/huggingface.py
index b953c6d..9f008ad 100644
--- a/langport/model/executor/embedding/huggingface.py
+++ b/langport/model/executor/embedding/huggingface.py
@@ -61,20 +61,21 @@ def __init__(
                 model_path, device, num_gpus, max_gpu_memory, quantization, cpu_offloading,
                 deepspeed, gptq, group_size, trust_remote_code, offload_folder
             )
+            if hasattr(self.model, "max_seq_length"):
+                self._context_len = self.model.max_seq_length
+            else:
+                self._context_len = 2048
         else:
             self.adapter, self.model, self.tokenizer = self.load_model(
                 model_path, device, num_gpus, max_gpu_memory, quantization, cpu_offloading,
                 deepspeed, gptq, group_size, trust_remote_code, offload_folder
             )
-
-        if hasattr(self.model, "max_seq_length"):
-            self._context_len = self.model.max_seq_length
-        elif hasattr(self.model.config, "max_sequence_length"):
-            self._context_len = self.model.config.max_sequence_length
-        elif hasattr(self.model.config, "max_position_embeddings"):
-            self._context_len = self.model.config.max_position_embeddings
-        else:
-            self._context_len = 2048
+            if hasattr(self.model.config, "max_sequence_length"):
+                self._context_len = self.model.config.max_sequence_length
+            elif hasattr(self.model.config, "max_position_embeddings"):
+                self._context_len = self.model.config.max_position_embeddings
+            else:
+                self._context_len = 2048
 
     def _record_call_time(self):
         self.last_call_time = time.time()
@@ -182,7 +183,7 @@ def inference(self, worker: "EmbeddingModelWorker"):
             else:
                 data = model(**encoded_prompts)
             # embeddings = torch.mean(data, dim=1)
-            embeddings = self._mean_pooling(data, encoded_prompts['attention_mask'])
+            embeddings = self._mean_pooling(data, encoded_prompts['attention_mask']).cpu()
         else:
             embeddings = model.encode(prompts)
 
diff --git a/langport/model/executor/huggingface.py b/langport/model/executor/huggingface.py
index c09e633..106e7cb 100644
--- a/langport/model/executor/huggingface.py
+++ b/langport/model/executor/huggingface.py
@@ -193,7 +193,7 @@ def load_sentence_transformer_model(
             kwargs["offload_folder"] = offload_folder
 
         model = SentenceTransformer(model_path, device=device, trust_remote_code=trust_remote_code, model_kwargs=kwargs)
-        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=trust_remote_code)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=trust_remote_code)
         return adapter, model, tokenizer
 
     def load_model(
diff --git a/langport/service/gateway/openai_api.py b/langport/service/gateway/openai_api.py
index 8ed75ff..87382b4 100644
--- a/langport/service/gateway/openai_api.py
+++ b/langport/service/gateway/openai_api.py
@@ -59,35 +59,50 @@ def redirect_model_name(model: str):
             break
     return model
 
+def check_and_log_response(response):
+    if isinstance(response, JSONResponse):
+        response_body = json.loads(response.body)
+        if "object" in response_body and response_body["object"] == "error":
+            if "object" in response_body and "message" in response_body and "code" in response_body:
+                logger.error(f"[{response_body['object']}] [{response_body['code']}] - {response_body['message']}")
+            else:
+                logger.error(response.body)
+
 @app.exception_handler(RequestValidationError)
 async def validation_exception_handler(request: Request, exc):
-    logger.error(f"Invalid request: {await request.body()}")
-    return create_bad_request_response(ErrorCode.VALIDATION_TYPE_ERROR, str(exc))
+    logger.error(f"Invalid request: {(await request.body()).decode('utf-8')}")
+    response = create_bad_request_response(ErrorCode.VALIDATION_TYPE_ERROR, str(exc))
+    check_and_log_response(response)
+    return response
 
 @app.get("/v1/models")
 async def models():
-    return await api_models(app.app_settings)
+    response = await api_models(app.app_settings)
+    check_and_log_response(response)
+    return response
 
 @app.post("/v1/chat/completions")
 async def chat_completions(request: ChatCompletionRequest):
-    logger.info(request.json())
+    logger.info(json.dumps(json.loads(request.model_dump_json()), ensure_ascii=False))
     request.model = redirect_model_name(request.model)
     response = await api_chat_completions(app.app_settings, request)
-
+    check_and_log_response(response)
     return response
 
 @app.post("/v1/completions")
 async def completions(request: CompletionRequest):
-    logger.info(request.json())
+    logger.info(json.dumps(json.loads(request.model_dump_json()), ensure_ascii=False))
     request.model = redirect_model_name(request.model)
     response = await api_completions(app.app_settings, request)
+    check_and_log_response(response)
     return response
 
 @app.post("/v1/embeddings")
 async def embeddings(request: EmbeddingsRequest):
-    logger.info(request.json())
+    logger.info(json.dumps(json.loads(request.model_dump_json()), ensure_ascii=False))
     request.model = redirect_model_name(request.model)
     response = await api_embeddings(app.app_settings, request)
+    check_and_log_response(response)
    return response
 
 if __name__ in ["__main__", "langport.service.gateway.openai_api"]: