From d0dbe9de73f8cc1ece56d37ad4a3996e1cbbd890 Mon Sep 17 00:00:00 2001 From: jstzwj <1103870790@qq.com> Date: Tue, 20 Jun 2023 02:48:26 +0800 Subject: [PATCH 1/8] optimum support --- .../model/executor/generation/huggingface.py | 1 + langport/model/executor/generation/optimum.py | 67 +++++++++++++++++ langport/model/executor/optimum.py | 47 ++++++++++++ .../server/optimum_generation_worker.py | 72 +++++++++++++++++++ langport/utils/__init__.py | 4 +- langport/version.py | 2 +- pyproject.toml | 3 +- 7 files changed, 192 insertions(+), 4 deletions(-) create mode 100644 langport/model/executor/generation/optimum.py create mode 100644 langport/model/executor/optimum.py create mode 100644 langport/service/server/optimum_generation_worker.py diff --git a/langport/model/executor/generation/huggingface.py b/langport/model/executor/generation/huggingface.py index ee09c49..df31578 100644 --- a/langport/model/executor/generation/huggingface.py +++ b/langport/model/executor/generation/huggingface.py @@ -208,6 +208,7 @@ def generate(self, inputs: BatchingTask, else: out = self.model( input_ids=decoder_input_ids, + attention_mask=torch.ones_like(decoder_input_ids, dtype=torch.long, device=decoder_input_ids.device), use_cache=self.model.generation_config.use_cache, past_key_values=past_key_values, ) diff --git a/langport/model/executor/generation/optimum.py b/langport/model/executor/generation/optimum.py new file mode 100644 index 0000000..f15223b --- /dev/null +++ b/langport/model/executor/generation/optimum.py @@ -0,0 +1,67 @@ + +from typing import List, Optional +from langport.model.executor.generation.huggingface import BatchingTask, GenerationModel, GenerationWorkerStreamer +from langport.model.executor.optimum import OptimumExecutor +from langport.workers.generation_worker import GenerationModelWorker + + +class OptimumGenerationExecutor(OptimumExecutor): + def __init__( + self, + model_name: str, + model_path: str, + device: str, + num_gpus: int, + max_gpu_memory: Optional[str], + load_8bit: bool, + cpu_offloading: bool, + trust_remote_code: bool = False + ) -> None: + super(OptimumGenerationExecutor, self).__init__( + model_name=model_name, + model_path=model_path, + device=device, + num_gpus=num_gpus, + max_gpu_memory=max_gpu_memory, + load_8bit=load_8bit, + cpu_offloading=cpu_offloading + ) + self.adapter = None + self.model = None + self.tokenizer = None + self.adapter, self.model, self.tokenizer = self.load_model( + model_path, {} + ) + + # self.model = torch.compile(self.model) + + if hasattr(self.model.config, "max_sequence_length"): + self._context_len = self.model.config.max_sequence_length + elif hasattr(self.model.config, "max_position_embeddings"): + self._context_len = self.model.config.max_position_embeddings + else: + self._context_len = 2048 + + @property + def context_length(self) -> int: + return self._context_len + + def tokenize(self, text: str) -> List[int]: + input_ids = self.tokenizer(text).input_ids + return input_ids + + def inference(self, worker: "GenerationModelWorker"): + if not worker.online: + return + + tasks = worker.fetch_tasks() + + # batch inference + inputs = BatchingTask(tasks, self.tokenizer, self.device, self.model.config.is_encoder_decoder) + if inputs.batch_size == 0: + return + streamer = GenerationWorkerStreamer(inputs, self.tokenizer, worker) + model = GenerationModel(self.model) + max_new_tokens = max(inputs.max_tokens) + model.generate(inputs, max_new_tokens, streamer) + \ No newline at end of file diff --git a/langport/model/executor/optimum.py 
b/langport/model/executor/optimum.py new file mode 100644 index 0000000..16be5b5 --- /dev/null +++ b/langport/model/executor/optimum.py @@ -0,0 +1,47 @@ +import os +from typing import List, Optional +from langport.model.executor.base import LocalModelExecutor +from langport.model.model_adapter import get_model_adapter + +from optimum.onnxruntime import ORTModelForCausalLM +from transformers import AutoTokenizer + + +class OptimumExecutor(LocalModelExecutor): + def __init__( + self, + model_name: str, + model_path: str, + device: str, + num_gpus: int, + max_gpu_memory: Optional[str], + load_8bit: bool = False, + cpu_offloading: bool = False, + ) -> None: + super(OptimumExecutor, self).__init__( + model_name = model_name, + model_path = model_path, + device = device, + num_gpus = num_gpus, + max_gpu_memory = max_gpu_memory, + load_8bit = load_8bit, + cpu_offloading = cpu_offloading, + ) + + def load_model(self, model_path: str, from_pretrained_kwargs: dict): + adapter = get_model_adapter(model_path) + + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) + if os.path.exists(os.path.join(model_path, "decoder_model.onnx")): + export_onnx = False + else: + export_onnx = True + + use_gpu = self.device == "cuda" + if use_gpu: + provider = "CUDAExecutionProvider" + else: + provider = "CPUExecutionProvider" + model = ORTModelForCausalLM.from_pretrained(model_path, export=export_onnx, provider=provider) + + return adapter, model, tokenizer diff --git a/langport/service/server/optimum_generation_worker.py b/langport/service/server/optimum_generation_worker.py new file mode 100644 index 0000000..545a316 --- /dev/null +++ b/langport/service/server/optimum_generation_worker.py @@ -0,0 +1,72 @@ +import argparse +import os +import random +import uuid +import uvicorn + +from langport.workers.generation_worker import GenerationModelWorker +from langport.model.model_args import add_model_args +from langport.utils import build_logger +from langport.routers.server.generation_node import app + + +# We suggest that concurrency == batch * thread (thread == 4) +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=None) + parser.add_argument("--worker-address", type=str, default=None) + parser.add_argument("--neighbors", type=str, nargs="*", default=[]) + + add_model_args(parser) + parser.add_argument("--model-name", type=str, help="Optional display name") + parser.add_argument("--limit-model-concurrency", type=int, default=8) + parser.add_argument("--batch", type=int, default=1) + parser.add_argument("--stream-interval", type=int, default=2) + + args = parser.parse_args() + + node_id = str(uuid.uuid4()) + logger = build_logger("optimum_generation_worker", f"optimum_generation_worker_{node_id}.log") + logger.info(f"args: {args}") + + if args.gpus: + if len(args.gpus.split(",")) < args.num_gpus: + raise ValueError( + f"Larger --num-gpus ({args.num_gpus}) than --gpus {args.gpus}!" 
+ ) + os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus + + if args.port is None: + args.port = random.randint(21001, 29001) + + if args.worker_address is None: + args.worker_address = f"http://{args.host}:{args.port}" + + if args.model_name is None: + args.model_name = os.path.basename(os.path.normpath(args.model_path)) + + from langport.model.executor.generation.optimum import OptimumGenerationExecutor + executor = OptimumGenerationExecutor( + model_name=args.model_name, + model_path=args.model_path, + device=args.device, + num_gpus=args.num_gpus, + max_gpu_memory=args.max_gpu_memory, + load_8bit=args.load_8bit, + cpu_offloading=args.cpu_offloading, + trust_remote_code=args.trust_remote_code + ) + + app.node = GenerationModelWorker( + node_addr=args.worker_address, + node_id=node_id, + init_neighborhoods_addr=args.neighbors, + executor=executor, + limit_model_concurrency=args.limit_model_concurrency, + max_batch=args.batch, + stream_interval=args.stream_interval, + logger=logger, + ) + uvicorn.run(app, host=args.host, port=args.port, log_level="info") + diff --git a/langport/utils/__init__.py b/langport/utils/__init__.py index 3a3e4bd..59577ae 100644 --- a/langport/utils/__init__.py +++ b/langport/utils/__init__.py @@ -48,7 +48,7 @@ def build_logger(logger_name, logger_filename) -> logging.Logger: # Redirect stdout and stderr to loggers stdout_logger = logging.getLogger("stdout") - stdout_logger.setLevel(logging.INFO) + stdout_logger.setLevel(logging.DEBUG) sl = StreamToLogger(stdout_logger, logging.INFO) sys.stdout = sl @@ -59,7 +59,7 @@ def build_logger(logger_name, logger_filename) -> logging.Logger: # Get logger logger = logging.getLogger(logger_name) - logger.setLevel(logging.INFO) + logger.setLevel(logging.DEBUG) # Add a file handler for all loggers if handler is None: diff --git a/langport/version.py b/langport/version.py index 0c879f1..00360ed 100644 --- a/langport/version.py +++ b/langport/version.py @@ -1 +1 @@ -LANGPORT_VERSION = "0.2.0" +LANGPORT_VERSION = "0.3.0" diff --git a/pyproject.toml b/pyproject.toml index 7fb808b..4eb3fbd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "langport" -version = "0.2.0" +version = "0.3.0" description = "A large language model serving platform." 
readme = "README.md" requires-python = ">=3.8" @@ -22,6 +22,7 @@ dependencies = [ [project.optional-dependencies] dev = ["black==23.3.0", "pylint==2.8.2"] ggml = ["ctransformers"] +optimum = ["onnx", "onnxruntime", "optimum"] [project.urls] "Homepage" = "https://github.com/vtuber-plan/langport" From 55fb5ca4bcdabd8fc12246e6a370584d1dd5c475 Mon Sep 17 00:00:00 2001 From: jstzwj <1103870790@qq.com> Date: Wed, 21 Jun 2023 21:39:42 +0800 Subject: [PATCH 2/8] attention fix --- benchmark/bench_chat.py | 43 ++++++-- benchmark/bench_completions.py | 97 +++++++++++++++++++ .../model/executor/generation/huggingface.py | 5 +- 3 files changed, 136 insertions(+), 9 deletions(-) create mode 100644 benchmark/bench_completions.py diff --git a/benchmark/bench_chat.py b/benchmark/bench_chat.py index 16659d1..e0b0d17 100644 --- a/benchmark/bench_chat.py +++ b/benchmark/bench_chat.py @@ -1,20 +1,20 @@ import argparse import random import time +import traceback import openai import threading import tqdm +import datasets from concurrent.futures import ThreadPoolExecutor -def start_session(i: int, url: str, model: str, stream: bool=False, max_tokens: int=2048, random_len: int=0) -> str: +def start_session(i: int, url: str, model: str, dataset, stream: bool=False, max_tokens: int=2048, random_len: int=0) -> str: try: openai.api_key = "EMPTY" # Not support yet openai.api_base = url - if random_len != 0 : - messages = [{"role": "user", "content": "Hello! What is your name?" + "a" * random.randint(1, random_len)}] - else: - messages = [{"role": "user", "content": "Hello! What is your name?"}] + messages = dataset[i] + # create a chat completion response = openai.ChatCompletion.create( model=model, @@ -32,17 +32,44 @@ def start_session(i: int, url: str, model: str, stream: bool=False, max_tokens: total_tokens = response.usage.total_tokens completion_tokens = response.usage.completion_tokens except Exception as e: - print(e) + traceback.print_exc() return "", 0, 0 return out, total_tokens, completion_tokens + +def get_prompt(raw_dataset): + dataset = [] + for conversations in raw_dataset["conversations"]: + messages = [] + for data in conversations: + out_data = {"role": "system", "content": ""} + if data["user"] == "human": + out_data["role"] = "user" + if data["user"] == "gpt": + out_data["role"] = "assitant" + + out_data["content"] = data["text"] + messages.append(out_data) + + if messages[-1]["role"] == "gpt": + messages = messages[:-1] + + prompt = "\n###".join([msg["role"] + ": " + msg["content"] for msg in messages]) + "\n### assistant: " + if len(prompt) > 2048: + continue + dataset.append(messages) + return dataset + def main(args): + dataset = datasets.load_dataset("theblackcat102/sharegpt-english", split="train") + dataset = get_prompt(dataset) + tik = time.time() tasks = [] with ThreadPoolExecutor(max_workers=args.n_thread) as t: for i in range(args.total_task): - task = t.submit(start_session, i=i, url=args.url, model=args.model_name, stream=False, max_tokens=args.max_tokens, random_len=args.random_len) + task = t.submit(start_session, i=i, url=args.url, model=args.model_name, dataset=dataset, stream=False, max_tokens=args.max_tokens, random_len=args.random_len) tasks.append(task) results = [] @@ -63,7 +90,7 @@ def main(args): parser = argparse.ArgumentParser() parser.add_argument("--url", type=str, default="http://localhost:8000/v1") parser.add_argument("--model-name", type=str, default="vicuna") - parser.add_argument("--max-tokens", type=int, default=1024) + parser.add_argument("--max-tokens", type=int, 
default=512) parser.add_argument("--total-task", type=int, default=200) parser.add_argument("--n-thread", type=int, default=32) parser.add_argument("--random-len", type=int, default=0) diff --git a/benchmark/bench_completions.py b/benchmark/bench_completions.py new file mode 100644 index 0000000..811e535 --- /dev/null +++ b/benchmark/bench_completions.py @@ -0,0 +1,97 @@ +import argparse +import random +import time +import traceback +import openai +import threading +import tqdm +import datasets +from concurrent.futures import ThreadPoolExecutor + +def start_session(i: int, url: str, model: str, dataset, stream: bool=False, max_tokens: int=2048) -> str: + try: + openai.api_key = "EMPTY" # Not support yet + openai.api_base = url + + prompt = dataset[i] + # create a chat completion + response = openai.Completion.create( + model=model, + prompt=prompt, + stream=stream, + max_tokens=max_tokens, + temperature=0.9, + ) + # print the completion + if stream: + out = "" + for chunk in response: + out += str(chunk) + else: + out = response.choices[0].text + total_tokens = response.usage.total_tokens + completion_tokens = response.usage.completion_tokens + except Exception as e: + traceback.print_exc() + return "", 0, 0 + + return out, total_tokens, completion_tokens + +def get_prompt(raw_dataset): + dataset = [] + for conversations in raw_dataset["conversations"]: + messages = [] + for data in conversations: + out_data = {"role": "system", "content": ""} + if data["user"] == "human": + out_data["role"] = "user" + if data["user"] == "gpt": + out_data["role"] = "assitant" + + out_data["content"] = data["text"] + messages.append(out_data) + + if messages[-1]["role"] == "gpt": + messages = messages[:-1] + + prompt = "\n###".join([msg["role"] + ": " + msg["content"] for msg in messages]) + "\n### assistant: " + if len(prompt) > 2048: + continue + dataset.append(prompt) + return dataset + +def main(args): + dataset = datasets.load_dataset("theblackcat102/sharegpt-english", split="train") + dataset = get_prompt(dataset) + + tik = time.time() + tasks = [] + with ThreadPoolExecutor(max_workers=args.n_thread) as t: + for i in range(args.total_task): + task = t.submit(start_session, i=i, url=args.url, model=args.model_name, dataset=dataset, stream=False, max_tokens=args.max_tokens) + tasks.append(task) + + results = [] + for task in tqdm.tqdm(tasks): + results.append(task.result()) + + n_tokens = sum([ret[2] for ret in results]) + n_queries = sum([1 for ret in results if ret[2] != 0]) + time_seconds = time.time() - tik + print( + f"Successful number: {n_queries} / {args.total_task}. " + f"Time (Completion): {time_seconds}, n threads: {args.n_thread}, " + f"throughput: {n_tokens / time_seconds} tokens/s." + f"QPS: {n_queries / time_seconds} queries/s." 
+ ) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--url", type=str, default="http://localhost:8000/v1") + parser.add_argument("--model-name", type=str, default="vicuna") + parser.add_argument("--max-tokens", type=int, default=512) + parser.add_argument("--total-task", type=int, default=64) + parser.add_argument("--n-thread", type=int, default=4) + args = parser.parse_args() + + main(args) \ No newline at end of file diff --git a/langport/model/executor/generation/huggingface.py b/langport/model/executor/generation/huggingface.py index df31578..c0efe80 100644 --- a/langport/model/executor/generation/huggingface.py +++ b/langport/model/executor/generation/huggingface.py @@ -208,7 +208,10 @@ def generate(self, inputs: BatchingTask, else: out = self.model( input_ids=decoder_input_ids, - attention_mask=torch.ones_like(decoder_input_ids, dtype=torch.long, device=decoder_input_ids.device), + attention_mask=torch.ones( + inputs.batch_size, full_input_ids.shape[1] + step, + dtype=torch.long, device=decoder_input_ids.device + ), use_cache=self.model.generation_config.use_cache, past_key_values=past_key_values, ) From 46f2d7a106ff88ea38a3f1ba66701508899e84aa Mon Sep 17 00:00:00 2001 From: jstzwj <1103870790@qq.com> Date: Wed, 21 Jun 2023 21:49:26 +0800 Subject: [PATCH 3/8] add wizard and baichuan adapter --- .../data/conversation/settings/wizardlm.py | 13 +++++++++++++ langport/model/adapters/baichuan.py | 19 +++++++++++++++++++ langport/model/adapters/wizardlm.py | 19 +++++++++++++++++++ 3 files changed, 51 insertions(+) create mode 100644 langport/data/conversation/settings/wizardlm.py create mode 100644 langport/model/adapters/baichuan.py create mode 100644 langport/model/adapters/wizardlm.py diff --git a/langport/data/conversation/settings/wizardlm.py b/langport/data/conversation/settings/wizardlm.py new file mode 100644 index 0000000..852b23c --- /dev/null +++ b/langport/data/conversation/settings/wizardlm.py @@ -0,0 +1,13 @@ +from langport.data.conversation import ( + ConversationSettings, + SeparatorStyle, +) + + +# Vicuna v1.1 template +wizardlm = ConversationSettings( + name="wizardlm", + roles=("USER", "ASSISTANT"), + sep_style=SeparatorStyle.ADD_COLON_SINGLE, + sep=" ", +) \ No newline at end of file diff --git a/langport/model/adapters/baichuan.py b/langport/model/adapters/baichuan.py new file mode 100644 index 0000000..6286975 --- /dev/null +++ b/langport/model/adapters/baichuan.py @@ -0,0 +1,19 @@ +from typing import List, Optional +from langport.data.conversation import ConversationHistory, SeparatorStyle +from langport.data.conversation.conversation_settings import get_conv_settings +from langport.model.model_adapter import BaseAdapter + +class BaichuanAdapter(BaseAdapter): + """The model adapter for baichuan-inc/baichuan-7B""" + + def match(self, model_path: str): + return "baichuan" in model_path + + def get_default_conv_template(self, model_path: str) -> ConversationHistory: + settings = get_conv_settings("one_shot") + return ConversationHistory( + system="", + messages=(), + offset=0, + settings=settings, + ) \ No newline at end of file diff --git a/langport/model/adapters/wizardlm.py b/langport/model/adapters/wizardlm.py new file mode 100644 index 0000000..8d6de32 --- /dev/null +++ b/langport/model/adapters/wizardlm.py @@ -0,0 +1,19 @@ +from typing import List, Optional +from langport.data.conversation import ConversationHistory, SeparatorStyle +from langport.data.conversation.conversation_settings import get_conv_settings +from 
langport.model.model_adapter import BaseAdapter + +class WizardLMAdapter(BaseAdapter): + """The model adapter for WizardLM/WizardLM-13B-V1.0""" + + def match(self, model_path: str): + return "wizardlm" in model_path + + def get_default_conv_template(self, model_path: str) -> ConversationHistory: + settings = get_conv_settings("wizardlm") + return ConversationHistory( + system="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. ", + messages=(), + offset=0, + settings=settings, + ) \ No newline at end of file From 1c53983fb400b448cd78f85fbece60a2a8e8f9dd Mon Sep 17 00:00:00 2001 From: jstzwj <1103870790@qq.com> Date: Wed, 21 Jun 2023 23:12:07 +0800 Subject: [PATCH 4/8] attention mask fix --- langport/model/executor/generation/huggingface.py | 15 +++++++++++---- langport/model/executor/optimum.py | 2 +- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/langport/model/executor/generation/huggingface.py b/langport/model/executor/generation/huggingface.py index c0efe80..542e33e 100644 --- a/langport/model/executor/generation/huggingface.py +++ b/langport/model/executor/generation/huggingface.py @@ -206,12 +206,19 @@ def generate(self, inputs: BatchingTask, past_key_values=past_key_values, ) else: + if step > 0: + dynamic_attention_mask = torch.cat( + (attention_mask, + torch.ones( + inputs.batch_size, step, + dtype=torch.long, device=decoder_input_ids.device + )), dim=1 + ) + else: + dynamic_attention_mask = attention_mask out = self.model( input_ids=decoder_input_ids, - attention_mask=torch.ones( - inputs.batch_size, full_input_ids.shape[1] + step, - dtype=torch.long, device=decoder_input_ids.device - ), + attention_mask=dynamic_attention_mask, use_cache=self.model.generation_config.use_cache, past_key_values=past_key_values, ) diff --git a/langport/model/executor/optimum.py b/langport/model/executor/optimum.py index 16be5b5..49456ad 100644 --- a/langport/model/executor/optimum.py +++ b/langport/model/executor/optimum.py @@ -37,7 +37,7 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict): else: export_onnx = True - use_gpu = self.device == "cuda" + use_gpu = False # self.device == "cuda" if use_gpu: provider = "CUDAExecutionProvider" else: From 8f1ff0201c470084a75ac11127d44d1ba432e413 Mon Sep 17 00:00:00 2001 From: jstzwj <1103870790@qq.com> Date: Mon, 26 Jun 2023 00:41:10 +0800 Subject: [PATCH 5/8] update readme --- README.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 58b59e2..242aca6 100644 --- a/README.md +++ b/README.md @@ -30,13 +30,29 @@ The core features include: - HuggingFace-compatible RESTful APIs. - Tabby-compatible RESTful APIs. +## Support Model Architectures +* LLaMa +* GLM +* Bloom +* OPT +* GPT2 +* GPT Neo +* GPT Big Code + +## Tested Models +* LLaMa +* Vicuna +* ChatGLM +* Falcon +* Starcoder +* WizardLM + ## Benchmark We use single RTX3090 to run a finetuned 7B LLaMA model (OpenBuddy V0.9) under the bf16 setting. We create 32 threads to submit chat tasks to the server, and the following figure shows the Queries Per Second (QPS) and Tokens Per Second (TPS) of FastChat and LangPort with different max model concurrency settings. ![benchmark_chat](assets/benchmark_chat.jpg) - ## News - [2023/06/18] Add ggml (llama.cpp gpt.cpp starcoder.cpp etc.) worker support. - [2023/06/09] Add LLama.cpp worker support. 
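For context on the attention-mask changes in patches 2 and 4: both revisions converge on the same rule for incremental decoding with `past_key_values` — only the newest token ids are fed as `input_ids`, but the attention mask must still span the full sequence (the padded prompt plus every token generated so far), so the prompt mask is extended by `step` ones per sequence rather than rebuilt each step. A minimal sketch of that pattern follows; the helper name and standalone tensors are illustrative only and are not part of the actual `GenerationModel.generate` code.

```python
import torch

def step_attention_mask(prompt_mask: torch.Tensor, step: int) -> torch.Tensor:
    # prompt_mask: (batch, prompt_len) mask over the padded prompt.
    # step: number of tokens already generated with past_key_values cached.
    # The model only receives the newest token ids, but attention must cover
    # prompt_len + step positions, so append `step` ones to each sequence.
    if step == 0:
        return prompt_mask
    extra = torch.ones(
        prompt_mask.shape[0], step,
        dtype=prompt_mask.dtype, device=prompt_mask.device,
    )
    return torch.cat((prompt_mask, extra), dim=1)

# Example: a batch of 2 prompts of length 5, with 3 tokens generated so far.
mask = step_attention_mask(torch.ones(2, 5, dtype=torch.long), step=3)
assert mask.shape == (2, 8)
```
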
From 8727dc6d9cb242687b3ae4f05b5db10bf99b3481 Mon Sep 17 00:00:00 2001 From: jstzwj <1103870790@qq.com> Date: Fri, 30 Jun 2023 13:16:47 +0800 Subject: [PATCH 6/8] tokenizer trust_remote_code --- README.md | 2 ++ langport/model/executor/huggingface.py | 3 ++- langport/service/server/generation_worker.py | 2 +- langport/utils/__init__.py | 8 ++++---- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 242aca6..2847a76 100644 --- a/README.md +++ b/README.md @@ -43,9 +43,11 @@ The core features include: * LLaMa * Vicuna * ChatGLM +* ChatGLM2 * Falcon * Starcoder * WizardLM +* OpenBuddy ## Benchmark We use single RTX3090 to run a finetuned 7B LLaMA model (OpenBuddy V0.9) under the bf16 setting. diff --git a/langport/model/executor/huggingface.py b/langport/model/executor/huggingface.py index c279551..6fb05a2 100644 --- a/langport/model/executor/huggingface.py +++ b/langport/model/executor/huggingface.py @@ -79,7 +79,8 @@ def _load_hf_model(self, adapter, model_path: str, from_pretrained_kwargs: dict) model = BertModel.from_pretrained(model_path, **from_pretrained_kwargs) else: - tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) + trust_remote_code = from_pretrained_kwargs.get("trust_remote_code", False) + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=trust_remote_code) model = AutoModelForCausalLM.from_pretrained( model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs ) diff --git a/langport/service/server/generation_worker.py b/langport/service/server/generation_worker.py index 0ec7ec0..3632c29 100644 --- a/langport/service/server/generation_worker.py +++ b/langport/service/server/generation_worker.py @@ -57,7 +57,7 @@ deepspeed=args.deepspeed, trust_remote_code=args.trust_remote_code ) - + app.node = GenerationModelWorker( node_addr=args.worker_address, node_id=node_id, diff --git a/langport/utils/__init__.py b/langport/utils/__init__.py index 59577ae..a1f35dc 100644 --- a/langport/utils/__init__.py +++ b/langport/utils/__init__.py @@ -52,10 +52,10 @@ def build_logger(logger_name, logger_filename) -> logging.Logger: sl = StreamToLogger(stdout_logger, logging.INFO) sys.stdout = sl - stderr_logger = logging.getLogger("stderr") - stderr_logger.setLevel(logging.ERROR) - sl = StreamToLogger(stderr_logger, logging.ERROR) - sys.stderr = sl + # stderr_logger = logging.getLogger("stderr") + # stderr_logger.setLevel(logging.ERROR) + # sl = StreamToLogger(stderr_logger, logging.ERROR) + # sys.stderr = sl # Get logger logger = logging.getLogger(logger_name) From 3d0db3dec8912c0b702c39f2b8a8830ae1440f18 Mon Sep 17 00:00:00 2001 From: jstzwj <1103870790@qq.com> Date: Fri, 30 Jun 2023 13:37:48 +0800 Subject: [PATCH 7/8] chatglm adapter quick workaround --- langport/model/adapters/{chatglm.py => chatglm2.py} | 5 ----- langport/model/executor/huggingface.py | 11 ++++++++++- langport/routers/gateway/openai_compatible.py | 8 ++------ 3 files changed, 12 insertions(+), 12 deletions(-) rename langport/model/adapters/{chatglm.py => chatglm2.py} (77%) diff --git a/langport/model/adapters/chatglm.py b/langport/model/adapters/chatglm2.py similarity index 77% rename from langport/model/adapters/chatglm.py rename to langport/model/adapters/chatglm2.py index 10c0a50..8883028 100644 --- a/langport/model/adapters/chatglm.py +++ b/langport/model/adapters/chatglm2.py @@ -1,8 +1,3 @@ -from transformers import ( - AutoModel, - AutoTokenizer, -) - from langport.model.model_adapter import BaseAdapter class 
ChatGLMAdapter(BaseAdapter): diff --git a/langport/model/executor/huggingface.py b/langport/model/executor/huggingface.py index 6fb05a2..e354d32 100644 --- a/langport/model/executor/huggingface.py +++ b/langport/model/executor/huggingface.py @@ -4,12 +4,15 @@ from langport.model.adapters.rwkv import RwkvAdapter from langport.model.adapters.t5 import T5Adapter from langport.model.adapters.text2vec import BertAdapter +from langport.model.adapters.chatglm2 import ChatGLMAdapter + from langport.model.executor.base import LocalModelExecutor import torch from transformers import ( AutoModelForCausalLM, + AutoModel, AutoTokenizer, AutoModelForSeq2SeqLM, T5Tokenizer, @@ -77,7 +80,13 @@ def _load_hf_model(self, adapter, model_path: str, from_pretrained_kwargs: dict) elif isinstance(adapter, BertAdapter): tokenizer = BertTokenizer.from_pretrained(model_path) model = BertModel.from_pretrained(model_path, **from_pretrained_kwargs) - + elif isinstance(adapter, ChatGLMAdapter): + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True) + if "trust_remote_code" in from_pretrained_kwargs: + from_pretrained_kwargs.pop("trust_remote_code") + model = AutoModel.from_pretrained( + model_path, low_cpu_mem_usage=True, trust_remote_code=True, **from_pretrained_kwargs + ) else: trust_remote_code = from_pretrained_kwargs.get("trust_remote_code", False) tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=trust_remote_code) diff --git a/langport/routers/gateway/openai_compatible.py b/langport/routers/gateway/openai_compatible.py index 31978ce..37c54ee 100644 --- a/langport/routers/gateway/openai_compatible.py +++ b/langport/routers/gateway/openai_compatible.py @@ -69,7 +69,7 @@ def get_gen_params( stream: Optional[bool], stop: Optional[Union[str, List[str]]], ) -> Dict[str, Any]: - is_chatglm = "chatglm" in model_name.lower() + # is_chatglm = "chatglm" in model_name.lower() conv = get_conversation_template(model_name) if isinstance(messages, str): @@ -88,11 +88,7 @@ def get_gen_params( # Add a blank message for the assistant. 
conv.append_message(conv.settings.roles[1], None) - - if is_chatglm: - prompt = conv.messages[conv.offset :] - else: - prompt = conv.get_prompt() + prompt = conv.get_prompt() if max_tokens is None: max_tokens = 512 From bed90c00eab0d679c4310f6c0a8353c76bf49b35 Mon Sep 17 00:00:00 2001 From: jstzwj <1103870790@qq.com> Date: Fri, 30 Jun 2023 15:23:08 +0800 Subject: [PATCH 8/8] chatglm fix --- langport/data/conversation/__init__.py | 11 +++++++++++ langport/data/conversation/settings/chatglm.py | 13 +++++++++++++ langport/model/adapters/chatglm.py | 18 ++++++++++++++++++ langport/model/adapters/chatglm2.py | 7 ------- langport/model/executor/huggingface.py | 2 +- 5 files changed, 43 insertions(+), 8 deletions(-) create mode 100644 langport/data/conversation/settings/chatglm.py create mode 100644 langport/model/adapters/chatglm.py delete mode 100644 langport/model/adapters/chatglm2.py diff --git a/langport/data/conversation/__init__.py b/langport/data/conversation/__init__.py index 5e3c077..c0b3256 100644 --- a/langport/data/conversation/__init__.py +++ b/langport/data/conversation/__init__.py @@ -19,6 +19,7 @@ class SeparatorStyle(Enum): DOLLY = auto() RWKV = auto() PHOENIX = auto() + CHATGLM = auto() @dataclasses.dataclass @@ -128,6 +129,16 @@ def get_prompt(self) -> str: else: ret += role + ": " + "" return ret + elif self.settings.sep_style == SeparatorStyle.CHATGLM: + ret = self.system + for i, (role, message) in enumerate(self.messages): + if message: + if i % 2 == 0: + ret += f"[Round {i+1}]\n\n" + ret += role + ":" + message + self.settings.sep + else: + ret += role + ":" + return ret else: raise ValueError(f"Invalid style: {self.settings.sep_style}") diff --git a/langport/data/conversation/settings/chatglm.py b/langport/data/conversation/settings/chatglm.py new file mode 100644 index 0000000..dc2c405 --- /dev/null +++ b/langport/data/conversation/settings/chatglm.py @@ -0,0 +1,13 @@ +from langport.data.conversation import ( + ConversationSettings, + SeparatorStyle, +) + +# Chatglm default template +chatglm = ConversationSettings( + name="chatglm", + roles=("问", "答"), + sep_style=SeparatorStyle.CHATGLM, + sep="\n\n", + stop_str="\n\n", +) \ No newline at end of file diff --git a/langport/model/adapters/chatglm.py b/langport/model/adapters/chatglm.py new file mode 100644 index 0000000..6c1df87 --- /dev/null +++ b/langport/model/adapters/chatglm.py @@ -0,0 +1,18 @@ +from langport.data.conversation import ConversationHistory +from langport.data.conversation.conversation_settings import get_conv_settings +from langport.model.model_adapter import BaseAdapter + +class ChatGLMAdapter(BaseAdapter): + """The model adapter for THUDM/chatglm-6b""" + + def match(self, model_path: str): + return "chatglm" in model_path + + def get_default_conv_template(self, model_path: str) -> ConversationHistory: + settings = get_conv_settings("chatglm") + return ConversationHistory( + system="", + messages=[], + offset=0, + settings=settings, + ) diff --git a/langport/model/adapters/chatglm2.py b/langport/model/adapters/chatglm2.py deleted file mode 100644 index 8883028..0000000 --- a/langport/model/adapters/chatglm2.py +++ /dev/null @@ -1,7 +0,0 @@ -from langport.model.model_adapter import BaseAdapter - -class ChatGLMAdapter(BaseAdapter): - """The model adapter for THUDM/chatglm-6b""" - - def match(self, model_path: str): - return "chatglm" in model_path diff --git a/langport/model/executor/huggingface.py b/langport/model/executor/huggingface.py index e354d32..f2bbefe 100644 --- 
a/langport/model/executor/huggingface.py +++ b/langport/model/executor/huggingface.py @@ -4,7 +4,7 @@ from langport.model.adapters.rwkv import RwkvAdapter from langport.model.adapters.t5 import T5Adapter from langport.model.adapters.text2vec import BertAdapter -from langport.model.adapters.chatglm2 import ChatGLMAdapter +from langport.model.adapters.chatglm import ChatGLMAdapter from langport.model.executor.base import LocalModelExecutor
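
The ChatGLM support added in patch 8 hinges on the new `SeparatorStyle.CHATGLM` branch of `get_prompt()`, which renders each user turn under a `[Round N]` header and leaves a bare role label where the model should continue. A small self-contained sketch of that formatting rule (a plain function mirroring the branch, not the actual `ConversationHistory` class), assuming the 问/答 roles and `sep="\n\n"` from the new `chatglm` settings:

```python
from typing import List, Optional, Tuple

def render_chatglm(system: str,
                   messages: List[Tuple[str, Optional[str]]],
                   sep: str = "\n\n") -> str:
    # Mirrors the CHATGLM branch: a "[Round N]" header precedes every
    # even-indexed (user) message, and a trailing role label with no text
    # marks the position where generation should continue.
    ret = system
    for i, (role, message) in enumerate(messages):
        if message:
            if i % 2 == 0:
                ret += f"[Round {i+1}]\n\n"
            ret += role + ":" + message + sep
        else:
            ret += role + ":"
    return ret

# One user turn plus an empty assistant slot:
print(render_chatglm("", [("问", "你好"), ("答", None)]))
# [Round 1]
#
# 问:你好
#
# 答:
```

Note that the header number is derived from the message index (`i + 1`), so a second user turn is labelled `[Round 3]`; the `stop_str="\n\n"` from the same settings then terminates generation at the end of the model's answer.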