diff --git a/README.md b/README.md
index 58b59e2..2847a76 100644
--- a/README.md
+++ b/README.md
@@ -30,13 +30,31 @@ The core features include:
 - HuggingFace-compatible RESTful APIs.
 - Tabby-compatible RESTful APIs.
 
+## Supported Model Architectures
+* LLaMA
+* GLM
+* Bloom
+* OPT
+* GPT2
+* GPT Neo
+* GPT Big Code
+
+## Tested Models
+* LLaMA
+* Vicuna
+* ChatGLM
+* ChatGLM2
+* Falcon
+* StarCoder
+* WizardLM
+* OpenBuddy
+
 ## Benchmark
 
 We use a single RTX3090 to run a finetuned 7B LLaMA model (OpenBuddy V0.9) under the bf16 setting. We create 32 threads to submit chat tasks to the server, and the following figure shows the Queries Per Second (QPS) and Tokens Per Second (TPS) of FastChat and LangPort with different max model concurrency settings.
 
 ![benchmark_chat](assets/benchmark_chat.jpg)
 
-
 ## News
 - [2023/06/18] Add ggml (llama.cpp gpt.cpp starcoder.cpp etc.) worker support.
 - [2023/06/09] Add LLama.cpp worker support.
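For orientation, the benchmark above drives the gateway's OpenAI-compatible endpoint with the `openai` client, exactly as `benchmark/bench_chat.py` below does. A minimal single-request sketch (the URL and model name are just the script's defaults, not fixed values):

    import openai

    openai.api_key = "EMPTY"  # placeholder key, as in the benchmark scripts
    openai.api_base = "http://localhost:8000/v1"

    response = openai.ChatCompletion.create(
        model="vicuna",
        messages=[{"role": "user", "content": "Hello! What is your name?"}],
        max_tokens=512,
        temperature=0.9,
    )
    print(response.choices[0].message.content)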
diff --git a/benchmark/bench_chat.py b/benchmark/bench_chat.py
index 16659d1..e0b0d17 100644
--- a/benchmark/bench_chat.py
+++ b/benchmark/bench_chat.py
@@ -1,20 +1,20 @@
 import argparse
 import random
 import time
+import traceback
 import openai
 import threading
 import tqdm
+import datasets
 from concurrent.futures import ThreadPoolExecutor
 
 
-def start_session(i: int, url: str, model: str, stream: bool=False, max_tokens: int=2048, random_len: int=0) -> str:
+def start_session(i: int, url: str, model: str, dataset, stream: bool=False, max_tokens: int=2048, random_len: int=0) -> str:
     try:
         openai.api_key = "EMPTY" # Not support yet
         openai.api_base = url
-        if random_len != 0 :
-            messages = [{"role": "user", "content": "Hello! What is your name?" + "a" * random.randint(1, random_len)}]
-        else:
-            messages = [{"role": "user", "content": "Hello! What is your name?"}]
+        messages = dataset[i]
+
         # create a chat completion
         response = openai.ChatCompletion.create(
             model=model,
@@ -32,17 +32,44 @@ def start_session(i: int, url: str, model: str, stream: bool=False, max_tokens:
         total_tokens = response.usage.total_tokens
         completion_tokens = response.usage.completion_tokens
     except Exception as e:
-        print(e)
+        traceback.print_exc()
         return "", 0, 0
 
     return out, total_tokens, completion_tokens
 
+
+def get_prompt(raw_dataset):
+    dataset = []
+    for conversations in raw_dataset["conversations"]:
+        messages = []
+        for data in conversations:
+            out_data = {"role": "system", "content": ""}
+            if data["user"] == "human":
+                out_data["role"] = "user"
+            if data["user"] == "gpt":
+                out_data["role"] = "assistant"
+
+            out_data["content"] = data["text"]
+            messages.append(out_data)
+
+        if messages[-1]["role"] == "assistant":
+            messages = messages[:-1]
+
+        prompt = "\n###".join([msg["role"] + ": " + msg["content"] for msg in messages]) + "\n### assistant: "
+        if len(prompt) > 2048:
+            continue
+        dataset.append(messages)
+    return dataset
+
 def main(args):
+    dataset = datasets.load_dataset("theblackcat102/sharegpt-english", split="train")
+    dataset = get_prompt(dataset)
+
     tik = time.time()
     tasks = []
     with ThreadPoolExecutor(max_workers=args.n_thread) as t:
         for i in range(args.total_task):
-            task = t.submit(start_session, i=i, url=args.url, model=args.model_name, stream=False, max_tokens=args.max_tokens, random_len=args.random_len)
+            task = t.submit(start_session, i=i, url=args.url, model=args.model_name, dataset=dataset, stream=False, max_tokens=args.max_tokens, random_len=args.random_len)
             tasks.append(task)
 
     results = []
@@ -63,7 +90,7 @@ def main(args):
     parser = argparse.ArgumentParser()
     parser.add_argument("--url", type=str, default="http://localhost:8000/v1")
     parser.add_argument("--model-name", type=str, default="vicuna")
-    parser.add_argument("--max-tokens", type=int, default=1024)
+    parser.add_argument("--max-tokens", type=int, default=512)
     parser.add_argument("--total-task", type=int, default=200)
     parser.add_argument("--n-thread", type=int, default=32)
     parser.add_argument("--random-len", type=int, default=0)
diff --git a/benchmark/bench_completions.py b/benchmark/bench_completions.py
new file mode 100644
index 0000000..811e535
--- /dev/null
+++ b/benchmark/bench_completions.py
@@ -0,0 +1,97 @@
+import argparse
+import random
+import time
+import traceback
+import openai
+import threading
+import tqdm
+import datasets
+from concurrent.futures import ThreadPoolExecutor
+
+def start_session(i: int, url: str, model: str, dataset, stream: bool=False, max_tokens: int=2048) -> str:
+    try:
+        openai.api_key = "EMPTY" # Not supported yet
+        openai.api_base = url
+
+        prompt = dataset[i]
+        # create a completion
+        response = openai.Completion.create(
+            model=model,
+            prompt=prompt,
+            stream=stream,
+            max_tokens=max_tokens,
+            temperature=0.9,
+        )
+        # print the completion
+        if stream:
+            out = ""
+            for chunk in response:
+                out += str(chunk)
+        else:
+            out = response.choices[0].text
+        total_tokens = response.usage.total_tokens
+        completion_tokens = response.usage.completion_tokens
+    except Exception as e:
+        traceback.print_exc()
+        return "", 0, 0
+
+    return out, total_tokens, completion_tokens
+
+def get_prompt(raw_dataset):
+    dataset = []
+    for conversations in raw_dataset["conversations"]:
+        messages = []
+        for data in conversations:
+            out_data = {"role": "system", "content": ""}
+            if data["user"] == "human":
+                out_data["role"] = "user"
+            if data["user"] == "gpt":
+                out_data["role"] = "assistant"
+
+            out_data["content"] = data["text"]
+            messages.append(out_data)
+
+        if messages[-1]["role"] == "assistant":
+            messages = messages[:-1]
+
+        prompt = "\n###".join([msg["role"] + ": " + msg["content"] for msg in messages]) + "\n### assistant: "
+        if len(prompt) > 2048:
+            continue
+        dataset.append(prompt)
+    return dataset
+
+def main(args):
+    dataset = datasets.load_dataset("theblackcat102/sharegpt-english", split="train")
+    dataset = get_prompt(dataset)
+
+    tik = time.time()
+    tasks = []
+    with ThreadPoolExecutor(max_workers=args.n_thread) as t:
+        for i in range(args.total_task):
+            task = t.submit(start_session, i=i, url=args.url, model=args.model_name, dataset=dataset, stream=False, max_tokens=args.max_tokens)
+            tasks.append(task)
+
+    results = []
+    for task in tqdm.tqdm(tasks):
+        results.append(task.result())
+
+    n_tokens = sum([ret[2] for ret in results])
+    n_queries = sum([1 for ret in results if ret[2] != 0])
+    time_seconds = time.time() - tik
+    print(
+        f"Successful number: {n_queries} / {args.total_task}. "
+        f"Time (Completion): {time_seconds}, n threads: {args.n_thread}, "
+        f"throughput: {n_tokens / time_seconds} tokens/s. "
+        f"QPS: {n_queries / time_seconds} queries/s."
+    )
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--url", type=str, default="http://localhost:8000/v1")
+    parser.add_argument("--model-name", type=str, default="vicuna")
+    parser.add_argument("--max-tokens", type=int, default=512)
+    parser.add_argument("--total-task", type=int, default=64)
+    parser.add_argument("--n-thread", type=int, default=4)
+    args = parser.parse_args()
+
+    main(args)
\ No newline at end of file
diff --git a/langport/data/conversation/__init__.py b/langport/data/conversation/__init__.py
index 5e3c077..c0b3256 100644
--- a/langport/data/conversation/__init__.py
+++ b/langport/data/conversation/__init__.py
@@ -19,6 +19,7 @@ class SeparatorStyle(Enum):
     DOLLY = auto()
     RWKV = auto()
     PHOENIX = auto()
+    CHATGLM = auto()
 
 
 @dataclasses.dataclass
@@ -128,6 +129,16 @@ def get_prompt(self) -> str:
                 else:
                     ret += role + ": " + ""
             return ret
+        elif self.settings.sep_style == SeparatorStyle.CHATGLM:
+            ret = self.system
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    if i % 2 == 0:
+                        ret += f"[Round {i//2 + 1}]\n\n"
+                    ret += role + ":" + message + self.settings.sep
+                else:
+                    ret += role + ":"
+            return ret
         else:
             raise ValueError(f"Invalid style: {self.settings.sep_style}")
diff --git a/langport/data/conversation/settings/chatglm.py b/langport/data/conversation/settings/chatglm.py
new file mode 100644
index 0000000..dc2c405
--- /dev/null
+++ b/langport/data/conversation/settings/chatglm.py
@@ -0,0 +1,13 @@
+from langport.data.conversation import (
+    ConversationSettings,
+    SeparatorStyle,
+)
+
+# ChatGLM default template
+chatglm = ConversationSettings(
+    name="chatglm",
+    roles=("问", "答"),
+    sep_style=SeparatorStyle.CHATGLM,
+    sep="\n\n",
+    stop_str="\n\n",
+)
\ No newline at end of file
diff --git a/langport/data/conversation/settings/wizardlm.py b/langport/data/conversation/settings/wizardlm.py
new file mode 100644
index 0000000..852b23c
--- /dev/null
+++ b/langport/data/conversation/settings/wizardlm.py
@@ -0,0 +1,13 @@
+from langport.data.conversation import (
+    ConversationSettings,
+    SeparatorStyle,
+)
+
+
+# WizardLM template
+wizardlm = ConversationSettings(
+    name="wizardlm",
+    roles=("USER", "ASSISTANT"),
+    sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+    sep=" ",
+)
\ No newline at end of file
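To illustrate the new CHATGLM separator style, here is roughly what a one-round history renders to. This is a sketch only: it assumes `ConversationHistory` accepts `(role, message)` tuples, matching how `get_prompt` iterates `self.messages` above, and that the settings file registers itself under the name "chatglm".

    from langport.data.conversation import ConversationHistory
    from langport.data.conversation.conversation_settings import get_conv_settings

    settings = get_conv_settings("chatglm")
    history = ConversationHistory(
        system="",
        messages=[(settings.roles[0], "你好"), (settings.roles[1], None)],
        offset=0,
        settings=settings,
    )
    # With roles ("问", "答") and sep "\n\n", this should print:
    # [Round 1]
    #
    # 问:你好
    #
    # 答:
    print(history.get_prompt())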
diff --git a/langport/model/adapters/baichuan.py b/langport/model/adapters/baichuan.py
new file mode 100644
index 0000000..6286975
--- /dev/null
+++ b/langport/model/adapters/baichuan.py
@@ -0,0 +1,19 @@
+from typing import List, Optional
+from langport.data.conversation import ConversationHistory, SeparatorStyle
+from langport.data.conversation.conversation_settings import get_conv_settings
+from langport.model.model_adapter import BaseAdapter
+
+class BaichuanAdapter(BaseAdapter):
+    """The model adapter for baichuan-inc/baichuan-7B"""
+
+    def match(self, model_path: str):
+        return "baichuan" in model_path
+
+    def get_default_conv_template(self, model_path: str) -> ConversationHistory:
+        settings = get_conv_settings("one_shot")
+        return ConversationHistory(
+            system="",
+            messages=(),
+            offset=0,
+            settings=settings,
+        )
\ No newline at end of file
diff --git a/langport/model/adapters/chatglm.py b/langport/model/adapters/chatglm.py
index 10c0a50..6c1df87 100644
--- a/langport/model/adapters/chatglm.py
+++ b/langport/model/adapters/chatglm.py
@@ -1,8 +1,5 @@
-from transformers import (
-    AutoModel,
-    AutoTokenizer,
-)
-
+from langport.data.conversation import ConversationHistory
+from langport.data.conversation.conversation_settings import get_conv_settings
 from langport.model.model_adapter import BaseAdapter
 
 class ChatGLMAdapter(BaseAdapter):
@@ -10,3 +7,12 @@ class ChatGLMAdapter(BaseAdapter):
 
     def match(self, model_path: str):
         return "chatglm" in model_path
+
+    def get_default_conv_template(self, model_path: str) -> ConversationHistory:
+        settings = get_conv_settings("chatglm")
+        return ConversationHistory(
+            system="",
+            messages=[],
+            offset=0,
+            settings=settings,
+        )
diff --git a/langport/model/adapters/wizardlm.py b/langport/model/adapters/wizardlm.py
new file mode 100644
index 0000000..8d6de32
--- /dev/null
+++ b/langport/model/adapters/wizardlm.py
@@ -0,0 +1,19 @@
+from typing import List, Optional
+from langport.data.conversation import ConversationHistory, SeparatorStyle
+from langport.data.conversation.conversation_settings import get_conv_settings
+from langport.model.model_adapter import BaseAdapter
+
+class WizardLMAdapter(BaseAdapter):
+    """The model adapter for WizardLM/WizardLM-13B-V1.0"""
+
+    def match(self, model_path: str):
+        return "wizardlm" in model_path
+
+    def get_default_conv_template(self, model_path: str) -> ConversationHistory:
+        settings = get_conv_settings("wizardlm")
+        return ConversationHistory(
+            system="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. ",
+            messages=(),
+            offset=0,
+            settings=settings,
+        )
\ No newline at end of file
diff --git a/langport/model/executor/generation/huggingface.py b/langport/model/executor/generation/huggingface.py
index 43b907d..41ef4c3 100644
--- a/langport/model/executor/generation/huggingface.py
+++ b/langport/model/executor/generation/huggingface.py
@@ -206,8 +206,20 @@ def generate(self, inputs: BatchingTask,
                 past_key_values=past_key_values,
             )
         else:
+            # When reusing past_key_values, extend the mask to cover the `step` tokens generated so far.
+            if step > 0:
+                dynamic_attention_mask = torch.cat(
+                    (attention_mask,
+                    torch.ones(
+                        inputs.batch_size, step,
+                        dtype=torch.long, device=decoder_input_ids.device
+                    )), dim=1
+                )
+            else:
+                dynamic_attention_mask = attention_mask
             out = self.model(
                 input_ids=decoder_input_ids,
+                attention_mask=dynamic_attention_mask,
                 use_cache=self.model.generation_config.use_cache,
                 past_key_values=past_key_values,
             )
diff --git a/langport/model/executor/generation/optimum.py b/langport/model/executor/generation/optimum.py
new file mode 100644
index 0000000..f15223b
--- /dev/null
+++ b/langport/model/executor/generation/optimum.py
@@ -0,0 +1,67 @@
+
+from typing import List, Optional
+from langport.model.executor.generation.huggingface import BatchingTask, GenerationModel, GenerationWorkerStreamer
+from langport.model.executor.optimum import OptimumExecutor
+from langport.workers.generation_worker import GenerationModelWorker
+
+
+class OptimumGenerationExecutor(OptimumExecutor):
+    def __init__(
+        self,
+        model_name: str,
+        model_path: str,
+        device: str,
+        num_gpus: int,
+        max_gpu_memory: Optional[str],
+        load_8bit: bool,
+        cpu_offloading: bool,
+        trust_remote_code: bool = False
+    ) -> None:
+        super(OptimumGenerationExecutor, self).__init__(
+            model_name=model_name,
+            model_path=model_path,
+            device=device,
+            num_gpus=num_gpus,
+            max_gpu_memory=max_gpu_memory,
+            load_8bit=load_8bit,
+            cpu_offloading=cpu_offloading
+        )
+        self.adapter = None
+        self.model = None
+        self.tokenizer = None
+        self.adapter, self.model, self.tokenizer = self.load_model(
+            model_path, {}
+        )
+
+        # self.model = torch.compile(self.model)
+
+        if hasattr(self.model.config, "max_sequence_length"):
+            self._context_len = self.model.config.max_sequence_length
+        elif hasattr(self.model.config, "max_position_embeddings"):
+            self._context_len = self.model.config.max_position_embeddings
+        else:
+            self._context_len = 2048
+
+    @property
+    def context_length(self) -> int:
+        return self._context_len
+
+    def tokenize(self, text: str) -> List[int]:
+        input_ids = self.tokenizer(text).input_ids
+        return input_ids
+
+    def inference(self, worker: "GenerationModelWorker"):
+        if not worker.online:
+            return
+
+        tasks = worker.fetch_tasks()
+
+        # batch inference
+        inputs = BatchingTask(tasks, self.tokenizer, self.device, self.model.config.is_encoder_decoder)
+        if inputs.batch_size == 0:
+            return
+        streamer = GenerationWorkerStreamer(inputs, self.tokenizer, worker)
+        model = GenerationModel(self.model)
+        max_new_tokens = max(inputs.max_tokens)
+        model.generate(inputs, max_new_tokens, streamer)
+
\ No newline at end of file
diff --git a/langport/model/executor/huggingface.py b/langport/model/executor/huggingface.py
index c279551..f2bbefe 100644
--- a/langport/model/executor/huggingface.py
+++ b/langport/model/executor/huggingface.py
@@ -4,12 +4,15 @@
 from langport.model.adapters.rwkv import RwkvAdapter
 from langport.model.adapters.t5 import T5Adapter
 from langport.model.adapters.text2vec import BertAdapter
+from langport.model.adapters.chatglm import ChatGLMAdapter
+
 from langport.model.executor.base import LocalModelExecutor
 
 import torch
 from transformers import (
     AutoModelForCausalLM,
+    AutoModel,
     AutoTokenizer,
     AutoModelForSeq2SeqLM,
     T5Tokenizer,
@@ -77,9 +80,16 @@ def _load_hf_model(self, adapter, model_path: str, from_pretrained_kwargs: dict)
         elif isinstance(adapter, BertAdapter):
             tokenizer = BertTokenizer.from_pretrained(model_path)
             model = BertModel.from_pretrained(model_path, **from_pretrained_kwargs)
-
+        elif isinstance(adapter, ChatGLMAdapter):
+            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)
+            if "trust_remote_code" in from_pretrained_kwargs:
+                from_pretrained_kwargs.pop("trust_remote_code")
+            model = AutoModel.from_pretrained(
+                model_path, low_cpu_mem_usage=True, trust_remote_code=True, **from_pretrained_kwargs
+            )
         else:
-            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+            trust_remote_code = from_pretrained_kwargs.get("trust_remote_code", False)
+            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=trust_remote_code)
             model = AutoModelForCausalLM.from_pretrained(
                 model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
             )
diff --git a/langport/model/executor/optimum.py b/langport/model/executor/optimum.py
new file mode 100644
index 0000000..49456ad
--- /dev/null
+++ b/langport/model/executor/optimum.py
@@ -0,0 +1,47 @@
+import os
+from typing import List, Optional
+from langport.model.executor.base import LocalModelExecutor
+from langport.model.model_adapter import get_model_adapter
+
+from optimum.onnxruntime import ORTModelForCausalLM
+from transformers import AutoTokenizer
+
+
+class OptimumExecutor(LocalModelExecutor):
+    def __init__(
+        self,
+        model_name: str,
+        model_path: str,
+        device: str,
+        num_gpus: int,
+        max_gpu_memory: Optional[str],
+        load_8bit: bool = False,
+        cpu_offloading: bool = False,
+    ) -> None:
+        super(OptimumExecutor, self).__init__(
+            model_name = model_name,
+            model_path = model_path,
+            device = device,
+            num_gpus = num_gpus,
+            max_gpu_memory = max_gpu_memory,
+            load_8bit = load_8bit,
+            cpu_offloading = cpu_offloading,
+        )
+
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        adapter = get_model_adapter(model_path)
+
+        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+        if os.path.exists(os.path.join(model_path, "decoder_model.onnx")):
+            export_onnx = False
+        else:
+            export_onnx = True
+
+        use_gpu = False # self.device == "cuda"
+        if use_gpu:
+            provider = "CUDAExecutionProvider"
+        else:
+            provider = "CPUExecutionProvider"
+        model = ORTModelForCausalLM.from_pretrained(model_path, export=export_onnx, provider=provider)
+
+        return adapter, model, tokenizer
diff --git a/langport/routers/gateway/openai_compatible.py b/langport/routers/gateway/openai_compatible.py
index 31978ce..37c54ee 100644
--- a/langport/routers/gateway/openai_compatible.py
+++ b/langport/routers/gateway/openai_compatible.py
@@ -69,7 +69,7 @@ def get_gen_params(
     stream: Optional[bool],
     stop: Optional[Union[str, List[str]]],
 ) -> Dict[str, Any]:
-    is_chatglm = "chatglm" in model_name.lower()
+    # is_chatglm = "chatglm" in model_name.lower()
     conv = get_conversation_template(model_name)
 
     if isinstance(messages, str):
@@ -88,11 +88,7 @@
         # Add a blank message for the assistant.
         conv.append_message(conv.settings.roles[1], None)
-
-        if is_chatglm:
-            prompt = conv.messages[conv.offset :]
-        else:
-            prompt = conv.get_prompt()
+        prompt = conv.get_prompt()
 
     if max_tokens is None:
         max_tokens = 512
 
diff --git a/langport/service/server/generation_worker.py b/langport/service/server/generation_worker.py
index 0ec7ec0..3632c29 100644
--- a/langport/service/server/generation_worker.py
+++ b/langport/service/server/generation_worker.py
@@ -57,7 +57,7 @@
         deepspeed=args.deepspeed,
         trust_remote_code=args.trust_remote_code
     )
-    
+
     app.node = GenerationModelWorker(
         node_addr=args.worker_address,
         node_id=node_id,
diff --git a/langport/service/server/optimum_generation_worker.py b/langport/service/server/optimum_generation_worker.py
new file mode 100644
index 0000000..545a316
--- /dev/null
+++ b/langport/service/server/optimum_generation_worker.py
@@ -0,0 +1,72 @@
+import argparse
+import os
+import random
+import uuid
+import uvicorn
+
+from langport.workers.generation_worker import GenerationModelWorker
+from langport.model.model_args import add_model_args
+from langport.utils import build_logger
+from langport.routers.server.generation_node import app
+
+
+# We suggest that concurrency == batch * thread (thread == 4)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=None)
+    parser.add_argument("--worker-address", type=str, default=None)
+    parser.add_argument("--neighbors", type=str, nargs="*", default=[])
+
+    add_model_args(parser)
+    parser.add_argument("--model-name", type=str, help="Optional display name")
+    parser.add_argument("--limit-model-concurrency", type=int, default=8)
+    parser.add_argument("--batch", type=int, default=1)
+    parser.add_argument("--stream-interval", type=int, default=2)
+
+    args = parser.parse_args()
+
+    node_id = str(uuid.uuid4())
+    logger = build_logger("optimum_generation_worker", f"optimum_generation_worker_{node_id}.log")
+    logger.info(f"args: {args}")
+
+    if args.gpus:
+        if len(args.gpus.split(",")) < args.num_gpus:
+            raise ValueError(
+                f"Larger --num-gpus ({args.num_gpus}) than --gpus {args.gpus}!"
+            )
+        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
+
+    if args.port is None:
+        args.port = random.randint(21001, 29001)
+
+    if args.worker_address is None:
+        args.worker_address = f"http://{args.host}:{args.port}"
+
+    if args.model_name is None:
+        args.model_name = os.path.basename(os.path.normpath(args.model_path))
+
+    from langport.model.executor.generation.optimum import OptimumGenerationExecutor
+    executor = OptimumGenerationExecutor(
+        model_name=args.model_name,
+        model_path=args.model_path,
+        device=args.device,
+        num_gpus=args.num_gpus,
+        max_gpu_memory=args.max_gpu_memory,
+        load_8bit=args.load_8bit,
+        cpu_offloading=args.cpu_offloading,
+        trust_remote_code=args.trust_remote_code
+    )
+
+    app.node = GenerationModelWorker(
+        node_addr=args.worker_address,
+        node_id=node_id,
+        init_neighborhoods_addr=args.neighbors,
+        executor=executor,
+        limit_model_concurrency=args.limit_model_concurrency,
+        max_batch=args.batch,
+        stream_interval=args.stream_interval,
+        logger=logger,
+    )
+    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
+
diff --git a/langport/utils/__init__.py b/langport/utils/__init__.py
index 3a3e4bd..a1f35dc 100644
--- a/langport/utils/__init__.py
+++ b/langport/utils/__init__.py
@@ -48,18 +48,18 @@ def build_logger(logger_name, logger_filename) -> logging.Logger:
     # Redirect stdout and stderr to loggers
     stdout_logger = logging.getLogger("stdout")
-    stdout_logger.setLevel(logging.INFO)
+    stdout_logger.setLevel(logging.DEBUG)
     sl = StreamToLogger(stdout_logger, logging.INFO)
     sys.stdout = sl
 
-    stderr_logger = logging.getLogger("stderr")
-    stderr_logger.setLevel(logging.ERROR)
-    sl = StreamToLogger(stderr_logger, logging.ERROR)
-    sys.stderr = sl
+    # stderr_logger = logging.getLogger("stderr")
+    # stderr_logger.setLevel(logging.ERROR)
+    # sl = StreamToLogger(stderr_logger, logging.ERROR)
+    # sys.stderr = sl
 
     # Get logger
     logger = logging.getLogger(logger_name)
-    logger.setLevel(logging.INFO)
+    logger.setLevel(logging.DEBUG)
 
     # Add a file handler for all loggers
     if handler is None:
diff --git a/langport/version.py b/langport/version.py
index 2745480..9a6f28d 100644
--- a/langport/version.py
+++ b/langport/version.py
@@ -1 +1 @@
-LANGPORT_VERSION = "0.2.1"
+LANGPORT_VERSION = "0.3.0"
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 35c1039..4eb3fbd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "langport"
-version = "0.2.1"
+version = "0.3.0"
 description = "A large language model serving platform."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -22,6 +22,7 @@ dependencies = [
 [project.optional-dependencies]
 dev = ["black==23.3.0", "pylint==2.8.2"]
 ggml = ["ctransformers"]
+optimum = ["onnx", "onnxruntime", "optimum"]
 
 [project.urls]
 "Homepage" = "https://github.com/vtuber-plan/langport"
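With the new optional dependency group above, the ONNX Runtime path can be pulled in via `pip install "langport[optimum]"`. The worker itself is presumably launched like the existing servers, e.g. `python -m langport.service.server.optimum_generation_worker` together with the shared model flags registered by `add_model_args` (such as `--model-path`); those flag definitions live outside this patch, so the exact names are assumed here.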