
Commit 9dbb144

committed
Merged commit message
1 parent a98ca9e commit 9dbb144

File tree

17 files changed: +110 additions, -322 deletions

gpt_server/model_backend/hf_backend.py

Lines changed: 1 addition & 0 deletions
@@ -55,6 +55,7 @@ def __init__(self, tokenizer: PreTrainedTokenizer, model: torch.nn.Module) -> None:
             self.model.load_adapter(model_id=lora_path, adapter_name=lora_name)

     async def stream_chat(self, params: Dict[str, Any]):
+        # params no longer needs to include a prompt
         messages = params["messages"]
         chat_template = params.get("chat_template", None)
         tools = params.get("tools", None)
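
A minimal sketch of the params dict that the updated stream_chat expects; the keys mirror the ones read in the diff (messages, chat_template, tools) plus the stop fields the workers add, and the values are purely illustrative:

params = {
    "messages": [{"role": "user", "content": "Hello"}],
    "chat_template": None,   # injected by the worker's preprocess_params when one is configured
    "tools": None,           # OpenAI-style tool definitions, if any
    "stop": [],              # extended with the worker's stop words
    "stop_words_ids": [],
}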

gpt_server/model_worker/auto.py

Lines changed: 83 additions & 0 deletions (new file)

import asyncio
import json
from typing import List
from fastchat.constants import ErrorCode, SERVER_ERROR_MSG
from loguru import logger
import torch
import traceback
from gpt_server.model_worker.base.model_worker_base import ModelWorkerBase
from gpt_server.model_handler.prompts import MODELS
from gpt_server.model_handler.tool_parser import tool_parser, ToolParserManager
from gpt_server.model_handler.chat_template.get_chat_template import get_chat_template


class AutoWorker(ModelWorkerBase):
    def __init__(
        self,
        controller_addr: str,
        worker_addr: str,
        worker_id: str,
        model_path: str,
        model_names: List[str],
        limit_worker_concurrency: int,
        conv_template: str = None,  # type: ignore
    ):
        super().__init__(
            controller_addr,
            worker_addr,
            worker_id,
            model_path,
            model_names,
            limit_worker_concurrency,
            conv_template,
            model_type="AutoModelForCausalLM",
        )

        self.stop_words_ids = []

        self.stop = [
            self.tokenizer.decode(skip_word) for skip_word in self.stop_words_ids
        ]
        logger.warning(f"{model_names[0]} stop words: {self.stop}")

        # from https://github.com/xorbitsai/inference/blob/c70ea74fa820a613f8d577047ef1818da20a96b3/xinference/model/llm/llm_family_modelscope.json
        self.tool_parser = ToolParserManager.module_dict["qwen2_5"](
            tokenizer=self.tokenizer
        )

    async def generate_stream_gate(self, params):
        self.call_ct += 1
        try:
            tools = params.get("tools", None)
            # --------------- add extra parameters ------------------------
            params["stop"].extend(self.stop)
            params["stop_words_ids"] = self.stop_words_ids
            # --------------- add extra parameters ------------------------
            full_text = ""
            ret = {}
            async for ret in self.backend.stream_chat(params=params):
                full_text += ret.get("text", "")
                yield json.dumps(ret).encode() + b"\0"
            # ------ add tool_calls ------
            yield tool_parser(
                full_text=full_text, tool_parser=self.tool_parser, tools=tools, ret=ret
            )
            # ------ add tool_calls ------
        except torch.cuda.OutOfMemoryError as e:
            ret = {
                "text": f"{SERVER_ERROR_MSG}\n\n({e})",
                "error_code": ErrorCode.CUDA_OUT_OF_MEMORY,
            }
            yield json.dumps(ret).encode() + b"\0"
        except (ValueError, RuntimeError) as e:
            traceback.print_exc()
            logger.info(e)
            ret = {
                "text": f"{SERVER_ERROR_MSG}\n\n({e})",
                "error_code": ErrorCode.INTERNAL_ERROR,
            }
            yield json.dumps(ret).encode() + b"\0"


if __name__ == "__main__":
    AutoWorker.run()
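
For orientation, a hedged sketch (not part of the commit) of how a caller could consume the b"\0"-delimited JSON chunks that generate_stream_gate yields; fake_stream is a stand-in for the real generator:

import asyncio
import json


async def consume(gen):
    # Each chunk is one JSON object terminated by a b"\0" separator.
    async for chunk in gen:
        ret = json.loads(chunk.rstrip(b"\0"))
        print(ret.get("text", ""), end="", flush=True)


async def fake_stream():
    # Stand-in for AutoWorker.generate_stream_gate(params).
    for piece in ("Hel", "lo"):
        yield json.dumps({"text": piece, "error_code": 0}).encode() + b"\0"


asyncio.run(consume(fake_stream()))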

gpt_server/model_worker/baichuan.py

Lines changed: 1 addition & 72 deletions
@@ -6,57 +6,6 @@
 from gpt_server.model_worker.base.model_worker_base import ModelWorkerBase


-def build_chat_input(tokenizer, messages: List[dict], max_new_tokens: int = 0):
-    user_token_id = 195
-    assistant_token_id = 196
-
-    def _parse_messages(messages, split_role="user"):
-        system, rounds = "", []
-        round = []
-        for i, message in enumerate(messages):
-            if message["role"] == "system":
-                assert i == 0
-                system = message["content"]
-                continue
-            if message["role"] == split_role and round:
-                rounds.append(round)
-                round = []
-            round.append(message)
-        if round:
-            rounds.append(round)
-        return system, rounds
-
-    max_new_tokens = max_new_tokens or 2048
-    max_input_tokens = 4096 - max_new_tokens
-    system, rounds = _parse_messages(messages, split_role="user")
-    system_tokens = tokenizer.encode(system)
-    max_history_tokens = max_input_tokens - len(system_tokens)
-
-    history_tokens = []
-    for round in rounds[::-1]:
-        round_tokens = []
-        for message in round:
-            if message["role"] == "user":
-                round_tokens.append(user_token_id)
-            else:
-                round_tokens.append(assistant_token_id)
-            round_tokens.extend(tokenizer.encode(message["content"]))
-        if (
-            len(history_tokens) == 0
-            or len(history_tokens) + len(round_tokens) <= max_history_tokens
-        ):
-            history_tokens = round_tokens + history_tokens  # concat left
-        if len(history_tokens) < max_history_tokens:
-            continue
-        break
-
-    input_tokens = system_tokens + history_tokens
-    if messages[-1]["role"] != "assistant":
-        input_tokens.append(assistant_token_id)
-    input_tokens = input_tokens[-max_input_tokens:]  # truncate left
-    return torch.LongTensor([input_tokens])
-
-
 class BaiChuanWorker(ModelWorkerBase):
     def __init__(
         self,

@@ -78,9 +27,7 @@ def __init__(
             conv_template,
             model_type="AutoModelForCausalLM",
         )
-        self.stop_words_ids = [
-            2,  # </s>
-        ]
+        self.stop_words_ids = []
         self.stop = [
             self.tokenizer.decode(skip_word) for skip_word in self.stop_words_ids
         ]

@@ -89,29 +36,11 @@ def __init__(
     async def generate_stream_gate(self, params):
         self.call_ct += 1
         try:
-            messages = params["messages"]
-            if isinstance(messages, list):
-                task = "chat"
-            elif isinstance(messages, str):
-                task = "completion"
-            if task == "chat":
-                input_ids = build_chat_input(
-                    tokenizer=self.tokenizer, messages=messages
-                )
-                text = self.tokenizer.decode(input_ids.tolist()[0])
-            elif task == "completion":
-                text = messages
-                input_ids = self.tokenizer([text], return_tensors="pt").input_ids
-
-            params["messages"] = messages
-            params["prompt"] = text
             params["stop"].extend(self.stop)
             params["stop_words_ids"] = self.stop_words_ids
-            params["input_ids"] = input_ids

             async for ret in self.backend.stream_chat(params=params):
                 response = ret["text"]
-
                 yield json.dumps(ret).encode() + b"\0"

         except torch.cuda.OutOfMemoryError as e:
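
For reference, the deleted build_chat_input budgeted tokens before filling in history; a small worked sketch of that arithmetic (the 4096 and 2048 constants come from the deleted code, the other numbers are illustrative):

max_new_tokens = 512                                   # the deleted code used `max_new_tokens or 2048`
max_input_tokens = 4096 - max_new_tokens               # 3584 tokens left for the prompt
system_tokens = 84                                     # assume the system prompt encodes to 84 tokens
max_history_tokens = max_input_tokens - system_tokens  # 3500 tokens for chat history
# Rounds were then prepended newest-first until this budget was exhausted,
# and the final input was left-truncated to max_input_tokens.
print(max_input_tokens, max_history_tokens)            # 3584 3500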

gpt_server/model_worker/base/model_worker_base.py

Lines changed: 9 additions & 4 deletions
@@ -137,19 +137,20 @@ def __init__(

     def preprocess_params(self, params: dict) -> dict:
         """Preprocess params"""
-        messages = params["messages"]
+        # ---------- add chat_template info ----------
         params["chat_template"] = self.chat_template
+        # ---------- add multimodal info ----------
         if self.vision_config:
             params["multimodal"] = True
             params["chat_template"] = self.vl_chat_template
+        # ---------- if a str is passed in, convert it to messages ----------
+        messages = params["messages"]
         if isinstance(messages, str):
             messages = [{"role": "user", "content": messages}]
             params["messages"] = messages
-        # 1. handle tools, supporting tool_choice control
+        # ---------- handle tools, supporting tool_choice control ----------
         tool_choice = params.get("tool_choice", "none")
         tools = params.get("tools", None)
-        if self.chat_template:
-            params["chat_template"] = self.chat_template
         params["extra_prompt"] = ""
         if tools:
             if tool_choice == "none":

@@ -404,6 +405,8 @@ async def api_generate_stream(request: Request):
     params["request"] = request
     params.pop("prompt")
     logger.debug(f"params {params}")
+    # preprocess params
+    params = worker.preprocess_params(params)
     generator = worker.generate_stream_gate(params)
     background_tasks = create_background_tasks(request_id)
     return StreamingResponse(generator, background=background_tasks)

@@ -450,6 +453,8 @@ async def api_generate(request: Request):
     params["request"] = request
     params.pop("prompt")
     logger.debug(f"params {params}")
+    # preprocess params
+    params = worker.preprocess_params(params)
     output = await worker.generate_gate(params)
     release_worker_semaphore()

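A simplified, standalone sketch (not the class method itself) of the string-to-messages normalization that preprocess_params now applies before the backend sees the request:

def normalize_messages(params: dict) -> dict:
    # Mirrors the str -> messages branch added above.
    messages = params["messages"]
    if isinstance(messages, str):
        params["messages"] = [{"role": "user", "content": messages}]
    return params


print(normalize_messages({"messages": "Hello"}))
# {'messages': [{'role': 'user', 'content': 'Hello'}]}
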
gpt_server/model_worker/chatglm.py

Lines changed: 0 additions & 26 deletions
@@ -44,46 +44,20 @@ def __init__(
             model_type="AutoModel",
             multimodal=False,
         )
-        self.chat_template = MODELS.module_dict["glm4"]()
         self.tool_parser = ToolParserManager.module_dict["glm"](
             tokenizer=self.tokenizer
         )
         self.stop_words_ids = []
-
         self.stop = ["Observation:"]
         logger.warning(f"{model_names[0]} stop words: {self.stop}")

     async def generate_stream_gate(self, params):
         self.call_ct += 1
         try:
-            messages = params.get("messages", [])
             tools = params.get("tools", None)
-            tool_choice = params.get("tool_choice", "none")
-            if tool_choice == "none":
-                tools = None
-            elif tool_choice == "auto" or tool_choice == "required":
-                pass
-            elif isinstance(tool_choice, dict):
-                tools = pop_matching_tool(tools=tools, tool_choice=tool_choice)
-            if not self.vision_config:
-                if isinstance(messages, list):
-                    text = await asyncio.to_thread(
-                        self.chat_template.messages2prompt, messages, True, tools
-                    )
-                elif isinstance(messages, str):
-                    text = messages
-                # input_ids = self.tokenizer([text], return_tensors="pt").input_ids

-                # text = self.tokenizer.decode(input_ids.tolist()[0])
-                params["prompt"] = text
-                # params["input_ids"] = input_ids
-            else:  # multimodal model
-                params["multimodal"] = True
-            # --------------- add extra parameters ------------------------
-            params["messages"] = messages
             params["stop"].extend(self.stop)
             params["stop_words_ids"] = self.stop_words_ids
-            # --------------- add extra parameters ------------------------
             full_text = ""
             ret = {}
             async for ret in self.backend.stream_chat(params=params):
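
The tool_choice handling deleted here is now centralized in the base class's preprocess_params. A simplified standalone sketch of that filtering logic; filter_tools and its dict branch are illustrative stand-ins for pop_matching_tool, not the actual helper:

def filter_tools(tools, tool_choice):
    # "none" disables tools, "auto"/"required" keep them all,
    # and a dict keeps only the named function.
    if tool_choice == "none":
        return None
    if tool_choice in ("auto", "required"):
        return tools
    if isinstance(tool_choice, dict):
        wanted = tool_choice.get("function", {}).get("name")
        return [t for t in (tools or []) if t.get("function", {}).get("name") == wanted]
    return tools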

gpt_server/model_worker/deepseek.py

Lines changed: 1 addition & 17 deletions
@@ -28,10 +28,7 @@ def __init__(
             model_type="AutoModelForCausalLM",
         )

-        self.stop_words_ids = [
-            # 32013,  # bos <|begin▁of▁sentence|>
-            # 32021,  # eos <|EOT|>
-        ]
+        self.stop_words_ids = []

         self.stop = [
             self.tokenizer.decode(skip_word) for skip_word in self.stop_words_ids

@@ -41,19 +38,6 @@ def __init__(
     async def generate_stream_gate(self, params):
         self.call_ct += 1
         try:
-            messages = params["messages"]
-            if not self.vision_config:
-                if isinstance(messages, list):
-                    text = self.tokenizer.apply_chat_template(
-                        conversation=messages,
-                        tokenize=False,
-                        add_generation_prompt=True,
-                    )
-                elif isinstance(messages, str):
-                    text = messages
-            params["prompt"] = text
-            # --------------- add extra parameters ------------------------
-            params["messages"] = messages
             params["stop"].extend(self.stop)
             params["stop_words_ids"] = self.stop_words_ids
             # --------------- add extra parameters ------------------------
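
The deleted block rendered the prompt locally with the tokenizer's chat template; that rendering now happens in the backend. For reference, a minimal standalone sketch of the old call (the checkpoint name is only an example):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-llm-7b-chat")  # example checkpoint
messages = [{"role": "user", "content": "Write a haiku about GPUs."}]
prompt = tokenizer.apply_chat_template(
    conversation=messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)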

gpt_server/model_worker/gemma.py

Lines changed: 2 additions & 18 deletions
@@ -6,6 +6,7 @@
 from gpt_server.model_worker.base.model_worker_base import ModelWorkerBase
 import traceback

+
 class GemmaWorker(ModelWorkerBase):
     def __init__(
         self,

@@ -27,7 +28,7 @@ def __init__(
             conv_template,
             model_type="AutoModelForCausalLM",
         )
-        self.stop_words_ids = [1, 106]
+        self.stop_words_ids = []
         self.stop = [
             self.tokenizer.decode(skip_word) for skip_word in self.stop_words_ids
         ]

@@ -36,28 +37,11 @@ def __init__(
     async def generate_stream_gate(self, params):
         self.call_ct += 1
         try:
-            messages = params["messages"]
-            if isinstance(messages, list):
-                task = "chat"
-            elif isinstance(messages, str):
-                task = "completion"
-            if task == "chat":
-                text = self.tokenizer.apply_chat_template(
-                    conversation=messages,
-                    tokenize=True,
-                    add_generation_prompt=True,
-                )
-            elif task == "completion":
-                text = messages
-
-            params["messages"] = messages
-            params["prompt"] = text
             params["stop"].extend(self.stop)
             params["stop_words_ids"] = self.stop_words_ids

             async for ret in self.backend.stream_chat(params=params):
                 response = ret["text"]
-
                 yield json.dumps(ret).encode() + b"\0"

         except torch.cuda.OutOfMemoryError as e:
