Merge pull request #32 from javierdlrm/release-0.14.0-tools-and-chat-template

[HWORKS-1846] Cherry-pick tools and chat template
SirOibaf authored Dec 10, 2024
2 parents 0144ddd + 903101d commit 38bfcef
Showing 13 changed files with 804 additions and 607 deletions.
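The gist of the change: `apply_chat_template` now accepts an optional `chat_template` override and an optional list of `tools`, both forwarded to the tokenizer. A minimal sketch of what that enables at the transformers level (the model name and template string here are illustrative, not taken from this PR):

```python
from transformers import AutoTokenizer

# Any tokenizer shipped with transformers >= 4.45 accepts these arguments;
# bloom-560m mirrors the fixture used in the tests below.
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")

messages = [{"role": "user", "content": "weather in Ithaca, NY"}]
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather",
        },
    }
]

# A caller-supplied template overrides any template bundled with the
# tokenizer, and `tools` becomes a Jinja variable the template can loop over.
prompt = tokenizer.apply_chat_template(
    messages,
    chat_template=(
        "{% for message in messages %}{{ message.content }}{% endfor %}"
        "{% if tools %} Tools:{% for tool in tools %} {{ tool.function.name }}{% endfor %}{% endif %}"
    ),
    tokenize=False,
    add_generation_prompt=True,
    tools=tools,
)
print(prompt)  # roughly: "weather in Ithaca, NY Tools: get_current_weather"
```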
2 changes: 1 addition & 1 deletion python/huggingface_server.Dockerfile
@@ -9,7 +9,7 @@ ARG POETRY_HOME=/opt/poetry
 ARG POETRY_VERSION=1.8.3
 
 # Install vllm
-ARG VLLM_VERSION=0.6.1.post2
+ARG VLLM_VERSION=0.6.2
 
 RUN apt-get update -y && apt-get install gcc python3.10-venv python3-dev -y && apt-get clean && \
     rm -rf /var/lib/apt/lists/*
@@ -26,6 +26,7 @@
     TypedDict,
     Union,
     cast,
+    List,
 )
 
 import torch
@@ -36,6 +37,7 @@
     CompletionRequest,
     OpenAIChatAdapterModel,
 )
+from kserve.protocol.rest.openai.types.openapi import ChatCompletionTool
 from kserve.protocol.rest.openai.types import (
     ChatCompletionRequestMessage,
     Completion,
@@ -384,7 +386,10 @@ def build_generation_config(
         return GenerationConfig(**kwargs)
 
     def apply_chat_template(
-        self, messages: Iterable[ChatCompletionRequestMessage]
+        self,
+        messages: Iterable[ChatCompletionRequestMessage],
+        chat_template: Optional[str] = None,
+        tools: Optional[List[ChatCompletionTool]] = None,
     ) -> ChatPrompt:
         """
         Given a list of chat completion messages, convert them to a prompt.
@@ -394,8 +399,10 @@
                 str,
                 self._tokenizer.apply_chat_template(
                     [m.model_dump() for m in messages],
+                    chat_template=chat_template,
                     tokenize=False,
                     add_generation_prompt=True,
+                    tools=[tool.model_dump() for tool in tools] if tools else None,
                 ),
             )
         )
@@ -50,6 +50,7 @@
     CreateCompletionRequest,
     CreateCompletionResponse as Completion,
     Logprobs,
+    ChatCompletionTool,
 )
 from kserve.protocol.rest.openai.errors import OpenAIError, create_error_response
 from kserve.protocol.rest.openai import ChatCompletionRequestMessage, CompletionRequest
@@ -89,7 +90,6 @@ def logit_bias_logits_processor(
 
 
 class OpenAIServingCompletion:
-
     def __init__(self, engine: AsyncLLMEngine, request_logger: RequestLogger = None):
         self.engine = engine
 
@@ -358,10 +358,16 @@ def request_output_to_completion_response(
 
     def apply_chat_template(
         self,
-        messages: Iterable[ChatCompletionRequestMessage,],
+        messages: Iterable[ChatCompletionRequestMessage],
+        chat_template: Optional[str] = None,
+        tools: Optional[List[ChatCompletionTool]] = None,
     ):
         return self.tokenizer.apply_chat_template(
-            conversation=messages, tokenize=False, add_generation_prompt=True
+            conversation=messages,
+            chat_template=chat_template,
+            tokenize=False,
+            add_generation_prompt=True,
+            tools=[tool.model_dump() for tool in tools] if tools else None,
         )
 
     async def _post_init(self):
11 changes: 8 additions & 3 deletions python/huggingfaceserver/huggingfaceserver/vllm/vllm_model.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import AsyncIterator, Iterable, Optional, Union
+from typing import AsyncIterator, Iterable, Optional, Union, List
 
 import torch
 from vllm.entrypoints.logger import RequestLogger
@@ -26,6 +26,7 @@
     CompletionRequest,
     OpenAIChatAdapterModel,
 )
+from kserve.protocol.rest.openai.types.openapi import ChatCompletionTool
 from kserve.protocol.rest.openai.types import Completion
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm import AsyncEngineArgs
@@ -68,13 +69,17 @@ async def healthy(self) -> bool:
 
     def apply_chat_template(
         self,
-        messages: Iterable[ChatCompletionRequestMessage,],
+        messages: Iterable[ChatCompletionRequestMessage],
+        chat_template: Optional[str] = None,
+        tools: Optional[List[ChatCompletionTool]] = None,
     ) -> ChatPrompt:
         """
         Given a list of chat completion messages, convert them to a prompt.
         """
         return ChatPrompt(
-            prompt=self.openai_serving_completion.apply_chat_template(messages)
+            prompt=self.openai_serving_completion.apply_chat_template(
+                messages, chat_template, tools
+            )
         )
 
     async def create_completion(
1,137 changes: 554 additions & 583 deletions python/huggingfaceserver/poetry.lock

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions python/huggingfaceserver/pyproject.toml
@@ -12,10 +12,11 @@ packages = [
 [tool.poetry.dependencies]
 python = ">=3.9,<3.12"
 kserve = { path = "../kserve", extras = ["storage"], develop = true }
-transformers = "~4.43.3"
+transformers = ">=4.45.0"
 accelerate = "~0.32.0"
 torch = "~2.4.0"
-vllm = { version = "^0.6.1.post2", optional = true }
+vllm = { version = "^0.6.2", optional = true }
+setuptools = {version = ">=70.0.0", python = "3.12"} # setuptools is not part of python 3.12
 
 [tool.poetry.extras]
 vllm = [
64 changes: 62 additions & 2 deletions python/huggingfaceserver/tests/test_model.py
@@ -388,6 +388,9 @@ async def test_bloom_chat_completion(bloom_model: HuggingfaceGenerativeModel):
         messages=messages,
         stream=False,
         max_tokens=20,
+        chat_template="{% for message in messages %}"
+        "{{ message.content }}{{ eos_token }}"
+        "{% endfor %}",
     )
     request = ChatCompletionRequest(params=params, context={})
     response = await bloom_model.create_chat_completion(request)
@@ -415,6 +418,9 @@ async def test_bloom_chat_completion_streaming(bloom_model: HuggingfaceGenerativeModel):
         messages=messages,
         stream=True,
         max_tokens=20,
+        chat_template="{% for message in messages %}"
+        "{{ message.content }}{{ eos_token }}"
+        "{% endfor %}",
     )
     request = ChatCompletionRequest(params=params, context={})
     response = await bloom_model.create_chat_completion(request)
@@ -497,6 +503,60 @@ async def test_input_padding_with_pad_token_not_specified(
     response = await openai_gpt_model.create_completion(request)
     assert (
         response.choices[0].text
-        == "west, and the sun sets in the west. \n the sun rises in the"
+        == "west , and the sun sets in the west . \n the sun rises in the"
     )
-    assert "a member of the royal family." in response.choices[1].text
+    assert "a member of the royal family ." in response.choices[1].text
+
+
+@pytest.mark.asyncio
+async def test_tools_chat_completion(bloom_model: HuggingfaceGenerativeModel):
+    messages = [
+        {
+            "role": "system",
+            "content": "You are a friendly chatbot whose purpose is to tell me what the weather is.",
+        },
+        {
+            "role": "user",
+            "content": "weather in Ithaca, NY",
+        },
+    ]
+
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_current_weather",
+                "description": "Get the current weather",
+                "parameters": {
+                    "type": "dict",
+                    "properties": {
+                        "location": {
+                            "type": "string",
+                            "description": "The city and state, e.g. San Francisco, CA",
+                        },
+                        "format": {
+                            "type": "string",
+                            "enum": ["celsius", "fahrenheit"],
+                            "description": "The temperature unit to use. Infer this from the users location.",
+                        },
+                    },
+                    "required": ["location", "format"],
+                },
+            },
+        }
+    ]
+    params = CreateChatCompletionRequest(
+        model="bloom-560m",
+        messages=messages,
+        stream=False,
+        max_tokens=100,
+        tools=tools,
+        tool_choice="auto",
+        chat_template="{% for message in messages %}"
+        "{{ message.content }} You have these tools: {% for tool in tools %} {{ eos_token }}"
+        "{% endfor %}{% endfor %}",
+    )
+    request = ChatCompletionRequest(params=params, context={})
+    response = await bloom_model.create_chat_completion(request)
+
+    assert response.choices[0].message.content
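For context, a hedged sketch of how a client might exercise the new fields once the server is deployed; the base URL is an assumption, and `chat_template` rides in `extra_body` because it is a server-side extension rather than a standard OpenAI field:

```python
from openai import OpenAI

# Assumed local deployment of huggingfaceserver; adjust base_url as needed.
client = OpenAI(base_url="http://localhost:8080/openai/v1", api_key="unused")

response = client.chat.completions.create(
    model="bloom-560m",
    messages=[{"role": "user", "content": "weather in Ithaca, NY"}],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "get_current_weather",
                "description": "Get the current weather",
                "parameters": {
                    "type": "object",
                    "properties": {"location": {"type": "string"}},
                    "required": ["location"],
                },
            },
        }
    ],
    tool_choice="auto",
    # Non-standard field, forwarded verbatim in the request body.
    extra_body={
        "chat_template": "{% for message in messages %}"
        "{{ message.content }}{{ eos_token }}"
        "{% endfor %}"
    },
)
print(response.choices[0].message.content)
```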