diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/__main__.py b/__main__.py new file mode 100644 index 0000000..2fbd575 --- /dev/null +++ b/__main__.py @@ -0,0 +1,98 @@ +"""Example FastAPI server for llama.cpp. + +To run this example: + +```bash +pip install fastapi uvicorn sse-starlette pydantic-settings +export MODEL=../models/7B/... +``` + +Then run: +``` +uvicorn llama_cpp.server.app:create_app --reload +``` + +or + +``` +python3 -m llama_cpp.server +``` + +Then visit http://localhost:8000/docs to see the interactive API docs. + +""" +from __future__ import annotations + +import os +import sys +import argparse + +import uvicorn + +from app import create_app +from llama_cpp.server.settings import ( + Settings, + ServerSettings, + ModelSettings, + ConfigFileSettings, +) +from llama_cpp.server.cli import add_args_from_model, parse_model_from_args + + +def main(): + description = "🦙 Llama.cpp python server. Host your own LLMs!🚀" + parser = argparse.ArgumentParser(description=description) + + add_args_from_model(parser, Settings) + parser.add_argument( + "--config_file", + type=str, + help="Path to a config file to load.", + default="/home/test/api_server.cfg", + ) + server_settings: ServerSettings | None = None + model_settings: list[ModelSettings] = [] + args = parser.parse_args() + try: + # Load server settings from config_file if provided + config_file = os.environ.get("CONFIG_FILE", args.config_file) + if config_file: + if not os.path.exists(config_file): + raise ValueError(f"Config file {config_file} not found!") + with open(config_file, "rb") as f: + # Check if yaml file + if config_file.endswith(".yaml") or config_file.endswith(".yml"): + import yaml + import json + + config_file_settings = ConfigFileSettings.model_validate_json( + json.dumps(yaml.safe_load(f)) + ) + else: + config_file_settings = ConfigFileSettings.model_validate_json(f.read()) + server_settings = ServerSettings.model_validate(config_file_settings) + model_settings = config_file_settings.models + else: + server_settings = parse_model_from_args(ServerSettings, args) + model_settings = [parse_model_from_args(ModelSettings, args)] + except Exception as e: + print(e, file=sys.stderr) + parser.print_help() + sys.exit(1) + assert server_settings is not None + assert model_settings is not None + app = create_app( + server_settings=server_settings, + model_settings=model_settings, + ) + uvicorn.run( + app, + host=os.getenv("HOST", server_settings.host), + port=int(os.getenv("PORT", server_settings.port)), + ssl_keyfile=server_settings.ssl_keyfile, + ssl_certfile=server_settings.ssl_certfile, + ) + + +if __name__ == "__main__": + main() diff --git a/app.py b/app.py new file mode 100644 index 0000000..ad11bd8 --- /dev/null +++ b/app.py @@ -0,0 +1,635 @@ +from __future__ import annotations + +import os +import json + +from threading import Lock +from functools import partial +from typing import Iterator, List, Optional, Union, Dict +import uuid +import llama_cpp +import chatglm +import extends +import anyio +from anyio.streams.memory import MemoryObjectSendStream +from starlette.concurrency import run_in_threadpool, iterate_in_threadpool +from fastapi import Depends, FastAPI, APIRouter, Request, HTTPException, status, Body +from fastapi.middleware import Middleware +from fastapi.middleware.cors import CORSMiddleware +from fastapi.security import HTTPBearer +from sse_starlette.sse import EventSourceResponse +from starlette_context.plugins import RequestIdPlugin # type: ignore +from 
starlette_context.middleware import RawContextMiddleware + +from model import ( + LlamaProxy, +) +from llama_cpp.server.settings import ( + ConfigFileSettings, + Settings, + ModelSettings, + ServerSettings, +) +from llama_cpp.server.types import ( + CreateCompletionRequest, + CreateEmbeddingRequest, + CreateChatCompletionRequest, + ModelList, + TokenizeInputRequest, + TokenizeInputResponse, + TokenizeInputCountResponse, + DetokenizeInputRequest, + DetokenizeInputResponse, +) + +from llama_cpp.llama_types import ( + ChatCompletionStreamResponseChoice, + ChatCompletionStreamResponseDelta, + ChatCompletionStreamResponseDeltaEmpty, +) + +from llama_cpp.server.errors import RouteErrorHandler + + +router = APIRouter(route_class=RouteErrorHandler) + +_server_settings: Optional[ServerSettings] = None + + +def set_server_settings(server_settings: ServerSettings): + global _server_settings + _server_settings = server_settings + + +def get_server_settings(): + yield _server_settings + + +_llama_proxy: Optional[LlamaProxy] = None + +llama_outer_lock = Lock() +llama_inner_lock = Lock() + + +def set_llama_proxy(model_settings: List[ModelSettings]): + global _llama_proxy + _llama_proxy = LlamaProxy(models=model_settings) + + +def get_llama_proxy(): + # NOTE: This double lock allows the currently streaming llama model to + # check if any other requests are pending in the same thread and cancel + # the stream if so. + llama_outer_lock.acquire() + release_outer_lock = True + try: + llama_inner_lock.acquire() + try: + llama_outer_lock.release() + release_outer_lock = False + yield _llama_proxy + finally: + llama_inner_lock.release() + finally: + if release_outer_lock: + llama_outer_lock.release() + + +_ping_message_factory = None + + +def set_ping_message_factory(factory): + global _ping_message_factory + _ping_message_factory = factory + + +def create_app( + settings: Settings | None = None, + server_settings: ServerSettings | None = None, + model_settings: List[ModelSettings] | None = None, +): + config_file = os.environ.get("CONFIG_FILE", None) + if config_file is not None: + if not os.path.exists(config_file): + raise ValueError(f"Config file {config_file} not found!") + with open(config_file, "rb") as f: + # Check if yaml file + if config_file.endswith(".yaml") or config_file.endswith(".yml"): + import yaml + + config_file_settings = ConfigFileSettings.model_validate_json( + json.dumps(yaml.safe_load(f)) + ) + else: + config_file_settings = ConfigFileSettings.model_validate_json(f.read()) + server_settings = ServerSettings.model_validate(config_file_settings) + model_settings = config_file_settings.models + + if server_settings is None and model_settings is None: + if settings is None: + settings = Settings() + server_settings = ServerSettings.model_validate(settings) + model_settings = [ModelSettings.model_validate(settings)] + + assert ( + server_settings is not None and model_settings is not None + ), "server_settings and model_settings must be provided together" + + set_server_settings(server_settings) + middleware = [Middleware(RawContextMiddleware, plugins=(RequestIdPlugin(),))] + app = FastAPI( + middleware=middleware, + title="🦙 llama.cpp Python API", + version=llama_cpp.__version__, + ) + app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + app.include_router(router) + + assert model_settings is not None + set_llama_proxy(model_settings=model_settings) + + if server_settings.disable_ping_events: + 
set_ping_message_factory(lambda: bytes()) + + return app + + +async def get_event_publisher( + request: Request, + inner_send_chan: MemoryObjectSendStream, + iterator: Iterator, +): + async with inner_send_chan: + try: + async for chunk in iterate_in_threadpool(iterator): + await inner_send_chan.send(dict(data=json.dumps(chunk))) + if await request.is_disconnected(): + raise anyio.get_cancelled_exc_class()() + if ( + next(get_server_settings()).interrupt_requests + and llama_outer_lock.locked() + ): + await inner_send_chan.send(dict(data="[DONE]")) + raise anyio.get_cancelled_exc_class()() + await inner_send_chan.send(dict(data="[DONE]")) + except anyio.get_cancelled_exc_class() as e: + print("disconnected") + with anyio.move_on_after(1, shield=True): + print(f"Disconnected from client (via refresh/close) {request.client}") + raise e + + +def _logit_bias_tokens_to_input_ids( + llama: llama_cpp.Llama, + logit_bias: Dict[str, float], +) -> Dict[str, float]: + to_bias: Dict[str, float] = {} + for token, score in logit_bias.items(): + token = token.encode("utf-8") + for input_id in llama.tokenize(token, add_bos=False, special=True): + to_bias[str(input_id)] = score + return to_bias + + +# Setup Bearer authentication scheme +bearer_scheme = HTTPBearer(auto_error=False) + + +async def authenticate( + settings: Settings = Depends(get_server_settings), + authorization: Optional[str] = Depends(bearer_scheme), +): + # Skip API key check if it's not set in settings + if settings.api_key is None: + return True + + # check bearer credentials against the api_key + if authorization and authorization.credentials == settings.api_key: + # api key is valid + return authorization.credentials + + # raise http error 401 + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid API key", + ) + + +openai_v1_tag = "OpenAI V1" + + +@router.post( + "/v1/completions", + summary="Completion", + dependencies=[Depends(authenticate)], + response_model=Union[ + llama_cpp.CreateCompletionResponse, + str, + ], + responses={ + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "anyOf": [ + {"$ref": "#/components/schemas/CreateCompletionResponse"} + ], + "title": "Completion response, when stream=False", + } + }, + "text/event-stream": { + "schema": { + "type": "string", + "title": "Server Side Streaming response, when stream=True. " + + "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format", # noqa: E501 + "example": """data: {... see CreateCompletionResponse ...} \\n\\n data: ... \\n\\n ... 
data: [DONE]""", + } + }, + }, + } + }, + tags=[openai_v1_tag], +) +@router.post( + "/v1/engines/copilot-codex/completions", + include_in_schema=False, + dependencies=[Depends(authenticate)], + tags=[openai_v1_tag], +) +async def create_completion( + request: Request, + body: CreateCompletionRequest, + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +) -> llama_cpp.Completion: + if isinstance(body.prompt, list): + assert len(body.prompt) <= 1 + body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" + + llama = llama_proxy( + body.model + if request.url.path != "/v1/engines/copilot-codex/completions" + else "copilot-codex" + ) + + exclude = { + "n", + "best_of", + "logit_bias_type", + "user", + } + kwargs = body.model_dump(exclude=exclude) + + if body.logit_bias is not None: + kwargs["logit_bias"] = ( + _logit_bias_tokens_to_input_ids(llama, body.logit_bias) + if body.logit_bias_type == "tokens" + else body.logit_bias + ) + + if body.grammar is not None: + kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) + + iterator_or_completion: Union[ + llama_cpp.CreateCompletionResponse, + Iterator[llama_cpp.CreateCompletionStreamResponse], + ] = await run_in_threadpool(llama, **kwargs) + + if isinstance(iterator_or_completion, Iterator): + # EAFP: It's easier to ask for forgiveness than permission + + first_response = await run_in_threadpool(next, iterator_or_completion) + + # If no exception was raised from first_response, we can assume that + # the iterator is valid and we can use it to stream the response. + def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: + yield first_response + yield from iterator_or_completion + + send_chan, recv_chan = anyio.create_memory_object_stream(10) + return EventSourceResponse( + recv_chan, + data_sender_callable=partial( # type: ignore + get_event_publisher, + request=request, + inner_send_chan=send_chan, + iterator=iterator(), + ), + sep="\n", + ping_message_factory=_ping_message_factory, + ) + else: + return iterator_or_completion + + +@router.post( + "/v1/embeddings", + summary="Embedding", + dependencies=[Depends(authenticate)], + tags=[openai_v1_tag], +) +async def create_embedding( + request: CreateEmbeddingRequest, + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +): + return await run_in_threadpool( + llama_proxy(request.model).create_embedding, + **request.model_dump(exclude={"user"}), + ) + + +@router.post( + "/v1/chat/completions", + summary="Chat", + dependencies=[Depends(authenticate)], + response_model=Union[llama_cpp.ChatCompletion, str], + responses={ + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "anyOf": [ + { + "$ref": "#/components/schemas/CreateChatCompletionResponse" + } + ], + "title": "Completion response, when stream=False", + } + }, + "text/event-stream": { + "schema": { + "type": "string", + "title": "Server Side Streaming response, when stream=True" + + "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format", # noqa: E501 + "example": """data: {... see CreateChatCompletionResponse ...} \\n\\n data: ... \\n\\n ... 
data: [DONE]""", + } + }, + }, + } + }, + tags=[openai_v1_tag], +) +async def create_chat_completion( + request: Request, + body: CreateChatCompletionRequest = Body( + openapi_examples={ + "normal": { + "summary": "Chat Completion", + "value": { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"}, + ], + }, + }, + "json_mode": { + "summary": "JSON Mode", + "value": { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Who won the world series in 2020"}, + ], + "response_format": {"type": "json_object"}, + }, + }, + "tool_calling": { + "summary": "Tool Calling", + "value": { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Extract Jason is 30 years old."}, + ], + "tools": [ + { + "type": "function", + "function": { + "name": "User", + "description": "User record", + "parameters": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "number"}, + }, + "required": ["name", "age"], + }, + }, + } + ], + "tool_choice": { + "type": "function", + "function": { + "name": "User", + }, + }, + }, + }, + "logprobs": { + "summary": "Logprobs", + "value": { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"}, + ], + "logprobs": True, + "top_logprobs": 10, + }, + }, + } + ), + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +) -> llama_cpp.ChatCompletion: + exclude = { + "n", + "logit_bias_type", + "user", + } + print(body) + kwargs = body.model_dump(exclude=exclude) + llama = llama_proxy(body.model) + if body.logit_bias is not None: + kwargs["logit_bias"] = ( + _logit_bias_tokens_to_input_ids(llama, body.logit_bias) + if body.logit_bias_type == "tokens" + else body.logit_bias + ) + + if body.grammar is not None: + kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) + + model_settings = llama_proxy._model_settings_dict[body.model] + model_chat_format = model_settings.chat_format + if model_chat_format == "chatglm3": + max_context_length = model_settings.n_ctx + num_threads = model_settings.n_threads + chatglm_pipeline = llama + if body.stream: + iterator = chatglm.stream_chat( + chatglm_pipeline, body, max_context_length, num_threads + ) + send_chan, recv_chan = anyio.create_memory_object_stream(10) + return EventSourceResponse( + recv_chan, + data_sender_callable=partial( # type: ignore + get_event_publisher, + request=request, + inner_send_chan=send_chan, + iterator=iterator, + ), + sep="\n", + ping_message_factory=_ping_message_factory, + ) + else: + return chatglm.create_chat_completion( + chatglm_pipeline, body, max_context_length, num_threads + ) + if model_chat_format == "functionary-v2" and body.stream: + iterator = extends.functionary_stream_chat( body, llama) + send_chan, recv_chan = anyio.create_memory_object_stream(10) + return EventSourceResponse( + recv_chan, + data_sender_callable=partial( # type: ignore + get_event_publisher, + request=request, + inner_send_chan=send_chan, + iterator=iterator, + ), + sep="\n", + ping_message_factory=_ping_message_factory, + ) + + elif model_chat_format == "openfunctions": + if body.stream: + iterator = extends.openfunction_stream_chat( body, llama) + send_chan, recv_chan = 
anyio.create_memory_object_stream(10) + return EventSourceResponse( + recv_chan, + data_sender_callable=partial( # type: ignore + get_event_publisher, + request=request, + inner_send_chan=send_chan, + iterator=iterator, + ), + sep="\n", + ping_message_factory=_ping_message_factory, + ) + else: + return extends.handle_openfunction(body, llama) + + elif model_chat_format == "firefunction": + return extends.handle_firefunction(body, llama) + + else: + iterator_or_completion: Union[ + llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk] + ] = await run_in_threadpool(llama.create_chat_completion, **kwargs) + + if isinstance(iterator_or_completion, Iterator): + # EAFP: It's easier to ask for forgiveness than permission + first_response = await run_in_threadpool(next, iterator_or_completion) + + print(type(first_response)) + print(first_response) + + # If no exception was raised from first_response, we can assume that + # the iterator is valid and we can use it to stream the response. + def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]: + yield first_response + yield from iterator_or_completion + + send_chan, recv_chan = anyio.create_memory_object_stream(10) + return EventSourceResponse( + recv_chan, + data_sender_callable=partial( # type: ignore + get_event_publisher, + request=request, + inner_send_chan=send_chan, + iterator=iterator(), + ), + sep="\n", + ping_message_factory=_ping_message_factory, + ) + else: + return iterator_or_completion + + +@router.get( + "/v1/models", + summary="Models", + dependencies=[Depends(authenticate)], + tags=[openai_v1_tag], +) +async def get_models( + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +) -> ModelList: + return { + "object": "list", + "data": [ + { + "id": model_alias, + "object": "model", + "owned_by": "me", + "permissions": [], + } + for model_alias in llama_proxy + ], + } + + +extras_tag = "Extras" + + +@router.post( + "/extras/tokenize", + summary="Tokenize", + dependencies=[Depends(authenticate)], + tags=[extras_tag], +) +async def tokenize( + body: TokenizeInputRequest, + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +) -> TokenizeInputResponse: + tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) + + return TokenizeInputResponse(tokens=tokens) + + +@router.post( + "/extras/tokenize/count", + summary="Tokenize Count", + dependencies=[Depends(authenticate)], + tags=[extras_tag], +) +async def count_query_tokens( + body: TokenizeInputRequest, + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +) -> TokenizeInputCountResponse: + tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) + + return TokenizeInputCountResponse(count=len(tokens)) + + +@router.post( + "/extras/detokenize", + summary="Detokenize", + dependencies=[Depends(authenticate)], + tags=[extras_tag], +) +async def detokenize( + body: DetokenizeInputRequest, + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +) -> DetokenizeInputResponse: + text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8") + + return DetokenizeInputResponse(text=text) diff --git a/chatglm.py b/chatglm.py new file mode 100644 index 0000000..fd0c01e --- /dev/null +++ b/chatglm.py @@ -0,0 +1,165 @@ +import asyncio +import json +import logging +import time +import os +import uuid +import chatglm_cpp +import llama_cpp +from sse_starlette.sse import EventSourceResponse +from pprint import pprint +from fastapi import HTTPException, status +from llama_cpp.server.types import ChatCompletionRequestMessage 
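+# chatglm.py adapts a chatglm_cpp.Pipeline to the server's OpenAI-compatible
+# /v1/chat/completions endpoint: _buid_msg() converts the request messages
+# (plus a generated system prompt when tools are supplied) into
+# chatglm_cpp.ChatMessage objects, stream_chat() yields ChatCompletionChunk
+# objects for SSE streaming, and create_chat_completion() builds a full
+# response with token usage and optional tool_calls.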
+from llama_cpp.llama_types import ( + ChatCompletionResponseChoice, + ChatCompletionMessageToolCall, + ChatCompletionStreamResponseChoice, + CreateChatCompletionStreamResponse, + CompletionUsage, + ChatCompletionStreamResponseDelta, + CreateChatCompletionResponse, + ChatCompletionResponseMessage, + ChatCompletionRequestAssistantMessage, + ChatCompletionMessageToolCallFunction, + ChatCompletionStreamResponseDeltaEmpty +) + + +def _buid_msg(body: ChatCompletionRequestMessage): + messages = [ + chatglm_cpp.ChatMessage(role=msg["role"], content=msg["content"]) + for msg in body.messages + ] + if body.tools: + system_content = ( + "Answer the following questions as best as you can. You have access to the following tools:\n" + + json.dumps(body.tools, indent=4) + ) + messages.insert( + 0, chatglm_cpp.ChatMessage( + role="system", content=system_content) + ) + return messages + + +def stream_chat(chatglm_pipeline: chatglm_cpp.Pipeline, body: ChatCompletionRequestMessage, max_context_length: int, num_threads: int): + max_tokens = 1024 + if body.max_tokens: + max_tokens = body.max_tokens + + for chunk in chatglm_pipeline.chat( + messages=_buid_msg(body), + max_length=max_tokens, + max_context_length=max_context_length, + do_sample=body.temperature > 0, + top_p=body.top_p, + temperature=body.temperature, + num_threads=num_threads, + stream=True, + ): + choices = [ChatCompletionStreamResponseChoice( + index=1, + delta=ChatCompletionStreamResponseDelta( + content=chunk.content, role=chunk.role), + finish_reason=None, + logprobs=None, + )] + chunk= llama_cpp.ChatCompletionChunk( + id="chatcmpl-" + uuid.uuid4().hex, + model=body.model, + object="chat.completion.chunk", + created=int(time.time()), + choices=choices, + ) + yield chunk + + +def create_chat_completion(chatglm_pipeline: chatglm_cpp.Pipeline, body: ChatCompletionRequestMessage, max_context_length: int, num_threads: int) -> CreateChatCompletionResponse: + def to_json_arguments(arguments): + def tool_call(**kwargs): + return kwargs + + try: + return json.dumps(eval(arguments, dict(tool_call=tool_call))) + except Exception: + return arguments + if not body.messages: + raise HTTPException(status.HTTP_400_BAD_REQUEST, "empty messages") + + max_tokens = 2048 + if body.max_tokens: + max_tokens = body.max_tokens + + messages = _buid_msg(body) + + output = chatglm_pipeline.chat( + messages=messages, + max_length=max_tokens, + max_context_length=max_context_length, + do_sample=body.temperature > 0, + top_p=body.top_p, + temperature=body.temperature, + num_threads=num_threads, + ) + logging.info( + f'prompt: "{messages[-1].content}", sync response: "{output.content}"') + prompt_tokens = len( + chatglm_pipeline.tokenizer.encode_messages( + messages, max_context_length) + ) + completion_tokens = len( + chatglm_pipeline.tokenizer.encode(output.content, max_tokens)) + + finish_reason = "stop" + tool_calls = None + if output.tool_calls: + tool_calls = [ + ChatCompletionMessageToolCall( + id="tool_call_" + uuid.uuid4().hex, + type=tool_call.type, + function=ChatCompletionMessageToolCallFunction( + name=tool_call.function.name, + arguments=to_json_arguments(tool_call.function.arguments), + ), + ) + for tool_call in output.tool_calls + ] + finish_reason = "function_call" + + if tool_calls is None: + choices = [ + ChatCompletionResponseChoice( + index=0, + message=ChatCompletionResponseMessage( + role="assistant", content=output.content + ), + finish_reason=finish_reason, + logprobs=None, + ) + ] + else: + choices = [ + ChatCompletionResponseChoice( + index=0, 
+ message=ChatCompletionRequestAssistantMessage( + role="assistant", content=output.content, tool_calls=tool_calls + ), + finish_reason=finish_reason, + logprobs=None, + ) + ] + + response = CreateChatCompletionResponse( + id="chatcmpl", + object="chat.completion", + created=int(time.time()), + model="chatglm", + choices=choices, + usage=CompletionUsage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ), + ) + print(response) + return response diff --git a/cli.py b/cli.py new file mode 100644 index 0000000..3dd0076 --- /dev/null +++ b/cli.py @@ -0,0 +1,97 @@ +from __future__ import annotations + +import argparse + +from typing import List, Literal, Union, Any, Type, TypeVar + +from pydantic import BaseModel + + +def _get_base_type(annotation: Type[Any]) -> Type[Any]: + if getattr(annotation, "__origin__", None) is Literal: + assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore + return type(annotation.__args__[0]) # type: ignore + elif getattr(annotation, "__origin__", None) is Union: + assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore + non_optional_args: List[Type[Any]] = [ + arg for arg in annotation.__args__ if arg is not type(None) # type: ignore + ] + if non_optional_args: + return _get_base_type(non_optional_args[0]) + elif ( + getattr(annotation, "__origin__", None) is list + or getattr(annotation, "__origin__", None) is List + ): + assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore + return _get_base_type(annotation.__args__[0]) # type: ignore + return annotation + + +def _contains_list_type(annotation: Type[Any] | None) -> bool: + origin = getattr(annotation, "__origin__", None) + + if origin is list or origin is List: + return True + elif origin in (Literal, Union): + return any(_contains_list_type(arg) for arg in annotation.__args__) # type: ignore + else: + return False + + +def _parse_bool_arg(arg: str | bytes | bool) -> bool: + if isinstance(arg, bytes): + arg = arg.decode("utf-8") + + true_values = {"1", "on", "t", "true", "y", "yes"} + false_values = {"0", "off", "f", "false", "n", "no"} + + arg_str = str(arg).lower().strip() + + if arg_str in true_values: + return True + elif arg_str in false_values: + return False + else: + raise ValueError(f"Invalid boolean argument: {arg}") + + +def add_args_from_model(parser: argparse.ArgumentParser, model: Type[BaseModel]): + """Add arguments from a pydantic model to an argparse parser.""" + + for name, field in model.model_fields.items(): + description = field.description + if field.default and description and not field.is_required(): + description += f" (default: {field.default})" + base_type = ( + _get_base_type(field.annotation) if field.annotation is not None else str + ) + list_type = _contains_list_type(field.annotation) + if base_type is not bool: + parser.add_argument( + f"--{name}", + dest=name, + nargs="*" if list_type else None, + type=base_type, + help=description, + ) + if base_type is bool: + parser.add_argument( + f"--{name}", + dest=name, + type=_parse_bool_arg, + help=f"{description}", + ) + + +T = TypeVar("T", bound=Type[BaseModel]) + + +def parse_model_from_args(model: T, args: argparse.Namespace) -> T: + """Parse a pydantic model from an argparse namespace.""" + return model( + **{ + k: v + for k, v in vars(args).items() + if v is not None and k in model.model_fields + } + ) diff --git a/errors.py b/errors.py new file mode 100644 
index 0000000..fbf9fd8 --- /dev/null +++ b/errors.py @@ -0,0 +1,210 @@ +from __future__ import annotations + +import sys +import traceback +import time +from re import compile, Match, Pattern +from typing import Callable, Coroutine, Optional, Tuple, Union, Dict +from typing_extensions import TypedDict + + +from fastapi import ( + Request, + Response, + HTTPException, +) +from fastapi.responses import JSONResponse +from fastapi.routing import APIRoute + +from llama_cpp.server.types import ( + CreateCompletionRequest, + CreateEmbeddingRequest, + CreateChatCompletionRequest, +) + + +class ErrorResponse(TypedDict): + """OpenAI style error response""" + + message: str + type: str + param: Optional[str] + code: Optional[str] + + +class ErrorResponseFormatters: + """Collection of formatters for error responses. + + Args: + request (Union[CreateCompletionRequest, CreateChatCompletionRequest]): + Request body + match (Match[str]): Match object from regex pattern + + Returns: + Tuple[int, ErrorResponse]: Status code and error response + """ + + @staticmethod + def context_length_exceeded( + request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], + match, # type: Match[str] # type: ignore + ) -> Tuple[int, ErrorResponse]: + """Formatter for context length exceeded error""" + + context_window = int(match.group(2)) + prompt_tokens = int(match.group(1)) + completion_tokens = request.max_tokens + if hasattr(request, "messages"): + # Chat completion + message = ( + "This model's maximum context length is {} tokens. " + "However, you requested {} tokens " + "({} in the messages, {} in the completion). " + "Please reduce the length of the messages or completion." + ) + else: + # Text completion + message = ( + "This model's maximum context length is {} tokens, " + "however you requested {} tokens " + "({} in your prompt; {} for the completion). " + "Please reduce your prompt; or completion length." 
+ ) + return 400, ErrorResponse( + message=message.format( + context_window, + (completion_tokens or 0) + prompt_tokens, + prompt_tokens, + completion_tokens, + ), # type: ignore + type="invalid_request_error", + param="messages", + code="context_length_exceeded", + ) + + @staticmethod + def model_not_found( + request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], + match, # type: Match[str] # type: ignore + ) -> Tuple[int, ErrorResponse]: + """Formatter for model_not_found error""" + + model_path = str(match.group(1)) + message = f"The model `{model_path}` does not exist" + return 400, ErrorResponse( + message=message, + type="invalid_request_error", + param=None, + code="model_not_found", + ) + + +class RouteErrorHandler(APIRoute): + """Custom APIRoute that handles application errors and exceptions""" + + # key: regex pattern for original error message from llama_cpp + # value: formatter function + pattern_and_formatters: Dict[ + "Pattern[str]", + Callable[ + [ + Union["CreateCompletionRequest", "CreateChatCompletionRequest"], + "Match[str]", + ], + Tuple[int, ErrorResponse], + ], + ] = { + compile( + r"Requested tokens \((\d+)\) exceed context window of (\d+)" + ): ErrorResponseFormatters.context_length_exceeded, + compile( + r"Model path does not exist: (.+)" + ): ErrorResponseFormatters.model_not_found, + } + + def error_message_wrapper( + self, + error: Exception, + body: Optional[ + Union[ + "CreateChatCompletionRequest", + "CreateCompletionRequest", + "CreateEmbeddingRequest", + ] + ] = None, + ) -> Tuple[int, ErrorResponse]: + """Wraps error message in OpenAI style error response""" + print(f"Exception: {str(error)}", file=sys.stderr) + traceback.print_exc(file=sys.stderr) + if body is not None and isinstance( + body, + ( + CreateCompletionRequest, + CreateChatCompletionRequest, + ), + ): + # When text completion or chat completion + for pattern, callback in self.pattern_and_formatters.items(): + match = pattern.search(str(error)) + if match is not None: + return callback(body, match) + + # Wrap other errors as internal server error + return 500, ErrorResponse( + message=str(error), + type="internal_server_error", + param=None, + code=None, + ) + + def get_route_handler( + self, + ) -> Callable[[Request], Coroutine[None, None, Response]]: + """Defines custom route handler that catches exceptions and formats + in OpenAI style error response""" + + original_route_handler = super().get_route_handler() + + async def custom_route_handler(request: Request) -> Response: + try: + start_sec = time.perf_counter() + response = await original_route_handler(request) + elapsed_time_ms = int((time.perf_counter() - start_sec) * 1000) + response.headers["openai-processing-ms"] = f"{elapsed_time_ms}" + return response + except HTTPException as unauthorized: + # api key check failed + raise unauthorized + except Exception as exc: + json_body = await request.json() + try: + if "messages" in json_body: + # Chat completion + body: Optional[ + Union[ + CreateChatCompletionRequest, + CreateCompletionRequest, + CreateEmbeddingRequest, + ] + ] = CreateChatCompletionRequest(**json_body) + elif "prompt" in json_body: + # Text completion + body = CreateCompletionRequest(**json_body) + else: + # Embedding + body = CreateEmbeddingRequest(**json_body) + except Exception: + # Invalid request body + body = None + + # Get proper error message from the exception + ( + status_code, + error_message, + ) = self.error_message_wrapper(error=exc, body=body) + return JSONResponse( + {"error": 
error_message}, + status_code=status_code, + ) + + return custom_route_handler diff --git a/extends.py b/extends.py new file mode 100644 index 0000000..0986e8f --- /dev/null +++ b/extends.py @@ -0,0 +1,400 @@ +from llama_cpp.server.types import ( + CreateCompletionRequest, + CreateEmbeddingRequest, + CreateChatCompletionRequest, + ChatCompletionRequestMessage, +) +from llama_cpp.llama_types import ( + ChatCompletionResponseChoice, + ChatCompletionMessageToolCall, + ChatCompletionStreamResponseChoice, + CreateChatCompletionStreamResponse, + CompletionUsage, + ChatCompletionStreamResponseDelta, + CreateChatCompletionResponse, + ChatCompletionResponseMessage, + ChatCompletionRequestAssistantMessage, + ChatCompletionMessageToolCallFunction, + ChatCompletionStreamResponseDeltaEmpty, + ChatCompletionMessageToolCallChunk, + ChatCompletionMessageToolCallChunkFunction, +) +import json +import uuid +import ast +import llama_cpp +import time + + +def process_ast_node(node): + # Check if the node is a function call + if isinstance(node, ast.Call): + # Return a string representation of the function call + return ast.unparse(node) + else: + # Convert the node to source code and evaluate to get the value + node_str = ast.unparse(node) + return eval(node_str) + + +def parse_python_function_call(call_str): + tree = ast.parse(call_str) + expr = tree.body[0] + + call_node = expr.value + function_name = ( + call_node.func.id + if isinstance(call_node.func, ast.Name) + else str(call_node.func) + ) + + parameters = {} + noNameParam = [] + + # Process positional arguments + for arg in call_node.args: + noNameParam.append(process_ast_node(arg)) + + # Process keyword arguments + for kw in call_node.keywords: + parameters[kw.arg] = process_ast_node(kw.value) + + if noNameParam: + parameters["None"] = noNameParam + + function_dict = {"name": function_name, "arguments": parameters} + return function_dict + + +FN_CALL_DELIMITER = "<>" + + +def strip_function_calls(content: str) -> list[str]: + """ + Split the content by the function call delimiter and remove empty strings + """ + return [ + element.strip() + for element in content.split(FN_CALL_DELIMITER)[1:] + if element.strip() + ] + + +def parse_function_call(call: str) -> dict[str, any]: + """ + This is temporary. The long term solution is to union all the + types of the parameters from the user's input function definition, + and check which language is a proper super set of the union type. + """ + try: + return parse_python_function_call(call) + except Exception as e: + # If Python parsing fails, try Java parsing + + return None + + +def get_openfunctions_prompt(messages: list = [], functions: list = []) -> str: + """ + Generates a conversation prompt based on the user's query and a list of functions. + + Parameters: + - user_query (str): The user's query. + - functions (list): A list of functions to include in the prompt. + + Returns: + - str: The formatted conversation prompt. + """ + system = "You are an AI programming assistant" # , utilizing the Gorilla LLM model, developed by Gorilla LLM, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer." 
+ if len(messages) > 0: + if messages[0]["role"] == "system": + system = messages[0]["content"] + messages = messages[1:] + + user_query = "" + for message in messages: + if message["role"] == "user": + user_query += "### Instruction: <> " + message["content"] + "\n" + elif message["role"] == "system": + user_query += message["content"] + else: + user_query += "### Response:\n" + message["content"] + "\n<|EOT|>\n" + + if len(functions) == 0: + return f"{system}\n### Instruction: <> {user_query}\n### Response: " + functions_string = json.dumps(functions) + result = f"<|begin▁of▁sentence|>{system}\n### Instruction: <>{functions_string}\n{user_query} ### Response: " + + print(result) + return result + + +def format_response(response): + result = [] + choices = response["choices"] + for choice in choices: + text = choice["text"] + calls = strip_function_calls(text) + tool_calls = [] + for function_call in calls: + fc = parse_function_call(function_call) + if fc is not None: + tool_calls.append( + { + "id": "call_" + uuid.uuid4().hex, + "function": { + "name": fc["name"], + "arguments": json.dumps(fc["arguments"]), + }, + "type": "function", + } + ) + result.append( + { + "finish_reason": choice["finish_reason"], + "index": choice["index"], + "logprobs": choice["logprobs"], + "message": { + "content": text, + "role": "assistant", + "tool_calls": tool_calls, + }, + } + ) + return result + + +def handle_openfunction(body: CreateCompletionRequest, llama) -> any: + tools = body.tools + if tools is None: + tools = [] + user_prompt = get_openfunctions_prompt(body.messages, tools) + response = llama( + user_prompt, + stop=["<|EOT|>"], + temperature=body.temperature, + top_p=body.top_p, + max_tokens=body.max_tokens, + ) + tool_calls = format_response(response) + return { + "id": response["id"], + "object": "chat.completion", + "created": response["created"], + "model": body.model, + "choices": tool_calls, + "usage": { + "prompt_tokens": response["usage"]["prompt_tokens"], + "completion_tokens": response["usage"]["completion_tokens"], + "total_tokens": response["usage"]["total_tokens"], + }, + } + + +def openfunction_stream_chat(body: CreateCompletionRequest, llama) -> any: + tools = body.tools + if tools is None: + tools = [] + user_prompt = get_openfunctions_prompt(body.messages, tools) + if tools is None: + for chunk in llama( + user_prompt, + stop=["<|EOT|>"], + temperature=body.temperature, + top_p=body.top_p, + max_tokens=body.max_tokens, + stream=True, + ): + choices = [ + ChatCompletionStreamResponseChoice( + index=1, + delta=ChatCompletionStreamResponseDelta( + content=choice["text"], + ), + finish_reason=choice["finish_reason"], + logprobs=choice["logprobs"], + ) + for choice in chunk["choices"] + ] + chatCompletionChunk = llama_cpp.ChatCompletionChunk( + id="chatcmpl-" + uuid.uuid4().hex, + model=body.model, + object="chat.completion.chunk", + created=int(time.time()), + choices=choices, + ) + yield chatCompletionChunk + else: + response = llama( + user_prompt, + stop=["<|EOT|>"], + temperature=body.temperature, + top_p=body.top_p, + max_tokens=body.max_tokens, + stream=False, + ) + tool_calls = format_response(response) + choices = response["choices"] + stream_response_choices = [] + choices_index = 0 + for choice in choices: + text = choice["text"] + calls = strip_function_calls(text) + tool_calls = [] + index = 0 + for function_call in calls: + fc = parse_function_call(function_call) + if fc is not None: + tool_calls.append( + ChatCompletionMessageToolCallChunk( + index=index, + id="call_" 
+ uuid.uuid4().hex, + type="function", + function=ChatCompletionMessageToolCallChunkFunction( + name=fc["name"], + arguments=json.dumps(fc["arguments"]), + ), + ) + ) + index += 1 + stream_response_choices.append( + ChatCompletionStreamResponseChoice( + index=choices_index, + delta=ChatCompletionStreamResponseDelta( + content=json.dumps(calls), + finish_reason=choice["finish_reason"], + logprobs=choice["logprobs"], + tool_calls=tool_calls, + ), + ) + ) + choices_index += 1 + + chatCompletionChunk = llama_cpp.ChatCompletionChunk( + id="chatcmpl-" + uuid.uuid4().hex, + model=body.model, + object="chat.completion.chunk", + created=int(time.time()), + choices=stream_response_choices, + ) + yield chatCompletionChunk + + +def functionary_stream_chat(body: CreateCompletionRequest, llama) -> any: + response = llama.create_chat_completion( + messages=body.messages, tools=body.tools, tool_choice="auto", stream=False + ) + stream_response_choices = [] + choices = response["choices"] + choices_index = 0 + for choice in choices: + tool_calls = [] + calls = choice["message"]["tool_calls"] + index = 0 + for call in calls: + tool_calls.append( + ChatCompletionMessageToolCallChunk( + index=index, + id=call["id"], + type=call["type"], + function=ChatCompletionMessageToolCallChunkFunction( + name=call["function"]["name"], + arguments=json.dumps(call["function"]["arguments"]), + ), + ) + ) + index += 1 + stream_response_choices.append( + ChatCompletionStreamResponseChoice( + index=choices_index, + delta=ChatCompletionStreamResponseDelta( + content=None, + finish_reason=choice["finish_reason"], + logprobs=choice["logprobs"], + tool_calls=tool_calls, + ), + ) + ) + choices_index += 1 + + chatCompletionChunk = llama_cpp.ChatCompletionChunk( + id=response["id"], + model=body.model, + object="chat.completion.chunk", + created=response["created"], + choices=stream_response_choices, + ) + print(chatCompletionChunk) + yield chatCompletionChunk + + +def handle_firefunction(body: CreateChatCompletionRequest, llama) -> any: + messages = [] + function_spec = json.dumps(body.tools) + messages.append({"role": "functions", "content": function_spec}) + for message in body.messages: + messages.append({"role": message["role"], "content": message["content"]}) + response = llama.create_chat_completion( + messages=messages, + tools=function_spec, + tool_choice="auto", + temperature=body.temperature, + top_p=body.top_p, + logprobs=body.logprobs, + max_tokens=body.max_tokens, + ) + choices = [] + for choice in response["choices"]: + message_content = choice["message"]["content"] + if "" in message_content: + function_call_json = message_content[len("") :] + function_call_data = json.loads(function_call_json) + choices.append( + { + "index": choice["index"], + "logprobs": choice["logprobs"], + "finish_reason": choice["finish_reason"], + "message": { + "content": message_content, + "role": choice["message"]["role"], + "tool_calls": [ + { + "id": "tool_call_" + uuid.uuid4().hex, + "type": "function", + "function": { + "name": function_call_data["name"], + "arguments": json.dumps( + function_call_data["arguments"] + ), + }, + } + ], + }, + } + ) + else: + choices.append( + { + "index": choice["index"], + "logprobs": choice["logprobs"], + "finish_reason": choice["finish_reason"], + "message": { + "content": message_content, + "role": choice["message"]["role"], + }, + } + ) + + result = { + "id": response["id"], + "object": response["object"], + "created": response["created"], + "model": body.model, + "choices": choices, + "usage": { + 
"prompt_tokens": response["usage"]["prompt_tokens"], + "completion_tokens": response["usage"]["completion_tokens"], + "total_tokens": response["usage"]["total_tokens"], + }, + } + return result diff --git a/model.py b/model.py new file mode 100644 index 0000000..c256173 --- /dev/null +++ b/model.py @@ -0,0 +1,251 @@ +from __future__ import annotations + +import json + +from typing import Dict, Optional, Union, List + +import llama_cpp +import llama_cpp.llama_speculative as llama_speculative +import llama_cpp.llama_tokenizer as llama_tokenizer +from llama_cpp.server.settings import ModelSettings +import chatglm_cpp +import chatglm +import gc +from llama_cpp import Llama + + +class LlamaProxy: + def __init__(self, models: List[ModelSettings]) -> None: + assert len(models) > 0, "No models provided!" + + self._model_settings_dict: dict[str, ModelSettings] = {} + for model in models: + if not model.model_alias: + model.model_alias = model.model + self._model_settings_dict[model.model_alias] = model + + self._current_model: Optional[llama_cpp.Llama] = None + self._current_model_alias: Optional[str] = None + + self._default_model_settings: ModelSettings = models[0] + self._default_model_alias: str = self._default_model_settings.model_alias # type: ignore + + # Load default model + self._current_model = self.load_llama_from_model_settings( + self._default_model_settings + ) + self._current_model_alias = self._default_model_alias + + def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: + if model is None: + model = self._default_model_alias + + if model not in self._model_settings_dict: + model = self._default_model_alias + + if model == self._current_model_alias: + if self._current_model is not None: + return self._current_model + + self._current_model = None + + settings = self._model_settings_dict[model] + self._current_model = self.load_llama_from_model_settings(settings) + self._current_model_alias = model + return self._current_model + + def __getitem__(self, model: str): + return self._model_settings_dict[model].model_dump() + + def __setitem__(self, model: str, settings: Union[ModelSettings, str, bytes]): + if isinstance(settings, (bytes, str)): + settings = ModelSettings.model_validate_json(settings) + self._model_settings_dict[model] = settings + + def __iter__(self): + for model in self._model_settings_dict: + yield model + + def free(self): + if self._current_model: + del self._current_model + gc.collect() + + @staticmethod + def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: + chat_handler = None + if settings.chat_format == "llava-1-5": + assert settings.clip_model_path is not None, "clip model not found" + chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler( + clip_model_path=settings.clip_model_path, verbose=settings.verbose + ) + elif settings.chat_format == "hf-autotokenizer": + assert ( + settings.hf_pretrained_model_name_or_path is not None + ), "hf_pretrained_model_name_or_path must be set for hf-autotokenizer" + chat_handler = ( + llama_cpp.llama_chat_format.hf_autotokenizer_to_chat_completion_handler( + settings.hf_pretrained_model_name_or_path + ) + ) + elif settings.chat_format == "hf-tokenizer-config": + assert ( + settings.hf_tokenizer_config_path is not None + ), "hf_tokenizer_config_path must be set for hf-tokenizer-config" + chat_handler = llama_cpp.llama_chat_format.hf_tokenizer_config_to_chat_completion_handler( + json.load(open(settings.hf_tokenizer_config_path)) + ) + + tokenizer: 
Optional[llama_cpp.BaseLlamaTokenizer] = None + if settings.hf_pretrained_model_name_or_path is not None: + tokenizer = llama_tokenizer.LlamaHFTokenizer.from_pretrained( + settings.hf_pretrained_model_name_or_path + ) + + draft_model = None + if settings.draft_model is not None: + draft_model = llama_speculative.LlamaPromptLookupDecoding( + num_pred_tokens=settings.draft_model_num_pred_tokens + ) + + kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None + if settings.kv_overrides is not None: + assert isinstance(settings.kv_overrides, list) + kv_overrides = {} + for kv in settings.kv_overrides: + key, value = kv.split("=") + if ":" in value: + value_type, value = value.split(":") + if value_type == "bool": + kv_overrides[key] = value.lower() in ["true", "1"] + elif value_type == "int": + kv_overrides[key] = int(value) + elif value_type == "float": + kv_overrides[key] = float(value) + else: + raise ValueError(f"Unknown value type {value_type}") + + import functools + + kwargs = {} + + if settings.hf_model_repo_id is not None: + create_fn = functools.partial( + llama_cpp.Llama.from_pretrained, + repo_id=settings.hf_model_repo_id, + filename=settings.model, + ) + elif settings.chat_format == "chatglm": + create_fn = chatglm_cpp.Pipeline + kwargs["model_path"] = settings.model + else: + create_fn = llama_cpp.Llama + kwargs["model_path"] = settings.model + + if settings.chat_format == "chatglm3": + _model = chatglm_cpp.Pipeline(settings.model) + _model.create_chat_completion = chatglm.create_chat_completion + + elif settings.chat_format == "firefunction" : + _model = Llama.from_pretrained( + # repo_id="neopolita/firefunction-v1-gguf", + # filename="firefunction-v1_q2_k.gguf", + repo_id=settings.hf_model_repo_id, + filename=settings.model, + tokenizer=tokenizer, + n_gpu_layers=settings.n_gpu_layers, + main_gpu=settings.main_gpu, + tensor_split=settings.tensor_split, + vocab_only=settings.vocab_only, + use_mmap=settings.use_mmap, + use_mlock=settings.use_mlock, + kv_overrides=kv_overrides, + # Context Params + seed=settings.seed, + n_ctx=settings.n_ctx, + n_batch=settings.n_batch, + n_threads=settings.n_threads, + n_threads_batch=settings.n_threads_batch, + ) + elif settings.chat_format == "openfunctions": + _model = Llama( + model_path=settings.model, + tokenizer=tokenizer, + n_gpu_layers=settings.n_gpu_layers, + main_gpu=settings.main_gpu, + tensor_split=settings.tensor_split, + vocab_only=settings.vocab_only, + use_mmap=settings.use_mmap, + use_mlock=settings.use_mlock, + kv_overrides=kv_overrides, + # Context Params + seed=settings.seed, + n_ctx=settings.n_ctx, + n_batch=settings.n_batch, + n_threads=settings.n_threads, + n_threads_batch=settings.n_threads_batch, + ) + else: + _model = create_fn( + **kwargs, + # Model Params + n_gpu_layers=settings.n_gpu_layers, + main_gpu=settings.main_gpu, + tensor_split=settings.tensor_split, + vocab_only=settings.vocab_only, + use_mmap=settings.use_mmap, + use_mlock=settings.use_mlock, + kv_overrides=kv_overrides, + # Context Params + seed=settings.seed, + n_ctx=settings.n_ctx, + n_batch=settings.n_batch, + n_threads=settings.n_threads, + n_threads_batch=settings.n_threads_batch, + rope_scaling_type=settings.rope_scaling_type, + rope_freq_base=settings.rope_freq_base, + rope_freq_scale=settings.rope_freq_scale, + yarn_ext_factor=settings.yarn_ext_factor, + yarn_attn_factor=settings.yarn_attn_factor, + yarn_beta_fast=settings.yarn_beta_fast, + yarn_beta_slow=settings.yarn_beta_slow, + yarn_orig_ctx=settings.yarn_orig_ctx, + 
mul_mat_q=settings.mul_mat_q, + logits_all=settings.logits_all, + embedding=settings.embedding, + offload_kqv=settings.offload_kqv, + # Sampling Params + last_n_tokens_size=settings.last_n_tokens_size, + # LoRA Params + lora_base=settings.lora_base, + lora_path=settings.lora_path, + # Backend Params + numa=settings.numa, + # Chat Format Params + chat_format=settings.chat_format, + chat_handler=chat_handler, + # Speculative Decoding + draft_model=draft_model, + # KV Cache Quantization + type_k=settings.type_k, + type_v=settings.type_v, + # Tokenizer + tokenizer=tokenizer, + # Misc + verbose=settings.verbose, + ) + if settings.cache: + if settings.cache_type == "disk": + if settings.verbose: + print( + f"Using disk cache with size {settings.cache_size}") + cache = llama_cpp.LlamaDiskCache( + capacity_bytes=settings.cache_size) + else: + if settings.verbose: + print( + f"Using ram cache with size {settings.cache_size}") + cache = llama_cpp.LlamaRAMCache( + capacity_bytes=settings.cache_size) + _model.set_cache(cache) + return _model diff --git a/server.cfg b/server.cfg new file mode 100644 index 0000000..5cebfa5 --- /dev/null +++ b/server.cfg @@ -0,0 +1,138 @@ +{ + "host": "0.0.0.0", + "port": 8000, + "models": [ + { + "model": "/home/test/llm-models/chatglm3-ggml-q8.bin", + "model_alias": "chatglm3", + "chat_format": "chatglm3", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_batch": 512 + }, + { + "model": "/home/test/llm-models/gorilla-openfunctions-v2-q4_K_M.gguf", + "hf_pretrained_model_name_or_path":"gorilla-llm/gorilla-openfunctions-v2", + "model_alias": "openfunctions", + "chat_format": "openfunctions", + "n_gpu_layers": 0, + "n_ctx":4096, + "offload_kqv": true, + "n_threads": 12, + "n_batch": 512 + }, + { + "model": "/home/test/llm-models/functionary-small-v2.4.Q4_0.gguf", + "model_alias": "functionary", + "chat_format": "functionary-v2", + "hf_pretrained_model_name_or_path":"meetkai/functionary-small-v2.4", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_batch": 512, + "n_ctx": 8192, + "use_mmap":true + }, + { + "model": "/home/test/llm-models/bge-large-zh-v1.5-q4_k_m.gguf", + "model_alias": "bge-large-zh-v1.5", + "chat_format": "bert", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_batch": 512 + }, + { + "model": "/home/test/llm-models/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf", + "model_alias": "llama-3-8b", + "chat_format": "llama-3", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_batch": 512, + "n_ctx": 8192, + "embedding": true + }, + { + "model": "/home/test/llm-models/gemma-7b-it.Q4_K_M.gguf", + "model_alias": "gemma-7b", + "chat_format": "gemma", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_ctx": 8192, + "n_batch": 512 + }, + { + "model": "/home/test/llm-models/ggml-model-q4_k.gguf", + "model_alias": "llava", + "chat_format": "llava-1-5", + "clip_model_path": "/home/test/llm-models/mmproj-model-f16.gguf", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_ctx": 4096, + "n_batch": 512 + }, + { + "model": "/home/test/llm-models/mistral-7b-instruct-v0.2.Q4_K_M.gguf", + "model_alias": "mistral-7b", + "chat_format": "mistral-instruct", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_ctx": 8192, + "n_batch": 512 + }, + { + "model": "/home/test/llm-models/mixtral-8x7b-instruct-v0.1.Q3_K_M.gguf", + "model_alias": "mixtral-8x7b-instruct", + "chat_format": "mistral-instruct", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_ctx": 
8192, + "n_batch": 512 + }, + { + "model": "/home/test/llm-models/sqlcoder-7b-2.Q4_K_M.gguf", + "model_alias": "sqlcoder", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_ctx": 16384, + "n_batch": 1024 + }, + { + "model": "/home/test/llm-models/qwen1_5-14b-chat-q4_k_m.gguf", + "model_alias": "qwen", + "chat_format":"qwen", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_batch": 1024 + }, + { + "model": "/home/test/llm-models/Baichuan2-13B-Chat-Q4_K_M.gguf", + "model_alias": "baichuan-2", + "chat_format":"baichuan-2", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_batch": 1024 + }, + { + "model": "firefunction-v1_q2_k.gguf", + "hf_model_repo_id":"neopolita/firefunction-v1-gguf", + "model_alias": "firefunction", + "chat_format": "firefunction", + "n_gpu_layers": 0, + "n_ctx":4096, + "offload_kqv": true, + "n_threads": 12, + "n_batch": 512 + } + + ] +} diff --git a/settings.py b/settings.py new file mode 100644 index 0000000..934aecd --- /dev/null +++ b/settings.py @@ -0,0 +1,211 @@ +from __future__ import annotations + +import multiprocessing + +from typing import Optional, List, Literal, Union +from pydantic import Field +from pydantic_settings import BaseSettings + +import llama_cpp + +# Disable warning for model and model_alias settings +BaseSettings.model_config["protected_namespaces"] = () + + +class ModelSettings(BaseSettings): + """Model settings used to load a Llama model.""" + + model: str = Field( + description="The path to the model to use for generating completions." + ) + model_alias: Optional[str] = Field( + default=None, + description="The alias of the model to use for generating completions.", + ) + # Model Params + n_gpu_layers: int = Field( + default=0, + ge=-1, + description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.", + ) + split_mode: int = Field( + default=llama_cpp.LLAMA_SPLIT_MODE_LAYER, + description="The split mode to use.", + ) + main_gpu: int = Field( + default=0, + ge=0, + description="Main GPU to use.", + ) + tensor_split: Optional[List[float]] = Field( + default=None, + description="Split layers across multiple GPUs in proportion.", + ) + vocab_only: bool = Field( + default=False, description="Whether to only return the vocabulary." + ) + use_mmap: bool = Field( + default=llama_cpp.llama_supports_mmap(), + description="Use mmap.", + ) + use_mlock: bool = Field( + default=llama_cpp.llama_supports_mlock(), + description="Use mlock.", + ) + kv_overrides: Optional[List[str]] = Field( + default=None, + description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.", + ) + # Context Params + seed: int = Field( + default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random." + ) + n_ctx: int = Field(default=2048, ge=0, description="The context size.") + n_batch: int = Field( + default=512, ge=1, description="The batch size to use per eval." 
+ ) + n_threads: int = Field( + default=max(multiprocessing.cpu_count() // 2, 1), + ge=1, + description="The number of threads to use.", + ) + n_threads_batch: int = Field( + default=max(multiprocessing.cpu_count(), 1), + ge=0, + description="The number of threads to use when batch processing.", + ) + rope_scaling_type: int = Field( + default=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED + ) + rope_freq_base: float = Field(default=0.0, description="RoPE base frequency") + rope_freq_scale: float = Field( + default=0.0, description="RoPE frequency scaling factor" + ) + yarn_ext_factor: float = Field(default=-1.0) + yarn_attn_factor: float = Field(default=1.0) + yarn_beta_fast: float = Field(default=32.0) + yarn_beta_slow: float = Field(default=1.0) + yarn_orig_ctx: int = Field(default=0) + mul_mat_q: bool = Field( + default=True, description="if true, use experimental mul_mat_q kernels" + ) + logits_all: bool = Field(default=True, description="Whether to return logits.") + embedding: bool = Field(default=True, description="Whether to use embeddings.") + offload_kqv: bool = Field( + default=True, description="Whether to offload kqv to the GPU." + ) + # Sampling Params + last_n_tokens_size: int = Field( + default=64, + ge=0, + description="Last n tokens to keep for repeat penalty calculation.", + ) + # LoRA Params + lora_base: Optional[str] = Field( + default=None, + description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.", + ) + lora_path: Optional[str] = Field( + default=None, + description="Path to a LoRA file to apply to the model.", + ) + # Backend Params + numa: Union[bool, int] = Field( + default=False, + description="Enable NUMA support.", + ) + # Chat Format Params + chat_format: Optional[str] = Field( + default=None, + description="Chat format to use.", + ) + clip_model_path: Optional[str] = Field( + default=None, + description="Path to a CLIP model to use for multi-modal chat completion.", + ) + # Cache Params + cache: bool = Field( + default=False, + description="Use a cache to reduce processing times for evaluated prompts.", + ) + cache_type: Literal["ram", "disk"] = Field( + default="ram", + description="The type of cache to use. Only used if cache is True.", + ) + cache_size: int = Field( + default=2 << 30, + description="The size of the cache in bytes. Only used if cache is True.", + ) + # Tokenizer Options + hf_tokenizer_config_path: Optional[str] = Field( + default=None, + description="The path to a HuggingFace tokenizer_config.json file.", + ) + hf_pretrained_model_name_or_path: Optional[str] = Field( + default=None, + description="The model name or path to a pretrained HuggingFace tokenizer model. Same as you would pass to AutoTokenizer.from_pretrained().", + ) + # Loading from HuggingFace Model Hub + hf_model_repo_id: Optional[str] = Field( + default=None, + description="The model repo id to use for the HuggingFace tokenizer model.", + ) + # Speculative Decoding + draft_model: Optional[str] = Field( + default=None, + description="Method to use for speculative decoding. 
One of (prompt-lookup-decoding).", + ) + draft_model_num_pred_tokens: int = Field( + default=10, + description="Number of tokens to predict using the draft model.", + ) + # KV Cache Quantization + type_k: Optional[int] = Field( + default=None, + description="Type of the key cache quantization.", + ) + type_v: Optional[int] = Field( + default=None, + description="Type of the value cache quantization.", + ) + # Misc + verbose: bool = Field( + default=True, description="Whether to print debug information." + ) + + +class ServerSettings(BaseSettings): + """Server settings used to configure the FastAPI and Uvicorn server.""" + + # Uvicorn Settings + host: str = Field(default="localhost", description="Listen address") + port: int = Field(default=8000, description="Listen port") + ssl_keyfile: Optional[str] = Field( + default=None, description="SSL key file for HTTPS" + ) + ssl_certfile: Optional[str] = Field( + default=None, description="SSL certificate file for HTTPS" + ) + # FastAPI Settings + api_key: Optional[str] = Field( + default=None, + description="API key for authentication. If set all requests need to be authenticated.", + ) + interrupt_requests: bool = Field( + default=True, + description="Whether to interrupt requests when a new request is received.", + ) + disable_ping_events: bool = Field( + default=False, + description="Disable EventSource pings (may be needed for some clients).", + ) + + +class Settings(ServerSettings, ModelSettings): + pass + + +class ConfigFileSettings(ServerSettings): + """Configuration file format settings.""" + + models: List[ModelSettings] = Field(default=[], description="Model configs")
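For completeness, a minimal sketch of wiring these settings together without a config file, mirroring the code path in `__main__.py`. The model path and alias are illustrative values borrowed from `server.cfg` and are not part of the patch; adjust them to your environment.

```python
# Programmatic startup, equivalent to running the module without --config_file.
import uvicorn

from app import create_app
from llama_cpp.server.settings import ModelSettings, ServerSettings

server_settings = ServerSettings(host="0.0.0.0", port=8000)

# Example model entry (illustrative path/alias taken from server.cfg above).
model_settings = [
    ModelSettings(
        model="/home/test/llm-models/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
        model_alias="llama-3-8b",
        chat_format="llama-3",
        n_ctx=8192,
        n_threads=12,
    )
]

app = create_app(
    server_settings=server_settings,
    model_settings=model_settings,
)

if __name__ == "__main__":
    uvicorn.run(app, host=server_settings.host, port=server_settings.port)
```

Once the server is up, the OpenAI-compatible endpoints (`/v1/chat/completions`, `/v1/completions`, `/v1/embeddings`, plus the `/extras/*` tokenize helpers) select a model by its `model_alias` from the request body, falling back to the first configured model when the alias is unknown.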