diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/__main__.py b/__main__.py new file mode 100644 index 0000000..2fbd575 --- /dev/null +++ b/__main__.py @@ -0,0 +1,98 @@ +"""Example FastAPI server for llama.cpp. + +To run this example: + +```bash +pip install fastapi uvicorn sse-starlette pydantic-settings +export MODEL=../models/7B/... +``` + +Then run: +``` +uvicorn llama_cpp.server.app:create_app --reload +``` + +or + +``` +python3 -m llama_cpp.server +``` + +Then visit http://localhost:8000/docs to see the interactive API docs. + +""" +from __future__ import annotations + +import os +import sys +import argparse + +import uvicorn + +from app import create_app +from llama_cpp.server.settings import ( + Settings, + ServerSettings, + ModelSettings, + ConfigFileSettings, +) +from llama_cpp.server.cli import add_args_from_model, parse_model_from_args + + +def main(): + description = "🦙 Llama.cpp python server. Host your own LLMs!🚀" + parser = argparse.ArgumentParser(description=description) + + add_args_from_model(parser, Settings) + parser.add_argument( + "--config_file", + type=str, + help="Path to a config file to load.", + default="/home/test/api_server.cfg", + ) + server_settings: ServerSettings | None = None + model_settings: list[ModelSettings] = [] + args = parser.parse_args() + try: + # Load server settings from config_file if provided + config_file = os.environ.get("CONFIG_FILE", args.config_file) + if config_file: + if not os.path.exists(config_file): + raise ValueError(f"Config file {config_file} not found!") + with open(config_file, "rb") as f: + # Check if yaml file + if config_file.endswith(".yaml") or config_file.endswith(".yml"): + import yaml + import json + + config_file_settings = ConfigFileSettings.model_validate_json( + json.dumps(yaml.safe_load(f)) + ) + else: + config_file_settings = ConfigFileSettings.model_validate_json(f.read()) + server_settings = ServerSettings.model_validate(config_file_settings) + model_settings = config_file_settings.models + else: + server_settings = parse_model_from_args(ServerSettings, args) + model_settings = [parse_model_from_args(ModelSettings, args)] + except Exception as e: + print(e, file=sys.stderr) + parser.print_help() + sys.exit(1) + assert server_settings is not None + assert model_settings is not None + app = create_app( + server_settings=server_settings, + model_settings=model_settings, + ) + uvicorn.run( + app, + host=os.getenv("HOST", server_settings.host), + port=int(os.getenv("PORT", server_settings.port)), + ssl_keyfile=server_settings.ssl_keyfile, + ssl_certfile=server_settings.ssl_certfile, + ) + + +if __name__ == "__main__": + main() diff --git a/app.py b/app.py new file mode 100644 index 0000000..ad11bd8 --- /dev/null +++ b/app.py @@ -0,0 +1,635 @@ +from __future__ import annotations + +import os +import json + +from threading import Lock +from functools import partial +from typing import Iterator, List, Optional, Union, Dict +import uuid +import llama_cpp +import chatglm +import extends +import anyio +from anyio.streams.memory import MemoryObjectSendStream +from starlette.concurrency import run_in_threadpool, iterate_in_threadpool +from fastapi import Depends, FastAPI, APIRouter, Request, HTTPException, status, Body +from fastapi.middleware import Middleware +from fastapi.middleware.cors import CORSMiddleware +from fastapi.security import HTTPBearer +from sse_starlette.sse import EventSourceResponse +from starlette_context.plugins import RequestIdPlugin # type: ignore +from 
starlette_context.middleware import RawContextMiddleware + +from model import ( + LlamaProxy, +) +from llama_cpp.server.settings import ( + ConfigFileSettings, + Settings, + ModelSettings, + ServerSettings, +) +from llama_cpp.server.types import ( + CreateCompletionRequest, + CreateEmbeddingRequest, + CreateChatCompletionRequest, + ModelList, + TokenizeInputRequest, + TokenizeInputResponse, + TokenizeInputCountResponse, + DetokenizeInputRequest, + DetokenizeInputResponse, +) + +from llama_cpp.llama_types import ( + ChatCompletionStreamResponseChoice, + ChatCompletionStreamResponseDelta, + ChatCompletionStreamResponseDeltaEmpty, +) + +from llama_cpp.server.errors import RouteErrorHandler + + +router = APIRouter(route_class=RouteErrorHandler) + +_server_settings: Optional[ServerSettings] = None + + +def set_server_settings(server_settings: ServerSettings): + global _server_settings + _server_settings = server_settings + + +def get_server_settings(): + yield _server_settings + + +_llama_proxy: Optional[LlamaProxy] = None + +llama_outer_lock = Lock() +llama_inner_lock = Lock() + + +def set_llama_proxy(model_settings: List[ModelSettings]): + global _llama_proxy + _llama_proxy = LlamaProxy(models=model_settings) + + +def get_llama_proxy(): + # NOTE: This double lock allows the currently streaming llama model to + # check if any other requests are pending in the same thread and cancel + # the stream if so. + llama_outer_lock.acquire() + release_outer_lock = True + try: + llama_inner_lock.acquire() + try: + llama_outer_lock.release() + release_outer_lock = False + yield _llama_proxy + finally: + llama_inner_lock.release() + finally: + if release_outer_lock: + llama_outer_lock.release() + + +_ping_message_factory = None + + +def set_ping_message_factory(factory): + global _ping_message_factory + _ping_message_factory = factory + + +def create_app( + settings: Settings | None = None, + server_settings: ServerSettings | None = None, + model_settings: List[ModelSettings] | None = None, +): + config_file = os.environ.get("CONFIG_FILE", None) + if config_file is not None: + if not os.path.exists(config_file): + raise ValueError(f"Config file {config_file} not found!") + with open(config_file, "rb") as f: + # Check if yaml file + if config_file.endswith(".yaml") or config_file.endswith(".yml"): + import yaml + + config_file_settings = ConfigFileSettings.model_validate_json( + json.dumps(yaml.safe_load(f)) + ) + else: + config_file_settings = ConfigFileSettings.model_validate_json(f.read()) + server_settings = ServerSettings.model_validate(config_file_settings) + model_settings = config_file_settings.models + + if server_settings is None and model_settings is None: + if settings is None: + settings = Settings() + server_settings = ServerSettings.model_validate(settings) + model_settings = [ModelSettings.model_validate(settings)] + + assert ( + server_settings is not None and model_settings is not None + ), "server_settings and model_settings must be provided together" + + set_server_settings(server_settings) + middleware = [Middleware(RawContextMiddleware, plugins=(RequestIdPlugin(),))] + app = FastAPI( + middleware=middleware, + title="🦙 llama.cpp Python API", + version=llama_cpp.__version__, + ) + app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + app.include_router(router) + + assert model_settings is not None + set_llama_proxy(model_settings=model_settings) + + if server_settings.disable_ping_events: + 
set_ping_message_factory(lambda: bytes()) + + return app + + +async def get_event_publisher( + request: Request, + inner_send_chan: MemoryObjectSendStream, + iterator: Iterator, +): + async with inner_send_chan: + try: + async for chunk in iterate_in_threadpool(iterator): + await inner_send_chan.send(dict(data=json.dumps(chunk))) + if await request.is_disconnected(): + raise anyio.get_cancelled_exc_class()() + if ( + next(get_server_settings()).interrupt_requests + and llama_outer_lock.locked() + ): + await inner_send_chan.send(dict(data="[DONE]")) + raise anyio.get_cancelled_exc_class()() + await inner_send_chan.send(dict(data="[DONE]")) + except anyio.get_cancelled_exc_class() as e: + print("disconnected") + with anyio.move_on_after(1, shield=True): + print(f"Disconnected from client (via refresh/close) {request.client}") + raise e + + +def _logit_bias_tokens_to_input_ids( + llama: llama_cpp.Llama, + logit_bias: Dict[str, float], +) -> Dict[str, float]: + to_bias: Dict[str, float] = {} + for token, score in logit_bias.items(): + token = token.encode("utf-8") + for input_id in llama.tokenize(token, add_bos=False, special=True): + to_bias[str(input_id)] = score + return to_bias + + +# Setup Bearer authentication scheme +bearer_scheme = HTTPBearer(auto_error=False) + + +async def authenticate( + settings: Settings = Depends(get_server_settings), + authorization: Optional[str] = Depends(bearer_scheme), +): + # Skip API key check if it's not set in settings + if settings.api_key is None: + return True + + # check bearer credentials against the api_key + if authorization and authorization.credentials == settings.api_key: + # api key is valid + return authorization.credentials + + # raise http error 401 + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid API key", + ) + + +openai_v1_tag = "OpenAI V1" + + +@router.post( + "/v1/completions", + summary="Completion", + dependencies=[Depends(authenticate)], + response_model=Union[ + llama_cpp.CreateCompletionResponse, + str, + ], + responses={ + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "anyOf": [ + {"$ref": "#/components/schemas/CreateCompletionResponse"} + ], + "title": "Completion response, when stream=False", + } + }, + "text/event-stream": { + "schema": { + "type": "string", + "title": "Server Side Streaming response, when stream=True. " + + "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format", # noqa: E501 + "example": """data: {... see CreateCompletionResponse ...} \\n\\n data: ... \\n\\n ... 
data: [DONE]""", + } + }, + }, + } + }, + tags=[openai_v1_tag], +) +@router.post( + "/v1/engines/copilot-codex/completions", + include_in_schema=False, + dependencies=[Depends(authenticate)], + tags=[openai_v1_tag], +) +async def create_completion( + request: Request, + body: CreateCompletionRequest, + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +) -> llama_cpp.Completion: + if isinstance(body.prompt, list): + assert len(body.prompt) <= 1 + body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" + + llama = llama_proxy( + body.model + if request.url.path != "/v1/engines/copilot-codex/completions" + else "copilot-codex" + ) + + exclude = { + "n", + "best_of", + "logit_bias_type", + "user", + } + kwargs = body.model_dump(exclude=exclude) + + if body.logit_bias is not None: + kwargs["logit_bias"] = ( + _logit_bias_tokens_to_input_ids(llama, body.logit_bias) + if body.logit_bias_type == "tokens" + else body.logit_bias + ) + + if body.grammar is not None: + kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) + + iterator_or_completion: Union[ + llama_cpp.CreateCompletionResponse, + Iterator[llama_cpp.CreateCompletionStreamResponse], + ] = await run_in_threadpool(llama, **kwargs) + + if isinstance(iterator_or_completion, Iterator): + # EAFP: It's easier to ask for forgiveness than permission + + first_response = await run_in_threadpool(next, iterator_or_completion) + + # If no exception was raised from first_response, we can assume that + # the iterator is valid and we can use it to stream the response. + def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: + yield first_response + yield from iterator_or_completion + + send_chan, recv_chan = anyio.create_memory_object_stream(10) + return EventSourceResponse( + recv_chan, + data_sender_callable=partial( # type: ignore + get_event_publisher, + request=request, + inner_send_chan=send_chan, + iterator=iterator(), + ), + sep="\n", + ping_message_factory=_ping_message_factory, + ) + else: + return iterator_or_completion + + +@router.post( + "/v1/embeddings", + summary="Embedding", + dependencies=[Depends(authenticate)], + tags=[openai_v1_tag], +) +async def create_embedding( + request: CreateEmbeddingRequest, + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +): + return await run_in_threadpool( + llama_proxy(request.model).create_embedding, + **request.model_dump(exclude={"user"}), + ) + + +@router.post( + "/v1/chat/completions", + summary="Chat", + dependencies=[Depends(authenticate)], + response_model=Union[llama_cpp.ChatCompletion, str], + responses={ + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "anyOf": [ + { + "$ref": "#/components/schemas/CreateChatCompletionResponse" + } + ], + "title": "Completion response, when stream=False", + } + }, + "text/event-stream": { + "schema": { + "type": "string", + "title": "Server Side Streaming response, when stream=True" + + "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format", # noqa: E501 + "example": """data: {... see CreateChatCompletionResponse ...} \\n\\n data: ... \\n\\n ... 
data: [DONE]""", + } + }, + }, + } + }, + tags=[openai_v1_tag], +) +async def create_chat_completion( + request: Request, + body: CreateChatCompletionRequest = Body( + openapi_examples={ + "normal": { + "summary": "Chat Completion", + "value": { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"}, + ], + }, + }, + "json_mode": { + "summary": "JSON Mode", + "value": { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Who won the world series in 2020"}, + ], + "response_format": {"type": "json_object"}, + }, + }, + "tool_calling": { + "summary": "Tool Calling", + "value": { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Extract Jason is 30 years old."}, + ], + "tools": [ + { + "type": "function", + "function": { + "name": "User", + "description": "User record", + "parameters": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "number"}, + }, + "required": ["name", "age"], + }, + }, + } + ], + "tool_choice": { + "type": "function", + "function": { + "name": "User", + }, + }, + }, + }, + "logprobs": { + "summary": "Logprobs", + "value": { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"}, + ], + "logprobs": True, + "top_logprobs": 10, + }, + }, + } + ), + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +) -> llama_cpp.ChatCompletion: + exclude = { + "n", + "logit_bias_type", + "user", + } + print(body) + kwargs = body.model_dump(exclude=exclude) + llama = llama_proxy(body.model) + if body.logit_bias is not None: + kwargs["logit_bias"] = ( + _logit_bias_tokens_to_input_ids(llama, body.logit_bias) + if body.logit_bias_type == "tokens" + else body.logit_bias + ) + + if body.grammar is not None: + kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) + + model_settings = llama_proxy._model_settings_dict[body.model] + model_chat_format = model_settings.chat_format + if model_chat_format == "chatglm3": + max_context_length = model_settings.n_ctx + num_threads = model_settings.n_threads + chatglm_pipeline = llama + if body.stream: + iterator = chatglm.stream_chat( + chatglm_pipeline, body, max_context_length, num_threads + ) + send_chan, recv_chan = anyio.create_memory_object_stream(10) + return EventSourceResponse( + recv_chan, + data_sender_callable=partial( # type: ignore + get_event_publisher, + request=request, + inner_send_chan=send_chan, + iterator=iterator, + ), + sep="\n", + ping_message_factory=_ping_message_factory, + ) + else: + return chatglm.create_chat_completion( + chatglm_pipeline, body, max_context_length, num_threads + ) + if model_chat_format == "functionary-v2" and body.stream: + iterator = extends.functionary_stream_chat( body, llama) + send_chan, recv_chan = anyio.create_memory_object_stream(10) + return EventSourceResponse( + recv_chan, + data_sender_callable=partial( # type: ignore + get_event_publisher, + request=request, + inner_send_chan=send_chan, + iterator=iterator, + ), + sep="\n", + ping_message_factory=_ping_message_factory, + ) + + elif model_chat_format == "openfunctions": + if body.stream: + iterator = extends.openfunction_stream_chat( body, llama) + send_chan, recv_chan = 
anyio.create_memory_object_stream(10) + return EventSourceResponse( + recv_chan, + data_sender_callable=partial( # type: ignore + get_event_publisher, + request=request, + inner_send_chan=send_chan, + iterator=iterator, + ), + sep="\n", + ping_message_factory=_ping_message_factory, + ) + else: + return extends.handle_openfunction(body, llama) + + elif model_chat_format == "firefunction": + return extends.handle_firefunction(body, llama) + + else: + iterator_or_completion: Union[ + llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk] + ] = await run_in_threadpool(llama.create_chat_completion, **kwargs) + + if isinstance(iterator_or_completion, Iterator): + # EAFP: It's easier to ask for forgiveness than permission + first_response = await run_in_threadpool(next, iterator_or_completion) + + print(type(first_response)) + print(first_response) + + # If no exception was raised from first_response, we can assume that + # the iterator is valid and we can use it to stream the response. + def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]: + yield first_response + yield from iterator_or_completion + + send_chan, recv_chan = anyio.create_memory_object_stream(10) + return EventSourceResponse( + recv_chan, + data_sender_callable=partial( # type: ignore + get_event_publisher, + request=request, + inner_send_chan=send_chan, + iterator=iterator(), + ), + sep="\n", + ping_message_factory=_ping_message_factory, + ) + else: + return iterator_or_completion + + +@router.get( + "/v1/models", + summary="Models", + dependencies=[Depends(authenticate)], + tags=[openai_v1_tag], +) +async def get_models( + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +) -> ModelList: + return { + "object": "list", + "data": [ + { + "id": model_alias, + "object": "model", + "owned_by": "me", + "permissions": [], + } + for model_alias in llama_proxy + ], + } + + +extras_tag = "Extras" + + +@router.post( + "/extras/tokenize", + summary="Tokenize", + dependencies=[Depends(authenticate)], + tags=[extras_tag], +) +async def tokenize( + body: TokenizeInputRequest, + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +) -> TokenizeInputResponse: + tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) + + return TokenizeInputResponse(tokens=tokens) + + +@router.post( + "/extras/tokenize/count", + summary="Tokenize Count", + dependencies=[Depends(authenticate)], + tags=[extras_tag], +) +async def count_query_tokens( + body: TokenizeInputRequest, + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +) -> TokenizeInputCountResponse: + tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) + + return TokenizeInputCountResponse(count=len(tokens)) + + +@router.post( + "/extras/detokenize", + summary="Detokenize", + dependencies=[Depends(authenticate)], + tags=[extras_tag], +) +async def detokenize( + body: DetokenizeInputRequest, + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +) -> DetokenizeInputResponse: + text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8") + + return DetokenizeInputResponse(text=text) diff --git a/chatglm.py b/chatglm.py new file mode 100644 index 0000000..fd0c01e --- /dev/null +++ b/chatglm.py @@ -0,0 +1,165 @@ +import asyncio +import json +import logging +import time +import os +import uuid +import chatglm_cpp +import llama_cpp +from sse_starlette.sse import EventSourceResponse +from pprint import pprint +from fastapi import HTTPException, status +from llama_cpp.server.types import ChatCompletionRequestMessage 
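+# chatglm.py adapts a chatglm_cpp.Pipeline to the server's OpenAI-compatible
+# /v1/chat/completions endpoint: _buid_msg() converts the request messages
+# (plus a generated system prompt when tools are supplied) into
+# chatglm_cpp.ChatMessage objects, stream_chat() yields ChatCompletionChunk
+# objects for SSE streaming, and create_chat_completion() builds a full
+# response with token usage and optional tool_calls.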
+from llama_cpp.llama_types import ( + ChatCompletionResponseChoice, + ChatCompletionMessageToolCall, + ChatCompletionStreamResponseChoice, + CreateChatCompletionStreamResponse, + CompletionUsage, + ChatCompletionStreamResponseDelta, + CreateChatCompletionResponse, + ChatCompletionResponseMessage, + ChatCompletionRequestAssistantMessage, + ChatCompletionMessageToolCallFunction, + ChatCompletionStreamResponseDeltaEmpty +) + + +def _buid_msg(body: ChatCompletionRequestMessage): + messages = [ + chatglm_cpp.ChatMessage(role=msg["role"], content=msg["content"]) + for msg in body.messages + ] + if body.tools: + system_content = ( + "Answer the following questions as best as you can. You have access to the following tools:\n" + + json.dumps(body.tools, indent=4) + ) + messages.insert( + 0, chatglm_cpp.ChatMessage( + role="system", content=system_content) + ) + return messages + + +def stream_chat(chatglm_pipeline: chatglm_cpp.Pipeline, body: ChatCompletionRequestMessage, max_context_length: int, num_threads: int): + max_tokens = 1024 + if body.max_tokens: + max_tokens = body.max_tokens + + for chunk in chatglm_pipeline.chat( + messages=_buid_msg(body), + max_length=max_tokens, + max_context_length=max_context_length, + do_sample=body.temperature > 0, + top_p=body.top_p, + temperature=body.temperature, + num_threads=num_threads, + stream=True, + ): + choices = [ChatCompletionStreamResponseChoice( + index=1, + delta=ChatCompletionStreamResponseDelta( + content=chunk.content, role=chunk.role), + finish_reason=None, + logprobs=None, + )] + chunk= llama_cpp.ChatCompletionChunk( + id="chatcmpl-" + uuid.uuid4().hex, + model=body.model, + object="chat.completion.chunk", + created=int(time.time()), + choices=choices, + ) + yield chunk + + +def create_chat_completion(chatglm_pipeline: chatglm_cpp.Pipeline, body: ChatCompletionRequestMessage, max_context_length: int, num_threads: int) -> CreateChatCompletionResponse: + def to_json_arguments(arguments): + def tool_call(**kwargs): + return kwargs + + try: + return json.dumps(eval(arguments, dict(tool_call=tool_call))) + except Exception: + return arguments + if not body.messages: + raise HTTPException(status.HTTP_400_BAD_REQUEST, "empty messages") + + max_tokens = 2048 + if body.max_tokens: + max_tokens = body.max_tokens + + messages = _buid_msg(body) + + output = chatglm_pipeline.chat( + messages=messages, + max_length=max_tokens, + max_context_length=max_context_length, + do_sample=body.temperature > 0, + top_p=body.top_p, + temperature=body.temperature, + num_threads=num_threads, + ) + logging.info( + f'prompt: "{messages[-1].content}", sync response: "{output.content}"') + prompt_tokens = len( + chatglm_pipeline.tokenizer.encode_messages( + messages, max_context_length) + ) + completion_tokens = len( + chatglm_pipeline.tokenizer.encode(output.content, max_tokens)) + + finish_reason = "stop" + tool_calls = None + if output.tool_calls: + tool_calls = [ + ChatCompletionMessageToolCall( + id="tool_call_" + uuid.uuid4().hex, + type=tool_call.type, + function=ChatCompletionMessageToolCallFunction( + name=tool_call.function.name, + arguments=to_json_arguments(tool_call.function.arguments), + ), + ) + for tool_call in output.tool_calls + ] + finish_reason = "function_call" + + if tool_calls is None: + choices = [ + ChatCompletionResponseChoice( + index=0, + message=ChatCompletionResponseMessage( + role="assistant", content=output.content + ), + finish_reason=finish_reason, + logprobs=None, + ) + ] + else: + choices = [ + ChatCompletionResponseChoice( + index=0, 
+ message=ChatCompletionRequestAssistantMessage( + role="assistant", content=output.content, tool_calls=tool_calls + ), + finish_reason=finish_reason, + logprobs=None, + ) + ] + + response = CreateChatCompletionResponse( + id="chatcmpl", + object="chat.completion", + created=int(time.time()), + model="chatglm", + choices=choices, + usage=CompletionUsage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ), + ) + print(response) + return response diff --git a/cli.py b/cli.py new file mode 100644 index 0000000..3dd0076 --- /dev/null +++ b/cli.py @@ -0,0 +1,97 @@ +from __future__ import annotations + +import argparse + +from typing import List, Literal, Union, Any, Type, TypeVar + +from pydantic import BaseModel + + +def _get_base_type(annotation: Type[Any]) -> Type[Any]: + if getattr(annotation, "__origin__", None) is Literal: + assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore + return type(annotation.__args__[0]) # type: ignore + elif getattr(annotation, "__origin__", None) is Union: + assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore + non_optional_args: List[Type[Any]] = [ + arg for arg in annotation.__args__ if arg is not type(None) # type: ignore + ] + if non_optional_args: + return _get_base_type(non_optional_args[0]) + elif ( + getattr(annotation, "__origin__", None) is list + or getattr(annotation, "__origin__", None) is List + ): + assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore + return _get_base_type(annotation.__args__[0]) # type: ignore + return annotation + + +def _contains_list_type(annotation: Type[Any] | None) -> bool: + origin = getattr(annotation, "__origin__", None) + + if origin is list or origin is List: + return True + elif origin in (Literal, Union): + return any(_contains_list_type(arg) for arg in annotation.__args__) # type: ignore + else: + return False + + +def _parse_bool_arg(arg: str | bytes | bool) -> bool: + if isinstance(arg, bytes): + arg = arg.decode("utf-8") + + true_values = {"1", "on", "t", "true", "y", "yes"} + false_values = {"0", "off", "f", "false", "n", "no"} + + arg_str = str(arg).lower().strip() + + if arg_str in true_values: + return True + elif arg_str in false_values: + return False + else: + raise ValueError(f"Invalid boolean argument: {arg}") + + +def add_args_from_model(parser: argparse.ArgumentParser, model: Type[BaseModel]): + """Add arguments from a pydantic model to an argparse parser.""" + + for name, field in model.model_fields.items(): + description = field.description + if field.default and description and not field.is_required(): + description += f" (default: {field.default})" + base_type = ( + _get_base_type(field.annotation) if field.annotation is not None else str + ) + list_type = _contains_list_type(field.annotation) + if base_type is not bool: + parser.add_argument( + f"--{name}", + dest=name, + nargs="*" if list_type else None, + type=base_type, + help=description, + ) + if base_type is bool: + parser.add_argument( + f"--{name}", + dest=name, + type=_parse_bool_arg, + help=f"{description}", + ) + + +T = TypeVar("T", bound=Type[BaseModel]) + + +def parse_model_from_args(model: T, args: argparse.Namespace) -> T: + """Parse a pydantic model from an argparse namespace.""" + return model( + **{ + k: v + for k, v in vars(args).items() + if v is not None and k in model.model_fields + } + ) diff --git a/errors.py b/errors.py new file mode 100644 
index 0000000..fbf9fd8 --- /dev/null +++ b/errors.py @@ -0,0 +1,210 @@ +from __future__ import annotations + +import sys +import traceback +import time +from re import compile, Match, Pattern +from typing import Callable, Coroutine, Optional, Tuple, Union, Dict +from typing_extensions import TypedDict + + +from fastapi import ( + Request, + Response, + HTTPException, +) +from fastapi.responses import JSONResponse +from fastapi.routing import APIRoute + +from llama_cpp.server.types import ( + CreateCompletionRequest, + CreateEmbeddingRequest, + CreateChatCompletionRequest, +) + + +class ErrorResponse(TypedDict): + """OpenAI style error response""" + + message: str + type: str + param: Optional[str] + code: Optional[str] + + +class ErrorResponseFormatters: + """Collection of formatters for error responses. + + Args: + request (Union[CreateCompletionRequest, CreateChatCompletionRequest]): + Request body + match (Match[str]): Match object from regex pattern + + Returns: + Tuple[int, ErrorResponse]: Status code and error response + """ + + @staticmethod + def context_length_exceeded( + request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], + match, # type: Match[str] # type: ignore + ) -> Tuple[int, ErrorResponse]: + """Formatter for context length exceeded error""" + + context_window = int(match.group(2)) + prompt_tokens = int(match.group(1)) + completion_tokens = request.max_tokens + if hasattr(request, "messages"): + # Chat completion + message = ( + "This model's maximum context length is {} tokens. " + "However, you requested {} tokens " + "({} in the messages, {} in the completion). " + "Please reduce the length of the messages or completion." + ) + else: + # Text completion + message = ( + "This model's maximum context length is {} tokens, " + "however you requested {} tokens " + "({} in your prompt; {} for the completion). " + "Please reduce your prompt; or completion length." 
+ ) + return 400, ErrorResponse( + message=message.format( + context_window, + (completion_tokens or 0) + prompt_tokens, + prompt_tokens, + completion_tokens, + ), # type: ignore + type="invalid_request_error", + param="messages", + code="context_length_exceeded", + ) + + @staticmethod + def model_not_found( + request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], + match, # type: Match[str] # type: ignore + ) -> Tuple[int, ErrorResponse]: + """Formatter for model_not_found error""" + + model_path = str(match.group(1)) + message = f"The model `{model_path}` does not exist" + return 400, ErrorResponse( + message=message, + type="invalid_request_error", + param=None, + code="model_not_found", + ) + + +class RouteErrorHandler(APIRoute): + """Custom APIRoute that handles application errors and exceptions""" + + # key: regex pattern for original error message from llama_cpp + # value: formatter function + pattern_and_formatters: Dict[ + "Pattern[str]", + Callable[ + [ + Union["CreateCompletionRequest", "CreateChatCompletionRequest"], + "Match[str]", + ], + Tuple[int, ErrorResponse], + ], + ] = { + compile( + r"Requested tokens \((\d+)\) exceed context window of (\d+)" + ): ErrorResponseFormatters.context_length_exceeded, + compile( + r"Model path does not exist: (.+)" + ): ErrorResponseFormatters.model_not_found, + } + + def error_message_wrapper( + self, + error: Exception, + body: Optional[ + Union[ + "CreateChatCompletionRequest", + "CreateCompletionRequest", + "CreateEmbeddingRequest", + ] + ] = None, + ) -> Tuple[int, ErrorResponse]: + """Wraps error message in OpenAI style error response""" + print(f"Exception: {str(error)}", file=sys.stderr) + traceback.print_exc(file=sys.stderr) + if body is not None and isinstance( + body, + ( + CreateCompletionRequest, + CreateChatCompletionRequest, + ), + ): + # When text completion or chat completion + for pattern, callback in self.pattern_and_formatters.items(): + match = pattern.search(str(error)) + if match is not None: + return callback(body, match) + + # Wrap other errors as internal server error + return 500, ErrorResponse( + message=str(error), + type="internal_server_error", + param=None, + code=None, + ) + + def get_route_handler( + self, + ) -> Callable[[Request], Coroutine[None, None, Response]]: + """Defines custom route handler that catches exceptions and formats + in OpenAI style error response""" + + original_route_handler = super().get_route_handler() + + async def custom_route_handler(request: Request) -> Response: + try: + start_sec = time.perf_counter() + response = await original_route_handler(request) + elapsed_time_ms = int((time.perf_counter() - start_sec) * 1000) + response.headers["openai-processing-ms"] = f"{elapsed_time_ms}" + return response + except HTTPException as unauthorized: + # api key check failed + raise unauthorized + except Exception as exc: + json_body = await request.json() + try: + if "messages" in json_body: + # Chat completion + body: Optional[ + Union[ + CreateChatCompletionRequest, + CreateCompletionRequest, + CreateEmbeddingRequest, + ] + ] = CreateChatCompletionRequest(**json_body) + elif "prompt" in json_body: + # Text completion + body = CreateCompletionRequest(**json_body) + else: + # Embedding + body = CreateEmbeddingRequest(**json_body) + except Exception: + # Invalid request body + body = None + + # Get proper error message from the exception + ( + status_code, + error_message, + ) = self.error_message_wrapper(error=exc, body=body) + return JSONResponse( + {"error": 
error_message}, + status_code=status_code, + ) + + return custom_route_handler diff --git a/extends.py b/extends.py new file mode 100644 index 0000000..0986e8f --- /dev/null +++ b/extends.py @@ -0,0 +1,400 @@ +from llama_cpp.server.types import ( + CreateCompletionRequest, + CreateEmbeddingRequest, + CreateChatCompletionRequest, + ChatCompletionRequestMessage, +) +from llama_cpp.llama_types import ( + ChatCompletionResponseChoice, + ChatCompletionMessageToolCall, + ChatCompletionStreamResponseChoice, + CreateChatCompletionStreamResponse, + CompletionUsage, + ChatCompletionStreamResponseDelta, + CreateChatCompletionResponse, + ChatCompletionResponseMessage, + ChatCompletionRequestAssistantMessage, + ChatCompletionMessageToolCallFunction, + ChatCompletionStreamResponseDeltaEmpty, + ChatCompletionMessageToolCallChunk, + ChatCompletionMessageToolCallChunkFunction, +) +import json +import uuid +import ast +import llama_cpp +import time + + +def process_ast_node(node): + # Check if the node is a function call + if isinstance(node, ast.Call): + # Return a string representation of the function call + return ast.unparse(node) + else: + # Convert the node to source code and evaluate to get the value + node_str = ast.unparse(node) + return eval(node_str) + + +def parse_python_function_call(call_str): + tree = ast.parse(call_str) + expr = tree.body[0] + + call_node = expr.value + function_name = ( + call_node.func.id + if isinstance(call_node.func, ast.Name) + else str(call_node.func) + ) + + parameters = {} + noNameParam = [] + + # Process positional arguments + for arg in call_node.args: + noNameParam.append(process_ast_node(arg)) + + # Process keyword arguments + for kw in call_node.keywords: + parameters[kw.arg] = process_ast_node(kw.value) + + if noNameParam: + parameters["None"] = noNameParam + + function_dict = {"name": function_name, "arguments": parameters} + return function_dict + + +FN_CALL_DELIMITER = "<>" + + +def strip_function_calls(content: str) -> list[str]: + """ + Split the content by the function call delimiter and remove empty strings + """ + return [ + element.strip() + for element in content.split(FN_CALL_DELIMITER)[1:] + if element.strip() + ] + + +def parse_function_call(call: str) -> dict[str, any]: + """ + This is temporary. The long term solution is to union all the + types of the parameters from the user's input function definition, + and check which language is a proper super set of the union type. + """ + try: + return parse_python_function_call(call) + except Exception as e: + # If Python parsing fails, try Java parsing + + return None + + +def get_openfunctions_prompt(messages: list = [], functions: list = []) -> str: + """ + Generates a conversation prompt based on the user's query and a list of functions. + + Parameters: + - user_query (str): The user's query. + - functions (list): A list of functions to include in the prompt. + + Returns: + - str: The formatted conversation prompt. + """ + system = "You are an AI programming assistant" # , utilizing the Gorilla LLM model, developed by Gorilla LLM, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer." 
+ if len(messages) > 0: + if messages[0]["role"] == "system": + system = messages[0]["content"] + messages = messages[1:] + + user_query = "" + for message in messages: + if message["role"] == "user": + user_query += "### Instruction: <> " + message["content"] + "\n" + elif message["role"] == "system": + user_query += message["content"] + else: + user_query += "### Response:\n" + message["content"] + "\n<|EOT|>\n" + + if len(functions) == 0: + return f"{system}\n### Instruction: <> {user_query}\n### Response: " + functions_string = json.dumps(functions) + result = f"<|begin▁of▁sentence|>{system}\n### Instruction: <>{functions_string}\n{user_query} ### Response: " + + print(result) + return result + + +def format_response(response): + result = [] + choices = response["choices"] + for choice in choices: + text = choice["text"] + calls = strip_function_calls(text) + tool_calls = [] + for function_call in calls: + fc = parse_function_call(function_call) + if fc is not None: + tool_calls.append( + { + "id": "call_" + uuid.uuid4().hex, + "function": { + "name": fc["name"], + "arguments": json.dumps(fc["arguments"]), + }, + "type": "function", + } + ) + result.append( + { + "finish_reason": choice["finish_reason"], + "index": choice["index"], + "logprobs": choice["logprobs"], + "message": { + "content": text, + "role": "assistant", + "tool_calls": tool_calls, + }, + } + ) + return result + + +def handle_openfunction(body: CreateCompletionRequest, llama) -> any: + tools = body.tools + if tools is None: + tools = [] + user_prompt = get_openfunctions_prompt(body.messages, tools) + response = llama( + user_prompt, + stop=["<|EOT|>"], + temperature=body.temperature, + top_p=body.top_p, + max_tokens=body.max_tokens, + ) + tool_calls = format_response(response) + return { + "id": response["id"], + "object": "chat.completion", + "created": response["created"], + "model": body.model, + "choices": tool_calls, + "usage": { + "prompt_tokens": response["usage"]["prompt_tokens"], + "completion_tokens": response["usage"]["completion_tokens"], + "total_tokens": response["usage"]["total_tokens"], + }, + } + + +def openfunction_stream_chat(body: CreateCompletionRequest, llama) -> any: + tools = body.tools + if tools is None: + tools = [] + user_prompt = get_openfunctions_prompt(body.messages, tools) + if tools is None: + for chunk in llama( + user_prompt, + stop=["<|EOT|>"], + temperature=body.temperature, + top_p=body.top_p, + max_tokens=body.max_tokens, + stream=True, + ): + choices = [ + ChatCompletionStreamResponseChoice( + index=1, + delta=ChatCompletionStreamResponseDelta( + content=choice["text"], + ), + finish_reason=choice["finish_reason"], + logprobs=choice["logprobs"], + ) + for choice in chunk["choices"] + ] + chatCompletionChunk = llama_cpp.ChatCompletionChunk( + id="chatcmpl-" + uuid.uuid4().hex, + model=body.model, + object="chat.completion.chunk", + created=int(time.time()), + choices=choices, + ) + yield chatCompletionChunk + else: + response = llama( + user_prompt, + stop=["<|EOT|>"], + temperature=body.temperature, + top_p=body.top_p, + max_tokens=body.max_tokens, + stream=False, + ) + tool_calls = format_response(response) + choices = response["choices"] + stream_response_choices = [] + choices_index = 0 + for choice in choices: + text = choice["text"] + calls = strip_function_calls(text) + tool_calls = [] + index = 0 + for function_call in calls: + fc = parse_function_call(function_call) + if fc is not None: + tool_calls.append( + ChatCompletionMessageToolCallChunk( + index=index, + id="call_" 
+ uuid.uuid4().hex, + type="function", + function=ChatCompletionMessageToolCallChunkFunction( + name=fc["name"], + arguments=json.dumps(fc["arguments"]), + ), + ) + ) + index += 1 + stream_response_choices.append( + ChatCompletionStreamResponseChoice( + index=choices_index, + delta=ChatCompletionStreamResponseDelta( + content=json.dumps(calls), + finish_reason=choice["finish_reason"], + logprobs=choice["logprobs"], + tool_calls=tool_calls, + ), + ) + ) + choices_index += 1 + + chatCompletionChunk = llama_cpp.ChatCompletionChunk( + id="chatcmpl-" + uuid.uuid4().hex, + model=body.model, + object="chat.completion.chunk", + created=int(time.time()), + choices=stream_response_choices, + ) + yield chatCompletionChunk + + +def functionary_stream_chat(body: CreateCompletionRequest, llama) -> any: + response = llama.create_chat_completion( + messages=body.messages, tools=body.tools, tool_choice="auto", stream=False + ) + stream_response_choices = [] + choices = response["choices"] + choices_index = 0 + for choice in choices: + tool_calls = [] + calls = choice["message"]["tool_calls"] + index = 0 + for call in calls: + tool_calls.append( + ChatCompletionMessageToolCallChunk( + index=index, + id=call["id"], + type=call["type"], + function=ChatCompletionMessageToolCallChunkFunction( + name=call["function"]["name"], + arguments=json.dumps(call["function"]["arguments"]), + ), + ) + ) + index += 1 + stream_response_choices.append( + ChatCompletionStreamResponseChoice( + index=choices_index, + delta=ChatCompletionStreamResponseDelta( + content=None, + finish_reason=choice["finish_reason"], + logprobs=choice["logprobs"], + tool_calls=tool_calls, + ), + ) + ) + choices_index += 1 + + chatCompletionChunk = llama_cpp.ChatCompletionChunk( + id=response["id"], + model=body.model, + object="chat.completion.chunk", + created=response["created"], + choices=stream_response_choices, + ) + print(chatCompletionChunk) + yield chatCompletionChunk + + +def handle_firefunction(body: CreateChatCompletionRequest, llama) -> any: + messages = [] + function_spec = json.dumps(body.tools) + messages.append({"role": "functions", "content": function_spec}) + for message in body.messages: + messages.append({"role": message["role"], "content": message["content"]}) + response = llama.create_chat_completion( + messages=messages, + tools=function_spec, + tool_choice="auto", + temperature=body.temperature, + top_p=body.top_p, + logprobs=body.logprobs, + max_tokens=body.max_tokens, + ) + choices = [] + for choice in response["choices"]: + message_content = choice["message"]["content"] + if "" in message_content: + function_call_json = message_content[len("") :] + function_call_data = json.loads(function_call_json) + choices.append( + { + "index": choice["index"], + "logprobs": choice["logprobs"], + "finish_reason": choice["finish_reason"], + "message": { + "content": message_content, + "role": choice["message"]["role"], + "tool_calls": [ + { + "id": "tool_call_" + uuid.uuid4().hex, + "type": "function", + "function": { + "name": function_call_data["name"], + "arguments": json.dumps( + function_call_data["arguments"] + ), + }, + } + ], + }, + } + ) + else: + choices.append( + { + "index": choice["index"], + "logprobs": choice["logprobs"], + "finish_reason": choice["finish_reason"], + "message": { + "content": message_content, + "role": choice["message"]["role"], + }, + } + ) + + result = { + "id": response["id"], + "object": response["object"], + "created": response["created"], + "model": body.model, + "choices": choices, + "usage": { + 
"prompt_tokens": response["usage"]["prompt_tokens"], + "completion_tokens": response["usage"]["completion_tokens"], + "total_tokens": response["usage"]["total_tokens"], + }, + } + return result diff --git a/model.py b/model.py new file mode 100644 index 0000000..c256173 --- /dev/null +++ b/model.py @@ -0,0 +1,251 @@ +from __future__ import annotations + +import json + +from typing import Dict, Optional, Union, List + +import llama_cpp +import llama_cpp.llama_speculative as llama_speculative +import llama_cpp.llama_tokenizer as llama_tokenizer +from llama_cpp.server.settings import ModelSettings +import chatglm_cpp +import chatglm +import gc +from llama_cpp import Llama + + +class LlamaProxy: + def __init__(self, models: List[ModelSettings]) -> None: + assert len(models) > 0, "No models provided!" + + self._model_settings_dict: dict[str, ModelSettings] = {} + for model in models: + if not model.model_alias: + model.model_alias = model.model + self._model_settings_dict[model.model_alias] = model + + self._current_model: Optional[llama_cpp.Llama] = None + self._current_model_alias: Optional[str] = None + + self._default_model_settings: ModelSettings = models[0] + self._default_model_alias: str = self._default_model_settings.model_alias # type: ignore + + # Load default model + self._current_model = self.load_llama_from_model_settings( + self._default_model_settings + ) + self._current_model_alias = self._default_model_alias + + def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: + if model is None: + model = self._default_model_alias + + if model not in self._model_settings_dict: + model = self._default_model_alias + + if model == self._current_model_alias: + if self._current_model is not None: + return self._current_model + + self._current_model = None + + settings = self._model_settings_dict[model] + self._current_model = self.load_llama_from_model_settings(settings) + self._current_model_alias = model + return self._current_model + + def __getitem__(self, model: str): + return self._model_settings_dict[model].model_dump() + + def __setitem__(self, model: str, settings: Union[ModelSettings, str, bytes]): + if isinstance(settings, (bytes, str)): + settings = ModelSettings.model_validate_json(settings) + self._model_settings_dict[model] = settings + + def __iter__(self): + for model in self._model_settings_dict: + yield model + + def free(self): + if self._current_model: + del self._current_model + gc.collect() + + @staticmethod + def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: + chat_handler = None + if settings.chat_format == "llava-1-5": + assert settings.clip_model_path is not None, "clip model not found" + chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler( + clip_model_path=settings.clip_model_path, verbose=settings.verbose + ) + elif settings.chat_format == "hf-autotokenizer": + assert ( + settings.hf_pretrained_model_name_or_path is not None + ), "hf_pretrained_model_name_or_path must be set for hf-autotokenizer" + chat_handler = ( + llama_cpp.llama_chat_format.hf_autotokenizer_to_chat_completion_handler( + settings.hf_pretrained_model_name_or_path + ) + ) + elif settings.chat_format == "hf-tokenizer-config": + assert ( + settings.hf_tokenizer_config_path is not None + ), "hf_tokenizer_config_path must be set for hf-tokenizer-config" + chat_handler = llama_cpp.llama_chat_format.hf_tokenizer_config_to_chat_completion_handler( + json.load(open(settings.hf_tokenizer_config_path)) + ) + + tokenizer: 
Optional[llama_cpp.BaseLlamaTokenizer] = None + if settings.hf_pretrained_model_name_or_path is not None: + tokenizer = llama_tokenizer.LlamaHFTokenizer.from_pretrained( + settings.hf_pretrained_model_name_or_path + ) + + draft_model = None + if settings.draft_model is not None: + draft_model = llama_speculative.LlamaPromptLookupDecoding( + num_pred_tokens=settings.draft_model_num_pred_tokens + ) + + kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None + if settings.kv_overrides is not None: + assert isinstance(settings.kv_overrides, list) + kv_overrides = {} + for kv in settings.kv_overrides: + key, value = kv.split("=") + if ":" in value: + value_type, value = value.split(":") + if value_type == "bool": + kv_overrides[key] = value.lower() in ["true", "1"] + elif value_type == "int": + kv_overrides[key] = int(value) + elif value_type == "float": + kv_overrides[key] = float(value) + else: + raise ValueError(f"Unknown value type {value_type}") + + import functools + + kwargs = {} + + if settings.hf_model_repo_id is not None: + create_fn = functools.partial( + llama_cpp.Llama.from_pretrained, + repo_id=settings.hf_model_repo_id, + filename=settings.model, + ) + elif settings.chat_format == "chatglm": + create_fn = chatglm_cpp.Pipeline + kwargs["model_path"] = settings.model + else: + create_fn = llama_cpp.Llama + kwargs["model_path"] = settings.model + + if settings.chat_format == "chatglm3": + _model = chatglm_cpp.Pipeline(settings.model) + _model.create_chat_completion = chatglm.create_chat_completion + + elif settings.chat_format == "firefunction" : + _model = Llama.from_pretrained( + # repo_id="neopolita/firefunction-v1-gguf", + # filename="firefunction-v1_q2_k.gguf", + repo_id=settings.hf_model_repo_id, + filename=settings.model, + tokenizer=tokenizer, + n_gpu_layers=settings.n_gpu_layers, + main_gpu=settings.main_gpu, + tensor_split=settings.tensor_split, + vocab_only=settings.vocab_only, + use_mmap=settings.use_mmap, + use_mlock=settings.use_mlock, + kv_overrides=kv_overrides, + # Context Params + seed=settings.seed, + n_ctx=settings.n_ctx, + n_batch=settings.n_batch, + n_threads=settings.n_threads, + n_threads_batch=settings.n_threads_batch, + ) + elif settings.chat_format == "openfunctions": + _model = Llama( + model_path=settings.model, + tokenizer=tokenizer, + n_gpu_layers=settings.n_gpu_layers, + main_gpu=settings.main_gpu, + tensor_split=settings.tensor_split, + vocab_only=settings.vocab_only, + use_mmap=settings.use_mmap, + use_mlock=settings.use_mlock, + kv_overrides=kv_overrides, + # Context Params + seed=settings.seed, + n_ctx=settings.n_ctx, + n_batch=settings.n_batch, + n_threads=settings.n_threads, + n_threads_batch=settings.n_threads_batch, + ) + else: + _model = create_fn( + **kwargs, + # Model Params + n_gpu_layers=settings.n_gpu_layers, + main_gpu=settings.main_gpu, + tensor_split=settings.tensor_split, + vocab_only=settings.vocab_only, + use_mmap=settings.use_mmap, + use_mlock=settings.use_mlock, + kv_overrides=kv_overrides, + # Context Params + seed=settings.seed, + n_ctx=settings.n_ctx, + n_batch=settings.n_batch, + n_threads=settings.n_threads, + n_threads_batch=settings.n_threads_batch, + rope_scaling_type=settings.rope_scaling_type, + rope_freq_base=settings.rope_freq_base, + rope_freq_scale=settings.rope_freq_scale, + yarn_ext_factor=settings.yarn_ext_factor, + yarn_attn_factor=settings.yarn_attn_factor, + yarn_beta_fast=settings.yarn_beta_fast, + yarn_beta_slow=settings.yarn_beta_slow, + yarn_orig_ctx=settings.yarn_orig_ctx, + 
mul_mat_q=settings.mul_mat_q, + logits_all=settings.logits_all, + embedding=settings.embedding, + offload_kqv=settings.offload_kqv, + # Sampling Params + last_n_tokens_size=settings.last_n_tokens_size, + # LoRA Params + lora_base=settings.lora_base, + lora_path=settings.lora_path, + # Backend Params + numa=settings.numa, + # Chat Format Params + chat_format=settings.chat_format, + chat_handler=chat_handler, + # Speculative Decoding + draft_model=draft_model, + # KV Cache Quantization + type_k=settings.type_k, + type_v=settings.type_v, + # Tokenizer + tokenizer=tokenizer, + # Misc + verbose=settings.verbose, + ) + if settings.cache: + if settings.cache_type == "disk": + if settings.verbose: + print( + f"Using disk cache with size {settings.cache_size}") + cache = llama_cpp.LlamaDiskCache( + capacity_bytes=settings.cache_size) + else: + if settings.verbose: + print( + f"Using ram cache with size {settings.cache_size}") + cache = llama_cpp.LlamaRAMCache( + capacity_bytes=settings.cache_size) + _model.set_cache(cache) + return _model diff --git a/server.cfg b/server.cfg new file mode 100644 index 0000000..5cebfa5 --- /dev/null +++ b/server.cfg @@ -0,0 +1,138 @@ +{ + "host": "0.0.0.0", + "port": 8000, + "models": [ + { + "model": "/home/test/llm-models/chatglm3-ggml-q8.bin", + "model_alias": "chatglm3", + "chat_format": "chatglm3", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_batch": 512 + }, + { + "model": "/home/test/llm-models/gorilla-openfunctions-v2-q4_K_M.gguf", + "hf_pretrained_model_name_or_path":"gorilla-llm/gorilla-openfunctions-v2", + "model_alias": "openfunctions", + "chat_format": "openfunctions", + "n_gpu_layers": 0, + "n_ctx":4096, + "offload_kqv": true, + "n_threads": 12, + "n_batch": 512 + }, + { + "model": "/home/test/llm-models/functionary-small-v2.4.Q4_0.gguf", + "model_alias": "functionary", + "chat_format": "functionary-v2", + "hf_pretrained_model_name_or_path":"meetkai/functionary-small-v2.4", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_batch": 512, + "n_ctx": 8192, + "use_mmap":true + }, + { + "model": "/home/test/llm-models/bge-large-zh-v1.5-q4_k_m.gguf", + "model_alias": "bge-large-zh-v1.5", + "chat_format": "bert", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_batch": 512 + }, + { + "model": "/home/test/llm-models/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf", + "model_alias": "llama-3-8b", + "chat_format": "llama-3", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_batch": 512, + "n_ctx": 8192, + "embedding": true + }, + { + "model": "/home/test/llm-models/gemma-7b-it.Q4_K_M.gguf", + "model_alias": "gemma-7b", + "chat_format": "gemma", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_ctx": 8192, + "n_batch": 512 + }, + { + "model": "/home/test/llm-models/ggml-model-q4_k.gguf", + "model_alias": "llava", + "chat_format": "llava-1-5", + "clip_model_path": "/home/test/llm-models/mmproj-model-f16.gguf", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_ctx": 4096, + "n_batch": 512 + }, + { + "model": "/home/test/llm-models/mistral-7b-instruct-v0.2.Q4_K_M.gguf", + "model_alias": "mistral-7b", + "chat_format": "mistral-instruct", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_ctx": 8192, + "n_batch": 512 + }, + { + "model": "/home/test/llm-models/mixtral-8x7b-instruct-v0.1.Q3_K_M.gguf", + "model_alias": "mixtral-8x7b-instruct", + "chat_format": "mistral-instruct", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_ctx": 
8192, + "n_batch": 512 + }, + { + "model": "/home/test/llm-models/sqlcoder-7b-2.Q4_K_M.gguf", + "model_alias": "sqlcoder", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_ctx": 16384, + "n_batch": 1024 + }, + { + "model": "/home/test/llm-models/qwen1_5-14b-chat-q4_k_m.gguf", + "model_alias": "qwen", + "chat_format":"qwen", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_batch": 1024 + }, + { + "model": "/home/test/llm-models/Baichuan2-13B-Chat-Q4_K_M.gguf", + "model_alias": "baichuan-2", + "chat_format":"baichuan-2", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_batch": 1024 + }, + { + "model": "firefunction-v1_q2_k.gguf", + "hf_model_repo_id":"neopolita/firefunction-v1-gguf", + "model_alias": "firefunction", + "chat_format": "firefunction", + "n_gpu_layers": 0, + "n_ctx":4096, + "offload_kqv": true, + "n_threads": 12, + "n_batch": 512 + } + + ] +} diff --git a/settings.py b/settings.py new file mode 100644 index 0000000..934aecd --- /dev/null +++ b/settings.py @@ -0,0 +1,211 @@ +from __future__ import annotations + +import multiprocessing + +from typing import Optional, List, Literal, Union +from pydantic import Field +from pydantic_settings import BaseSettings + +import llama_cpp + +# Disable warning for model and model_alias settings +BaseSettings.model_config["protected_namespaces"] = () + + +class ModelSettings(BaseSettings): + """Model settings used to load a Llama model.""" + + model: str = Field( + description="The path to the model to use for generating completions." + ) + model_alias: Optional[str] = Field( + default=None, + description="The alias of the model to use for generating completions.", + ) + # Model Params + n_gpu_layers: int = Field( + default=0, + ge=-1, + description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.", + ) + split_mode: int = Field( + default=llama_cpp.LLAMA_SPLIT_MODE_LAYER, + description="The split mode to use.", + ) + main_gpu: int = Field( + default=0, + ge=0, + description="Main GPU to use.", + ) + tensor_split: Optional[List[float]] = Field( + default=None, + description="Split layers across multiple GPUs in proportion.", + ) + vocab_only: bool = Field( + default=False, description="Whether to only return the vocabulary." + ) + use_mmap: bool = Field( + default=llama_cpp.llama_supports_mmap(), + description="Use mmap.", + ) + use_mlock: bool = Field( + default=llama_cpp.llama_supports_mlock(), + description="Use mlock.", + ) + kv_overrides: Optional[List[str]] = Field( + default=None, + description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.", + ) + # Context Params + seed: int = Field( + default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random." + ) + n_ctx: int = Field(default=2048, ge=0, description="The context size.") + n_batch: int = Field( + default=512, ge=1, description="The batch size to use per eval." 
+ ) + n_threads: int = Field( + default=max(multiprocessing.cpu_count() // 2, 1), + ge=1, + description="The number of threads to use.", + ) + n_threads_batch: int = Field( + default=max(multiprocessing.cpu_count(), 1), + ge=0, + description="The number of threads to use when batch processing.", + ) + rope_scaling_type: int = Field( + default=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED + ) + rope_freq_base: float = Field(default=0.0, description="RoPE base frequency") + rope_freq_scale: float = Field( + default=0.0, description="RoPE frequency scaling factor" + ) + yarn_ext_factor: float = Field(default=-1.0) + yarn_attn_factor: float = Field(default=1.0) + yarn_beta_fast: float = Field(default=32.0) + yarn_beta_slow: float = Field(default=1.0) + yarn_orig_ctx: int = Field(default=0) + mul_mat_q: bool = Field( + default=True, description="if true, use experimental mul_mat_q kernels" + ) + logits_all: bool = Field(default=True, description="Whether to return logits.") + embedding: bool = Field(default=True, description="Whether to use embeddings.") + offload_kqv: bool = Field( + default=True, description="Whether to offload kqv to the GPU." + ) + # Sampling Params + last_n_tokens_size: int = Field( + default=64, + ge=0, + description="Last n tokens to keep for repeat penalty calculation.", + ) + # LoRA Params + lora_base: Optional[str] = Field( + default=None, + description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.", + ) + lora_path: Optional[str] = Field( + default=None, + description="Path to a LoRA file to apply to the model.", + ) + # Backend Params + numa: Union[bool, int] = Field( + default=False, + description="Enable NUMA support.", + ) + # Chat Format Params + chat_format: Optional[str] = Field( + default=None, + description="Chat format to use.", + ) + clip_model_path: Optional[str] = Field( + default=None, + description="Path to a CLIP model to use for multi-modal chat completion.", + ) + # Cache Params + cache: bool = Field( + default=False, + description="Use a cache to reduce processing times for evaluated prompts.", + ) + cache_type: Literal["ram", "disk"] = Field( + default="ram", + description="The type of cache to use. Only used if cache is True.", + ) + cache_size: int = Field( + default=2 << 30, + description="The size of the cache in bytes. Only used if cache is True.", + ) + # Tokenizer Options + hf_tokenizer_config_path: Optional[str] = Field( + default=None, + description="The path to a HuggingFace tokenizer_config.json file.", + ) + hf_pretrained_model_name_or_path: Optional[str] = Field( + default=None, + description="The model name or path to a pretrained HuggingFace tokenizer model. Same as you would pass to AutoTokenizer.from_pretrained().", + ) + # Loading from HuggingFace Model Hub + hf_model_repo_id: Optional[str] = Field( + default=None, + description="The model repo id to use for the HuggingFace tokenizer model.", + ) + # Speculative Decoding + draft_model: Optional[str] = Field( + default=None, + description="Method to use for speculative decoding. 
One of (prompt-lookup-decoding).", + ) + draft_model_num_pred_tokens: int = Field( + default=10, + description="Number of tokens to predict using the draft model.", + ) + # KV Cache Quantization + type_k: Optional[int] = Field( + default=None, + description="Type of the key cache quantization.", + ) + type_v: Optional[int] = Field( + default=None, + description="Type of the value cache quantization.", + ) + # Misc + verbose: bool = Field( + default=True, description="Whether to print debug information." + ) + + +class ServerSettings(BaseSettings): + """Server settings used to configure the FastAPI and Uvicorn server.""" + + # Uvicorn Settings + host: str = Field(default="localhost", description="Listen address") + port: int = Field(default=8000, description="Listen port") + ssl_keyfile: Optional[str] = Field( + default=None, description="SSL key file for HTTPS" + ) + ssl_certfile: Optional[str] = Field( + default=None, description="SSL certificate file for HTTPS" + ) + # FastAPI Settings + api_key: Optional[str] = Field( + default=None, + description="API key for authentication. If set all requests need to be authenticated.", + ) + interrupt_requests: bool = Field( + default=True, + description="Whether to interrupt requests when a new request is received.", + ) + disable_ping_events: bool = Field( + default=False, + description="Disable EventSource pings (may be needed for some clients).", + ) + + +class Settings(ServerSettings, ModelSettings): + pass + + +class ConfigFileSettings(ServerSettings): + """Configuration file format settings.""" + + models: List[ModelSettings] = Field(default=[], description="Model configs")
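For completeness, a minimal sketch of wiring these settings together without a config file, mirroring the code path in `__main__.py`. The model path and alias are illustrative values borrowed from `server.cfg` and are not part of the patch; adjust them to your environment.

```python
# Programmatic startup, equivalent to running the module without --config_file.
import uvicorn

from app import create_app
from llama_cpp.server.settings import ModelSettings, ServerSettings

server_settings = ServerSettings(host="0.0.0.0", port=8000)

# Example model entry (illustrative path/alias taken from server.cfg above).
model_settings = [
    ModelSettings(
        model="/home/test/llm-models/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
        model_alias="llama-3-8b",
        chat_format="llama-3",
        n_ctx=8192,
        n_threads=12,
    )
]

app = create_app(
    server_settings=server_settings,
    model_settings=model_settings,
)

if __name__ == "__main__":
    uvicorn.run(app, host=server_settings.host, port=server_settings.port)
```

Once the server is up, the OpenAI-compatible endpoints (`/v1/chat/completions`, `/v1/completions`, `/v1/embeddings`, plus the `/extras/*` tokenize helpers) select a model by its `model_alias` from the request body, falling back to the first configured model when the alias is unknown.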