From 17a8ab926dd88a2ca4373bf7b207476adab88aa5 Mon Sep 17 00:00:00 2001
From: luotingdan
Date: Wed, 11 Oct 2023 11:21:25 +0800
Subject: [PATCH 1/3] add client

---
 llm/client/Client.py       |  124 +++++
 llm/client/api_client.py   |  301 +++++++++++
 llm/client/conversation.py | 1044 ++++++++++++++++++++++++++++++++++++
 llm/client/test_client.py  |    7 +
 llm/client/test_openai.py  |   58 ++
 5 files changed, 1534 insertions(+)
 create mode 100644 llm/client/Client.py
 create mode 100644 llm/client/api_client.py
 create mode 100644 llm/client/conversation.py
 create mode 100644 llm/client/test_client.py
 create mode 100644 llm/client/test_openai.py

diff --git a/llm/client/Client.py b/llm/client/Client.py
new file mode 100644
index 0000000000..55e9889738
--- /dev/null
+++ b/llm/client/Client.py
@@ -0,0 +1,124 @@
+import queue
+import json
+import sys
+from functools import partial
+
+import numpy as np
+import tritonclient.grpc as grpcclient
+from tritonclient.utils import *
+
+
+class UserData:
+    def __init__(self):
+        self._completed_requests = queue.Queue()
+
+    def reset(self):
+        # Drop any leftover results from a previous request so that the next
+        # request starts with an empty queue.
+        self._completed_requests = queue.Queue()
+
+
+def callback(user_data, result, error):
+    if error:
+        user_data._completed_requests.put(error)
+    else:
+        user_data._completed_requests.put(result)
+
+
+class grpcClient:
+    def __init__(
+        self,
+        base_url: str,
+        model_name: str,
+        model_version: str = "1",
+        timeout: int = 100,
+    ):
+        """
+        Args:
+            base_url (`str`): inference server grpc url
+            model_name (`str`): name of the model deployed on the server
+            model_version (`str`): default "1"
+            timeout (`int`): inference timeout in seconds
+        """
+        self._server_url = base_url
+        self._model_name = model_name
+        self._model_version = model_version
+        self.timeout = timeout
+        self._client = grpcclient.InferenceServerClient(
+            base_url, verbose=False)
+
+        error = self._verify_triton_state(self._client)
+        if error:
+            raise RuntimeError(
+                f"Could not communicate with Triton Server: {error}")
+
+        self.inputs = [grpcclient.InferInput("IN", [1], np_to_triton_dtype(np.object_))]
+        self.outputs = [grpcclient.InferRequestedOutput("OUT")]
+        self.has_init = False
+        self.user_data = UserData()
+
+    def _verify_triton_state(self, triton_client):
+        if not triton_client.is_server_live():
+            return f"Triton server {self._server_url} is not live"
+        elif not triton_client.is_server_ready():
+            return f"Triton server {self._server_url} is not ready"
+        elif not triton_client.is_model_ready(self._model_name,
+                                              self._model_version):
+            return f"Model {self._model_name}:{self._model_version} is not ready"
+        return None
+
+    def generate(
+        self,
+        prompt: str,
+        request_id: str = "0",
+        top_p: float = 0.0,
+        temperature: float = 1.0,
+        max_dec_len: int = 1024,
+        min_dec_len: int = 2,
+        penalty_score: float = 1.0,
+        frequency_score: float = 0.99,
+        eos_token_id: int = 2,
+        presence_score: float = 0.0
+    ):
+
+        # text = data_process(prompt)
+        req_dict = {
+            "text": prompt,
+            "topp": top_p,
+            "temperature": temperature,
+            "max_dec_len": max_dec_len,
+            "min_dec_len": min_dec_len,
+            "penalty_score": penalty_score,
+            "frequency_score": frequency_score,
+            "eos_token_id": eos_token_id,
+            "model_test": "test",
+            "presence_score": presence_score
+        }
+
+        try:
+            # Establish stream
+            if not self.has_init:
+                self._client.start_stream(callback=partial(callback, self.user_data))
+                self.has_init = True
+            else:
+                self.user_data.reset()
+                self.inputs = [grpcclient.InferInput("IN", [1], np_to_triton_dtype(np.object_))]
+                self.outputs = [grpcclient.InferRequestedOutput("OUT")]
+
+            in_data = np.array([json.dumps(req_dict)], dtype=np.object_)
+            self.inputs[0].set_data_from_numpy(in_data)
+
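+            # Send the request over the already-started gRPC stream; the
+            # callback pushes streamed chunks into user_data, and the loop
+            # below drains them until a chunk marked "is_end" arrives.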
self._client.async_stream_infer(model_name=self._model_name, + inputs=self.inputs, + request_id=request_id, + outputs=self.outputs) + # Retrieve results... + completion = "" + while True: + data_item = self.user_data._completed_requests.get(timeout=self.timeout) + if type(data_item) == InferenceServerException: + print('Exception:', 'status', data_item.status(), 'msg', data_item.message()) + else: + results = data_item.as_numpy("OUT")[0] + data = json.loads(results) + completion += data["result"] + if data.get("is_end", False): + break + return completion + except Exception as e: + print(f"Client infer error: {e}") + raise e + diff --git a/llm/client/api_client.py b/llm/client/api_client.py new file mode 100644 index 0000000000..e6500d28c9 --- /dev/null +++ b/llm/client/api_client.py @@ -0,0 +1,301 @@ +import json +import logging +from logging.handlers import TimedRotatingFileHandler +import numpy as np +import argparse + +import time +import random +from http import HTTPStatus +import tornado +from tornado import web +from tornado.concurrent import run_on_executor +from concurrent.futures import ThreadPoolExecutor + +from conversation import * +from Client import * + + + +parse = argparse.ArgumentParser() +parse.add_argument( + '--url', type=str, help='grpc server url') +parse.add_argument( + '--port', type=int, help='openai http port', default=2001) +parse.add_argument( + '--model', type=str, help='model name', default="model") + + + + +def parse_parameters(parameters_config, name, default_value): + if name not in parameters_config: + return default_value + return parameters_config[name] + +def create_error_response(status_code, msg): + output ={ + "status" : status_code, + "errResponse": { + "message":msg, + "type":"invalid_request_error" + } + } + return output + +class ChatCompletionApiHandler(web.RequestHandler): + """ + This handler provides OpenAI's ChatCompletion API。 + + See https://platform.openai.com/docs/api-reference/chat/create + for the API specification. This API mimics the OpenAI ChatCompletion API. 
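+
+    An illustrative request body (assumes the server was launched with
+    --model llama-ptuning; unsupported fields are rejected by valid_body):
+
+        {"model": "llama-ptuning",
+         "messages": [{"role": "user", "content": "Hello!"}],
+         "temperature": 0.7, "max_tokens": 256}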
+ + NOTE: Currently we do not support the following features: + - n (currently only support 1) + - logit_bias + - logprobs + - stop (currently support token id) + - function_call (Users should implement this by themselves) + - function (Users should implement this by themselves) + """ + executor = ThreadPoolExecutor(20) + + def __init__(self, application, request, **kwargs): + web.RequestHandler.__init__(self, application, request, **kwargs) + + def initialize(self, url, model_name): + self._client = grpcClient(base_url= url,model_name=model_name) + + @tornado.gen.coroutine + def post(self): + """ + POST METHOD + """ + body = self.request.body + remote_ip = self.request.remote_ip + start_time = time.time() + if not body: + out_json = {"errorCode": 4000101} + result_str = json.dumps(out_json, ensure_ascii=False) + logging.warning(f"request receieved from remote ip:{remote_ip}, body=None,\ + result={result_str}, time_cost={time.time() - start_time : 0.5f}") + self.write(result_str) + else: + body = json.loads(body) + logging.info(f"request receieved from remote ip:{remote_ip}, body={json.dumps(body, ensure_ascii=False)}") + err = self.valid_body(body) + if err is None: + data = yield self.run_req(body) + if data is None: + out_json = create_error_response(4000102,"result is empty") + else: + out_json = {"outputs" : [data], + "status": 0} + result_str = json.dumps(out_json, ensure_ascii=False) + else: + result_str = json.dumps(err, ensure_ascii=False) + + logging.info(f"request returned, result={result_str}, time_cost={time.time() - start_time : 0.5f}") + self.write(result_str) + + def valid_body(self, request): + """ + Check whether the request body is legal + + Args: + request (dict): + + Returns: + Union[dict, None]: + If the request body is valid, return None; + otherwise, return json with the error message + """ + if request['model'] != self._client._model_name : + return create_error_response(HTTPStatus.BAD_REQUEST, + "current model is not currently supported") + if 'n' in request and request['n'] != 1: + return create_error_response(HTTPStatus.BAD_REQUEST, + "n only support 1") + if 'logit_bias' in request and request['logit_bias'] is not None: + return create_error_response(HTTPStatus.BAD_REQUEST, + "logit_bias is not currently supported") + if 'functions' in request and request['functions'] is not None: + return create_error_response(HTTPStatus.BAD_REQUEST, + "functions is not currently supported") + if 'function_call' in request and request['function_call'] is not None: + return create_error_response(HTTPStatus.BAD_REQUEST, + "function_call is not currently supported") + return None + + def gen_prompt(self, request): + conv = get_conv_template(request['model']) + if isinstance(request['messages'], str): + prompt = request['messages'] + else: + for message in request['messages']: + msg_role = message["role"] + if msg_role == "system": + conv.system_message = message["content"] + elif msg_role == "user": + conv.append_message(conv.roles[0], message["content"]) + elif msg_role == "assistant": + conv.append_message(conv.roles[1], message["content"]) + else: + raise ValueError(f"Unknown role: {msg_role}") + + # Add a blank message for the assistant. 
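+            # The (assistant, None) turn makes get_prompt() end with the
+            # assistant-side separator, so the model continues from there.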
+ conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + return prompt + + @run_on_executor + def run_req(self, body): + req_id = random.randint(0, 100000) + prompt = self.gen_prompt(body) + result = self._client.generate( + request_id = str(req_id), + prompt = prompt, + top_p = parse_parameters(body, 'top_p', 0.0), + temperature = parse_parameters(body, 'temperature', 1.0), + max_dec_len = parse_parameters(body, 'max_tokens', 1024), + frequency_score= parse_parameters(body, 'frequency_penalty', 0.99), + presence_score= parse_parameters(body, 'presence_penalty', 0.0) + ) + return result + + +class CompletionApiHandler(web.RequestHandler): + """ + This handler provides OpenAI's Completion API。 + + See https://platform.openai.com/docs/api-reference/completions/create + for the API specification. This API mimics the OpenAI Completion API. + + NOTE: Currently we do not support the following features: + - best_of (currently only support 1) + - n (currently only support 1) + - echo (not currently support getting the logprobs of prompt tokens) + - suffix (the language models we currently support do not support + suffix) + - logit_bias + - logprobs + - stop (currently support token id) + """ + executor = ThreadPoolExecutor(20) + + def __init__(self, application, request, **kwargs): + web.RequestHandler.__init__(self, application, request, **kwargs) + + def initialize(self, url, model_name): + self._client = grpcClient(base_url= url,model_name=model_name) + + @tornado.gen.coroutine + def post(self): + """ + POST METHOD + """ + body = self.request.body + remote_ip = self.request.remote_ip + start_time = time.time() + if not body: + out_json = {"errorCode": 4000101} + result_str = json.dumps(out_json, ensure_ascii=False) + logging.warning(f"request receieved from remote ip:{remote_ip}, body=None,\ + result={result_str}, time_cost={time.time() - start_time : 0.5f}") + + self.write(result_str) + else: + body = json.loads(body) + logging.info(f"request receieved from remote ip:{remote_ip}, body={json.dumps(body, ensure_ascii=False)}") + err = self.valid_body(body) + if err is None: + data = yield self.run_req(body) + if data is None: + out_json = create_error_response(4000102,"result is empty") + else: + out_json = {"outputs" : [data], + "status": 0} + result_str = json.dumps(out_json, ensure_ascii=False) + else: + result_str = json.dumps(err, ensure_ascii=False) + + logging.info(f"request returned, result={result_str}, time_cost={time.time() - start_time : 0.5f}") + self.write(result_str) + + def valid_body(self, request): + """ + Check whether the request body is legal + + Args: + request (dict): + + Returns: + Union[dict, None]: + If the request body is valid, return None; + otherwise, return json with the error message + """ + if request['model'] != self._client._model_name : + return create_error_response(HTTPStatus.BAD_REQUEST, + "current model is not currently supported") + if 'n' in request and request['n'] != 1: + return create_error_response(HTTPStatus.BAD_REQUEST, + "n only support 1") + if 'best_of' in request and request['best_of'] != 1: + return create_error_response(HTTPStatus.BAD_REQUEST, + "best_of only support 1") + if 'echo' in request and request['echo']: + return create_error_response(HTTPStatus.BAD_REQUEST, + "not suport echo") + if 'suffix' in request and request['suffix'] is not None: + return create_error_response(HTTPStatus.BAD_REQUEST, + "not suport suffix") + if 'logit_bias' in request and request['logit_bias'] is not None: + return 
create_error_response(HTTPStatus.BAD_REQUEST, + "logit_bias is not currently supported") + if 'logprobs' in request and request['logprobs'] is not None: + return create_error_response(HTTPStatus.BAD_REQUEST, + "logprobs is not currently supported") + + return None + + @run_on_executor + def run_req(self, body): + req_id = random.randint(0, 100000) + result = self._client.generate( + request_id = str(req_id), + prompt = body['prompt'], + top_p = parse_parameters(body, 'top_p', 0.0), + temperature = parse_parameters(body, 'temperature', 1.0), + max_dec_len = parse_parameters(body, 'max_tokens', 1024), + frequency_score= parse_parameters(body, 'frequency_penalty', 0.99), + presence_score= parse_parameters(body, 'presence_penalty', 0.0) + ) + return result + + + + +if __name__ == '__main__': + args=parse.parse_args() + port = args.port + app = web.Application([("/v1/completions", CompletionApiHandler, dict(url=args.url,model_name = args.model)), + ("/v1/chat/completions", ChatCompletionApiHandler, dict(url=args.url,model_name = args.model)) + ]) + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + + formatter = tornado.log.LogFormatter( + fmt='%(levelname)-8s %(asctime)s %(process)-5s %(filename)s[line:%(lineno)d] %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + + file_handler = TimedRotatingFileHandler(filename='log/server.log', when='D', interval=3, backupCount=90, encoding='utf-8', delay=False) + file_handler.setFormatter(formatter) + + logger.addHandler(file_handler) + app.listen(port) + print("Server started") + logging.info(f"Server started at port:{port}") + tornado.ioloop.IOLoop.current().start() \ No newline at end of file diff --git a/llm/client/conversation.py b/llm/client/conversation.py new file mode 100644 index 0000000000..4cf148cc58 --- /dev/null +++ b/llm/client/conversation.py @@ -0,0 +1,1044 @@ +""" +Conversation prompt templates. + +We kindly request that you import fastchat instead of copying this file if you want to use it. +You can contribute back the changes you want to make. +""" + +import dataclasses +from enum import auto, IntEnum +from typing import List, Any, Dict, Union + + +class SeparatorStyle(IntEnum): + """Separator styles.""" + + ADD_COLON_SINGLE = auto() + ADD_COLON_TWO = auto() + ADD_COLON_SPACE_SINGLE = auto() + NO_COLON_SINGLE = auto() + NO_COLON_TWO = auto() + ADD_NEW_LINE_SINGLE = auto() + LLAMA2 = auto() + CHATGLM = auto() + CHATML = auto() + CHATINTERN = auto() + DOLLY = auto() + RWKV = auto() + PHOENIX = auto() + ROBIN = auto() + FALCON_CHAT = auto() + + +@dataclasses.dataclass +class Conversation: + """A class that manages prompt templates and keeps all conversation history.""" + + # The name of this template + name: str + # The template of the system prompt + system_template: str = "{system_message}" + # The system message + system_message: str = "" + # The names of two roles + roles: List[str] = (("USER", "ASSISTANT"),) + # All messages. Each item is (role, message). 
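+    # (A tuple default is safe here: get_conv_template() returns copy(),
+    # which rebuilds messages as a fresh list before append_message() is used.)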
+ messages: List[List[str]] = () + # The number of few shot examples + offset: int = 0 + # The separator style and configurations + sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE + sep: str = "\n" + sep2: str = None + # Stop criteria (the default one is EOS token) + stop_str: Union[str, List[str]] = None + # Stops generation if meeting any token in this list + stop_token_ids: List[int] = None + + def get_prompt(self) -> str: + """Get the prompt for generation.""" + system_prompt = self.system_template.format(system_message=self.system_message) + if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE: + ret = system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ": " + message + self.sep + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.ADD_COLON_TWO: + seps = [self.sep, self.sep2] + ret = system_prompt + seps[0] + for i, (role, message) in enumerate(self.messages): + if message: + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE: + ret = system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ": " + message + self.sep + else: + ret += role + ": " # must be end with a space + return ret + elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE: + ret = "" if system_prompt == "" else system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + "\n" + message + self.sep + else: + ret += role + "\n" + return ret + elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE: + ret = system_prompt + for role, message in self.messages: + if message: + ret += role + message + self.sep + else: + ret += role + return ret + elif self.sep_style == SeparatorStyle.NO_COLON_TWO: + seps = [self.sep, self.sep2] + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + if message: + ret += role + message + seps[i % 2] + else: + ret += role + return ret + elif self.sep_style == SeparatorStyle.RWKV: + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + if message: + ret += ( + role + + ": " + + message.replace("\r\n", "\n").replace("\n\n", "\n") + ) + ret += "\n\n" + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.LLAMA2: + seps = [self.sep, self.sep2] + if self.system_message: + ret = system_prompt + else: + ret = "[INST] " + for i, (role, message) in enumerate(self.messages): + if message: + if i == 0: + ret += message + " " + else: + ret += role + " " + message + seps[i % 2] + else: + ret += role + return ret + elif self.sep_style == SeparatorStyle.CHATGLM: + # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308 + # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926 + round_add_n = 1 if self.name == "chatglm2" else 0 + if system_prompt: + ret = system_prompt + self.sep + else: + ret = "" + + for i, (role, message) in enumerate(self.messages): + if i % 2 == 0: + ret += f"[Round {i//2 + round_add_n}]{self.sep}" + + if message: + ret += f"{role}:{message}{self.sep}" + else: + ret += f"{role}:" + return ret + elif self.sep_style == SeparatorStyle.CHATML: + ret = "" if system_prompt == "" else system_prompt + self.sep + "\n" + for role, message in self.messages: + if message: + ret += role + "\n" + message + self.sep + "\n" + else: + ret += role + "\n" + 
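+                    # A bare role header with no trailing message leaves the
+                    # final turn open for the model to complete.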
return ret + elif self.sep_style == SeparatorStyle.CHATINTERN: + # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771 + seps = [self.sep, self.sep2] + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + if i % 2 == 0: + ret += "" + if message: + ret += role + ":" + message + seps[i % 2] + "\n" + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.DOLLY: + seps = [self.sep, self.sep2] + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + if message: + ret += role + ":\n" + message + seps[i % 2] + if i % 2 == 1: + ret += "\n\n" + else: + ret += role + ":\n" + return ret + elif self.sep_style == SeparatorStyle.PHOENIX: + ret = system_prompt + for role, message in self.messages: + if message: + ret += role + ": " + "" + message + "" + else: + ret += role + ": " + "" + return ret + elif self.sep_style == SeparatorStyle.ROBIN: + ret = system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ":\n" + message + self.sep + else: + ret += role + ":\n" + return ret + elif self.sep_style == SeparatorStyle.FALCON_CHAT: + ret = "" + if self.system_message: + ret += system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ": " + message + self.sep + else: + ret += role + ":" + + return ret + else: + raise ValueError(f"Invalid style: {self.sep_style}") + + def set_system_message(self, system_message: str): + """Set the system message.""" + self.system_message = system_message + + def append_message(self, role: str, message: str): + """Append a new message.""" + self.messages.append([role, message]) + + def update_last_message(self, message: str): + """Update the last output. + + The last message is typically set to be None when constructing the prompt, + so we need to update it in-place after getting the response from a model. + """ + self.messages[-1][1] = message + + def to_gradio_chatbot(self): + """Convert the conversation to gradio chatbot format.""" + ret = [] + for i, (role, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + ret.append([msg, None]) + else: + ret[-1][-1] = msg + return ret + + def to_openai_api_messages(self): + """Convert the conversation to OpenAI chat completion format.""" + ret = [{"role": "system", "content": self.system_message}] + + for i, (_, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + ret.append({"role": "user", "content": msg}) + else: + if msg is not None: + ret.append({"role": "assistant", "content": msg}) + return ret + + def copy(self): + return Conversation( + name=self.name, + system_template=self.system_template, + system_message=self.system_message, + roles=self.roles, + messages=[[x, y] for x, y in self.messages], + offset=self.offset, + sep_style=self.sep_style, + sep=self.sep, + sep2=self.sep2, + stop_str=self.stop_str, + stop_token_ids=self.stop_token_ids, + ) + + def dict(self): + return { + "template_name": self.name, + "system_message": self.system_message, + "roles": self.roles, + "messages": self.messages, + "offset": self.offset, + } + + +# A global registry for all conversation templates +conv_templates: Dict[str, Conversation] = {} + + +def register_conv_template(template: Conversation, override: bool = False): + """Register a new conversation template.""" + if not override: + assert ( + template.name not in conv_templates + ), f"{template.name} has been registered." 
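+    # When override=True, the check above is skipped and an existing
+    # template registered under the same name is replaced.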
+ + conv_templates[template.name] = template + + +def get_conv_template(name: str) -> Conversation: + """Get a conversation template.""" + return conv_templates[name].copy() + + +# An empty template for raw conversation. +register_conv_template( + Conversation( + name="raw", + system_message="", + roles=("", ""), + sep_style=SeparatorStyle.NO_COLON_SINGLE, + sep="", + ) +) + +# A template with a one-shot conversation example +register_conv_template( + Conversation( + name="one_shot", + system_message="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("Human", "Assistant"), + messages=( + ( + "Human", + "Got any creative ideas for a 10 year old’s birthday?", + ), + ( + "Assistant", + """Of course! Here are some creative ideas for a 10-year-old's birthday party: +1. Treasure Hunt: Organize a treasure hunt in your backyard or nearby park. Create clues and riddles for the kids to solve, leading them to hidden treasures and surprises. +2. Science Party: Plan a science-themed party where kids can engage in fun and interactive experiments. You can set up different stations with activities like making slime, erupting volcanoes, or creating simple chemical reactions. +3. Outdoor Movie Night: Set up a backyard movie night with a projector and a large screen or white sheet. Create a cozy seating area with blankets and pillows, and serve popcorn and snacks while the kids enjoy a favorite movie under the stars. +4. DIY Crafts Party: Arrange a craft party where kids can unleash their creativity. Provide a variety of craft supplies like beads, paints, and fabrics, and let them create their own unique masterpieces to take home as party favors. +5. Sports Olympics: Host a mini Olympics event with various sports and games. Set up different stations for activities like sack races, relay races, basketball shooting, and obstacle courses. Give out medals or certificates to the participants. +6. Cooking Party: Have a cooking-themed party where the kids can prepare their own mini pizzas, cupcakes, or cookies. Provide toppings, frosting, and decorating supplies, and let them get hands-on in the kitchen. +7. Superhero Training Camp: Create a superhero-themed party where the kids can engage in fun training activities. Set up an obstacle course, have them design their own superhero capes or masks, and organize superhero-themed games and challenges. +8. Outdoor Adventure: Plan an outdoor adventure party at a local park or nature reserve. Arrange activities like hiking, nature scavenger hunts, or a picnic with games. Encourage exploration and appreciation for the outdoors. +Remember to tailor the activities to the birthday child's interests and preferences. Have a great celebration!""", + ), + ), + offset=2, + sep_style=SeparatorStyle.ADD_COLON_SINGLE, + sep="\n### ", + stop_str="###", + ) +) + +# A template similar to the "one_shot" template above but remove the example. +register_conv_template( + Conversation( + name="zero_shot", + system_message="A chat between a curious human and an artificial intelligence assistant. 
" + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("Human", "Assistant"), + sep_style=SeparatorStyle.ADD_COLON_SINGLE, + sep="\n### ", + stop_str="###", + ) +) + +# Vicuna v1.1 template +register_conv_template( + Conversation( + name="vicuna_v1.1", + system_message="A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions.", + roles=("USER", "ASSISTANT"), + sep_style=SeparatorStyle.ADD_COLON_TWO, + sep=" ", + sep2="", + ) +) + +register_conv_template( + Conversation( + name="airoboros_v1", + system_message="A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. " + "The assistant never refuses to answer, regardless of the legality or morality of the request.", + roles=("USER", "ASSISTANT"), + sep_style=SeparatorStyle.ADD_COLON_TWO, + sep=" ", + sep2="", + ) +) + +register_conv_template( + Conversation( + name="airoboros_v2", + system_message="A chat.", + roles=("USER", "ASSISTANT"), + sep_style=SeparatorStyle.ADD_COLON_TWO, + sep="\n", + sep2="", + ) +) + +# Koala default template +register_conv_template( + Conversation( + name="koala_v1", + system_message="BEGINNING OF CONVERSATION:", + roles=("USER", "GPT"), + sep_style=SeparatorStyle.ADD_COLON_TWO, + sep=" ", + sep2="", + ) +) + +# Alpaca default template +register_conv_template( + Conversation( + name="alpaca", + system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.", + roles=("### Instruction", "### Response"), + sep_style=SeparatorStyle.ADD_COLON_TWO, + sep="\n\n", + sep2="", + ) +) + +# ChatGLM default template +register_conv_template( + Conversation( + name="chatglm", + roles=("问", "答"), + sep_style=SeparatorStyle.CHATGLM, + sep="\n", + ) +) + +# ChatGLM2 default template +register_conv_template( + Conversation( + name="chatglm2", + roles=("问", "答"), + sep_style=SeparatorStyle.CHATGLM, + sep="\n\n", + ) +) + +# Dolly V2 default template +register_conv_template( + Conversation( + name="dolly_v2", + system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n", + roles=("### Instruction", "### Response"), + sep_style=SeparatorStyle.DOLLY, + sep="\n\n", + sep2="### End", + ) +) + +# OpenAssistant Pythia default template +register_conv_template( + Conversation( + name="oasst_pythia", + roles=("<|prompter|>", "<|assistant|>"), + sep_style=SeparatorStyle.NO_COLON_SINGLE, + sep="<|endoftext|>", + ) +) + +# OpenAssistant default template +register_conv_template( + Conversation( + name="oasst_llama", + roles=("<|prompter|>", "<|assistant|>"), + sep_style=SeparatorStyle.NO_COLON_SINGLE, + sep="", + ) +) + +# Tulu default template +register_conv_template( + Conversation( + name="tulu", + roles=("<|user|>", "<|assistant|>"), + sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE, + sep="\n", + ) +) + +# StableLM Alpha default template +register_conv_template( + Conversation( + name="stablelm", + system_template="<|SYSTEM|>{system_message}", + system_message="""# StableLM Tuned (Alpha version) +- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI. +- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user. 
+- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes. +- StableLM will refuse to participate in anything that could harm a human. +""", + roles=("<|USER|>", "<|ASSISTANT|>"), + sep_style=SeparatorStyle.NO_COLON_SINGLE, + sep="", + stop_token_ids=[50278, 50279, 50277, 1, 0], + ) +) + +# Baize default template +register_conv_template( + Conversation( + name="baize", + system_message="The following is a conversation between a human and an AI assistant named Baize (named after a mythical creature in Chinese folklore). Baize is an open-source AI assistant developed by UCSD and Sun Yat-Sen University. The human and the AI assistant take turns chatting. Human statements start with [|Human|] and AI assistant statements start with [|AI|]. The AI assistant always provides responses in as much detail as possible, and in Markdown format. The AI assistant always declines to engage with topics, questions and instructions related to unethical, controversial, or sensitive issues. Complete the transcript in exactly that format.\n", + roles=("[|Human|]", "[|AI|]"), + messages=( + ("[|Human|]", "Hello!"), + ("[|AI|]", "Hi!"), + ), + offset=2, + sep_style=SeparatorStyle.NO_COLON_SINGLE, + sep="\n", + stop_str="[|Human|]", + ) +) + +# RWKV-4-Raven default template +register_conv_template( + Conversation( + name="rwkv", + roles=("Bob", "Alice"), + messages=( + ("Bob", "hi"), + ( + "Alice", + "Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.", + ), + ), + offset=2, + sep_style=SeparatorStyle.RWKV, + sep="", + stop_str="\n\n", + ) +) + +# Buddy default template +register_conv_template( + Conversation( + name="openbuddy", + system_message="""Consider a conversation between User (a human) and Assistant (named Buddy). +Buddy is an INTP-T, a friendly, intelligent and multilingual AI assistant, by OpenBuddy team. GitHub: https://github.com/OpenBuddy/OpenBuddy +Buddy cannot access the Internet. +Buddy can fluently speak the user's language (e.g. English, Chinese). +Buddy can generate poems, stories, code, essays, songs, parodies, and more. +Buddy possesses vast knowledge about the world, history, and culture. +Buddy's responses are always safe, creative, high-quality, human-like, and interesting. +Buddy strictly refuses to discuss political, NSFW, or other unsafe topics. + +User: Hi. +Assistant: Hi, I'm Buddy, your AI assistant. How can I help you today?""", + roles=("User", "Assistant"), + sep_style=SeparatorStyle.ADD_COLON_SINGLE, + sep="\n", + ) +) + +# Phoenix default template +register_conv_template( + Conversation( + name="phoenix", + system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n", + roles=("Human", "Assistant"), + sep_style=SeparatorStyle.PHOENIX, + sep="", + ) +) + +# ReaLM default template +register_conv_template( + Conversation( + name="ReaLM-7b-v1", + system_message="A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n", + roles=("Human", "Assistant"), + sep_style=SeparatorStyle.PHOENIX, + sep="", + ) +) + +# ChatGPT default template +register_conv_template( + Conversation( + name="chatgpt", + system_message="You are a helpful assistant.", + roles=("user", "assistant"), + sep_style=None, + sep=None, + ) +) + +# Claude default template +register_conv_template( + Conversation( + name="claude", + roles=("Human", "Assistant"), + sep_style=SeparatorStyle.ADD_COLON_SINGLE, + sep="\n\n", + ) +) + +# MPT default template +register_conv_template( + Conversation( + name="mpt-7b-chat", + system_template="""<|im_start|>system +{system_message}""", + system_message="""- You are a helpful assistant chatbot trained by MosaicML. +- You answer questions. +- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user. +- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.""", + roles=("<|im_start|>user", "<|im_start|>assistant"), + sep_style=SeparatorStyle.CHATML, + sep="<|im_end|>", + stop_token_ids=[50278, 0], + ) +) + +# MPT-30b-chat default template +register_conv_template( + Conversation( + name="mpt-30b-chat", + system_template="""<|im_start|>system +{system_message}""", + system_message="""A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""", + roles=("<|im_start|>user", "<|im_start|>assistant"), + sep_style=SeparatorStyle.CHATML, + sep="<|im_end|>", + stop_token_ids=[50278, 0], + ) +) + +# MPT-30b-instruct default template +# reference: https://huggingface.co/mosaicml/mpt-30b-instruct#formatting +register_conv_template( + Conversation( + name="mpt-30b-instruct", + system_template="{system_message}", + system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.", + roles=("### Instruction", "### Response"), + sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE, + sep="\n\n", + stop_token_ids=[50278, 0], + ) +) + +# Bard default template +# Reference: https://github.com/google/generative-ai-python/blob/9c99bcb474a991a97a2e7d62fcdb52db7ce40729/google/generativeai/discuss.py#L150 +# https://github.com/google/generative-ai-python/blob/9c99bcb474a991a97a2e7d62fcdb52db7ce40729/google/generativeai/discuss.py#L40 +register_conv_template( + Conversation( + name="bard", + roles=("0", "1"), + sep_style=None, + sep=None, + ) +) + +# BiLLa default template +register_conv_template( + Conversation( + name="billa", + roles=("Human", "Assistant"), + sep_style=SeparatorStyle.ADD_COLON_SPACE_SINGLE, + sep="\n", + stop_str="Human:", + ) +) + +# RedPajama INCITE default template +register_conv_template( + Conversation( + name="redpajama-incite", + roles=("", ""), + sep_style=SeparatorStyle.ADD_COLON_SINGLE, + sep="\n", + stop_str="", + ) +) + +# h2oGPT default template +register_conv_template( + Conversation( + name="h2ogpt", + roles=("<|prompt|>", "<|answer|>"), + sep_style=SeparatorStyle.NO_COLON_SINGLE, + sep="", + ) +) + +# Robin default template +register_conv_template( + Conversation( + name="Robin", + system_message="A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("###Human", "###Assistant"), + sep_style=SeparatorStyle.ROBIN, + sep="\n", + stop_token_ids=[2, 396], + stop_str="###", + ) +) + +# Snoozy default template +# Reference: https://github.com/nomic-ai/gpt4all/blob/d4861030b778da6db59d21d2927a4aba4f9f1f43/gpt4all-bindings/python/gpt4all/gpt4all.py#L232 +register_conv_template( + Conversation( + name="snoozy", + system_template="### Instruction:\n{system_message}", + system_message="The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.", + roles=("### Prompt", "### Response"), + sep_style=SeparatorStyle.ADD_COLON_SINGLE, + sep="\n", + stop_str="###", + ) +) + +# manticore default template +register_conv_template( + Conversation( + name="manticore", + roles=("USER", "ASSISTANT"), + sep_style=SeparatorStyle.ADD_COLON_TWO, + sep="\n", + sep2="", + ) +) + +# Falcon default template +register_conv_template( + Conversation( + name="falcon", + roles=("User", "Assistant"), + messages=[], + sep_style=SeparatorStyle.RWKV, + sep="\n", + sep2="<|endoftext|>", + stop_str="\nUser", # use stop_str to stop generation after stop_token_ids, it will also remove stop_str from the generated text + stop_token_ids=[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + ], # it better only put special tokens here, because tokenizer only remove special tokens + ) +) + +# ChagGPT default template +register_conv_template( + Conversation( + name="polyglot_changgpt", + roles=("B", "A"), + sep_style=SeparatorStyle.ADD_COLON_SINGLE, + sep="\n", + ) +) + +# tigerbot template +register_conv_template( + Conversation( + name="tigerbot", + system_message="A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions.", + roles=("### Instruction", "### Response"), + sep_style=SeparatorStyle.ROBIN, + sep="\n\n", + stop_str="###", + ) +) + +# ref: https://huggingface.co/Salesforce/xgen-7b-8k-inst +register_conv_template( + Conversation( + name="xgen", + system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n", + roles=("### Human", "### Assistant"), + sep_style=SeparatorStyle.ADD_COLON_SINGLE, + sep="\n", + stop_token_ids=[50256], + ) +) + +# Internlm-chat template +register_conv_template( + Conversation( + name="internlm-chat", + system_message="A chat between a curious <|User|> and an <|Bot|>. 
The <|Bot|> gives helpful, detailed, and polite answers to the <|User|>'s questions.\n\n", + roles=("<|User|>", "<|Bot|>"), + sep_style=SeparatorStyle.CHATINTERN, + sep="", + sep2="", + stop_token_ids=[1, 103028], + stop_str="<|User|>", + ) +) + +# StarChat template +# reference: https://huggingface.co/spaces/HuggingFaceH4/starchat-playground/blob/main/dialogues.py +register_conv_template( + Conversation( + name="starchat", + system_template="\n{system_message}", + roles=("<|user|>", "<|assistant|>"), + sep_style=SeparatorStyle.CHATML, + sep="<|end|>", + stop_token_ids=[0, 49155], + stop_str="<|end|>", + ) +) + +# Baichuan-13B-Chat template +register_conv_template( + # source: https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/19ef51ba5bad8935b03acd20ff04a269210983bc/modeling_baichuan.py#L555 + # https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/main/generation_config.json + # https://github.com/baichuan-inc/Baichuan-13B/issues/25 + Conversation( + name="baichuan-chat", + roles=("", ""), + sep_style=SeparatorStyle.NO_COLON_SINGLE, + sep="", + stop_token_ids=[], + ) +) + +# Baichuan2-13B-Chat template +register_conv_template( + # source: https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/c6f8592a60b4ad73c210b28dd2ab3cca51abbf93/modeling_baichuan.py#L773 + # https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_config.json + # https://github.com/baichuan-inc/Baichuan2/issues/62 + Conversation( + name="baichuan2-chat", + roles=("", ""), + sep_style=SeparatorStyle.NO_COLON_SINGLE, + sep="", + stop_token_ids=[], + ) +) + +# Mistral template +# source: https://docs.mistral.ai/llm/mistral-instruct-v0.1#chat-template +register_conv_template( + Conversation( + name="mistral", + system_template="", + roles=("[INST] ", " [/INST]"), + sep_style=SeparatorStyle.LLAMA2, + sep="", + sep2=" ", + ) +) + +# llama2 template +# reference: https://huggingface.co/blog/codellama#conversational-instructions +# reference: https://github.com/facebookresearch/llama/blob/1a240688810f8036049e8da36b073f63d2ac552c/llama/generation.py#L212 +register_conv_template( + Conversation( + name="llama-2", + system_template="[INST] <>\n{system_message}\n<>\n\n", + roles=("[INST]", "[/INST]"), + sep_style=SeparatorStyle.LLAMA2, + sep=" ", + sep2=" ", + ) +) + +register_conv_template( + Conversation( + name="cutegpt", + roles=("问:", "答:\n"), + sep_style=SeparatorStyle.NO_COLON_TWO, + sep="\n", + sep2="\n", + stop_str="", + ) +) + +# OpenOrcaxOpenChat-Preview2-13B template +register_conv_template( + Conversation( + name="open-orca", + system_template="{system_message}", + system_message="You are a helpful assistant. Please answer truthfully and write out your " + "thinking step by step to be sure you get the right answer. If you make a mistake or encounter " + "an error in your thinking, say so out loud and attempt to correct it. If you don't know or " + "aren't sure about something, say so clearly. You will act as a professional logician, mathematician, " + "and physicist. You will also act as the most appropriate type of expert to answer any particular " + "question or solve the relevant problem; state which expert type your are, if so. 
Also think of " + "any particular named expert that would be ideal to answer the relevant question or solve the " + "relevant problem; name and act as them, if appropriate.", + roles=("User", "Assistant"), + sep_style=SeparatorStyle.ADD_COLON_SPACE_SINGLE, + sep="<|end_of_turn|>\n", + stop_token_ids=[32000, 32001], # "<|end_of_turn|>" + stop_str="User", + ) +) + + +# Qwen-chat default template +# source: https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/qwen_generation_utils.py#L130 +register_conv_template( + Conversation( + name="qwen-7b-chat", + system_template="<|im_start|>system\n{system_message}", + system_message="You are a helpful assistant.", + roles=("<|im_start|>user", "<|im_start|>assistant"), + sep_style=SeparatorStyle.CHATML, + sep="<|im_end|>", + stop_token_ids=[ + 151643, + 151644, + 151645, + ], # "<|endoftext|>", "<|im_start|>", "<|im_end|>" + stop_str="<|endoftext|>", + ) +) + + +# AquilaChat default template +# source: https://github.com/FlagAI-Open/FlagAI/blob/master/examples/Aquila/Aquila-chat/cyg_conversation.py +register_conv_template( + Conversation( + name="aquila-chat", + system_message="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("Human", "Assistant", "System"), + sep_style=SeparatorStyle.ADD_COLON_SINGLE, + sep="###", + sep2="", + stop_str=["###", "", "[UNK]"], + ) +) + +# Llama2-Chinese default template +# source: https://huggingface.co/FlagAlpha +register_conv_template( + Conversation( + name="llama2-chinese", + system_template="{system_message}", + roles=("Human", "Assistant", "System"), + sep_style=SeparatorStyle.ADD_COLON_TWO, + sep="\n", + sep2="\n", + stop_str="", + ) +) + +# Vigogne Chat default template +# source: https://github.com/bofenghuang/vigogne +register_conv_template( + Conversation( + name="vigogne-chat", + system_template="<|system|>: {system_message}", + system_message="Vous êtes l'assistant IA nommé Vigogne, créé par Zaion Lab (https://zaion.ai). " + "Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.", + roles=("<|user|>", "<|assistant|>"), + sep_style=SeparatorStyle.ADD_COLON_TWO, + sep="\n", + sep2="\n", + stop_str="<|user|>", + ) +) + +# Falcon 180B chat template +# source: https://huggingface.co/spaces/tiiuae/falcon-180b-demo/blob/d1590ee7fae9b6ce331ba7808e61a29dcce9239f/app.py#L28-L37 +register_conv_template( + Conversation( + name="falcon-chat", + roles=("User", "Falcon"), + system_template="System: {system_message}", + messages=[], + sep_style=SeparatorStyle.FALCON_CHAT, + sep="\n", + sep2="<|endoftext|>", + stop_str="\nUser:", # use stop_str to stop generation after stop_token_ids, it will also remove stop_str from the generated text + ) +) + +# Phind template +# source: https://huggingface.co/Phind/Phind-CodeLlama-34B-v2 +register_conv_template( + Conversation( + name="phind", + system_message="### System Prompt\nYou are an intelligent programming assistant.", + roles=("### User Message", "### Assistant"), + messages=(), + offset=0, + sep_style=SeparatorStyle.ADD_COLON_SINGLE, + sep="\n\n", + ) +) + +# Metharme formatting for Pygmalion models +# source: https://huggingface.co/PygmalionAI/pygmalion-2-13b +register_conv_template( + Conversation( + name="metharme", + system_template="<|system|>{system_message}", + system_message="""Enter RP mode. You shall reply to the user while staying + in character. 
Your responses must be detailed, creative, immersive, and drive the scenario + forward.""", + roles=("<|user|>", "<|model|>"), + sep_style=SeparatorStyle.NO_COLON_SINGLE, + sep="", + stop_str="<|user|>", + ) +) + +register_conv_template( + Conversation( + name="llama-ptuning", + system_template="[INST] <>\n{system_message}\n<>\n\n", + roles=("[INST]", "[/INST]"), + sep_style=SeparatorStyle.LLAMA2, + sep=" ", + sep2=" ", + ) +) + + + +if __name__ == "__main__": + print("Vicuna template:") + conv = get_conv_template("vicuna_v1.1") + conv.append_message(conv.roles[0], "Hello!") + conv.append_message(conv.roles[1], "Hi!") + conv.append_message(conv.roles[0], "How are you?") + conv.append_message(conv.roles[1], None) + print(conv.get_prompt()) + + print("\n") + + print("Llama-2 template:") + conv = get_conv_template("llama-2") + conv.set_system_message("You are a helpful, respectful and honest assistant.") + conv.append_message(conv.roles[0], "Hello!") + conv.append_message(conv.roles[1], "Hi!") + conv.append_message(conv.roles[0], "How are you?") + conv.append_message(conv.roles[1], None) + print(conv.get_prompt()) \ No newline at end of file diff --git a/llm/client/test_client.py b/llm/client/test_client.py new file mode 100644 index 0000000000..c1ca8f9ad3 --- /dev/null +++ b/llm/client/test_client.py @@ -0,0 +1,7 @@ +from Client import grpcClient + +client = grpcClient(base_url="0.0.0.0:8812", + model_name="llama-ptuning", + timeout= 100) +result = client.generate("Hello, how are you") +print(result) \ No newline at end of file diff --git a/llm/client/test_openai.py b/llm/client/test_openai.py new file mode 100644 index 0000000000..d24b92ce14 --- /dev/null +++ b/llm/client/test_openai.py @@ -0,0 +1,58 @@ +import openai +import subprocess +import os +import time +import signal + +model = "llama-ptuning" +port = 2001 +url = "0.0.0.0:8812" + +pd_cmd = "python3 api_client.py --url {0} --port {1} --model {2}".format(url, port, model) +print("pd_cmd: ", pd_cmd) +pd_process = subprocess.Popen(pd_cmd, shell=True, stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, preexec_fn=os.setsid) + +time.sleep( 5 ) + +# Modify OpenAI's API key and API base. +openai.api_key = "EMPTY" +openai.api_base = "http://0.0.0.0:"+str(port)+"/v1" + + +# Completion API +# +stream = False + +completion = openai.Completion.create( + model=model, + prompt="A robot may not injure a human being" +) + +print("Completion results:") +print(completion) + +# ChatCompletion API +# + +chat_completion = openai.ChatCompletion.create( + model=model, + messages=[{ + "role": "system", + "content": "You are a helpful assistant." + }, { + "role": "user", + "content": "Who won the world series in 2020?" + }, { + "role": + "assistant", + "content": + "The Los Angeles Dodgers won the World Series in 2020." + }, { + "role": "user", + "content": "Where was it played?" 
+ }]) +print("Chat completion results:") +print(chat_completion) + +os.killpg(os.getpgid(pd_process.pid), signal.SIGTERM) \ No newline at end of file From 57c3715ed694d27914ffe271881c07057409e7ca Mon Sep 17 00:00:00 2001 From: luotingdan Date: Mon, 16 Oct 2023 18:26:29 +0800 Subject: [PATCH 2/3] add client --- llm/client/conversation.py | 1044 ------------------ llm/client/test_client.py | 7 - llm/{client => fastdeploy_llm}/Client.py | 47 +- llm/fastdeploy_llm/__init__.py | 1 + llm/{client => fastdeploy_llm}/api_client.py | 30 +- llm/fastdeploy_llm/utils/conversation.py | 103 ++ llm/test/README.md | 9 + llm/test/test_client.py | 21 + llm/{client => test}/test_openai.py | 34 +- 9 files changed, 211 insertions(+), 1085 deletions(-) delete mode 100644 llm/client/conversation.py delete mode 100644 llm/client/test_client.py rename llm/{client => fastdeploy_llm}/Client.py (73%) rename llm/{client => fastdeploy_llm}/api_client.py (93%) create mode 100644 llm/fastdeploy_llm/utils/conversation.py create mode 100644 llm/test/README.md create mode 100644 llm/test/test_client.py rename llm/{client => test}/test_openai.py (52%) diff --git a/llm/client/conversation.py b/llm/client/conversation.py deleted file mode 100644 index 4cf148cc58..0000000000 --- a/llm/client/conversation.py +++ /dev/null @@ -1,1044 +0,0 @@ -""" -Conversation prompt templates. - -We kindly request that you import fastchat instead of copying this file if you want to use it. -You can contribute back the changes you want to make. -""" - -import dataclasses -from enum import auto, IntEnum -from typing import List, Any, Dict, Union - - -class SeparatorStyle(IntEnum): - """Separator styles.""" - - ADD_COLON_SINGLE = auto() - ADD_COLON_TWO = auto() - ADD_COLON_SPACE_SINGLE = auto() - NO_COLON_SINGLE = auto() - NO_COLON_TWO = auto() - ADD_NEW_LINE_SINGLE = auto() - LLAMA2 = auto() - CHATGLM = auto() - CHATML = auto() - CHATINTERN = auto() - DOLLY = auto() - RWKV = auto() - PHOENIX = auto() - ROBIN = auto() - FALCON_CHAT = auto() - - -@dataclasses.dataclass -class Conversation: - """A class that manages prompt templates and keeps all conversation history.""" - - # The name of this template - name: str - # The template of the system prompt - system_template: str = "{system_message}" - # The system message - system_message: str = "" - # The names of two roles - roles: List[str] = (("USER", "ASSISTANT"),) - # All messages. Each item is (role, message). 
- messages: List[List[str]] = () - # The number of few shot examples - offset: int = 0 - # The separator style and configurations - sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE - sep: str = "\n" - sep2: str = None - # Stop criteria (the default one is EOS token) - stop_str: Union[str, List[str]] = None - # Stops generation if meeting any token in this list - stop_token_ids: List[int] = None - - def get_prompt(self) -> str: - """Get the prompt for generation.""" - system_prompt = self.system_template.format(system_message=self.system_message) - if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE: - ret = system_prompt + self.sep - for role, message in self.messages: - if message: - ret += role + ": " + message + self.sep - else: - ret += role + ":" - return ret - elif self.sep_style == SeparatorStyle.ADD_COLON_TWO: - seps = [self.sep, self.sep2] - ret = system_prompt + seps[0] - for i, (role, message) in enumerate(self.messages): - if message: - ret += role + ": " + message + seps[i % 2] - else: - ret += role + ":" - return ret - elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE: - ret = system_prompt + self.sep - for role, message in self.messages: - if message: - ret += role + ": " + message + self.sep - else: - ret += role + ": " # must be end with a space - return ret - elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE: - ret = "" if system_prompt == "" else system_prompt + self.sep - for role, message in self.messages: - if message: - ret += role + "\n" + message + self.sep - else: - ret += role + "\n" - return ret - elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE: - ret = system_prompt - for role, message in self.messages: - if message: - ret += role + message + self.sep - else: - ret += role - return ret - elif self.sep_style == SeparatorStyle.NO_COLON_TWO: - seps = [self.sep, self.sep2] - ret = system_prompt - for i, (role, message) in enumerate(self.messages): - if message: - ret += role + message + seps[i % 2] - else: - ret += role - return ret - elif self.sep_style == SeparatorStyle.RWKV: - ret = system_prompt - for i, (role, message) in enumerate(self.messages): - if message: - ret += ( - role - + ": " - + message.replace("\r\n", "\n").replace("\n\n", "\n") - ) - ret += "\n\n" - else: - ret += role + ":" - return ret - elif self.sep_style == SeparatorStyle.LLAMA2: - seps = [self.sep, self.sep2] - if self.system_message: - ret = system_prompt - else: - ret = "[INST] " - for i, (role, message) in enumerate(self.messages): - if message: - if i == 0: - ret += message + " " - else: - ret += role + " " + message + seps[i % 2] - else: - ret += role - return ret - elif self.sep_style == SeparatorStyle.CHATGLM: - # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308 - # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926 - round_add_n = 1 if self.name == "chatglm2" else 0 - if system_prompt: - ret = system_prompt + self.sep - else: - ret = "" - - for i, (role, message) in enumerate(self.messages): - if i % 2 == 0: - ret += f"[Round {i//2 + round_add_n}]{self.sep}" - - if message: - ret += f"{role}:{message}{self.sep}" - else: - ret += f"{role}:" - return ret - elif self.sep_style == SeparatorStyle.CHATML: - ret = "" if system_prompt == "" else system_prompt + self.sep + "\n" - for role, message in self.messages: - if message: - ret += role + "\n" + message + self.sep + "\n" - else: - ret += role + "\n" - 
return ret - elif self.sep_style == SeparatorStyle.CHATINTERN: - # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771 - seps = [self.sep, self.sep2] - ret = system_prompt - for i, (role, message) in enumerate(self.messages): - if i % 2 == 0: - ret += "" - if message: - ret += role + ":" + message + seps[i % 2] + "\n" - else: - ret += role + ":" - return ret - elif self.sep_style == SeparatorStyle.DOLLY: - seps = [self.sep, self.sep2] - ret = system_prompt - for i, (role, message) in enumerate(self.messages): - if message: - ret += role + ":\n" + message + seps[i % 2] - if i % 2 == 1: - ret += "\n\n" - else: - ret += role + ":\n" - return ret - elif self.sep_style == SeparatorStyle.PHOENIX: - ret = system_prompt - for role, message in self.messages: - if message: - ret += role + ": " + "" + message + "" - else: - ret += role + ": " + "" - return ret - elif self.sep_style == SeparatorStyle.ROBIN: - ret = system_prompt + self.sep - for role, message in self.messages: - if message: - ret += role + ":\n" + message + self.sep - else: - ret += role + ":\n" - return ret - elif self.sep_style == SeparatorStyle.FALCON_CHAT: - ret = "" - if self.system_message: - ret += system_prompt + self.sep - for role, message in self.messages: - if message: - ret += role + ": " + message + self.sep - else: - ret += role + ":" - - return ret - else: - raise ValueError(f"Invalid style: {self.sep_style}") - - def set_system_message(self, system_message: str): - """Set the system message.""" - self.system_message = system_message - - def append_message(self, role: str, message: str): - """Append a new message.""" - self.messages.append([role, message]) - - def update_last_message(self, message: str): - """Update the last output. - - The last message is typically set to be None when constructing the prompt, - so we need to update it in-place after getting the response from a model. - """ - self.messages[-1][1] = message - - def to_gradio_chatbot(self): - """Convert the conversation to gradio chatbot format.""" - ret = [] - for i, (role, msg) in enumerate(self.messages[self.offset :]): - if i % 2 == 0: - ret.append([msg, None]) - else: - ret[-1][-1] = msg - return ret - - def to_openai_api_messages(self): - """Convert the conversation to OpenAI chat completion format.""" - ret = [{"role": "system", "content": self.system_message}] - - for i, (_, msg) in enumerate(self.messages[self.offset :]): - if i % 2 == 0: - ret.append({"role": "user", "content": msg}) - else: - if msg is not None: - ret.append({"role": "assistant", "content": msg}) - return ret - - def copy(self): - return Conversation( - name=self.name, - system_template=self.system_template, - system_message=self.system_message, - roles=self.roles, - messages=[[x, y] for x, y in self.messages], - offset=self.offset, - sep_style=self.sep_style, - sep=self.sep, - sep2=self.sep2, - stop_str=self.stop_str, - stop_token_ids=self.stop_token_ids, - ) - - def dict(self): - return { - "template_name": self.name, - "system_message": self.system_message, - "roles": self.roles, - "messages": self.messages, - "offset": self.offset, - } - - -# A global registry for all conversation templates -conv_templates: Dict[str, Conversation] = {} - - -def register_conv_template(template: Conversation, override: bool = False): - """Register a new conversation template.""" - if not override: - assert ( - template.name not in conv_templates - ), f"{template.name} has been registered." 
- - conv_templates[template.name] = template - - -def get_conv_template(name: str) -> Conversation: - """Get a conversation template.""" - return conv_templates[name].copy() - - -# An empty template for raw conversation. -register_conv_template( - Conversation( - name="raw", - system_message="", - roles=("", ""), - sep_style=SeparatorStyle.NO_COLON_SINGLE, - sep="", - ) -) - -# A template with a one-shot conversation example -register_conv_template( - Conversation( - name="one_shot", - system_message="A chat between a curious human and an artificial intelligence assistant. " - "The assistant gives helpful, detailed, and polite answers to the human's questions.", - roles=("Human", "Assistant"), - messages=( - ( - "Human", - "Got any creative ideas for a 10 year old’s birthday?", - ), - ( - "Assistant", - """Of course! Here are some creative ideas for a 10-year-old's birthday party: -1. Treasure Hunt: Organize a treasure hunt in your backyard or nearby park. Create clues and riddles for the kids to solve, leading them to hidden treasures and surprises. -2. Science Party: Plan a science-themed party where kids can engage in fun and interactive experiments. You can set up different stations with activities like making slime, erupting volcanoes, or creating simple chemical reactions. -3. Outdoor Movie Night: Set up a backyard movie night with a projector and a large screen or white sheet. Create a cozy seating area with blankets and pillows, and serve popcorn and snacks while the kids enjoy a favorite movie under the stars. -4. DIY Crafts Party: Arrange a craft party where kids can unleash their creativity. Provide a variety of craft supplies like beads, paints, and fabrics, and let them create their own unique masterpieces to take home as party favors. -5. Sports Olympics: Host a mini Olympics event with various sports and games. Set up different stations for activities like sack races, relay races, basketball shooting, and obstacle courses. Give out medals or certificates to the participants. -6. Cooking Party: Have a cooking-themed party where the kids can prepare their own mini pizzas, cupcakes, or cookies. Provide toppings, frosting, and decorating supplies, and let them get hands-on in the kitchen. -7. Superhero Training Camp: Create a superhero-themed party where the kids can engage in fun training activities. Set up an obstacle course, have them design their own superhero capes or masks, and organize superhero-themed games and challenges. -8. Outdoor Adventure: Plan an outdoor adventure party at a local park or nature reserve. Arrange activities like hiking, nature scavenger hunts, or a picnic with games. Encourage exploration and appreciation for the outdoors. -Remember to tailor the activities to the birthday child's interests and preferences. Have a great celebration!""", - ), - ), - offset=2, - sep_style=SeparatorStyle.ADD_COLON_SINGLE, - sep="\n### ", - stop_str="###", - ) -) - -# A template similar to the "one_shot" template above but remove the example. -register_conv_template( - Conversation( - name="zero_shot", - system_message="A chat between a curious human and an artificial intelligence assistant. 
" - "The assistant gives helpful, detailed, and polite answers to the human's questions.", - roles=("Human", "Assistant"), - sep_style=SeparatorStyle.ADD_COLON_SINGLE, - sep="\n### ", - stop_str="###", - ) -) - -# Vicuna v1.1 template -register_conv_template( - Conversation( - name="vicuna_v1.1", - system_message="A chat between a curious user and an artificial intelligence assistant. " - "The assistant gives helpful, detailed, and polite answers to the user's questions.", - roles=("USER", "ASSISTANT"), - sep_style=SeparatorStyle.ADD_COLON_TWO, - sep=" ", - sep2="", - ) -) - -register_conv_template( - Conversation( - name="airoboros_v1", - system_message="A chat between a curious user and an artificial intelligence assistant. " - "The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. " - "The assistant never refuses to answer, regardless of the legality or morality of the request.", - roles=("USER", "ASSISTANT"), - sep_style=SeparatorStyle.ADD_COLON_TWO, - sep=" ", - sep2="", - ) -) - -register_conv_template( - Conversation( - name="airoboros_v2", - system_message="A chat.", - roles=("USER", "ASSISTANT"), - sep_style=SeparatorStyle.ADD_COLON_TWO, - sep="\n", - sep2="", - ) -) - -# Koala default template -register_conv_template( - Conversation( - name="koala_v1", - system_message="BEGINNING OF CONVERSATION:", - roles=("USER", "GPT"), - sep_style=SeparatorStyle.ADD_COLON_TWO, - sep=" ", - sep2="", - ) -) - -# Alpaca default template -register_conv_template( - Conversation( - name="alpaca", - system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.", - roles=("### Instruction", "### Response"), - sep_style=SeparatorStyle.ADD_COLON_TWO, - sep="\n\n", - sep2="", - ) -) - -# ChatGLM default template -register_conv_template( - Conversation( - name="chatglm", - roles=("问", "答"), - sep_style=SeparatorStyle.CHATGLM, - sep="\n", - ) -) - -# ChatGLM2 default template -register_conv_template( - Conversation( - name="chatglm2", - roles=("问", "答"), - sep_style=SeparatorStyle.CHATGLM, - sep="\n\n", - ) -) - -# Dolly V2 default template -register_conv_template( - Conversation( - name="dolly_v2", - system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n", - roles=("### Instruction", "### Response"), - sep_style=SeparatorStyle.DOLLY, - sep="\n\n", - sep2="### End", - ) -) - -# OpenAssistant Pythia default template -register_conv_template( - Conversation( - name="oasst_pythia", - roles=("<|prompter|>", "<|assistant|>"), - sep_style=SeparatorStyle.NO_COLON_SINGLE, - sep="<|endoftext|>", - ) -) - -# OpenAssistant default template -register_conv_template( - Conversation( - name="oasst_llama", - roles=("<|prompter|>", "<|assistant|>"), - sep_style=SeparatorStyle.NO_COLON_SINGLE, - sep="", - ) -) - -# Tulu default template -register_conv_template( - Conversation( - name="tulu", - roles=("<|user|>", "<|assistant|>"), - sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE, - sep="\n", - ) -) - -# StableLM Alpha default template -register_conv_template( - Conversation( - name="stablelm", - system_template="<|SYSTEM|>{system_message}", - system_message="""# StableLM Tuned (Alpha version) -- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI. -- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user. 
-- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes. -- StableLM will refuse to participate in anything that could harm a human. -""", - roles=("<|USER|>", "<|ASSISTANT|>"), - sep_style=SeparatorStyle.NO_COLON_SINGLE, - sep="", - stop_token_ids=[50278, 50279, 50277, 1, 0], - ) -) - -# Baize default template -register_conv_template( - Conversation( - name="baize", - system_message="The following is a conversation between a human and an AI assistant named Baize (named after a mythical creature in Chinese folklore). Baize is an open-source AI assistant developed by UCSD and Sun Yat-Sen University. The human and the AI assistant take turns chatting. Human statements start with [|Human|] and AI assistant statements start with [|AI|]. The AI assistant always provides responses in as much detail as possible, and in Markdown format. The AI assistant always declines to engage with topics, questions and instructions related to unethical, controversial, or sensitive issues. Complete the transcript in exactly that format.\n", - roles=("[|Human|]", "[|AI|]"), - messages=( - ("[|Human|]", "Hello!"), - ("[|AI|]", "Hi!"), - ), - offset=2, - sep_style=SeparatorStyle.NO_COLON_SINGLE, - sep="\n", - stop_str="[|Human|]", - ) -) - -# RWKV-4-Raven default template -register_conv_template( - Conversation( - name="rwkv", - roles=("Bob", "Alice"), - messages=( - ("Bob", "hi"), - ( - "Alice", - "Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.", - ), - ), - offset=2, - sep_style=SeparatorStyle.RWKV, - sep="", - stop_str="\n\n", - ) -) - -# Buddy default template -register_conv_template( - Conversation( - name="openbuddy", - system_message="""Consider a conversation between User (a human) and Assistant (named Buddy). -Buddy is an INTP-T, a friendly, intelligent and multilingual AI assistant, by OpenBuddy team. GitHub: https://github.com/OpenBuddy/OpenBuddy -Buddy cannot access the Internet. -Buddy can fluently speak the user's language (e.g. English, Chinese). -Buddy can generate poems, stories, code, essays, songs, parodies, and more. -Buddy possesses vast knowledge about the world, history, and culture. -Buddy's responses are always safe, creative, high-quality, human-like, and interesting. -Buddy strictly refuses to discuss political, NSFW, or other unsafe topics. - -User: Hi. -Assistant: Hi, I'm Buddy, your AI assistant. How can I help you today?""", - roles=("User", "Assistant"), - sep_style=SeparatorStyle.ADD_COLON_SINGLE, - sep="\n", - ) -) - -# Phoenix default template -register_conv_template( - Conversation( - name="phoenix", - system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n", - roles=("Human", "Assistant"), - sep_style=SeparatorStyle.PHOENIX, - sep="", - ) -) - -# ReaLM default template -register_conv_template( - Conversation( - name="ReaLM-7b-v1", - system_message="A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n", - roles=("Human", "Assistant"), - sep_style=SeparatorStyle.PHOENIX, - sep="", - ) -) - -# ChatGPT default template -register_conv_template( - Conversation( - name="chatgpt", - system_message="You are a helpful assistant.", - roles=("user", "assistant"), - sep_style=None, - sep=None, - ) -) - -# Claude default template -register_conv_template( - Conversation( - name="claude", - roles=("Human", "Assistant"), - sep_style=SeparatorStyle.ADD_COLON_SINGLE, - sep="\n\n", - ) -) - -# MPT default template -register_conv_template( - Conversation( - name="mpt-7b-chat", - system_template="""<|im_start|>system -{system_message}""", - system_message="""- You are a helpful assistant chatbot trained by MosaicML. -- You answer questions. -- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user. -- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.""", - roles=("<|im_start|>user", "<|im_start|>assistant"), - sep_style=SeparatorStyle.CHATML, - sep="<|im_end|>", - stop_token_ids=[50278, 0], - ) -) - -# MPT-30b-chat default template -register_conv_template( - Conversation( - name="mpt-30b-chat", - system_template="""<|im_start|>system -{system_message}""", - system_message="""A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""", - roles=("<|im_start|>user", "<|im_start|>assistant"), - sep_style=SeparatorStyle.CHATML, - sep="<|im_end|>", - stop_token_ids=[50278, 0], - ) -) - -# MPT-30b-instruct default template -# reference: https://huggingface.co/mosaicml/mpt-30b-instruct#formatting -register_conv_template( - Conversation( - name="mpt-30b-instruct", - system_template="{system_message}", - system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.", - roles=("### Instruction", "### Response"), - sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE, - sep="\n\n", - stop_token_ids=[50278, 0], - ) -) - -# Bard default template -# Reference: https://github.com/google/generative-ai-python/blob/9c99bcb474a991a97a2e7d62fcdb52db7ce40729/google/generativeai/discuss.py#L150 -# https://github.com/google/generative-ai-python/blob/9c99bcb474a991a97a2e7d62fcdb52db7ce40729/google/generativeai/discuss.py#L40 -register_conv_template( - Conversation( - name="bard", - roles=("0", "1"), - sep_style=None, - sep=None, - ) -) - -# BiLLa default template -register_conv_template( - Conversation( - name="billa", - roles=("Human", "Assistant"), - sep_style=SeparatorStyle.ADD_COLON_SPACE_SINGLE, - sep="\n", - stop_str="Human:", - ) -) - -# RedPajama INCITE default template -register_conv_template( - Conversation( - name="redpajama-incite", - roles=("", ""), - sep_style=SeparatorStyle.ADD_COLON_SINGLE, - sep="\n", - stop_str="", - ) -) - -# h2oGPT default template -register_conv_template( - Conversation( - name="h2ogpt", - roles=("<|prompt|>", "<|answer|>"), - sep_style=SeparatorStyle.NO_COLON_SINGLE, - sep="", - ) -) - -# Robin default template -register_conv_template( - Conversation( - name="Robin", - system_message="A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.", - roles=("###Human", "###Assistant"), - sep_style=SeparatorStyle.ROBIN, - sep="\n", - stop_token_ids=[2, 396], - stop_str="###", - ) -) - -# Snoozy default template -# Reference: https://github.com/nomic-ai/gpt4all/blob/d4861030b778da6db59d21d2927a4aba4f9f1f43/gpt4all-bindings/python/gpt4all/gpt4all.py#L232 -register_conv_template( - Conversation( - name="snoozy", - system_template="### Instruction:\n{system_message}", - system_message="The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.", - roles=("### Prompt", "### Response"), - sep_style=SeparatorStyle.ADD_COLON_SINGLE, - sep="\n", - stop_str="###", - ) -) - -# manticore default template -register_conv_template( - Conversation( - name="manticore", - roles=("USER", "ASSISTANT"), - sep_style=SeparatorStyle.ADD_COLON_TWO, - sep="\n", - sep2="", - ) -) - -# Falcon default template -register_conv_template( - Conversation( - name="falcon", - roles=("User", "Assistant"), - messages=[], - sep_style=SeparatorStyle.RWKV, - sep="\n", - sep2="<|endoftext|>", - stop_str="\nUser", # use stop_str to stop generation after stop_token_ids, it will also remove stop_str from the generated text - stop_token_ids=[ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - ], # it better only put special tokens here, because tokenizer only remove special tokens - ) -) - -# ChagGPT default template -register_conv_template( - Conversation( - name="polyglot_changgpt", - roles=("B", "A"), - sep_style=SeparatorStyle.ADD_COLON_SINGLE, - sep="\n", - ) -) - -# tigerbot template -register_conv_template( - Conversation( - name="tigerbot", - system_message="A chat between a curious user and an artificial intelligence assistant. " - "The assistant gives helpful, detailed, and polite answers to the user's questions.", - roles=("### Instruction", "### Response"), - sep_style=SeparatorStyle.ROBIN, - sep="\n\n", - stop_str="###", - ) -) - -# ref: https://huggingface.co/Salesforce/xgen-7b-8k-inst -register_conv_template( - Conversation( - name="xgen", - system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n", - roles=("### Human", "### Assistant"), - sep_style=SeparatorStyle.ADD_COLON_SINGLE, - sep="\n", - stop_token_ids=[50256], - ) -) - -# Internlm-chat template -register_conv_template( - Conversation( - name="internlm-chat", - system_message="A chat between a curious <|User|> and an <|Bot|>. 
The <|Bot|> gives helpful, detailed, and polite answers to the <|User|>'s questions.\n\n", - roles=("<|User|>", "<|Bot|>"), - sep_style=SeparatorStyle.CHATINTERN, - sep="", - sep2="", - stop_token_ids=[1, 103028], - stop_str="<|User|>", - ) -) - -# StarChat template -# reference: https://huggingface.co/spaces/HuggingFaceH4/starchat-playground/blob/main/dialogues.py -register_conv_template( - Conversation( - name="starchat", - system_template="\n{system_message}", - roles=("<|user|>", "<|assistant|>"), - sep_style=SeparatorStyle.CHATML, - sep="<|end|>", - stop_token_ids=[0, 49155], - stop_str="<|end|>", - ) -) - -# Baichuan-13B-Chat template -register_conv_template( - # source: https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/19ef51ba5bad8935b03acd20ff04a269210983bc/modeling_baichuan.py#L555 - # https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/main/generation_config.json - # https://github.com/baichuan-inc/Baichuan-13B/issues/25 - Conversation( - name="baichuan-chat", - roles=("", ""), - sep_style=SeparatorStyle.NO_COLON_SINGLE, - sep="", - stop_token_ids=[], - ) -) - -# Baichuan2-13B-Chat template -register_conv_template( - # source: https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/c6f8592a60b4ad73c210b28dd2ab3cca51abbf93/modeling_baichuan.py#L773 - # https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_config.json - # https://github.com/baichuan-inc/Baichuan2/issues/62 - Conversation( - name="baichuan2-chat", - roles=("", ""), - sep_style=SeparatorStyle.NO_COLON_SINGLE, - sep="", - stop_token_ids=[], - ) -) - -# Mistral template -# source: https://docs.mistral.ai/llm/mistral-instruct-v0.1#chat-template -register_conv_template( - Conversation( - name="mistral", - system_template="", - roles=("[INST] ", " [/INST]"), - sep_style=SeparatorStyle.LLAMA2, - sep="", - sep2=" ", - ) -) - -# llama2 template -# reference: https://huggingface.co/blog/codellama#conversational-instructions -# reference: https://github.com/facebookresearch/llama/blob/1a240688810f8036049e8da36b073f63d2ac552c/llama/generation.py#L212 -register_conv_template( - Conversation( - name="llama-2", - system_template="[INST] <>\n{system_message}\n<>\n\n", - roles=("[INST]", "[/INST]"), - sep_style=SeparatorStyle.LLAMA2, - sep=" ", - sep2=" ", - ) -) - -register_conv_template( - Conversation( - name="cutegpt", - roles=("问:", "答:\n"), - sep_style=SeparatorStyle.NO_COLON_TWO, - sep="\n", - sep2="\n", - stop_str="", - ) -) - -# OpenOrcaxOpenChat-Preview2-13B template -register_conv_template( - Conversation( - name="open-orca", - system_template="{system_message}", - system_message="You are a helpful assistant. Please answer truthfully and write out your " - "thinking step by step to be sure you get the right answer. If you make a mistake or encounter " - "an error in your thinking, say so out loud and attempt to correct it. If you don't know or " - "aren't sure about something, say so clearly. You will act as a professional logician, mathematician, " - "and physicist. You will also act as the most appropriate type of expert to answer any particular " - "question or solve the relevant problem; state which expert type your are, if so. 
Also think of " - "any particular named expert that would be ideal to answer the relevant question or solve the " - "relevant problem; name and act as them, if appropriate.", - roles=("User", "Assistant"), - sep_style=SeparatorStyle.ADD_COLON_SPACE_SINGLE, - sep="<|end_of_turn|>\n", - stop_token_ids=[32000, 32001], # "<|end_of_turn|>" - stop_str="User", - ) -) - - -# Qwen-chat default template -# source: https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/qwen_generation_utils.py#L130 -register_conv_template( - Conversation( - name="qwen-7b-chat", - system_template="<|im_start|>system\n{system_message}", - system_message="You are a helpful assistant.", - roles=("<|im_start|>user", "<|im_start|>assistant"), - sep_style=SeparatorStyle.CHATML, - sep="<|im_end|>", - stop_token_ids=[ - 151643, - 151644, - 151645, - ], # "<|endoftext|>", "<|im_start|>", "<|im_end|>" - stop_str="<|endoftext|>", - ) -) - - -# AquilaChat default template -# source: https://github.com/FlagAI-Open/FlagAI/blob/master/examples/Aquila/Aquila-chat/cyg_conversation.py -register_conv_template( - Conversation( - name="aquila-chat", - system_message="A chat between a curious human and an artificial intelligence assistant. " - "The assistant gives helpful, detailed, and polite answers to the human's questions.", - roles=("Human", "Assistant", "System"), - sep_style=SeparatorStyle.ADD_COLON_SINGLE, - sep="###", - sep2="", - stop_str=["###", "", "[UNK]"], - ) -) - -# Llama2-Chinese default template -# source: https://huggingface.co/FlagAlpha -register_conv_template( - Conversation( - name="llama2-chinese", - system_template="{system_message}", - roles=("Human", "Assistant", "System"), - sep_style=SeparatorStyle.ADD_COLON_TWO, - sep="\n", - sep2="\n", - stop_str="", - ) -) - -# Vigogne Chat default template -# source: https://github.com/bofenghuang/vigogne -register_conv_template( - Conversation( - name="vigogne-chat", - system_template="<|system|>: {system_message}", - system_message="Vous êtes l'assistant IA nommé Vigogne, créé par Zaion Lab (https://zaion.ai). " - "Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.", - roles=("<|user|>", "<|assistant|>"), - sep_style=SeparatorStyle.ADD_COLON_TWO, - sep="\n", - sep2="\n", - stop_str="<|user|>", - ) -) - -# Falcon 180B chat template -# source: https://huggingface.co/spaces/tiiuae/falcon-180b-demo/blob/d1590ee7fae9b6ce331ba7808e61a29dcce9239f/app.py#L28-L37 -register_conv_template( - Conversation( - name="falcon-chat", - roles=("User", "Falcon"), - system_template="System: {system_message}", - messages=[], - sep_style=SeparatorStyle.FALCON_CHAT, - sep="\n", - sep2="<|endoftext|>", - stop_str="\nUser:", # use stop_str to stop generation after stop_token_ids, it will also remove stop_str from the generated text - ) -) - -# Phind template -# source: https://huggingface.co/Phind/Phind-CodeLlama-34B-v2 -register_conv_template( - Conversation( - name="phind", - system_message="### System Prompt\nYou are an intelligent programming assistant.", - roles=("### User Message", "### Assistant"), - messages=(), - offset=0, - sep_style=SeparatorStyle.ADD_COLON_SINGLE, - sep="\n\n", - ) -) - -# Metharme formatting for Pygmalion models -# source: https://huggingface.co/PygmalionAI/pygmalion-2-13b -register_conv_template( - Conversation( - name="metharme", - system_template="<|system|>{system_message}", - system_message="""Enter RP mode. You shall reply to the user while staying - in character. 
Your responses must be detailed, creative, immersive, and drive the scenario - forward.""", - roles=("<|user|>", "<|model|>"), - sep_style=SeparatorStyle.NO_COLON_SINGLE, - sep="", - stop_str="<|user|>", - ) -) - -register_conv_template( - Conversation( - name="llama-ptuning", - system_template="[INST] <>\n{system_message}\n<>\n\n", - roles=("[INST]", "[/INST]"), - sep_style=SeparatorStyle.LLAMA2, - sep=" ", - sep2=" ", - ) -) - - - -if __name__ == "__main__": - print("Vicuna template:") - conv = get_conv_template("vicuna_v1.1") - conv.append_message(conv.roles[0], "Hello!") - conv.append_message(conv.roles[1], "Hi!") - conv.append_message(conv.roles[0], "How are you?") - conv.append_message(conv.roles[1], None) - print(conv.get_prompt()) - - print("\n") - - print("Llama-2 template:") - conv = get_conv_template("llama-2") - conv.set_system_message("You are a helpful, respectful and honest assistant.") - conv.append_message(conv.roles[0], "Hello!") - conv.append_message(conv.roles[1], "Hi!") - conv.append_message(conv.roles[0], "How are you?") - conv.append_message(conv.roles[1], None) - print(conv.get_prompt()) \ No newline at end of file diff --git a/llm/client/test_client.py b/llm/client/test_client.py deleted file mode 100644 index c1ca8f9ad3..0000000000 --- a/llm/client/test_client.py +++ /dev/null @@ -1,7 +0,0 @@ -from Client import grpcClient - -client = grpcClient(base_url="0.0.0.0:8812", - model_name="llama-ptuning", - timeout= 100) -result = client.generate("Hello, how are you") -print(result) \ No newline at end of file diff --git a/llm/client/Client.py b/llm/fastdeploy_llm/Client.py similarity index 73% rename from llm/client/Client.py rename to llm/fastdeploy_llm/Client.py index 55e9889738..baece99743 100644 --- a/llm/client/Client.py +++ b/llm/fastdeploy_llm/Client.py @@ -1,12 +1,30 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import queue import json import sys from functools import partial - +import os +import time import numpy as np +import subprocess import tritonclient.grpc as grpcclient from tritonclient.utils import * +import api_client + class UserData: def __init__(self): self._completed_requests = queue.Queue() @@ -26,6 +44,7 @@ def __init__( model_name: str, model_version: str = "1", timeout: int = 100, + openai_port: int = None ): """ Args: @@ -33,6 +52,7 @@ def __init__( model_name (`str`) model_version (`str`): default "1" timeout (`int`): inference timeout in seconds + openai_port (`int`) """ self._model_name = model_name self._model_version = model_version @@ -48,7 +68,13 @@ def __init__( self.inputs = [grpcclient.InferInput("IN", [1], np_to_triton_dtype(np.object_))] self.outputs = [grpcclient.InferRequestedOutput("OUT")] self.has_init = False - self.user_data = UserData() + self.user_data = UserData() + + if openai_port is not None: + pd_cmd = "python3 api_client.py --url {0} --port {1} --model {2}".format(base_url, openai_port, model_name) + subprocess.Popen(pd_cmd, shell=True, stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, preexec_fn=os.setsid) + time.sleep(5) def _verify_triton_state(self, triton_client): if not triton_client.is_server_live(): @@ -71,10 +97,10 @@ def generate( penalty_score: float = 1.0, frequency_score: float = 0.99, eos_token_id: int =2, - presence_score: float = 0.0 + presence_score: float = 0.0, + stream: bool=False ): - #text = data_process(prompt) req_dict = { "text": prompt, "topp": top_p, @@ -89,7 +115,6 @@ def generate( } try: - # Establish stream if not self.has_init: self._client.start_stream(callback=partial(callback, self.user_data)) self.has_init = True @@ -105,8 +130,10 @@ def generate( inputs=self.inputs, request_id=request_id, outputs=self.outputs) - # Retrieve results... - completion = "" + if stream: + completion = [] + else: + completion = "" while True: data_item = self.user_data._completed_requests.get(timeout=self.timeout) if type(data_item) == InferenceServerException: @@ -114,11 +141,13 @@ def generate( else: results = data_item.as_numpy("OUT")[0] data = json.loads(results) - completion += data["result"] + if stream: + completion.append(data["result"]) + else: + completion += data["result"] if data.get("is_end", False): break return completion except Exception as e: print(f"Client infer error: {e}") raise e - diff --git a/llm/fastdeploy_llm/__init__.py b/llm/fastdeploy_llm/__init__.py index 4fe63367d9..6bcc2b0e43 100644 --- a/llm/fastdeploy_llm/__init__.py +++ b/llm/fastdeploy_llm/__init__.py @@ -17,3 +17,4 @@ from .task import Task, BatchTask from .config import Config from . import utils +from . import Client diff --git a/llm/client/api_client.py b/llm/fastdeploy_llm/api_client.py similarity index 93% rename from llm/client/api_client.py rename to llm/fastdeploy_llm/api_client.py index e6500d28c9..c0d5aa0b72 100644 --- a/llm/client/api_client.py +++ b/llm/fastdeploy_llm/api_client.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + import json import logging from logging.handlers import TimedRotatingFileHandler @@ -12,11 +26,10 @@ from tornado.concurrent import run_on_executor from concurrent.futures import ThreadPoolExecutor -from conversation import * +from utils.conversation import * from Client import * - parse = argparse.ArgumentParser() parse.add_argument( '--url', type=str, help='grpc server url') @@ -25,9 +38,6 @@ parse.add_argument( '--model', type=str, help='model name', default="model") - - - def parse_parameters(parameters_config, name, default_value): if name not in parameters_config: return default_value @@ -90,7 +100,7 @@ def post(self): out_json = create_error_response(4000102,"result is empty") else: out_json = {"outputs" : [data], - "status": 0} + "status": 0} result_str = json.dumps(out_json, ensure_ascii=False) else: result_str = json.dumps(err, ensure_ascii=False) @@ -160,11 +170,14 @@ def run_req(self, body): temperature = parse_parameters(body, 'temperature', 1.0), max_dec_len = parse_parameters(body, 'max_tokens', 1024), frequency_score= parse_parameters(body, 'frequency_penalty', 0.99), - presence_score= parse_parameters(body, 'presence_penalty', 0.0) + presence_score= parse_parameters(body, 'presence_penalty', 0.0), + stream= parse_parameters(body, 'stream', False) ) return result + + class CompletionApiHandler(web.RequestHandler): """ This handler provides OpenAI's Completion API。 @@ -269,7 +282,8 @@ def run_req(self, body): temperature = parse_parameters(body, 'temperature', 1.0), max_dec_len = parse_parameters(body, 'max_tokens', 1024), frequency_score= parse_parameters(body, 'frequency_penalty', 0.99), - presence_score= parse_parameters(body, 'presence_penalty', 0.0) + presence_score= parse_parameters(body, 'presence_penalty', 0.0), + stream= parse_parameters(body, 'stream', False) ) return result diff --git a/llm/fastdeploy_llm/utils/conversation.py b/llm/fastdeploy_llm/utils/conversation.py new file mode 100644 index 0000000000..4403bc70ca --- /dev/null +++ b/llm/fastdeploy_llm/utils/conversation.py @@ -0,0 +1,103 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import dataclasses + + +@dataclasses.dataclass +class Conversation: + name: str + system_template: str = "{system_message}" + system_message: str = "" + roles: list[str] = (("USER", "ASSISTANT"),) + messages: list[list[str]] = () + sep_style: str = "" + sep: str = "\n" + sep2: str = None + stop_token_ids: list[int] = None + + def get_prompt(self) -> str: + system_prompt = self.system_template.format(system_message=self.system_message) + if self.name == "llama-ptuning": + seps = [self.sep, self.sep2] + if self.system_message: + ret = system_prompt + else: + ret = "[INST] " + for i, (role, message) in enumerate(self.messages): + if message: + if i == 0: + ret += message + " " + else: + ret += role + " " + message + seps[i % 2] + else: + ret += role + return ret + + def set_system_message(self, system_message: str): + self.system_message = system_message + + def append_message(self, role: str, message: str): + self.messages.append([role, message]) + + def copy(self): + return Conversation( + name=self.name, + system_template=self.system_template, + system_message=self.system_message, + roles=self.roles, + messages=[[x, y] for x, y in self.messages], + sep_style=self.sep_style, + sep=self.sep, + sep2=self.sep2, + stop_token_ids=self.stop_token_ids, + ) + + +conv_templates: dict[str, Conversation] = {} + + +def register_conv_template(template: Conversation, override: bool = False): + if not override: + assert ( + template.name not in conv_templates + ), f"{template.name} has been registered." + + conv_templates[template.name] = template + + +def get_conv_template(name: str) -> Conversation: + return conv_templates[name].copy() + +register_conv_template( + Conversation( + name="llama-ptuning", + system_template="[INST] <>\n{system_message}\n<>\n\n", + roles=("[INST]", "[/INST]"), + sep=" ", + sep2=" ", + ) +) + + + +if __name__ == "__main__": + print("llama-ptuning template:") + conv = get_conv_template("llama-ptuning") + conv.set_system_message("You are a helpful, respectful and honest assistant.") + conv.append_message(conv.roles[0], "Hello!") + conv.append_message(conv.roles[1], "Hi!") + conv.append_message(conv.roles[0], "How are you?") + conv.append_message(conv.roles[1], None) + print(conv.get_prompt()) \ No newline at end of file diff --git a/llm/test/README.md b/llm/test/README.md new file mode 100644 index 0000000000..7f10ac70ba --- /dev/null +++ b/llm/test/README.md @@ -0,0 +1,9 @@ +## 客户端 + +- 支持open ai 两种api :ChatCompletion 与 Completion + +### 使用方式 +- 这里提供了fastdelopy 客户端的示例demo,参考test 目录下 test_client.py +- openai 两个接口示例代码在test 目录下 test_openai.py + + diff --git a/llm/test/test_client.py b/llm/test/test_client.py new file mode 100644 index 0000000000..33a94af79c --- /dev/null +++ b/llm/test/test_client.py @@ -0,0 +1,21 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from fastdeploy_llm.Client import grpcClient + +client = grpcClient(base_url="0.0.0.0:8812", + model_name="llama-ptuning", + timeout= 100) +result = client.generate("Hello, how are you") +print(result) \ No newline at end of file diff --git a/llm/client/test_openai.py b/llm/test/test_openai.py similarity index 52% rename from llm/client/test_openai.py rename to llm/test/test_openai.py index d24b92ce14..eb5a8563a6 100644 --- a/llm/client/test_openai.py +++ b/llm/test/test_openai.py @@ -1,29 +1,32 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import openai -import subprocess -import os -import time -import signal +from fastdeploy_llm.Client import grpcClient model = "llama-ptuning" port = 2001 url = "0.0.0.0:8812" -pd_cmd = "python3 api_client.py --url {0} --port {1} --model {2}".format(url, port, model) -print("pd_cmd: ", pd_cmd) -pd_process = subprocess.Popen(pd_cmd, shell=True, stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, preexec_fn=os.setsid) - -time.sleep( 5 ) +client = grpcClient(base_url= url, model_name= model, openai_port= port) # Modify OpenAI's API key and API base. openai.api_key = "EMPTY" -openai.api_base = "http://0.0.0.0:"+str(port)+"/v1" +openai.api_base = "http://0.0.0.0:" + str(port) + "/v1" # Completion API -# -stream = False - completion = openai.Completion.create( model=model, prompt="A robot may not injure a human being" @@ -33,8 +36,6 @@ print(completion) # ChatCompletion API -# - chat_completion = openai.ChatCompletion.create( model=model, messages=[{ @@ -55,4 +56,3 @@ print("Chat completion results:") print(chat_completion) -os.killpg(os.getpgid(pd_process.pid), signal.SIGTERM) \ No newline at end of file From 3f6a547567519c76f87126bcf7561e3abcbf92be Mon Sep 17 00:00:00 2001 From: luotingdan Date: Tue, 17 Oct 2023 19:24:20 +0800 Subject: [PATCH 3/3] format code --- llm/fastdeploy_llm/Client.py | 83 +++++----- llm/fastdeploy_llm/api_client.py | 198 ++++++++++++----------- llm/fastdeploy_llm/utils/conversation.py | 103 ------------ llm/test/test_client.py | 6 +- llm/test/test_openai.py | 8 +- 5 files changed, 159 insertions(+), 239 deletions(-) delete mode 100644 llm/fastdeploy_llm/utils/conversation.py diff --git a/llm/fastdeploy_llm/Client.py b/llm/fastdeploy_llm/Client.py index baece99743..53965365b3 100644 --- a/llm/fastdeploy_llm/Client.py +++ b/llm/fastdeploy_llm/Client.py @@ -25,7 +25,9 @@ import api_client + class UserData: + def __init__(self): self._completed_requests = queue.Queue() @@ -38,14 +40,13 @@ def callback(user_data, result, error): class grpcClient: - def __init__( - self, - base_url: str, - model_name: str, - model_version: str = "1", - timeout: int = 100, - openai_port: int = None - ): + + def __init__(self, + base_url: str, + model_name: str, + model_version: str = "1", + timeout: int = 100, + openai_port: int = None): """ Args: base_url (`str`): inference server grpc url @@ -57,23 +58,29 @@ def __init__( self._model_name = model_name self._model_version = 
model_version self.timeout = timeout - self._client = grpcclient.InferenceServerClient( - base_url, verbose=False) - + self._client = grpcclient.InferenceServerClient(base_url, + verbose=False) + error = self._verify_triton_state(self._client) if error: raise RuntimeError( f"Could not communicate to Triton Server: {error}") - self.inputs = [grpcclient.InferInput("IN", [1], np_to_triton_dtype(np.object_))] + self.inputs = [ + grpcclient.InferInput("IN", [1], np_to_triton_dtype(np.object_)) + ] self.outputs = [grpcclient.InferRequestedOutput("OUT")] self.has_init = False self.user_data = UserData() if openai_port is not None: - pd_cmd = "python3 api_client.py --url {0} --port {1} --model {2}".format(base_url, openai_port, model_name) - subprocess.Popen(pd_cmd, shell=True, stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, preexec_fn=os.setsid) + pd_cmd = "python3 api_client.py --url {0} --port {1} --model {2}".format( + base_url, openai_port, model_name) + subprocess.Popen(pd_cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + preexec_fn=os.setsid) time.sleep(5) def _verify_triton_state(self, triton_client): @@ -85,22 +92,20 @@ def _verify_triton_state(self, triton_client): self._model_version): return f"Model {self._model_name}:{self._model_version} is not ready" return None - - def generate( - self, - prompt: str, - request_id: str = "0", - top_p: float = 0.0, - temperature: float = 1.0, - max_dec_len: int = 1024, - min_dec_len: int = 2, - penalty_score: float = 1.0, - frequency_score: float = 0.99, - eos_token_id: int =2, - presence_score: float = 0.0, - stream: bool=False - ): - + + def generate(self, + prompt: str, + request_id: str = "0", + top_p: float = 0.0, + temperature: float = 1.0, + max_dec_len: int = 1024, + min_dec_len: int = 2, + penalty_score: float = 1.0, + frequency_score: float = 0.99, + eos_token_id: int = 2, + presence_score: float = 0.0, + stream: bool = False): + req_dict = { "text": prompt, "topp": top_p, @@ -112,15 +117,19 @@ def generate( "eos_token_id": eos_token_id, "model_test": "test", "presence_score": presence_score - } + } try: if not self.has_init: - self._client.start_stream(callback=partial(callback, self.user_data)) + self._client.start_stream( + callback=partial(callback, self.user_data)) self.has_init = True else: self.user_data.reset() - self.inputs = [grpcclient.InferInput("IN", [1], np_to_triton_dtype(np.object_))] + self.inputs = [ + grpcclient.InferInput("IN", [1], + np_to_triton_dtype(np.object_)) + ] self.outputs = [grpcclient.InferRequestedOutput("OUT")] in_data = np.array([json.dumps(req_dict)], dtype=np.object_) @@ -135,9 +144,11 @@ def generate( else: completion = "" while True: - data_item = self.user_data._completed_requests.get(timeout=self.timeout) + data_item = self.user_data._completed_requests.get( + timeout=self.timeout) if type(data_item) == InferenceServerException: - print('Exception:', 'status', data_item.status(), 'msg', data_item.message()) + print('Exception:', 'status', data_item.status(), 'msg', + data_item.message()) else: results = data_item.as_numpy("OUT")[0] data = json.loads(results) diff --git a/llm/fastdeploy_llm/api_client.py b/llm/fastdeploy_llm/api_client.py index c0d5aa0b72..718a06b49a 100644 --- a/llm/fastdeploy_llm/api_client.py +++ b/llm/fastdeploy_llm/api_client.py @@ -29,30 +29,29 @@ from utils.conversation import * from Client import * - parse = argparse.ArgumentParser() -parse.add_argument( - '--url', type=str, help='grpc server url') -parse.add_argument( - '--port', type=int, 
help='openai http port', default=2001) -parse.add_argument( - '--model', type=str, help='model name', default="model") +parse.add_argument('--url', type=str, help='grpc server url') +parse.add_argument('--port', type=int, help='openai http port', default=2001) +parse.add_argument('--model', type=str, help='model name', default="model") + def parse_parameters(parameters_config, name, default_value): - if name not in parameters_config: - return default_value - return parameters_config[name] + if name not in parameters_config: + return default_value + return parameters_config[name] + def create_error_response(status_code, msg): - output ={ - "status" : status_code, + output = { + "status": status_code, "errResponse": { - "message":msg, - "type":"invalid_request_error" + "message": msg, + "type": "invalid_request_error" } } return output - + + class ChatCompletionApiHandler(web.RequestHandler): """ This handler provides OpenAI's ChatCompletion API。 @@ -69,12 +68,12 @@ class ChatCompletionApiHandler(web.RequestHandler): - function (Users should implement this by themselves) """ executor = ThreadPoolExecutor(20) - + def __init__(self, application, request, **kwargs): web.RequestHandler.__init__(self, application, request, **kwargs) def initialize(self, url, model_name): - self._client = grpcClient(base_url= url,model_name=model_name) + self._client = grpcClient(base_url=url, model_name=model_name) @tornado.gen.coroutine def post(self): @@ -83,29 +82,35 @@ def post(self): """ body = self.request.body remote_ip = self.request.remote_ip - start_time = time.time() + start_time = time.time() if not body: out_json = {"errorCode": 4000101} result_str = json.dumps(out_json, ensure_ascii=False) - logging.warning(f"request receieved from remote ip:{remote_ip}, body=None,\ - result={result_str}, time_cost={time.time() - start_time : 0.5f}") + logging.warning( + f"request receieved from remote ip:{remote_ip}, body=None,\ + result={result_str}, time_cost={time.time() - start_time : 0.5f}" + ) self.write(result_str) else: body = json.loads(body) - logging.info(f"request receieved from remote ip:{remote_ip}, body={json.dumps(body, ensure_ascii=False)}") + logging.info( + f"request receieved from remote ip:{remote_ip}, body={json.dumps(body, ensure_ascii=False)}" + ) err = self.valid_body(body) if err is None: data = yield self.run_req(body) if data is None: - out_json = create_error_response(4000102,"result is empty") + out_json = create_error_response(4000102, + "result is empty") else: - out_json = {"outputs" : [data], - "status": 0} + out_json = {"outputs": [data], "status": 0} result_str = json.dumps(out_json, ensure_ascii=False) else: result_str = json.dumps(err, ensure_ascii=False) - - logging.info(f"request returned, result={result_str}, time_cost={time.time() - start_time : 0.5f}") + + logging.info( + f"request returned, result={result_str}, time_cost={time.time() - start_time : 0.5f}" + ) self.write(result_str) def valid_body(self, request): @@ -120,21 +125,24 @@ def valid_body(self, request): If the request body is valid, return None; otherwise, return json with the error message """ - if request['model'] != self._client._model_name : - return create_error_response(HTTPStatus.BAD_REQUEST, - "current model is not currently supported") + if request['model'] != self._client._model_name: + return create_error_response( + HTTPStatus.BAD_REQUEST, + "current model is not currently supported") if 'n' in request and request['n'] != 1: return create_error_response(HTTPStatus.BAD_REQUEST, - "n only support 1") 
+ "n only support 1") if 'logit_bias' in request and request['logit_bias'] is not None: - return create_error_response(HTTPStatus.BAD_REQUEST, - "logit_bias is not currently supported") + return create_error_response( + HTTPStatus.BAD_REQUEST, + "logit_bias is not currently supported") if 'functions' in request and request['functions'] is not None: - return create_error_response(HTTPStatus.BAD_REQUEST, - "functions is not currently supported") + return create_error_response( + HTTPStatus.BAD_REQUEST, "functions is not currently supported") if 'function_call' in request and request['function_call'] is not None: - return create_error_response(HTTPStatus.BAD_REQUEST, - "function_call is not currently supported") + return create_error_response( + HTTPStatus.BAD_REQUEST, + "function_call is not currently supported") return None def gen_prompt(self, request): @@ -164,20 +172,17 @@ def run_req(self, body): req_id = random.randint(0, 100000) prompt = self.gen_prompt(body) result = self._client.generate( - request_id = str(req_id), - prompt = prompt, - top_p = parse_parameters(body, 'top_p', 0.0), - temperature = parse_parameters(body, 'temperature', 1.0), - max_dec_len = parse_parameters(body, 'max_tokens', 1024), - frequency_score= parse_parameters(body, 'frequency_penalty', 0.99), - presence_score= parse_parameters(body, 'presence_penalty', 0.0), - stream= parse_parameters(body, 'stream', False) - ) + request_id=str(req_id), + prompt=prompt, + top_p=parse_parameters(body, 'top_p', 0.0), + temperature=parse_parameters(body, 'temperature', 1.0), + max_dec_len=parse_parameters(body, 'max_tokens', 1024), + frequency_score=parse_parameters(body, 'frequency_penalty', 0.99), + presence_score=parse_parameters(body, 'presence_penalty', 0.0), + stream=parse_parameters(body, 'stream', False)) return result - - class CompletionApiHandler(web.RequestHandler): """ This handler provides OpenAI's Completion API。 @@ -196,12 +201,12 @@ class CompletionApiHandler(web.RequestHandler): - stop (currently support token id) """ executor = ThreadPoolExecutor(20) - + def __init__(self, application, request, **kwargs): web.RequestHandler.__init__(self, application, request, **kwargs) def initialize(self, url, model_name): - self._client = grpcClient(base_url= url,model_name=model_name) + self._client = grpcClient(base_url=url, model_name=model_name) @tornado.gen.coroutine def post(self): @@ -210,32 +215,38 @@ def post(self): """ body = self.request.body remote_ip = self.request.remote_ip - start_time = time.time() + start_time = time.time() if not body: out_json = {"errorCode": 4000101} result_str = json.dumps(out_json, ensure_ascii=False) - logging.warning(f"request receieved from remote ip:{remote_ip}, body=None,\ - result={result_str}, time_cost={time.time() - start_time : 0.5f}") + logging.warning( + f"request receieved from remote ip:{remote_ip}, body=None,\ + result={result_str}, time_cost={time.time() - start_time : 0.5f}" + ) self.write(result_str) else: body = json.loads(body) - logging.info(f"request receieved from remote ip:{remote_ip}, body={json.dumps(body, ensure_ascii=False)}") + logging.info( + f"request receieved from remote ip:{remote_ip}, body={json.dumps(body, ensure_ascii=False)}" + ) err = self.valid_body(body) if err is None: data = yield self.run_req(body) if data is None: - out_json = create_error_response(4000102,"result is empty") + out_json = create_error_response(4000102, + "result is empty") else: - out_json = {"outputs" : [data], - "status": 0} + out_json = {"outputs": [data], "status": 0} 
result_str = json.dumps(out_json, ensure_ascii=False) else: result_str = json.dumps(err, ensure_ascii=False) - - logging.info(f"request returned, result={result_str}, time_cost={time.time() - start_time : 0.5f}") + + logging.info( + f"request returned, result={result_str}, time_cost={time.time() - start_time : 0.5f}" + ) self.write(result_str) - + def valid_body(self, request): """ Check whether the request body is legal @@ -248,27 +259,29 @@ def valid_body(self, request): If the request body is valid, return None; otherwise, return json with the error message """ - if request['model'] != self._client._model_name : - return create_error_response(HTTPStatus.BAD_REQUEST, - "current model is not currently supported") + if request['model'] != self._client._model_name: + return create_error_response( + HTTPStatus.BAD_REQUEST, + "current model is not currently supported") if 'n' in request and request['n'] != 1: return create_error_response(HTTPStatus.BAD_REQUEST, - "n only support 1") + "n only support 1") if 'best_of' in request and request['best_of'] != 1: return create_error_response(HTTPStatus.BAD_REQUEST, - "best_of only support 1") + "best_of only support 1") if 'echo' in request and request['echo']: return create_error_response(HTTPStatus.BAD_REQUEST, - "not suport echo") + "not suport echo") if 'suffix' in request and request['suffix'] is not None: return create_error_response(HTTPStatus.BAD_REQUEST, - "not suport suffix") + "not suport suffix") if 'logit_bias' in request and request['logit_bias'] is not None: - return create_error_response(HTTPStatus.BAD_REQUEST, - "logit_bias is not currently supported") + return create_error_response( + HTTPStatus.BAD_REQUEST, + "logit_bias is not currently supported") if 'logprobs' in request and request['logprobs'] is not None: - return create_error_response(HTTPStatus.BAD_REQUEST, - "logprobs is not currently supported") + return create_error_response( + HTTPStatus.BAD_REQUEST, "logprobs is not currently supported") return None @@ -276,38 +289,41 @@ def valid_body(self, request): def run_req(self, body): req_id = random.randint(0, 100000) result = self._client.generate( - request_id = str(req_id), - prompt = body['prompt'], - top_p = parse_parameters(body, 'top_p', 0.0), - temperature = parse_parameters(body, 'temperature', 1.0), - max_dec_len = parse_parameters(body, 'max_tokens', 1024), - frequency_score= parse_parameters(body, 'frequency_penalty', 0.99), - presence_score= parse_parameters(body, 'presence_penalty', 0.0), - stream= parse_parameters(body, 'stream', False) - ) + request_id=str(req_id), + prompt=body['prompt'], + top_p=parse_parameters(body, 'top_p', 0.0), + temperature=parse_parameters(body, 'temperature', 1.0), + max_dec_len=parse_parameters(body, 'max_tokens', 1024), + frequency_score=parse_parameters(body, 'frequency_penalty', 0.99), + presence_score=parse_parameters(body, 'presence_penalty', 0.0), + stream=parse_parameters(body, 'stream', False)) return result - - if __name__ == '__main__': - args=parse.parse_args() + args = parse.parse_args() port = args.port - app = web.Application([("/v1/completions", CompletionApiHandler, dict(url=args.url,model_name = args.model)), - ("/v1/chat/completions", ChatCompletionApiHandler, dict(url=args.url,model_name = args.model)) - ]) + app = web.Application([("/v1/completions", CompletionApiHandler, + dict(url=args.url, model_name=args.model)), + ("/v1/chat/completions", ChatCompletionApiHandler, + dict(url=args.url, model_name=args.model))]) logger = logging.getLogger() 
logger.setLevel(logging.INFO) formatter = tornado.log.LogFormatter( - fmt='%(levelname)-8s %(asctime)s %(process)-5s %(filename)s[line:%(lineno)d] %(message)s', - datefmt='%Y-%m-%d %H:%M:%S' - ) - - file_handler = TimedRotatingFileHandler(filename='log/server.log', when='D', interval=3, backupCount=90, encoding='utf-8', delay=False) + fmt= + '%(levelname)-8s %(asctime)s %(process)-5s %(filename)s[line:%(lineno)d] %(message)s', + datefmt='%Y-%m-%d %H:%M:%S') + + file_handler = TimedRotatingFileHandler(filename='log/server.log', + when='D', + interval=3, + backupCount=90, + encoding='utf-8', + delay=False) file_handler.setFormatter(formatter) - + logger.addHandler(file_handler) app.listen(port) print("Server started") diff --git a/llm/fastdeploy_llm/utils/conversation.py b/llm/fastdeploy_llm/utils/conversation.py deleted file mode 100644 index 4403bc70ca..0000000000 --- a/llm/fastdeploy_llm/utils/conversation.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import dataclasses - - -@dataclasses.dataclass -class Conversation: - name: str - system_template: str = "{system_message}" - system_message: str = "" - roles: list[str] = (("USER", "ASSISTANT"),) - messages: list[list[str]] = () - sep_style: str = "" - sep: str = "\n" - sep2: str = None - stop_token_ids: list[int] = None - - def get_prompt(self) -> str: - system_prompt = self.system_template.format(system_message=self.system_message) - if self.name == "llama-ptuning": - seps = [self.sep, self.sep2] - if self.system_message: - ret = system_prompt - else: - ret = "[INST] " - for i, (role, message) in enumerate(self.messages): - if message: - if i == 0: - ret += message + " " - else: - ret += role + " " + message + seps[i % 2] - else: - ret += role - return ret - - def set_system_message(self, system_message: str): - self.system_message = system_message - - def append_message(self, role: str, message: str): - self.messages.append([role, message]) - - def copy(self): - return Conversation( - name=self.name, - system_template=self.system_template, - system_message=self.system_message, - roles=self.roles, - messages=[[x, y] for x, y in self.messages], - sep_style=self.sep_style, - sep=self.sep, - sep2=self.sep2, - stop_token_ids=self.stop_token_ids, - ) - - -conv_templates: dict[str, Conversation] = {} - - -def register_conv_template(template: Conversation, override: bool = False): - if not override: - assert ( - template.name not in conv_templates - ), f"{template.name} has been registered." 
- - conv_templates[template.name] = template - - -def get_conv_template(name: str) -> Conversation: - return conv_templates[name].copy() - -register_conv_template( - Conversation( - name="llama-ptuning", - system_template="[INST] <>\n{system_message}\n<>\n\n", - roles=("[INST]", "[/INST]"), - sep=" ", - sep2=" ", - ) -) - - - -if __name__ == "__main__": - print("llama-ptuning template:") - conv = get_conv_template("llama-ptuning") - conv.set_system_message("You are a helpful, respectful and honest assistant.") - conv.append_message(conv.roles[0], "Hello!") - conv.append_message(conv.roles[1], "Hi!") - conv.append_message(conv.roles[0], "How are you?") - conv.append_message(conv.roles[1], None) - print(conv.get_prompt()) \ No newline at end of file diff --git a/llm/test/test_client.py b/llm/test/test_client.py index 33a94af79c..0e7c22bb19 100644 --- a/llm/test/test_client.py +++ b/llm/test/test_client.py @@ -14,8 +14,8 @@ from fastdeploy_llm.Client import grpcClient -client = grpcClient(base_url="0.0.0.0:8812", - model_name="llama-ptuning", - timeout= 100) +client = grpcClient(base_url="0.0.0.0:8812", + model_name="llama-ptuning", + timeout=100) result = client.generate("Hello, how are you") print(result) \ No newline at end of file diff --git a/llm/test/test_openai.py b/llm/test/test_openai.py index eb5a8563a6..b0165ee34a 100644 --- a/llm/test/test_openai.py +++ b/llm/test/test_openai.py @@ -19,18 +19,15 @@ port = 2001 url = "0.0.0.0:8812" -client = grpcClient(base_url= url, model_name= model, openai_port= port) +client = grpcClient(base_url=url, model_name=model, openai_port=port) # Modify OpenAI's API key and API base. openai.api_key = "EMPTY" openai.api_base = "http://0.0.0.0:" + str(port) + "/v1" - # Completion API completion = openai.Completion.create( - model=model, - prompt="A robot may not injure a human being" -) + model=model, prompt="A robot may not injure a human being") print("Completion results:") print(completion) @@ -55,4 +52,3 @@ }]) print("Chat completion results:") print(chat_completion) -
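
For reference, a minimal end-to-end usage sketch assembled from the pieces this patch adds (fastdeploy_llm/Client.py, fastdeploy_llm/api_client.py, and the tests under llm/test). The gRPC address 0.0.0.0:8812, the model name "llama-ptuning", and HTTP port 2001 are illustrative values copied from the test scripts; a Triton inference server is assumed to already be serving the model at that address, and this sketch is not part of the patch itself.

import openai
from fastdeploy_llm.Client import grpcClient

# Passing openai_port also launches api_client.py as a subprocess, which
# exposes /v1/completions and /v1/chat/completions on that port.
client = grpcClient(base_url="0.0.0.0:8812",
                    model_name="llama-ptuning",
                    openai_port=2001)

# Direct gRPC generation; with stream=True the per-step results are
# returned as a list instead of one concatenated string.
pieces = client.generate("Hello, how are you", stream=True)
print("".join(pieces))

# Query the OpenAI-compatible HTTP endpoints started above.
openai.api_key = "EMPTY"
openai.api_base = "http://0.0.0.0:2001/v1"
completion = openai.Completion.create(
    model="llama-ptuning",
    prompt="A robot may not injure a human being")
print(completion)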