From 39f41565ef3b4140c31aebc72ba05af02340fcd2 Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Sun, 4 Feb 2024 14:49:10 +0800
Subject: [PATCH] update documents, set log level

---
 docs/en/inference/pipeline.md            | 60 +++++++++++++++++++++
 docs/en/serving/restful_api.md           | 62 +++++++++++++++++++++
 docs/zh_cn/inference/pipeline.md         | 68 ++++++++++++++++++++++++
 docs/zh_cn/serving/restful_api.md        | 63 +++++++++++++++++++++-
 lmdeploy/api.py                          | 35 +++++++-----
 lmdeploy/serve/async_engine.py           |  5 ++
 lmdeploy/serve/openai/api_server.py      |  4 +-
 src/turbomind/models/llama/LlamaBatch.cc |  2 +-
 8 files changed, 282 insertions(+), 17 deletions(-)

diff --git a/docs/en/inference/pipeline.md b/docs/en/inference/pipeline.md
index 603d9daaf..74d154461 100644
--- a/docs/en/inference/pipeline.md
+++ b/docs/en/inference/pipeline.md
@@ -233,6 +233,66 @@ This class contains the generation parameters used by inference engines.
 | stop_words | List\[str\] | Words that stop generating further tokens. | None |
 | bad_words | List\[str\] | Words that the engine will never generate. | None |
 
+## Customize chat template
+
+LMDeploy supports two ways of customizing the chat template:
+
+- One way is to customize a Python chat template class modeled on LMDeploy's built-in chat templates; once registered, it can be used directly. The advantage is a high degree of customization and strong controllability. Below is an example of registering an LMDeploy chat template:
+  ```python
+  from typing import Dict, Union
+
+  from lmdeploy import ChatTemplateConfig, pipeline
+  from lmdeploy.model import MODELS, BaseModel
+
+
+  @MODELS.register_module(name='customized_model')
+  class CustomizedModel(BaseModel):
+      """A customized chat template."""
+
+      def messages2prompt(self,
+                          messages: Union[str, Dict],
+                          sequence_start: bool = True) -> str:
+          """Apply the chat template to the input messages.
+
+          Args:
+              messages (str | Dict): input messages. Could be a str prompt or
+                  OpenAI format chat history. The former is for interactive chat.
+              sequence_start (bool): Only used in interactive chatting. When
+                  sequence_start is False, the begin-of-sequence token of the
+                  prompt is removed.
+          Returns:
+              string. The return value is sent to tokenizer.encode directly.
+          """
+          print(f'Any modification can be done for {messages}')
+          return str(messages)  # just a dummy conversion.
+
+
+  pipe = pipeline('internlm/internlm2-chat-7b',
+                  chat_template_config=ChatTemplateConfig('customized_model'))
+
+  response = pipe('hi')
+  print(response)  # text completion in this case because of customized_model
+  ```
+  In this example, the registered chat template simply returns the input prompt as is, or converts the chat history directly into a string. You need to fill in the actual template logic yourself, ideally covering both input forms, so that the initialized pipeline can handle both string prompts and OpenAI-format chat histories.
+- The other way is to pass in a [Huggingface chat template](https://huggingface.co/docs/transformers/main/en/chat_templating), i.e. a Jinja template.
+  It can be used from a Python script as follows:
+  ```python
+  from lmdeploy import ChatTemplateConfig, pipeline
+
+  pipe = pipeline('internlm/internlm2-chat-7b',
+                  chat_template_config=ChatTemplateConfig(
+                      jinja_template='jinja_template_str_or_file'))
+
+  response = pipe([[{
+      'role': 'user',
+      'content': 'Hi, pls intro yourself'
+  }], [{
+      'role': 'user',
+      'content': 'Shanghai is'
+  }]])
+  print(response)  # Jinja template can only handle OpenAI format chat history
+  ```
+  Note that once a Jinja template is passed in, the pipeline can only accept chat history in the OpenAI format as input.
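+  For reference, a minimal Jinja template could look like the sketch below. It is only an illustration, not the official InternLM2 chat template; adjust the role names and special tokens to the model you deploy.
+  ```python
+  # An illustrative Jinja chat template string, passed via ChatTemplateConfig(jinja_template=...)
+  jinja_template = (
+      "{% for message in messages %}"
+      "{{ message['role'] }}: {{ message['content'] }}\n"
+      "{% endfor %}"
+      "assistant:"
+  )
+  ```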
+
 ## FAQs
 
 - *RuntimeError: context has already been set*. If you got this for tp>1 in pytorch backend. Please make sure the python script has following
diff --git a/docs/en/serving/restful_api.md b/docs/en/serving/restful_api.md
index 9087a98fc..c10b32bf5 100644
--- a/docs/en/serving/restful_api.md
+++ b/docs/en/serving/restful_api.md
@@ -161,6 +161,68 @@ openaoe -f /path/to/your/config-template.yaml
 
 Please refer to the [guidance](https://github.com/InternLM/OpenAOE/blob/main/docs/tech-report/model_serving_by_lmdeploy/model_serving_by_lmdeploy.md) for more deploy information.
 
+### Customize the chat template
+
+LMDeploy supports two forms of chat templates:
+
+- The first approach is to customize a Python chat template class in the same way as the existing LMDeploy chat templates; once registered, it can be used directly. The advantage is a high degree of customization and strong controllability. Below is an example of registering an LMDeploy chat template.
+
+  ```python
+  from typing import Dict, Union
+
+  from lmdeploy import ChatTemplateConfig, serve
+  from lmdeploy.model import MODELS, BaseModel
+
+
+  @MODELS.register_module(name='customized_model')
+  class CustomizedModel(BaseModel):
+      """A customized chat template."""
+
+      def messages2prompt(self,
+                          messages: Union[str, Dict],
+                          sequence_start: bool = True) -> str:
+          """Apply the chat template to the input messages.
+
+          Args:
+              messages (str | Dict): input messages. Could be a str prompt or
+                  OpenAI format chat history. The former is for interactive chat.
+              sequence_start (bool): Only used in interactive chatting. When
+                  sequence_start is False, the begin-of-sequence token of the
+                  prompt is removed.
+          Returns:
+              string. The return value is sent to tokenizer.encode directly.
+          """
+          print(f'Any modification can be done for {messages}')
+          return str(messages)  # just a dummy conversion.
+
+
+  client = serve('internlm/internlm2-chat-7b',
+                 chat_template_config=ChatTemplateConfig('customized_model'))
+  for item in client.chat_completions_v1('customized_model', [{
+      'role': 'user',
+      'content': 'hi'
+  }]):
+      print(item)
+  ```
+
+  In this example, the registered chat template simply returns the input prompt as is, or converts the chat history directly into a string. You need to fill in the actual template logic yourself, ideally covering both input forms. Once the service is started this way, all of its endpoints can be used.
+
+- The other approach is to use a [Huggingface chat template](https://huggingface.co/docs/transformers/main/en/chat_templating), i.e. a Jinja template.
+  The service can be started either from the command line or from a Python script that calls LMDeploy's API.
+
+  ```shell
+  lmdeploy serve api_server internlm/internlm2-chat-7b --jinja-template ${JINJA_STR_OR_FILE}
+  ```
+
+  Or, from a Python script:
+
+  ```python
+  from lmdeploy import ChatTemplateConfig, serve
+
+  serve('internlm/internlm2-chat-7b',
+        chat_template_config=ChatTemplateConfig(
+            jinja_template='jinja_template_str_or_file'),
+        block=True)
+  ```
+
+  Note that when a Jinja template is used, it is best to first query the served model name through the `/v1/models` endpoint before calling the other endpoints. In addition, Jinja templates can only handle inputs in the OpenAI format, which means only the OpenAI-style interfaces of the service can be used.
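+
+  For example, a client can first look up the served model name and then call the OpenAI-style chat endpoint. The snippet below is a sketch that assumes the server is reachable at the default `http://0.0.0.0:23333` address used by `lmdeploy.serve`:
+
+  ```python
+  from lmdeploy.serve.openai.api_client import APIClient
+
+  client = APIClient('http://0.0.0.0:23333')
+  model_name = client.available_models[0]  # the name returned by /v1/models
+  for item in client.chat_completions_v1(model_name, [{
+      'role': 'user',
+      'content': 'hi'
+  }]):
+      print(item)
+  ```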
+
 ### FAQ
 
 1. When user got `"finish_reason":"length"`, it means the session is too long to be continued. The session length can be
diff --git a/docs/zh_cn/inference/pipeline.md b/docs/zh_cn/inference/pipeline.md
index 26b94381e..c26583a24 100644
--- a/docs/zh_cn/inference/pipeline.md
+++ b/docs/zh_cn/inference/pipeline.md
@@ -233,6 +233,74 @@ print(response)
 | stop_words | List\[str\] | 停止进一步生成令牌的词。 | None |
 | bad_words | List\[str\] | 引擎永远不会生成的词。 | None |
 
+## 自定义对话模板
+
+LMDeploy 支持两种添加对话模板的形式:
+
+- 一种是参照 LMDeploy 现有对话模板,自定义一个 Python 对话模板类,注册成功后直接使用即可。优点是自定义程度高,可控性强。
+  下面是一个注册 LMDeploy 对话模板的例子:
+
+  ```python
+  from typing import Dict, Union
+
+  from lmdeploy import ChatTemplateConfig, pipeline
+  from lmdeploy.model import MODELS, BaseModel
+
+
+  @MODELS.register_module(name='customized_model')
+  class CustomizedModel(BaseModel):
+      """A customized chat template."""
+
+      def messages2prompt(self,
+                          messages: Union[str, Dict],
+                          sequence_start: bool = True) -> str:
+          """Apply the chat template to the input messages.
+
+          Args:
+              messages (str | Dict): input messages. Could be a str prompt or
+                  OpenAI format chat history. The former is for interactive chat.
+              sequence_start (bool): Only used in interactive chatting. When
+                  sequence_start is False, the begin-of-sequence token of the
+                  prompt is removed.
+          Returns:
+              string. The return value is sent to tokenizer.encode directly.
+          """
+          print(f'Any modification can be done for {messages}')
+          return str(messages)  # just a dummy conversion.
+
+
+  pipe = pipeline('internlm/internlm2-chat-7b',
+                  chat_template_config=ChatTemplateConfig('customized_model'))
+
+  response = pipe('hi')
+  print(response)  # text completion in this case because of customized_model
+  ```
+
+  在这个例子中,我们注册了一个 LMDeploy 的对话模板,该模板只是将输入的 prompt 原样返回,或者
+  将对话历史直接转成一个字符串。真正的对话模板逻辑需要用户自行实现,最好对两种输入情况都考虑到。
+  这样 pipeline 初始化后,既能处理 string 输入,又能处理 OpenAI 格式的对话历史输入。
+
+- 另一种是传入 [Huggingface 的对话模板](https://huggingface.co/docs/transformers/main/en/chat_templating),即 Jinja 模板。
+  在 python 脚本中的使用方式如下:
+
+  ```python
+  from lmdeploy import ChatTemplateConfig, pipeline
+
+  pipe = pipeline('internlm/internlm2-chat-7b',
+                  chat_template_config=ChatTemplateConfig(
+                      jinja_template='jinja_template_str_or_file'))
+
+  response = pipe([[{
+      'role': 'user',
+      'content': 'Hi, pls intro yourself'
+  }], [{
+      'role': 'user',
+      'content': 'Shanghai is'
+  }]])
+  print(response)  # Jinja template can only handle OpenAI format chat history
+  ```
+
+  需要注意的是,传入 Jinja 模板后,pipeline 只能接受 OpenAI 格式的对话历史作为输入。
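+
+  作为参考,下面给出一个极简的 Jinja 模板写法示意(仅作示意,并非 InternLM2 的官方模板,角色名和特殊 token 需按所用模型自行调整):
+
+  ```python
+  # An illustrative Jinja chat template string, passed via ChatTemplateConfig(jinja_template=...)
+  jinja_template = (
+      "{% for message in messages %}"
+      "{{ message['role'] }}: {{ message['content'] }}\n"
+      "{% endfor %}"
+      "assistant:"
+  )
+  ```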
+
 ## FAQs
 
 - *RuntimeError: context has already been set*.
   如果你在使用 tp>1 和 pytorch 后端的时候,遇到了这个错误。请确保 python 脚本中有下面内容作为入口
diff --git a/docs/zh_cn/serving/restful_api.md b/docs/zh_cn/serving/restful_api.md
index 931c4a21a..ee1351d8a 100644
--- a/docs/zh_cn/serving/restful_api.md
+++ b/docs/zh_cn/serving/restful_api.md
@@ -161,7 +161,68 @@ openaoe -f /path/to/your/config-template.yaml
 
 LMDeploy 支持两种添加对话模板的形式:
 
 - 一种是以 LMDeploy 现有对话模板,自定义一个python对话模板类,注册成功后直接用即可。优点是自定义程度高,可控性强。
-- 一种是传入 Huggingface 的对话模板,即 Jinja 模板。
+  下面是一个注册 LMDeploy 对话模板的例子:
+
+  ```python
+  from typing import Dict, Union
+
+  from lmdeploy import ChatTemplateConfig, serve
+  from lmdeploy.model import MODELS, BaseModel
+
+
+  @MODELS.register_module(name='customized_model')
+  class CustomizedModel(BaseModel):
+      """A customized chat template."""
+
+      def messages2prompt(self,
+                          messages: Union[str, Dict],
+                          sequence_start: bool = True) -> str:
+          """Apply the chat template to the input messages.
+
+          Args:
+              messages (str | Dict): input messages. Could be a str prompt or
+                  OpenAI format chat history. The former is for interactive chat.
+              sequence_start (bool): Only used in interactive chatting. When
+                  sequence_start is False, the begin-of-sequence token of the
+                  prompt is removed.
+          Returns:
+              string. The return value is sent to tokenizer.encode directly.
+          """
+          print(f'Any modification can be done for {messages}')
+          return str(messages)  # just a dummy conversion.
+
+
+  client = serve('internlm/internlm2-chat-7b',
+                 chat_template_config=ChatTemplateConfig('customized_model'))
+  for item in client.chat_completions_v1('customized_model', [{
+      'role': 'user',
+      'content': 'hi'
+  }]):
+      print(item)
+  ```
+
+  在这个例子中,我们注册了一个 LMDeploy 的对话模板,该模板只是将输入的 prompt 原样返回,或者
+  将对话历史直接转成一个字符串。真正的对话模板逻辑需要用户自行实现,最好对两种输入情况都考虑到。
+  这样启动的服务,各个接口都可以使用。
+
+- 另一种是传入 [Huggingface 的对话模板](https://huggingface.co/docs/transformers/main/en/chat_templating),即 Jinja 模板。
+  可以通过命令行直接传参启动,也可以在 python 脚本中通过 LMDeploy 的 API 函数传参启动。
+
+  ```shell
+  lmdeploy serve api_server internlm/internlm2-chat-7b --jinja-template ${JINJA_STR_OR_FILE}
+  ```
+
+  通过 python 脚本启动为:
+
+  ```python
+  from lmdeploy import ChatTemplateConfig, serve
+
+  serve('internlm/internlm2-chat-7b',
+        chat_template_config=ChatTemplateConfig(
+            jinja_template='jinja_template_str_or_file'),
+        block=True)
+  ```
+
+  需要注意的是,传入 Jinja 模板后,服务各 endpoint 所需的模型名字最好先通过 `/v1/models` 接口查询。此外,Jinja 模板只能处理 OpenAI
+  格式的输入,这意味着只能使用服务的 OpenAI 接口。
 
 ### FAQ
diff --git a/lmdeploy/api.py b/lmdeploy/api.py
index 34f2be9b7..259343628 100644
--- a/lmdeploy/api.py
+++ b/lmdeploy/api.py
@@ -76,9 +76,10 @@ def serve(model_path: str,
           chat_template_config: Optional[ChatTemplateConfig] = None,
           server_name: str = '0.0.0.0',
           server_port: int = 23333,
-          log_level: str = 'ERROR',
+          log_level: str = 'WARNING',
           api_keys: Optional[Union[List[str], str]] = None,
           ssl: bool = False,
+          block: bool = False,
           **kwargs):
     """This will run the api_server in a subprocess.
 
@@ -111,9 +112,13 @@ def serve(model_path: str,
         api_keys (List[str] | str | None): Optional list of API keys. Accepts string type as
             a single api_key. Default to None, which means no api key applied.
         ssl (bool): Enable SSL. Requires OS Environment variables 'SSL_KEYFILE' and 'SSL_CERTFILE'.
+        block (bool): If True, block the current python process and run the server
+            in it. Otherwise, run the server in a subprocess and return a client.
 
     Return:
-        APIClient: A client chatbot for LLaMA series models.
+        APIClient or None: If block is False, return a client chatbot for LLaMA
+            series models. Otherwise, return None.
Examples: >>> import lmdeploy @@ -131,19 +136,23 @@ def serve(model_path: str, kwargs.pop('tp') else: tp = 1 if backend_config is None else backend_config.tp + serve_kwargs = dict(model_name=model_name, + backend=backend, + backend_config=backend_config, + chat_template_config=chat_template_config, + server_name=server_name, + server_port=server_port, + tp=tp, + log_level=log_level, + api_keys=api_keys, + ssl=ssl, + **kwargs) + if block: + serve(model_path, **serve_kwargs) + return task = Process(target=serve, args=(model_path, ), - kwargs=dict(model_name=model_name, - backend=backend, - backend_config=backend_config, - chat_template_config=chat_template_config, - server_name=server_name, - server_port=server_port, - tp=tp, - log_level=log_level, - api_keys=api_keys, - ssl=ssl, - **kwargs), + kwargs=serve_kwargs, daemon=True) task.start() client = APIClient(f'http://{server_name}:{server_port}') diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 2cc8ba363..737ade0c8 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -105,6 +105,7 @@ def _load_chat_template(self, chat_template_config: ChatTemplateConfig): # if model_name is given, lmdeploy template will be applied # no matter what Jinja template if chat_template_config and chat_template_config.model_name: + self.model_name = chat_template_config.model_name return # if no model_name passed in, will choose tokenizer's template # it could be a Jinja if it exists in tokenizer_config.json @@ -488,6 +489,10 @@ async def generate( finish_reason = None if self.id2step[str(session_id)] + len( input_ids) + gen_config.max_new_tokens >= self.session_len: + logger.warning(f'The maximum session len is reached. Step: ' + f'{self.id2step[str(session_id)]}, input len: ' + f'{len(input_ids)}, request out len: ' + f'{gen_config.max_new_tokens}.') finish_reason = 'length' yield GenOut('', self.id2step[str(session_id)], len(input_ids), 0, finish_reason) diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 8d1bf2092..3e8bb7adc 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -72,7 +72,7 @@ def get_model_list(): Only provided one now. """ - return [VariableInterface.async_engine.engine.model_name] + return [VariableInterface.async_engine.model_name] @app.get('/v1/models', dependencies=[Depends(check_api_key)]) @@ -928,7 +928,7 @@ def serve(model_path: str, allow_credentials: bool = True, allow_methods: List[str] = ['*'], allow_headers: List[str] = ['*'], - log_level: str = 'ERROR', + log_level: str = 'WARNING', api_keys: Optional[Union[List[str], str]] = None, ssl: bool = False, qos_config_path: str = '', diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 3c2a9b6ab..a73b41924 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -235,7 +235,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) FT_CHECK(!state.requests[idx]); if (rank_ == 0) { - TM_LOG_WARNING("[ProcessInferRequests] Request for %ld received.", (long)r->id); + TM_LOG_INFO("[ProcessInferRequests] Request for %ld received.", (long)r->id); } state.requests[idx] = r;