
Commit

merge main
grimoire committed Feb 5, 2024
2 parents 80d9dc7 + c332efa commit f9444e7
Showing 34 changed files with 526 additions and 218 deletions.
2 changes: 1 addition & 1 deletion .github/CONTRIBUTING.md
@@ -210,7 +210,7 @@ The config for a pre-commit hook is stored in [.pre-commit-config](../.pre-commi

#### C++ and CUDA

The clang-format config is stored in [.clang-format](../.clang-format).
The clang-format config is stored in [.clang-format](../.clang-format). We recommend clang-format version **11**. Please do not use older or newer versions, as they produce formatting differences that can cause the [lint](https://github.com/InternLM/lmdeploy/blob/main/.github/workflows/lint.yml#L25) check to fail.

### PR Specs

5 changes: 5 additions & 0 deletions README.md
@@ -126,6 +126,11 @@ response = pipe(["Hi, pls intro yourself", "Shanghai is"])
print(response)
```

> \[!NOTE\]
> By default, LMDeploy downloads models from HuggingFace. If you would like to use models from ModelScope, please install ModelScope with `pip install modelscope` and set the environment variable:
>
> `export LMDEPLOY_USE_MODELSCOPE=True`
For more information about the inference pipeline, please refer to [here](./docs/en/inference/pipeline.md). A minimal sketch of the ModelScope switch described in the note above follows; the model id is a placeholder, and the environment variable can equally be exported in the shell before launching Python.
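
```python
import os

# Equivalent to `export LMDEPLOY_USE_MODELSCOPE=True`: tell LMDeploy to pull
# weights from ModelScope instead of HuggingFace for this process.
os.environ['LMDEPLOY_USE_MODELSCOPE'] = 'True'

from lmdeploy import pipeline

# Placeholder model id; substitute the ModelScope repo you actually want to serve.
pipe = pipeline('internlm/internlm2-chat-7b')
response = pipe(['Hi, pls intro yourself', 'Shanghai is'])
print(response)
```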

# Tutorials
5 changes: 5 additions & 0 deletions README_zh-CN.md
@@ -127,6 +127,11 @@ response = pipe(["Hi, pls intro yourself", "Shanghai is"])
print(response)
```

> \[!NOTE\]
> By default, LMDeploy downloads models from HuggingFace. To download models from ModelScope instead, please install ModelScope with `pip install modelscope` and set the environment variable:
>
> `export LMDEPLOY_USE_MODELSCOPE=True`
For more details about the pipeline's inference parameters, please refer to [here](./docs/zh_cn/inference/pipeline.md).

# 用户教程
86 changes: 51 additions & 35 deletions docs/en/inference/pipeline.md

Large diffs are not rendered by default.

40 changes: 28 additions & 12 deletions docs/zh_cn/inference/pipeline.md
@@ -171,6 +171,16 @@ print(response)
| repetition_penalty | float | 1.0 | Parameter for repetition penalty. 1.0 means no penalty. Will be deprecated later; please use the gen_config parameter instead |
| ignore_eos | bool | False | Indicator of whether to ignore the end-of-sequence token. Will be deprecated later; please use the gen_config parameter instead |

### Response

| Parameter          | Type                                    | Description                                                                                                                                         |
| ------------------ | --------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- |
| text               | str                                     | The response text from the server. If the output text is an empty string and finish_reason is length, the maximum session length has been reached. |
| generate_token_len | int                                     | The number of tokens in the response.                                                                                                               |
| input_token_len    | int                                     | The number of tokens in the input prompt. Note that it may contain the chat template part.                                                          |
| session_id         | int                                     | The ID of the running session. Basically, it refers to the position index of the input request batch.                                               |
| finish_reason      | Optional\[Literal\['stop', 'length'\]\] | The reason the model stopped generating tokens: 'stop' if the model hit a stop word, or 'length' if the maximum token number specified in the request or session_len was reached. |
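
A minimal sketch of how these fields can be read after a batched pipeline call; the model id is a placeholder and the field names follow the table above:

```python
from lmdeploy import pipeline

pipe = pipeline('internlm/internlm2-chat-7b')  # placeholder model id
responses = pipe(['Hi, pls intro yourself', 'Shanghai is'])

for r in responses:
    # session_id is the position index of the prompt in the input batch.
    print(r.session_id, r.finish_reason, r.generate_token_len, r.input_token_len)
    if r.text == '' and r.finish_reason == 'length':
        print('Maximum session length reached before any text was generated.')
```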

## TurbomindEngineConfig

### Description
@@ -190,6 +200,8 @@ print(response)
| quant_policy        | int           | Defaults to 0. Set it to 4 when k/v is quantized into 8 bit.                                                | 0     |
| rope_scaling_factor | float         | Scaling factor used for dynamic NTK. TurboMind follows the implementation of transformers' LlamaAttention. | 0.0   |
| use_logn_attn       | bool          | Whether to use log-n attention.                                                                             | False |
| download_dir        | str, optional | Directory for caching the model.                                                                            | None  |
| revision            | str, optional | Git revision id; it can be a branch name, a tag, or a commit id.                                            | None  |
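
A short sketch of the two new fields; the cache directory and model id below are examples only:

```python
from lmdeploy import pipeline, TurbomindEngineConfig

# Cache weights under an explicit directory and pin a specific git revision.
backend_config = TurbomindEngineConfig(download_dir='/data/llm_cache',  # example path
                                       revision='main')                 # branch, tag or commit id
pipe = pipeline('internlm/internlm2-chat-7b', backend_config=backend_config)
```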

## PytorchEngineConfig

@@ -211,6 +223,8 @@ print(response)
| num_cpu_blocks | int  | Number of CPU blocks. If the value is 0, the cache will be allocated according to the current environment. | 0    |
| num_gpu_blocks | int  | Number of GPU blocks. If the value is 0, the cache will be allocated according to the current environment. | 0    |
| adapters       | dict | Path configs of the LoRA adapters.                                                                         | None |
| download_dir   | str  | Directory for caching the model.                                                                           | None |
| revision       | str  | Git revision id; it can be a branch name, a tag, or a commit id.                                           | None |
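
The PyTorch engine accepts the same two fields, plus the adapters mapping; all paths and the adapter name below are placeholders:

```python
from lmdeploy import pipeline, PytorchEngineConfig

backend_config = PytorchEngineConfig(
    adapters={'demo_lora': '/data/adapters/demo_lora'},  # placeholder LoRA path
    download_dir='/data/llm_cache',                      # placeholder cache dir
    revision='main')
pipe = pipeline('internlm/internlm2-chat-7b', backend_config=backend_config)
```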

## GenerationConfig

@@ -220,18 +234,20 @@ print(response)

### Parameters

| Parameter          | Type        | Description                                                                                                           | Default |
| ------------------ | ----------- | --------------------------------------------------------------------------------------------------------------------- | ------- |
| n                  | int         | Number of chat completion choices to generate for each input message. Currently only 1 is supported                   | 1       |
| max_new_tokens     | int         | Maximum number of tokens that can be generated in the chat completion.                                                | 512     |
| top_p              | float       | Nucleus sampling, where the model considers the tokens with top_p probability mass.                                   | 1.0     |
| top_k              | int         | The model considers the top K tokens with the highest probability.                                                    | 1       |
| temperature        | float       | Sampling temperature.                                                                                                  | 0.8     |
| repetition_penalty | float       | Penalty to prevent the model from generating repeated words or phrases. A value greater than 1 suppresses repetition. | 1.0     |
| ignore_eos         | bool        | Whether to ignore eos_token_id.                                                                                        | False   |
| random_seed        | int         | Seed used when sampling tokens.                                                                                        | None    |
| stop_words         | List\[str\] | Words that stop further token generation.                                                                              | None    |
| bad_words          | List\[str\] | Words that the engine will never generate.                                                                             | None    |
| Parameter           | Type        | Description                                                                                                           | Default |
| ------------------- | ----------- | --------------------------------------------------------------------------------------------------------------------- | ------- |
| n                   | int         | Number of chat completion choices to generate for each input message. Currently only 1 is supported                   | 1       |
| max_new_tokens      | int         | Maximum number of tokens that can be generated in the chat completion.                                                | 512     |
| top_p               | float       | Nucleus sampling, where the model considers the tokens with top_p probability mass.                                   | 1.0     |
| top_k               | int         | The model considers the top K tokens with the highest probability.                                                    | 1       |
| temperature         | float       | Sampling temperature.                                                                                                  | 0.8     |
| repetition_penalty  | float       | Penalty to prevent the model from generating repeated words or phrases. A value greater than 1 suppresses repetition. | 1.0     |
| ignore_eos          | bool        | Whether to ignore eos_token_id.                                                                                        | False   |
| random_seed         | int         | Seed used when sampling tokens.                                                                                        | None    |
| stop_words          | List\[str\] | Words that stop further token generation.                                                                              | None    |
| bad_words           | List\[str\] | Words that the engine will never generate.                                                                             | None    |
| min_new_tokens      | int         | Minimum number of tokens to generate.                                                                                  | None    |
| skip_special_tokens | bool        | Whether to skip special tokens during decoding.                                                                        | True    |
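
A sketch of the two newly documented fields, assuming `gen_config` is passed to the pipeline call as in the pipeline examples; the model id and values are illustrative:

```python
from lmdeploy import pipeline, GenerationConfig

pipe = pipeline('internlm/internlm2-chat-7b')  # placeholder model id

gen_config = GenerationConfig(
    max_new_tokens=256,
    min_new_tokens=8,           # generate at least a few tokens before stopping
    skip_special_tokens=False)  # keep special tokens in the decoded output
response = pipe(['Hi, pls intro yourself'], gen_config=gen_config)
print(response)
```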

## FAQs

3 changes: 2 additions & 1 deletion lmdeploy/api.py
@@ -44,7 +44,8 @@ def pipeline(model_path: str,
>>> print(response)
""" # noqa E501
from lmdeploy.serve.async_engine import AsyncEngine
os.environ['TM_LOG_LEVEL'] = log_level
if os.getenv('TM_LOG_LEVEL') is None:
os.environ['TM_LOG_LEVEL'] = log_level
from lmdeploy.utils import get_logger
logger = get_logger('lmdeploy')
logger.setLevel(log_level)
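
The change above only fills in `TM_LOG_LEVEL` when it is not already set, so a value exported by the user now survives the `pipeline()` call. A rough sketch of the intended behaviour, with a placeholder model id:

```python
import os

# A value exported by the user before the script runs ...
os.environ['TM_LOG_LEVEL'] = 'WARNING'

from lmdeploy import pipeline

# ... is no longer overwritten by the pipeline's own log_level argument.
pipe = pipeline('internlm/internlm2-chat-7b', log_level='ERROR')
assert os.environ['TM_LOG_LEVEL'] == 'WARNING'
```
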
33 changes: 32 additions & 1 deletion lmdeploy/messages.py
@@ -32,6 +32,8 @@ class GenerationConfig:
bad_words (List[str]): Words that the engine will never generate
min_new_tokens (int): The minimum numbers of tokens to generate,
ignoring the number of tokens in the prompt.
skip_special_tokens (bool): Whether or not to remove special tokens
in the decoding. Default to be True.
"""

n: int = 1
Expand All @@ -45,6 +47,7 @@ class GenerationConfig:
stop_words: List[str] = None
bad_words: List[str] = None
min_new_tokens: int = None
skip_special_tokens: bool = True


@dataclass
@@ -90,6 +93,7 @@ def special_word_token_ids(words):
repetition_penalty=gen_config.repetition_penalty,
ignore_eos=gen_config.ignore_eos,
random_seed=gen_config.random_seed,
skip_special_tokens=gen_config.skip_special_tokens,
stop_words=special_word_token_ids(gen_config.stop_words),
bad_words=special_word_token_ids(gen_config.bad_words))

@@ -110,6 +114,8 @@ class TurbomindEngineConfig:
quant_policy (int): default to 0. When k/v is quantized into 8 bit, set it to 4
rope_scaling_factor (int): scaling factor used for dynamic ntk, default to 0. TurboMind follows the implementation of transformer LlamaAttention
use_logn_attn (bool): whether or not to use log attn: default to False
download_dir (str): Directory to download and load the weights, default to the default cache directory of huggingface.
revision (str): The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.
""" # noqa: E501

model_name: Optional[str] = None
@@ -121,6 +127,8 @@ class TurbomindEngineConfig:
quant_policy: int = 0
rope_scaling_factor: float = 0.0
use_logn_attn: bool = False
download_dir: Optional[str] = None
revision: Optional[str] = None


@dataclass
@@ -143,6 +151,11 @@ class PytorchEngineConfig:
would be allocated according to current environment.
adapters (dict): The path configs to lora adapters.
max_prefill_token_num (int): tokens per iteration.
download_dir (str): Directory to download and load the weights,
default to the default cache directory of huggingface.
revision (str): The specific model version to use.
It can be a branch name, a tag name, or a commit id.
If unspecified, will use the default version.
"""
model_name: str = ''
tp: int = 1
@@ -155,6 +168,8 @@ class PytorchEngineConfig:
num_gpu_blocks: int = 0
adapters: Dict[str, str] = None
max_prefill_token_num: int = 16384
download_dir: str = None
revision: str = None


class ResponseType(enum.Enum):
@@ -170,8 +185,24 @@ class ResponseType(enum.Enum):

@dataclass
class Response:
"""Pack all response information together."""
"""Pack all response information together.
Args:
text (str): the response text from the server. If the output text is
an empty str and the finish_reason is length, it means the session
length is reached.
generate_token_len (int): the response token length.
input_token_len (int): the input prompt token length. Note that it may
contain the chat template part.
session_id (int): the id for running the session. Basically, it refers
to the position index of the input request batch.
finish_reason ('stop' | 'length' | None): the reason the model stopped
generating tokens. This will be 'stop' if the model hit a natural
stop point or a provided stop sequence, 'length' if the maximum
number of tokens specified in the request was reached.
"""
text: str
generate_token_len: int
input_token_len: int
session_id: int
finish_reason: Optional[Literal['stop', 'length']] = None