Support torch cache_max_entry_count (#1166)
* max entry only

* update cli

---------
Co-authored-by: RunningLeon <[email protected]>
grimoire authored Feb 20, 2024
1 parent bdaf092 commit c71db87
Showing 8 changed files with 59 additions and 42 deletions.
29 changes: 15 additions & 14 deletions docs/en/inference/pipeline.md
@@ -211,20 +211,21 @@ This class provides the configuration parameters for Pytorch backend.

### Arguments

| Parameter | Type | Description | Default |
| ---------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------- | ----------- |
| model_name | str | The chat template name of the deployed model | '' |
| tp | int | Tensor Parallelism. | 1 |
| session_len | int | Maximum session length. | None |
| max_batch_size | int | Maximum batch size. | 128 |
| eviction_type | str | Action to perform when kv cache is full. Options are \['recompute', 'copy'\]. | 'recompute' |
| prefill_interval | int | Interval to perform prefill. | 16 |
| block_size | int | Paging cache block size. | 64 |
| num_cpu_blocks | int | Number of CPU blocks. If the number is 0, cache would be allocated according to the current environment. | 0 |
| num_gpu_blocks | int | Number of GPU blocks. If the number is 0, cache would be allocated according to the current environment. | 0 |
| adapters | dict | The path configs to lora adapters. | None |
| download_dir | str | Directory to download and load the weights, default to the default cache directory of huggingface. | None |
| revision | str | The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. | None |
| Parameter | Type | Description | Default |
| --------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------- | ----------- |
| model_name | str | The chat template name of the deployed model | '' |
| tp | int | Tensor Parallelism. | 1 |
| session_len | int | Maximum session length. | None |
| max_batch_size | int | Maximum batch size. | 128 |
| cache_max_entry_count | float | The percentage of free GPU memory occupied by the k/v cache. | 0.8 |
| eviction_type | str | Action to perform when kv cache is full. Options are \['recompute', 'copy'\]. | 'recompute' |
| prefill_interval | int | Interval to perform prefill. | 16 |
| block_size | int | Paging cache block size. | 64 |
| num_cpu_blocks | int | Number of CPU blocks. If the number is 0, cache would be allocated according to the current environment. | 0 |
| num_gpu_blocks | int | Number of GPU blocks. If the number is 0, cache would be allocated according to the current environment. | 0 |
| adapters | dict | The path configs to lora adapters. | None |
| download_dir | str | Directory to download and load the weights, default to the default cache directory of huggingface. | None |
| revision | str | The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. | None |
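
The new `cache_max_entry_count` row is the headline change here. A minimal usage sketch against the pipeline API documented on this page; the model id below is only a placeholder:

```python
from lmdeploy import pipeline, PytorchEngineConfig

# Reserve ~50% of the GPU memory that is still free after the weights are
# loaded for the k/v cache (the default introduced by this commit is 0.8).
backend_config = PytorchEngineConfig(cache_max_entry_count=0.5)

pipe = pipeline('internlm/internlm2-chat-7b',  # placeholder model id/path
                backend_config=backend_config)
response = pipe(['Hi, please introduce yourself'])
print(response)
```

Lowering the ratio is the usual remedy when the PyTorch backend runs short of memory for activations; raising it trades that headroom for longer sessions and larger batches.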

## GenerationConfig

29 changes: 15 additions & 14 deletions docs/zh_cn/inference/pipeline.md
@@ -211,20 +211,21 @@ print(response)

### Arguments

| Parameter        | Type | Description                                                                                            | Default     |
| ---------------- | ---- | ------------------------------------------------------------------------------------------------------ | ----------- |
| model_name       | str  | The chat template name of the deployed model.                                                            | ''          |
| tp               | int  | Degree of tensor parallelism.                                                                             | 1           |
| session_len      | int  | Maximum session length.                                                                                   | None        |
| max_batch_size   | int  | Maximum batch size.                                                                                       | 128         |
| eviction_type    | str  | Action to perform when the kv cache is full. Options are \['recompute', 'copy'\].                         | 'recompute' |
| prefill_interval | int  | Interval at which prefill is performed.                                                                   | 16          |
| block_size       | int  | Paging cache block size.                                                                                  | 64          |
| num_cpu_blocks   | int  | Number of CPU blocks. If the value is 0, the cache is allocated according to the current environment.     | 0           |
| num_gpu_blocks   | int  | Number of GPU blocks. If the value is 0, the cache is allocated according to the current environment.     | 0           |
| adapters         | dict | Path configs of the lora adapters.                                                                        | None        |
| download_dir     | str  | Model cache directory.                                                                                    | None        |
| revision         | str  | Git revision id; can be a branch name, a tag, or a commit id.                                             | None        |
| Parameter             | Type  | Description                                                                                            | Default     |
| --------------------- | ----- | ------------------------------------------------------------------------------------------------------ | ----------- |
| model_name            | str   | The chat template name of the deployed model.                                                            | ''          |
| tp                    | int   | Degree of tensor parallelism.                                                                             | 1           |
| session_len           | int   | Maximum session length.                                                                                   | None        |
| max_batch_size        | int   | Maximum batch size.                                                                                       | 128         |
| cache_max_entry_count | float | The percentage of free GPU memory occupied by the k/v cache.                                              | 0.8         |
| eviction_type         | str   | Action to perform when the kv cache is full. Options are \['recompute', 'copy'\].                         | 'recompute' |
| prefill_interval      | int   | Interval at which prefill is performed.                                                                   | 16          |
| block_size            | int   | Paging cache block size.                                                                                  | 64          |
| num_cpu_blocks        | int   | Number of CPU blocks. If the value is 0, the cache is allocated according to the current environment.     | 0           |
| num_gpu_blocks        | int   | Number of GPU blocks. If the value is 0, the cache is allocated according to the current environment.     | 0           |
| adapters              | dict  | Path configs of the lora adapters.                                                                        | None        |
| download_dir          | str   | Model cache directory.                                                                                    | None        |
| revision              | str   | Git revision id; can be a branch name, a tag, or a commit id.                                             | None        |

## GenerationConfig

11 changes: 7 additions & 4 deletions lmdeploy/cli/chat.py
@@ -30,6 +30,7 @@ def add_parser_torch():
ArgumentHelper.tp(engine_group)
ArgumentHelper.session_len(engine_group)
ArgumentHelper.adapters(engine_group)
ArgumentHelper.cache_max_entry_count(engine_group)

# other args
parser.add_argument('--trust-remote-code',
@@ -73,10 +74,12 @@ def torch(args):
from lmdeploy.messages import PytorchEngineConfig
from lmdeploy.pytorch.chat import run_chat
adapters = get_lora_adapters(args.adapters)
engine_config = PytorchEngineConfig(model_name=args.model_name,
tp=args.tp,
session_len=args.session_len,
adapters=adapters)
engine_config = PytorchEngineConfig(
model_name=args.model_name,
tp=args.tp,
session_len=args.session_len,
cache_max_entry_count=args.cache_max_entry_count,
adapters=adapters)
run_chat(args.model_path,
engine_config,
trust_remote_code=args.trust_remote_code)
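
`ArgumentHelper.cache_max_entry_count` itself lives in the CLI utilities and is not part of this diff; judging from how `args.cache_max_entry_count` is consumed above, it presumably registers a float flag roughly like the sketch below (the option string, default, and help text are assumptions, not taken from the diff):

```python
import argparse


def cache_max_entry_count(group: argparse._ArgumentGroup) -> argparse.Action:
    """Hypothetical stand-in for ArgumentHelper.cache_max_entry_count."""
    return group.add_argument(
        '--cache-max-entry-count',
        type=float,
        default=0.8,  # mirrors the PytorchEngineConfig default added by this commit
        help='The percentage of free GPU memory occupied by the k/v cache.')
```

If the flag is spelled that way, the new knob becomes reachable from the `torch` chat subcommand, e.g. `lmdeploy chat torch <model_path> --cache-max-entry-count 0.5`.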
8 changes: 6 additions & 2 deletions lmdeploy/cli/serve.py
@@ -53,6 +53,7 @@ def add_parser_gradio():
model_name_act = ArgumentHelper.model_name(pt_group)
session_len_act = ArgumentHelper.session_len(pt_group)
max_batch_size_act = ArgumentHelper.max_batch_size(pt_group)
cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group)

# turbomind args
tb_group = parser.add_argument_group('TurboMind engine arguments')
@@ -61,10 +62,10 @@ def add_parser_gradio():
tb_group._group_actions.append(model_name_act)
tb_group._group_actions.append(session_len_act)
tb_group._group_actions.append(max_batch_size_act)
tb_group._group_actions.append(cache_max_entry_act)
ArgumentHelper.model_format(tb_group)
ArgumentHelper.quant_policy(tb_group)
ArgumentHelper.rope_scaling_factor(tb_group)
ArgumentHelper.cache_max_entry_count(tb_group)

@staticmethod
def add_parser_api_server():
@@ -135,6 +136,7 @@ def add_parser_api_server():
model_name_act = ArgumentHelper.model_name(pt_group)
session_len_act = ArgumentHelper.session_len(pt_group)
max_batch_size_act = ArgumentHelper.max_batch_size(pt_group)
cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group)

# turbomind args
tb_group = parser.add_argument_group('TurboMind engine arguments')
@@ -143,10 +145,10 @@ def add_parser_api_server():
tb_group._group_actions.append(model_name_act)
tb_group._group_actions.append(session_len_act)
tb_group._group_actions.append(max_batch_size_act)
tb_group._group_actions.append(cache_max_entry_act)
ArgumentHelper.model_format(tb_group)
ArgumentHelper.quant_policy(tb_group)
ArgumentHelper.rope_scaling_factor(tb_group)
ArgumentHelper.cache_max_entry_count(tb_group)

@staticmethod
def add_parser_api_client():
@@ -195,6 +197,7 @@ def gradio(args):
tp=args.tp,
model_name=args.model_name,
max_batch_size=args.max_batch_size,
cache_max_entry_count=args.cache_max_entry_count,
session_len=args.session_len)
else:
from lmdeploy.messages import TurbomindEngineConfig
@@ -229,6 +232,7 @@ def api_server(args):
tp=args.tp,
model_name=args.model_name,
max_batch_size=args.max_batch_size,
cache_max_entry_count=args.cache_max_entry_count,
session_len=args.session_len)
else:
from lmdeploy.messages import TurbomindEngineConfig
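
One detail worth calling out in this file: the Action returned by `ArgumentHelper.cache_max_entry_count(pt_group)` is appended to `tb_group._group_actions`, so the same option is listed under both the PyTorch and TurboMind sections of `--help` while being defined and parsed only once. A self-contained illustration of that argparse pattern (not lmdeploy code; names are illustrative):

```python
import argparse

parser = argparse.ArgumentParser('demo')
pt_group = parser.add_argument_group('PyTorch engine arguments')
tb_group = parser.add_argument_group('TurboMind engine arguments')

# Define the option once in the first group...
act = pt_group.add_argument('--cache-max-entry-count', type=float, default=0.8)
# ...then list the same Action under the second group's help section.
tb_group._group_actions.append(act)

args = parser.parse_args(['--cache-max-entry-count', '0.5'])
print(args.cache_max_entry_count)  # 0.5
```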
7 changes: 6 additions & 1 deletion lmdeploy/messages.py
@@ -140,6 +140,10 @@ class PytorchEngineConfig:
tp (int): Tensor Parallelism. default 1.
session_len (int): Max session length. Default None.
max_batch_size (int): Max batch size. Default 128.
cache_max_entry_count (float): the percentage of gpu memory occupied
by the k/v cache. For lmdeploy versions greater than `v0.2.1`,
it defaults to 0.8, signifying the percentage of FREE GPU memory
to be reserved for the k/v cache
eviction_type (str): What action to perform when kv cache
is full, ['recompute', 'copy'], Default 'recompute'.
prefill_interval (int): Interval to perform prefill,
@@ -161,13 +165,14 @@
tp: int = 1
session_len: int = None
max_batch_size: int = 128
cache_max_entry_count: float = 0.8
eviction_type: str = 'recompute'
prefill_interval: int = 16
block_size: int = 64
num_cpu_blocks: int = 0
num_gpu_blocks: int = 0
adapters: Dict[str, str] = None
max_prefill_token_num: int = 16384
max_prefill_token_num: int = 8192
download_dir: str = None
revision: str = None
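
The docstring added above carries the key semantic: for lmdeploy newer than `v0.2.1`, the ratio applies to GPU memory that is still free after the weights are loaded, not to total device memory. A quick illustration with made-up numbers:

```python
from lmdeploy.messages import PytorchEngineConfig

config = PytorchEngineConfig(cache_max_entry_count=0.4)  # default is 0.8

# Illustrative arithmetic only; the memory figure below is assumed, not measured.
free_after_weights_gib = 16.0
kv_budget_gib = free_after_weights_gib * config.cache_max_entry_count
print(f'{kv_budget_gib:.1f} GiB budgeted for the k/v cache')  # 6.4 GiB
```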

3 changes: 2 additions & 1 deletion lmdeploy/pytorch/config.py
@@ -28,7 +28,7 @@ class SchedulerConfig:
eviction_type: str = 'recompute'
prefill_interval: int = 16
max_active_adapters: int = 64
max_prefill_token_num: int = 16384
max_prefill_token_num: int = 8192


@dataclass
@@ -38,6 +38,7 @@ class CacheConfig:
block_size: int
num_cpu_blocks: int
num_gpu_blocks: int
cache_max_entry_count: float = 0.8


@dataclass
11 changes: 7 additions & 4 deletions lmdeploy/pytorch/engine/engine.py
@@ -108,14 +108,17 @@ def __init__(self,
scheduler_config = SchedulerConfig(
max_batches=engine_config.max_batch_size,
max_session_len=engine_config.session_len,
eviction_type='recompute',
eviction_type=engine_config.eviction_type,
prefill_interval=engine_config.prefill_interval,
max_prefill_token_num=engine_config.max_prefill_token_num)

# block_size = 1 to enable unified paging
adapters = engine_config.adapters
cache_config = CacheConfig(block_size=engine_config.block_size,
num_cpu_blocks=engine_config.num_cpu_blocks,
num_gpu_blocks=engine_config.num_gpu_blocks)
cache_config = CacheConfig(
block_size=engine_config.block_size,
num_cpu_blocks=engine_config.num_cpu_blocks,
num_gpu_blocks=engine_config.num_gpu_blocks,
cache_max_entry_count=engine_config.cache_max_entry_count)

if not os.path.exists(model_path):
model_path = get_model(model_path, engine_config.download_dir,
3 changes: 1 addition & 2 deletions lmdeploy/pytorch/engine/model_agent.py
@@ -44,7 +44,6 @@ def _infer_block_size(model: torch.nn.Module,
def _update_cache_config(model_config: ModelConfig,
cache_config: CacheConfig,
gpu_id: int = 0,
gpu_mem_percent: float = 0.7,
host_mem_size: int = 4 * (1 << 30),
world_size: int = 1):
"""Update the gpu mem and cpu mem according to model info.
@@ -56,7 +55,7 @@ def _update_cache_config(model_config: ModelConfig,
"""
torch.cuda.empty_cache()
gpu_mem_physical_free, _ = get_gpu_memory(gpu_id)
gpu_mem = gpu_mem_physical_free * gpu_mem_percent
gpu_mem = gpu_mem_physical_free * cache_config.cache_max_entry_count
cpu_mem = host_mem_size
cache_block_size = CacheEngine.get_cache_block_size(
cache_config.block_size, model_config) // world_size
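
This hunk is where the new field takes effect: the previously hard-coded `gpu_mem_percent=0.7` is dropped and the user-controlled `cache_config.cache_max_entry_count` scales the free GPU memory instead. The code that follows in `_update_cache_config` (outside this hunk) turns those budgets into block counts; a simplified stand-alone sketch of that arithmetic, not the actual lmdeploy implementation:

```python
def blocks_from_budget(free_gpu_mem_bytes: int,
                       cache_max_entry_count: float,
                       cache_block_size_bytes: int) -> int:
    """How a free-memory fraction becomes a k/v cache block count (sketch)."""
    budget = free_gpu_mem_bytes * cache_max_entry_count
    return int(budget // cache_block_size_bytes)


# Assumed sizes: 16 GiB free, the default 0.8 ratio, 2 MiB per cache block.
print(blocks_from_budget(16 * (1 << 30), 0.8, 2 * (1 << 20)))  # -> 6553
```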
