merge main
grimoire committed Jan 23, 2024
2 parents b01f020 + 4db2502 commit eea91e5
Showing 16 changed files with 260 additions and 45 deletions.
8 changes: 8 additions & 0 deletions README.md
@@ -107,6 +107,14 @@ Install lmdeploy with pip ( python 3.8+) or [from source](./docs/en/build.md)
pip install lmdeploy
```

The default prebuilt package is compiled with CUDA 11.8. If you require CUDA 12+, install lmdeploy with:

```shell
export LMDEPLOY_VERSION=0.2.0
export PYTHON_VERSION=38
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl
```
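
As an aside, a quick way to verify which build ended up installed (a minimal check; it assumes lmdeploy exposes `__version__` and that a CUDA-enabled PyTorch is present in the same environment):

```python
# Minimal post-install check (assumes lmdeploy exposes __version__ and that
# a CUDA-enabled PyTorch is installed in the same environment).
import lmdeploy
import torch

print(lmdeploy.__version__)       # e.g. 0.2.0 for the wheel above
print(torch.version.cuda)         # CUDA version PyTorch was built against
print(torch.cuda.is_available())
```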

## Offline Batch Inference

```python
8 changes: 8 additions & 0 deletions README_zh-CN.md
@@ -108,6 +108,14 @@ LMDeploy 支持 2 种推理引擎: [TurboMind](./docs/zh_cn/inference/turbomin
pip install lmdeploy
```

LMDeploy's prebuilt packages are compiled with CUDA 11.8 by default. To install LMDeploy under CUDA 12+, run the following commands:

```shell
export LMDEPLOY_VERSION=0.2.0
export PYTHON_VERSION=38
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl
```

## 离线批处理

```python
34 changes: 29 additions & 5 deletions benchmark/profile_torch_generation.py
@@ -123,8 +123,11 @@ def profile_throughput(model_path: str, concurrency: int, input_seqlen: int,
from lmdeploy.messages import PytorchEngineConfig
from lmdeploy.pytorch.engine import Engine

tm_model = Engine(model_path, PytorchEngineConfig(model_name='llama',
tp=tp))
tm_model = Engine(
model_path,
PytorchEngineConfig(model_name='llama',
tp=tp,
max_batch_size=concurrency))

# make up a dummy `input_ids` with the length of `input_seqlen` exactly
assert input_seqlen > 0, 'input_seqlen should > 0'
@@ -342,6 +345,28 @@ def parse_args():
return args


def _process_map(target, iterable):
from multiprocessing import Pipe, Process

def __proc_cb(*args, ret_pipe: Pipe):
try:
ret = target(*args)
ret_pipe[1].send(ret)
except Exception as e:
ret_pipe[1].send(e)

pipe = Pipe(False)
proc = Process(target=__proc_cb, args=iterable, kwargs=dict(ret_pipe=pipe))
proc.start()
proc.join()

ret = pipe[0].recv()
if isinstance(ret, Exception):
raise ret

return ret


def main():
args = parse_args()
assert len(args.prompt_tokens) == len(args.completion_tokens), \
@@ -355,7 +380,6 @@ def main():
args.completion_tokens):
MemoryMonitor.start()
from functools import partial
from multiprocessing import Pool
profile_target = partial(profile_throughput,
concurrency=batch,
input_seqlen=prompt_tokens,
@@ -366,9 +390,9 @@
temperature=args.temperature,
test_round=args.test_round,
warmup_round=args.warmup_round)
output = Pool(1).map(profile_target, (args.model_path, ))
output = _process_map(profile_target, (args.model_path, ))
model_name, first_token_latency, percentiles, \
throughput_per_proc, tp = output[0]
throughput_per_proc, tp = output
time.sleep(5) # wait a while for releasing GPU mem
memory = MemoryMonitor.terminate()
device_count = MemoryMonitor.device_count.value
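The `_process_map` helper added above replaces the earlier `Pool(1).map(...)` call: it runs the profiling target in a single child process and ships the return value, or the raised exception, back to the parent through a one-way `Pipe`. A standalone sketch of the same pattern, with a hypothetical `work` function standing in for `profile_throughput`:

```python
# Sketch of the _process_map pattern: run a callable in a child process and
# forward its result or exception to the parent over a one-way Pipe.
from multiprocessing import Pipe, Process


def work(x):
    # hypothetical stand-in for profile_throughput
    return x * x


def process_map(target, args):
    recv_end, send_end = Pipe(duplex=False)

    def _runner():
        try:
            send_end.send(target(*args))
        except Exception as e:   # ship the failure instead of losing it
            send_end.send(e)

    # Like the helper above, this uses a nested function as the Process
    # target, which relies on the 'fork' start method (the Linux default).
    proc = Process(target=_runner)
    proc.start()
    proc.join()

    ret = recv_end.recv()
    if isinstance(ret, Exception):
        raise ret
    return ret


if __name__ == '__main__':
    print(process_map(work, (7,)))  # -> 49
```

Note that, unlike `Pool.map`, the helper returns the result itself rather than a one-element list, which is why the unpacking in `main()` changes from `output[0]` to `output`.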
8 changes: 8 additions & 0 deletions docs/en/get_started.md
@@ -10,6 +10,14 @@ Install lmdeploy with pip (python 3.8+) or [from source](./build.md)
pip install lmdeploy
```

The default prebuilt package is compiled with CUDA 11.8. If you require CUDA 12+, install lmdeploy with:

```shell
export LMDEPLOY_VERSION=0.2.0
export PYTHON_VERSION=38
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl
```

## Offline batch inference

```python
8 changes: 8 additions & 0 deletions docs/zh_cn/get_started.md
@@ -10,6 +10,14 @@ LMDeploy提供了快速安装、模型量化、离线批处理、在线推理服
pip install lmdeploy
```

LMDeploy's prebuilt packages are compiled with CUDA 11.8 by default. To install LMDeploy under CUDA 12+, run the following commands:

```shell
export LMDEPLOY_VERSION=0.2.0
export PYTHON_VERSION=38
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl
```

## 离线批处理

```python
18 changes: 17 additions & 1 deletion examples/vl/app.py
@@ -1,6 +1,7 @@
import argparse
import os
import random
from contextlib import contextmanager
from dataclasses import dataclass, field
from itertools import count
from pathlib import Path
@@ -89,12 +90,27 @@ def parse_args():
return args


@contextmanager
def get_stop_words():
from lmdeploy.tokenizer import Tokenizer
old_func = Tokenizer.indexes_containing_token

def new_func(self, token):
indexes = self.encode(token, add_bos=False)
return indexes

Tokenizer.indexes_containing_token = new_func
yield
Tokenizer.indexes_containing_token = old_func


def load_preprocessor_model(args):
"""Load preprocessor and llm inference engine."""
assert args.model_name in SUPPORTED_MODELS
llm_ckpt = args.hf_ckpt if args.llm_ckpt is None else args.llm_ckpt
preprocessor = SUPPORTED_MODELS[args.model_name](args.hf_ckpt)
model = TurboMind.from_pretrained(llm_ckpt, model_name=args.model_name)
with get_stop_words():
model = TurboMind.from_pretrained(llm_ckpt, model_name=args.model_name)
return preprocessor, model


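The `get_stop_words` context manager above temporarily replaces `Tokenizer.indexes_containing_token` with a plain `encode` lookup while the TurboMind engine is constructed, then restores the original method. A generic sketch of this patch-and-restore pattern (the class and method here are stand-ins, not lmdeploy's API):

```python
# Generic patch-and-restore sketch: temporarily swap a method on a class and
# put the original back afterwards.
from contextlib import contextmanager


class Tokenizer:
    # hypothetical stand-in, not lmdeploy.tokenizer.Tokenizer
    def indexes_containing_token(self, token):
        return ['original']


@contextmanager
def patched(cls, name, replacement):
    original = getattr(cls, name)
    setattr(cls, name, replacement)
    try:
        yield
    finally:
        # restore even if the with-body raises; the version in the diff
        # restores after the yield without try/finally, so an exception
        # inside the block would skip the restore
        setattr(cls, name, original)


with patched(Tokenizer, 'indexes_containing_token',
             lambda self, token: ['patched']):
    print(Tokenizer().indexes_containing_token('x'))   # ['patched']

print(Tokenizer().indexes_containing_token('x'))       # ['original']
```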
24 changes: 14 additions & 10 deletions lmdeploy/model.py
@@ -265,9 +265,13 @@ def decorate_prompt(self, prompt, sequence_start=True):
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.system}{self.meta_instruction}{self.eosys}' \
f'{self.user}{prompt}{self.eoh}' \
ret = ''
if self.meta_instruction:
ret += f'{self.system}{self.meta_instruction}{self.eosys}'
ret += f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
return ret

else:
return f'\n{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
@@ -287,7 +291,7 @@ def messages2prompt(self, messages, sequence_start=True):
eox_map = dict(user=self.eoh, assistant=self.eoa, system=self.eosys)
ret = ''
if self.meta_instruction:
ret += f'{self.system}:{self.meta_instruction}{self.eosys}'
ret += f'{self.system}{self.meta_instruction}{self.eosys}'

for message in messages:
role = message['role']
Expand Down Expand Up @@ -334,13 +338,13 @@ class InternLM2Chat7B(InternLMChat7B):

def __init__(self,
session_len=32768,
system='[UNUSED_TOKEN_146]system\n',
user='[UNUSED_TOKEN_146]user\n',
assistant='[UNUSED_TOKEN_146]assistant\n',
eosys='[UNUSED_TOKEN_145]\n',
eoh='[UNUSED_TOKEN_145]\n',
eoa='[UNUSED_TOKEN_145]\n',
stop_words=['[UNUSED_TOKEN_145]'],
system='<|im_start|>system\n',
user='<|im_start|>user\n',
assistant='<|im_start|>assistant\n',
eosys='<|im_end|>\n',
eoh='<|im_end|>\n',
eoa='<|im_end|>\n',
stop_words=['<|im_end|>', '<|action_end|>'],
**kwargs):
super(InternLM2Chat7B, self).__init__(session_len=session_len,
system=system,
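The changes in `lmdeploy/model.py` do three things: `decorate_prompt` now emits the system block only when a `meta_instruction` is configured, `messages2prompt` drops a stray `:` after `self.system`, and `InternLM2Chat7B` switches from the `[UNUSED_TOKEN_14x]` placeholders to the `<|im_start|>`/`<|im_end|>` markers. A rough illustration of the first-turn prompt the updated template assembles (a sketch of the string layout, not the library's exact output):

```python
# Sketch of how the updated InternLM2 template lays out a first-turn prompt.
system = '<|im_start|>system\n'
user = '<|im_start|>user\n'
assistant = '<|im_start|>assistant\n'
eosys = eoh = '<|im_end|>\n'

meta_instruction = 'You are a helpful assistant.'   # may be empty
prompt = 'What is LMDeploy?'

ret = ''
if meta_instruction:            # system block only when one is configured
    ret += f'{system}{meta_instruction}{eosys}'
ret += f'{user}{prompt}{eoh}{assistant}'
print(ret)
```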
2 changes: 2 additions & 0 deletions lmdeploy/pytorch/config.py
@@ -13,6 +13,8 @@ def _get_torch_dtype(config: Any, default: str = 'float16'):
default (str): default device type.
"""
torch_dtype = getattr(config, 'torch_dtype', default)
# torch_dtype in config could be none
torch_dtype = torch_dtype or default
return eval(f'torch.{torch_dtype}')


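The `config.py` change guards against a `torch_dtype` entry that exists in the HF config but is set to `None`, falling back to the default in that case. A self-contained sketch of the behaviour (using `getattr` instead of the module's `eval`, and assuming the config stores the dtype as a string or `None`):

```python
# Sketch of the dtype fallback: a config may carry torch_dtype=None, and the
# default should win in that case.
import torch


def get_torch_dtype(config, default='float16'):
    torch_dtype = getattr(config, 'torch_dtype', default)
    torch_dtype = torch_dtype or default        # None -> default
    return getattr(torch, torch_dtype)          # e.g. torch.float16


class Cfg:
    torch_dtype = None       # present in the config, but unset


print(get_torch_dtype(Cfg()))               # torch.float16
print(get_torch_dtype(Cfg(), 'bfloat16'))   # torch.bfloat16
```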
25 changes: 22 additions & 3 deletions lmdeploy/pytorch/engine/engine.py
@@ -399,7 +399,7 @@ def create_model_inputs(self, messages: SeqList, adapters: AdapterList):
max_rank=max_rank,
meta=meta)

def _stoping_criteria(self, msg: SchedulerSequence, next_token_id: int):
def _stopping_criteria(self, msg: SchedulerSequence, next_token_id: int):
"""Check if the message should stop.
Args:
@@ -489,9 +489,24 @@ def update_running(self, running: SeqList, next_token_ids: torch.Tensor,
msg.meta = meta
msg.update_token_ids(token)
msg.remain_output_len -= 1
if self._stoping_criteria(msg, token):
if msg.remain_output_len < 0:
msg.token_ids = torch.empty((0, ), dtype=torch.long)
if self._stopping_criteria(msg, token):
msg.status = MessageStatus.STOPPED

def _can_output_token(self, token: torch.Tensor, msg: SchedulerSequence):
"""check if output is necessary."""
if isinstance(token, torch.Tensor):
token = token.item()
if token == self.model_config.eos_token_id:
return False

stop_words = msg.sampling_param.stop_words
if stop_words is not None and token in stop_words:
return False

return True

def _model_forward(self, inputs: ModelInputs, swap_in_map: Dict,
swap_out_map: Dict):
"""model forward."""
@@ -638,12 +653,16 @@ def step(self, is_prefill: bool, return_logits: bool = False):
outputs: Dict[int, InferOutput] = dict()
for msg, next_id in zip(running, next_token_ids):
session_id = msg.session_id
if self._can_output_token(next_id, msg):
out_token_ids = [next_id.item()]
else:
out_token_ids = []
out = InferOutput(
session_id=session_id,
sender_id=msg.sender_id,
req_id=msg.req_id,
finish=(msg.status == MessageStatus.STOPPED),
token_ids=[next_id.item()],
token_ids=out_token_ids,
)
outputs[session_id] = out

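The engine change separates "when to stop" from "what to emit": exhausting `remain_output_len` now clears the pending token ids, and `_can_output_token` keeps `eos` and stop-word tokens out of the streamed output even though they still trigger the stopping criteria. A small sketch of that filter (the token ids and eos id below are made up for illustration):

```python
# Sketch of the _can_output_token filter: eos and stop-word tokens end the
# sequence but are not surfaced to the client.
EOS_TOKEN_ID = 2                      # illustrative value


def can_output_token(token_id, stop_words, eos_token_id=EOS_TOKEN_ID):
    if token_id == eos_token_id:
        return False
    if stop_words is not None and token_id in stop_words:
        return False
    return True


generated = [318, 1234, 92542, 2]     # last two: a stop word, then eos
stop_words = [92542]
emitted = [t for t in generated if can_output_token(t, stop_words)]
print(emitted)                        # [318, 1234]
```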
28 changes: 24 additions & 4 deletions lmdeploy/pytorch/engine/model_agent.py
@@ -585,6 +585,24 @@ def _tp_build_model(
patched_model = None
cache_engine = None

def __load_params_and_buffers(param_mod, mod):
"""load param and buffer."""
for name, param in param_mod.named_parameters(recurse=False):
mod.register_parameter(name, param)
for name, buffer in param_mod.named_buffers(recurse=False):
mod.register_buffer(name, buffer)

def __load_state_dict_assign(param_model, model):
"""load state dict assign."""
try:
model.load_state_dict(param_model.state_dict(), assign=True)
except Exception:
__load_params_and_buffers(param_model, model)
mods = dict(model.named_modules())
for mod_name, param_mod in param_model.named_modules():
mod = mods[mod_name]
__load_params_and_buffers(param_mod, mod)

def _broadcast_config(cache_config):
"""broadcast cache config, use minimum cache."""
if rank == 0:
@@ -631,7 +649,7 @@ def _broadcast_config(cache_config):
device_map=device_map,
trust_remote_code=trust_remote_code)
_load_adapters(param_model, adapters, device_map=device_map)
model.load_state_dict(param_model.state_dict(), assign=True)
__load_state_dict_assign(param_model, model)
param_model = param_model.to('meta')
del param_model

@@ -655,6 +673,7 @@ def _broadcast_config(cache_config):
rank=rank,
world_size=world_size)
except Exception as e:
logger.error(f'rank[{rank}] failed with error: {e}')
error_code = 1
error_type = e

@@ -702,6 +721,7 @@ def _tp_get_input(rank: int, in_que: mp.Queue, world_size: int):
device_mesh=device_mesh,
placements=[Replicate()
]).to_local()
torch.cuda.synchronize()

inputs = updated_inputs
inputs.update(other_metas)
@@ -895,11 +915,11 @@ def __init__(self,
world_size: int,
adapters: Dict[str, str] = None,
trust_remote_code: bool = True) -> None:
mp.set_start_method('spawn')
self.mp_ctx = mp.get_context('spawn')
super().__init__(model_config=model_config, cache_config=cache_config)
self.world_size = world_size
self.tp_model_in_que = mp.Queue(10)
self.tp_model_out_que = mp.Queue(10)
self.tp_model_in_que = self.mp_ctx.Queue(10)
self.tp_model_out_que = self.mp_ctx.Queue(10)

self.patch_model_tp(model_path,
model_config=model_config,
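In `model_agent.py`, the TP worker now tries `Module.load_state_dict(..., assign=True)` (available in recent PyTorch releases) and, if that fails, falls back to re-registering parameters and buffers module by module; the multiprocessing setup also moves from a global `set_start_method('spawn')` to a private `get_context('spawn')`. A simplified sketch of the load-or-fallback idea on a toy module:

```python
# Simplified sketch of the assign-or-fallback loading used for the TP model.
import torch
from torch import nn


def load_params_and_buffers(src_mod, dst_mod):
    """Re-register the source module's own params/buffers on the target."""
    for name, param in src_mod.named_parameters(recurse=False):
        dst_mod.register_parameter(name, param)
    for name, buf in src_mod.named_buffers(recurse=False):
        dst_mod.register_buffer(name, buf)


def load_state_dict_assign(src_model, dst_model):
    try:
        # assign=True keeps the source tensors instead of copying them into
        # the (possibly meta-initialized) destination parameters
        dst_model.load_state_dict(src_model.state_dict(), assign=True)
    except Exception:
        dst_mods = dict(dst_model.named_modules())
        for name, src_mod in src_model.named_modules():
            load_params_and_buffers(src_mod, dst_mods[name])


src, dst = nn.Linear(4, 4), nn.Linear(4, 4)
load_state_dict_assign(src, dst)
print(torch.equal(src.weight, dst.weight))   # True
```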
8 changes: 5 additions & 3 deletions lmdeploy/pytorch/engine/request.py
@@ -2,7 +2,7 @@
import enum
from dataclasses import dataclass, field
from queue import Empty, Queue
from threading import Lock, Thread, ThreadError
from threading import Lock, Thread
from typing import Any, Callable, ClassVar, Dict, List

from lmdeploy.messages import ResponseType
@@ -75,7 +75,8 @@ def _resp_que_get(self, block: bool = True, timeout: float = None):
except Empty:
timeout_counter -= self.THREAD_ALIVE_INTERVAL
if self._thread and not self._thread.is_alive():
raise ThreadError('Engine main loop stopped.')
logger.error('Engine main loop stopped.')
exit(1)

return self.resp_que.get(timeout=timeout_counter)

@@ -110,7 +111,8 @@ def batched_send_async(self, req_types: List[RequestType],
data: List[Any]) -> List[int]:
"""Batched send request asynchronize."""
if self._thread and not self._thread.is_alive():
raise ThreadError('Engine main loop stopped.')
logger.error('Engine main loop stopped.')
exit(1)
assert len(req_types) == len(data)
batch_size = len(req_types)

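Finally, `request.py` stops raising `threading.ThreadError` when the engine's main-loop thread has died and instead logs the condition and exits, both in `_resp_que_get` and in `batched_send_async`. A compact sketch of that liveness guard (the names below are stand-ins for the sender's internals):

```python
# Sketch of the liveness guard: if the engine loop thread is gone, pending
# requests can never complete, so log and terminate instead of raising.
import logging
import sys
from threading import Thread

logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger('lmdeploy')


def ensure_loop_alive(thread):
    if thread is not None and not thread.is_alive():
        logger.error('Engine main loop stopped.')
        sys.exit(1)


loop = Thread(target=lambda: None)
loop.start()
loop.join()               # simulate a loop that has already exited
ensure_loop_alive(loop)   # logs the error and exits with status 1
```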