merge main
grimoire committed Jan 23, 2024
2 parents b01f020 + 4db2502 commit eea91e5
Showing 16 changed files with 260 additions and 45 deletions.
8 changes: 8 additions & 0 deletions README.md
@@ -107,6 +107,14 @@ Install lmdeploy with pip ( python 3.8+) or [from source](./docs/en/build.md)
pip install lmdeploy
```

The default prebuilt package is compiled with CUDA 11.8. If you require CUDA 12+, install lmdeploy with:

```shell
export LMDEPLOY_VERSION=0.2.0
export PYTHON_VERSION=38
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl
```
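
As an aside, a quick way to verify which build ended up installed (a minimal check; it assumes lmdeploy exposes `__version__` and that a CUDA-enabled PyTorch is present in the same environment):

```python
# Minimal post-install check (assumes lmdeploy exposes __version__ and that
# a CUDA-enabled PyTorch is installed in the same environment).
import lmdeploy
import torch

print(lmdeploy.__version__)       # e.g. 0.2.0 for the wheel above
print(torch.version.cuda)         # CUDA version PyTorch was built against
print(torch.cuda.is_available())
```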

## Offline Batch Inference

```python
8 changes: 8 additions & 0 deletions README_zh-CN.md
@@ -108,6 +108,14 @@ LMDeploy 支持 2 种推理引擎: [TurboMind](./docs/zh_cn/inference/turbomin
pip install lmdeploy
```

LMDeploy's prebuilt packages are compiled with CUDA 11.8 by default. To install LMDeploy under CUDA 12+, run the following commands:

```shell
export LMDEPLOY_VERSION=0.2.0
export PYTHON_VERSION=38
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl
```

## 离线批处理

```python
34 changes: 29 additions & 5 deletions benchmark/profile_torch_generation.py
@@ -123,8 +123,11 @@ def profile_throughput(model_path: str, concurrency: int, input_seqlen: int,
from lmdeploy.messages import PytorchEngineConfig
from lmdeploy.pytorch.engine import Engine

tm_model = Engine(model_path, PytorchEngineConfig(model_name='llama',
tp=tp))
tm_model = Engine(
model_path,
PytorchEngineConfig(model_name='llama',
tp=tp,
max_batch_size=concurrency))

# make up a dummy `input_ids` with the length of `input_seqlen` exactly
assert input_seqlen > 0, 'input_seqlen should > 0'
@@ -342,6 +345,28 @@ def parse_args():
return args


def _process_map(target, iterable):
from multiprocessing import Pipe, Process

def __proc_cb(*args, ret_pipe: Pipe):
try:
ret = target(*args)
ret_pipe[1].send(ret)
except Exception as e:
ret_pipe[1].send(e)

pipe = Pipe(False)
proc = Process(target=__proc_cb, args=iterable, kwargs=dict(ret_pipe=pipe))
proc.start()
proc.join()

ret = pipe[0].recv()
if isinstance(ret, Exception):
raise ret

return ret


def main():
args = parse_args()
assert len(args.prompt_tokens) == len(args.completion_tokens), \
@@ -355,7 +380,6 @@ def main():
args.completion_tokens):
MemoryMonitor.start()
from functools import partial
from multiprocessing import Pool
profile_target = partial(profile_throughput,
concurrency=batch,
input_seqlen=prompt_tokens,
@@ -366,9 +390,9 @@
temperature=args.temperature,
test_round=args.test_round,
warmup_round=args.warmup_round)
output = Pool(1).map(profile_target, (args.model_path, ))
output = _process_map(profile_target, (args.model_path, ))
model_name, first_token_latency, percentiles, \
throughput_per_proc, tp = output[0]
throughput_per_proc, tp = output
time.sleep(5) # wait a while for releasing GPU mem
memory = MemoryMonitor.terminate()
device_count = MemoryMonitor.device_count.value
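The `_process_map` helper added above replaces the earlier `Pool(1).map(...)` call: it runs the profiling target in a single child process and ships the return value, or the raised exception, back to the parent through a one-way `Pipe`. A standalone sketch of the same pattern, with a hypothetical `work` function standing in for `profile_throughput`:

```python
# Sketch of the _process_map pattern: run a callable in a child process and
# forward its result or exception to the parent over a one-way Pipe.
from multiprocessing import Pipe, Process


def work(x):
    # hypothetical stand-in for profile_throughput
    return x * x


def process_map(target, args):
    recv_end, send_end = Pipe(duplex=False)

    def _runner():
        try:
            send_end.send(target(*args))
        except Exception as e:   # ship the failure instead of losing it
            send_end.send(e)

    # Like the helper above, this uses a nested function as the Process
    # target, which relies on the 'fork' start method (the Linux default).
    proc = Process(target=_runner)
    proc.start()
    proc.join()

    ret = recv_end.recv()
    if isinstance(ret, Exception):
        raise ret
    return ret


if __name__ == '__main__':
    print(process_map(work, (7,)))  # -> 49
```

Note that, unlike `Pool.map`, the helper returns the result itself rather than a one-element list, which is why the unpacking in `main()` changes from `output[0]` to `output`.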
8 changes: 8 additions & 0 deletions docs/en/get_started.md
@@ -10,6 +10,14 @@ Install lmdeploy with pip (python 3.8+) or [from source](./build.md)
pip install lmdeploy
```

The default prebuilt package is compiled with CUDA 11.8. If you require CUDA 12+, install lmdeploy with:

```shell
export LMDEPLOY_VERSION=0.2.0
export PYTHON_VERSION=38
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl
```

## Offline batch inference

```python
8 changes: 8 additions & 0 deletions docs/zh_cn/get_started.md
@@ -10,6 +10,14 @@ LMDeploy提供了快速安装、模型量化、离线批处理、在线推理服
pip install lmdeploy
```

LMDeploy's prebuilt packages are compiled with CUDA 11.8 by default. To install LMDeploy under CUDA 12+, run the following commands:

```shell
export LMDEPLOY_VERSION=0.2.0
export PYTHON_VERSION=38
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl
```

## 离线批处理

```python
18 changes: 17 additions & 1 deletion examples/vl/app.py
@@ -1,6 +1,7 @@
import argparse
import os
import random
from contextlib import contextmanager
from dataclasses import dataclass, field
from itertools import count
from pathlib import Path
@@ -89,12 +90,27 @@ def parse_args():
return args


@contextmanager
def get_stop_words():
from lmdeploy.tokenizer import Tokenizer
old_func = Tokenizer.indexes_containing_token

def new_func(self, token):
indexes = self.encode(token, add_bos=False)
return indexes

Tokenizer.indexes_containing_token = new_func
yield
Tokenizer.indexes_containing_token = old_func


def load_preprocessor_model(args):
"""Load preprocessor and llm inference engine."""
assert args.model_name in SUPPORTED_MODELS
llm_ckpt = args.hf_ckpt if args.llm_ckpt is None else args.llm_ckpt
preprocessor = SUPPORTED_MODELS[args.model_name](args.hf_ckpt)
model = TurboMind.from_pretrained(llm_ckpt, model_name=args.model_name)
with get_stop_words():
model = TurboMind.from_pretrained(llm_ckpt, model_name=args.model_name)
return preprocessor, model


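The `get_stop_words` context manager above temporarily replaces `Tokenizer.indexes_containing_token` with a plain `encode` lookup while the TurboMind engine is constructed, then restores the original method. A generic sketch of this patch-and-restore pattern (the class and method here are stand-ins, not lmdeploy's API):

```python
# Generic patch-and-restore sketch: temporarily swap a method on a class and
# put the original back afterwards.
from contextlib import contextmanager


class Tokenizer:
    # hypothetical stand-in, not lmdeploy.tokenizer.Tokenizer
    def indexes_containing_token(self, token):
        return ['original']


@contextmanager
def patched(cls, name, replacement):
    original = getattr(cls, name)
    setattr(cls, name, replacement)
    try:
        yield
    finally:
        # restore even if the with-body raises; the version in the diff
        # restores after the yield without try/finally, so an exception
        # inside the block would skip the restore
        setattr(cls, name, original)


with patched(Tokenizer, 'indexes_containing_token',
             lambda self, token: ['patched']):
    print(Tokenizer().indexes_containing_token('x'))   # ['patched']

print(Tokenizer().indexes_containing_token('x'))       # ['original']
```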
24 changes: 14 additions & 10 deletions lmdeploy/model.py
@@ -265,9 +265,13 @@ def decorate_prompt(self, prompt, sequence_start=True):
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.system}{self.meta_instruction}{self.eosys}' \
f'{self.user}{prompt}{self.eoh}' \
ret = ''
if self.meta_instruction:
ret += f'{self.system}{self.meta_instruction}{self.eosys}'
ret += f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
return ret

else:
return f'\n{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
@@ -287,7 +291,7 @@ def messages2prompt(self, messages, sequence_start=True):
eox_map = dict(user=self.eoh, assistant=self.eoa, system=self.eosys)
ret = ''
if self.meta_instruction:
ret += f'{self.system}:{self.meta_instruction}{self.eosys}'
ret += f'{self.system}{self.meta_instruction}{self.eosys}'

for message in messages:
role = message['role']
Expand Down Expand Up @@ -334,13 +338,13 @@ class InternLM2Chat7B(InternLMChat7B):

def __init__(self,
session_len=32768,
system='[UNUSED_TOKEN_146]system\n',
user='[UNUSED_TOKEN_146]user\n',
assistant='[UNUSED_TOKEN_146]assistant\n',
eosys='[UNUSED_TOKEN_145]\n',
eoh='[UNUSED_TOKEN_145]\n',
eoa='[UNUSED_TOKEN_145]\n',
stop_words=['[UNUSED_TOKEN_145]'],
system='<|im_start|>system\n',
user='<|im_start|>user\n',
assistant='<|im_start|>assistant\n',
eosys='<|im_end|>\n',
eoh='<|im_end|>\n',
eoa='<|im_end|>\n',
stop_words=['<|im_end|>', '<|action_end|>'],
**kwargs):
super(InternLM2Chat7B, self).__init__(session_len=session_len,
system=system,
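The changes in `lmdeploy/model.py` do three things: `decorate_prompt` now emits the system block only when a `meta_instruction` is configured, `messages2prompt` drops a stray `:` after `self.system`, and `InternLM2Chat7B` switches from the `[UNUSED_TOKEN_14x]` placeholders to the `<|im_start|>`/`<|im_end|>` markers. A rough illustration of the first-turn prompt the updated template assembles (a sketch of the string layout, not the library's exact output):

```python
# Sketch of how the updated InternLM2 template lays out a first-turn prompt.
system = '<|im_start|>system\n'
user = '<|im_start|>user\n'
assistant = '<|im_start|>assistant\n'
eosys = eoh = '<|im_end|>\n'

meta_instruction = 'You are a helpful assistant.'   # may be empty
prompt = 'What is LMDeploy?'

ret = ''
if meta_instruction:            # system block only when one is configured
    ret += f'{system}{meta_instruction}{eosys}'
ret += f'{user}{prompt}{eoh}{assistant}'
print(ret)
```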
2 changes: 2 additions & 0 deletions lmdeploy/pytorch/config.py
@@ -13,6 +13,8 @@ def _get_torch_dtype(config: Any, default: str = 'float16'):
default (str): default device type.
"""
torch_dtype = getattr(config, 'torch_dtype', default)
# torch_dtype in config could be none
torch_dtype = torch_dtype or default
return eval(f'torch.{torch_dtype}')


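The `config.py` change guards against a `torch_dtype` entry that exists in the HF config but is set to `None`, falling back to the default in that case. A self-contained sketch of the behaviour (using `getattr` instead of the module's `eval`, and assuming the config stores the dtype as a string or `None`):

```python
# Sketch of the dtype fallback: a config may carry torch_dtype=None, and the
# default should win in that case.
import torch


def get_torch_dtype(config, default='float16'):
    torch_dtype = getattr(config, 'torch_dtype', default)
    torch_dtype = torch_dtype or default        # None -> default
    return getattr(torch, torch_dtype)          # e.g. torch.float16


class Cfg:
    torch_dtype = None       # present in the config, but unset


print(get_torch_dtype(Cfg()))               # torch.float16
print(get_torch_dtype(Cfg(), 'bfloat16'))   # torch.bfloat16
```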
25 changes: 22 additions & 3 deletions lmdeploy/pytorch/engine/engine.py
@@ -399,7 +399,7 @@ def create_model_inputs(self, messages: SeqList, adapters: AdapterList):
max_rank=max_rank,
meta=meta)

def _stoping_criteria(self, msg: SchedulerSequence, next_token_id: int):
def _stopping_criteria(self, msg: SchedulerSequence, next_token_id: int):
"""Check if the message should stop.
Args:
@@ -489,9 +489,24 @@ def update_running(self, running: SeqList, next_token_ids: torch.Tensor,
msg.meta = meta
msg.update_token_ids(token)
msg.remain_output_len -= 1
if self._stoping_criteria(msg, token):
if msg.remain_output_len < 0:
msg.token_ids = torch.empty((0, ), dtype=torch.long)
if self._stopping_criteria(msg, token):
msg.status = MessageStatus.STOPPED

def _can_output_token(self, token: torch.Tensor, msg: SchedulerSequence):
"""check if output is necessary."""
if isinstance(token, torch.Tensor):
token = token.item()
if token == self.model_config.eos_token_id:
return False

stop_words = msg.sampling_param.stop_words
if stop_words is not None and token in stop_words:
return False

return True

def _model_forward(self, inputs: ModelInputs, swap_in_map: Dict,
swap_out_map: Dict):
"""model forward."""
@@ -638,12 +653,16 @@ def step(self, is_prefill: bool, return_logits: bool = False):
outputs: Dict[int, InferOutput] = dict()
for msg, next_id in zip(running, next_token_ids):
session_id = msg.session_id
if self._can_output_token(next_id, msg):
out_token_ids = [next_id.item()]
else:
out_token_ids = []
out = InferOutput(
session_id=session_id,
sender_id=msg.sender_id,
req_id=msg.req_id,
finish=(msg.status == MessageStatus.STOPPED),
token_ids=[next_id.item()],
token_ids=out_token_ids,
)
outputs[session_id] = out

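The engine change separates "when to stop" from "what to emit": exhausting `remain_output_len` now clears the pending token ids, and `_can_output_token` keeps `eos` and stop-word tokens out of the streamed output even though they still trigger the stopping criteria. A small sketch of that filter (the token ids and eos id below are made up for illustration):

```python
# Sketch of the _can_output_token filter: eos and stop-word tokens end the
# sequence but are not surfaced to the client.
EOS_TOKEN_ID = 2                      # illustrative value


def can_output_token(token_id, stop_words, eos_token_id=EOS_TOKEN_ID):
    if token_id == eos_token_id:
        return False
    if stop_words is not None and token_id in stop_words:
        return False
    return True


generated = [318, 1234, 92542, 2]     # last two: a stop word, then eos
stop_words = [92542]
emitted = [t for t in generated if can_output_token(t, stop_words)]
print(emitted)                        # [318, 1234]
```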
28 changes: 24 additions & 4 deletions lmdeploy/pytorch/engine/model_agent.py
@@ -585,6 +585,24 @@ def _tp_build_model(
patched_model = None
cache_engine = None

def __load_params_and_buffers(param_mod, mod):
"""load param and buffer."""
for name, param in param_mod.named_parameters(recurse=False):
mod.register_parameter(name, param)
for name, buffer in param_mod.named_buffers(recurse=False):
mod.register_buffer(name, buffer)

def __load_state_dict_assign(param_model, model):
"""load state dict assign."""
try:
model.load_state_dict(param_model.state_dict(), assign=True)
except Exception:
__load_params_and_buffers(param_model, model)
mods = dict(model.named_modules())
for mod_name, param_mod in param_model.named_modules():
mod = mods[mod_name]
__load_params_and_buffers(param_mod, mod)

def _broadcast_config(cache_config):
"""broadcast cache config, use minimum cache."""
if rank == 0:
@@ -631,7 +649,7 @@ def _broadcast_config(cache_config):
device_map=device_map,
trust_remote_code=trust_remote_code)
_load_adapters(param_model, adapters, device_map=device_map)
model.load_state_dict(param_model.state_dict(), assign=True)
__load_state_dict_assign(param_model, model)
param_model = param_model.to('meta')
del param_model

@@ -655,6 +673,7 @@ def _broadcast_config(cache_config):
rank=rank,
world_size=world_size)
except Exception as e:
logger.error(f'rank[{rank}] failed with error: {e}')
error_code = 1
error_type = e

@@ -702,6 +721,7 @@ def _tp_get_input(rank: int, in_que: mp.Queue, world_size: int):
device_mesh=device_mesh,
placements=[Replicate()
]).to_local()
torch.cuda.synchronize()

inputs = updated_inputs
inputs.update(other_metas)
@@ -895,11 +915,11 @@ def __init__(self,
world_size: int,
adapters: Dict[str, str] = None,
trust_remote_code: bool = True) -> None:
mp.set_start_method('spawn')
self.mp_ctx = mp.get_context('spawn')
super().__init__(model_config=model_config, cache_config=cache_config)
self.world_size = world_size
self.tp_model_in_que = mp.Queue(10)
self.tp_model_out_que = mp.Queue(10)
self.tp_model_in_que = self.mp_ctx.Queue(10)
self.tp_model_out_que = self.mp_ctx.Queue(10)

self.patch_model_tp(model_path,
model_config=model_config,
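In `model_agent.py`, the TP worker now tries `Module.load_state_dict(..., assign=True)` (available in recent PyTorch releases) and, if that fails, falls back to re-registering parameters and buffers module by module; the multiprocessing setup also moves from a global `set_start_method('spawn')` to a private `get_context('spawn')`. A simplified sketch of the load-or-fallback idea on a toy module:

```python
# Simplified sketch of the assign-or-fallback loading used for the TP model.
import torch
from torch import nn


def load_params_and_buffers(src_mod, dst_mod):
    """Re-register the source module's own params/buffers on the target."""
    for name, param in src_mod.named_parameters(recurse=False):
        dst_mod.register_parameter(name, param)
    for name, buf in src_mod.named_buffers(recurse=False):
        dst_mod.register_buffer(name, buf)


def load_state_dict_assign(src_model, dst_model):
    try:
        # assign=True keeps the source tensors instead of copying them into
        # the (possibly meta-initialized) destination parameters
        dst_model.load_state_dict(src_model.state_dict(), assign=True)
    except Exception:
        dst_mods = dict(dst_model.named_modules())
        for name, src_mod in src_model.named_modules():
            load_params_and_buffers(src_mod, dst_mods[name])


src, dst = nn.Linear(4, 4), nn.Linear(4, 4)
load_state_dict_assign(src, dst)
print(torch.equal(src.weight, dst.weight))   # True
```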
8 changes: 5 additions & 3 deletions lmdeploy/pytorch/engine/request.py
@@ -2,7 +2,7 @@
import enum
from dataclasses import dataclass, field
from queue import Empty, Queue
from threading import Lock, Thread, ThreadError
from threading import Lock, Thread
from typing import Any, Callable, ClassVar, Dict, List

from lmdeploy.messages import ResponseType
@@ -75,7 +75,8 @@ def _resp_que_get(self, block: bool = True, timeout: float = None):
except Empty:
timeout_counter -= self.THREAD_ALIVE_INTERVAL
if self._thread and not self._thread.is_alive():
raise ThreadError('Engine main loop stopped.')
logger.error('Engine main loop stopped.')
exit(1)

return self.resp_que.get(timeout=timeout_counter)

@@ -110,7 +111,8 @@ def batched_send_async(self, req_types: List[RequestType],
data: List[Any]) -> List[int]:
"""Batched send request asynchronize."""
if self._thread and not self._thread.is_alive():
raise ThreadError('Engine main loop stopped.')
logger.error('Engine main loop stopped.')
exit(1)
assert len(req_types) == len(data)
batch_size = len(req_types)

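Finally, `request.py` stops raising `threading.ThreadError` when the engine's main-loop thread has died and instead logs the condition and exits, both in `_resp_que_get` and in `batched_send_async`. A compact sketch of that liveness guard (the names below are stand-ins for the sender's internals):

```python
# Sketch of the liveness guard: if the engine loop thread is gone, pending
# requests can never complete, so log and terminate instead of raising.
import logging
import sys
from threading import Thread

logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger('lmdeploy')


def ensure_loop_alive(thread):
    if thread is not None and not thread.is_alive():
        logger.error('Engine main loop stopped.')
        sys.exit(1)


loop = Thread(target=lambda: None)
loop.start()
loop.join()               # simulate a loop that has already exited
ensure_loop_alive(loop)   # logs the error and exits with status 1
```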