Support loading hf model directly #685

Merged: 33 commits, merged on Nov 22, 2023
Changes from 8 commits

Commits (33)
0f7a35b
turbomind: support exporting model params
irexyc Nov 10, 2023
542882a
fix overflow
irexyc Nov 10, 2023
9f30d4f
support turbomind.from_pretrained
irexyc Nov 10, 2023
8605521
fix tp
irexyc Nov 10, 2023
a050907
support AutoModel
irexyc Nov 13, 2023
a3f5fc5
support load kv qparams
irexyc Nov 13, 2023
4426c0a
update auto_awq
irexyc Nov 13, 2023
f24c905
update docstring
irexyc Nov 14, 2023
371320a
export lmdeploy version
irexyc Nov 14, 2023
fe81ce9
update doc
irexyc Nov 14, 2023
1c4db1e
remove download_hf_repo
irexyc Nov 15, 2023
85f0ed0
LmdeployForCausalLM -> LmdeployForCausalLM
irexyc Nov 15, 2023
5619b44
refactor turbomind.py
irexyc Nov 15, 2023
00b21d4
update comment
irexyc Nov 15, 2023
d868694
Merge remote-tracking branch 'origin/main' into from_pretrained2
irexyc Nov 15, 2023
a827412
add bfloat16 convert back
irexyc Nov 15, 2023
197133d
support gradio run_local load hf
irexyc Nov 15, 2023
47fd6a8
support restful api server load hf
irexyc Nov 15, 2023
8dd4876
add docs
irexyc Nov 15, 2023
ae67e87
support loading previously quantized model
irexyc Nov 15, 2023
db51a06
adapt pr 690
irexyc Nov 15, 2023
68962ce
update docs
irexyc Nov 16, 2023
c6176f3
resolve conflict in auto_awq.py
irexyc Nov 16, 2023
4c4ae26
do not export turbomind config when quantizing a model
irexyc Nov 17, 2023
2562724
check model_name when it cannot be read from config.json
irexyc Nov 17, 2023
f41dce4
update readme
irexyc Nov 17, 2023
7fa302c
remove model_name in auto_awq
irexyc Nov 20, 2023
4db0e25
Merge remote-tracking branch 'origin/main' into from_pretrained
irexyc Nov 20, 2023
4e82cdf
update
irexyc Nov 21, 2023
0f9c6f0
update
irexyc Nov 21, 2023
b470f06
update
irexyc Nov 22, 2023
d3c5d01
fix build
irexyc Nov 22, 2023
6ce951f
absolute import
irexyc Nov 22, 2023
1 change: 1 addition & 0 deletions .gitignore
@@ -58,6 +58,7 @@ work_dir*/
*.bin
*config.json
*generate_config.json
!lmdeploy/turbomind/hf_repo/config.json

# Pytorch
*.pth
1 change: 1 addition & 0 deletions docs/en/w4a16.md
@@ -96,6 +96,7 @@ LMDeploy employs AWQ algorithm for model weight quantization.

```shell
lmdeploy lite auto_awq \
--model_name $MODEL_NAME \ # Model name; see lmdeploy/model.py for supported names
--model $HF_MODEL \
--w_bits 4 \ # Bit number for weight quantization
--w_group_size 128 \ # Group size for weight quantization statistics
1 change: 1 addition & 0 deletions docs/zh_cn/w4a16.md
@@ -94,6 +94,7 @@ LMDeploy 使用 AWQ 算法对模型权重进行量化。在执行下面的命令

```shell
lmdeploy lite auto_awq \
--model_name $MODEL_NAME \ # 模型名字,可参考 lmdeploy/model.py
--model $HF_MODEL \
--w_bits 4 \ # 权重量化的 bit 数
--w_group_size 128 \ # 权重量化分组统计尺寸
5 changes: 4 additions & 1 deletion lmdeploy/cli/lite.py
@@ -5,6 +5,7 @@ class SubCliLite(object):
"""CLI for compressing LLMs."""

def auto_awq(self,
model_name: str,
model: str,
work_dir: str,
w_bits: int = 4,
@@ -14,6 +15,7 @@ def auto_awq(self,
"""Perform weight quantization using AWQ algorithm.

Args:
model_name (str): The name of model.
model (str): The path of model in hf format.
work_dir (str): The working directory to save results.
w_bits (int): Bit number for weight quantization.
@@ -23,7 +25,8 @@ def auto_awq(self,
"""
from lmdeploy.lite.apis.auto_awq import auto_awq

auto_awq(model,
auto_awq(model_name,
model,
work_dir,
w_bits=w_bits,
w_sym=w_sym,
12 changes: 11 additions & 1 deletion lmdeploy/lite/apis/auto_awq.py
@@ -11,6 +11,7 @@
from lmdeploy.lite.quantization.awq import (FC_FCS_MAP, NORM_FCS_MAP,
quant_weights, smooth_layers)
from lmdeploy.lite.utils import collect_target_modules
from lmdeploy.lite.utils.export_turbomind import export_turbomind_hf_model

LAYER_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMDecoderLayer',
@@ -28,13 +29,17 @@
}


def auto_awq(model: str,
def auto_awq(model_name: str,
model: str,
work_dir: str,
w_bits: int = 4,
w_sym: bool = False,
w_group_size: int = 128,
device: str = 'cuda'):

assert model != work_dir, '$WORK_DIR and $HF_MODEL should be different'
model_path = model

# Load tokenizer and configuration
tokenizer = AutoTokenizer.from_pretrained(model,
use_fast=False,
@@ -86,6 +91,11 @@ def auto_awq(model: str,
model.save_pretrained(work_dir, max_shard_size='2GB')
tokenizer.save_pretrained(work_dir)

export_turbomind_hf_model(model_name,
irexyc marked this conversation as resolved.
Show resolved Hide resolved
model_path,
work_dir,
group_size=w_group_size)


if __name__ == '__main__':
import fire
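
A hedged sketch of calling the updated quantization API with the new `model_name` argument, matching the signature shown above; the model name and paths are placeholders:

```python
# Sketch only; model name and paths are placeholders.
from lmdeploy.lite.apis.auto_awq import auto_awq

auto_awq(
    'internlm-chat-7b',         # model_name: must be registered in lmdeploy/model.py
    './internlm-chat-7b-hf',    # model: HF-format checkpoint to quantize
    './internlm-chat-7b-4bit',  # work_dir: must differ from the model path
    w_bits=4,
    w_group_size=128,
)
```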
42 changes: 32 additions & 10 deletions lmdeploy/lite/apis/kv_qparams.py
@@ -1,16 +1,34 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os
from pathlib import Path
from typing import Union

import numpy as np
import torch


def _export_weight(info: str,
kv_qparams: np.ndarray,
out_path: str,
tm_params: dict = None):
"""Save kv_qparams to disk, or copy them into tm_params."""
if tm_params is None:
print(info)
kv_qparams.tofile(out_path)
else:
name = os.path.basename(out_path)
src = torch.from_numpy(kv_qparams)
for tm_tensor in tm_params[name]:
tm_tensor.copy_from(src)
tm_params.pop(name)


def _export_sym(key_stats: dict,
value_stats: dict,
bits: int,
out_dir: Union[str, Path],
tp: int = 1) -> None:
tp: int = 1,
tm_params: dict = None) -> None:
"""Export symmetric quantization parameters to specified directory."""
keys_absmax = key_stats['absmax']
values_absmax = value_stats['absmax']
@@ -31,15 +49,16 @@ def _export_sym(key_stats: dict,

kv_qparams = np.array([k_s, v_s], dtype=np.float32)
out_path = out_dir / f'layers.{layer_idx}.past_kv_scale.{i}.weight' # noqa: E501
kv_qparams.tofile(out_path)
print(f'Layer {layer_idx} MP {i} qparam: {k_s} \t{v_s}')
info = f'Layer {layer_idx} MP {i} qparam: {k_s} \t{v_s}'
_export_weight(info, kv_qparams, out_path, tm_params)


def _export_asym(key_stats: dict,
value_stats: dict,
bits: int,
out_dir: Union[str, Path],
tp: int = 1) -> None:
tp: int = 1,
tm_params: dict = None) -> None:
"""Export asymmetric quantization parameters to specified directory."""
keys_min = key_stats['min']
values_min = value_stats['min']
@@ -81,16 +100,17 @@ def _export_asym(key_stats: dict,
kv_qparams = np.array([k_scale, k_zp, v_scale, v_zp],
dtype=np.float32)
out_path = out_dir / f'layers.{layer_idx}.past_kv_scale.{i}.weight'
kv_qparams.tofile(out_path)
print(f'Layer {layer_idx} MP {i} qparam: '
f'\t{k_scale} \t{k_zp} \t{v_scale} \t{v_zp}')
info = f'Layer {layer_idx} MP {i} qparam: ' \
f'\t{k_scale} \t{k_zp} \t{v_scale} \t{v_zp}'
_export_weight(info, kv_qparams, out_path, tm_params)


def main(work_dir: str,
turbomind_dir: str,
kv_bits: int = 8,
kv_sym: bool = False,
num_tp: int = 1) -> None:
num_tp: int = 1,
tm_params: dict = None) -> None:
"""Main function to export key and value stats.

Args:
@@ -102,6 +122,7 @@ def main(work_dir: str,
kv_sym (bool, optional): Whether to use symmetric quantization.
Defaults to False.
num_tp (int, optional): Number of tensor parallelism. Defaults to 1.
tm_params (dict): turbomind model weights.
"""

work_dir = Path(work_dir)
@@ -113,9 +134,10 @@ def main(work_dir: str,
value_stats = torch.load(work_dir / 'value_stats.pth')

if kv_sym:
_export_sym(key_stats, value_stats, kv_bits, tm_dir, num_tp)
_export_sym(key_stats, value_stats, kv_bits, tm_dir, num_tp, tm_params)
else:
_export_asym(key_stats, value_stats, kv_bits, tm_dir, num_tp)
_export_asym(key_stats, value_stats, kv_bits, tm_dir, num_tp,
tm_params)


if __name__ == '__main__':
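
A hedged sketch of the two export modes enabled by the new `tm_params` argument; directory names are placeholders:

```python
# Sketch only; directory names are placeholders.
from lmdeploy.lite.apis.kv_qparams import main as export_kv_qparams

# 1) Offline: write layers.*.past_kv_scale.*.weight files into a converted workspace.
export_kv_qparams('./quant_work_dir', './workspace/triton_models/weights',
                  kv_bits=8, kv_sym=False, num_tp=1)

# 2) In-process: pass the weight map of a loaded TurboMind model so the scales
#    are copied into its tensors instead of being written to disk.
# export_kv_qparams('./quant_work_dir', './workspace/triton_models/weights',
#                   kv_bits=8, kv_sym=False, num_tp=1, tm_params=tm_params)
```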
54 changes: 54 additions & 0 deletions lmdeploy/lite/utils/export_turbomind.py
@@ -0,0 +1,54 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os
import shutil


def export_turbomind_hf_model(model_name: str,
model_path: str,
work_dir: str,
model_format: str = 'awq',
group_size: int = 128,
tp: int = 1):
"""Export hf lmdeploy model and config.json."""
import lmdeploy
from lmdeploy.model import MODELS
from lmdeploy.turbomind.deploy.converter import get_model_format
from lmdeploy.turbomind.deploy.source_model.base import INPUT_MODELS
from lmdeploy.turbomind.deploy.target_model.base import (
OUTPUT_MODELS, TurbomindModelConfig)

assert model_name in MODELS.module_dict.keys(), \
f"'{model_name}' is not supported. " \
f'The supported models are: {MODELS.module_dict.keys()}'

lmdeploy_dir = os.path.split(lmdeploy.__file__)[0]
hf_repo = os.path.join(lmdeploy_dir, 'turbomind', 'hf_repo')
files = os.listdir(hf_repo)
for file in files:
src = os.path.join(hf_repo, file)
dst = os.path.join(work_dir, file)
shutil.copy(src, dst)

cfg = TurbomindModelConfig.from_dict({}, allow_none=True)
cfg.model_name = model_name
cfg.tensor_para_size = tp
cfg.rotary_embedding = cfg.size_per_head
cfg.group_size = group_size
cfg.weight_type = 'int4'
output_format = 'w4'

inferred_model_format = get_model_format(model_name, model_format)
input_model = INPUT_MODELS.get(inferred_model_format)(
model_path=model_path, tokenizer_path=work_dir, ckpt_path=work_dir)
output_model = OUTPUT_MODELS.get(output_format)(input_model=input_model,
cfg=cfg,
to_file=False,
out_dir='')
config = output_model.cfg.__dict__
config_file = os.path.join(work_dir, 'config.json')
with open(config_file) as f:
data = json.load(f)
data['turbomind'] = config
with open(config_file, 'w') as f:
f.write(json.dumps(data, indent=2) + '\n')
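
A hedged usage sketch of the new helper added above; the model name and paths are placeholders:

```python
# Sketch only; model name and paths are placeholders.
from lmdeploy.lite.utils.export_turbomind import export_turbomind_hf_model

export_turbomind_hf_model(
    model_name='internlm-chat-7b',       # must exist in the lmdeploy MODELS registry
    model_path='./internlm-chat-7b-hf',  # original HF checkpoint
    work_dir='./internlm-chat-7b-4bit',  # quantized output; config.json gains a 'turbomind' section
    model_format='awq',
    group_size=128,
)
```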
12 changes: 2 additions & 10 deletions lmdeploy/serve/async_engine.py
@@ -1,7 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
import asyncio
import dataclasses
import os.path as osp
import random
from contextlib import contextmanager
from typing import List, Literal, Optional
@@ -28,15 +27,8 @@ class AsyncEngine:

def __init__(self, model_path, instance_num=32, tp=1, **kwargs) -> None:
from lmdeploy import turbomind as tm
from lmdeploy.tokenizer import Tokenizer
tokenizer_model_path = osp.join(model_path, 'triton_models',
'tokenizer')
tokenizer = Tokenizer(tokenizer_model_path)
self.tm_model = tm.TurboMind(model_path,
eos_id=tokenizer.eos_token_id,
tp=tp,
**kwargs)
self.tokenizer = tokenizer
self.tm_model = tm.TurboMind(model_path, tp=tp, **kwargs)
self.tokenizer = self.tm_model.tokenizer
self.generators = [
self.tm_model.create_instance() for i in range(instance_num)
]
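
With this change `AsyncEngine` no longer needs the converted `triton_models/tokenizer` directory; a minimal sketch, assuming a placeholder model path/id:

```python
# Sketch only; the model path/id is a placeholder.
from lmdeploy.serve.async_engine import AsyncEngine

engine = AsyncEngine('internlm/internlm-chat-7b', instance_num=32, tp=1)
tokenizer = engine.tokenizer  # taken from engine.tm_model instead of a separate tokenizer path
```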
53 changes: 7 additions & 46 deletions lmdeploy/turbomind/chat.py
@@ -1,22 +1,11 @@
# Copyright (c) OpenMMLab. All rights reserved.
import dataclasses
import os
import os.path as osp
import random

os.environ['TM_LOG_LEVEL'] = 'ERROR'

from .utils import get_gen_param
Collaborator review comment on this line:

Can we use an absolute import instead of a relative import? IDEs such as VS Code can then update the corresponding code automatically if the file is moved elsewhere.


@dataclasses.dataclass
class GenParam:
top_p: float
top_k: float
temperature: float
repetition_penalty: float
sequence_start: bool = False
sequence_end: bool = False
step: int = 0
request_output_len: int = 512
os.environ['TM_LOG_LEVEL'] = 'ERROR'


def input_prompt(model_name):
@@ -40,30 +29,6 @@ def valid_str(string, coding='utf-8'):
return ret


def get_gen_param(cap,
sampling_param,
nth_round,
step,
request_output_len=512,
**kwargs):
"""return parameters used by token generation."""
gen_param = GenParam(**dataclasses.asdict(sampling_param),
request_output_len=request_output_len)
# Fix me later. turbomind.py doesn't support None top_k
if gen_param.top_k is None:
gen_param.top_k = 40

if cap == 'chat':
gen_param.sequence_start = (nth_round == 1)
gen_param.sequence_end = False
gen_param.step = step
else:
gen_param.sequence_start = True
gen_param.sequence_end = True
gen_param.step = 0
return gen_param


def main(model_path,
session_id: int = 1,
cap: str = 'chat',
@@ -84,15 +49,11 @@ def main(model_path,
**kwarg (dict): other arguments for initializing model's chat template
"""
from lmdeploy import turbomind as tm
from lmdeploy.tokenizer import Tokenizer

tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer')
tokenizer = Tokenizer(tokenizer_model_path)
tm_model = tm.TurboMind(model_path,
eos_id=tokenizer.eos_token_id,
tp=tp,
capability=cap,
**kwargs)
tm_model = tm.TurboMind.from_pretrained(model_path,
tp=tp,
capability=cap,
**kwargs)
tokenizer = tm_model.tokenizer
generator = tm_model.create_instance()

nth_round = 1
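
Since `chat.py` now builds the engine through `TurboMind.from_pretrained`, a plain HF directory or hub id can be passed as `model_path`; a hedged sketch with a placeholder id:

```python
# Sketch only; the model path/id is a placeholder. Starts an interactive chat session.
from lmdeploy.turbomind.chat import main as chat_main

chat_main('internlm/internlm-chat-7b', session_id=1, cap='chat', tp=1)
```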
2 changes: 1 addition & 1 deletion lmdeploy/turbomind/deploy/converter.py
@@ -203,7 +203,7 @@ def main(model_name: str,
if inferred_model_format.find('awq') != -1:
cfg.weight_type = 'int4'
output_format = 'w4'
assert group_size > 0, 'group_size should > 0'
assert group_size > 0, f'group_size: {group_size} should > 0'

# convert
print('model_name ', model_name)