Support loading hf model directly #685

Merged: 33 commits, merged on Nov 22, 2023
Changes from 8 commits

Commits (33)
0f7a35b
turbomind: support exporting model params
irexyc Nov 10, 2023
542882a
fix overflow
irexyc Nov 10, 2023
9f30d4f
support turbomind.from_pretrained
irexyc Nov 10, 2023
8605521
fix tp
irexyc Nov 10, 2023
a050907
support AutoModel
irexyc Nov 13, 2023
a3f5fc5
support load kv qparams
irexyc Nov 13, 2023
4426c0a
update auto_awq
irexyc Nov 13, 2023
f24c905
update docstring
irexyc Nov 14, 2023
371320a
export lmdeploy version
irexyc Nov 14, 2023
fe81ce9
update doc
irexyc Nov 14, 2023
1c4db1e
remove download_hf_repo
irexyc Nov 15, 2023
85f0ed0
LmdeployForCausalLM -> LmdeployForCausalLM
irexyc Nov 15, 2023
5619b44
refactor turbomind.py
irexyc Nov 15, 2023
00b21d4
update comment
irexyc Nov 15, 2023
d868694
Merge remote-tracking branch 'origin/main' into from_pretrained2
irexyc Nov 15, 2023
a827412
add bfloat16 convert back
irexyc Nov 15, 2023
197133d
support gradio run_local load hf
irexyc Nov 15, 2023
47fd6a8
support restful api server load hf
irexyc Nov 15, 2023
8dd4876
add docs
irexyc Nov 15, 2023
ae67e87
support loading previously quantized model
irexyc Nov 15, 2023
db51a06
adapt pr 690
irexyc Nov 15, 2023
68962ce
update docs
irexyc Nov 16, 2023
c6176f3
resolve conflict in auto_awq.py
irexyc Nov 16, 2023
4c4ae26
do not export turbomind config when quantizing a model
irexyc Nov 17, 2023
2562724
check model_name when it cannot be read from config.json
irexyc Nov 17, 2023
f41dce4
update readme
irexyc Nov 17, 2023
7fa302c
remove model_name in auto_awq
irexyc Nov 20, 2023
4db0e25
Merge remote-tracking branch 'origin/main' into from_pretrained
irexyc Nov 20, 2023
4e82cdf
update
irexyc Nov 21, 2023
0f9c6f0
update
irexyc Nov 21, 2023
b470f06
update
irexyc Nov 22, 2023
d3c5d01
fix build
irexyc Nov 22, 2023
6ce951f
absolute import
irexyc Nov 22, 2023
1 change: 1 addition & 0 deletions .gitignore
@@ -58,6 +58,7 @@ work_dir*/
*.bin
*config.json
*generate_config.json
!lmdeploy/turbomind/hf_repo/config.json

# Pytorch
*.pth
1 change: 1 addition & 0 deletions docs/en/w4a16.md
@@ -96,6 +96,7 @@ LMDeploy employs AWQ algorithm for model weight quantization.

```shell
lmdeploy lite auto_awq \
--model_name $MODEL_NAME \ # Model name; see lmdeploy/model.py for supported names
--model $HF_MODEL \
--w_bits 4 \ # Bit number for weight quantization
--w_group_size 128 \ # Group size for weight quantization statistics
1 change: 1 addition & 0 deletions docs/zh_cn/w4a16.md
@@ -94,6 +94,7 @@ LMDeploy 使用 AWQ 算法对模型权重进行量化。在执行下面的命令

```shell
lmdeploy lite auto_awq \
--model_name $MODEL_NAME \ # 模型名字,可参考 lmdeploy/model.py
--model $HF_MODEL \
--w_bits 4 \ # 权重量化的 bit 数
--w_group_size 128 \ # 权重量化分组统计尺寸
5 changes: 4 additions & 1 deletion lmdeploy/cli/lite.py
@@ -5,6 +5,7 @@ class SubCliLite(object):
"""CLI for compressing LLMs."""

def auto_awq(self,
model_name: str,
model: str,
work_dir: str,
w_bits: int = 4,
@@ -14,6 +15,7 @@ def auto_awq(self,
"""Perform weight quantization using AWQ algorithm.

Args:
model_name (str): The name of model.
model (str): The path of model in hf format.
work_dir (str): The working directory to save results.
w_bits (int): Bit number for weight quantization.
@@ -23,7 +25,8 @@ def auto_awq(self,
"""
from lmdeploy.lite.apis.auto_awq import auto_awq

auto_awq(model,
auto_awq(model_name,
model,
work_dir,
w_bits=w_bits,
w_sym=w_sym,
12 changes: 11 additions & 1 deletion lmdeploy/lite/apis/auto_awq.py
@@ -11,6 +11,7 @@
from lmdeploy.lite.quantization.awq import (FC_FCS_MAP, NORM_FCS_MAP,
quant_weights, smooth_layers)
from lmdeploy.lite.utils import collect_target_modules
from lmdeploy.lite.utils.export_turbomind import export_turbomind_hf_model

LAYER_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMDecoderLayer',
@@ -28,13 +29,17 @@
}


def auto_awq(model: str,
def auto_awq(model_name: str,
model: str,
work_dir: str,
w_bits: int = 4,
w_sym: bool = False,
w_group_size: int = 128,
device: str = 'cuda'):

assert model != work_dir, '$WORK_DIR and $HF_MODEL should be different'
model_path = model

# Load tokenizer and configuration
tokenizer = AutoTokenizer.from_pretrained(model,
use_fast=False,
@@ -86,6 +91,11 @@ def auto_awq(model: str,
model.save_pretrained(work_dir, max_shard_size='2GB')
tokenizer.save_pretrained(work_dir)

export_turbomind_hf_model(model_name,
irexyc marked this conversation as resolved.
Show resolved Hide resolved
model_path,
work_dir,
group_size=w_group_size)


if __name__ == '__main__':
import fire
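
A hedged sketch of calling the updated quantization API with the new `model_name` argument, matching the signature shown above; the model name and paths are placeholders:

```python
# Sketch only; model name and paths are placeholders.
from lmdeploy.lite.apis.auto_awq import auto_awq

auto_awq(
    'internlm-chat-7b',         # model_name: must be registered in lmdeploy/model.py
    './internlm-chat-7b-hf',    # model: HF-format checkpoint to quantize
    './internlm-chat-7b-4bit',  # work_dir: must differ from the model path
    w_bits=4,
    w_group_size=128,
)
```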
42 changes: 32 additions & 10 deletions lmdeploy/lite/apis/kv_qparams.py
@@ -1,16 +1,34 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os
from pathlib import Path
from typing import Union

import numpy as np
import torch


def _export_weight(info: str,
kv_qparams: np.ndarray,
out_path: str,
tm_params: dict = None):
"""Save kv_qparams to disk, or copy them into tm_params."""
if tm_params is None:
print(info)
kv_qparams.tofile(out_path)
else:
name = os.path.basename(out_path)
src = torch.from_numpy(kv_qparams)
for tm_tensor in tm_params[name]:
tm_tensor.copy_from(src)
tm_params.pop(name)


def _export_sym(key_stats: dict,
value_stats: dict,
bits: int,
out_dir: Union[str, Path],
tp: int = 1) -> None:
tp: int = 1,
tm_params: dict = None) -> None:
"""Export symmetric quantization parameters to specified directory."""
keys_absmax = key_stats['absmax']
values_absmax = value_stats['absmax']
@@ -31,15 +49,16 @@ def _export_sym(key_stats: dict,

kv_qparams = np.array([k_s, v_s], dtype=np.float32)
out_path = out_dir / f'layers.{layer_idx}.past_kv_scale.{i}.weight' # noqa: E501
kv_qparams.tofile(out_path)
print(f'Layer {layer_idx} MP {i} qparam: {k_s} \t{v_s}')
info = f'Layer {layer_idx} MP {i} qparam: {k_s} \t{v_s}'
_export_weight(info, kv_qparams, out_path, tm_params)


def _export_asym(key_stats: dict,
value_stats: dict,
bits: int,
out_dir: Union[str, Path],
tp: int = 1) -> None:
tp: int = 1,
tm_params: dict = None) -> None:
"""Export asymmetric quantization parameters to specified directory."""
keys_min = key_stats['min']
values_min = value_stats['min']
@@ -81,16 +100,17 @@ def _export_asym(key_stats: dict,
kv_qparams = np.array([k_scale, k_zp, v_scale, v_zp],
dtype=np.float32)
out_path = out_dir / f'layers.{layer_idx}.past_kv_scale.{i}.weight'
kv_qparams.tofile(out_path)
print(f'Layer {layer_idx} MP {i} qparam: '
f'\t{k_scale} \t{k_zp} \t{v_scale} \t{v_zp}')
info = f'Layer {layer_idx} MP {i} qparam: ' \
f'\t{k_scale} \t{k_zp} \t{v_scale} \t{v_zp}'
_export_weight(info, kv_qparams, out_path, tm_params)


def main(work_dir: str,
turbomind_dir: str,
kv_bits: int = 8,
kv_sym: bool = False,
num_tp: int = 1) -> None:
num_tp: int = 1,
tm_params: dict = None) -> None:
"""Main function to export key and value stats.

Args:
@@ -102,6 +122,7 @@ def main(work_dir: str,
kv_sym (bool, optional): Whether to use symmetric quantization.
Defaults to False.
num_tp (int, optional): Number of tensor parallelism. Defaults to 1.
tm_params (dict): turbomind model weights.
"""

work_dir = Path(work_dir)
@@ -113,9 +134,10 @@ def main(work_dir: str,
value_stats = torch.load(work_dir / 'value_stats.pth')

if kv_sym:
_export_sym(key_stats, value_stats, kv_bits, tm_dir, num_tp)
_export_sym(key_stats, value_stats, kv_bits, tm_dir, num_tp, tm_params)
else:
_export_asym(key_stats, value_stats, kv_bits, tm_dir, num_tp)
_export_asym(key_stats, value_stats, kv_bits, tm_dir, num_tp,
tm_params)


if __name__ == '__main__':
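
A hedged sketch of the two export modes enabled by the new `tm_params` argument; directory names are placeholders:

```python
# Sketch only; directory names are placeholders.
from lmdeploy.lite.apis.kv_qparams import main as export_kv_qparams

# 1) Offline: write layers.*.past_kv_scale.*.weight files into a converted workspace.
export_kv_qparams('./quant_work_dir', './workspace/triton_models/weights',
                  kv_bits=8, kv_sym=False, num_tp=1)

# 2) In-process: pass the weight map of a loaded TurboMind model so the scales
#    are copied into its tensors instead of being written to disk.
# export_kv_qparams('./quant_work_dir', './workspace/triton_models/weights',
#                   kv_bits=8, kv_sym=False, num_tp=1, tm_params=tm_params)
```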
54 changes: 54 additions & 0 deletions lmdeploy/lite/utils/export_turbomind.py
@@ -0,0 +1,54 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os
import shutil


def export_turbomind_hf_model(model_name: str,
model_path: str,
work_dir: str,
model_format: str = 'awq',
group_size: int = 128,
tp: int = 1):
"""Export hf lmdeploy model and config.json."""
import lmdeploy
from lmdeploy.model import MODELS
from lmdeploy.turbomind.deploy.converter import get_model_format
from lmdeploy.turbomind.deploy.source_model.base import INPUT_MODELS
from lmdeploy.turbomind.deploy.target_model.base import (
OUTPUT_MODELS, TurbomindModelConfig)

assert model_name in MODELS.module_dict.keys(), \
f"'{model_name}' is not supported. " \
f'The supported models are: {MODELS.module_dict.keys()}'

lmdeploy_dir = os.path.split(lmdeploy.__file__)[0]
hf_repo = os.path.join(lmdeploy_dir, 'turbomind', 'hf_repo')
files = os.listdir(hf_repo)
for file in files:
src = os.path.join(hf_repo, file)
dst = os.path.join(work_dir, file)
shutil.copy(src, dst)

cfg = TurbomindModelConfig.from_dict({}, allow_none=True)
cfg.model_name = model_name
cfg.tensor_para_size = tp
cfg.rotary_embedding = cfg.size_per_head
cfg.group_size = group_size
cfg.weight_type = 'int4'
output_format = 'w4'

inferred_model_format = get_model_format(model_name, model_format)
input_model = INPUT_MODELS.get(inferred_model_format)(
model_path=model_path, tokenizer_path=work_dir, ckpt_path=work_dir)
output_model = OUTPUT_MODELS.get(output_format)(input_model=input_model,
cfg=cfg,
to_file=False,
out_dir='')
config = output_model.cfg.__dict__
config_file = os.path.join(work_dir, 'config.json')
with open(config_file) as f:
data = json.load(f)
data['turbomind'] = config
with open(config_file, 'w') as f:
f.write(json.dumps(data, indent=2) + '\n')
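
A hedged usage sketch of the new helper added above; the model name and paths are placeholders:

```python
# Sketch only; model name and paths are placeholders.
from lmdeploy.lite.utils.export_turbomind import export_turbomind_hf_model

export_turbomind_hf_model(
    model_name='internlm-chat-7b',       # must exist in the lmdeploy MODELS registry
    model_path='./internlm-chat-7b-hf',  # original HF checkpoint
    work_dir='./internlm-chat-7b-4bit',  # quantized output; config.json gains a 'turbomind' section
    model_format='awq',
    group_size=128,
)
```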
12 changes: 2 additions & 10 deletions lmdeploy/serve/async_engine.py
@@ -1,7 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
import asyncio
import dataclasses
import os.path as osp
import random
from contextlib import contextmanager
from typing import List, Literal, Optional
@@ -28,15 +27,8 @@ class AsyncEngine:

def __init__(self, model_path, instance_num=32, tp=1, **kwargs) -> None:
from lmdeploy import turbomind as tm
from lmdeploy.tokenizer import Tokenizer
tokenizer_model_path = osp.join(model_path, 'triton_models',
'tokenizer')
tokenizer = Tokenizer(tokenizer_model_path)
self.tm_model = tm.TurboMind(model_path,
eos_id=tokenizer.eos_token_id,
tp=tp,
**kwargs)
self.tokenizer = tokenizer
self.tm_model = tm.TurboMind(model_path, tp=tp, **kwargs)
self.tokenizer = self.tm_model.tokenizer
self.generators = [
self.tm_model.create_instance() for i in range(instance_num)
]
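
With this change `AsyncEngine` no longer needs the converted `triton_models/tokenizer` directory; a minimal sketch, assuming a placeholder model path/id:

```python
# Sketch only; the model path/id is a placeholder.
from lmdeploy.serve.async_engine import AsyncEngine

engine = AsyncEngine('internlm/internlm-chat-7b', instance_num=32, tp=1)
tokenizer = engine.tokenizer  # taken from engine.tm_model instead of a separate tokenizer path
```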
53 changes: 7 additions & 46 deletions lmdeploy/turbomind/chat.py
@@ -1,22 +1,11 @@
# Copyright (c) OpenMMLab. All rights reserved.
import dataclasses
import os
import os.path as osp
import random

os.environ['TM_LOG_LEVEL'] = 'ERROR'

from .utils import get_gen_param
Collaborator review comment on this line:

Can we use an absolute import instead of a relative import? IDEs such as VS Code can then update the corresponding code automatically if the file is moved elsewhere.


@dataclasses.dataclass
class GenParam:
top_p: float
top_k: float
temperature: float
repetition_penalty: float
sequence_start: bool = False
sequence_end: bool = False
step: int = 0
request_output_len: int = 512
os.environ['TM_LOG_LEVEL'] = 'ERROR'


def input_prompt(model_name):
@@ -40,30 +29,6 @@ def valid_str(string, coding='utf-8'):
return ret


def get_gen_param(cap,
sampling_param,
nth_round,
step,
request_output_len=512,
**kwargs):
"""return parameters used by token generation."""
gen_param = GenParam(**dataclasses.asdict(sampling_param),
request_output_len=request_output_len)
# Fix me later. turbomind.py doesn't support None top_k
if gen_param.top_k is None:
gen_param.top_k = 40

if cap == 'chat':
gen_param.sequence_start = (nth_round == 1)
gen_param.sequence_end = False
gen_param.step = step
else:
gen_param.sequence_start = True
gen_param.sequence_end = True
gen_param.step = 0
return gen_param


def main(model_path,
session_id: int = 1,
cap: str = 'chat',
@@ -84,15 +49,11 @@ def main(model_path,
**kwarg (dict): other arguments for initializing model's chat template
"""
from lmdeploy import turbomind as tm
from lmdeploy.tokenizer import Tokenizer

tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer')
tokenizer = Tokenizer(tokenizer_model_path)
tm_model = tm.TurboMind(model_path,
eos_id=tokenizer.eos_token_id,
tp=tp,
capability=cap,
**kwargs)
tm_model = tm.TurboMind.from_pretrained(model_path,
tp=tp,
capability=cap,
**kwargs)
tokenizer = tm_model.tokenizer
generator = tm_model.create_instance()

nth_round = 1
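
Since `chat.py` now builds the engine through `TurboMind.from_pretrained`, a plain HF directory or hub id can be passed as `model_path`; a hedged sketch with a placeholder id:

```python
# Sketch only; the model path/id is a placeholder. Starts an interactive chat session.
from lmdeploy.turbomind.chat import main as chat_main

chat_main('internlm/internlm-chat-7b', session_id=1, cap='chat', tp=1)
```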
2 changes: 1 addition & 1 deletion lmdeploy/turbomind/deploy/converter.py
@@ -203,7 +203,7 @@ def main(model_name: str,
if inferred_model_format.find('awq') != -1:
cfg.weight_type = 'int4'
output_format = 'w4'
assert group_size > 0, 'group_size should > 0'
assert group_size > 0, f'group_size: {group_size} should > 0'

# convert
print('model_name ', model_name)