From a48e2d273f85f5beeeddd042b79c41049fe01afa Mon Sep 17 00:00:00 2001
From: tpoisonooo
Date: Tue, 29 Aug 2023 13:50:06 +0800
Subject: [PATCH] fix(kvint8): update doc (#315)

* fix(kvint8): update doc

* style(lmdeploy): format

* style(kv_qparams.py): linting

* fix lint

* Update kv_int8.md

* Update kv_int8.md

---------

Co-authored-by: AllentDan
---
 README.md                        | 30 ------------------------------
 README_zh-CN.md                  | 30 ------------------------------
 docs/en/kv_int8.md               | 19 ++++++++++++++-----
 docs/zh_cn/kv_int8.md            | 19 ++++++++++++++-----
 lmdeploy/lite/apis/kv_qparams.py | 11 ++++++-----
 5 files changed, 34 insertions(+), 75 deletions(-)

diff --git a/README.md b/README.md
index 4a4d4dd4a3..1d22256cb3 100644
--- a/README.md
+++ b/README.md
@@ -214,40 +214,10 @@ pip install deepspeed

 ## Quantization

-### Step 1. Obtain Quantization Parameters
-
-First, run the quantization script to obtain the quantization parameters.
-
-> After execution, various parameters needed for quantization will be stored in `$WORK_DIR`; these will be used in the following steps..
-
-```
-python3 -m lmdeploy.lite.apis.calibrate \
-  --model $HF_MODEL \
-  --calib_dataset 'c4' \      # Calibration dataset, supports c4, ptb, wikitext2, pileval
-  --calib_samples 128 \       # Number of samples in the calibration set, if memory is insufficient, you can appropriately reduce this
-  --calib_seqlen 2048 \       # Length of a single piece of text, if memory is insufficient, you can appropriately reduce this
-  --work_dir $WORK_DIR \      # Folder storing Pytorch format quantization statistics parameters and post-quantization weight
-
-```
-
-### Step 2. Actual Model Quantization
-
-`LMDeploy` supports INT4 quantization of weights and INT8 quantization of KV Cache. Run the corresponding script according to your needs.
-
 #### Weight INT4 Quantization

 LMDeploy uses [AWQ](https://arxiv.org/abs/2306.00978) algorithm for model weight quantization

-> Requires input from the $WORK_DIR of step 1, and the quantized weights will also be stored in this folder.
-
-```
-python3 -m lmdeploy.lite.apis.auto_awq \
-  --model $HF_MODEL \
-  --w_bits 4 \                # Bit number for weight quantization
-  --w_group_size 128 \        # Group size for weight quantization statistics
-  --work_dir $WORK_DIR \      # Directory saving quantization parameters from Step 1
-```
-
 [Click here](./docs/zh_cn/w4a16.md) to view the test results for weight int4 usage.

 #### KV Cache INT8 Quantization
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 3e649269e1..929665d091 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -212,40 +212,10 @@ deepspeed --module --num_gpus 2 lmdeploy.pytorch.chat \

 ## 量化部署

-### Step 1. 获取量化参数
-
-首先,执行量化脚本,获取量化参数
-
-> 执行后,量化需要的各种参数会存放在 $WORK_DIR 中; 接下来的步骤中会用到
-
-```
-
-python3 -m lmdeploy.lite.apis.calibrate \
-  --model $HF_MODEL \
-  --calib_dataset 'c4' \      # 校准数据集,支持 c4, ptb, wikitext2, pileval
-  --calib_samples 128 \       # 校准集的样本数,如果显存不够,可以适当调小
-  --calib_seqlen 2048 \       # 单条的文本长度,如果显存不够,可以适当调小
-  --work_dir $WORK_DIR \      # 保存 Pytorch 格式量化统计参数和量化后权重的文件夹
-```
-
-### Step 2. 实际量化模型
-
-目前支持对权重的 INT4 量化和 KV Cache 的 INT8 量化,根据需求执行对应脚本即可
-
 #### 权重 INT4 量化

 LMDeploy 使用 [AWQ](https://arxiv.org/abs/2306.00978) 算法对模型权重进行量化

-> 需要输入第一步的 \`$WORK_DIR\`\` ,量化后的权重也会存在这个文件夹中
-
-```
-python3 -m lmdeploy.lite.apis.auto_awq \
-  --model $HF_MODEL \
-  --w_bits 4 \                # 权重量化的 bit 数
-  --w_group_size 128 \        # 权重量化分组统计尺寸
-  --work_dir $WORK_DIR \      # Step 1 保存量化参数的目录
-```
-
 [点击这里](./docs/zh_cn/w4a16.md) 查看 weight int4 用法测试结果。

 #### KV Cache INT8 量化
diff --git a/docs/en/kv_int8.md b/docs/en/kv_int8.md
index 9e6cefc147..bbda6a239f 100644
--- a/docs/en/kv_int8.md
+++ b/docs/en/kv_int8.md
@@ -25,23 +25,32 @@ If you already have a workspace directory, skip this step.

 ### **Step Two**

-Get the quantization parameters.
+Get the quantization parameters in two steps:

 ```bash
+# compute the min/max statistics
+python3 -m lmdeploy.lite.apis.calibrate \
+  --model $HF_MODEL \
+  --calib_dataset 'c4' \      # Calibration dataset; supports c4, ptb, wikitext2, pileval
+  --calib_samples 128 \       # Number of calibration samples; reduce this if GPU memory is insufficient
+  --calib_seqlen 2048 \       # Length of a single text sample; reduce this if GPU memory is insufficient
+  --work_dir $WORK_DIR \      # Directory for saving the quantization statistics and quantized weights in PyTorch format
+
+# compute the quantization parameters from the min/max statistics
 python3 -m lmdeploy.lite.apis.kv_qparams \
-  --work_dir /path/to/internlm-chat-7b \               # Directory of the Hugging Face model
-  --turbomind_dir workspace/trition_models/weights/ \  # Directory to save the quantization parameters
+  --work_dir $WORK_DIR \                               # Directory with the output of the previous step
+  --turbomind_dir workspace/triton_models/weights/ \   # Directory to save the quantization parameters
   --kv_sym False \                                     # Symmetric or asymmetric quantization, default is False
   --num_tp 1 \                                         # Number of GPUs used for Tensor parallelization, keep it consistent with deploy.py
 ```

 `kv_qparams` will generate fp32 scaling factors in the `weights` directory. The file format is a binary produced by `numpy.tofile`.

-You can also first set `turbomind_dir` to a private directory, then copy the scaling factors into `workspace/trition_models/weights/`.
+You can also first set `turbomind_dir` to a private directory, then copy the scaling factors into `workspace/triton_models/weights/`.

 ### **Step Three**

-Modify `workspace/trition_models/weights/config.ini`:
+Modify `workspace/triton_models/weights/config.ini`:

 - Set use_context_fmha to 0, which means turning off flashattention
 - Set quant_policy to 4. This means enabling kv_cache int8
diff --git a/docs/zh_cn/kv_int8.md b/docs/zh_cn/kv_int8.md
index 2f634d46ce..e527b2be4a 100644
--- a/docs/zh_cn/kv_int8.md
+++ b/docs/zh_cn/kv_int8.md
@@ -25,23 +25,32 @@ python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-ch

 ### **第二步**

-获取量化参数
+通过以下 2 步,获取量化参数

 ```bash
+# 计算 minmax
+python3 -m lmdeploy.lite.apis.calibrate \
+  --model $HF_MODEL \
+  --calib_dataset 'c4' \      # 校准数据集,支持 c4, ptb, wikitext2, pileval
+  --calib_samples 128 \       # 校准集的样本数,如果显存不够,可以适当调小
+  --calib_seqlen 2048 \       # 单条的文本长度,如果显存不够,可以适当调小
+  --work_dir $WORK_DIR \      # 保存 Pytorch 格式量化统计参数和量化后权重的文件夹
+
+# 通过 minmax 获取量化参数
 python3 -m lmdeploy.lite.apis.kv_qparams \
-  --work_dir /path/to/internlm-chat-7b \               # huggingface 模型目录
-  --turbomind_dir workspace/trition_models/weights/ \  # 保存量化参数的目录
+  --work_dir $WORK_DIR \                               # 上一步的结果
+  --turbomind_dir workspace/triton_models/weights/ \   # 保存量化参数的目录,推理要用
   --kv_sym False \                                     # 对称量化或非对称量化,默认为 False
   --num_tp 1 \                                         # Tensor 并行使用的 GPU 数,和 deploy.py 保持一致
 ```

 `kv_qparams` 会在 `weights` 目录生成 fp32 缩放系数,文件格式是 `numpy.tofile` 产生的二进制。

-也可以先把 `turbomind_dir` 设成私有目录,再把缩放系数拷贝进 `workspace/trition_models/weights/`。
+也可以先把 `turbomind_dir` 设成私有目录,再把缩放系数拷贝进 `workspace/triton_models/weights/`。

 ### **第三步**

-修改 `workspace/trition_models/weights/config.ini`:
+修改 `workspace/triton_models/weights/config.ini`:

 - use_context_fmha 改为 0,表示关闭 flashattention
 - quant_policy 设置为 4。表示打开 kv_cache int8
diff --git a/lmdeploy/lite/apis/kv_qparams.py b/lmdeploy/lite/apis/kv_qparams.py
index e7c585da6f..7d43078daf 100644
--- a/lmdeploy/lite/apis/kv_qparams.py
+++ b/lmdeploy/lite/apis/kv_qparams.py
@@ -33,7 +33,7 @@ def _export_sym(key_stats: dict,
         kv_qparams = np.array([k_s, v_s], dtype=np.float32)
         out_path = out_dir / f'layers.{layer_idx}.past_kv_scale.{i}.weight'  # noqa: E501
         kv_qparams.tofile(out_path)
-        print(f'Layer {layer_idx} MP {i} KV scales done.')
+        print(f'Layer {layer_idx} MP {i} qparam: {k_s} \t{v_s}')


 def _export_asym(key_stats: dict,
@@ -81,15 +81,16 @@ def _export_asym(key_stats: dict,
             kv_qparams = np.array([k_scale, k_zp, v_scale, v_zp],
                                   dtype=np.float32)
-            out_path = out_dir / f'layers.{layer_idx}.past_kv_scale.{i}.weight'  # noqa: E501
+            out_path = out_dir / f'layers.{layer_idx}.past_kv_scale.{i}.weight'
             kv_qparams.tofile(out_path)
-            print(f'Layer {layer_idx} MP {i} KV scales&zeros done.')
+            print(f'Layer {layer_idx} MP {i} qparam: '
+                  f'\t{k_scale} \t{k_zp} \t{v_scale} \t{v_zp}')


 def main(work_dir: str,
          turbomind_dir: str,
          kv_bits: int = 8,
-         kv_sym: bool = True,
+         kv_sym: bool = False,
          num_tp: int = 1) -> None:
     """Main function to export key and value stats.

@@ -100,7 +101,7 @@ def main(work_dir: str,
         kv_bits (int, optional): Number of bits for quantization.
             Defaults to 8.
         kv_sym (bool, optional): Whether to use symmetric quantization.
-            Defaults to True.
+            Defaults to False.
         num_tp (int, optional): Number of tensor parallelism.
             Defaults to 1.
     """
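The exported `past_kv_scale` files are plain float32 binaries written with `numpy.tofile`, so they can be read back for a quick sanity check. Below is a minimal sketch, not part of the patch: it assumes the layouts used by `_export_sym` ([k_scale, v_scale]) and `_export_asym` ([k_scale, k_zp, v_scale, v_zp]) shown above, and the file path is only an example that depends on your workspace.

```python
# Minimal sketch: inspect one exported KV qparam file.
# Assumption: float32 values written by numpy.tofile, either
# [k_scale, v_scale] (symmetric) or [k_scale, k_zp, v_scale, v_zp] (asymmetric).
import numpy as np

# Example path; adjust the layer index and tensor-parallel rank to your setup.
path = 'workspace/triton_models/weights/layers.0.past_kv_scale.0.weight'
qparams = np.fromfile(path, dtype=np.float32)

if qparams.size == 2:
    # symmetric quantization: [k_scale, v_scale]
    k_scale, v_scale = qparams
    print(f'k_scale={k_scale}, v_scale={v_scale}')
elif qparams.size == 4:
    # asymmetric quantization: [k_scale, k_zp, v_scale, v_zp]
    k_scale, k_zp, v_scale, v_zp = qparams
    print(f'k_scale={k_scale}, k_zp={k_zp}, v_scale={v_scale}, v_zp={v_zp}')
else:
    raise ValueError(f'unexpected qparam count: {qparams.size}')
```

If the element count does not match the expected layout, the file was most likely produced with the opposite `--kv_sym` setting.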