From a48e2d273f85f5beeeddd042b79c41049fe01afa Mon Sep 17 00:00:00 2001
From: tpoisonooo
Date: Tue, 29 Aug 2023 13:50:06 +0800
Subject: [PATCH] fix(kvint8): update doc (#315)

* fix(kvint8): update doc

* style(lmdeploy): format

* style(kv_qparams.py): linting

* fix lint

* Update kv_int8.md

* Update kv_int8.md

---------

Co-authored-by: AllentDan
---
 README.md                        | 30 ------------------------------
 README_zh-CN.md                  | 30 ------------------------------
 docs/en/kv_int8.md               | 19 ++++++++++++++-----
 docs/zh_cn/kv_int8.md            | 19 ++++++++++++++-----
 lmdeploy/lite/apis/kv_qparams.py | 11 ++++++-----
 5 files changed, 34 insertions(+), 75 deletions(-)

diff --git a/README.md b/README.md
index 4a4d4dd4a3..1d22256cb3 100644
--- a/README.md
+++ b/README.md
@@ -214,40 +214,10 @@ pip install deepspeed

 ## Quantization

-### Step 1. Obtain Quantization Parameters
-
-First, run the quantization script to obtain the quantization parameters.
-
-> After execution, various parameters needed for quantization will be stored in `$WORK_DIR`; these will be used in the following steps..
-
-```
-python3 -m lmdeploy.lite.apis.calibrate \
-  --model $HF_MODEL \
-  --calib_dataset 'c4' \      # Calibration dataset, supports c4, ptb, wikitext2, pileval
-  --calib_samples 128 \       # Number of samples in the calibration set, if memory is insufficient, you can appropriately reduce this
-  --calib_seqlen 2048 \       # Length of a single piece of text, if memory is insufficient, you can appropriately reduce this
-  --work_dir $WORK_DIR \      # Folder storing Pytorch format quantization statistics parameters and post-quantization weight
-
-```
-
-### Step 2. Actual Model Quantization
-
-`LMDeploy` supports INT4 quantization of weights and INT8 quantization of KV Cache. Run the corresponding script according to your needs.
-
 #### Weight INT4 Quantization

 LMDeploy uses [AWQ](https://arxiv.org/abs/2306.00978) algorithm for model weight quantization

-> Requires input from the $WORK_DIR of step 1, and the quantized weights will also be stored in this folder.
-
-```
-python3 -m lmdeploy.lite.apis.auto_awq \
-  --model $HF_MODEL \
-  --w_bits 4 \                # Bit number for weight quantization
-  --w_group_size 128 \        # Group size for weight quantization statistics
-  --work_dir $WORK_DIR \      # Directory saving quantization parameters from Step 1
-```
-
 [Click here](./docs/zh_cn/w4a16.md) to view the test results for weight int4 usage.

 #### KV Cache INT8 Quantization
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 3e649269e1..929665d091 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -212,40 +212,10 @@ deepspeed --module --num_gpus 2 lmdeploy.pytorch.chat \

 ## 量化部署

-### Step 1. 获取量化参数
-
-首先,执行量化脚本,获取量化参数
-
-> 执行后,量化需要的各种参数会存放在 $WORK_DIR 中; 接下来的步骤中会用到
-
-```
-
-python3 -m lmdeploy.lite.apis.calibrate \
-  --model $HF_MODEL \
-  --calib_dataset 'c4' \      # 校准数据集,支持 c4, ptb, wikitext2, pileval
-  --calib_samples 128 \       # 校准集的样本数,如果显存不够,可以适当调小
-  --calib_seqlen 2048 \       # 单条的文本长度,如果显存不够,可以适当调小
-  --work_dir $WORK_DIR \      # 保存 Pytorch 格式量化统计参数和量化后权重的文件夹
-```
-
-### Step 2. 实际量化模型
-
-目前支持对权重的 INT4 量化和 KV Cache 的 INT8 量化,根据需求执行对应脚本即可
-
 #### 权重 INT4 量化

 LMDeploy 使用 [AWQ](https://arxiv.org/abs/2306.00978) 算法对模型权重进行量化

-> 需要输入第一步的 \`$WORK_DIR\`\` ,量化后的权重也会存在这个文件夹中
-
-```
-python3 -m lmdeploy.lite.apis.auto_awq \
-  --model $HF_MODEL \
-  --w_bits 4 \                # 权重量化的 bit 数
-  --w_group_size 128 \        # 权重量化分组统计尺寸
-  --work_dir $WORK_DIR \      # Step 1 保存量化参数的目录
-```
-
 [点击这里](./docs/zh_cn/w4a16.md) 查看 weight int4 用法测试结果。

 #### KV Cache INT8 量化
diff --git a/docs/en/kv_int8.md b/docs/en/kv_int8.md
index 9e6cefc147..bbda6a239f 100644
--- a/docs/en/kv_int8.md
+++ b/docs/en/kv_int8.md
@@ -25,23 +25,32 @@ If you already have a workspace directory, skip this step.

 ### **Step Two**

-Get the quantization parameters.
+Get the quantization parameters in two steps:

 ```bash
+# compute the min/max statistics
+python3 -m lmdeploy.lite.apis.calibrate \
+  --model $HF_MODEL \
+  --calib_dataset 'c4' \      # Calibration dataset; supports c4, ptb, wikitext2, pileval
+  --calib_samples 128 \       # Number of calibration samples; reduce this if GPU memory is insufficient
+  --calib_seqlen 2048 \       # Length of a single text sample; reduce this if GPU memory is insufficient
+  --work_dir $WORK_DIR \      # Directory for saving the quantization statistics and quantized weights in PyTorch format
+
+# compute the quantization parameters from the min/max statistics
 python3 -m lmdeploy.lite.apis.kv_qparams \
-  --work_dir /path/to/internlm-chat-7b \               # Directory of the Hugging Face model
-  --turbomind_dir workspace/trition_models/weights/ \  # Directory to save the quantization parameters
+  --work_dir $WORK_DIR \                               # Directory with the output of the previous step
+  --turbomind_dir workspace/triton_models/weights/ \   # Directory to save the quantization parameters
   --kv_sym False \                                     # Symmetric or asymmetric quantization, default is False
   --num_tp 1 \                                         # Number of GPUs used for Tensor parallelization, keep it consistent with deploy.py
 ```

 `kv_qparams` will generate fp32 scaling factors in the `weights` directory. The file format is a binary produced by `numpy.tofile`.

-You can also first set `turbomind_dir` to a private directory, then copy the scaling factors into `workspace/trition_models/weights/`.
+You can also first set `turbomind_dir` to a private directory, then copy the scaling factors into `workspace/triton_models/weights/`.

 ### **Step Three**

-Modify `workspace/trition_models/weights/config.ini`:
+Modify `workspace/triton_models/weights/config.ini`:

 - Set use_context_fmha to 0, which means turning off flashattention
 - Set quant_policy to 4. This means enabling kv_cache int8
diff --git a/docs/zh_cn/kv_int8.md b/docs/zh_cn/kv_int8.md
index 2f634d46ce..e527b2be4a 100644
--- a/docs/zh_cn/kv_int8.md
+++ b/docs/zh_cn/kv_int8.md
@@ -25,23 +25,32 @@ python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-ch

 ### **第二步**

-获取量化参数
+通过以下 2 步,获取量化参数

 ```bash
+# 计算 minmax
+python3 -m lmdeploy.lite.apis.calibrate \
+  --model $HF_MODEL \
+  --calib_dataset 'c4' \      # 校准数据集,支持 c4, ptb, wikitext2, pileval
+  --calib_samples 128 \       # 校准集的样本数,如果显存不够,可以适当调小
+  --calib_seqlen 2048 \       # 单条的文本长度,如果显存不够,可以适当调小
+  --work_dir $WORK_DIR \      # 保存 Pytorch 格式量化统计参数和量化后权重的文件夹
+
+# 通过 minmax 获取量化参数
 python3 -m lmdeploy.lite.apis.kv_qparams \
-  --work_dir /path/to/internlm-chat-7b \               # huggingface 模型目录
-  --turbomind_dir workspace/trition_models/weights/ \  # 保存量化参数的目录
+  --work_dir $WORK_DIR \                               # 上一步的结果
+  --turbomind_dir workspace/triton_models/weights/ \   # 保存量化参数的目录,推理要用
   --kv_sym False \                                     # 对称量化或非对称量化,默认为 False
   --num_tp 1 \                                         # Tensor 并行使用的 GPU 数,和 deploy.py 保持一致
 ```

 `kv_qparams` 会在 `weights` 目录生成 fp32 缩放系数,文件格式是 `numpy.tofile` 产生的二进制。

-也可以先把 `turbomind_dir` 设成私有目录,再把缩放系数拷贝进 `workspace/trition_models/weights/`。
+也可以先把 `turbomind_dir` 设成私有目录,再把缩放系数拷贝进 `workspace/triton_models/weights/`。

 ### **第三步**

-修改 `workspace/trition_models/weights/config.ini`:
+修改 `workspace/triton_models/weights/config.ini`:

 - use_context_fmha 改为 0,表示关闭 flashattention
 - quant_policy 设置为 4。表示打开 kv_cache int8
diff --git a/lmdeploy/lite/apis/kv_qparams.py b/lmdeploy/lite/apis/kv_qparams.py
index e7c585da6f..7d43078daf 100644
--- a/lmdeploy/lite/apis/kv_qparams.py
+++ b/lmdeploy/lite/apis/kv_qparams.py
@@ -33,7 +33,7 @@ def _export_sym(key_stats: dict,
         kv_qparams = np.array([k_s, v_s], dtype=np.float32)
         out_path = out_dir / f'layers.{layer_idx}.past_kv_scale.{i}.weight'  # noqa: E501
         kv_qparams.tofile(out_path)
-        print(f'Layer {layer_idx} MP {i} KV scales done.')
+        print(f'Layer {layer_idx} MP {i} qparam: {k_s} \t{v_s}')


 def _export_asym(key_stats: dict,
@@ -81,15 +81,16 @@ def _export_asym(key_stats: dict,
             kv_qparams = np.array([k_scale, k_zp, v_scale, v_zp],
                                   dtype=np.float32)
-            out_path = out_dir / f'layers.{layer_idx}.past_kv_scale.{i}.weight'  # noqa: E501
+            out_path = out_dir / f'layers.{layer_idx}.past_kv_scale.{i}.weight'
             kv_qparams.tofile(out_path)
-            print(f'Layer {layer_idx} MP {i} KV scales&zeros done.')
+            print(f'Layer {layer_idx} MP {i} qparam: '
+                  f'\t{k_scale} \t{k_zp} \t{v_scale} \t{v_zp}')


 def main(work_dir: str,
          turbomind_dir: str,
          kv_bits: int = 8,
-         kv_sym: bool = True,
+         kv_sym: bool = False,
          num_tp: int = 1) -> None:
     """Main function to export key and value stats.

@@ -100,7 +101,7 @@ def main(work_dir: str,
         kv_bits (int, optional): Number of bits for quantization.
             Defaults to 8.
         kv_sym (bool, optional): Whether to use symmetric quantization.
-            Defaults to True.
+            Defaults to False.
         num_tp (int, optional): Number of tensor parallelism.
             Defaults to 1.
     """
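The exported `past_kv_scale` files are plain float32 binaries written with `numpy.tofile`, so they can be read back for a quick sanity check. Below is a minimal sketch, not part of the patch: it assumes the layouts used by `_export_sym` ([k_scale, v_scale]) and `_export_asym` ([k_scale, k_zp, v_scale, v_zp]) shown above, and the file path is only an example that depends on your workspace.

```python
# Minimal sketch: inspect one exported KV qparam file.
# Assumption: float32 values written by numpy.tofile, either
# [k_scale, v_scale] (symmetric) or [k_scale, k_zp, v_scale, v_zp] (asymmetric).
import numpy as np

# Example path; adjust the layer index and tensor-parallel rank to your setup.
path = 'workspace/triton_models/weights/layers.0.past_kv_scale.0.weight'
qparams = np.fromfile(path, dtype=np.float32)

if qparams.size == 2:
    # symmetric quantization: [k_scale, v_scale]
    k_scale, v_scale = qparams
    print(f'k_scale={k_scale}, v_scale={v_scale}')
elif qparams.size == 4:
    # asymmetric quantization: [k_scale, k_zp, v_scale, v_zp]
    k_scale, k_zp, v_scale, v_zp = qparams
    print(f'k_scale={k_scale}, k_zp={k_zp}, v_scale={v_scale}, v_zp={v_zp}')
else:
    raise ValueError(f'unexpected qparam count: {qparams.size}')
```

If the element count does not match the expected layout, the file was most likely produced with the opposite `--kv_sym` setting.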