diff --git a/README.md b/README.md
index b4da5eda5..c65cff7e5 100644
--- a/README.md
+++ b/README.md
@@ -119,14 +119,14 @@ git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internl
GIT_LFS_SKIP_SMUDGE=1
# 2. Convert InternLM model to turbomind's format, which will be in "./workspace" by default
-python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b
+lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b
```
#### Inference by TurboMind
```shell
-python -m lmdeploy.turbomind.chat ./workspace
+lmdeploy chat turbomind ./workspace
```
> **Note**
@@ -140,7 +140,7 @@ python -m lmdeploy.turbomind.chat ./workspace
#### Serving with gradio
```shell
-python3 -m lmdeploy.serve.gradio.app ./workspace
+lmdeploy serve gradio ./workspace
```
![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab)
@@ -150,14 +150,14 @@ python3 -m lmdeploy.serve.gradio.app ./workspace
Launch inference server by:
```shell
-python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1
+lmdeploy serve api_server ./workspace --instance_num 32 --tp 1
```
Then, you can communicate with it by command line,
```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
-python -m lmdeploy.serve.openai.api_client restful_api_url
+lmdeploy serve api_client restful_api_url
```
or webui,
@@ -165,8 +165,8 @@ or webui,
```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
# server_ip and server_port here are for gradio ui
-# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True
-python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
+# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 --restful_api True
+lmdeploy serve gradio restful_api_url --server_name ${server_ip} --server_port ${server_port} --restful_api True
```
Refer to [restful_api.md](docs/en/restful_api.md) for more details.
@@ -182,13 +182,13 @@ bash workspace/service_docker_up.sh
Then, you can communicate with the inference server by command line,
```shell
-python3 -m lmdeploy.serve.client {server_ip_addresss}:33337
+lmdeploy serve triton_client {server_ip_address}:33337
```
or webui,
```shell
-python3 -m lmdeploy.serve.gradio.app {server_ip_addresss}:33337
+lmdeploy serve gradio {server_ip_address}:33337
```
For the deployment of other supported models, such as LLaMA, LLaMA-2, vicuna and so on, you can find the guide from [here](docs/en/serving.md)
@@ -200,7 +200,7 @@ For detailed instructions on Inference pytorch models, see [here](docs/en/pytorc
#### Single GPU
```shell
-python3 -m lmdeploy.pytorch.chat $NAME_OR_PATH_TO_HF_MODEL \
+lmdeploy chat torch $NAME_OR_PATH_TO_HF_MODEL \
--max_new_tokens 64 \
--temperature 0.8 \
--top_p 0.95 \
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 10c03bd1a..84f860ef3 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -120,14 +120,14 @@ git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internl
GIT_LFS_SKIP_SMUDGE=1
# 2. 转换为 turbomind 要求的格式。默认存放路径为 ./workspace
-python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b
+lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b
```
#### 使用 turbomind 推理
```shell
-python3 -m lmdeploy.turbomind.chat ./workspace
+lmdeploy chat turbomind ./workspace
```
> **Note**
@@ -140,7 +140,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace
#### 启动 gradio server
```shell
-python3 -m lmdeploy.serve.gradio.app ./workspace
+lmdeploy serve gradio ./workspace
```
![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab)
@@ -150,14 +150,14 @@ python3 -m lmdeploy.serve.gradio.app ./workspace
使用下面的命令启动推理服务:
```shell
-python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1
+lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1
```
你可以通过命令行方式与推理服务进行对话:
```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
-python -m lmdeploy.serve.openai.api_client restful_api_url
+lmdeploy serve api_client restful_api_url
```
也可以通过 WebUI 方式来对话:
@@ -165,8 +165,8 @@ python -m lmdeploy.serve.openai.api_client restful_api_url
```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
# server_ip and server_port here are for gradio ui
-# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True
-python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
+# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 --restful_api True
+lmdeploy serve gradio restful_api_url --server_name ${server_ip} --server_port ${server_port} --restful_api True
```
更多详情可以查阅 [restful_api.md](docs/zh_cn/restful_api.md)。
@@ -182,13 +182,13 @@ bash workspace/service_docker_up.sh
你可以通过命令行方式与推理服务进行对话:
```shell
-python3 -m lmdeploy.serve.client {server_ip_addresss}:33337
+lmdeploy serve triton_client {server_ip_address}:33337
```
也可以通过 WebUI 方式来对话:
```shell
-python3 -m lmdeploy.serve.gradio.app {server_ip_addresss}:33337
+lmdeploy serve gradio {server_ip_address}:33337
```
其他模型的部署方式,比如 LLaMA,LLaMA-2,vicuna等等,请参考[这里](docs/zh_cn/serving.md)
@@ -204,7 +204,7 @@ pip install deepspeed
#### 单个 GPU
```shell
-python3 -m lmdeploy.pytorch.chat $NAME_OR_PATH_TO_HF_MODEL\
+lmdeploy chat torch $NAME_OR_PATH_TO_HF_MODEL\
--max_new_tokens 64 \
--temperature 0.8 \
--top_p 0.95 \
diff --git a/docs/en/kv_int8.md b/docs/en/kv_int8.md
index 1f5f5aa12..5dcf43ba6 100644
--- a/docs/en/kv_int8.md
+++ b/docs/en/kv_int8.md
@@ -18,7 +18,7 @@ dequant: f = q * scale + zp
Convert the Hugging Face model format to the TurboMind inference format to create a workspace directory.
```bash
-python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b
+lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b
```
If you already have a workspace directory, skip this step.
@@ -29,7 +29,7 @@ Get the quantization parameters by these two steps:
```bash
# get minmax
-python3 -m lmdeploy.lite.apis.calibrate \
+lmdeploy lite calibrate \
--model $HF_MODEL \
--calib_dataset 'c4' \ # Support c4, ptb, wikitext2, pileval
--calib_samples 128 \ # Number of samples in the calibration set, if the memory is not enough, it can be adjusted appropriately
@@ -37,7 +37,7 @@ python3 -m lmdeploy.lite.apis.calibrate \
--work_dir $WORK_DIR \ # Directory for saving quantized statistical parameters and quantized weights in Pytorch format
# get quant parameters
-python3 -m lmdeploy.lite.apis.kv_qparams \
+lmdeploy lite kv_qparams \
--work_dir $WORK_DIR \ # Directory of the last output
--turbomind_dir workspace/triton_models/weights/ \ # Directory to save the quantization parameters
--kv_sym False \ # Symmetric or asymmetric quantization, default is False
@@ -64,7 +64,7 @@ Considering there are four combinations of kernels needed to be implemented, pre
Test the chat performance.
```bash
-python3 -m lmdeploy.turbomind.chat ./workspace
+lmdeploy chat turbomind ./workspace
```
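
The asymmetric scheme above reduces to the `dequant: f = q * scale + zp` relation quoted at the top of kv_int8.md. A minimal NumPy sketch of that round trip, with illustrative helper names rather than the actual TurboMind kernels:

```python
# Illustrative sketch of asymmetric int8 KV quantization following the
# formula `dequant: f = q * scale + zp`. Not the TurboMind implementation.
import numpy as np


def quant_params(x: np.ndarray):
    """Derive scale/zero-point from observed min/max statistics."""
    qmin, qmax = -128, 127
    scale = (x.max() - x.min()) / (qmax - qmin)
    zp = x.min() - qmin * scale          # maps x.min() to qmin
    return scale, zp


def quantize(x, scale, zp):
    return np.clip(np.round((x - zp) / scale), -128, 127).astype(np.int8)


def dequantize(q, scale, zp):
    return q.astype(np.float32) * scale + zp


kv = np.random.randn(4, 128).astype(np.float32)   # stand-in for K/V activations
scale, zp = quant_params(kv)
restored = dequantize(quantize(kv, scale, zp), scale, zp)
print('max abs error:', np.abs(kv - restored).max())
```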
## GPU Memory Test
diff --git a/docs/en/pytorch.md b/docs/en/pytorch.md
index e3662ab37..e4cd5a9cb 100644
--- a/docs/en/pytorch.md
+++ b/docs/en/pytorch.md
@@ -9,13 +9,13 @@ This submodule allow user to chat with language model through command line, and
**Example 1**: Chat with default setting
```shell
-python -m lmdeploy.pytorch.chat $PATH_TO_HF_MODEL
+lmdeploy chat torch $PATH_TO_HF_MODEL
```
**Example 2**: Disable sampling and chat history
```shell
-python -m lmdeploy.pytorch.chat \
+lmdeploy chat torch \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--temperature 0 --max-history 0
```
@@ -23,7 +23,7 @@ python -m lmdeploy.pytorch.chat \
**Example 3**: Accelerate with deepspeed inference
```shell
-python -m lmdeploy.pytorch.chat \
+lmdeploy chat torch \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--accel deepspeed
```
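
These console commands are thin wrappers: per the new `lmdeploy/cli/chat.py` in this patch, `lmdeploy chat torch` forwards its arguments to `lmdeploy.pytorch.chat.main`, so the same chat session can be driven from Python. A small sketch (the model path is a placeholder):

```python
# Programmatic equivalent of `lmdeploy chat torch ...`, calling the same
# function the sub-command wraps (see lmdeploy/cli/chat.py in this patch).
from lmdeploy.pytorch.chat import main as run_torch_model

run_torch_model('/path/to/hf/model',   # placeholder path
                max_new_tokens=64,
                temperature=0.8,
                top_p=0.95,
                seed=0)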
diff --git a/docs/en/restful_api.md b/docs/en/restful_api.md
index cb70e2637..a66859c0c 100644
--- a/docs/en/restful_api.md
+++ b/docs/en/restful_api.md
@@ -3,7 +3,7 @@
### Launch Service
```shell
-python3 -m lmdeploy.serve.openai.api_server ./workspace 0.0.0.0 server_port --instance_num 32 --tp 1
+lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1
```
Then, the user can open the swagger UI: `http://{server_ip}:{server_port}` for the detailed api usage.
@@ -125,7 +125,7 @@ There is a client script for restful api server.
```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
-python -m lmdeploy.serve.openai.api_client restful_api_url
+lmdeploy serve api_client restful_api_url
```
### webui
@@ -135,8 +135,8 @@ You can also test restful-api through webui.
```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
# server_ip and server_port here are for gradio ui
-# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True
-python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
+# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 --restful_api True
+lmdeploy serve gradio restful_api_url --server_name ${server_ip} --server_port ${server_port} --restful_api True
```
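
Since the launched service is the OpenAI-style server (`lmdeploy.serve.openai.api_server`), a plain HTTP client can also talk to it. A hedged sketch with `requests`; the `/v1/chat/completions` route and payload fields below are assumptions based on the OpenAI-compatible interface, so check the swagger UI at `http://{server_ip}:{server_port}` for the authoritative routes:

```python
# Hedged sketch: query the OpenAI-style server over plain HTTP. The route and
# payload fields are assumptions; verify them in the api_server swagger UI.
import requests

api_base = 'http://localhost:23333'          # printed by api_server
resp = requests.post(
    f'{api_base}/v1/chat/completions',       # assumed OpenAI-compatible route
    json={
        'model': 'internlm-chat-7b',         # placeholder model name
        'messages': [{'role': 'user', 'content': 'Hi, please introduce yourself'}],
    },
    timeout=60,
)
print(resp.json())
```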
### FAQ
diff --git a/docs/en/serving.md b/docs/en/serving.md
index 1e6f783d7..6cc18018d 100644
--- a/docs/en/serving.md
+++ b/docs/en/serving.md
@@ -8,7 +8,7 @@ You can download [llama-2 models from huggingface](https://huggingface.co/meta-l
7B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-7b-chat-hf
+lmdeploy convert llama2 /path/to/llama-2-7b-chat-hf
bash workspace/service_docker_up.sh
```
@@ -18,7 +18,7 @@ bash workspace/service_docker_up.sh
13B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-13b-chat-hf --tp 2
+lmdeploy convert llama2 /path/to/llama-2-13b-chat-hf --tp 2
bash workspace/service_docker_up.sh
```
@@ -28,7 +28,7 @@ bash workspace/service_docker_up.sh
70B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-70b-chat-hf --tp 8
+lmdeploy convert llama2 /path/to/llama-2-70b-chat-hf --tp 8
bash workspace/service_docker_up.sh
```
@@ -42,7 +42,7 @@ Weights for the LLaMA models can be obtained from by filling out [this form](htt
7B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-7b llama \
+lmdeploy convert llama /path/to/llama-7b llama \
--tokenizer_path /path/to/tokenizer/model
bash workspace/service_docker_up.sh
```
@@ -53,7 +53,7 @@ bash workspace/service_docker_up.sh
13B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-13b llama \
+lmdeploy convert llama /path/to/llama-13b llama \
--tokenizer_path /path/to/tokenizer/model --tp 2
bash workspace/service_docker_up.sh
```
@@ -64,7 +64,7 @@ bash workspace/service_docker_up.sh
30B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-30b llama \
+lmdeploy convert llama /path/to/llama-30b llama \
--tokenizer_path /path/to/tokenizer/model --tp 4
bash workspace/service_docker_up.sh
```
@@ -75,7 +75,7 @@ bash workspace/service_docker_up.sh
65B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-65b llama \
+lmdeploy convert llama /path/to/llama-65b llama \
--tokenizer_path /path/to/tokenizer/model --tp 8
bash workspace/service_docker_up.sh
```
@@ -94,7 +94,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-7b \
--delta-path lmsys/vicuna-7b-delta-v1.1
-python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-7b
+lmdeploy convert vicuna /path/to/vicuna-7b
bash workspace/service_docker_up.sh
```
@@ -110,7 +110,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-13b \
--delta-path lmsys/vicuna-13b-delta-v1.1
-python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-13b
+lmdeploy convert vicuna /path/to/vicuna-13b
bash workspace/service_docker_up.sh
```
diff --git a/docs/en/supported_models/codellama.md b/docs/en/supported_models/codellama.md
index 1b5140205..886dc5922 100644
--- a/docs/en/supported_models/codellama.md
+++ b/docs/en/supported_models/codellama.md
@@ -29,7 +29,7 @@ Based on the above table, download the model that meets your requirements. Execu
python3 -m pip install lmdeploy
# convert weight layout
-python3 -m lmdeploy.serve.turbomind.deploy codellama /the/path/of/codellama/model
+lmdeploy convert codellama /the/path/of/codellama/model
```
Then, you can communicate with codellama in the console by following the instructions in the next sections
@@ -42,13 +42,13 @@ Then, you can communicate with codellama in consolo by following instructions in
### Completion
```shell
-python3 -m lmdeploy.turbomind.chat ./workspace --cap completion
+lmdeploy chat turbomind ./workspace --cap completion
```
### Infilling
```shell
-python3 -m lmdeploy.turbomind.chat ./workspace --cap infilling
+lmdeploy chat turbomind ./workspace --cap infilling
```
The input code is supposed to have a special placeholder ``. For example,
diff --git a/docs/zh_cn/supported_models/codellama.md b/docs/zh_cn/supported_models/codellama.md
index ca9029a52..a2abd2f4a 100644
--- a/docs/zh_cn/supported_models/codellama.md
+++ b/docs/zh_cn/supported_models/codellama.md
@@ -29,7 +29,7 @@
python3 -m pip install lmdeploy
# 转模型格式
-python3 -m lmdeploy.serve.turbomind.deploy codellama /path/of/codellama/model
+lmdeploy convert codellama /path/of/codellama/model
```
接下来,可参考如下章节,在控制台与 codellama 进行交互式对话。
@@ -42,13 +42,13 @@ python3 -m lmdeploy.serve.turbomind.deploy codellama /path/of/codellama/model
### 代码续写
```shell
-python3 -m lmdeploy.turbomind.chat ./workspace --cap completion
+lmdeploy chat turbomind ./workspace --cap completion
```
### 代码填空
```shell
-python3 -m lmdeploy.turbomind.chat ./workspace --cap infilling
+lmdeploy chat turbomind ./workspace --cap infilling
```
输入的代码块中要包含 ``,比如:
@@ -64,7 +64,7 @@ def remove_non_ascii(s: str) -> str:
### 对话
```
-python3 -m lmdeploy.turbomind.chat ./workspace --cap chat --sys-instruct "Provide answers in Python"
+lmdeploy chat turbomind ./workspace --cap chat --sys-instruct "Provide answers in Python"
```
可以把 `--sys-instruct` 的指令换成 codellama 支持的其他编程语言。
@@ -72,7 +72,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace --cap chat --sys-instruct "Provid
### Python 专项
```
-python3 -m lmdeploy.turbomind.chat ./workspace --cap python
+lmdeploy chat turbomind ./workspace --cap python
```
建议这里部署 Python 微调模型
@@ -90,7 +90,7 @@ TBD
```shell
# --instance_num: turbomind推理实例的个数。可理解为支持的最大并发数
# --tp: 在 tensor parallel时,使用的GPU数量
-python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1
+lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1
```
打开 `http://{server_ip}:{server_port}`,即可访问 swagger,查阅 RESTful API 的详细信息。
@@ -99,7 +99,7 @@ python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --
```shell
# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333
-python -m lmdeploy.serve.openai.api_client restful_api_url
+lmdeploy serve api_client restful_api_url
```
或者,启动 gradio,在 webui 的聊天对话框中,与 codellama 交流:
@@ -107,8 +107,8 @@ python -m lmdeploy.serve.openai.api_client restful_api_url
```shell
# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333
# server_ip 和 server_port 是用来提供 gradio ui 访问服务的
-# 例子: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True
-python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
+# 例子: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 --restful_api True
+lmdeploy serve gradio restful_api_url --server_name ${server_ip} --server_port ${server_port} --restful_api True
```
关于 RESTful API的详细介绍,请参考[这份](../restful_api.md)文档。
diff --git a/docs/zh_cn/w4a16.md b/docs/zh_cn/w4a16.md
index d28cb716d..46f5c58a9 100644
--- a/docs/zh_cn/w4a16.md
+++ b/docs/zh_cn/w4a16.md
@@ -24,14 +24,14 @@ git clone https://huggingface.co/lmdeploy/llama2-chat-7b-w4
```shell
## 转换模型的layout,存放在默认路径 ./workspace 下
-python3 -m lmdeploy.serve.turbomind.deploy \
+lmdeploy convert \
--model-name llama2 \
--model-path ./llama2-chat-7b-w4 \
--model-format awq \
--group-size 128
## 推理
-python3 -m lmdeploy.turbomind.chat ./workspace
+lmdeploy chat turbomind ./workspace
```
## 启动 gradio 服务
@@ -39,7 +39,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace
如果想通过 webui 与模型对话,请执行以下命令启动 gradio 服务
```shell
-python3 -m lmdeploy.serve.turbomind ./workspace --server_name {ip_addr} ----server_port {port}
+lmdeploy serve gradio ./workspace --server_name {ip_addr} --server_port {port}
```
然后,在浏览器中打开 http://{ip_addr}:{port},即可在线对话
@@ -80,7 +80,7 @@ python benchmark/profile_generation.py \
### 第一步:生成量化参数
```shell
-python3 -m lmdeploy.lite.apis.calibrate \
+lmdeploy lite calibrate \
--model $HF_MODEL \
--calib_dataset 'c4' \ # 校准数据集,支持 c4, ptb, wikitext2, pileval
--calib_samples 128 \ # 校准集的样本数,如果显存不够,可以适当调小
@@ -93,7 +93,7 @@ python3 -m lmdeploy.lite.apis.calibrate \
LMDeploy 使用 AWQ 算法对模型权重进行量化。在执行下面的命令时,需要把步骤1的`$WORK_DIR`传入。量化结束后,权重文件也会存放在这个目录中。然后就可以根据 ["4bit权重模型推理"](#4bit-权重模型推理)章节的说明,进行模型推理。
```shell
-python3 -m lmdeploy.lite.apis.auto_awq \
+lmdeploy lite auto_awq \
--model $HF_MODEL \
--w_bits 4 \ # 权重量化的 bit 数
--w_group_size 128 \ # 权重量化分组统计尺寸
diff --git a/lmdeploy/cli/__init__.py b/lmdeploy/cli/__init__.py
new file mode 100644
index 000000000..3575bec5b
--- /dev/null
+++ b/lmdeploy/cli/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .cli import run
+
+__all__ = ['run']
diff --git a/lmdeploy/cli/chat.py b/lmdeploy/cli/chat.py
new file mode 100644
index 000000000..735b24c7c
--- /dev/null
+++ b/lmdeploy/cli/chat.py
@@ -0,0 +1,90 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+
+class SubCliChat(object):
+ """Chat through terminal with pytorch or turbomind model."""
+
+ def torch(self,
+ model_path: str,
+ tokenizer_path: Optional[str] = None,
+ accel: Optional[str] = None,
+ max_new_tokens: int = 128,
+ temperature: float = 0.8,
+ top_p: float = 0.95,
+ seed: int = 0,
+ use_fast_tokenizer: bool = True,
+ max_alloc: int = 2048,
+ max_session_len: int = None,
+ log_file: Optional[str] = None,
+ debug: bool = False,
+ adapter: Optional[str] = None):
+ """Chat with pytorch model through terminal.
+
+ Args:
+ model_path (str): Path to pytorch model.
+ tokenizer_path (str): Path to tokenizer.
+ accel (str): Model accelerator.
+ max_new_tokens (int): Maximum number of tokens to generate.
+ temperature (float): Temperature for sampling.
+ top_p (float): Top p for sampling.
+ seed (int): Random seed.
+ use_fast_tokenizer (bool): Whether to use fast tokenizer.
+                This argument is passed directly to transformers'
+                ``AutoTokenizer.from_pretrained``.
+                Generally, users should use the fast tokenizer.
+                But if the fast tokenizer raises an error, try forcing a slow one.
+ max_alloc (int): Maximum memory to allocate (for deepspeed).
+ max_session_len (int): Maximum number of tokens allowed for all chat sessions.
+                This includes both the history and the current session.
+ log_file (str): Path to log file.
+ debug (bool): Whether to enable debug mode.
+            adapter (str): Force the use of a specific adapter.
+                Generally, users should not set this argument because the adapter
+                is selected based on the model type. It is only required when that
+                is impossible, e.g. when distinguishing llama 1/2 from the
+                `LlamaForCausalLM` class alone. Currently, only "llama1" is
+                accepted for llama1 models.
+ """ # noqa: E501
+ from lmdeploy.pytorch.chat import main as run_torch_model
+
+ run_torch_model(model_path,
+ tokenizer_path=tokenizer_path,
+ accel=accel,
+ max_new_tokens=max_new_tokens,
+ temperature=temperature,
+ top_p=top_p,
+ seed=seed,
+ use_fast_tokenizer=use_fast_tokenizer,
+ max_alloc=max_alloc,
+ max_session_len=max_session_len,
+ log_file=log_file,
+ debug=debug,
+ adapter=adapter)
+
+ def turbomind(self,
+ model_path,
+ session_id: int = 1,
+ cap: str = 'chat',
+ tp=1,
+ stream_output=True,
+ **kwargs):
+ """Chat with turbomind model through terminal.
+
+ Args:
+ model_path (str): the path of the deployed model
+ session_id (int): the identical id of a session
+ cap (str): the capability of a model. For example, codellama has
+ the ability among ['completion', 'infilling', 'chat', 'python']
+ tp (int): GPU number used in tensor parallelism
+ stream_output (bool): indicator for streaming output or not
+            **kwargs (dict): other arguments for initializing the model's chat
+ template
+ """
+ from lmdeploy.turbomind.chat import main as run_turbomind_model
+
+ run_turbomind_model(model_path,
+ session_id=session_id,
+ cap=cap,
+ tp=tp,
+ stream_output=stream_output,
+ **kwargs)
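
As with the torch sub-command, `turbomind` simply forwards to the existing entry point, so the same chat can be driven from Python. A sketch with a placeholder workspace path:

```python
# Programmatic equivalent of `lmdeploy chat turbomind ./workspace --tp 2`,
# using the same function SubCliChat.turbomind forwards to.
from lmdeploy.turbomind.chat import main as run_turbomind_model

run_turbomind_model('./workspace',   # converted model dir (placeholder)
                    session_id=1,
                    cap='chat',
                    tp=2,
                    stream_output=True)
```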
diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py
new file mode 100644
index 000000000..7b2634b53
--- /dev/null
+++ b/lmdeploy/cli/cli.py
@@ -0,0 +1,61 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import fire
+
+from .chat import SubCliChat
+from .lite import SubCliLite
+from .serve import SubCliServe
+
+
+class CLI(object):
+ """LMDeploy Command Line Interface.
+
+ The CLI provides a unified API for converting, compressing and deploying
+ large language models.
+ """
+
+ def convert(self,
+ model_name: str,
+ model_path: str,
+ model_format: str = None,
+ tokenizer_path: str = None,
+ dst_path: str = './workspace',
+ tp: int = 1,
+ quant_path: str = None,
+ group_size: int = 0):
+ """Convert LLMs to lmdeploy format.
+
+ Args:
+ model_name (str): The name of the to-be-deployed model, such as
+                llama-7b, llama-13b, vicuna-7b, etc.
+ model_path (str): The directory path of the model
+ model_format (str): The format of the model, fb or hf. 'fb' stands
+ for META's llama format, and 'hf' means huggingface format.
+ tokenizer_path (str): The path of tokenizer model.
+ dst_path (str): The destination path that saves outputs.
+ tp (int): The number of GPUs used for tensor parallelism, which
+ should be 2^n.
+ quant_path (str): Path of the quantized model, which can be None.
+ group_size (int): A parameter used in AWQ to quantize fp16 weights
+ to 4 bits.
+ """
+ from lmdeploy.serve.turbomind.deploy import main as convert
+
+ convert(model_name,
+ model_path,
+ model_format=model_format,
+ tokenizer_path=tokenizer_path,
+ dst_path=dst_path,
+ tp=tp,
+ quant_path=quant_path,
+ group_size=group_size)
+
+
+def run():
+ """The entry point of running LMDeploy CLI."""
+
+ cli = CLI()
+ cli.lite = SubCliLite()
+ cli.chat = SubCliChat()
+ cli.serve = SubCliServe()
+
+ fire.Fire(cli, name='lmdeploy')
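
For context on how this nesting turns into subcommands: `fire.Fire` exposes attributes as command groups and methods as commands, so `cli.chat.turbomind(...)` becomes `lmdeploy chat turbomind ...`. A self-contained toy with the same shape (class and method names here are illustrative, not part of the package):

```python
# Toy illustration of the python-fire mapping used by lmdeploy.cli.
import fire


class Chat:
    """Toy command group."""

    def turbomind(self, model_path: str, tp: int = 1):
        print(f'would chat with {model_path} using tp={tp}')


class Demo:
    """Toy top-level CLI mirroring the structure of lmdeploy.cli.CLI."""

    def __init__(self):
        self.chat = Chat()

    def convert(self, model_name: str, model_path: str):
        print(f'would convert {model_name} at {model_path}')


if __name__ == '__main__':
    # `python demo.py chat turbomind ./workspace --tp 2` behaves like
    # `lmdeploy chat turbomind ./workspace --tp 2`
    fire.Fire(Demo(), name='lmdeploy')
```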
diff --git a/lmdeploy/cli/lite.py b/lmdeploy/cli/lite.py
new file mode 100644
index 000000000..4302765e2
--- /dev/null
+++ b/lmdeploy/cli/lite.py
@@ -0,0 +1,100 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+
+class SubCliLite(object):
+ """CLI for compressing LLMs."""
+
+ def auto_awq(self,
+ model: str,
+ work_dir: str,
+ w_bits: int = 4,
+ w_sym: bool = False,
+ w_group_size: int = 128,
+ device: str = 'cuda'):
+ """Perform weight quantization using AWQ algorithm.
+
+ Args:
+ model (str): The path of model in hf format.
+ work_dir (str): The working directory to save results.
+ w_bits (int): Bit number for weight quantization.
+ w_sym (bool): Whether to do symmetric quantization.
+ w_group_size (int): Group size for weight quantization statistics.
+ device (str): Device type of running.
+ """
+ from lmdeploy.lite.apis.auto_awq import auto_awq
+
+ auto_awq(model,
+ work_dir,
+ w_bits=w_bits,
+ w_sym=w_sym,
+ w_group_size=w_group_size,
+ device=device)
+
+ def calibrate(self,
+ model: str,
+ calib_dataset: str = 'c4',
+ calib_samples: int = 128,
+ calib_seqlen: int = 2048,
+ work_dir: str = './work_dir',
+ device: str = 'cuda') -> None:
+ """Perform calibration on a given dataset.
+
+ Args:
+ model (str): The model to be loaded.
+ calib_dataset (str, optional): The calibration dataset name.
+ Defaults to 'c4'.
+ calib_samples (int, optional): The number of samples for
+ calibration. Defaults to 128.
+ calib_seqlen (int, optional): The sequence length for calibration.
+ Defaults to 2048.
+ work_dir (str): The working directory for outputs.
+ Defaults to './work_dir'.
+ device (str, optional): The device to be used for calculation.
+ Defaults to 'cuda'.
+ """
+ from lmdeploy.lite.apis.calibrate import calibrate
+
+ calibrate(model,
+ calib_dataset=calib_dataset,
+ calib_samples=calib_samples,
+ calib_seqlen=calib_seqlen,
+ work_dir=work_dir,
+ device=device)
+
+ def kv_qparams(self,
+ work_dir: str,
+ turbomind_dir: str,
+ kv_bits: int = 8,
+ kv_sym: bool = False,
+ num_tp: int = 1) -> None:
+ """Export key and value stats.
+
+ Args:
+ work_dir (str): Directory path where the stats
+ are saved.
+ turbomind_dir (str): Directory path where to
+ save the results.
+ kv_bits (int, optional): Number of bits for quantization.
+ Defaults to 8.
+ kv_sym (bool, optional): Whether to use symmetric quantization.
+ Defaults to False.
+ num_tp (int, optional): Number of tensor parallelism.
+ Defaults to 1.
+ """
+ from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams
+
+ run_kv_qparams(work_dir,
+ turbomind_dir,
+ kv_bits=kv_bits,
+ kv_sym=kv_sym,
+ num_tp=num_tp)
+
+ def get_small_sharded_hf(self, src_dir: str, dst_dir: str):
+ """Convert a hugging face model to the smallest sharded one.
+
+ Args:
+ src_dir (str): The directory of the input HF model.
+ dst_dir (str): The directory to save new model.
+ """
+ from lmdeploy.lite.apis.get_small_sharded_hf import main as run_sharded
+ run_sharded(src_dir, dst_dir)
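
The two-step KV quantization flow from docs/en/kv_int8.md maps onto these wrappers one-to-one, so it can also be run from Python. A sketch with placeholder paths:

```python
# Programmatic equivalent of `lmdeploy lite calibrate` followed by
# `lmdeploy lite kv_qparams`, calling the functions the sub-commands wrap.
from lmdeploy.lite.apis.calibrate import calibrate
from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams

calibrate('/path/to/hf/model',                 # placeholder HF model path
          calib_dataset='c4',
          calib_samples=128,
          calib_seqlen=2048,
          work_dir='./work_dir')
run_kv_qparams('./work_dir',                   # stats from the step above
               'workspace/triton_models/weights/',
               kv_bits=8,
               kv_sym=False,
               num_tp=1)
```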
diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py
new file mode 100644
index 000000000..0bff69c31
--- /dev/null
+++ b/lmdeploy/cli/serve.py
@@ -0,0 +1,122 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+
+class SubCliServe(object):
+ """Serve LLMs and interact on terminal or web UI."""
+
+ def gradio(self,
+ model_path_or_server: str,
+ server_name: str = 'localhost',
+ server_port: int = 6006,
+ batch_size: int = 32,
+ tp: int = 1,
+ restful_api: bool = False):
+ """Serve LLMs with web ui using gradio.
+
+ Example 1:
+ lmdeploy serve gradio ./workspace
+
+ Example 2:
+ lmdeploy serve gradio http://localhost:23333
+ --server_name localhost
+ --server_port 6006
+ --restful_api True
+
+ Example 3:
+            lmdeploy serve gradio ${triton_server_ip_address}:33337
+
+ Args:
+ model_path_or_server (str): the path of the deployed model or the
+                tritonserver URL or restful api URL. The former directly runs
+                the service with gradio. The latter runs against tritonserver
+                by default; if the URL is a restful api URL, also enable the
+                `restful_api` flag.
+ server_name (str): the ip address of gradio server
+ server_port (int): the port of gradio server
+ batch_size (int): batch size for running Turbomind directly
+ tp (int): tensor parallel for Turbomind
+ restful_api (bool): a flag for model_path_or_server
+ """
+ from lmdeploy.serve.gradio.app import run
+ run(model_path_or_server,
+ server_name=server_name,
+ server_port=server_port,
+ batch_size=batch_size,
+ tp=tp,
+ restful_api=restful_api)
+
+ def api_server(self,
+ model_path: str,
+ server_name: str = 'localhost',
+ server_port: int = 23333,
+ instance_num: int = 32,
+ tp: int = 1,
+ allow_origins: List[str] = ['*'],
+ allow_credentials: bool = True,
+ allow_methods: List[str] = ['*'],
+ allow_headers: List[str] = ['*']):
+ """Serve LLMs with restful api using fastapi.
+
+ Args:
+ model_path (str): the path of the deployed model
+ server_name (str): host ip for serving
+ server_port (int): server port
+ instance_num (int): number of instances of turbomind model
+ tp (int): tensor parallel
+ allow_origins (List[str]): a list of allowed origins for CORS
+ allow_credentials (bool): whether to allow credentials for CORS
+ allow_methods (List[str]): a list of allowed HTTP methods for CORS
+ allow_headers (List[str]): a list of allowed HTTP headers for CORS
+ """
+ from lmdeploy.serve.openai.api_server import main as run_api_server
+
+ run_api_server(model_path,
+ server_name=server_name,
+ server_port=server_port,
+ instance_num=instance_num,
+ tp=tp,
+ allow_origins=allow_origins,
+ allow_credentials=allow_credentials,
+ allow_methods=allow_methods,
+ allow_headers=allow_headers)
+
+ def api_client(self, restful_api_url: str, session_id: int = 0):
+ """Interact with restful api server in terminal.
+
+ Args:
+ restful_api_url: The restful api URL.
+ session_id: The identical id of a session.
+ """
+ from lmdeploy.serve.openai.api_client import main as run_api_client
+ run_api_client(restful_api_url, session_id=session_id)
+
+ def triton_client(self,
+ tritonserver_addr: str,
+ session_id: int = 1,
+ cap: str = 'chat',
+ stream_output: bool = True,
+ **kwargs):
+ """Interact with Triton Server using gRPC protocol.
+
+ Args:
+ tritonserver_addr (str): the address in format "ip:port" of
+ triton inference server
+ session_id (int): the identical id of a session
+ cap (str): the capability of a model. For example, codellama
+ has the ability among ['completion', 'infill', 'instruct',
+ 'python']
+ stream_output (bool): indicator for streaming output or not
+ **kwargs (dict): other arguments for initializing model's
+ chat template
+ """
+
+ from lmdeploy.serve.client import main as run_triton_client
+
+ run_triton_client(
+ tritonserver_addr,
+ session_id=session_id,
+ cap=cap,
+ stream_output=stream_output,
+ **kwargs,
+ )
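
`api_server` likewise forwards to the existing FastAPI entry point, so the server can be launched from Python with the same defaults. A sketch; the workspace path and port are placeholders:

```python
# Programmatic equivalent of `lmdeploy serve api_server ./workspace
# --server_name 0.0.0.0 --server_port 23333 --instance_num 32 --tp 1`.
from lmdeploy.serve.openai.api_server import main as run_api_server

run_api_server('./workspace',        # converted model dir (placeholder)
               server_name='0.0.0.0',
               server_port=23333,
               instance_num=32,
               tp=1)
```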
diff --git a/lmdeploy/lite/apis/auto_awq.py b/lmdeploy/lite/apis/auto_awq.py
index 3517f51b8..250defb59 100644
--- a/lmdeploy/lite/apis/auto_awq.py
+++ b/lmdeploy/lite/apis/auto_awq.py
@@ -2,7 +2,6 @@
from pathlib import Path
-import fire
import torch
from accelerate import (infer_auto_device_map, init_empty_weights,
load_checkpoint_in_model)
@@ -81,5 +80,6 @@ def auto_awq(model: str,
if __name__ == '__main__':
+ import fire
fire.Fire(auto_awq)
diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py
index 38b6429a1..3df252f06 100644
--- a/lmdeploy/lite/apis/calibrate.py
+++ b/lmdeploy/lite/apis/calibrate.py
@@ -2,7 +2,6 @@
from pathlib import Path
-import fire
import torch
from accelerate import (infer_auto_device_map, init_empty_weights,
load_checkpoint_in_model)
@@ -107,4 +106,6 @@ def calibrate(model: str,
if __name__ == '__main__':
+ import fire
+
fire.Fire(calibrate)
diff --git a/lmdeploy/lite/apis/kv_qparams.py b/lmdeploy/lite/apis/kv_qparams.py
index 7d43078da..f31fee029 100644
--- a/lmdeploy/lite/apis/kv_qparams.py
+++ b/lmdeploy/lite/apis/kv_qparams.py
@@ -2,7 +2,6 @@
from pathlib import Path
from typing import Union
-import fire
import numpy as np
import torch
@@ -120,5 +119,6 @@ def main(work_dir: str,
if __name__ == '__main__':
+ import fire
fire.Fire(main)
diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index 18da39a6e..39451acdc 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -654,4 +654,5 @@ def main(model_name: str = 'test'):
if __name__ == '__main__':
import fire
+
fire.Fire(main)
diff --git a/lmdeploy/pytorch/chat.py b/lmdeploy/pytorch/chat.py
index c30cf6ffe..2690480a8 100644
--- a/lmdeploy/pytorch/chat.py
+++ b/lmdeploy/pytorch/chat.py
@@ -51,7 +51,6 @@
import logging
from typing import Optional
-import fire
import torch
from transformers import GenerationConfig, PreTrainedModel
@@ -205,6 +204,8 @@ def main(
def cli():
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/serve/client.py b/lmdeploy/serve/client.py
index 9c0d3cb5c..424e83143 100644
--- a/lmdeploy/serve/client.py
+++ b/lmdeploy/serve/client.py
@@ -1,8 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os
-import fire
-
from lmdeploy.serve.turbomind.chatbot import Chatbot
@@ -66,4 +64,6 @@ def main(tritonserver_addr: str,
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/serve/gradio/app.py b/lmdeploy/serve/gradio/app.py
index 71db7a274..5c200517b 100644
--- a/lmdeploy/serve/gradio/app.py
+++ b/lmdeploy/serve/gradio/app.py
@@ -5,7 +5,6 @@
from functools import partial
from typing import Sequence
-import fire
import gradio as gr
from lmdeploy.serve.async_engine import AsyncEngine
@@ -525,7 +524,7 @@ def run(model_path_or_server: str,
server_port (int): the port of gradio server
batch_size (int): batch size for running Turbomind directly
tp (int): tensor parallel for Turbomind
- restufl_api (bool): a flag for model_path_or_server
+ restful_api (bool): a flag for model_path_or_server
"""
if ':' in model_path_or_server:
if restful_api:
@@ -539,4 +538,6 @@ def run(model_path_or_server: str,
if __name__ == '__main__':
+ import fire
+
fire.Fire(run)
diff --git a/lmdeploy/serve/openai/api_client.py b/lmdeploy/serve/openai/api_client.py
index a8718331b..26977bc6c 100644
--- a/lmdeploy/serve/openai/api_client.py
+++ b/lmdeploy/serve/openai/api_client.py
@@ -2,7 +2,6 @@
import json
from typing import Iterable, List
-import fire
import requests
@@ -89,4 +88,6 @@ def main(restful_api_url: str, session_id: int = 0):
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
index 94271c4b9..8324e3497 100644
--- a/lmdeploy/serve/openai/api_server.py
+++ b/lmdeploy/serve/openai/api_server.py
@@ -4,7 +4,6 @@
from http import HTTPStatus
from typing import AsyncGenerator, List, Optional
-import fire
import uvicorn
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
@@ -357,4 +356,6 @@ def main(model_path: str,
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/serve/turbomind/deploy.py b/lmdeploy/serve/turbomind/deploy.py
index 3aca6e1f1..ab8c9ea95 100644
--- a/lmdeploy/serve/turbomind/deploy.py
+++ b/lmdeploy/serve/turbomind/deploy.py
@@ -8,7 +8,6 @@
import sys
from pathlib import Path
-import fire
import safetensors
import torch
from safetensors.torch import load_file
@@ -1043,4 +1042,6 @@ def main(model_name: str,
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py
index ca9d9c34e..bf0ce7399 100644
--- a/lmdeploy/turbomind/chat.py
+++ b/lmdeploy/turbomind/chat.py
@@ -4,11 +4,7 @@
import os.path as osp
import random
-import fire
-
-from lmdeploy import turbomind as tm
from lmdeploy.model import MODELS
-from lmdeploy.tokenizer import Tokenizer
os.environ['TM_LOG_LEVEL'] = 'ERROR'
@@ -88,6 +84,9 @@ def main(model_path,
stream_output (bool): indicator for streaming output or not
**kwarg (dict): other arguments for initializing model's chat template
"""
+ from lmdeploy import turbomind as tm
+ from lmdeploy.tokenizer import Tokenizer
+
tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer')
tokenizer = Tokenizer(tokenizer_model_path)
tm_model = tm.TurboMind(model_path, eos_id=tokenizer.eos_token_id, tp=tp)
@@ -157,4 +156,6 @@ def main(model_path,
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/turbomind/decode.py b/lmdeploy/turbomind/decode.py
index daef35298..5ba4675c5 100644
--- a/lmdeploy/turbomind/decode.py
+++ b/lmdeploy/turbomind/decode.py
@@ -2,7 +2,6 @@
import os
import os.path as osp
-import fire
import torch
from lmdeploy import turbomind as tm
@@ -37,4 +36,6 @@ def main(model_path, inputs):
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/turbomind/generate_gemm_config.py b/lmdeploy/turbomind/generate_gemm_config.py
index 328f18215..9a4f0e8c4 100644
--- a/lmdeploy/turbomind/generate_gemm_config.py
+++ b/lmdeploy/turbomind/generate_gemm_config.py
@@ -2,8 +2,6 @@
import subprocess
-import fire
-
def get_llama_gemm():
import os.path as osp
@@ -30,4 +28,6 @@ def main(head_num: int = 32,
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
diff --git a/setup.py b/setup.py
index 09ae1e31c..df36118c2 100644
--- a/setup.py
+++ b/setup.py
@@ -121,26 +121,29 @@ def gen_packages_items():
if __name__ == '__main__':
lmdeploy_package_data = ['lmdeploy/bin/llama_gemm']
- setup(name='lmdeploy',
- version=get_version(),
- description='A toolset for compressing, deploying and serving LLM',
- long_description=readme(),
- long_description_content_type='text/markdown',
- author='OpenMMLab',
- author_email='openmmlab@gmail.com',
- packages=find_packages(exclude=()),
- package_data={
- 'lmdeploy': lmdeploy_package_data,
- },
- include_package_data=True,
- install_requires=parse_requirements('requirements.txt'),
- has_ext_modules=check_ext_modules,
- classifiers=[
- 'Programming Language :: Python :: 3.8',
- 'Programming Language :: Python :: 3.9',
- 'Programming Language :: Python :: 3.10',
- 'Programming Language :: Python :: 3.11',
- 'Intended Audience :: Developers',
- 'Intended Audience :: Education',
- 'Intended Audience :: Science/Research',
- ])
+ setup(
+ name='lmdeploy',
+ version=get_version(),
+ description='A toolset for compressing, deploying and serving LLM',
+ long_description=readme(),
+ long_description_content_type='text/markdown',
+ author='OpenMMLab',
+ author_email='openmmlab@gmail.com',
+ packages=find_packages(exclude=()),
+ package_data={
+ 'lmdeploy': lmdeploy_package_data,
+ },
+ include_package_data=True,
+ install_requires=parse_requirements('requirements.txt'),
+ has_ext_modules=check_ext_modules,
+ classifiers=[
+ 'Programming Language :: Python :: 3.8',
+ 'Programming Language :: Python :: 3.9',
+ 'Programming Language :: Python :: 3.10',
+ 'Programming Language :: Python :: 3.11',
+ 'Intended Audience :: Developers',
+ 'Intended Audience :: Education',
+ 'Intended Audience :: Science/Research',
+ ],
+ entry_points={'console_scripts': ['lmdeploy = lmdeploy.cli:run']},
+ )
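
The `console_scripts` entry added here is what replaces the `python3 -m lmdeploy.*` invocations throughout the docs above: installing the package generates an `lmdeploy` executable that simply calls `lmdeploy.cli.run`. A sketch of the manual equivalent:

```python
# What the `lmdeploy` console script resolves to after installation:
# it imports lmdeploy.cli.run and lets python-fire parse sys.argv.
from lmdeploy.cli import run

if __name__ == '__main__':
    run()   # e.g. `python this_file.py chat turbomind ./workspace`
```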
diff --git a/tests/test_lmdeploy/test_cli.py b/tests/test_lmdeploy/test_cli.py
new file mode 100644
index 000000000..a41eab442
--- /dev/null
+++ b/tests/test_lmdeploy/test_cli.py
@@ -0,0 +1,51 @@
+import inspect
+
+
+    """Check that a class method has the same arguments as a function."""
+ """Compare if a class method has same arguments as a function."""
+
+ argspec_cls = inspect.getfullargspec(class_method)
+ argspec_func = inspect.getfullargspec(function)
+ assert argspec_cls.args[1:] == argspec_func.args
+ assert argspec_cls.defaults == argspec_func.defaults
+ assert argspec_cls.annotations == argspec_func.annotations
+
+
+def test_cli():
+
+ from lmdeploy.cli.cli import CLI
+ from lmdeploy.serve.turbomind.deploy import main as convert
+ compare_func(CLI.convert, convert)
+
+
+def test_subcli_chat():
+ from lmdeploy.cli.chat import SubCliChat
+ from lmdeploy.pytorch.chat import main as run_torch_model
+ from lmdeploy.turbomind.chat import main as run_turbomind_model
+
+ compare_func(SubCliChat.torch, run_torch_model)
+ compare_func(SubCliChat.turbomind, run_turbomind_model)
+
+
+def test_subcli_lite():
+ from lmdeploy.cli.lite import SubCliLite
+ from lmdeploy.lite.apis.auto_awq import auto_awq
+ from lmdeploy.lite.apis.calibrate import calibrate
+ from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams
+
+ compare_func(SubCliLite.auto_awq, auto_awq)
+ compare_func(SubCliLite.calibrate, calibrate)
+ compare_func(SubCliLite.kv_qparams, run_kv_qparams)
+
+
+def test_subcli_serve():
+ from lmdeploy.cli.serve import SubCliServe
+ from lmdeploy.serve.client import main as run_triton_client
+ from lmdeploy.serve.gradio.app import run as run_gradio
+ from lmdeploy.serve.openai.api_client import main as run_api_client
+ from lmdeploy.serve.openai.api_server import main as run_api_server
+
+ compare_func(SubCliServe.gradio, run_gradio)
+ compare_func(SubCliServe.api_server, run_api_server)
+ compare_func(SubCliServe.api_client, run_api_client)
+ compare_func(SubCliServe.triton_client, run_triton_client)