diff --git a/README.md b/README.md
index b4da5eda5..c65cff7e5 100644
--- a/README.md
+++ b/README.md
@@ -119,14 +119,14 @@ git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internl
GIT_LFS_SKIP_SMUDGE=1
# 2. Convert InternLM model to turbomind's format, which will be in "./workspace" by default
-python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b
+lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b
```
#### Inference by TurboMind
```shell
-python -m lmdeploy.turbomind.chat ./workspace
+lmdeploy chat turbomind ./workspace
```
> **Note**
@@ -140,7 +140,7 @@ python -m lmdeploy.turbomind.chat ./workspace
#### Serving with gradio
```shell
-python3 -m lmdeploy.serve.gradio.app ./workspace
+lmdeploy serve gradio ./workspace
```
![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab)
@@ -150,14 +150,14 @@ python3 -m lmdeploy.serve.gradio.app ./workspace
Launch inference server by:
```shell
-python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1
+lmdeploy serve api_server ./workspace --instance_num 32 --tp 1
```
Then, you can communicate with it by command line,
```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
-python -m lmdeploy.serve.openai.api_client restful_api_url
+lmdeploy serve api_client restful_api_url
```
or webui,
@@ -165,8 +165,8 @@ or webui,
```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
# server_ip and server_port here are for gradio ui
-# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True
-python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
+# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 --restful_api True
+lmdeploy serve gradio restful_api_url --server_name ${server_ip} --server_port ${server_port} --restful_api True
```
Refer to [restful_api.md](docs/en/restful_api.md) for more details.
@@ -182,13 +182,13 @@ bash workspace/service_docker_up.sh
Then, you can communicate with the inference server by command line,
```shell
-python3 -m lmdeploy.serve.client {server_ip_addresss}:33337
+lmdeploy serve triton_client {server_ip_address}:33337
```
or webui,
```shell
-python3 -m lmdeploy.serve.gradio.app {server_ip_addresss}:33337
+lmdeploy serve gradio {server_ip_address}:33337
```
For the deployment of other supported models, such as LLaMA, LLaMA-2, vicuna and so on, you can find the guide from [here](docs/en/serving.md)
@@ -200,7 +200,7 @@ For detailed instructions on Inference pytorch models, see [here](docs/en/pytorc
#### Single GPU
```shell
-python3 -m lmdeploy.pytorch.chat $NAME_OR_PATH_TO_HF_MODEL \
+lmdeploy chat torch $NAME_OR_PATH_TO_HF_MODEL \
--max_new_tokens 64 \
--temperature 0.8 \
--top_p 0.95 \
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 10c03bd1a..84f860ef3 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -120,14 +120,14 @@ git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internl
GIT_LFS_SKIP_SMUDGE=1
# 2. 转换为 turbomind 要求的格式。默认存放路径为 ./workspace
-python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b
+lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b
```
#### 使用 turbomind 推理
```shell
-python3 -m lmdeploy.turbomind.chat ./workspace
+lmdeploy chat turbomind ./workspace
```
> **Note**
@@ -140,7 +140,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace
#### 启动 gradio server
```shell
-python3 -m lmdeploy.serve.gradio.app ./workspace
+lmdeploy serve gradio ./workspace
```
![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab)
@@ -150,14 +150,14 @@ python3 -m lmdeploy.serve.gradio.app ./workspace
使用下面的命令启动推理服务:
```shell
-python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1
+lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1
```
你可以通过命令行方式与推理服务进行对话:
```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
-python -m lmdeploy.serve.openai.api_client restful_api_url
+lmdeploy serve api_client restful_api_url
```
也可以通过 WebUI 方式来对话:
@@ -165,8 +165,8 @@ python -m lmdeploy.serve.openai.api_client restful_api_url
```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
# server_ip and server_port here are for gradio ui
-# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True
-python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
+# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 --restful_api True
+lmdeploy serve gradio restful_api_url --server_name ${server_ip} --server_port ${server_port} --restful_api True
```
更多详情可以查阅 [restful_api.md](docs/zh_cn/restful_api.md)。
@@ -182,13 +182,13 @@ bash workspace/service_docker_up.sh
你可以通过命令行方式与推理服务进行对话:
```shell
-python3 -m lmdeploy.serve.client {server_ip_addresss}:33337
+lmdeploy serve triton_client {server_ip_address}:33337
```
也可以通过 WebUI 方式来对话:
```shell
-python3 -m lmdeploy.serve.gradio.app {server_ip_addresss}:33337
+lmdeploy serve gradio {server_ip_address}:33337
```
其他模型的部署方式,比如 LLaMA,LLaMA-2,vicuna等等,请参考[这里](docs/zh_cn/serving.md)
@@ -204,7 +204,7 @@ pip install deepspeed
#### 单个 GPU
```shell
-python3 -m lmdeploy.pytorch.chat $NAME_OR_PATH_TO_HF_MODEL\
+lmdeploy chat torch $NAME_OR_PATH_TO_HF_MODEL\
--max_new_tokens 64 \
--temperature 0.8 \
--top_p 0.95 \
diff --git a/docs/en/kv_int8.md b/docs/en/kv_int8.md
index 1f5f5aa12..5dcf43ba6 100644
--- a/docs/en/kv_int8.md
+++ b/docs/en/kv_int8.md
@@ -18,7 +18,7 @@ dequant: f = q * scale + zp
Convert the Hugging Face model format to the TurboMind inference format to create a workspace directory.
```bash
-python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b
+lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b
```
If you already have a workspace directory, skip this step.
@@ -29,7 +29,7 @@ Get the quantization parameters by these two steps:
```bash
# get minmax
-python3 -m lmdeploy.lite.apis.calibrate \
+lmdeploy lite calibrate \
--model $HF_MODEL \
--calib_dataset 'c4' \ # Support c4, ptb, wikitext2, pileval
--calib_samples 128 \ # Number of samples in the calibration set, if the memory is not enough, it can be adjusted appropriately
@@ -37,7 +37,7 @@ python3 -m lmdeploy.lite.apis.calibrate \
--work_dir $WORK_DIR \ # Directory for saving quantized statistical parameters and quantized weights in Pytorch format
# get quant parameters
-python3 -m lmdeploy.lite.apis.kv_qparams \
+lmdeploy lite kv_qparams \
--work_dir $WORK_DIR \ # Directory of the last output
--turbomind_dir workspace/triton_models/weights/ \ # Directory to save the quantization parameters
--kv_sym False \ # Symmetric or asymmetric quantization, default is False
@@ -64,7 +64,7 @@ Considering there are four combinations of kernels needed to be implemented, pre
Test the chat performance.
```bash
-python3 -m lmdeploy.turbomind.chat ./workspace
+lmdeploy chat turbomind ./workspace
```
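
The asymmetric scheme above reduces to the `dequant: f = q * scale + zp` relation quoted at the top of kv_int8.md. A minimal NumPy sketch of that round trip, with illustrative helper names rather than the actual TurboMind kernels:

```python
# Illustrative sketch of asymmetric int8 KV quantization following the
# formula `dequant: f = q * scale + zp`. Not the TurboMind implementation.
import numpy as np


def quant_params(x: np.ndarray):
    """Derive scale/zero-point from observed min/max statistics."""
    qmin, qmax = -128, 127
    scale = (x.max() - x.min()) / (qmax - qmin)
    zp = x.min() - qmin * scale          # maps x.min() to qmin
    return scale, zp


def quantize(x, scale, zp):
    return np.clip(np.round((x - zp) / scale), -128, 127).astype(np.int8)


def dequantize(q, scale, zp):
    return q.astype(np.float32) * scale + zp


kv = np.random.randn(4, 128).astype(np.float32)   # stand-in for K/V activations
scale, zp = quant_params(kv)
restored = dequantize(quantize(kv, scale, zp), scale, zp)
print('max abs error:', np.abs(kv - restored).max())
```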
## GPU Memory Test
diff --git a/docs/en/pytorch.md b/docs/en/pytorch.md
index e3662ab37..e4cd5a9cb 100644
--- a/docs/en/pytorch.md
+++ b/docs/en/pytorch.md
@@ -9,13 +9,13 @@ This submodule allow user to chat with language model through command line, and
**Example 1**: Chat with default setting
```shell
-python -m lmdeploy.pytorch.chat $PATH_TO_HF_MODEL
+lmdeploy chat torch $PATH_TO_HF_MODEL
```
**Example 2**: Disable sampling and chat history
```shell
-python -m lmdeploy.pytorch.chat \
+lmdeploy chat torch \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--temperature 0 --max-history 0
```
@@ -23,7 +23,7 @@ python -m lmdeploy.pytorch.chat \
**Example 3**: Accelerate with deepspeed inference
```shell
-python -m lmdeploy.pytorch.chat \
+lmdeploy chat torch \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--accel deepspeed
```
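
These console commands are thin wrappers: per the new `lmdeploy/cli/chat.py` in this patch, `lmdeploy chat torch` forwards its arguments to `lmdeploy.pytorch.chat.main`, so the same chat session can be driven from Python. A small sketch (the model path is a placeholder):

```python
# Programmatic equivalent of `lmdeploy chat torch ...`, calling the same
# function the sub-command wraps (see lmdeploy/cli/chat.py in this patch).
from lmdeploy.pytorch.chat import main as run_torch_model

run_torch_model('/path/to/hf/model',   # placeholder path
                max_new_tokens=64,
                temperature=0.8,
                top_p=0.95,
                seed=0)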
diff --git a/docs/en/restful_api.md b/docs/en/restful_api.md
index cb70e2637..a66859c0c 100644
--- a/docs/en/restful_api.md
+++ b/docs/en/restful_api.md
@@ -3,7 +3,7 @@
### Launch Service
```shell
-python3 -m lmdeploy.serve.openai.api_server ./workspace 0.0.0.0 server_port --instance_num 32 --tp 1
+lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1
```
Then, the user can open the swagger UI: `http://{server_ip}:{server_port}` for the detailed api usage.
@@ -125,7 +125,7 @@ There is a client script for restful api server.
```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
-python -m lmdeploy.serve.openai.api_client restful_api_url
+lmdeploy serve api_client restful_api_url
```
### webui
@@ -135,8 +135,8 @@ You can also test restful-api through webui.
```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
# server_ip and server_port here are for gradio ui
-# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True
-python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
+# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 --restful_api True
+lmdeploy serve gradio restful_api_url --server_name ${server_ip} --server_port ${server_port} --restful_api True
```
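
Since the launched service is the OpenAI-style server (`lmdeploy.serve.openai.api_server`), a plain HTTP client can also talk to it. A hedged sketch with `requests`; the `/v1/chat/completions` route and payload fields below are assumptions based on the OpenAI-compatible interface, so check the swagger UI at `http://{server_ip}:{server_port}` for the authoritative routes:

```python
# Hedged sketch: query the OpenAI-style server over plain HTTP. The route and
# payload fields are assumptions; verify them in the api_server swagger UI.
import requests

api_base = 'http://localhost:23333'          # printed by api_server
resp = requests.post(
    f'{api_base}/v1/chat/completions',       # assumed OpenAI-compatible route
    json={
        'model': 'internlm-chat-7b',         # placeholder model name
        'messages': [{'role': 'user', 'content': 'Hi, please introduce yourself'}],
    },
    timeout=60,
)
print(resp.json())
```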
### FAQ
diff --git a/docs/en/serving.md b/docs/en/serving.md
index 1e6f783d7..6cc18018d 100644
--- a/docs/en/serving.md
+++ b/docs/en/serving.md
@@ -8,7 +8,7 @@ You can download [llama-2 models from huggingface](https://huggingface.co/meta-l
7B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-7b-chat-hf
+lmdeploy convert llama2 /path/to/llama-2-7b-chat-hf
bash workspace/service_docker_up.sh
```
@@ -18,7 +18,7 @@ bash workspace/service_docker_up.sh
13B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-13b-chat-hf --tp 2
+lmdeploy convert llama2 /path/to/llama-2-13b-chat-hf --tp 2
bash workspace/service_docker_up.sh
```
@@ -28,7 +28,7 @@ bash workspace/service_docker_up.sh
70B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-70b-chat-hf --tp 8
+lmdeploy convert llama2 /path/to/llama-2-70b-chat-hf --tp 8
bash workspace/service_docker_up.sh
```
@@ -42,7 +42,7 @@ Weights for the LLaMA models can be obtained from by filling out [this form](htt
7B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-7b llama \
+lmdeploy convert llama /path/to/llama-7b llama \
--tokenizer_path /path/to/tokenizer/model
bash workspace/service_docker_up.sh
```
@@ -53,7 +53,7 @@ bash workspace/service_docker_up.sh
13B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-13b llama \
+lmdeploy convert llama /path/to/llama-13b llama \
--tokenizer_path /path/to/tokenizer/model --tp 2
bash workspace/service_docker_up.sh
```
@@ -64,7 +64,7 @@ bash workspace/service_docker_up.sh
30B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-30b llama \
+lmdeploy convert llama /path/to/llama-30b llama \
--tokenizer_path /path/to/tokenizer/model --tp 4
bash workspace/service_docker_up.sh
```
@@ -75,7 +75,7 @@ bash workspace/service_docker_up.sh
65B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-65b llama \
+lmdeploy convert llama /path/to/llama-65b llama \
--tokenizer_path /path/to/tokenizer/model --tp 8
bash workspace/service_docker_up.sh
```
@@ -94,7 +94,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-7b \
--delta-path lmsys/vicuna-7b-delta-v1.1
-python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-7b
+lmdeploy convert vicuna /path/to/vicuna-7b
bash workspace/service_docker_up.sh
```
@@ -110,7 +110,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-13b \
--delta-path lmsys/vicuna-13b-delta-v1.1
-python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-13b
+lmdeploy convert vicuna /path/to/vicuna-13b
bash workspace/service_docker_up.sh
```
diff --git a/docs/en/supported_models/codellama.md b/docs/en/supported_models/codellama.md
index 1b5140205..886dc5922 100644
--- a/docs/en/supported_models/codellama.md
+++ b/docs/en/supported_models/codellama.md
@@ -29,7 +29,7 @@ Based on the above table, download the model that meets your requirements. Execu
python3 -m pip install lmdeploy
# convert weight layout
-python3 -m lmdeploy.serve.turbomind.deploy codellama /the/path/of/codellama/model
+lmdeploy convert codellama /the/path/of/codellama/model
```
Then, you can communicate with codellama in the console by following the instructions in the next sections
@@ -42,13 +42,13 @@ Then, you can communicate with codellama in consolo by following instructions in
### Completion
```shell
-python3 -m lmdeploy.turbomind.chat ./workspace --cap completion
+lmdeploy chat turbomind ./workspace --cap completion
```
### Infilling
```shell
-python3 -m lmdeploy.turbomind.chat ./workspace --cap infilling
+lmdeploy chat turbomind ./workspace --cap infilling
```
The input code is supposed to have a special placeholder ``. For example,
diff --git a/docs/zh_cn/supported_models/codellama.md b/docs/zh_cn/supported_models/codellama.md
index ca9029a52..a2abd2f4a 100644
--- a/docs/zh_cn/supported_models/codellama.md
+++ b/docs/zh_cn/supported_models/codellama.md
@@ -29,7 +29,7 @@
python3 -m pip install lmdeploy
# 转模型格式
-python3 -m lmdeploy.serve.turbomind.deploy codellama /path/of/codellama/model
+lmdeploy convert codellama /path/of/codellama/model
```
接下来,可参考如下章节,在控制台与 codellama 进行交互式对话。
@@ -42,13 +42,13 @@ python3 -m lmdeploy.serve.turbomind.deploy codellama /path/of/codellama/model
### 代码续写
```shell
-python3 -m lmdeploy.turbomind.chat ./workspace --cap completion
+lmdeploy chat turbomind ./workspace --cap completion
```
### 代码填空
```shell
-python3 -m lmdeploy.turbomind.chat ./workspace --cap infilling
+lmdeploy chat turbomind ./workspace --cap infilling
```
输入的代码块中要包含 ``,比如:
@@ -64,7 +64,7 @@ def remove_non_ascii(s: str) -> str:
### 对话
```
-python3 -m lmdeploy.turbomind.chat ./workspace --cap chat --sys-instruct "Provide answers in Python"
+lmdeploy chat turbomind ./workspace --cap chat --sys-instruct "Provide answers in Python"
```
可以把 `--sys-instruct` 的指令换成 codellama 支持的其他编程语言。
@@ -72,7 +72,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace --cap chat --sys-instruct "Provid
### Python 专项
```
-python3 -m lmdeploy.turbomind.chat ./workspace --cap python
+lmdeploy chat turbomind ./workspace --cap python
```
建议这里部署 Python 微调模型
@@ -90,7 +90,7 @@ TBD
```shell
# --instance_num: turbomind推理实例的个数。可理解为支持的最大并发数
# --tp: 在 tensor parallel时,使用的GPU数量
-python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1
+lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1
```
打开 `http://{server_ip}:{server_port}`,即可访问 swagger,查阅 RESTful API 的详细信息。
@@ -99,7 +99,7 @@ python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --
```shell
# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333
-python -m lmdeploy.serve.openai.api_client restful_api_url
+lmdeploy serve api_client restful_api_url
```
或者,启动 gradio,在 webui 的聊天对话框中,与 codellama 交流:
@@ -107,8 +107,8 @@ python -m lmdeploy.serve.openai.api_client restful_api_url
```shell
# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333
# server_ip 和 server_port 是用来提供 gradio ui 访问服务的
-# 例子: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True
-python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
+# 例子: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 --restful_api True
+lmdeploy serve gradio restful_api_url --server_name ${server_ip} --server_port ${server_port} --restful_api True
```
关于 RESTful API的详细介绍,请参考[这份](../restful_api.md)文档。
diff --git a/docs/zh_cn/w4a16.md b/docs/zh_cn/w4a16.md
index d28cb716d..46f5c58a9 100644
--- a/docs/zh_cn/w4a16.md
+++ b/docs/zh_cn/w4a16.md
@@ -24,14 +24,14 @@ git clone https://huggingface.co/lmdeploy/llama2-chat-7b-w4
```shell
## 转换模型的layout,存放在默认路径 ./workspace 下
-python3 -m lmdeploy.serve.turbomind.deploy \
+lmdeploy convert \
--model-name llama2 \
--model-path ./llama2-chat-7b-w4 \
--model-format awq \
--group-size 128
## 推理
-python3 -m lmdeploy.turbomind.chat ./workspace
+lmdeploy chat turbomind ./workspace
```
## 启动 gradio 服务
@@ -39,7 +39,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace
如果想通过 webui 与模型对话,请执行以下命令启动 gradio 服务
```shell
-python3 -m lmdeploy.serve.turbomind ./workspace --server_name {ip_addr} ----server_port {port}
+lmdeploy serve gradio ./workspace --server_name {ip_addr} --server_port {port}
```
然后,在浏览器中打开 http://{ip_addr}:{port},即可在线对话
@@ -80,7 +80,7 @@ python benchmark/profile_generation.py \
### 第一步:生成量化参数
```shell
-python3 -m lmdeploy.lite.apis.calibrate \
+lmdeploy lite calibrate \
--model $HF_MODEL \
--calib_dataset 'c4' \ # 校准数据集,支持 c4, ptb, wikitext2, pileval
--calib_samples 128 \ # 校准集的样本数,如果显存不够,可以适当调小
@@ -93,7 +93,7 @@ python3 -m lmdeploy.lite.apis.calibrate \
LMDeploy 使用 AWQ 算法对模型权重进行量化。在执行下面的命令时,需要把步骤1的`$WORK_DIR`传入。量化结束后,权重文件也会存放在这个目录中。然后就可以根据 ["4bit权重模型推理"](#4bit-权重模型推理)章节的说明,进行模型推理。
```shell
-python3 -m lmdeploy.lite.apis.auto_awq \
+lmdeploy lite auto_awq \
--model $HF_MODEL \
--w_bits 4 \ # 权重量化的 bit 数
--w_group_size 128 \ # 权重量化分组统计尺寸
diff --git a/lmdeploy/cli/__init__.py b/lmdeploy/cli/__init__.py
new file mode 100644
index 000000000..3575bec5b
--- /dev/null
+++ b/lmdeploy/cli/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .cli import run
+
+__all__ = ['run']
diff --git a/lmdeploy/cli/chat.py b/lmdeploy/cli/chat.py
new file mode 100644
index 000000000..735b24c7c
--- /dev/null
+++ b/lmdeploy/cli/chat.py
@@ -0,0 +1,90 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+
+class SubCliChat(object):
+ """Chat through terminal with pytorch or turbomind model."""
+
+ def torch(self,
+ model_path: str,
+ tokenizer_path: Optional[str] = None,
+ accel: Optional[str] = None,
+ max_new_tokens: int = 128,
+ temperature: float = 0.8,
+ top_p: float = 0.95,
+ seed: int = 0,
+ use_fast_tokenizer: bool = True,
+ max_alloc: int = 2048,
+ max_session_len: int = None,
+ log_file: Optional[str] = None,
+ debug: bool = False,
+ adapter: Optional[str] = None):
+ """Chat with pytorch model through terminal.
+
+ Args:
+ model_path (str): Path to pytorch model.
+ tokenizer_path (str): Path to tokenizer.
+ accel (str): Model accelerator.
+ max_new_tokens (int): Maximum number of tokens to generate.
+ temperature (float): Temperature for sampling.
+ top_p (float): Top p for sampling.
+ seed (int): Random seed.
+ use_fast_tokenizer (bool): Whether to use fast tokenizer.
+                This argument is passed directly to transformers'
+                ``AutoTokenizer.from_pretrained``.
+                Generally, users should use the fast tokenizer.
+                But if the fast tokenizer raises an error, try forcing a slow one.
+ max_alloc (int): Maximum memory to allocate (for deepspeed).
+ max_session_len (int): Maximum number of tokens allowed for all chat sessions.
+                This includes both the history and the current session.
+ log_file (str): Path to log file.
+ debug (bool): Whether to enable debug mode.
+            adapter (str): Force the use of a specific adapter.
+                Generally, users should not set this argument because the adapter
+                is selected based on the model type. It is only required when that
+                is impossible, e.g. when distinguishing llama 1/2 from the
+                `LlamaForCausalLM` class alone. Currently, only "llama1" is
+                accepted for llama1 models.
+ """ # noqa: E501
+ from lmdeploy.pytorch.chat import main as run_torch_model
+
+ run_torch_model(model_path,
+ tokenizer_path=tokenizer_path,
+ accel=accel,
+ max_new_tokens=max_new_tokens,
+ temperature=temperature,
+ top_p=top_p,
+ seed=seed,
+ use_fast_tokenizer=use_fast_tokenizer,
+ max_alloc=max_alloc,
+ max_session_len=max_session_len,
+ log_file=log_file,
+ debug=debug,
+ adapter=adapter)
+
+ def turbomind(self,
+ model_path,
+ session_id: int = 1,
+ cap: str = 'chat',
+ tp=1,
+ stream_output=True,
+ **kwargs):
+ """Chat with turbomind model through terminal.
+
+ Args:
+ model_path (str): the path of the deployed model
+ session_id (int): the identical id of a session
+ cap (str): the capability of a model. For example, codellama has
+ the ability among ['completion', 'infilling', 'chat', 'python']
+ tp (int): GPU number used in tensor parallelism
+ stream_output (bool): indicator for streaming output or not
+            **kwargs (dict): other arguments for initializing the model's chat
+ template
+ """
+ from lmdeploy.turbomind.chat import main as run_turbomind_model
+
+ run_turbomind_model(model_path,
+ session_id=session_id,
+ cap=cap,
+ tp=tp,
+ stream_output=stream_output,
+ **kwargs)
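
As with the torch sub-command, `turbomind` simply forwards to the existing entry point, so the same chat can be driven from Python. A sketch with a placeholder workspace path:

```python
# Programmatic equivalent of `lmdeploy chat turbomind ./workspace --tp 2`,
# using the same function SubCliChat.turbomind forwards to.
from lmdeploy.turbomind.chat import main as run_turbomind_model

run_turbomind_model('./workspace',   # converted model dir (placeholder)
                    session_id=1,
                    cap='chat',
                    tp=2,
                    stream_output=True)
```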
diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py
new file mode 100644
index 000000000..7b2634b53
--- /dev/null
+++ b/lmdeploy/cli/cli.py
@@ -0,0 +1,61 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import fire
+
+from .chat import SubCliChat
+from .lite import SubCliLite
+from .serve import SubCliServe
+
+
+class CLI(object):
+ """LMDeploy Command Line Interface.
+
+ The CLI provides a unified API for converting, compressing and deploying
+ large language models.
+ """
+
+ def convert(self,
+ model_name: str,
+ model_path: str,
+ model_format: str = None,
+ tokenizer_path: str = None,
+ dst_path: str = './workspace',
+ tp: int = 1,
+ quant_path: str = None,
+ group_size: int = 0):
+ """Convert LLMs to lmdeploy format.
+
+ Args:
+ model_name (str): The name of the to-be-deployed model, such as
+                llama-7b, llama-13b, vicuna-7b, etc.
+ model_path (str): The directory path of the model
+ model_format (str): The format of the model, fb or hf. 'fb' stands
+ for META's llama format, and 'hf' means huggingface format.
+ tokenizer_path (str): The path of tokenizer model.
+ dst_path (str): The destination path that saves outputs.
+ tp (int): The number of GPUs used for tensor parallelism, which
+ should be 2^n.
+ quant_path (str): Path of the quantized model, which can be None.
+ group_size (int): A parameter used in AWQ to quantize fp16 weights
+ to 4 bits.
+ """
+ from lmdeploy.serve.turbomind.deploy import main as convert
+
+ convert(model_name,
+ model_path,
+ model_format=model_format,
+ tokenizer_path=tokenizer_path,
+ dst_path=dst_path,
+ tp=tp,
+ quant_path=quant_path,
+ group_size=group_size)
+
+
+def run():
+ """The entry point of running LMDeploy CLI."""
+
+ cli = CLI()
+ cli.lite = SubCliLite()
+ cli.chat = SubCliChat()
+ cli.serve = SubCliServe()
+
+ fire.Fire(cli, name='lmdeploy')
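
For context on how this nesting turns into subcommands: `fire.Fire` exposes attributes as command groups and methods as commands, so `cli.chat.turbomind(...)` becomes `lmdeploy chat turbomind ...`. A self-contained toy with the same shape (class and method names here are illustrative, not part of the package):

```python
# Toy illustration of the python-fire mapping used by lmdeploy.cli.
import fire


class Chat:
    """Toy command group."""

    def turbomind(self, model_path: str, tp: int = 1):
        print(f'would chat with {model_path} using tp={tp}')


class Demo:
    """Toy top-level CLI mirroring the structure of lmdeploy.cli.CLI."""

    def __init__(self):
        self.chat = Chat()

    def convert(self, model_name: str, model_path: str):
        print(f'would convert {model_name} at {model_path}')


if __name__ == '__main__':
    # `python demo.py chat turbomind ./workspace --tp 2` behaves like
    # `lmdeploy chat turbomind ./workspace --tp 2`
    fire.Fire(Demo(), name='lmdeploy')
```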
diff --git a/lmdeploy/cli/lite.py b/lmdeploy/cli/lite.py
new file mode 100644
index 000000000..4302765e2
--- /dev/null
+++ b/lmdeploy/cli/lite.py
@@ -0,0 +1,100 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+
+class SubCliLite(object):
+ """CLI for compressing LLMs."""
+
+ def auto_awq(self,
+ model: str,
+ work_dir: str,
+ w_bits: int = 4,
+ w_sym: bool = False,
+ w_group_size: int = 128,
+ device: str = 'cuda'):
+ """Perform weight quantization using AWQ algorithm.
+
+ Args:
+ model (str): The path of model in hf format.
+ work_dir (str): The working directory to save results.
+ w_bits (int): Bit number for weight quantization.
+ w_sym (bool): Whether to do symmetric quantization.
+ w_group_size (int): Group size for weight quantization statistics.
+ device (str): Device type of running.
+ """
+ from lmdeploy.lite.apis.auto_awq import auto_awq
+
+ auto_awq(model,
+ work_dir,
+ w_bits=w_bits,
+ w_sym=w_sym,
+ w_group_size=w_group_size,
+ device=device)
+
+ def calibrate(self,
+ model: str,
+ calib_dataset: str = 'c4',
+ calib_samples: int = 128,
+ calib_seqlen: int = 2048,
+ work_dir: str = './work_dir',
+ device: str = 'cuda') -> None:
+ """Perform calibration on a given dataset.
+
+ Args:
+ model (str): The model to be loaded.
+ calib_dataset (str, optional): The calibration dataset name.
+ Defaults to 'c4'.
+ calib_samples (int, optional): The number of samples for
+ calibration. Defaults to 128.
+ calib_seqlen (int, optional): The sequence length for calibration.
+ Defaults to 2048.
+ work_dir (str): The working directory for outputs.
+ Defaults to './work_dir'.
+ device (str, optional): The device to be used for calculation.
+ Defaults to 'cuda'.
+ """
+ from lmdeploy.lite.apis.calibrate import calibrate
+
+ calibrate(model,
+ calib_dataset=calib_dataset,
+ calib_samples=calib_samples,
+ calib_seqlen=calib_seqlen,
+ work_dir=work_dir,
+ device=device)
+
+ def kv_qparams(self,
+ work_dir: str,
+ turbomind_dir: str,
+ kv_bits: int = 8,
+ kv_sym: bool = False,
+ num_tp: int = 1) -> None:
+ """Export key and value stats.
+
+ Args:
+ work_dir (str): Directory path where the stats
+ are saved.
+ turbomind_dir (str): Directory path where to
+ save the results.
+ kv_bits (int, optional): Number of bits for quantization.
+ Defaults to 8.
+ kv_sym (bool, optional): Whether to use symmetric quantization.
+ Defaults to False.
+ num_tp (int, optional): Number of tensor parallelism.
+ Defaults to 1.
+ """
+ from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams
+
+ run_kv_qparams(work_dir,
+ turbomind_dir,
+ kv_bits=kv_bits,
+ kv_sym=kv_sym,
+ num_tp=num_tp)
+
+ def get_small_sharded_hf(self, src_dir: str, dst_dir: str):
+ """Convert a hugging face model to the smallest sharded one.
+
+ Args:
+ src_dir (str): The directory of the input HF model.
+ dst_dir (str): The directory to save new model.
+ """
+ from lmdeploy.lite.apis.get_small_sharded_hf import main as run_sharded
+ run_sharded(src_dir, dst_dir)
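
The two-step KV quantization flow from docs/en/kv_int8.md maps onto these wrappers one-to-one, so it can also be run from Python. A sketch with placeholder paths:

```python
# Programmatic equivalent of `lmdeploy lite calibrate` followed by
# `lmdeploy lite kv_qparams`, calling the functions the sub-commands wrap.
from lmdeploy.lite.apis.calibrate import calibrate
from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams

calibrate('/path/to/hf/model',                 # placeholder HF model path
          calib_dataset='c4',
          calib_samples=128,
          calib_seqlen=2048,
          work_dir='./work_dir')
run_kv_qparams('./work_dir',                   # stats from the step above
               'workspace/triton_models/weights/',
               kv_bits=8,
               kv_sym=False,
               num_tp=1)
```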
diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py
new file mode 100644
index 000000000..0bff69c31
--- /dev/null
+++ b/lmdeploy/cli/serve.py
@@ -0,0 +1,122 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+
+class SubCliServe(object):
+ """Serve LLMs and interact on terminal or web UI."""
+
+ def gradio(self,
+ model_path_or_server: str,
+ server_name: str = 'localhost',
+ server_port: int = 6006,
+ batch_size: int = 32,
+ tp: int = 1,
+ restful_api: bool = False):
+ """Serve LLMs with web ui using gradio.
+
+ Example 1:
+ lmdeploy serve gradio ./workspace
+
+ Example 2:
+ lmdeploy serve gradio http://localhost:23333
+ --server_name localhost
+ --server_port 6006
+ --restful_api True
+
+ Example 3:
+            lmdeploy serve gradio ${triton_server_ip_address}:33337
+
+ Args:
+ model_path_or_server (str): the path of the deployed model or the
+                tritonserver URL or restful api URL. The former directly runs
+                the service with gradio. The latter runs against tritonserver
+                by default; if the URL is a restful api URL, also enable the
+                `restful_api` flag.
+ server_name (str): the ip address of gradio server
+ server_port (int): the port of gradio server
+ batch_size (int): batch size for running Turbomind directly
+ tp (int): tensor parallel for Turbomind
+ restful_api (bool): a flag for model_path_or_server
+ """
+ from lmdeploy.serve.gradio.app import run
+ run(model_path_or_server,
+ server_name=server_name,
+ server_port=server_port,
+ batch_size=batch_size,
+ tp=tp,
+ restful_api=restful_api)
+
+ def api_server(self,
+ model_path: str,
+ server_name: str = 'localhost',
+ server_port: int = 23333,
+ instance_num: int = 32,
+ tp: int = 1,
+ allow_origins: List[str] = ['*'],
+ allow_credentials: bool = True,
+ allow_methods: List[str] = ['*'],
+ allow_headers: List[str] = ['*']):
+ """Serve LLMs with restful api using fastapi.
+
+ Args:
+ model_path (str): the path of the deployed model
+ server_name (str): host ip for serving
+ server_port (int): server port
+ instance_num (int): number of instances of turbomind model
+ tp (int): tensor parallel
+ allow_origins (List[str]): a list of allowed origins for CORS
+ allow_credentials (bool): whether to allow credentials for CORS
+ allow_methods (List[str]): a list of allowed HTTP methods for CORS
+ allow_headers (List[str]): a list of allowed HTTP headers for CORS
+ """
+ from lmdeploy.serve.openai.api_server import main as run_api_server
+
+ run_api_server(model_path,
+ server_name=server_name,
+ server_port=server_port,
+ instance_num=instance_num,
+ tp=tp,
+ allow_origins=allow_origins,
+ allow_credentials=allow_credentials,
+ allow_methods=allow_methods,
+ allow_headers=allow_headers)
+
+ def api_client(self, restful_api_url: str, session_id: int = 0):
+ """Interact with restful api server in terminal.
+
+ Args:
+ restful_api_url: The restful api URL.
+ session_id: The identical id of a session.
+ """
+ from lmdeploy.serve.openai.api_client import main as run_api_client
+ run_api_client(restful_api_url, session_id=session_id)
+
+ def triton_client(self,
+ tritonserver_addr: str,
+ session_id: int = 1,
+ cap: str = 'chat',
+ stream_output: bool = True,
+ **kwargs):
+ """Interact with Triton Server using gRPC protocol.
+
+ Args:
+ tritonserver_addr (str): the address in format "ip:port" of
+ triton inference server
+ session_id (int): the identical id of a session
+ cap (str): the capability of a model. For example, codellama
+ has the ability among ['completion', 'infill', 'instruct',
+ 'python']
+ stream_output (bool): indicator for streaming output or not
+ **kwargs (dict): other arguments for initializing model's
+ chat template
+ """
+
+ from lmdeploy.serve.client import main as run_triton_client
+
+ run_triton_client(
+ tritonserver_addr,
+ session_id=session_id,
+ cap=cap,
+ stream_output=stream_output,
+ **kwargs,
+ )
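
`api_server` likewise forwards to the existing FastAPI entry point, so the server can be launched from Python with the same defaults. A sketch; the workspace path and port are placeholders:

```python
# Programmatic equivalent of `lmdeploy serve api_server ./workspace
# --server_name 0.0.0.0 --server_port 23333 --instance_num 32 --tp 1`.
from lmdeploy.serve.openai.api_server import main as run_api_server

run_api_server('./workspace',        # converted model dir (placeholder)
               server_name='0.0.0.0',
               server_port=23333,
               instance_num=32,
               tp=1)
```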
diff --git a/lmdeploy/lite/apis/auto_awq.py b/lmdeploy/lite/apis/auto_awq.py
index 3517f51b8..250defb59 100644
--- a/lmdeploy/lite/apis/auto_awq.py
+++ b/lmdeploy/lite/apis/auto_awq.py
@@ -2,7 +2,6 @@
from pathlib import Path
-import fire
import torch
from accelerate import (infer_auto_device_map, init_empty_weights,
load_checkpoint_in_model)
@@ -81,5 +80,6 @@ def auto_awq(model: str,
if __name__ == '__main__':
+ import fire
fire.Fire(auto_awq)
diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py
index 38b6429a1..3df252f06 100644
--- a/lmdeploy/lite/apis/calibrate.py
+++ b/lmdeploy/lite/apis/calibrate.py
@@ -2,7 +2,6 @@
from pathlib import Path
-import fire
import torch
from accelerate import (infer_auto_device_map, init_empty_weights,
load_checkpoint_in_model)
@@ -107,4 +106,6 @@ def calibrate(model: str,
if __name__ == '__main__':
+ import fire
+
fire.Fire(calibrate)
diff --git a/lmdeploy/lite/apis/kv_qparams.py b/lmdeploy/lite/apis/kv_qparams.py
index 7d43078da..f31fee029 100644
--- a/lmdeploy/lite/apis/kv_qparams.py
+++ b/lmdeploy/lite/apis/kv_qparams.py
@@ -2,7 +2,6 @@
from pathlib import Path
from typing import Union
-import fire
import numpy as np
import torch
@@ -120,5 +119,6 @@ def main(work_dir: str,
if __name__ == '__main__':
+ import fire
fire.Fire(main)
diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index 18da39a6e..39451acdc 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -654,4 +654,5 @@ def main(model_name: str = 'test'):
if __name__ == '__main__':
import fire
+
fire.Fire(main)
diff --git a/lmdeploy/pytorch/chat.py b/lmdeploy/pytorch/chat.py
index c30cf6ffe..2690480a8 100644
--- a/lmdeploy/pytorch/chat.py
+++ b/lmdeploy/pytorch/chat.py
@@ -51,7 +51,6 @@
import logging
from typing import Optional
-import fire
import torch
from transformers import GenerationConfig, PreTrainedModel
@@ -205,6 +204,8 @@ def main(
def cli():
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/serve/client.py b/lmdeploy/serve/client.py
index 9c0d3cb5c..424e83143 100644
--- a/lmdeploy/serve/client.py
+++ b/lmdeploy/serve/client.py
@@ -1,8 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os
-import fire
-
from lmdeploy.serve.turbomind.chatbot import Chatbot
@@ -66,4 +64,6 @@ def main(tritonserver_addr: str,
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/serve/gradio/app.py b/lmdeploy/serve/gradio/app.py
index 71db7a274..5c200517b 100644
--- a/lmdeploy/serve/gradio/app.py
+++ b/lmdeploy/serve/gradio/app.py
@@ -5,7 +5,6 @@
from functools import partial
from typing import Sequence
-import fire
import gradio as gr
from lmdeploy.serve.async_engine import AsyncEngine
@@ -525,7 +524,7 @@ def run(model_path_or_server: str,
server_port (int): the port of gradio server
batch_size (int): batch size for running Turbomind directly
tp (int): tensor parallel for Turbomind
- restufl_api (bool): a flag for model_path_or_server
+ restful_api (bool): a flag for model_path_or_server
"""
if ':' in model_path_or_server:
if restful_api:
@@ -539,4 +538,6 @@ def run(model_path_or_server: str,
if __name__ == '__main__':
+ import fire
+
fire.Fire(run)
diff --git a/lmdeploy/serve/openai/api_client.py b/lmdeploy/serve/openai/api_client.py
index a8718331b..26977bc6c 100644
--- a/lmdeploy/serve/openai/api_client.py
+++ b/lmdeploy/serve/openai/api_client.py
@@ -2,7 +2,6 @@
import json
from typing import Iterable, List
-import fire
import requests
@@ -89,4 +88,6 @@ def main(restful_api_url: str, session_id: int = 0):
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
index 94271c4b9..8324e3497 100644
--- a/lmdeploy/serve/openai/api_server.py
+++ b/lmdeploy/serve/openai/api_server.py
@@ -4,7 +4,6 @@
from http import HTTPStatus
from typing import AsyncGenerator, List, Optional
-import fire
import uvicorn
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
@@ -357,4 +356,6 @@ def main(model_path: str,
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/serve/turbomind/deploy.py b/lmdeploy/serve/turbomind/deploy.py
index 3aca6e1f1..ab8c9ea95 100644
--- a/lmdeploy/serve/turbomind/deploy.py
+++ b/lmdeploy/serve/turbomind/deploy.py
@@ -8,7 +8,6 @@
import sys
from pathlib import Path
-import fire
import safetensors
import torch
from safetensors.torch import load_file
@@ -1043,4 +1042,6 @@ def main(model_name: str,
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py
index ca9d9c34e..bf0ce7399 100644
--- a/lmdeploy/turbomind/chat.py
+++ b/lmdeploy/turbomind/chat.py
@@ -4,11 +4,7 @@
import os.path as osp
import random
-import fire
-
-from lmdeploy import turbomind as tm
from lmdeploy.model import MODELS
-from lmdeploy.tokenizer import Tokenizer
os.environ['TM_LOG_LEVEL'] = 'ERROR'
@@ -88,6 +84,9 @@ def main(model_path,
stream_output (bool): indicator for streaming output or not
**kwarg (dict): other arguments for initializing model's chat template
"""
+ from lmdeploy import turbomind as tm
+ from lmdeploy.tokenizer import Tokenizer
+
tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer')
tokenizer = Tokenizer(tokenizer_model_path)
tm_model = tm.TurboMind(model_path, eos_id=tokenizer.eos_token_id, tp=tp)
@@ -157,4 +156,6 @@ def main(model_path,
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/turbomind/decode.py b/lmdeploy/turbomind/decode.py
index daef35298..5ba4675c5 100644
--- a/lmdeploy/turbomind/decode.py
+++ b/lmdeploy/turbomind/decode.py
@@ -2,7 +2,6 @@
import os
import os.path as osp
-import fire
import torch
from lmdeploy import turbomind as tm
@@ -37,4 +36,6 @@ def main(model_path, inputs):
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/turbomind/generate_gemm_config.py b/lmdeploy/turbomind/generate_gemm_config.py
index 328f18215..9a4f0e8c4 100644
--- a/lmdeploy/turbomind/generate_gemm_config.py
+++ b/lmdeploy/turbomind/generate_gemm_config.py
@@ -2,8 +2,6 @@
import subprocess
-import fire
-
def get_llama_gemm():
import os.path as osp
@@ -30,4 +28,6 @@ def main(head_num: int = 32,
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
diff --git a/setup.py b/setup.py
index 09ae1e31c..df36118c2 100644
--- a/setup.py
+++ b/setup.py
@@ -121,26 +121,29 @@ def gen_packages_items():
if __name__ == '__main__':
lmdeploy_package_data = ['lmdeploy/bin/llama_gemm']
- setup(name='lmdeploy',
- version=get_version(),
- description='A toolset for compressing, deploying and serving LLM',
- long_description=readme(),
- long_description_content_type='text/markdown',
- author='OpenMMLab',
- author_email='openmmlab@gmail.com',
- packages=find_packages(exclude=()),
- package_data={
- 'lmdeploy': lmdeploy_package_data,
- },
- include_package_data=True,
- install_requires=parse_requirements('requirements.txt'),
- has_ext_modules=check_ext_modules,
- classifiers=[
- 'Programming Language :: Python :: 3.8',
- 'Programming Language :: Python :: 3.9',
- 'Programming Language :: Python :: 3.10',
- 'Programming Language :: Python :: 3.11',
- 'Intended Audience :: Developers',
- 'Intended Audience :: Education',
- 'Intended Audience :: Science/Research',
- ])
+ setup(
+ name='lmdeploy',
+ version=get_version(),
+ description='A toolset for compressing, deploying and serving LLM',
+ long_description=readme(),
+ long_description_content_type='text/markdown',
+ author='OpenMMLab',
+ author_email='openmmlab@gmail.com',
+ packages=find_packages(exclude=()),
+ package_data={
+ 'lmdeploy': lmdeploy_package_data,
+ },
+ include_package_data=True,
+ install_requires=parse_requirements('requirements.txt'),
+ has_ext_modules=check_ext_modules,
+ classifiers=[
+ 'Programming Language :: Python :: 3.8',
+ 'Programming Language :: Python :: 3.9',
+ 'Programming Language :: Python :: 3.10',
+ 'Programming Language :: Python :: 3.11',
+ 'Intended Audience :: Developers',
+ 'Intended Audience :: Education',
+ 'Intended Audience :: Science/Research',
+ ],
+ entry_points={'console_scripts': ['lmdeploy = lmdeploy.cli:run']},
+ )
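
The `console_scripts` entry added here is what replaces the `python3 -m lmdeploy.*` invocations throughout the docs above: installing the package generates an `lmdeploy` executable that simply calls `lmdeploy.cli.run`. A sketch of the manual equivalent:

```python
# What the `lmdeploy` console script resolves to after installation:
# it imports lmdeploy.cli.run and lets python-fire parse sys.argv.
from lmdeploy.cli import run

if __name__ == '__main__':
    run()   # e.g. `python this_file.py chat turbomind ./workspace`
```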
diff --git a/tests/test_lmdeploy/test_cli.py b/tests/test_lmdeploy/test_cli.py
new file mode 100644
index 000000000..a41eab442
--- /dev/null
+++ b/tests/test_lmdeploy/test_cli.py
@@ -0,0 +1,51 @@
+import inspect
+
+
+    """Check that a class method has the same arguments as a function."""
+ """Compare if a class method has same arguments as a function."""
+
+ argspec_cls = inspect.getfullargspec(class_method)
+ argspec_func = inspect.getfullargspec(function)
+ assert argspec_cls.args[1:] == argspec_func.args
+ assert argspec_cls.defaults == argspec_func.defaults
+ assert argspec_cls.annotations == argspec_func.annotations
+
+
+def test_cli():
+
+ from lmdeploy.cli.cli import CLI
+ from lmdeploy.serve.turbomind.deploy import main as convert
+ compare_func(CLI.convert, convert)
+
+
+def test_subcli_chat():
+ from lmdeploy.cli.chat import SubCliChat
+ from lmdeploy.pytorch.chat import main as run_torch_model
+ from lmdeploy.turbomind.chat import main as run_turbomind_model
+
+ compare_func(SubCliChat.torch, run_torch_model)
+ compare_func(SubCliChat.turbomind, run_turbomind_model)
+
+
+def test_subcli_lite():
+ from lmdeploy.cli.lite import SubCliLite
+ from lmdeploy.lite.apis.auto_awq import auto_awq
+ from lmdeploy.lite.apis.calibrate import calibrate
+ from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams
+
+ compare_func(SubCliLite.auto_awq, auto_awq)
+ compare_func(SubCliLite.calibrate, calibrate)
+ compare_func(SubCliLite.kv_qparams, run_kv_qparams)
+
+
+def test_subcli_serve():
+ from lmdeploy.cli.serve import SubCliServe
+ from lmdeploy.serve.client import main as run_triton_client
+ from lmdeploy.serve.gradio.app import run as run_gradio
+ from lmdeploy.serve.openai.api_client import main as run_api_client
+ from lmdeploy.serve.openai.api_server import main as run_api_server
+
+ compare_func(SubCliServe.gradio, run_gradio)
+ compare_func(SubCliServe.api_server, run_api_server)
+ compare_func(SubCliServe.api_client, run_api_client)
+ compare_func(SubCliServe.triton_client, run_triton_client)