diff --git a/README.md b/README.md index b4da5eda5..c65cff7e5 100644 --- a/README.md +++ b/README.md @@ -119,14 +119,14 @@ git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internl GIT_LFS_SKIP_SMUDGE=1 # 2. Convert InternLM model to turbomind's format, which will be in "./workspace" by default -python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b +lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b ``` #### Inference by TurboMind ```shell -python -m lmdeploy.turbomind.chat ./workspace +lmdeploy chat turbomind ./workspace ``` > **Note**
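The hunks above replace the per-module `python -m lmdeploy.*` entry points with subcommands of a single `lmdeploy` executable. Dispatch is handled by the new `lmdeploy/cli` package introduced later in this patch, which hands a small object tree to `fire`. The sketch below is illustrative only: the class names, methods, and arguments are simplified stand-ins for the real `CLI`/`SubCli*` classes and their full signatures.

```python
# Minimal sketch of the fire-based dispatch pattern this patch adopts.
# Illustrative only; the real classes live in lmdeploy/cli/ and expose
# many more options.
import fire


class Chat:
    """Group for `lmdeploy chat ...` subcommands."""

    def turbomind(self, model_path: str, tp: int = 1):
        # `lmdeploy chat turbomind ./workspace --tp 1` routes here.
        print(f'chat with turbomind model at {model_path}, tp={tp}')


class CLI:
    """Top-level `lmdeploy` command."""

    def convert(self, model_name: str, model_path: str):
        # `lmdeploy convert internlm-chat-7b /path/to/model` routes here.
        print(f'convert {model_name} from {model_path}')


def run():
    cli = CLI()
    cli.chat = Chat()  # the attribute name becomes the subcommand group
    fire.Fire(cli, name='lmdeploy')


if __name__ == '__main__':
    run()
```

Together with the `console_scripts` entry point added in `setup.py` (`lmdeploy = lmdeploy.cli:run`), installing the package makes the `lmdeploy` command used throughout these docs available on the PATH.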
@@ -140,7 +140,7 @@ python -m lmdeploy.turbomind.chat ./workspace #### Serving with gradio ```shell -python3 -m lmdeploy.serve.gradio.app ./workspace +lmdeploy serve gradio ./workspace ``` ![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab) @@ -150,14 +150,14 @@ python3 -m lmdeploy.serve.gradio.app ./workspace Launch inference server by: ```shell -python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1 +lmdeploy serve api_server ./workspace --instance_num 32 --tp 1 ``` Then, you can communicate with it by command line, ```shell # restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 -python -m lmdeploy.serve.openai.api_client restful_api_url +lmdeploy serve api_client restful_api_url ``` or webui, @@ -165,8 +165,8 @@ or webui, ```shell # restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 # server_ip and server_port here are for gradio ui -# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True -python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True +# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 --restful_api True +lmdeploy serve gradio restful_api_url --server_name ${server_ip} --server_port ${server_port} --restful_api True ``` Refer to [restful_api.md](docs/en/restful_api.md) for more details. @@ -182,13 +182,13 @@ bash workspace/service_docker_up.sh Then, you can communicate with the inference server by command line, ```shell -python3 -m lmdeploy.serve.client {server_ip_addresss}:33337 +lmdeploy serve triton_client {server_ip_addresss}:33337 ``` or webui, ```shell -python3 -m lmdeploy.serve.gradio.app {server_ip_addresss}:33337 +lmdeploy serve gradio {server_ip_addresss}:33337 ``` For the deployment of other supported models, such as LLaMA, LLaMA-2, vicuna and so on, you can find the guide from [here](docs/en/serving.md) @@ -200,7 +200,7 @@ For detailed instructions on Inference pytorch models, see [here](docs/en/pytorc #### Single GPU ```shell -python3 -m lmdeploy.pytorch.chat $NAME_OR_PATH_TO_HF_MODEL \ +lmdeploy chat torch $NAME_OR_PATH_TO_HF_MODEL \ --max_new_tokens 64 \ --temperture 0.8 \ --top_p 0.95 \ diff --git a/README_zh-CN.md b/README_zh-CN.md index 10c03bd1a..84f860ef3 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -120,14 +120,14 @@ git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internl GIT_LFS_SKIP_SMUDGE=1 # 2. 转换为 trubomind 要求的格式。默认存放路径为 ./workspace -python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b +lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b ``` #### 使用 turbomind 推理 ```shell -python3 -m lmdeploy.turbomind.chat ./workspace +lmdeploy chat turbomind ./workspace ``` > **Note**
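Per `lmdeploy/cli/serve.py` added later in this patch, `lmdeploy serve api_client` is a thin wrapper around `lmdeploy.serve.openai.api_client.main`, so the same interactive client shown in the serving hunks above can be started from Python. A minimal sketch, assuming an `api_server` is already listening at the placeholder URL below:

```python
# Sketch: equivalent of `lmdeploy serve api_client http://localhost:23333`.
# The URL is a placeholder; an api_server must already be running there.
from lmdeploy.serve.openai.api_client import main as api_client_main

# Starts the same interactive terminal session as the CLI subcommand.
api_client_main('http://localhost:23333', session_id=0)
```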
@@ -140,7 +140,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace #### 启动 gradio server ```shell -python3 -m lmdeploy.serve.gradio.app ./workspace +lmdeploy serve gradio ./workspace ``` ![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab) @@ -150,14 +150,14 @@ python3 -m lmdeploy.serve.gradio.app ./workspace 使用下面的命令启动推理服务: ```shell -python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1 +lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1 ``` 你可以通过命令行方式与推理服务进行对话: ```shell # restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 -python -m lmdeploy.serve.openai.api_client restful_api_url +lmdeploy serve api_client restful_api_url ``` 也可以通过 WebUI 方式来对话: @@ -165,8 +165,8 @@ python -m lmdeploy.serve.openai.api_client restful_api_url ```shell # restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 # server_ip and server_port here are for gradio ui -# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True -python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True +# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 --restful_api True +lmdeploy serve gradio restful_api_url --server_name ${server_ip} --server_port ${server_port} --restful_api True ``` 更多详情可以查阅 [restful_api.md](docs/zh_cn/restful_api.md)。 @@ -182,13 +182,13 @@ bash workspace/service_docker_up.sh 你可以通过命令行方式与推理服务进行对话: ```shell -python3 -m lmdeploy.serve.client {server_ip_addresss}:33337 +lmdeploy serve triton_client {server_ip_addresss}:33337 ``` 也可以通过 WebUI 方式来对话: ```shell -python3 -m lmdeploy.serve.gradio.app {server_ip_addresss}:33337 +lmdeploy serve gradio {server_ip_addresss}:33337 ``` 其他模型的部署方式,比如 LLaMA,LLaMA-2,vicuna等等,请参考[这里](docs/zh_cn/serving.md) @@ -204,7 +204,7 @@ pip install deepspeed #### 单个 GPU ```shell -python3 -m lmdeploy.pytorch.chat $NAME_OR_PATH_TO_HF_MODEL\ +lmdeploy chat torch $NAME_OR_PATH_TO_HF_MODEL\ --max_new_tokens 64 \ --temperture 0.8 \ --top_p 0.95 \ diff --git a/docs/en/kv_int8.md b/docs/en/kv_int8.md index 1f5f5aa12..5dcf43ba6 100644 --- a/docs/en/kv_int8.md +++ b/docs/en/kv_int8.md @@ -18,7 +18,7 @@ dequant: f = q * scale + zp Convert the Hugging Face model format to the TurboMind inference format to create a workspace directory. ```bash -python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b +lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b ``` If you already have a workspace directory, skip this step.
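The `kv_int8.md` hunk above carries the document's dequantization relation, `f = q * scale + zp`, as context. The toy NumPy snippet below only illustrates that relation and its inverse; the helper names and numbers are made up for demonstration and are not LMDeploy code.

```python
# Toy illustration of the dequantization formula quoted in kv_int8.md:
# f = q * scale + zp (asymmetric INT8 quantization of KV cache values).
import numpy as np


def quantize(f: np.ndarray, scale: float, zp: float) -> np.ndarray:
    """Inverse of the dequant relation: q = round((f - zp) / scale)."""
    return np.clip(np.round((f - zp) / scale), -128, 127).astype(np.int8)


def dequantize(q: np.ndarray, scale: float, zp: float) -> np.ndarray:
    """f = q * scale + zp."""
    return q.astype(np.float32) * scale + zp


f = np.array([0.25, -1.5, 3.0], dtype=np.float32)
scale, zp = 0.05, 0.1  # placeholder statistics; real ones come from calibration
q = quantize(f, scale, zp)
print(q, dequantize(q, scale, zp))
```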
@@ -29,7 +29,7 @@ Get the quantization parameters by these two steps: ```bash # get minmax -python3 -m lmdeploy.lite.apis.calibrate \ +lmdeploy lite calibrate \ --model $HF_MODEL \ --calib_dataset 'c4' \ # Support c4, ptb, wikitext2, pileval --calib_samples 128 \ # Number of samples in the calibration set, if the memory is not enough, it can be adjusted appropriately @@ -37,7 +37,7 @@ python3 -m lmdeploy.lite.apis.calibrate \ --work_dir $WORK_DIR \ # Directory for saving quantized statistical parameters and quantized weights in Pytorch format # get quant parameters -python3 -m lmdeploy.lite.apis.kv_qparams \ +lmdeploy lite kv_qparams \ --work_dir $WORK_DIR \ # Directory of the last output --turbomind_dir workspace/triton_models/weights/ \ # Directory to save the quantization parameters --kv_sym False \ # Symmetric or asymmetric quantization, default is False @@ -64,7 +64,7 @@ Considering there are four combinations of kernels needed to be implemented, pre Test the chat performance. ```bash -python3 -m lmdeploy.turbomind.chat ./workspace +lmdeploy chat turbomind ./workspace ``` ## GPU Memory Test diff --git a/docs/en/pytorch.md b/docs/en/pytorch.md index e3662ab37..e4cd5a9cb 100644 --- a/docs/en/pytorch.md +++ b/docs/en/pytorch.md @@ -9,13 +9,13 @@ This submodule allow user to chat with language model through command line, and **Example 1**: Chat with default setting ```shell -python -m lmdeploy.pytorch.chat $PATH_TO_HF_MODEL +lmdeploy chat torch $PATH_TO_HF_MODEL ``` **Example 2**: Disable sampling and chat history ```shell -python -m lmdeploy.pytorch.chat \ +lmdeploy chat torch \ $PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \ --temperature 0 --max-history 0 ``` @@ -23,7 +23,7 @@ python -m lmdeploy.pytorch.chat \ **Example 3**: Accelerate with deepspeed inference ```shell -python -m lmdeploy.pytorch.chat \ +lmdeploy chat torch \ $PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \ --accel deepspeed ``` diff --git a/docs/en/restful_api.md b/docs/en/restful_api.md index cb70e2637..a66859c0c 100644 --- a/docs/en/restful_api.md +++ b/docs/en/restful_api.md @@ -3,7 +3,7 @@ ### Launch Service ```shell -python3 -m lmdeploy.serve.openai.api_server ./workspace 0.0.0.0 server_port --instance_num 32 --tp 1 +lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1 ``` Then, the user can open the swagger UI: `http://{server_ip}:{server_port}` for the detailed api usage. @@ -125,7 +125,7 @@ There is a client script for restful api server. ```shell # restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 -python -m lmdeploy.serve.openai.api_client restful_api_url +lmdeploy serve api_client restful_api_url ``` ### webui @@ -135,8 +135,8 @@ You can also test restful-api through webui. ```shell # restful_api_url is what printed in api_server.py, e.g. 
http://localhost:23333 # server_ip and server_port here are for gradio ui -# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True -python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True +# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 --restful_api True +lmdeploy serve gradio restful_api_url --server_name ${server_ip} --server_port ${server_port} --restful_api True ``` ### FAQ diff --git a/docs/en/serving.md b/docs/en/serving.md index 1e6f783d7..6cc18018d 100644 --- a/docs/en/serving.md +++ b/docs/en/serving.md @@ -8,7 +8,7 @@ You can download [llama-2 models from huggingface](https://huggingface.co/meta-l 7B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-7b-chat-hf +lmdeploy convert llama2 /path/to/llama-2-7b-chat-hf bash workspace/service_docker_up.sh ``` @@ -18,7 +18,7 @@ bash workspace/service_docker_up.sh 13B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-13b-chat-hf --tp 2 +lmdeploy convert llama2 /path/to/llama-2-13b-chat-hf --tp 2 bash workspace/service_docker_up.sh ``` @@ -28,7 +28,7 @@ bash workspace/service_docker_up.sh 70B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-70b-chat-hf --tp 8 +lmdeploy convert llama2 /path/to/llama-2-70b-chat-hf --tp 8 bash workspace/service_docker_up.sh ``` @@ -42,7 +42,7 @@ Weights for the LLaMA models can be obtained from by filling out [this form](htt 7B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-7b llama \ +lmdeploy convert llama /path/to/llama-7b llama \ --tokenizer_path /path/to/tokenizer/model bash workspace/service_docker_up.sh ``` @@ -53,7 +53,7 @@ bash workspace/service_docker_up.sh 13B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-13b llama \ +lmdeploy convert llama /path/to/llama-13b llama \ --tokenizer_path /path/to/tokenizer/model --tp 2 bash workspace/service_docker_up.sh ``` @@ -64,7 +64,7 @@ bash workspace/service_docker_up.sh 30B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-30b llama \ +lmdeploy convert llama /path/to/llama-30b llama \ --tokenizer_path /path/to/tokenizer/model --tp 4 bash workspace/service_docker_up.sh ``` @@ -75,7 +75,7 @@ bash workspace/service_docker_up.sh 65B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-65b llama \ +lmdeploy convert llama /path/to/llama-65b llama \ --tokenizer_path /path/to/tokenizer/model --tp 8 bash workspace/service_docker_up.sh ``` @@ -94,7 +94,7 @@ python3 -m fastchat.model.apply_delta \ --target-model-path /path/to/vicuna-7b \ --delta-path lmsys/vicuna-7b-delta-v1.1 -python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-7b +lmdeploy convert vicuna /path/to/vicuna-7b bash workspace/service_docker_up.sh ``` @@ -110,7 +110,7 @@ python3 -m fastchat.model.apply_delta \ --target-model-path /path/to/vicuna-13b \ --delta-path lmsys/vicuna-13b-delta-v1.1 -python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-13b +lmdeploy convert vicuna /path/to/vicuna-13b bash workspace/service_docker_up.sh ``` diff --git a/docs/en/supported_models/codellama.md b/docs/en/supported_models/codellama.md index 1b5140205..886dc5922 100644 --- a/docs/en/supported_models/codellama.md +++ b/docs/en/supported_models/codellama.md @@ -29,7 +29,7 @@ Based on the above table, download the model that meets your requirements. 
Execu python3 -m pip install lmdeploy # convert weight layout -python3 -m lmdeploy.serve.turbomind.deploy codellama /the/path/of/codellama/model +lmdeploy convert codellama /the/path/of/codellama/model ``` Then, you can communicate with codellama in consolo by following instructions in next sections @@ -42,13 +42,13 @@ Then, you can communicate with codellama in consolo by following instructions in ### Completion ```shell -python3 -m lmdeploy.turbomind.chat ./workspace --cap completion +lmdeploy chat turbomind ./workspace --cap completion ``` ### Infilling ```shell -python3 -m lmdeploy.turbomind.chat ./workspace --cap infilling +lmdeploy chat turbomind ./workspace --cap infilling ``` The input code is supposed to have a special placeholder ``. For example, @@ -64,7 +64,7 @@ And the generated code piece by `turbomind.chat` is the one to be filled in `7B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-7b-chat-hf +lmdeploy convert llama2 /path/to/llama-2-7b-chat-hf bash workspace/service_docker_up.sh ``` @@ -18,7 +18,7 @@ bash workspace/service_docker_up.sh 13B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-13b-chat-hf --tp 2 +lmdeploy convert llama2 /path/to/llama-2-13b-chat-hf --tp 2 bash workspace/service_docker_up.sh ``` @@ -28,7 +28,7 @@ bash workspace/service_docker_up.sh 70B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-70b-chat-hf --tp 8 +lmdeploy convert llama2 /path/to/llama-2-70b-chat-hf --tp 8 bash workspace/service_docker_up.sh ``` @@ -42,7 +42,7 @@ bash workspace/service_docker_up.sh 7B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-7b llama \ +lmdeploy convert llama /path/to/llama-7b llama \ --tokenizer_path /path/to/tokenizer/model bash workspace/service_docker_up.sh ``` @@ -53,7 +53,7 @@ bash workspace/service_docker_up.sh 13B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-13b llama \ +lmdeploy convert llama /path/to/llama-13b llama \ --tokenizer_path /path/to/tokenizer/model --tp 2 bash workspace/service_docker_up.sh ``` @@ -64,7 +64,7 @@ bash workspace/service_docker_up.sh 30B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-30b llama \ +lmdeploy convert llama /path/to/llama-30b llama \ --tokenizer_path /path/to/tokenizer/model --tp 4 bash workspace/service_docker_up.sh ``` @@ -75,7 +75,7 @@ bash workspace/service_docker_up.sh 65B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-65b llama \ +lmdeploy convert llama /path/to/llama-65b llama \ --tokenizer_path /path/to/tokenizer/model --tp 8 bash workspace/service_docker_up.sh ``` @@ -94,7 +94,7 @@ python3 -m fastchat.model.apply_delta \ --target-model-path /path/to/vicuna-7b \ --delta-path lmsys/vicuna-7b-delta-v1.1 -python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-7b +lmdeploy convert vicuna /path/to/vicuna-7b bash workspace/service_docker_up.sh ``` @@ -110,7 +110,7 @@ python3 -m fastchat.model.apply_delta \ --target-model-path /path/to/vicuna-13b \ --delta-path lmsys/vicuna-13b-delta-v1.1 -python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-13b +lmdeploy convert vicuna /path/to/vicuna-13b bash workspace/service_docker_up.sh ``` diff --git a/docs/zh_cn/supported_models/codellama.md b/docs/zh_cn/supported_models/codellama.md index ca9029a52..a2abd2f4a 100644 --- a/docs/zh_cn/supported_models/codellama.md +++ b/docs/zh_cn/supported_models/codellama.md @@ -29,7 +29,7 @@ python3 -m pip install lmdeploy # 
转模型格式 -python3 -m lmdeploy.serve.turbomind.deploy codellama /path/of/codellama/model +lmdeploy convert codellama /path/of/codellama/model ``` 接下来,可参考如下章节,在控制台与 codellama 进行交互式对话。 @@ -42,13 +42,13 @@ python3 -m lmdeploy.serve.turbomind.deploy codellama /path/of/codellama/model ### 代码续写 ```shell -python3 -m lmdeploy.turbomind.chat ./workspace --cap completion +lmdeploy chat turbomind ./workspace --cap completion ``` ### 代码填空 ```shell -python3 -m lmdeploy.turbomind.chat ./workspace --cap infilling +lmdeploy chat turbomind ./workspace --cap infilling ``` 输入的代码块中要包含 ``,比如: @@ -64,7 +64,7 @@ def remove_non_ascii(s: str) -> str: ### 对话 ``` -python3 -m lmdeploy.turbomind.chat ./workspace --cap chat --sys-instruct "Provide answers in Python" +lmdeploy chat turbomind ./workspace --cap chat --sys-instruct "Provide answers in Python" ``` 可以把 `--sys-instruct` 的指令换成 codellama 支持的其他变成语言。 @@ -72,7 +72,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace --cap chat --sys-instruct "Provid ### Python 专项 ``` -python3 -m lmdeploy.turbomind.chat ./workspace --cap python +lmdeploy chat turbomind ./workspace --cap python ``` 建议这里部署 Python 微调模型 @@ -90,7 +90,7 @@ TBD ```shell # --instance_num: turbomind推理实例的个数。可理解为支持的最大并发数 # --tp: 在 tensor parallel时,使用的GPU数量 -python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1 +lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1 ``` 打开 `http://{server_ip}:{server_port}`,即可访问 swagger,查阅 RESTful API 的详细信息。 @@ -99,7 +99,7 @@ python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port -- ```shell # restful_api_url 就是 api_server 产生的,比如 http://localhost:23333 -python -m lmdeploy.serve.openai.api_client restful_api_url +lmdeploy serve api_client restful_api_url ``` 或者,启动 gradio,在 webui 的聊天对话框中,与 codellama 交流: @@ -107,8 +107,8 @@ python -m lmdeploy.serve.openai.api_client restful_api_url ```shell # restful_api_url 就是 api_server 产生的,比如 http://localhost:23333 # server_ip 和 server_port 是用来提供 gradio ui 访问服务的 -# 例子: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True -python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True +# 例子: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 --restful_api True +lmdeploy serve gradio restful_api_url --server_name ${server_ip} --server_port ${server_port} --restful_api True ``` 关于 RESTful API的详细介绍,请参考[这份](../restful_api.md)文档。 diff --git a/docs/zh_cn/w4a16.md b/docs/zh_cn/w4a16.md index d28cb716d..46f5c58a9 100644 --- a/docs/zh_cn/w4a16.md +++ b/docs/zh_cn/w4a16.md @@ -24,14 +24,14 @@ git clone https://huggingface.co/lmdeploy/llama2-chat-7b-w4 ```shell ## 转换模型的layout,存放在默认路径 ./workspace 下 -python3 -m lmdeploy.serve.turbomind.deploy \ +lmdeploy convert \ --model-name llama2 \ --model-path ./llama2-chat-7b-w4 \ --model-format awq \ --group-size 128 ## 推理 -python3 -m lmdeploy.turbomind.chat ./workspace +lmdeploy chat turbomind ./workspace ``` ## 启动 gradio 服务 @@ -39,7 +39,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace 如果想通过 webui 与模型对话,请执行以下命令启动 gradio 服务 ```shell -python3 -m lmdeploy.serve.turbomind ./workspace --server_name {ip_addr} ----server_port {port} +lmdeploy serve gradio ./workspace --server_name {ip_addr} --server_port {port} ``` 然后,在浏览器中打开 http://{ip_addr}:{port},即可在线对话 @@ -80,7 +80,7 @@ python benchmark/profile_generation.py \ ### 第一步:生成量化参数 ```shell -python3 -m lmdeploy.lite.apis.calibrate \ +lmdeploy lite 
calibrate \ --model $HF_MODEL \ --calib_dataset 'c4' \ # 校准数据集,支持 c4, ptb, wikitext2, pileval --calib_samples 128 \ # 校准集的样本数,如果显存不够,可以适当调小 @@ -93,7 +93,7 @@ python3 -m lmdeploy.lite.apis.calibrate \ LMDeploy 使用 AWQ 算法对模型权重进行量化。在执行下面的命令时,需要把步骤1的`$WORK_DIR`传入。量化结束后,权重文件也会存放在这个目录中。然后就可以根据 ["4bit权重模型推理"](#4bit-权重模型推理)章节的说明,进行模型推理。 ```shell -python3 -m lmdeploy.lite.apis.auto_awq \ +lmdeploy lite auto_awq \ --model $HF_MODEL \ --w_bits 4 \ # 权重量化的 bit 数 --w_group_size 128 \ # 权重量化分组统计尺寸 diff --git a/lmdeploy/cli/__init__.py b/lmdeploy/cli/__init__.py new file mode 100644 index 000000000..3575bec5b --- /dev/null +++ b/lmdeploy/cli/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .cli import run + +__all__ = ['run'] diff --git a/lmdeploy/cli/chat.py b/lmdeploy/cli/chat.py new file mode 100644 index 000000000..735b24c7c --- /dev/null +++ b/lmdeploy/cli/chat.py @@ -0,0 +1,90 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + + +class SubCliChat(object): + """Chat through terminal with pytorch or turbomind model.""" + + def torch(self, + model_path: str, + tokenizer_path: Optional[str] = None, + accel: Optional[str] = None, + max_new_tokens: int = 128, + temperature: float = 0.8, + top_p: float = 0.95, + seed: int = 0, + use_fast_tokenizer: bool = True, + max_alloc: int = 2048, + max_session_len: int = None, + log_file: Optional[str] = None, + debug: bool = False, + adapter: Optional[str] = None): + """Chat with pytorch model through terminal. + + Args: + model_path (str): Path to pytorch model. + tokenizer_path (str): Path to tokenizer. + accel (str): Model accelerator. + max_new_tokens (int): Maximum number of tokens to generate. + temperature (float): Temperature for sampling. + top_p (float): Top p for sampling. + seed (int): Random seed. + use_fast_tokenizer (bool): Whether to use fast tokenizer. + This argument is directly pass to transformer's + ``AutoTokenizer.from_pretrained``. + Generally, user should choose to use fast tokenizers. + But if using fast raise some error, try to force using a slow one. + max_alloc (int): Maximum memory to allocate (for deepspeed). + max_session_len (int): Maximum number of tokens allowed for all chat sessions. + This include both history and current session. + log_file (str): Path to log file. + debug (bool): Whether to enable debug mode. + adapter (str): Force to use an adapter. + Generally user should not use this argument because adapter is selected based + on the type of model. Only when it is impossible, e.g. distinguishing llama 1/2 + based on `LlamaforCausalLM` class, this argument is required. + Currently, only "llama1" is acceptable for llama1 models. + """ # noqa: E501 + from lmdeploy.pytorch.chat import main as run_torch_model + + run_torch_model(model_path, + tokenizer_path=tokenizer_path, + accel=accel, + max_new_tokens=max_new_tokens, + temperature=temperature, + top_p=top_p, + seed=seed, + use_fast_tokenizer=use_fast_tokenizer, + max_alloc=max_alloc, + max_session_len=max_session_len, + log_file=log_file, + debug=debug, + adapter=adapter) + + def turbomind(self, + model_path, + session_id: int = 1, + cap: str = 'chat', + tp=1, + stream_output=True, + **kwargs): + """Chat with turbomind model through terminal. + + Args: + model_path (str): the path of the deployed model + session_id (int): the identical id of a session + cap (str): the capability of a model. 
For example, codellama has + the ability among ['completion', 'infilling', 'chat', 'python'] + tp (int): GPU number used in tensor parallelism + stream_output (bool): indicator for streaming output or not + **kwarg (dict): other arguments for initializing model's chat + template + """ + from lmdeploy.turbomind.chat import main as run_turbomind_model + + run_turbomind_model(model_path, + session_id=session_id, + cap=cap, + tp=tp, + stream_output=stream_output, + **kwargs) diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py new file mode 100644 index 000000000..7b2634b53 --- /dev/null +++ b/lmdeploy/cli/cli.py @@ -0,0 +1,61 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import fire + +from .chat import SubCliChat +from .lite import SubCliLite +from .serve import SubCliServe + + +class CLI(object): + """LMDeploy Command Line Interface. + + The CLI provides a unified API for converting, compressing and deploying + large language models. + """ + + def convert(self, + model_name: str, + model_path: str, + model_format: str = None, + tokenizer_path: str = None, + dst_path: str = './workspace', + tp: int = 1, + quant_path: str = None, + group_size: int = 0): + """Convert LLMs to lmdeploy format. + + Args: + model_name (str): The name of the to-be-deployed model, such as + llama-7b, llama-13b, vicuna-7b and etc. + model_path (str): The directory path of the model + model_format (str): The format of the model, fb or hf. 'fb' stands + for META's llama format, and 'hf' means huggingface format. + tokenizer_path (str): The path of tokenizer model. + dst_path (str): The destination path that saves outputs. + tp (int): The number of GPUs used for tensor parallelism, which + should be 2^n. + quant_path (str): Path of the quantized model, which can be None. + group_size (int): A parameter used in AWQ to quantize fp16 weights + to 4 bits. + """ + from lmdeploy.serve.turbomind.deploy import main as convert + + convert(model_name, + model_path, + model_format=model_format, + tokenizer_path=tokenizer_path, + dst_path=dst_path, + tp=tp, + quant_path=quant_path, + group_size=group_size) + + +def run(): + """The entry point of running LMDeploy CLI.""" + + cli = CLI() + cli.lite = SubCliLite() + cli.chat = SubCliChat() + cli.serve = SubCliServe() + + fire.Fire(cli, name='lmdeploy') diff --git a/lmdeploy/cli/lite.py b/lmdeploy/cli/lite.py new file mode 100644 index 000000000..4302765e2 --- /dev/null +++ b/lmdeploy/cli/lite.py @@ -0,0 +1,100 @@ +# Copyright (c) OpenMMLab. All rights reserved. + + +class SubCliLite(object): + """CLI for compressing LLMs.""" + + def auto_awq(self, + model: str, + work_dir: str, + w_bits: int = 4, + w_sym: bool = False, + w_group_size: int = 128, + device: str = 'cuda'): + """Perform weight quantization using AWQ algorithm. + + Args: + model (str): The path of model in hf format. + work_dir (str): The working directory to save results. + w_bits (int): Bit number for weight quantization. + w_sym (bool): Whether to do symmetric quantization. + w_group_size (int): Group size for weight quantization statistics. + device (str): Device type of running. + """ + from lmdeploy.lite.apis.auto_awq import auto_awq + + auto_awq(model, + work_dir, + w_bits=w_bits, + w_sym=w_sym, + w_group_size=w_group_size, + device=device) + + def calibrate(self, + model: str, + calib_dataset: str = 'c4', + calib_samples: int = 128, + calib_seqlen: int = 2048, + work_dir: str = './work_dir', + device: str = 'cuda') -> None: + """Perform calibration on a given dataset. 
+ + Args: + model (str): The model to be loaded. + calib_dataset (str, optional): The calibration dataset name. + Defaults to 'c4'. + calib_samples (int, optional): The number of samples for + calibration. Defaults to 128. + calib_seqlen (int, optional): The sequence length for calibration. + Defaults to 2048. + work_dir (str): The working directory for outputs. + Defaults to './work_dir'. + device (str, optional): The device to be used for calculation. + Defaults to 'cuda'. + """ + from lmdeploy.lite.apis.calibrate import calibrate + + calibrate(model, + calib_dataset=calib_dataset, + calib_samples=calib_samples, + calib_seqlen=calib_seqlen, + work_dir=work_dir, + device=device) + + def kv_qparams(self, + work_dir: str, + turbomind_dir: str, + kv_bits: int = 8, + kv_sym: bool = False, + num_tp: int = 1) -> None: + """Export key and value stats. + + Args: + work_dir (str): Directory path where the stats + are saved. + turbomind_dir (str): Directory path where to + save the results. + kv_bits (int, optional): Number of bits for quantization. + Defaults to 8. + kv_sym (bool, optional): Whether to use symmetric quantization. + Defaults to False. + num_tp (int, optional): Number of tensor parallelism. + Defaults to 1. + """ + from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams + + run_kv_qparams(work_dir, + turbomind_dir, + kv_bits=kv_bits, + kv_sym=kv_sym, + num_tp=num_tp) + + def get_small_sharded_hf(self, src_dir: str, dst_dir: str): + """Convert a hugging face model to the smallest sharded one. + + Args: + src_dir (str): The directory of the input HF model. + dst_dir (str): The directory to save new model. + """ + from lmdeploy.lite.apis.get_small_sharded_hf import main as run_sharded + run_sharded(src_dir, dst_dir) diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py new file mode 100644 index 000000000..0bff69c31 --- /dev/null +++ b/lmdeploy/cli/serve.py @@ -0,0 +1,122 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + + +class SubCliServe(object): + """Serve LLMs and interact on terminal or web UI.""" + + def gradio(self, + model_path_or_server: str, + server_name: str = 'localhost', + server_port: int = 6006, + batch_size: int = 32, + tp: int = 1, + restful_api: bool = False): + """Serve LLMs with web ui using gradio. + + Example 1: + lmdeploy serve gradio ./workspace + + Example 2: + lmdeploy serve gradio http://localhost:23333 + --server_name localhost + --server_port 6006 + --restful_api True + + Example 3: + lmdeploy serve gradio ${triton_server_ip_addresss}:33337 + + Args: + model_path_or_server (str): the path of the deployed model or the + tritonserver URL or restful api URL. The former is for directly + running service with gradio. The latter is for running with + tritonserver by default. If the input URL is restful api. + Please enable another flag `restful_api`. 
+ server_name (str): the ip address of gradio server + server_port (int): the port of gradio server + batch_size (int): batch size for running Turbomind directly + tp (int): tensor parallel for Turbomind + restful_api (bool): a flag for model_path_or_server + """ + from lmdeploy.serve.gradio.app import run + run(model_path_or_server, + server_name=server_name, + server_port=server_port, + batch_size=batch_size, + tp=tp, + restful_api=restful_api) + + def api_server(self, + model_path: str, + server_name: str = 'localhost', + server_port: int = 23333, + instance_num: int = 32, + tp: int = 1, + allow_origins: List[str] = ['*'], + allow_credentials: bool = True, + allow_methods: List[str] = ['*'], + allow_headers: List[str] = ['*']): + """Serve LLMs with restful api using fastapi. + + Args: + model_path (str): the path of the deployed model + server_name (str): host ip for serving + server_port (int): server port + instance_num (int): number of instances of turbomind model + tp (int): tensor parallel + allow_origins (List[str]): a list of allowed origins for CORS + allow_credentials (bool): whether to allow credentials for CORS + allow_methods (List[str]): a list of allowed HTTP methods for CORS + allow_headers (List[str]): a list of allowed HTTP headers for CORS + """ + from lmdeploy.serve.openai.api_server import main as run_api_server + + run_api_server(model_path, + server_name=server_name, + server_port=server_port, + instance_num=instance_num, + tp=tp, + allow_origins=allow_origins, + allow_credentials=allow_credentials, + allow_methods=allow_methods, + allow_headers=allow_headers) + + def api_client(self, restful_api_url: str, session_id: int = 0): + """Interact with restful api server in terminal. + + Args: + restful_api_url: The restful api URL. + session_id: The identical id of a session. + """ + from lmdeploy.serve.openai.api_client import main as run_api_client + run_api_client(restful_api_url, session_id=session_id) + + def triton_client(self, + tritonserver_addr: str, + session_id: int = 1, + cap: str = 'chat', + stream_output: bool = True, + **kwargs): + """Interact with Triton Server using gRPC protocol. + + Args: + tritonserver_addr (str): the address in format "ip:port" of + triton inference server + session_id (int): the identical id of a session + cap (str): the capability of a model. 
For example, codellama + has the ability among ['completion', 'infill', 'instruct', + 'python'] + stream_output (bool): indicator for streaming output or not + **kwargs (dict): other arguments for initializing model's + chat template + """ + + from lmdeploy.serve.client import main as run_triton_client + + run_triton_client( + tritonserver_addr, + session_id=session_id, + cap=cap, + stream_output=stream_output, + **kwargs, + ) diff --git a/lmdeploy/lite/apis/auto_awq.py b/lmdeploy/lite/apis/auto_awq.py index 3517f51b8..250defb59 100644 --- a/lmdeploy/lite/apis/auto_awq.py +++ b/lmdeploy/lite/apis/auto_awq.py @@ -2,7 +2,6 @@ from pathlib import Path -import fire import torch from accelerate import (infer_auto_device_map, init_empty_weights, load_checkpoint_in_model) @@ -81,5 +80,6 @@ def auto_awq(model: str, if __name__ == '__main__': + import fire fire.Fire(auto_awq) diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py index 38b6429a1..3df252f06 100644 --- a/lmdeploy/lite/apis/calibrate.py +++ b/lmdeploy/lite/apis/calibrate.py @@ -2,7 +2,6 @@ from pathlib import Path -import fire import torch from accelerate import (infer_auto_device_map, init_empty_weights, load_checkpoint_in_model) @@ -107,4 +106,6 @@ def calibrate(model: str, if __name__ == '__main__': + import fire + fire.Fire(calibrate) diff --git a/lmdeploy/lite/apis/kv_qparams.py b/lmdeploy/lite/apis/kv_qparams.py index 7d43078da..f31fee029 100644 --- a/lmdeploy/lite/apis/kv_qparams.py +++ b/lmdeploy/lite/apis/kv_qparams.py @@ -2,7 +2,6 @@ from pathlib import Path from typing import Union -import fire import numpy as np import torch @@ -120,5 +119,6 @@ def main(work_dir: str, if __name__ == '__main__': + import fire fire.Fire(main) diff --git a/lmdeploy/model.py b/lmdeploy/model.py index 18da39a6e..39451acdc 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -654,4 +654,5 @@ def main(model_name: str = 'test'): if __name__ == '__main__': import fire + fire.Fire(main) diff --git a/lmdeploy/pytorch/chat.py b/lmdeploy/pytorch/chat.py index c30cf6ffe..2690480a8 100644 --- a/lmdeploy/pytorch/chat.py +++ b/lmdeploy/pytorch/chat.py @@ -51,7 +51,6 @@ import logging from typing import Optional -import fire import torch from transformers import GenerationConfig, PreTrainedModel @@ -205,6 +204,8 @@ def main( def cli(): + import fire + fire.Fire(main) diff --git a/lmdeploy/serve/client.py b/lmdeploy/serve/client.py index 9c0d3cb5c..424e83143 100644 --- a/lmdeploy/serve/client.py +++ b/lmdeploy/serve/client.py @@ -1,8 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import os -import fire - from lmdeploy.serve.turbomind.chatbot import Chatbot @@ -66,4 +64,6 @@ def main(tritonserver_addr: str, if __name__ == '__main__': + import fire + fire.Fire(main) diff --git a/lmdeploy/serve/gradio/app.py b/lmdeploy/serve/gradio/app.py index 71db7a274..5c200517b 100644 --- a/lmdeploy/serve/gradio/app.py +++ b/lmdeploy/serve/gradio/app.py @@ -5,7 +5,6 @@ from functools import partial from typing import Sequence -import fire import gradio as gr from lmdeploy.serve.async_engine import AsyncEngine @@ -525,7 +524,7 @@ def run(model_path_or_server: str, server_port (int): the port of gradio server batch_size (int): batch size for running Turbomind directly tp (int): tensor parallel for Turbomind - restufl_api (bool): a flag for model_path_or_server + restful_api (bool): a flag for model_path_or_server """ if ':' in model_path_or_server: if restful_api: @@ -539,4 +538,6 @@ def run(model_path_or_server: str, if __name__ == '__main__': + import fire + fire.Fire(run) diff --git a/lmdeploy/serve/openai/api_client.py b/lmdeploy/serve/openai/api_client.py index a8718331b..26977bc6c 100644 --- a/lmdeploy/serve/openai/api_client.py +++ b/lmdeploy/serve/openai/api_client.py @@ -2,7 +2,6 @@ import json from typing import Iterable, List -import fire import requests @@ -89,4 +88,6 @@ def main(restful_api_url: str, session_id: int = 0): if __name__ == '__main__': + import fire + fire.Fire(main) diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 94271c4b9..8324e3497 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -4,7 +4,6 @@ from http import HTTPStatus from typing import AsyncGenerator, List, Optional -import fire import uvicorn from fastapi import FastAPI, Request from fastapi.middleware.cors import CORSMiddleware @@ -357,4 +356,6 @@ def main(model_path: str, if __name__ == '__main__': + import fire + fire.Fire(main) diff --git a/lmdeploy/serve/turbomind/deploy.py b/lmdeploy/serve/turbomind/deploy.py index 3aca6e1f1..ab8c9ea95 100644 --- a/lmdeploy/serve/turbomind/deploy.py +++ b/lmdeploy/serve/turbomind/deploy.py @@ -8,7 +8,6 @@ import sys from pathlib import Path -import fire import safetensors import torch from safetensors.torch import load_file @@ -1043,4 +1042,6 @@ def main(model_name: str, if __name__ == '__main__': + import fire + fire.Fire(main) diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index ca9d9c34e..bf0ce7399 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -4,11 +4,7 @@ import os.path as osp import random -import fire - -from lmdeploy import turbomind as tm from lmdeploy.model import MODELS -from lmdeploy.tokenizer import Tokenizer os.environ['TM_LOG_LEVEL'] = 'ERROR' @@ -88,6 +84,9 @@ def main(model_path, stream_output (bool): indicator for streaming output or not **kwarg (dict): other arguments for initializing model's chat template """ + from lmdeploy import turbomind as tm + from lmdeploy.tokenizer import Tokenizer + tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer') tokenizer = Tokenizer(tokenizer_model_path) tm_model = tm.TurboMind(model_path, eos_id=tokenizer.eos_token_id, tp=tp) @@ -157,4 +156,6 @@ def main(model_path, if __name__ == '__main__': + import fire + fire.Fire(main) diff --git a/lmdeploy/turbomind/decode.py b/lmdeploy/turbomind/decode.py index daef35298..5ba4675c5 100644 --- a/lmdeploy/turbomind/decode.py +++ b/lmdeploy/turbomind/decode.py @@ -2,7 +2,6 @@ import os import os.path as osp 
-import fire import torch from lmdeploy import turbomind as tm @@ -37,4 +36,6 @@ def main(model_path, inputs): if __name__ == '__main__': + import fire + fire.Fire(main) diff --git a/lmdeploy/turbomind/generate_gemm_config.py b/lmdeploy/turbomind/generate_gemm_config.py index 328f18215..9a4f0e8c4 100644 --- a/lmdeploy/turbomind/generate_gemm_config.py +++ b/lmdeploy/turbomind/generate_gemm_config.py @@ -2,8 +2,6 @@ import subprocess -import fire - def get_llama_gemm(): import os.path as osp @@ -30,4 +28,6 @@ def main(head_num: int = 32, if __name__ == '__main__': + import fire + fire.Fire(main) diff --git a/setup.py b/setup.py index 09ae1e31c..df36118c2 100644 --- a/setup.py +++ b/setup.py @@ -121,26 +121,29 @@ def gen_packages_items(): if __name__ == '__main__': lmdeploy_package_data = ['lmdeploy/bin/llama_gemm'] - setup(name='lmdeploy', - version=get_version(), - description='A toolset for compressing, deploying and serving LLM', - long_description=readme(), - long_description_content_type='text/markdown', - author='OpenMMLab', - author_email='openmmlab@gmail.com', - packages=find_packages(exclude=()), - package_data={ - 'lmdeploy': lmdeploy_package_data, - }, - include_package_data=True, - install_requires=parse_requirements('requirements.txt'), - has_ext_modules=check_ext_modules, - classifiers=[ - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Intended Audience :: Developers', - 'Intended Audience :: Education', - 'Intended Audience :: Science/Research', - ]) + setup( + name='lmdeploy', + version=get_version(), + description='A toolset for compressing, deploying and serving LLM', + long_description=readme(), + long_description_content_type='text/markdown', + author='OpenMMLab', + author_email='openmmlab@gmail.com', + packages=find_packages(exclude=()), + package_data={ + 'lmdeploy': lmdeploy_package_data, + }, + include_package_data=True, + install_requires=parse_requirements('requirements.txt'), + has_ext_modules=check_ext_modules, + classifiers=[ + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Intended Audience :: Developers', + 'Intended Audience :: Education', + 'Intended Audience :: Science/Research', + ], + entry_points={'console_scripts': ['lmdeploy = lmdeploy.cli:run']}, + ) diff --git a/tests/test_lmdeploy/test_cli.py b/tests/test_lmdeploy/test_cli.py new file mode 100644 index 000000000..a41eab442 --- /dev/null +++ b/tests/test_lmdeploy/test_cli.py @@ -0,0 +1,51 @@ +import inspect + + +def compare_func(class_method, function): + """Compare if a class method has same arguments as a function.""" + + argspec_cls = inspect.getfullargspec(class_method) + argspec_func = inspect.getfullargspec(function) + assert argspec_cls.args[1:] == argspec_func.args + assert argspec_cls.defaults == argspec_func.defaults + assert argspec_cls.annotations == argspec_func.annotations + + +def test_cli(): + + from lmdeploy.cli.cli import CLI + from lmdeploy.serve.turbomind.deploy import main as convert + compare_func(CLI.convert, convert) + + +def test_subcli_chat(): + from lmdeploy.cli.chat import SubCliChat + from lmdeploy.pytorch.chat import main as run_torch_model + from lmdeploy.turbomind.chat import main as run_turbomind_model + + compare_func(SubCliChat.torch, run_torch_model) + compare_func(SubCliChat.turbomind, run_turbomind_model) + + 
+def test_subcli_lite(): + from lmdeploy.cli.lite import SubCliLite + from lmdeploy.lite.apis.auto_awq import auto_awq + from lmdeploy.lite.apis.calibrate import calibrate + from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams + + compare_func(SubCliLite.auto_awq, auto_awq) + compare_func(SubCliLite.calibrate, calibrate) + compare_func(SubCliLite.kv_qparams, run_kv_qparams) + + +def test_subcli_serve(): + from lmdeploy.cli.serve import SubCliServe + from lmdeploy.serve.client import main as run_triton_client + from lmdeploy.serve.gradio.app import run as run_gradio + from lmdeploy.serve.openai.api_client import main as run_api_client + from lmdeploy.serve.openai.api_server import main as run_api_server + + compare_func(SubCliServe.gradio, run_gradio) + compare_func(SubCliServe.api_server, run_api_server) + compare_func(SubCliServe.api_client, run_api_client) + compare_func(SubCliServe.triton_client, run_triton_client)
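The test relies on `inspect.getfullargspec` to keep every CLI wrapper method in sync with the function it delegates to. The standalone illustration below uses toy stand-ins rather than lmdeploy code; dropping `self` via `args[1:]` is what makes the class method comparable to the plain function, exactly as `compare_func` does above.

```python
# Standalone illustration of the getfullargspec-based signature check
# used by compare_func in test_cli.py. The toy names are placeholders.
import inspect


def greet(name: str, shout: bool = False):
    """A plain function, standing in for e.g. lmdeploy.pytorch.chat.main."""


class Wrapper:
    def greet(self, name: str, shout: bool = False):
        """A CLI-style wrapper method with the same signature plus `self`."""


cls_spec = inspect.getfullargspec(Wrapper.greet)
fn_spec = inspect.getfullargspec(greet)

assert cls_spec.args[1:] == fn_spec.args        # same argument names (minus self)
assert cls_spec.defaults == fn_spec.defaults    # same default values
assert cls_spec.annotations == fn_spec.annotations  # same type hints
print('signatures match')
```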