
Commit

Merge branch 'main' into gradio-session-issue
aisensiy committed Nov 3, 2023
2 parents af8c449 + c15fbf4 commit 619375f
Showing 81 changed files with 4,275 additions and 2,268 deletions.
25 changes: 12 additions & 13 deletions .github/workflows/docker.yml
@@ -24,19 +24,18 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Check disk space
run: |
df -h
ls /opt/hostedtoolcache
rm -rf ${GITHUB_WORKSPACE}/.git
rm -rf /opt/hostedtoolcache/go
rm -rf /opt/hostedtoolcache/node
rm -rf /opt/hostedtoolcache/Ruby
rm -rf /opt/hostedtoolcache/CodeQL
cat /proc/cpuinfo | grep -ic proc
free
df -h
df . -h
- name: Free disk space
uses: jlumbroso/free-disk-space@main
with:
# If set to "true", this might remove tools that are actually needed, but it frees about 6 GB
tool-cache: false
docker-images: false
# All of these default to true, but feel free to set to "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
swap-storage: false
- name: Get docker info
run: |
docker info
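
The manual `df -h` / `rm -rf` bookkeeping that this commit removes is replaced by the `jlumbroso/free-disk-space` action above. If you want to verify how much headroom the cleanup actually reclaims, a stand-alone sketch like the following (standard library only, not part of this commit) reproduces the removed `df` checks:

```python
# Standalone sketch: report free space on the paths the removed shell step
# inspected, roughly what "df -h" showed, using only the standard library.
import os
import shutil

def report_free_space(path: str) -> None:
    total, used, free = shutil.disk_usage(path)
    gib = 1024 ** 3
    print(f"{path}: {free / gib:.1f} GiB free of {total / gib:.1f} GiB "
          f"({used / gib:.1f} GiB used)")

if __name__ == "__main__":
    for mount in ("/", "/opt/hostedtoolcache"):
        if os.path.exists(mount):
            report_free_space(mount)
```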
30 changes: 23 additions & 7 deletions .github/workflows/linux-x64-gpu.yml
@@ -27,14 +27,30 @@ permissions:
jobs:
cuda-118:
runs-on: ubuntu-latest
container: openmmlab/lmdeploy-builder:cuda11.8
steps:
- name: Free disk space
uses: jlumbroso/free-disk-space@main
with:
# If set to "true", this might remove tools that are actually needed, but it frees about 6 GB
tool-cache: false
docker-images: false
# All of these default to true, but feel free to set to "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
swap-storage: false
- name: Checkout repository
uses: actions/checkout@v3
- name: Build
run: |
source /opt/conda/bin/activate
conda activate py38
mkdir build && cd build
bash ../generate.sh
make -j$(nproc) && make install
uses: addnab/docker-run-action@v3
with:
image: openmmlab/lmdeploy-builder:cuda11.8
options: -v ${{ github.workspace }}:/work --cpus=1.8
run: |
cd /work
source /opt/conda/bin/activate
conda activate py38
mkdir build && cd build
bash ../generate.sh
make -j$(nproc) && make install
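
Moving the build from a job-level `container:` into `addnab/docker-run-action` lets the disk-cleanup step run on the host before the builder image is used. For local debugging, a rough equivalent of that step can be sketched with `subprocess`; the image name, mount point, CPU cap, and build commands are taken from the workflow above, while the checkout path is an assumption:

```python
# Rough local equivalent of the docker-run-action step above (illustrative only).
import subprocess

WORKSPACE = "/path/to/your/lmdeploy/checkout"  # assumption: local clone location
BUILD_SCRIPT = (
    "cd /work && source /opt/conda/bin/activate && conda activate py38 && "
    "mkdir build && cd build && bash ../generate.sh && "
    "make -j$(nproc) && make install"
)

subprocess.run(
    [
        "docker", "run", "--rm",
        "-v", f"{WORKSPACE}:/work",
        "--cpus=1.8",
        "openmmlab/lmdeploy-builder:cuda11.8",
        "bash", "-c", BUILD_SCRIPT,
    ],
    check=True,
)
```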
12 changes: 12 additions & 0 deletions .github/workflows/pypi.yml
@@ -21,6 +21,18 @@ jobs:
DOCKER_TAG: cuda11.8
OUTPUT_FOLDER: cuda11.8_dist
steps:
- name: Free disk space
uses: jlumbroso/free-disk-space@main
with:
# If set to "true", this might remove tools that are actually needed, but it frees about 6 GB
tool-cache: false
docker-images: false
# All of these default to true, but feel free to set to "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
swap-storage: false
- name: Checkout repository
uses: actions/checkout@v3
- name: Build
27 changes: 15 additions & 12 deletions README.md
@@ -20,6 +20,7 @@ ______________________________________________________________________

## News 🎉

- \[2023/09\] TurboMind supports Qwen-14B
- \[2023/09\] TurboMind supports InternLM-20B
- \[2023/09\] TurboMind supports all features of Code Llama: code completion, infilling, chat / instruct, and python specialist. Click [here](./docs/en/supported_models/codellama.md) for deployment guide
- \[2023/09\] TurboMind supports Baichuan2-7B
@@ -51,7 +52,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by

## Supported Models

`LMDeploy` has two inference backends, `Pytorch` and `TurboMind`.
`LMDeploy` has two inference backends, `Pytorch` and `TurboMind`. You can run `lmdeploy list` to check the supported model names.

### TurboMind

@@ -62,9 +63,11 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
| :----------: | :-------------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | Yes | Yes | No |
| Llama2 | Yes | Yes | Yes | Yes | No |
| SOLAR | Yes | Yes | Yes | Yes | No |
| InternLM-7B | Yes | Yes | Yes | Yes | No |
| InternLM-20B | Yes | Yes | Yes | Yes | No |
| QWen-7B | Yes | Yes | Yes | No | No |
| QWen-14B | Yes | Yes | Yes | No | No |
| Baichuan-7B | Yes | Yes | Yes | Yes | No |
| Baichuan2-7B | Yes | Yes | No | No | No |
| Code Llama | Yes | Yes | No | No | No |
@@ -116,14 +119,14 @@ git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internl
GIT_LFS_SKIP_SMUDGE=1

# 2. Convert InternLM model to turbomind's format, which will be in "./workspace" by default
python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b
lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b

```

#### Inference by TurboMind

```shell
python -m lmdeploy.turbomind.chat ./workspace
lmdeploy chat turbomind ./workspace
```
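
If you prefer driving the converted `./workspace` model from Python rather than the CLI, a minimal sketch using the same classes this commit imports in `benchmark/profile_generation.py` looks like the following; the tokenizer location inside the workspace is an assumption, and the generation call itself is omitted:

```python
# Minimal sketch: load the converted workspace with lmdeploy's Python classes.
# Only tokenization and engine construction are shown here.
import os.path as osp

from lmdeploy.tokenizer import Tokenizer
from lmdeploy.turbomind import TurboMind

model_path = "./workspace"
# assumption: the converted workspace keeps its tokenizer under triton_models/tokenizer
tokenizer = Tokenizer(osp.join(model_path, "triton_models", "tokenizer"))
tm_model = TurboMind(model_path=model_path, tp=1)

input_ids = tokenizer.encode("hi")
print(f"'hi' -> {len(input_ids)} token id(s): {input_ids}")
```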

> **Note**<br />
@@ -137,7 +140,7 @@ python -m lmdeploy.turbomind.chat ./workspace
#### Serving with gradio

```shell
python3 -m lmdeploy.serve.gradio.app ./workspace
lmdeploy serve gradio ./workspace
```

![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab)
@@ -147,23 +150,23 @@ python3 -m lmdeploy.serve.gradio.app ./workspace
Launch inference server by:

```shell
python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1
lmdeploy serve api_server ./workspace --instance_num 32 --tp 1
```

Then, you can communicate with it by command line,

```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
python -m lmdeploy.serve.openai.api_client restful_api_url
lmdeploy serve api_client api_server_url
```

or webui,

```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
# api_server_url is the URL printed by api_server, e.g. http://localhost:23333
# server_ip and server_port here are for gradio ui
# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True
python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006
lmdeploy serve gradio api_server_url --server_name ${gradio_ui_ip} --server_port ${gradio_ui_port}
```

Refer to [restful_api.md](docs/en/restful_api.md) for more details.
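
Besides `lmdeploy serve api_client` and the gradio UI, the server can also be called over plain HTTP. The sketch below assumes an OpenAI-style `/v1/chat/completions` route and payload, which may not match this release exactly; restful_api.md is the authority on the actual routes and schema.

```python
# Hedged sketch: query the api_server over HTTP. Endpoint path, payload shape,
# and model name are assumptions; see docs/en/restful_api.md for the real schema.
import requests  # pip install requests

api_server_url = "http://localhost:23333"  # printed by api_server at startup

payload = {
    "model": "internlm-chat-7b",  # assumption
    "messages": [{"role": "user", "content": "Hi, please introduce yourself."}],
    "temperature": 0.8,
}
resp = requests.post(f"{api_server_url}/v1/chat/completions", json=payload, timeout=60)
resp.raise_for_status()
print(resp.json())
```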
@@ -179,13 +182,13 @@ bash workspace/service_docker_up.sh
Then, you can communicate with the inference server by command line,

```shell
python3 -m lmdeploy.serve.client {server_ip_addresss}:33337
lmdeploy serve triton_client {server_ip_addresss}:33337
```

or webui,

```shell
python3 -m lmdeploy.serve.gradio.app {server_ip_addresss}:33337
lmdeploy serve gradio {server_ip_addresss}:33337
```

For the deployment of other supported models, such as LLaMA, LLaMA-2, vicuna and so on, you can find the guide from [here](docs/en/serving.md)
@@ -197,7 +200,7 @@ For detailed instructions on Inference pytorch models, see [here](docs/en/pytorc
#### Single GPU

```shell
python3 -m lmdeploy.pytorch.chat $NAME_OR_PATH_TO_HF_MODEL \
lmdeploy chat torch $NAME_OR_PATH_TO_HF_MODEL \
--max_new_tokens 64 \
--temperture 0.8 \
--top_p 0.95 \
27 changes: 15 additions & 12 deletions README_zh-CN.md
@@ -20,6 +20,7 @@ ______________________________________________________________________

## 更新 🎉

- \[2023/09\] TurboMind 支持 Qwen-14B
- \[2023/09\] TurboMind 支持 InternLM-20B 模型
- \[2023/09\] TurboMind 支持 Code Llama 所有功能:代码续写、填空、对话、Python专项。点击[这里](./docs/zh_cn/supported_models/codellama.md)阅读部署方法
- \[2023/09\] TurboMind 支持 Baichuan2-7B
@@ -52,7 +53,7 @@ LMDeploy 由 [MMDeploy](https://github.com/open-mmlab/mmdeploy) 和 [MMRazor](ht

## 支持的模型

`LMDeploy` 支持 `TurboMind` 和 `Pytorch` 两种推理后端
`LMDeploy` 支持 `TurboMind` 和 `Pytorch` 两种推理后端。运行`lmdeploy list`可查看支持模型列表

### TurboMind

@@ -63,9 +64,11 @@ LMDeploy 由 [MMDeploy](https://github.com/open-mmlab/mmdeploy) 和 [MMRazor](ht
| :----------: | :------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | Yes | Yes | No |
| Llama2 | Yes | Yes | Yes | Yes | No |
| SOLAR | Yes | Yes | Yes | Yes | No |
| InternLM-7B | Yes | Yes | Yes | Yes | No |
| InternLM-20B | Yes | Yes | Yes | Yes | No |
| QWen-7B | Yes | Yes | Yes | No | No |
| QWen-14B | Yes | Yes | Yes | No | No |
| Baichuan-7B | Yes | Yes | Yes | Yes | No |
| Baichuan2-7B | Yes | Yes | No | No | No |
| Code Llama | Yes | Yes | No | No | No |
@@ -117,14 +120,14 @@ git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internl
GIT_LFS_SKIP_SMUDGE=1

# 2. 转换为 turbomind 要求的格式。默认存放路径为 ./workspace
python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b
lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b

```

#### 使用 turbomind 推理

```shell
python3 -m lmdeploy.turbomind.chat ./workspace
lmdeploy chat turbomind ./workspace
```

> **Note**<br />
@@ -137,7 +140,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace
#### 启动 gradio server

```shell
python3 -m lmdeploy.serve.gradio.app ./workspace
lmdeploy serve gradio ./workspace
```

![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab)
@@ -147,23 +150,23 @@ python3 -m lmdeploy.serve.gradio.app ./workspace
使用下面的命令启动推理服务:

```shell
python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1
lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1
```

你可以通过命令行方式与推理服务进行对话:

```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
python -m lmdeploy.serve.openai.api_client restful_api_url
lmdeploy serve api_client api_server_url
```

也可以通过 WebUI 方式来对话:

```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
# api_server_url is the URL printed by api_server, e.g. http://localhost:23333
# server_ip and server_port here are for gradio ui
# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True
python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006
lmdeploy serve gradio api_server_url --server_name ${gradio_ui_ip} --server_port ${gradio_ui_port}
```

更多详情可以查阅 [restful_api.md](docs/zh_cn/restful_api.md)
@@ -179,13 +182,13 @@ bash workspace/service_docker_up.sh
你可以通过命令行方式与推理服务进行对话:

```shell
python3 -m lmdeploy.serve.client {server_ip_addresss}:33337
lmdeploy serve triton_client {server_ip_addresss}:33337
```

也可以通过 WebUI 方式来对话:

```shell
python3 -m lmdeploy.serve.gradio.app {server_ip_addresss}:33337
lmdeploy serve gradio {server_ip_addresss}:33337
```

其他模型的部署方式,比如 LLaMA,LLaMA-2,vicuna等等,请参考[这里](docs/zh_cn/serving.md)
@@ -201,7 +204,7 @@ pip install deepspeed
#### 单个 GPU

```shell
python3 -m lmdeploy.pytorch.chat $NAME_OR_PATH_TO_HF_MODEL\
lmdeploy chat torch $NAME_OR_PATH_TO_HF_MODEL\
--max_new_tokens 64 \
--temperture 0.8 \
--top_p 0.95 \
2 changes: 1 addition & 1 deletion benchmark/README.md
@@ -30,7 +30,7 @@ pip install nvidia-ml-py
```bash
python profile_generation.py \
--model-path /path/to/your/model \
--concurrency 1 8 --prompt-tokens 0 512 --completion-tokens 2048 512
--concurrency 1 8 --prompt-tokens 1 512 --completion-tokens 2048 512
```

## profile serving
9 changes: 6 additions & 3 deletions benchmark/profile_generation.py
@@ -18,7 +18,8 @@
nvmlInit, nvmlShutdown, nvmlSystemGetDriverVersion)
from tqdm import tqdm

from lmdeploy.turbomind import Tokenizer, TurboMind
from lmdeploy.tokenizer import Tokenizer
from lmdeploy.turbomind import TurboMind


def infer(model, session_id: int, input_ids: str, output_seqlen: int,
@@ -89,7 +90,7 @@ def _infer(model, session_id):

def profile_throughput(model_path: str,
concurrency: int = 1,
input_seqlen: int = 0,
input_seqlen: int = 1,
output_seqlen: int = 512,
test_round: int = 10,
tp: int = 1):
@@ -98,8 +99,10 @@ def profile_throughput(model_path: str,
tm_model = TurboMind(model_path=model_path, tp=tp)

# make up a prompt that can be tokenized into {input_seqlen} tokens
prompt = '' if input_seqlen == 0 else 'hi' + ' hi' * (input_seqlen - 1)
assert input_seqlen > 0, 'input_seqlen should > 0'
prompt = 'hi'
input_ids = tokenizer.encode(prompt)
input_ids = input_ids * input_seqlen

warmup(tm_model, concurrency, input_ids, output_seqlen)
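
With this change the benchmark no longer pads a `'hi hi hi ...'` prompt; it encodes one short prompt and tiles the token ids, which is why `input_seqlen` (and `--prompt-tokens` in benchmark/README.md) must now be at least 1. A small sketch of the new construction, with the tokenizer path assumed:

```python
# Sketch of the new input construction: encode 'hi' once, then repeat the ids
# input_seqlen times (the benchmark treats 'hi' as a single-token prompt).
from lmdeploy.tokenizer import Tokenizer

def make_input_ids(tokenizer: Tokenizer, input_seqlen: int) -> list:
    assert input_seqlen > 0, 'input_seqlen should > 0'
    input_ids = tokenizer.encode('hi')
    return input_ids * input_seqlen

# usage (path is an assumption mirroring the converted workspace layout):
# tokenizer = Tokenizer('./workspace/triton_models/tokenizer')
# ids = make_input_ids(tokenizer, 512)
```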
