diff --git a/.github/md-link-config.json b/.github/md-link-config.json index 469ac707a6..3b9bca0dcc 100644 --- a/.github/md-link-config.json +++ b/.github/md-link-config.json @@ -17,6 +17,15 @@ }, { "pattern": "^http://localhost" + }, + { + "pattern": "^https://twitter.com" + }, + { + "pattern": "^https://platform.openai.com" + }, + { + "pattern": "^http://0.0.0.0" } ], "httpHeaders": [ diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index afc94417f8..f2279536ad 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -2,8 +2,29 @@ name: daily_ete_test on: workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository organization name. Default is InternLM' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' + backend: + required: true + description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' + type: string + default: "['turbomind', 'pytorch']" + model: + required: true + description: 'Set testcase module filter: chat, restful, pipeline, quantization. Default contains all models' + type: string + default: "['quantization','convert','pipeline','restful','chat','interface-pipeline']" schedule: - - cron: '00 18 * * *' + - cron: '00 21 * * *' env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache @@ -13,7 +34,7 @@ env: jobs: test_functions: runs-on: [self-hosted, linux-a100] - timeout-minutes: 420 + timeout-minutes: 240 env: REPORT_DIR: /nvme/qa_test_models/test-reports container: @@ -23,6 +44,7 @@ jobs: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/bigdisk/qa_test_models:/mnt/bigdisk/qa_test_models - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Setup systems @@ -33,7 +55,10 @@ jobs: dpkg -i /root/packages/allure_2.24.1-1_all.deb rm -rf /var/lib/apt/lists/* - name: Clone repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} - name: Install pytorch run: | python3 -m pip cache dir @@ -68,64 +93,89 @@ jobs: run: | python3 -m pip list lmdeploy check_env + rm -rf allure-results - name: Test lmdeploy - quantization w4a16 continue-on-error: true + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'quantization')) run: | pytest autotest/tools/quantization/test_quantization_w4a16.py -m 'not pr_test' -n 8 --alluredir=allure-results --clean-alluredir - name: Test lmdeploy - quantization kv int8 continue-on-error: true + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'quantization')) run: | pytest autotest/tools/quantization/test_quantization_kvint8.py -n 8 --alluredir=allure-results - name: Test lmdeploy - quantization w8a8 continue-on-error: true + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'quantization')) run: | pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=allure-results - name: Test lmdeploy - quantization kv int8 and w4a16 continue-on-error: true + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'quantization')) run: | pytest autotest/tools/quantization/test_quantization_kvint8_w4a16.py -n 8 --alluredir=allure-results - name: Test lmdeploy - convert continue-on-error: true + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'convert')) run: | - pytest autotest/tools/convert -m 'not pr_test' -n 6 --alluredir=allure-results --dist loadgroup - - name: Test lmdeploy - interface turbomind case + pytest autotest/tools/convert -m 'not pr_test' -n 8 --alluredir=allure-results + - name: Test lmdeploy - chat workspace continue-on-error: true + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'chat')) timeout-minutes: 20 run: | - pytest autotest/interface/pipeline/test_pipeline_turbomind_func.py -m 'not pr_test' --alluredir=allure-results - - name: Test lmdeploy - pipeline turbomind - continue-on-error: true - timeout-minutes: 45 - run: pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'not pr_test' --alluredir=allure-results - - name: Test lmdeploy - pipeline torch + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results + - name: Test lmdeploy - chat hf turbomind continue-on-error: true - timeout-minutes: 75 - run: pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'not pr_test' --alluredir=allure-results - - name: Test lmdeploy - restful turbomind + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'chat')) + timeout-minutes: 20 + run: | + pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results + pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results + - name: Test lmdeploy - chat hf torch continue-on-error: true - timeout-minutes: 60 - run: pytest autotest/tools/restful/test_restful_chat_turbomind.py -m 'not pr_test' --alluredir=allure-results - - name: Test lmdeploy - restful torch + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'chat')) + timeout-minutes: 20 + run: | + pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results + pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results + - name: Test lmdeploy - pipeline turbomind continue-on-error: true - timeout-minutes: 80 - run: pytest autotest/tools/restful/test_restful_chat_pytorch.py -m 'not pr_test' --alluredir=allure-results - - name: Test lmdeploy - chat workspace + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'pipeline')) + timeout-minutes: 25 + run: | + pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results + pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results + - name: Test lmdeploy - restful turbomind continue-on-error: true + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'restful')) timeout-minutes: 30 run: | - pytest autotest/tools/chat/test_command_chat_workspace.py -m 'not pr_test' -n 4 --alluredir=allure-results - - name: Test lmdeploy - chat hf turbomind + pytest autotest/tools/restful/test_restful_chat_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results + pytest autotest/tools/restful/test_restful_chat_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results + - name: Test lmdeploy - interface pipeline turbomind case continue-on-error: true - timeout-minutes: 45 + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'interface-pipeline')) + timeout-minutes: 20 run: | - pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'not pr_test' -n 4 --alluredir=allure-results - - name: Test lmdeploy - chat hf torch + pytest autotest/interface/pipeline/test_pipeline_turbomind_func.py -m 'not pr_test' --alluredir=allure-results + - name: Test lmdeploy - pipeline torch + continue-on-error: true + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'pipeline')) + timeout-minutes: 25 + run: | + pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results + pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results + - name: Test lmdeploy - restful torch continue-on-error: true - timeout-minutes: 60 + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'restful')) + timeout-minutes: 40 run: | - pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'not pr_test' -n 4 --alluredir=allure-results + pytest autotest/tools/restful/test_restful_chat_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results + pytest autotest/tools/restful/test_restful_chat_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results - name: Test lmdeploy - rerun all fail cases - timeout-minutes: 60 + timeout-minutes: 30 run: | pytest autotest --lf --alluredir=allure-results - name: Generate reports diff --git a/.github/workflows/pr_ete_test.yml b/.github/workflows/pr_ete_test.yml index a41e639f30..08bf24b4b7 100644 --- a/.github/workflows/pr_ete_test.yml +++ b/.github/workflows/pr_ete_test.yml @@ -34,6 +34,7 @@ jobs: - /nvme/share_data/github-actions/pip-cache:/root/.cache/pip - /nvme/share_data/github-actions/packages:/root/packages - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/bigdisk/qa_test_models:/mnt/bigdisk/qa_test_models - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Setup systems @@ -81,7 +82,7 @@ jobs: lmdeploy check_env - name: Test lmdeploy timeout-minutes: 120 - run: CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m pr_test --alluredir=allure-results --clean-alluredir + run: CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m pr_test -v -s --alluredir=allure-results --clean-alluredir - name: Generate reports if: always() run: | diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index c4915e886d..590a019689 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -74,7 +74,7 @@ jobs: make -j$(nproc) && make install - name: Install lmdeploy run: | - python3 -m pip install pynvml packaging protobuf transformers_stream_generator transformers==4.33.0 + python3 -m pip install pynvml packaging protobuf transformers_stream_generator # manually install flash attn python3 -m pip install /root/packages/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl python3 -m pip install -r requirements.txt -r requirements/test.txt diff --git a/README.md b/README.md index 9f4f0d66b5..5cf1699902 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,23 @@
-[![docs](https://img.shields.io/badge/docs-latest-blue)](https://lmdeploy.readthedocs.io/en/latest/) -[![badge](https://github.com/InternLM/lmdeploy/workflows/lint/badge.svg)](https://github.com/InternLM/lmdeploy/actions) [![PyPI](https://img.shields.io/pypi/v/lmdeploy)](https://pypi.org/project/lmdeploy) +![PyPI - Downloads](https://img.shields.io/pypi/dm/lmdeploy) [![license](https://img.shields.io/github/license/InternLM/lmdeploy.svg)](https://github.com/InternLM/lmdeploy/tree/main/LICENSE) [![issue resolution](https://img.shields.io/github/issues-closed-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues) [![open issues](https://img.shields.io/github/issues-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues) +[📘Documentation](https://lmdeploy.readthedocs.io/en/latest/) | +[🛠️Quick Start](https://lmdeploy.readthedocs.io/en/latest/get_started.html) | +[🤔Reporting Issues](https://github.com/InternLM/lmdeploy/issues/new/choose) + English | [简体中文](README_zh-CN.md) -
+👋 join us on [![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=wechat&label=WeChat)](https://r.vansin.top/?r=internwx) +[![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=twitter&label=Twitter)](https://twitter.com/intern_lm) +[![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=discord&label=Discord)](https://discord.gg/xa29JuW87d) -

- 👋 join us on Twitter, Discord and WeChat -

+ ______________________________________________________________________ @@ -23,6 +26,7 @@ ______________________________________________________________________
2024 +- \[2024/02\] Support Qwen 1.5, Gemma, Mistral, Mixtral, Deepseek-MOE and so on. - \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) seamless integration with [LMDeploy Serving Service](./docs/en/serving/restful_api.md). - \[2024/01\] Support for multi-model, multi-machine, multi-card inference services. For usage instructions, please refer to [here](./docs/en/serving/proxy_server.md) - \[2024/01\] Support [PyTorch inference engine](./docs/en/inference/pytorch.md), developed entirely in Python, helping to lower the barriers for developers and enable rapid experimentation with new features and technologies. diff --git a/README_zh-CN.md b/README_zh-CN.md index 51155b819a..b7a0e61a69 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -1,20 +1,23 @@
-[![docs](https://img.shields.io/badge/docs-latest-blue)](https://lmdeploy.readthedocs.io/zh-cn/latest/) -[![badge](https://github.com/InternLM/lmdeploy/workflows/lint/badge.svg)](https://github.com/InternLM/lmdeploy/actions) [![PyPI](https://img.shields.io/pypi/v/lmdeploy)](https://pypi.org/project/lmdeploy) +![PyPI - Downloads](https://img.shields.io/pypi/dm/lmdeploy) [![license](https://img.shields.io/github/license/InternLM/lmdeploy.svg)](https://github.com/InternLM/lmdeploy/tree/main/LICENSE) [![issue resolution](https://img.shields.io/github/issues-closed-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues) [![open issues](https://img.shields.io/github/issues-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues) +[📘Documentation](https://lmdeploy.readthedocs.io/zh-cn/latest/) | +[🛠️Quick Start](https://lmdeploy.readthedocs.io/zh-cn/latest/get_started.html) | +[🤔Reporting Issues](https://github.com/InternLM/lmdeploy/issues/new/choose) + [English](README.md) | 简体中文 -
+👋 join us on [![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=wechat&label=WeChat)](https://r.vansin.top/?r=internwx) +[![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=twitter&label=Twitter)](https://twitter.com/intern_lm) +[![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=discord&label=Discord)](https://discord.gg/xa29JuW87d) -

- 👋 join us on Twitter, Discord and WeChat -

+ ______________________________________________________________________ @@ -23,6 +26,7 @@ ______________________________________________________________________
2024 +- \[2024/02\] 支持 Qwen 1.5、Gemma、Mistral、Mixtral、Deepseek-MOE 等模型 - \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) 发布,支持无缝接入[LMDeploy Serving Service](./docs/zh_cn/serving/restful_api.md) - \[2024/01\] 支持多模型、多机、多卡推理服务。使用方法请参考[此处](./docs/zh_cn/serving/proxy_server.md) - \[2024/01\] 增加 [PyTorch 推理引擎](./docs/zh_cn/inference/pytorch.md),作为 TurboMind 引擎的补充。帮助降低开发门槛,和快速实验新特性、新技术 diff --git a/autotest/config.yaml b/autotest/config.yaml index 60100c6fa8..75988f2891 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -1,4 +1,4 @@ -model_path: /nvme/qa_test_models +model_path: /mnt/bigdisk/qa_test_models dst_path: /nvme/qa_test_models/autotest_model log_path: /nvme/qa_test_models/autotest_model/log dataset_path: /nvme/qa_test_models/...dataset @@ -13,67 +13,69 @@ tp_config: turbomind_model: - - llama-2-7b-chat - - internlm2-chat-1_8b - - internlm-chat-7b - - internlm-chat-20b - - internlm2-chat-7b - - internlm2-chat-20b - - Qwen-7B-Chat - - Qwen-14B-Chat - - llama2-chat-7b-w4 - - Baichuan2-7B-Chat - - Yi-6B-Chat - - internlm2-1_8b - - internlm2-20b - - CodeLlama-7b-Instruct-hf + - meta-llama/Llama-2-7b-chat + - internlm/internlm2-chat-1_8b + - internlm/internlm-chat-7b + - internlm/internlm-chat-20b + - internlm/internlm2-chat-7b + - internlm/internlm2-chat-20b + - internlm/internlm2-chat-7b-4bits + - internlm/internlm2-chat-20b-4bits + - Qwen/Qwen-7B-Chat + - Qwen/Qwen-14B-Chat + - lmdeploy/llama2-chat-7b-w4 + - baichuan-inc/Baichuan2-7B-Chat + - 01-ai/Yi-6B-Chat + - internlm/internlm2-1_8b + - internlm/internlm2-20b + - codellama/CodeLlama-7b-Instruct-hf pytorch_model: - - llama-2-7b-chat - - internlm-chat-7b - - internlm-chat-20b - - internlm2-chat-7b - - internlm2-chat-20b - - Baichuan2-7B-Chat - - Baichuan2-13B-Chat - - chatglm2-6b - - falcon-7b - - Yi-6B-Chat - - internlm2-1_8b - - internlm2-20b - - Qwen1.5-7B-Chat - - Mistral-7B-Instruct-v0.1 - - Mixtral-8x7B-Instruct-v0.1 - - gemma-7b-it - - deepseek-moe-16b-chat + - meta-llama/Llama-2-7b-chat + - internlm/internlm-chat-7b + - internlm/internlm-chat-20b + - internlm/internlm2-chat-7b + - internlm/internlm2-chat-20b + - baichuan-inc/Baichuan2-7B-Chat + - baichuan-inc/Baichuan2-13B-Chat + - THUDM/chatglm2-6b + - tiiuae/falcon-7b + - 01-ai/Yi-6B-Chat + - internlm/internlm2-1_8b + - internlm/internlm2-20b + - Qwen/Qwen1.5-7B-Chat + - mistralai/Mistral-7B-Instruct-v0.1 + - mistralai/Mixtral-8x7B-Instruct-v0.1 + - google/gemma-7b-it + - deepseek-ai/deepseek-moe-16b-chat quatization_case_config: w4a16: - - llama-2-7b-chat - - internlm-chat-20b - - Qwen-7B-Chat - - Qwen-14B-Chat - - internlm2-chat-20b - - Baichuan2-7B-Chat - - internlm2-20b + - meta-llama/Llama-2-7b-chat + - internlm/internlm-chat-20b + - Qwen/Qwen-7B-Chat + - Qwen/Qwen-14B-Chat + - internlm/internlm2-chat-20b + - baichuan-inc/Baichuan2-7B-Chat + - internlm/internlm2-20b kvint8: # more models are supported kvint8 quantization, but the chat response are not good, already removed - - llama-2-7b-chat - - internlm-chat-20b - - internlm2-chat-20b + - meta-llama/Llama-2-7b-chat + - internlm/internlm-chat-20b + - internlm/internlm2-chat-20b kvint8_w4a16: - - llama-2-7b-chat - - internlm-chat-20b - - internlm2-chat-20b - - internlm2-20b - - Qwen-7B-Chat - - Qwen-14B-Chat - - Baichuan2-7B-Chat + - meta-llama/Llama-2-7b-chat + - internlm/internlm-chat-20b + - internlm/internlm2-chat-20b + - internlm/internlm2-20b + - Qwen/Qwen-7B-Chat + - Qwen/Qwen-14B-Chat + - baichuan-inc/Baichuan2-7B-Chat w8a8: - - llama-2-7b-chat - - internlm-chat-20b - - internlm2-chat-20b - - internlm2-chat-7b - - Yi-6B-Chat - - internlm2-20b + - meta-llama/Llama-2-7b-chat + - internlm/internlm-chat-20b + - internlm/internlm2-chat-20b + - internlm/internlm2-chat-7b + - 01-ai/Yi-6B-Chat + - internlm/internlm2-20b diff --git a/autotest/interface/pipeline/test_pipeline_turbomind_func.py b/autotest/interface/pipeline/test_pipeline_turbomind_func.py index 8251fae347..64a07b3ddb 100644 --- a/autotest/interface/pipeline/test_pipeline_turbomind_func.py +++ b/autotest/interface/pipeline/test_pipeline_turbomind_func.py @@ -10,7 +10,7 @@ @pytest.mark.flaky(reruns=0) class TestPipelineTurbomindFuncRegression: - @pytest.mark.parametrize('model', ['internlm2-chat-20b']) + @pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def test_backend_config_tp(self, config, model): with pytest.raises(AssertionError, match='tp should be 2\\^n'): model_path = '/'.join([config.get('model_path'), model]) @@ -18,7 +18,7 @@ def test_backend_config_tp(self, config, model): pipe = pipeline(model_path, backend_config=backend_config) del pipe - @pytest.mark.parametrize('model', ['internlm2-chat-20b']) + @pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def test_backend_config_session_len(self, config, model): model_path = '/'.join([config.get('model_path'), model]) backend_config = TurbomindEngineConfig(session_len=10) @@ -29,7 +29,7 @@ def test_backend_config_session_len(self, config, model): assert response[i].finish_reason == 'length', str(response[i]) assert response[i].generate_token_len == 0, str(response[i]) - @pytest.mark.parametrize('model', ['internlm2-chat-20b']) + @pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def test_gen_config_test(self, config, model): model_path = '/'.join([config.get('model_path'), model]) pipe = pipeline(model_path) @@ -111,7 +111,7 @@ def test_gen_config_test(self, config, model): del pipe - @pytest.mark.parametrize('model', ['internlm2-chat-20b']) + @pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def future_test_backend_config_cache_max_entry_count(self, config, model): model_path = '/'.join([config.get('model_path'), model]) backend_config = TurbomindEngineConfig(cache_max_entry_count=-1) @@ -122,7 +122,7 @@ def future_test_backend_config_cache_max_entry_count(self, config, model): with assume: assert response[i].finish_reason == 'length', str(response[i]) - @pytest.mark.parametrize('model', ['internlm2-chat-20b']) + @pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def test_backend_config_max_batch_size2(self, config, model): model_path = '/'.join([config.get('model_path'), model]) backend_config = TurbomindEngineConfig(max_batch_size=-1) @@ -140,7 +140,7 @@ def test_backend_config_max_batch_size2(self, config, model): with assume: assert response[i].text == '', str(response[i]) - @pytest.mark.parametrize('model', ['internlm2-chat-20b']) + @pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def test_pipeline_batch_infer(self, config, model): model_path = '/'.join([config.get('model_path'), model]) pipe = pipeline(model_path) @@ -160,7 +160,7 @@ def test_pipeline_batch_infer(self, config, model): with assume: assert response[i].session_id == i - @pytest.mark.parametrize('model', ['internlm2-chat-20b']) + @pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def test_pipeline_stream_infer(self, config, model): model_path = '/'.join([config.get('model_path'), model]) pipe = pipeline(model_path) @@ -207,7 +207,7 @@ def test_pipeline_stream_infer(self, config, model): with assume: assert outputs_list[-1].finish_reason is not None, str(output) - @pytest.mark.parametrize('model', ['internlm2-chat-20b']) + @pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def test_pipeline_stream_infer2(self, config, model): model_path = '/'.join([config.get('model_path'), model]) pipe = pipeline(model_path) diff --git a/autotest/interface/pipeline/test_pipeline_turbomind_longtext_func.py b/autotest/interface/pipeline/test_pipeline_turbomind_longtext_func.py new file mode 100644 index 0000000000..13bfd8aff3 --- /dev/null +++ b/autotest/interface/pipeline/test_pipeline_turbomind_longtext_func.py @@ -0,0 +1,88 @@ +import pytest +from utils.get_run_config import get_tp_num + +from lmdeploy import TurbomindEngineConfig, pipeline + + +@pytest.mark.order(8) +@pytest.mark.pipeline_func +@pytest.mark.timeout(600) +class TestPipelineLongtextFunc: + + def test_long_test_chat_7b(self, config): + model = 'internlm/internlm2-chat-7b' + tp_config = get_tp_num(config, model) + model_path = '/'.join([config.get('model_path'), model]) + + backend_config = TurbomindEngineConfig(rope_scaling_factor=2.0, + session_len=210000, + tp=tp_config) + pipe = pipeline(model_path, backend_config=backend_config) + prompt = '今 天 心 ' * int(200000 / 6) + + # batch infer + pipe(prompt) + + # stream infer + for outputs in pipe.stream_infer(prompt): + continue + + prompts = ['今 天 心 ' * int(200000 / 6)] * 2 + # batch infer + pipe(prompts) + + # stream infer + for outputs in pipe.stream_infer(prompts): + continue + + def test_long_test_chat_20b(self, config): + model = 'internlm/internlm2-chat-20b' + tp_config = get_tp_num(config, model) + model_path = '/'.join([config.get('model_path'), model]) + + backend_config = TurbomindEngineConfig(rope_scaling_factor=2.0, + session_len=210000, + tp=tp_config) + pipe = pipeline(model_path, backend_config=backend_config) + prompt = '今 天 心 ' * int(200000 / 6) + + # batch infer + pipe(prompt) + + # stream infer + for outputs in pipe.stream_infer(prompt): + continue + + prompts = ['今 天 心 ' * int(200000 / 6)] * 2 + # batch infer + pipe(prompts) + + # stream infer + for outputs in pipe.stream_infer(prompts): + continue + + def test_long_test_20b(self, config): + model = 'internlm/internlm2-20b' + tp_config = get_tp_num(config, model) + model_path = '/'.join([config.get('model_path'), model]) + + backend_config = TurbomindEngineConfig(rope_scaling_factor=2.0, + session_len=210000, + tp=tp_config) + pipe = pipeline(model_path, backend_config=backend_config) + prompt = '今 天 心 ' * int(200000 / 6) + + # batch infer + pipe(prompt) + + # stream infer + for outputs in pipe.stream_infer(prompt): + continue + + prompts = ['今 天 心 ' * int(200000 / 6)] * 2 + # batch infer + pipe(prompts) + + # stream infer + for outputs in pipe.stream_infer(prompts): + continue diff --git a/autotest/prompt_case.yaml b/autotest/prompt_case.yaml index e1839ce3f2..ce5d174518 100644 --- a/autotest/prompt_case.yaml +++ b/autotest/prompt_case.yaml @@ -77,6 +77,9 @@ chinese_poem_case: - internlm2-20b: - len_g: 5 + - falcon: + - len_g: + 5 english_poem_case: - write a romantic English poem: - contain: @@ -110,6 +113,7 @@ emoji_case: - \u2714 - 赞 - emoji + - '!' traditional_chinese_case: - 使用繁體介紹香港維多利亞港: - contain: diff --git a/autotest/tools/chat/test_command_chat_hf_pytorch.py b/autotest/tools/chat/test_command_chat_hf_pytorch.py index 5854584122..f0f8e1c8b3 100644 --- a/autotest/tools/chat/test_command_chat_hf_pytorch.py +++ b/autotest/tools/chat/test_command_chat_hf_pytorch.py @@ -1,7 +1,8 @@ import allure import conftest import pytest -from utils.config_utils import get_torch_model_list +from utils.config_utils import (get_cuda_prefix_by_workerid, + get_torch_model_list) from utils.run_client_chat import hf_command_line_test conftest._init_cli_case_list() @@ -15,12 +16,40 @@ def getCaseList(): @pytest.mark.order(10) @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat +@pytest.mark.gpu_num_1 @pytest.mark.parametrize('usercase', getCaseList()) -@pytest.mark.parametrize('model', get_torch_model_list()) -def test_hf_pytorch_chat(config, model, cli_case_config, usercase): - result, chat_log, msg = hf_command_line_test(config, usercase, - cli_case_config.get(usercase), - model, 'torch') +@pytest.mark.parametrize('model', get_torch_model_list(tp_num=1)) +def test_hf_pytorch_chat_tp1(config, model, cli_case_config, usercase, + worker_id): + result, chat_log, msg = hf_command_line_test( + config, + usercase, + cli_case_config.get(usercase), + model, + 'torch', + cuda_prefix=get_cuda_prefix_by_workerid(worker_id)) + if chat_log is not None: + allure.attach.file(chat_log, + attachment_type=allure.attachment_type.TEXT) + + assert result, msg + + +@pytest.mark.order(10) +@pytest.mark.usefixtures('cli_case_config') +@pytest.mark.hf_pytorch_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('usercase', getCaseList()) +@pytest.mark.parametrize('model', get_torch_model_list(tp_num=2)) +def test_hf_pytorch_chat_tp2(config, model, cli_case_config, usercase, + worker_id): + result, chat_log, msg = hf_command_line_test( + config, + usercase, + cli_case_config.get(usercase), + model, + 'torch', + cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2)) if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -34,7 +63,7 @@ def test_hf_pytorch_chat(config, model, cli_case_config, usercase): @pytest.mark.pr_test @pytest.mark.xdist_group(name='pr_test') @pytest.mark.parametrize('usercase', getCaseList()) -@pytest.mark.parametrize('model', ['internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def test_hf_pytorch_chat_pr(config, model, cli_case_config, usercase): result, chat_log, msg = hf_command_line_test( config, diff --git a/autotest/tools/chat/test_command_chat_hf_turbomind.py b/autotest/tools/chat/test_command_chat_hf_turbomind.py index 3c889fd26d..3e763c0ef2 100644 --- a/autotest/tools/chat/test_command_chat_hf_turbomind.py +++ b/autotest/tools/chat/test_command_chat_hf_turbomind.py @@ -1,7 +1,8 @@ import allure import conftest import pytest -from utils.config_utils import get_turbomind_model_list +from utils.config_utils import (get_cuda_prefix_by_workerid, + get_turbomind_model_list) from utils.run_client_chat import hf_command_line_test conftest._init_cli_case_list() @@ -15,12 +16,41 @@ def getCaseList(): @pytest.mark.order(10) @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat +@pytest.mark.gpu_num_1 @pytest.mark.parametrize('usercase', getCaseList()) -@pytest.mark.parametrize('model', get_turbomind_model_list()) -def test_hf_turbomind_chat(config, model, cli_case_config, usercase): - result, chat_log, msg = hf_command_line_test(config, usercase, - cli_case_config.get(usercase), - model, 'turbomind') +@pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1)) +def test_hf_turbomind_chat_tp1(config, model, cli_case_config, usercase, + worker_id): + result, chat_log, msg = hf_command_line_test( + config, + usercase, + cli_case_config.get(usercase), + model, + 'turbomind', + cuda_prefix=get_cuda_prefix_by_workerid(worker_id)) + + if chat_log is not None: + allure.attach.file(chat_log, + attachment_type=allure.attachment_type.TEXT) + + assert result, msg + + +@pytest.mark.order(10) +@pytest.mark.usefixtures('cli_case_config') +@pytest.mark.hf_turbomind_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('usercase', getCaseList()) +@pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=2)) +def test_hf_turbomind_chat_tp2(config, model, cli_case_config, usercase, + worker_id): + result, chat_log, msg = hf_command_line_test( + config, + usercase, + cli_case_config.get(usercase), + model, + 'turbomind', + cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2)) if chat_log is not None: allure.attach.file(chat_log, @@ -36,7 +66,8 @@ def test_hf_turbomind_chat(config, model, cli_case_config, usercase): @pytest.mark.xdist_group(name='pr_test') @pytest.mark.parametrize('usercase', getCaseList()) @pytest.mark.parametrize( - 'model', ['internlm2-chat-20b', 'internlm2-chat-20b-inner-w4a16']) + 'model', + ['internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-inner-w4a16']) def test_hf_turbomind_chat_pr(config, model, cli_case_config, usercase): result, chat_log, msg = hf_command_line_test( config, diff --git a/autotest/tools/chat/test_command_chat_workspace.py b/autotest/tools/chat/test_command_chat_workspace.py index 34f0608783..26afeaf998 100644 --- a/autotest/tools/chat/test_command_chat_workspace.py +++ b/autotest/tools/chat/test_command_chat_workspace.py @@ -1,7 +1,8 @@ import allure import conftest import pytest -from utils.config_utils import get_turbomind_model_list +from utils.config_utils import (get_cuda_prefix_by_workerid, + get_turbomind_model_list) from utils.run_client_chat import command_line_test conftest._init_cli_case_list() @@ -12,9 +13,9 @@ def getPromptCaseList(): return prompt_list -def getModelList(): +def getModelList(tp_num): return [ - item for item in get_turbomind_model_list() + item for item in get_turbomind_model_list(tp_num) if 'kvint8' not in item.lower() ] @@ -22,12 +23,39 @@ def getModelList(): @pytest.mark.order(10) @pytest.mark.usefixtures('cli_case_config') @pytest.mark.command_chat +@pytest.mark.gpu_num_1 @pytest.mark.parametrize('usercase', getPromptCaseList()) -@pytest.mark.parametrize('model', getModelList()) -def test_workspace_chat(config, cli_case_config, usercase, model): - result, chat_log, msg = command_line_test(config, usercase, - cli_case_config.get(usercase), - model, 'turbomind', None) +@pytest.mark.parametrize('model', getModelList(tp_num=1)) +def test_workspace_chat_tp1(config, cli_case_config, usercase, model, + worker_id): + result, chat_log, msg = command_line_test( + config, + usercase, + cli_case_config.get(usercase), + model, + 'turbomind', + cuda_prefix=get_cuda_prefix_by_workerid(worker_id)) + if chat_log is not None: + allure.attach.file(chat_log, + attachment_type=allure.attachment_type.TEXT) + assert result, msg + + +@pytest.mark.order(10) +@pytest.mark.usefixtures('cli_case_config') +@pytest.mark.command_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('usercase', getPromptCaseList()) +@pytest.mark.parametrize('model', getModelList(tp_num=2)) +def test_workspace_chat_tp2(config, cli_case_config, usercase, model, + worker_id): + result, chat_log, msg = command_line_test( + config, + usercase, + cli_case_config.get(usercase), + model, + 'turbomind', + cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2)) if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -38,10 +66,10 @@ def test_workspace_chat(config, cli_case_config, usercase, model): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.command_chat @pytest.mark.pr_test -@pytest.mark.xdist_group(name='pr_test') @pytest.mark.parametrize('usercase', getPromptCaseList()) @pytest.mark.parametrize( - 'model', ['internlm2-chat-20b', 'internlm2-chat-20b-inner-w4a16']) + 'model', + ['internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-inner-w4a16']) def test_workspace_chat_pr(config, cli_case_config, usercase, model): result, chat_log, msg = command_line_test( config, diff --git a/autotest/tools/convert/test_convert.py b/autotest/tools/convert/test_convert.py index 074ed8f93d..8fe8d30949 100644 --- a/autotest/tools/convert/test_convert.py +++ b/autotest/tools/convert/test_convert.py @@ -4,15 +4,16 @@ import allure import pytest -from utils.config_utils import get_turbomind_model_list +from utils.config_utils import (get_cuda_prefix_by_workerid, + get_turbomind_model_list) from utils.get_run_config import get_command_with_extra, get_model_name @pytest.mark.order(5) @pytest.mark.convert @pytest.mark.parametrize('model', get_turbomind_model_list()) -def test_convert(config, model): - convert(config, model) +def test_convert(config, model, worker_id): + convert(config, model, get_cuda_prefix_by_workerid(worker_id)) @pytest.mark.order(5) @@ -20,32 +21,40 @@ def test_convert(config, model): @pytest.mark.pr_test @pytest.mark.xdist_group(name='pr_test') @pytest.mark.parametrize( - 'model', ['internlm2-chat-20b', 'internlm2-chat-20b-inner-w4a16']) + 'model', + ['internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-inner-w4a16']) def test_convert_pr(config, model): - convert(config, model) + convert(config, model, 'CUDA_VISIBLE_DEVICES=5') -def convert(config, model_case): +def convert(config, model_case, cuda_prefix): origin_model_path = config.get('model_path') + '/' + model_case dst_path = config.get('dst_path') + '/workspace_' + model_case log_path = config.get('log_path') model_name = get_model_name(model_case) - if 'w4' in model_case: - cmd = get_command_with_extra( - ' '.join([ - 'lmdeploy convert', model_name, origin_model_path, - '--dst-path', dst_path, '--model-format awq --group-size 128' - ]), config, model_name, True) + if 'w4' in model_case or '4bits' in model_case: + cmd = get_command_with_extra(' '.join([ + 'lmdeploy convert', model_name, origin_model_path, '--dst-path', + dst_path, '--model-format awq --group-size 128' + ]), + config, + model_name, + True, + cuda_prefix=cuda_prefix) else: - cmd = get_command_with_extra( - ' '.join([ - 'lmdeploy convert', model_name, origin_model_path, - '--dst-path', dst_path - ]), config, model_name, True) + cmd = get_command_with_extra(' '.join([ + 'lmdeploy convert', model_name, origin_model_path, '--dst-path', + dst_path + ]), + config, + model_name, + True, + cuda_prefix=cuda_prefix) - convert_log = os.path.join(log_path, 'convert_' + model_case + '.log') + convert_log = os.path.join(log_path, + 'convert_' + model_case.split('/')[1] + '.log') print('reproduce command convert: ' + cmd + '\n') with open(convert_log, 'w') as f: # remove existing workspace diff --git a/autotest/tools/pipeline/pipeline_chat_script.py b/autotest/tools/pipeline/pipeline_chat_script.py index 70d4abcb36..f8a92d9b8f 100644 --- a/autotest/tools/pipeline/pipeline_chat_script.py +++ b/autotest/tools/pipeline/pipeline_chat_script.py @@ -30,7 +30,8 @@ def run_pipeline_chat_test(config, cases_info, model_case, tp, type): if 'pytorch' == type: backend_config = PytorchEngineConfig(tp=tp) else: - if 'kvint8' in model_case and 'w4' in model_case: + if 'kvint8' in model_case and ('w4' in model_case + or '4bits' in model_case): backend_config = TurbomindEngineConfig(tp=tp, model_format='awq', quant_policy=4) @@ -38,7 +39,7 @@ def run_pipeline_chat_test(config, cases_info, model_case, tp, type): backend_config = TurbomindEngineConfig(tp=tp, model_format='hf', quant_policy=4) - elif 'w4' in model_case: + elif 'w4' in model_case or '4bits' in model_case: backend_config = TurbomindEngineConfig(tp=tp, model_format='awq') else: backend_config = TurbomindEngineConfig(tp=tp) diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch.py index 5014f3a163..7e0318eebd 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch.py @@ -1,14 +1,15 @@ +import os from multiprocessing import Process import pytest -from utils.config_utils import get_torch_model_list +from utils.config_utils import get_cuda_id_by_workerid, get_torch_model_list from utils.pipeline_chat import (assert_pipeline_chat_log, run_pipeline_chat_test) -def getModelList(): +def getModelList(tp_num): return [ - item for item in get_torch_model_list() + item for item in get_torch_model_list(tp_num) if 'falcon' not in item.lower() and 'chatglm2' not in item.lower() ] @@ -16,9 +17,32 @@ def getModelList(): @pytest.mark.order(6) @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch +@pytest.mark.gpu_num_1 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('model', getModelList()) -def test_pipeline_chat_pytorch(config, common_case_config, model): +@pytest.mark.parametrize('model', getModelList(tp_num=1)) +def test_pipeline_chat_pytorch_tp1(config, common_case_config, model, + worker_id): + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + p = Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, 'pytorch')) + p.start() + p.join() + + # assert script + assert_pipeline_chat_log(config, common_case_config, model) + + +@pytest.mark.order(6) +@pytest.mark.usefixtures('common_case_config') +@pytest.mark.pipeline_chat_pytorch +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('model', getModelList(tp_num=2)) +def test_pipeline_chat_pytorch_tp2(config, common_case_config, model, + worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) p = Process(target=run_pipeline_chat_test, args=(config, common_case_config, model, 'pytorch')) p.start() @@ -33,7 +57,7 @@ def test_pipeline_chat_pytorch(config, common_case_config, model): @pytest.mark.pipeline_chat_pytorch @pytest.mark.flaky(reruns=0) @pytest.mark.pr_test -@pytest.mark.parametrize('model', ['internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def test_pipeline_chat_pytorch_pr(config, common_case_config, model): p = Process(target=run_pipeline_chat_test, args=(config, common_case_config, model, 'pytorch')) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind.py index 90773d39bc..e12db44c1e 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind.py @@ -1,7 +1,8 @@ +import os from multiprocessing import Process import pytest -from utils.config_utils import get_turbomind_model_list +from utils.config_utils import get_all_model_list, get_cuda_id_by_workerid from utils.pipeline_chat import (assert_pipeline_chat_log, run_pipeline_chat_test) @@ -9,9 +10,29 @@ @pytest.mark.order(6) @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('model', get_turbomind_model_list()) -def test_pipeline_chat(config, common_case_config, model): +@pytest.mark.parametrize('model', get_all_model_list(tp_num=1)) +def test_pipeline_chat_tp1(config, common_case_config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + p = Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, 'turbomind')) + p.start() + p.join() + assert_pipeline_chat_log(config, common_case_config, model) + + +@pytest.mark.order(6) +@pytest.mark.usefixtures('common_case_config') +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('model', get_all_model_list(tp_num=2)) +def test_pipeline_chat_tp2(config, common_case_config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) p = Process(target=run_pipeline_chat_test, args=(config, common_case_config, model, 'turbomind')) p.start() @@ -25,7 +46,8 @@ def test_pipeline_chat(config, common_case_config, model): @pytest.mark.flaky(reruns=0) @pytest.mark.pr_test @pytest.mark.parametrize( - 'model', ['internlm2-chat-20b', 'internlm2-chat-20b-inner-w4a16']) + 'model', + ['internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-inner-w4a16']) def test_pipeline_chat_pr(config, common_case_config, model): p = Process(target=run_pipeline_chat_test, args=(config, common_case_config, model, 'turbomind')) diff --git a/autotest/tools/quantization/test_quantization_kvint8.py b/autotest/tools/quantization/test_quantization_kvint8.py index 77957a676a..7c57d766e0 100644 --- a/autotest/tools/quantization/test_quantization_kvint8.py +++ b/autotest/tools/quantization/test_quantization_kvint8.py @@ -2,23 +2,23 @@ import allure import pytest +from utils.config_utils import get_cuda_prefix_by_workerid from utils.quantization_utils import quantization -model_list = [('llama-2-7b-chat', 'CUDA_VISIBLE_DEVICES=1'), - ('internlm-chat-20b', 'CUDA_VISIBLE_DEVICES=2'), - ('internlm2-chat-20b', 'CUDA_VISIBLE_DEVICES=3'), - ('Qwen-7B-Chat', 'CUDA_VISIBLE_DEVICES=4'), - ('Qwen-14B-Chat', 'CUDA_VISIBLE_DEVICES=5'), - ('internlm2-20b', 'CUDA_VISIBLE_DEVICES=6'), - ('Baichuan2-7B-Chat', 'CUDA_VISIBLE_DEVICES=7')] +model_list = [ + 'meta-llama/Llama-2-7b-chat', 'internlm/internlm-chat-20b', + 'internlm/internlm2-chat-20b', 'Qwen/Qwen-7B-Chat', 'Qwen/Qwen-14B-Chat', + 'internlm/internlm2-20b', 'baichuan-inc/Baichuan2-7B-Chat' +] @pytest.mark.order(1) @pytest.mark.quantization_kvint8 @pytest.mark.timeout(900) -@pytest.mark.parametrize('model, prefix', model_list) -def test_quantization_kvint8(config, model, prefix): - quantization_kvint8(config, model + '-inner-kvint8', model, prefix) +@pytest.mark.parametrize('model', model_list) +def test_quantization_kvint8(config, model, worker_id): + quantization_kvint8(config, model + '-inner-kvint8', model, + get_cuda_prefix_by_workerid(worker_id)) def quantization_kvint8(config, quantization_model_name, origin_model_name, @@ -29,9 +29,10 @@ def quantization_kvint8(config, quantization_model_name, origin_model_name, cuda_prefix) log_path = config.get('log_path') quantization_log = os.path.join( - log_path, - '_'.join(['quantization', quantization_type, quantization_model_name - ]) + '.log') + log_path, '_'.join([ + 'quantization', quantization_type, + quantization_model_name.split('/')[1] + ]) + '.log') allure.attach.file(quantization_log, attachment_type=allure.attachment_type.TEXT) diff --git a/autotest/tools/quantization/test_quantization_kvint8_w4a16.py b/autotest/tools/quantization/test_quantization_kvint8_w4a16.py index 9a1c5b6555..44dc1751fb 100644 --- a/autotest/tools/quantization/test_quantization_kvint8_w4a16.py +++ b/autotest/tools/quantization/test_quantization_kvint8_w4a16.py @@ -2,23 +2,26 @@ import allure import pytest +from utils.config_utils import get_cuda_prefix_by_workerid from utils.quantization_utils import quantization -model_list = [('llama-2-7b-chat-inner-kvint8', 'CUDA_VISIBLE_DEVICES=1'), - ('internlm-chat-20b-inner-kvint8', 'CUDA_VISIBLE_DEVICES=2'), - ('internlm2-chat-20b-inner-kvint8', 'CUDA_VISIBLE_DEVICES=3'), - ('Qwen-7B-Chat-inner-kvint8', 'CUDA_VISIBLE_DEVICES=4'), - ('Qwen-14B-Chat-inner-kvint8', 'CUDA_VISIBLE_DEVICES=5'), - ('internlm2-20b-inner-kvint8', 'CUDA_VISIBLE_DEVICES=6'), - ('Baichuan2-7B-Chat-inner-kvint8', 'CUDA_VISIBLE_DEVICES=7')] +model_list = [ + 'meta-llama/Llama-2-7b-chat-inner-kvint8', + 'internlm/internlm-chat-20b-inner-kvint8', + 'internlm/internlm2-chat-20b-inner-kvint8', + 'Qwen/Qwen-7B-Chat-inner-kvint8', 'Qwen/Qwen-14B-Chat-inner-kvint8', + 'internlm/internlm2-20b-inner-kvint8', + 'baichuan-inc/Baichuan2-7B-Chat-inner-kvint8' +] @pytest.mark.order(4) @pytest.mark.quantization_kvint8_w4a16 @pytest.mark.timeout(900) -@pytest.mark.parametrize('model, prefix', model_list) -def test_quantization_kvint8_w4a16(config, model, prefix): - quantization_kvint8(config, model + '-w4a16', model, prefix) +@pytest.mark.parametrize('model', model_list) +def test_quantization_kvint8_w4a16(config, model, worker_id): + quantization_kvint8(config, model + '-w4a16', model, + get_cuda_prefix_by_workerid(worker_id)) def quantization_kvint8(config, quantization_model_name, origin_model_name, @@ -29,9 +32,10 @@ def quantization_kvint8(config, quantization_model_name, origin_model_name, cuda_prefix) log_path = config.get('log_path') quantization_log = os.path.join( - log_path, - '_'.join(['quantization', quantization_type, quantization_model_name - ]) + '.log') + log_path, '_'.join([ + 'quantization', quantization_type, + quantization_model_name.split('/')[1] + ]) + '.log') allure.attach.file(quantization_log, attachment_type=allure.attachment_type.TEXT) diff --git a/autotest/tools/quantization/test_quantization_w4a16.py b/autotest/tools/quantization/test_quantization_w4a16.py index 15749ba70b..3bafadd494 100644 --- a/autotest/tools/quantization/test_quantization_w4a16.py +++ b/autotest/tools/quantization/test_quantization_w4a16.py @@ -2,32 +2,34 @@ import allure import pytest +from utils.config_utils import get_cuda_prefix_by_workerid from utils.quantization_utils import quantization -model_list = [('llama-2-7b-chat', 'CUDA_VISIBLE_DEVICES=0'), - ('internlm-chat-20b', 'CUDA_VISIBLE_DEVICES=1'), - ('Qwen-7B-Chat', 'CUDA_VISIBLE_DEVICES=2'), - ('Qwen-14B-Chat', 'CUDA_VISIBLE_DEVICES=3'), - ('Qwen-VL', 'CUDA_VISIBLE_DEVICES=4'), - ('internlm2-chat-20b', 'CUDA_VISIBLE_DEVICES=5'), - ('internlm2-20b', 'CUDA_VISIBLE_DEVICES=6'), - ('Baichuan2-7B-Chat', 'CUDA_VISIBLE_DEVICES=7')] +model_list = [ + 'meta-llama/Llama-2-7b-chat', 'internlm/internlm-chat-20b', + 'Qwen/Qwen-7B-Chat', 'Qwen/Qwen-14B-Chat', 'Qwen/Qwen-VL', + 'internlm/internlm2-chat-20b', 'internlm/internlm2-20b', + 'baichuan-inc/Baichuan2-7B-Chat' +] @pytest.mark.order(3) @pytest.mark.quantization_w4a16 @pytest.mark.timeout(900) -@pytest.mark.parametrize('model, prefix', model_list) -def test_quantization_w4a16(config, model, prefix): - quantization_w4a16(config, model + '-inner-w4a16', model, prefix) +@pytest.mark.parametrize('model', model_list) +def test_quantization_w4a16(config, model, worker_id): + quantization_w4a16(config, model + '-inner-w4a16', model, + get_cuda_prefix_by_workerid(worker_id)) @pytest.mark.order(3) @pytest.mark.quantization_w4a16 @pytest.mark.pr_test +@pytest.mark.flaky(reruns=0) @pytest.mark.timeout(900) -@pytest.mark.parametrize('model, prefix', - [('internlm2-chat-20b', 'CUDA_VISIBLE_DEVICES=5')]) +@pytest.mark.parametrize( + 'model, prefix', + [('internlm/internlm2-chat-20b', 'CUDA_VISIBLE_DEVICES=5')]) def test_quantization_w4a16_pr(config, model, prefix): quantization_w4a16(config, model + '-inner-w4a16', model, prefix) @@ -40,9 +42,10 @@ def quantization_w4a16(config, quantization_model_name, origin_model_name, cuda_prefix) log_path = config.get('log_path') quantization_log = os.path.join( - log_path, - '_'.join(['quantization', quantization_type, quantization_model_name - ]) + '.log') + log_path, '_'.join([ + 'quantization', quantization_type, + quantization_model_name.split('/')[1] + ]) + '.log') allure.attach.file(quantization_log, attachment_type=allure.attachment_type.TEXT) diff --git a/autotest/tools/quantization/test_quantization_w8a8.py b/autotest/tools/quantization/test_quantization_w8a8.py index 7e6690d423..37a198c6d5 100644 --- a/autotest/tools/quantization/test_quantization_w8a8.py +++ b/autotest/tools/quantization/test_quantization_w8a8.py @@ -2,25 +2,23 @@ import allure import pytest +from utils.config_utils import get_cuda_prefix_by_workerid from utils.quantization_utils import quantization -model_list = [('llama-2-7b-chat', 'CUDA_VISIBLE_DEVICES=0'), - ('internlm-chat-20b', 'CUDA_VISIBLE_DEVICES=1'), - ('internlm2-chat-20b', 'CUDA_VISIBLE_DEVICES=2'), - ('internlm2-chat-7b', 'CUDA_VISIBLE_DEVICES=3'), - ('Yi-6B-Chat', 'CUDA_VISIBLE_DEVICES=4'), - ('internlm2-20b', 'CUDA_VISIBLE_DEVICES=5')] - -# ('Baichuan2-7B-Chat', 'CUDA_VISIBLE_DEVICES=6') -# ('Baichuan2-13B-Chat', 'CUDA_VISIBLE_DEVICES=7') +model_list = [ + 'meta-llama/Llama-2-7b-chat', 'internlm/internlm-chat-20b', + 'internlm/internlm2-chat-20b', 'internlm/internlm2-chat-7b', + '01-ai/Yi-6B-Chat', 'internlm/internlm2-20b' +] @pytest.mark.order(2) @pytest.mark.quantization_w8a8 @pytest.mark.timeout(900) -@pytest.mark.parametrize('model, prefix', model_list) -def test_quantization_w8a8(config, model, prefix): - quantization_w8a8(config, model + '-inner-w8a8', model, prefix) +@pytest.mark.parametrize('model', model_list) +def test_quantization_w8a8(config, model, worker_id): + quantization_w8a8(config, model + '-inner-w8a8', model, + get_cuda_prefix_by_workerid(worker_id)) def quantization_w8a8(config, quantization_model_name, origin_model_name, @@ -31,9 +29,10 @@ def quantization_w8a8(config, quantization_model_name, origin_model_name, cuda_prefix) log_path = config.get('log_path') quantization_log = os.path.join( - log_path, - '_'.join(['quantization', quantization_type, quantization_model_name - ]) + '.log') + log_path, '_'.join([ + 'quantization', quantization_type, + quantization_model_name.split('/')[1] + ]) + '.log') allure.attach.file(quantization_log, attachment_type=allure.attachment_type.TEXT) diff --git a/autotest/tools/restful/test_restful_chat_pytorch.py b/autotest/tools/restful/test_restful_chat_pytorch.py index 1b6fe5607a..6c5b33aa3f 100644 --- a/autotest/tools/restful/test_restful_chat_pytorch.py +++ b/autotest/tools/restful/test_restful_chat_pytorch.py @@ -5,35 +5,50 @@ import allure import pytest from pytest import assume -from utils.config_utils import get_torch_model_list +from utils.config_utils import (get_cuda_prefix_by_workerid, + get_torch_model_list, get_workerid) from utils.get_run_config import get_command_with_extra from utils.run_client_chat import command_line_test from utils.run_restful_chat import (get_model, health_check, interactive_test, open_chat_test) -HTTP_URL = 'http://localhost:23333' +BASE_HTTP_URL = 'http://localhost' +DEFAULT_PORT = 23333 @pytest.fixture(scope='function', autouse=True) -def prepare_environment(request, config): +def prepare_environment(request, config, worker_id): model_path = config.get('model_path') log_path = config.get('log_path') - model = request.param + param = request.param + model = param['model'] + cuda_prefix = param['cuda_prefix'] + tp_num = param['tp_num'] - cmd = ['lmdeploy serve api_server ' + model_path + '/' + model] + if cuda_prefix is None: + cuda_prefix = get_cuda_prefix_by_workerid(worker_id, tp_num=tp_num) + + worker_num = get_workerid(worker_id) + if worker_num is None: + port = DEFAULT_PORT + else: + port = DEFAULT_PORT + worker_num cmd = get_command_with_extra('lmdeploy serve api_server ' + model_path + - '/' + model + ' --backend pytorch', + '/' + model + ' --backend pytorch' + + ' --server-port ' + str(port), config, model, need_tp=True) - start_log = os.path.join(log_path, 'start_restful_' + model + '.log') + print('reproduce command restful: ' + cmd) + + start_log = os.path.join(log_path, + 'start_restful_' + model.split('/')[1] + '.log') with open(start_log, 'w') as f: f.writelines('reproduce command restful: ' + cmd + '\n') - print('reproduce command restful: ' + cmd) # convert convertRes = subprocess.Popen([cmd], @@ -45,7 +60,7 @@ def prepare_environment(request, config): pid = convertRes.pid allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT) - http_url = HTTP_URL + http_url = BASE_HTTP_URL + ':' + str(port) start_time = int(time()) sleep(5) for i in range(120): @@ -58,42 +73,69 @@ def prepare_environment(request, config): yield if pid > 0: - kill_log = os.path.join(log_path, 'kill_' + model + '.log') + kill_log = os.path.join(log_path, + 'kill_' + model.split('/')[1] + '.log') - subprocess.Popen([ - "ps -ef | grep multiprocessing | grep -v grep | awk '{print $2}' " - + '| xargs kill -9' - ], - shell=True, - text=True, - encoding='utf-8') with open(kill_log, 'w') as f: convertRes.kill() allure.attach.file(kill_log, attachment_type=allure.attachment_type.TEXT) -def getModelList(): - return [ - item for item in get_torch_model_list() if 'chat' in item.lower() - and 'falcon' not in item.lower() and 'chatglm2' not in item.lower() - ] +def getModelList(tp_num): + return [{ + 'model': item, + 'cuda_prefix': None, + 'tp_num': tp_num + } for item in get_torch_model_list(tp_num) if 'chat' in item.lower()] @pytest.mark.order(7) @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api_pytorch +@pytest.mark.gpu_num_1 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('prepare_environment', getModelList(), indirect=True) -def test_restful_chat(config, common_case_config): - run_all_step(config, common_case_config) +@pytest.mark.parametrize('prepare_environment', + getModelList(tp_num=1), + indirect=True) +def test_restful_chat_tp1(config, common_case_config, worker_id): + if get_workerid(worker_id) is None: + run_all_step(config, common_case_config) + else: + run_all_step(config, + common_case_config, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) -def run_all_step(config, cases_info): - http_url = HTTP_URL +@pytest.mark.order(7) +@pytest.mark.usefixtures('common_case_config') +@pytest.mark.restful_api_pytorch +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', + getModelList(tp_num=2), + indirect=True) +def test_restful_chat_tp2(config, common_case_config, worker_id): + if get_workerid(worker_id) is None: + run_all_step(config, common_case_config) + else: + run_all_step(config, + common_case_config, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + +def run_all_step(config, + cases_info, + worker_id: str = 'default', + port: int = DEFAULT_PORT): + http_url = BASE_HTTP_URL + ':' + str(port) model = get_model(http_url) - print(model) + if model is None: + assert False, 'server not start correctly' + for case in cases_info.keys(): if (case == 'memory_test' or case == 'emoji_case') and 'chat' not in model.lower(): @@ -103,15 +145,17 @@ def run_all_step(config, cases_info): with allure.step(case + ' step1 - command chat regression'): chat_result, chat_log, msg = command_line_test( - config, case, case_info, model, 'api_client', http_url) - allure.attach.file(chat_log, - attachment_type=allure.attachment_type.TEXT) - with assume: - assert chat_result, msg + config, case, case_info, model + worker_id, 'api_client', + http_url) + if chat_log is not None: + allure.attach.file(chat_log, + attachment_type=allure.attachment_type.TEXT) + with assume: + assert chat_result, msg with allure.step(case + ' step2 - restful_test - openai chat'): restful_result, restful_log, msg = open_chat_test( - config, case_info, model, http_url) + config, case_info, model, http_url, worker_id) allure.attach.file(restful_log, attachment_type=allure.attachment_type.TEXT) with assume: @@ -119,7 +163,7 @@ def run_all_step(config, cases_info): with allure.step(case + ' step3 - restful_test - interactive chat'): active_result, interactive_log, msg = interactive_test( - config, case_info, model, http_url) + config, case_info, model, http_url, worker_id) allure.attach.file(interactive_log, attachment_type=allure.attachment_type.TEXT) diff --git a/autotest/tools/restful/test_restful_chat_turbomind.py b/autotest/tools/restful/test_restful_chat_turbomind.py index cad858333a..f442aec10a 100644 --- a/autotest/tools/restful/test_restful_chat_turbomind.py +++ b/autotest/tools/restful/test_restful_chat_turbomind.py @@ -5,41 +5,56 @@ import allure import pytest from pytest import assume -from utils.config_utils import get_turbomind_model_list +from utils.config_utils import (get_all_model_list, + get_cuda_prefix_by_workerid, get_workerid) from utils.get_run_config import get_command_with_extra from utils.run_client_chat import command_line_test from utils.run_restful_chat import (get_model, health_check, interactive_test, open_chat_test) -HTTP_URL = 'http://localhost:23333' +BASE_HTTP_URL = 'http://localhost' +DEFAULT_PORT = 23333 @pytest.fixture(scope='function', autouse=True) -def prepare_environment(request, config): +def prepare_environment(request, config, worker_id): model_path = config.get('model_path') log_path = config.get('log_path') param = request.param model = param['model'] cuda_prefix = param['cuda_prefix'] + tp_num = param['tp_num'] + + if cuda_prefix is None: + cuda_prefix = get_cuda_prefix_by_workerid(worker_id, tp_num=tp_num) + + worker_num = get_workerid(worker_id) + if worker_num is None: + port = DEFAULT_PORT + else: + port = DEFAULT_PORT + worker_num cmd = ['lmdeploy serve api_server ' + model_path + '/' + model] cmd = get_command_with_extra('lmdeploy serve api_server ' + model_path + - '/' + model, + '/' + model + ' --server-port ' + str(port), config, model, need_tp=True, cuda_prefix=cuda_prefix) - if 'kvint8' in model and 'w4' not in model: - cmd += ' --model-format hf --quant-policy 4' - if 'kvint8' in model and 'w4' in model: + if 'kvint8' in model: cmd += ' --quant-policy 4' - if 'w4' in model: + if 'w4' in model or '4bits' in model: + cmd += ' --model-format awq' + else: + cmd += ' --model-format hf' + if 'w4' in model or '4bits' in model: cmd += ' --model-format awq' - start_log = os.path.join(log_path, 'start_restful_' + model + '.log') + start_log = os.path.join(log_path, + 'start_restful_' + model.split('/')[1] + '.log') print('reproduce command restful: ' + cmd) @@ -56,7 +71,7 @@ def prepare_environment(request, config): pid = convertRes.pid allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT) - http_url = HTTP_URL + http_url = BASE_HTTP_URL + ':' + str(port) start_time = int(time()) sleep(5) for i in range(120): @@ -68,7 +83,8 @@ def prepare_environment(request, config): break yield if pid > 0: - kill_log = os.path.join(log_path, 'kill_' + model + '.log') + kill_log = os.path.join(log_path, + 'kill_' + model.split('/')[1] + '.log') with open(kill_log, 'w') as f: convertRes.kill() @@ -76,45 +92,79 @@ def prepare_environment(request, config): allure.attach.file(kill_log, attachment_type=allure.attachment_type.TEXT) -def getModelList(): +def getModelList(tp_num): return [{ 'model': item, - 'cuda_prefix': None - } for item in get_turbomind_model_list() if 'chat' in item.lower()] + 'cuda_prefix': None, + 'tp_num': tp_num + } for item in get_all_model_list(tp_num) if 'chat' in item.lower()] @pytest.mark.order(7) @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api +@pytest.mark.gpu_num_1 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('prepare_environment', getModelList(), indirect=True) -def test_restful_chat(config, common_case_config): - run_all_step(config, common_case_config) +@pytest.mark.parametrize('prepare_environment', + getModelList(tp_num=1), + indirect=True) +def test_restful_chat_tp1(request, config, common_case_config, worker_id): + if get_workerid(worker_id) is None: + run_all_step(config, common_case_config) + else: + run_all_step(config, + common_case_config, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) @pytest.mark.order(7) @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api +@pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) -@pytest.mark.pr_test @pytest.mark.parametrize('prepare_environment', - [{ - 'model': 'internlm2-chat-20b', - 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6' - }, { - 'model': 'internlm2-chat-20b-inner-w4a16', - 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6' - }], + getModelList(tp_num=2), + indirect=True) +def test_restful_chat_tp2(config, common_case_config, worker_id): + if get_workerid(worker_id) is None: + run_all_step(config, common_case_config) + else: + run_all_step(config, + common_case_config, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.usefixtures('common_case_config') +@pytest.mark.restful_api +@pytest.mark.flaky(reruns=0) +@pytest.mark.pr_test +@pytest.mark.parametrize('prepare_environment', [{ + 'model': 'internlm/internlm2-chat-20b', + 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6', + 'tp_num': 2 +}, { + 'model': 'internlm/internlm2-chat-20b-inner-w4a16', + 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6', + 'tp_num': 2 +}], indirect=True) def test_restful_chat_pr(config, common_case_config): run_all_step(config, common_case_config) -def run_all_step(config, cases_info): - http_url = HTTP_URL +def run_all_step(config, + cases_info, + worker_id: str = 'default', + port: int = DEFAULT_PORT): + http_url = BASE_HTTP_URL + ':' + str(port) model = get_model(http_url) - print(model) + + if model is None: + assert False, 'server not start correctly' for case in cases_info.keys(): if (case == 'memory_test' or case == 'emoji_case') and 'chat' not in model.lower(): diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index f811e94f22..48905a1063 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -1,11 +1,11 @@ import os import yaml +from utils.get_run_config import get_tp_num -def get_turbomind_model_list(): +def get_turbomind_model_list(tp_num: int = None): config_path = os.path.join('autotest/config.yaml') - print(config_path) with open(config_path) as f: config = yaml.load(f.read(), Loader=yaml.SafeLoader) @@ -18,12 +18,16 @@ def get_turbomind_model_list(): for key in quatization_case_config.get('kvint8_w4a16'): case_list.append(key + '-inner-kvint8-w4a16') + if tp_num is not None: + return [ + item for item in case_list if get_tp_num(config, item) == tp_num + ] + return case_list -def get_torch_model_list(): +def get_torch_model_list(tp_num: int = None): config_path = os.path.join('autotest/config.yaml') - print(config_path) with open(config_path) as f: config = yaml.load(f.read(), Loader=yaml.SafeLoader) @@ -32,4 +36,64 @@ def get_torch_model_list(): for key in quatization_case_config.get('w8a8'): case_list.append(key + '-inner-w8a8') + if tp_num is not None: + return [ + item for item in case_list if get_tp_num(config, item) == tp_num + ] + return case_list + + +def get_all_model_list(tp_num: int = None): + config_path = os.path.join('autotest/config.yaml') + with open(config_path) as f: + config = yaml.load(f.read(), Loader=yaml.SafeLoader) + + case_list = config.get('turbomind_model') + for key in config.get('pytorch_model'): + if key not in case_list: + case_list.append(key) + quatization_case_config = config.get('quatization_case_config') + for key in quatization_case_config.get('w4a16'): + case_list.append(key + '-inner-w4a16') + for key in quatization_case_config.get('kvint8'): + case_list.append(key + '-inner-kvint8') + for key in quatization_case_config.get('kvint8_w4a16'): + case_list.append(key + '-inner-kvint8-w4a16') + + if tp_num is not None: + return [ + item for item in case_list if get_tp_num(config, item) == tp_num + ] + + return case_list + + +def get_cuda_prefix_by_workerid(worker_id, tp_num: int = 1): + if worker_id is None or 'gw' not in worker_id: + return None + else: + if tp_num == 1: + return 'CUDA_VISIBLE_DEVICES=' + worker_id.replace('gw', '') + elif tp_num == 2: + cuda_num = int(worker_id.replace('gw', '')) * 2 + return 'CUDA_VISIBLE_DEVICES=' + ','.join( + [str(cuda_num), str(cuda_num + 1)]) + + +def get_cuda_id_by_workerid(worker_id, tp_num: int = 1): + if worker_id is None or 'gw' not in worker_id: + return None + else: + if tp_num == 1: + return worker_id.replace('gw', '') + elif tp_num == 2: + cuda_num = int(worker_id.replace('gw', '')) * 2 + return ','.join([str(cuda_num), str(cuda_num + 1)]) + + +def get_workerid(worker_id): + if worker_id is None or 'gw' not in worker_id: + return None + else: + return int(worker_id.replace('gw', '')) diff --git a/autotest/utils/get_run_config.py b/autotest/utils/get_run_config.py index 03c53cf5d5..120446f47d 100644 --- a/autotest/utils/get_run_config.py +++ b/autotest/utils/get_run_config.py @@ -102,12 +102,21 @@ def _get_available_cude(): def _simple_model_name(model): - model_name = model.replace('-inner-w4a16', '') + if '/' in model: + model_name = model.split('/')[1] + else: + model_name = model + model_name = model_name.replace('-inner-w4a16', '') model_name = model_name.replace('-inner-w8a8', '') model_name = model_name.replace('-inner-kvint8', '') model_name = model_name.replace('-w4a16', '') return model_name +def _split_model_name(model): + model_name = model.split('/')[1] + return model_name + + if __name__ == '__main__': - print(_simple_model_name('Baichuan2-7B-Chat-inner-w4a16')) + print(_simple_model_name('baichuan-inc/Baichuan2-7B-Chat-inner-w4a16')) diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 274730d4a1..0d65ad6ae0 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -27,7 +27,8 @@ def run_pipeline_chat_test(config, cases_info, model_case, type): if 'pytorch' == type: backend_config = PytorchEngineConfig(tp=tp) else: - if 'kvint8' in model_case and 'w4' in model_case: + if 'kvint8' in model_case and ('w4' in model_case + or '4bits' in model_case): backend_config = TurbomindEngineConfig(tp=tp, model_format='awq', quant_policy=4) @@ -35,7 +36,7 @@ def run_pipeline_chat_test(config, cases_info, model_case, type): backend_config = TurbomindEngineConfig(tp=tp, model_format='hf', quant_policy=4) - elif 'w4' in model_case: + elif 'w4' in model_case or '4bits' in model_case: backend_config = TurbomindEngineConfig(tp=tp, model_format='awq') else: backend_config = TurbomindEngineConfig(tp=tp) @@ -43,6 +44,7 @@ def run_pipeline_chat_test(config, cases_info, model_case, type): # run testcases gen_config = GenerationConfig(temperature=0.01) + gen_config = GenerationConfig() for case in cases_info.keys(): if (case == 'memory_test' or case == 'emoji_case') and 'chat' not in model_case.lower(): @@ -50,7 +52,8 @@ def run_pipeline_chat_test(config, cases_info, model_case, type): case_info = cases_info.get(case) pipeline_chat_log = os.path.join( - log_path, 'pipeline_chat_' + model_case + '_' + case + '.log') + log_path, + 'pipeline_chat_' + model_case.split('/')[1] + '_' + case + '.log') file = open(pipeline_chat_log, 'w') @@ -94,7 +97,8 @@ def assert_pipeline_chat_log(config, cases_info, model_case): result = False with allure.step('case - ' + case): pipeline_chat_log = os.path.join( - log_path, 'pipeline_chat_' + model_case + '_' + case + '.log') + log_path, 'pipeline_chat_' + model_case.split('/')[1] + '_' + + case + '.log') with open(pipeline_chat_log, 'r') as f: lines = f.readlines() diff --git a/autotest/utils/quantization_utils.py b/autotest/utils/quantization_utils.py index 759f6a5169..d4b4272713 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -13,9 +13,10 @@ def quantization(config, origin_model_path = config.get('model_path') + '/' + origin_model_name quantization_model_path = model_path + '/' + quantization_model_name quantization_log = os.path.join( - log_path, - '_'.join(['quantization', quantization_type, quantization_model_name - ]) + '.log') + log_path, '_'.join([ + 'quantization', quantization_type, + quantization_model_name.split('/')[1] + ]) + '.log') if quantization_type == 'w4a16': quantization_cmd = ' '.join([ diff --git a/autotest/utils/run_client_chat.py b/autotest/utils/run_client_chat.py index cfdd2bfa54..334154a95a 100644 --- a/autotest/utils/run_client_chat.py +++ b/autotest/utils/run_client_chat.py @@ -10,7 +10,7 @@ def command_line_test(config, case_info, model_case, type, - extra, + extra: str = None, cuda_prefix: str = None): dst_path = config.get('dst_path') @@ -24,12 +24,16 @@ def command_line_test(config, config, model_case, cuda_prefix=cuda_prefix) - if 'kvint8' in model_case and 'w4' not in model_case: - cmd += ' --model-format hf --quant-policy 4' - if 'kvint8' in model_case and 'w4' in model_case: + if 'kvint8' in model_case: cmd += ' --quant-policy 4' - if 'w4' in model_case: + if 'w4' in model_case or '4bits' in model_case: + cmd += ' --model-format awq' + else: + cmd += ' --model-format hf' + elif 'w4' in model_case or '4bits' in model_case: cmd += ' --model-format awq' + if 'chat' not in model_case.lower(): + cmd += ' --cap completion' return command_test(config, [cmd], model_case, case, case_info, type == 'turbomind') @@ -48,11 +52,13 @@ def hf_command_line_test(config, need_tp=True, cuda_prefix=cuda_prefix) - if 'kvint8' in model_case and 'w4' not in model_case: - cmd += ' --model-format hf --quant-policy 4' - if 'kvint8' in model_case and 'w4' in model_case: + if 'kvint8' in model_case: cmd += ' --quant-policy 4' - if 'w4' in model_case: + if 'w4' in model_case or '4bits' in model_case: + cmd += ' --model-format awq' + else: + cmd += ' --model-format hf' + elif 'w4' in model_case or '4bits' in model_case: cmd += ' --model-format awq' return command_test(config, [cmd], model_case, '_'.join(['hf', type, case]), case_info, True) @@ -66,8 +72,12 @@ def command_test(config, cmd, model, case, case_info, need_extract_output): log_path = config.get('log_path') model_name = get_model_name(model) - chat_log = os.path.join(log_path, - 'chat_' + model + '_' + case + '.log') + if '/' in model: + chat_log = os.path.join( + log_path, 'chat_' + model.split('/')[1] + '_' + case + '.log') + else: + chat_log = os.path.join(log_path, + 'chat_' + model + '_' + case + '.log') file = open(chat_log, 'w') @@ -78,7 +88,7 @@ def command_test(config, cmd, model, case, case_info, need_extract_output): file.writelines('reproduce command chat: ' + ' '.join(cmd) + '\n') spliter = '\n\n' - if model == 'CodeLlama-7b-Instruct-hf': + if 'CodeLlama-7b-Instruct-hf' in model: spliter = '\n!!\n' # join prompt together prompt = '' @@ -136,15 +146,13 @@ def command_test(config, cmd, model, case, case_info, need_extract_output): # 从输出中解析模型输出的对话内容 def parse_dialogue(inputs: str, model: str): dialogues = inputs.strip() - if model == 'CodeLlama-7b-Instruct-hf': + if 'CodeLlama-7b-Instruct-hf' in model: sep = 'enter !! to end the input >>>' else: sep = 'double enter to end input >>>' dialogues = dialogues.strip() dialogues = dialogues.split(sep) dialogues = [d.strip() for d in dialogues] - if 'Llama' in model: - return dialogues return dialogues[1:-1] # 去除首尾无用字符 diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 6236ebe50c..8d82a4c9c4 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -7,10 +7,11 @@ from lmdeploy.serve.openai.api_client import APIClient -def open_chat_test(config, case_info, model, url): +def open_chat_test(config, case_info, model, url, worker_id: str = 'default'): log_path = config.get('log_path') - restful_log = os.path.join(log_path, 'restful_' + model + '.log') + restful_log = os.path.join(log_path, + 'restful_' + model + '_' + worker_id + '.log') file = open(restful_log, 'w') @@ -49,10 +50,15 @@ def open_chat_test(config, case_info, model, url): return result, restful_log, msg -def interactive_test(config, case_info, model, url): +def interactive_test(config, + case_info, + model, + url, + worker_id: str = 'default'): log_path = config.get('log_path') - interactive_log = os.path.join(log_path, 'interactive_' + model + '.log') + interactive_log = os.path.join( + log_path, 'interactive_' + model + '_' + worker_id + '.log') file = open(interactive_log, 'w') diff --git a/benchmark/profile_generation.py b/benchmark/profile_generation.py index 31a9f0364d..788d9ba9fc 100644 --- a/benchmark/profile_generation.py +++ b/benchmark/profile_generation.py @@ -397,7 +397,8 @@ def main(): engine_config = PytorchEngineConfig( cache_max_entry_count=args.cache_max_entry_count, session_len=session_len, - tp=args.tp) + tp=args.tp, + thread_safe=True) gen_config = EngineGenerationConfig( top_k=args.top_k, top_p=args.top_p, diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index d96614929c..06c3706239 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -307,7 +307,8 @@ def main(): session_len=args.session_len, cache_max_entry_count=args.cache_max_entry_count, max_batch_size=args.concurrency, - tp=args.tp) + tp=args.tp, + thread_safe=True) engine = Engine(args.model_path, engine_config, csv=args.csv) diff --git a/.readthedocs.yaml b/docs/en/.readthedocs.yaml similarity index 81% rename from .readthedocs.yaml rename to docs/en/.readthedocs.yaml index 05ec15cca3..525ef5f7a3 100644 --- a/.readthedocs.yaml +++ b/docs/en/.readthedocs.yaml @@ -7,6 +7,11 @@ build: tools: python: "3.8" + +sphinx: + configuration: docs/en/conf.py + + python: install: - requirements: requirements/docs.txt diff --git a/docs/en/get_started.md b/docs/en/get_started.md index 2b09aa3b74..084947095c 100644 --- a/docs/en/get_started.md +++ b/docs/en/get_started.md @@ -31,19 +31,11 @@ For more information on inference pipeline parameters, please refer to [here](./ ## Serving -LMDeploy's `api_server` enables models to be easily packed into services with a single command. The provided RESTful APIs are compatible with OpenAI's interfaces. Below are an example of service startup: +LMDeploy offers various serving methods, choosing one that best meet your requirements. -```shell -lmdeploy serve api_server internlm/internlm-chat-7b -``` - -The default port of `api_server` is `23333`. After the server is launched, you can communicate with server on terminal through `api_client`: - -```shell -lmdeploy serve api_client http://0.0.0.0:23333 -``` - -You can overview and try out `api_server` APIs online by swagger UI at `http://0.0.0.0:23333`, or you can read the API specification from [here](serving/restful_api.md). +- [Serving with openai compatible server](https://lmdeploy.readthedocs.io/en/latest/serving/restful_api.html) +- [Serving with docker](https://lmdeploy.readthedocs.io/en/latest/serving/restful_api.html#option-2-deploying-with-docker) +- [Serving with gradio](https://lmdeploy.readthedocs.io/en/latest/serving/gradio.html) ## Quantization diff --git a/docs/en/index.rst b/docs/en/index.rst index 0ba7b86346..66e9c059b1 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -49,8 +49,8 @@ Welcome to LMDeploy's tutorials! :caption: serving serving/restful_api.md - serving/proxy_server.md serving/gradio.md + serving/proxy_server.md .. _quantization: .. toctree:: @@ -67,6 +67,7 @@ Welcome to LMDeploy's tutorials! advance/pytorch_new_model.md advance/long_context.md + advance/debug_turbomind.md serving/qos.md .. toctree:: diff --git a/docs/en/serving/gradio.md b/docs/en/serving/gradio.md index 803dff50f5..7b223565ff 100644 --- a/docs/en/serving/gradio.md +++ b/docs/en/serving/gradio.md @@ -1,10 +1,25 @@ -# Steps to create a huggingface online demo +# Serving with Gradio -## create space +Starting an LLM model's gradio service with LMDeploy and interacting with the model on the WebUI is incredibly simple. + +```shell +pip install lmdeploy[serve] +lmdeploy serve gradio {model_path} +``` + +All it takes is one-line command, with the `{model_path}` replaced by the model ID from huggingface hub, such as `internlm/internlm2-chat-7b`, or the local path to the model. + +For detailed parameters of the command, please turn to `lmdeploy serve gradio -h` for help. + +## Create a huggingface demo + +If you want to create an online demo project for your model on huggingface, please follow the steps below. + +### Step 1: Create space First, register for a Hugging Face account. After successful registration, click on your profile picture in the upper right corner and select “New Space” to create one. Follow the Hugging Face guide to choose the necessary configurations, and you will have a blank demo space ready. -## A demo for LMDeploy +### Step 2: Develop demo's entrypoint `app.py` Replace the content of `app.py` in your space with the following code: @@ -12,7 +27,7 @@ Replace the content of `app.py` in your space with the following code: from lmdeploy.serve.gradio.turbomind_coupled import run_local from lmdeploy.messages import TurbomindEngineConfig -backend_config = TurbomindEngineConfig(max_batch_size=1, cache_max_entry_count=0.05) +backend_config = TurbomindEngineConfig(max_batch_size=8) model_path = 'internlm/internlm2-chat-7b' run_local(model_path, backend_config=backend_config, server_name="huggingface-space") ``` @@ -25,7 +40,7 @@ lmdeploy ## FAQs -- ZeroGPU compatibility issue. ZeroGPU is more suitable for inference methods similar to PyTorch, rather than Turbomind. You can switch to the PyTorch backend or enable standard GPUs. +- ZeroGPU compatibility issue. ZeroGPU is not suitable for LMDeploy turbomind engine. Please use the standard GPUs. Or, you can change the backend config in the above code to `PyTorchEngineConfig` to use the ZeroGPU. - Gradio version issue, versions above 4.0.0 are currently not supported. You can modify this in `app.py`, for example: ```python import os diff --git a/docs/en/serving/restful_api.md b/docs/en/serving/restful_api.md index de1ea9fa44..d092d9b288 100644 --- a/docs/en/serving/restful_api.md +++ b/docs/en/serving/restful_api.md @@ -1,34 +1,29 @@ -# Restful API +# Serving with OpenAI Compatible Server -## Launch Service +This article primarily discusses the deployment of a single LLM model across multiple GPUs on a single node, providing a service that is compatible with the OpenAI interface, as well as the usage of the service API. +For the sake of convenience, we refer to this service as `api_server`. Regarding parallel services with multiple models, please refer to the guide about [Request Distribution Server](./proxy_server.md). -The user can open the http url print by the following command in a browser. +In the following sections, we will first introduce two methods for starting the service, choosing the appropriate one based on your application scenario. -- **Please check the http url for the detailed api usage!!!** -- **Please check the http url for the detailed api usage!!!** -- **Please check the http url for the detailed api usage!!!** +Next, we focus on the definition of the service's RESTful API, explore the various ways to interact with the interface, and demonstrate how to try the service through the Swagger UI or LMDeploy CLI tools. -```shell -lmdeploy serve api_server ./workspace --server-name 0.0.0.0 --server-port ${server_port} --tp 1 -``` +Finally, we showcase how to integrate the service into a WebUI, providing you with a reference to easily set up a demonstration demo. -The parameters supported by api_server can be viewed through the command line `lmdeploy serve api_server -h`. +## Launch Service -We provide some RESTful APIs. Three of them are in OpenAI format. +Take the [internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) model hosted on huggingface hub as an example, you can choose one the following methods to start the service. -- /v1/chat/completions -- /v1/models -- /v1/completions +### Option 1: Launching with lmdeploy CLI -However, we recommend users try -our own api `/v1/chat/interactive` which provides more arguments for users to modify. The performance is comparatively better. +```shell +lmdeploy serve api_server internlm/internlm2-chat-7b --server-port 23333 +``` -**Note** please, if you want to launch multiple requests, you'd better set different `session_id` for both -`/v1/chat/completions` and `/v1/chat/interactive` apis. Or, we will set them random values. +The arguments of `api_server` can be viewed through the command `lmdeploy serve api_server -h`, for instance, `--tp` to set tensor parallelism, `--session-len` to specify the max length of the context window, `--cache-max-entry-count` to adjust the GPU mem ratio for k/v cache etc. -## Deploy http service with docker +### Option 2: Deploying with docker -LMDeploy offers [official docker image](https://hub.docker.com/r/openmmlab/lmdeploy/tags) for deployment. The image can be used to run OpenAI compatible server. +With LMDeploy [official docker image](https://hub.docker.com/r/openmmlab/lmdeploy/tags), you can run OpenAI compatible server as follows: ```shell docker run --runtime nvidia --gpus all \ @@ -40,11 +35,60 @@ docker run --runtime nvidia --gpus all \ lmdeploy serve api_server internlm/internlm2-chat-7b ``` -Just like the previous section, user can try the Swagger UI with a web browser. +The parameters of `api_server` are the same with that mentioned in "[option 1](#option-1-launching-with-lmdeploy-cli)" section + +## RESTful API + +LMDeploy's RESTful API is compatible with the following three OpenAI interfaces: + +- /v1/chat/completions +- /v1/models +- /v1/completions + +Additionally, LMDeploy also defines `/v1/chat/interactive` to support interactive inference. The feature of interactive inference is that there's no need to pass the user conversation history as required by `v1/chat/completions`, since the conversation history will be cached on the server side. This method boasts excellent performance during multi-turn long context inference. + +You can overview and try out the offered RESTful APIs by the website `http://0.0.0.0:23333` as shown in the below image after launching the service successfully. + +![swagger_ui](https://github.com/InternLM/lmdeploy/assets/4560679/b891dd90-3ffa-4333-92b2-fb29dffa1459) + +Or, you can use the LMDeploy's built-in CLI tool to verify the service correctness right from the console. + +```shell +# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 +lmdeploy serve api_client ${api_server_url} +``` + +If you need to integrate the service into your own projects or products, we recommend the following approach: + +### Integrate with `OpenAI` + +Here is an example of interaction with the endpoint `v1/chat/completions` service via the openai package. +Before running it, please install the openai package by `pip install openai` + +```python +from openai import OpenAI +client = OpenAI( + api_key='YOUR_API_KEY', + base_url="http://0.0.0.0:23333/v1" +) + +response = client.chat.completions.create( + model="internlm2-chat-7b", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": " provide three suggestions about time management"}, + ], + temperature=0.8, + top_p=0.8 +) +print(response) +``` + +You can invoke other OpenAI interfaces using similar methods. For more detailed information, please refer to the [OpenAI API guide](https://platform.openai.com/docs/guides/text-generation) -## python +### Integrate with lmdeploy `APIClient` -We have integrated the client-side functionalities of these services into the `APIClient` class. Below are some examples demonstrating how to invoke the `api_server` service on the client side. +Below are some examples demonstrating how to visit the service through `APIClient` If you want to use the `/v1/chat/completions` endpoint, you can try the following code: @@ -57,7 +101,7 @@ for item in api_client.chat_completions_v1(model=model_name, messages=messages): print(item) ``` -For the `/v1/completions` endpoint. If you want to use the `/v1/completions` endpoint, you can try: +For the `/v1/completions` endpoint, you can try: ```python from lmdeploy.serve.openai.api_client import APIClient @@ -67,23 +111,29 @@ for item in api_client.completions_v1(model=model_name, prompt='hi'): print(item) ``` -Lmdeploy supports maintaining session histories on the server for `/v1/chat/interactive` api. We disable the -feature by default. +As for `/v1/chat/interactive`,we disable the feature by default. Please open it by setting `interactive_mode = True`. If you don't, it falls back to openai compatible interfaces. -- On interactive mode, the chat history is kept on the server. In a multiple rounds of conversation, you should set - `interactive_mode = True` and the same `session_id` (can't be -1, it's the default number) to `/v1/chat/interactive` for requests. -- On normal mode, no chat history is kept on the server. - -The interactive mode can be controlled by the `interactive_mode` boolean parameter. The following is an example of normal mode. If you want to experience the interactive mode, simply pass in `interactive_mode=True`. +Keep in mind that `session_id` indicates an identical sequence and all requests belonging to the same sequence must share the same `session_id`. +For instance, in a sequence with 10 rounds of chatting requests, the `session_id` in each request should be the same. ```python from lmdeploy.serve.openai.api_client import APIClient -api_client = APIClient('http://{server_ip}:{server_port}') -for item in api_client.chat_interactive_v1(prompt='hi'): - print(item) +api_client = APIClient(f'http://{server_ip}:{server_port}') +messages = [ + "hi, what's your name?", + "who developed you?", + "Tell me more about your developers", + "Summarize the information we've talked so far" +] +for message in messages: + for item in api_client.chat_interactive_v1(prompt=message, + session_id=1, + interactive_mode=True, + stream=False): + print(item) ``` -## Java/Golang/Rust +### Integrate with Java/Golang/Rust May use [openapi-generator-cli](https://github.com/OpenAPITools/openapi-generator-cli) to convert `http://{server_ip}:{server_port}/openapi.json` to java/rust/golang client. Here is an example: @@ -102,29 +152,17 @@ rust/src: apis lib.rs models ``` -## cURL +### Integrate with cURL -cURL is a tool for observing the output of the api. +cURL is a tool for observing the output of the RESTful APIs. -List Models: +- list served models `v1/models` ```bash curl http://{server_ip}:{server_port}/v1/models ``` -Interactive Chat: - -```bash -curl http://{server_ip}:{server_port}/v1/chat/interactive \ - -H "Content-Type: application/json" \ - -d '{ - "prompt": "Hello! How are you?", - "session_id": 1, - "interactive_mode": true - }' -``` - -Chat Completions: +- chat `v1/chat/completions` ```bash curl http://{server_ip}:{server_port}/v1/chat/completions \ @@ -135,7 +173,7 @@ curl http://{server_ip}:{server_port}/v1/chat/completions \ }' ``` -Text Completions: +- text completions `v1/completions` ```shell curl http://{server_ip}:{server_port}/v1/completions \ @@ -146,18 +184,23 @@ curl http://{server_ip}:{server_port}/v1/completions \ }' ``` -## CLI client - -There is a client script for restful api server. +- interactive chat `v1/chat/interactive` -```shell -# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 -lmdeploy serve api_client api_server_url +```bash +curl http://{server_ip}:{server_port}/v1/chat/interactive \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "Hello! How are you?", + "session_id": 1, + "interactive_mode": true + }' ``` -## webui through gradio +## Integrate with WebUI + +LMDeploy utilizes `gradio` or [OpenAOE](https://github.com/InternLM/OpenAOE) to integrate a web ui for `api_server` -You can also test restful-api through webui. +### Option 1: gradio ```shell # api_server_url is what printed in api_server.py, e.g. http://localhost:23333 @@ -166,9 +209,7 @@ You can also test restful-api through webui. lmdeploy serve gradio api_server_url --server-name ${gradio_ui_ip} --server-port ${gradio_ui_port} ``` -## webui through OpenAOE - -You can use [OpenAOE](https://github.com/InternLM/OpenAOE) for seamless integration with LMDeploy. +### Option 2: OpenAOE ```shell pip install -U openaoe @@ -191,7 +232,3 @@ Please refer to the [guidance](https://github.com/InternLM/OpenAOE/blob/main/doc 5. If you need to adjust other default parameters of the session, such as the content of fields like system. You can directly pass in the initialization parameters of the [dialogue template](https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/model.py). For example, for the internlm-chat-7b model, you can set the `--meta-instruction` parameter when starting the `api_server`. 6. Regarding the stop words, we only support characters that encode into a single index. Furthermore, there may be multiple indexes that decode into results containing the stop word. In such cases, if the number of these indexes is too large, we will only use the index encoded by the tokenizer. If you want use a stop symbol that encodes into multiple indexes, you may consider performing string matching on the streaming client side. Once a successful match is found, you can then break out of the streaming loop. - -## request distribution service - -Please refer to our [request distributor server](./proxy_server.md) diff --git a/docs/zh_cn/.readthedocs.yaml b/docs/zh_cn/.readthedocs.yaml new file mode 100644 index 0000000000..9f94662947 --- /dev/null +++ b/docs/zh_cn/.readthedocs.yaml @@ -0,0 +1,18 @@ +version: 2 + +formats: all + +build: + os: "ubuntu-22.04" + tools: + python: "3.8" + + +sphinx: + configuration: docs/zh_cn/conf.py + + +python: + install: + - requirements: requirements/docs.txt + - requirements: requirements/readthedocs.txt diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started.md index f5d6b1c13f..26fd61ab71 100644 --- a/docs/zh_cn/get_started.md +++ b/docs/zh_cn/get_started.md @@ -31,19 +31,11 @@ print(response) ## 推理服务 -LMDeploy `api_server` 支持把模型一键封装为服务,对外提供的 RESTful API 兼容 openai 的接口。以下为服务启动的示例: +LMDeploy 提供了多种部署模型推理服务的方式,总有一款适合你。 -```shell -lmdeploy serve api_server internlm/internlm-chat-7b -``` - -服务默认端口是23333。在 server 启动后,你可以在终端通过`api_client`与server进行对话: - -```shell -lmdeploy serve api_client http://0.0.0.0:23333 -``` - -除了`api_client`,你还可以通过 Swagger UI `http://0.0.0.0:23333` 在线阅读和试用 `api_server` 的各接口,也可直接查阅[文档](serving/restful_api.md),了解各接口的定义和使用方法。 +- [部署类 openai 的服务](https://lmdeploy.readthedocs.io/zh-cn/latest//serving/restful_api.html) +- [通过 docker 部署服务](https://lmdeploy.readthedocs.io/zh-cn/latest/serving/restful_api.html#docker) +- [部署 gradio 服务](https://lmdeploy.readthedocs.io/zh-cn/latest/serving/gradio.html) ## 模型量化 diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index 3f8272f01b..265fb716d2 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -50,8 +50,8 @@ :caption: 服务 serving/restful_api.md - serving/proxy_server.md serving/gradio.md + serving/proxy_server.md .. _量化: @@ -69,6 +69,7 @@ advance/pytorch_new_model.md advance/long_context.md + advance/debug_turbomind.md serving/qos.md .. toctree:: diff --git a/docs/zh_cn/serving/gradio.md b/docs/zh_cn/serving/gradio.md index fe1e01af3f..3e70f68856 100644 --- a/docs/zh_cn/serving/gradio.md +++ b/docs/zh_cn/serving/gradio.md @@ -1,11 +1,26 @@ -# 从 LMDeploy 创建一个 huggingface 的在线 demo +# 部署 gradio 服务 -## 创建 space +通过 LMDeploy 启动 LLM 模型的 gradio 服务,并在 WebUI 上和模型对话特别简单,一条命令即可。 + +```shell +pip install lmdeploy[serve] +lmdeploy serve gradio {model_path} +``` + +把上面命令中的 `{model_path}` 换成 huggingface hub 上的模型 id,比如 internlm/internlm2-chat-7b,或者换成模型的本地路径就可以了。 + +关于命令的详细参数,请使用 `lmdeploy serve gradio --help` 查阅。 + +## 创建 huggingface demo + +如果想要在 huggingface 上创建模型的在线演示项目,请按以下步骤进行。 + +### 第一步:创建 space 首先,注册一个 huggingface 的账号,注册成功后,可以点击右上角头像,选择 New Space 创建。 根据 huggingface 的引导选择需要的配置,完成后即可得到一个空白的 demo。 -## 使用 LMDeploy 的 demo +### 第二步:编写 demo 入口代码 app.py 以 `internlm/internlm2-chat-7b` 模型为例,将 space 空间中的`app.py`内容填写为: @@ -13,7 +28,7 @@ from lmdeploy.serve.gradio.turbomind_coupled import run_local from lmdeploy.messages import TurbomindEngineConfig -backend_config = TurbomindEngineConfig(max_batch_size=1, cache_max_entry_count=0.05) +backend_config = TurbomindEngineConfig(max_batch_size=8) model_path = 'internlm/internlm2-chat-7b' run_local(model_path, backend_config=backend_config, server_name="huggingface-space") ``` @@ -26,7 +41,7 @@ lmdeploy ## FAQs -- ZeroGPU 适配问题。ZeroGPU 更适合类似 PyTorch 这样的推理方式,而非 Turbomind。可以改用 pytorch 后端,或者启用普通 GPU。 +- ZeroGPU 适配问题。ZeroGPU不适用 LMDeploy Turbomind 引擎,请选择普通 GPU,或者把上述代码中的 backend_config 改成 PyTorchEngineConfig,就可以用 ZeroGPU 了。 - gradio 版本问题,目前不支持 4.0.0 以上版本,可以在 `app.py` 中修改,类似: ```python import os diff --git a/docs/zh_cn/serving/restful_api.md b/docs/zh_cn/serving/restful_api.md index bb0e4da12d..76b76a0a63 100644 --- a/docs/zh_cn/serving/restful_api.md +++ b/docs/zh_cn/serving/restful_api.md @@ -1,31 +1,29 @@ -# Restful API +# 部署类 openai 服务 -## 启动服务 +本文主要介绍单个模型在单机多卡环境下,部署兼容 openai 接口服务的方式,以及服务接口的用法。为行文方便,我们把该服务名称为 `api_server`。对于多模型的并行服务,请阅读[请求分发服务器](./proxy_server.md)一文。 -用户将下面命令输出的 http url 复制到浏览器打开,详细查看所有的 API 及其使用方法。 -请一定查看`http://{server_ip}:{server_port}`!!! -请一定查看`http://{server_ip}:{server_port}`!!! -请一定查看`http://{server_ip}:{server_port}`!!! -重要的事情说三遍。 +在这篇文章中, 我们首先介绍服务启动的两种方法,你可以根据应用场景,选择合适的。 -```shell -lmdeploy serve api_server ./workspace --server-name 0.0.0.0 --server-port ${server_port} --tp 1 -``` +其次,我们重点介绍服务的 RESTful API 定义,以及接口使用的方式,并展示如何通过 Swagger UI、LMDeploy CLI 工具体验服务功能 -api_server 启动时支持的参数可以通过命令行`lmdeploy serve api_server -h`查看。 +最后,向大家演示把服务接入到 WebUI 的方式,你可以参考它简单搭建一个演示 demo。 -我们提供的 restful api,其中三个仿照 OpenAI 的形式。 +## 启动服务 -- /v1/chat/completions -- /v1/models -- /v1/completions +以 huggingface hub 上的 [internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) 模型为例,你可以任选以下方式之一,启动推理服务。 -不过,我们建议用户用我们提供的另一个 API: `/v1/chat/interactive`。 -它有更好的性能,提供更多的参数让用户自定义修改。 +### 方式一:使用 lmdeploy cli 工具 -## 用 docker 部署 http 服务 +```shell +lmdeploy serve api_server internlm/internlm2-chat-7b --server-port 23333 +``` + +api_server 启动时的参数可以通过命令行`lmdeploy serve api_server -h`查看。 +比如,`--tp` 设置张量并行,`--session-len` 设置推理的最大上下文窗口长度,`--cache-max-entry-count` 调整 k/v cache 的内存使用比例等等。 -LMDeploy 提供了官方[镜像](https://hub.docker.com/r/openmmlab/lmdeploy/tags)。使用这个镜像,可以运行兼容 OpenAI 的服务。下面是使用示例: +### 方式二:使用 docker + +使用 LMDeploy 官方[镜像](https://hub.docker.com/r/openmmlab/lmdeploy/tags),可以运行兼容 OpenAI 的服务。下面是使用示例: ```shell docker run --runtime nvidia --gpus all \ @@ -37,16 +35,64 @@ docker run --runtime nvidia --gpus all \ lmdeploy serve api_server internlm/internlm2-chat-7b ``` -然后像上面一样使用浏览器试用 Swagger UI 即可。 +在这个例子中,`lmdeploy server api_server` 的命令参数与方式一一致。 + +## RESTful API + +LMDeploy 的 RESTful API 兼容了 OpenAI 以下 3 个接口: + +- /v1/chat/completions +- /v1/models +- /v1/completions + +此外,LMDeploy 还定义了 `/v1/chat/interactive`,用来支持交互式推理。交互式推理的特点是不用像`v1/chat/completions`传入用户对话历史,因为对话历史会被缓存在服务端。 +这种方式在多轮次的长序列推理时,拥有很好的性能。 + +服务启动后,你可以在浏览器中打开网页 http://0.0.0.0:23333,通过 Swagger UI 查看接口的详细说明,并且也可以直接在网页上操作,体验每个接口的用法,如下图所示。 + +![swagger_ui](https://github.com/InternLM/lmdeploy/assets/4560679/b891dd90-3ffa-4333-92b2-fb29dffa1459) + +也可以使用 LMDeploy 自带的 CLI 工具,在控制台验证服务的正确性。 + +```shell +# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 +lmdeploy serve api_client ${api_server_url} +``` + +若需要把服务集成到自己的项目或者产品中,我们推荐以下用法: + +### 使用 openai 接口 + +以下代码是通过 openai 包使用 `v1/chat/completions` 服务的例子。运行之前,请先安装 openai 包: `pip install openai`。 + +```python +from openai import OpenAI +client = OpenAI( + api_key='YOUR_API_KEY', + base_url="http://0.0.0.0:23333/v1" +) + +response = client.chat.completions.create( + model="internlm2-chat-7b", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": " provide three suggestions about time management"}, + ], + temperature=0.8, + top_p=0.8 +) +print(response) +``` + +关于其他 openai 接口的调用,也可以如法炮制。详情请参考 openai 官方[文档](https://platform.openai.com/docs/guides/text-generation) -## python +### 使用 lmdeploy `APIClient` 接口 -我们将这些服务的客户端功能集成在 `APIClient` 类中。下面是一些例子,展示如何在客户端调用 `api_server` 服务。 如果你想用 `/v1/chat/completions` 接口,你可以尝试下面代码: ```python from lmdeploy.serve.openai.api_client import APIClient -api_client = APIClient('http://{server_ip}:{server_port}') +api_client = APIClient(f'http://{server_ip}:{server_port}') model_name = api_client.available_models[0] messages = [{"role": "user", "content": "Say this is a test!"}] for item in api_client.chat_completions_v1(model=model_name, messages=messages): @@ -57,28 +103,35 @@ for item in api_client.chat_completions_v1(model=model_name, messages=messages): ```python from lmdeploy.serve.openai.api_client import APIClient -api_client = APIClient('http://{server_ip}:{server_port}') +api_client = APIClient(f'http://{server_ip}:{server_port}') model_name = api_client.available_models[0] for item in api_client.completions_v1(model=model_name, prompt='hi'): print(item) ``` -LMDeploy 的 `/v1/chat/interactive` api 支持将对话内容管理在服务端,但是我们默认关闭。如果想尝试,请阅读以下介绍: +关于 `/v1/chat/interactive` 接口,我们默认是关闭的。在使用时,请设置`interactive_mode = True`打开它。否则,它会退化为 openai 接口。 -- 交互模式下,对话历史保存在 server。在一次完整的多轮对话中,所有请求设置`interactive_mode = True`, `session_id`保持相同 (不为 -1,这是缺省值)。 -- 非交互模式下,server 不保存历史记录。 - -交互模式可以通过 `interactive_mode` 布尔量参数控制。下面是一个普通模式的例子, -如果要体验交互模式,将 `interactive_mode=True` 传入即可。 +在交互式推理中,每个对话序列的 id 必须唯一,所有属于该独立的对话请求,必须使用相同的 id。这里的 id 对应与接口中的 `session_id`。 +比如,一个对话序列中,有 10 轮对话请求,那么每轮对话请求中的 `session_id` 都要相同。 ```python from lmdeploy.serve.openai.api_client import APIClient -api_client = APIClient('http://{server_ip}:{server_port}') -for item in api_client.chat_interactive_v1(prompt='hi'): - print(item) +api_client = APIClient(f'http://{server_ip}:{server_port}') +messages = [ + "hi, what's your name?", + "who developed you?", + "Tell me more about your developers", + "Summarize the information we've talked so far" +] +for message in messages: + for item in api_client.chat_interactive_v1(prompt=message, + session_id=1, + interactive_mode=True, + stream=False): + print(item) ``` -## Java/Golang/Rust +### 使用 Java/Golang/Rust 可以使用代码生成工具 [openapi-generator-cli](https://github.com/OpenAPITools/openapi-generator-cli) 将 `http://{server_ip}:{server_port}/openapi.json` 转成 java/rust/golang 客户端。 下面是一个使用示例: @@ -97,29 +150,17 @@ rust/src: apis lib.rs models ``` -## cURL +### 使用 cURL cURL 也可以用于查看 API 的输出结果 -查看模型列表: +- 查看模型列表 `v1/models` ```bash curl http://{server_ip}:{server_port}/v1/models ``` -Interactive Chat: - -```bash -curl http://{server_ip}:{server_port}/v1/chat/interactive \ - -H "Content-Type: application/json" \ - -d '{ - "prompt": "Hello! How are you?", - "session_id": 1, - "interactive_mode": true - }' -``` - -Chat Completions: +- 对话 `v1/chat/completions` ```bash curl http://{server_ip}:{server_port}/v1/chat/completions \ @@ -130,7 +171,7 @@ curl http://{server_ip}:{server_port}/v1/chat/completions \ }' ``` -Text Completions: +- 文本补全 `v1/completions` ```shell curl http://{server_ip}:{server_port}/v1/completions \ @@ -141,18 +182,23 @@ curl http://{server_ip}:{server_port}/v1/completions \ }' ``` -## CLI client +- 交互式对话 `v1/chat/interactive` -restful api 服务可以通过客户端测试,例如 - -```shell -# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 -lmdeploy serve api_client api_server_url +```bash +curl http://{server_ip}:{server_port}/v1/chat/interactive \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "Hello! How are you?", + "session_id": 1, + "interactive_mode": true + }' ``` -## webui through gradio +## 接入 WebUI + +LMDeploy 提供 gradio 和 [OpenAOE](https://github.com/InternLM/OpenAOE) 两种方式,为 api_server 接入 WebUI。 -也可以直接用 webui 测试使用 restful-api。 +### 方式一:通过 gradio 接入 ```shell # api_server_url 就是 api_server 产生的,比如 http://localhost:23333 @@ -161,9 +207,7 @@ lmdeploy serve api_client api_server_url lmdeploy serve gradio api_server_url --server-name ${gradio_ui_ip} --server-port ${gradio_ui_port} ``` -## webui through OpenAOE - -可以使用 [OpenAOE](https://github.com/InternLM/OpenAOE) 无缝接入restful api服务. +### 方式二:通过 OpenAOE 接入 ```shell pip install -U openaoe @@ -185,7 +229,3 @@ openaoe -f /path/to/your/config-template.yaml 5. 如需调整会话默认的其他参数,比如 system 等字段的内容,可以直接将[对话模板](https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/model.py)初始化参数传入。比如 internlm-chat-7b 模型,可以通过启动`api_server`时,设置`--meta-instruction`参数。 6. 关于停止符,我们只支持编码后为单个 index 的字符。此外,可能存在多种 index 都会解码出带有停止符的结果。对于这种情况,如果这些 index 数量太多,我们只会采用 tokenizer 编码出的 index。而如果你想要编码后为多个 index 的停止符,可以考虑在流式客户端做字符串匹配,匹配成功后跳出流式循环即可。 - -## 多机并行服务 - -请参考我们的 [请求分发服务器](./proxy_server.md) diff --git a/lmdeploy/api.py b/lmdeploy/api.py index 34f2be9b72..78607bfd73 100644 --- a/lmdeploy/api.py +++ b/lmdeploy/api.py @@ -2,6 +2,7 @@ import os from typing import List, Literal, Optional, Union +from .archs import autoget_backend_config from .messages import PytorchEngineConfig, TurbomindEngineConfig from .model import ChatTemplateConfig @@ -31,7 +32,7 @@ def pipeline(model_path: str, model_name (str): needed when model_path is a pytorch model on huggingface.co, such as "internlm/internlm-chat-7b", "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on. - backend_config (TurbomindEngineConfig | PytorchEngineConfig): beckend + backend_config (TurbomindEngineConfig | PytorchEngineConfig): backend config instance. Default to None. chat_template_config (ChatTemplateConfig): chat template configuration. Default to None. @@ -49,8 +50,13 @@ def pipeline(model_path: str, from lmdeploy.utils import get_logger logger = get_logger('lmdeploy') logger.setLevel(log_level) + + if type(backend_config) is not PytorchEngineConfig: + # set auto backend mode + backend_config = autoget_backend_config(model_path, backend_config) backend = 'pytorch' if type( backend_config) is PytorchEngineConfig else 'turbomind' + logger.info(f'Using {backend} engine') if 'tp' in kwargs: logger.warning( 'The argument "tp" is deprecated and will be removed soon. ' @@ -101,7 +107,7 @@ def serve(model_path: str, "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on. backend (str): either `turbomind` or `pytorch` backend. Default to `turbomind` backend. - backend_config (TurbomindEngineConfig | PytorchEngineConfig): beckend + backend_config (TurbomindEngineConfig | PytorchEngineConfig): backend config instance. Default to none. chat_template_config (ChatTemplateConfig): chat template configuration. Default to None. @@ -126,6 +132,12 @@ def serve(model_path: str, from lmdeploy.serve.openai.api_client import APIClient from lmdeploy.serve.openai.api_server import serve + + if type(backend_config) is not PytorchEngineConfig: + # set auto backend mode + backend_config = autoget_backend_config(model_path, backend_config) + backend = 'pytorch' if type( + backend_config) is PytorchEngineConfig else 'turbomind' if 'tp' in kwargs: tp = kwargs['tp'] kwargs.pop('tp') diff --git a/lmdeploy/archs.py b/lmdeploy/archs.py new file mode 100644 index 0000000000..7a8b7f817e --- /dev/null +++ b/lmdeploy/archs.py @@ -0,0 +1,82 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Literal, Optional, Union + +from .messages import PytorchEngineConfig, TurbomindEngineConfig +from .utils import get_logger + +logger = get_logger('lmdeploy') + + +def autoget_backend(model_path: str) -> Union[Literal['turbomind', 'pytorch']]: + """Get backend type in auto backend mode. + + Args: + model_path (str): the path of a model. + It could be one of the following options: + - i) A local directory path of a turbomind model which is + converted by `lmdeploy convert` command or download from + ii) and iii). + - ii) The model_id of a lmdeploy-quantized model hosted + inside a model repo on huggingface.co, such as + "InternLM/internlm-chat-20b-4bit", + "lmdeploy/llama2-chat-70b-4bit", etc. + - iii) The model_id of a model hosted inside a model repo + on huggingface.co, such as "internlm/internlm-chat-7b", + "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" + and so on. + + Returns: + str: the backend type. + """ + from lmdeploy.pytorch.supported_models import \ + is_supported as is_supported_pytorch + + pytorch_has, turbomind_has = False, False + try: + from lmdeploy.turbomind.supported_models import \ + is_supported as is_supported_turbomind + turbomind_has = is_supported_turbomind(model_path) + except ImportError: + logger.warning( + 'Lmdeploy with turbomind engine is not installed correctly. ' + 'You may need to install lmdeploy from pypi or build from source ' + 'for turbomind engine.') + + pytorch_has = is_supported_pytorch(model_path) + + if not (pytorch_has or turbomind_has): + logger.warning(f'{model_path} is not explicitly supported by lmdeploy.' + f' Try to run with lmdeploy pytorch engine.') + backend = 'turbomind' if turbomind_has else 'pytorch' + return backend + + +def autoget_backend_config( + model_path: str, + backend_config: Optional[Union[PytorchEngineConfig, + TurbomindEngineConfig]] = None +) -> Union[PytorchEngineConfig, TurbomindEngineConfig]: + """Get backend config automatically. + + Args: + model_path (str): The input model path. + backend_config (TurbomindEngineConfig | PytorchEngineConfig): The + input backend config. Default to None. + + Returns: + (PytorchEngineConfig | TurbomindEngineConfig): The auto-determined + backend engine config. + """ + from dataclasses import asdict + + backend = autoget_backend(model_path) + if backend == 'pytorch': + config = PytorchEngineConfig() + else: + config = TurbomindEngineConfig() + if backend_config is not None: + data = asdict(backend_config) + for k, v in data.items(): + if v and hasattr(config, k): + setattr(config, k, v) + return config diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py index 4a4a73046c..e4aa405653 100644 --- a/lmdeploy/cli/cli.py +++ b/lmdeploy/cli/cli.py @@ -107,7 +107,8 @@ def list(args): if engine == 'pytorch': model_names = [ 'llama', 'llama2', 'internlm', 'internlm2', 'baichuan2', - 'chatglm2', 'falcon', 'yi', 'mistral', 'qwen1.5', 'gemma' + 'chatglm2', 'falcon', 'yi', 'mistral', 'mixtral', 'qwen1.5', + 'gemma', 'deepseek' ] elif engine == 'turbomind': from lmdeploy.model import MODELS diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py index 858584ae7d..7b01afcb3e 100644 --- a/lmdeploy/cli/serve.py +++ b/lmdeploy/cli/serve.py @@ -189,10 +189,17 @@ def add_parser_triton_client(): @staticmethod def gradio(args): """Serve LLMs with web UI using gradio.""" + from lmdeploy.archs import autoget_backend + from lmdeploy.messages import (PytorchEngineConfig, + TurbomindEngineConfig) from lmdeploy.model import ChatTemplateConfig from lmdeploy.serve.gradio.app import run - if args.backend == 'pytorch': - from lmdeploy.messages import PytorchEngineConfig + backend = args.backend + + if backend != 'pytorch' and ':' not in args.model_path_or_server: + # set auto backend mode + backend = autoget_backend(args.model_path_or_server) + if backend == 'pytorch': backend_config = PytorchEngineConfig( tp=args.tp, model_name=args.model_name, @@ -200,7 +207,6 @@ def gradio(args): cache_max_entry_count=args.cache_max_entry_count, session_len=args.session_len) else: - from lmdeploy.messages import TurbomindEngineConfig backend_config = TurbomindEngineConfig( model_name=args.model_name, tp=args.tp, @@ -217,16 +223,22 @@ def gradio(args): run(args.model_path_or_server, server_name=args.server_name, server_port=args.server_port, - backend=args.backend, + backend=backend, backend_config=backend_config, chat_template_config=chat_template_config) @staticmethod def api_server(args): """Serve LLMs with restful api using fastapi.""" + from lmdeploy.archs import autoget_backend from lmdeploy.model import ChatTemplateConfig from lmdeploy.serve.openai.api_server import serve as run_api_server - if args.backend == 'pytorch': + backend = args.backend + if backend != 'pytorch': + # set auto backend mode + backend = autoget_backend(args.model_path) + + if backend == 'pytorch': from lmdeploy.messages import PytorchEngineConfig backend_config = PytorchEngineConfig( tp=args.tp, @@ -250,7 +262,7 @@ def api_server(args): meta_instruction=args.meta_instruction, capability=args.cap) run_api_server(args.model_path, - backend=args.backend, + backend=backend, backend_config=backend_config, chat_template_config=chat_template_config, server_name=args.server_name, diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index 7792e37aa3..bbf1a9c395 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -158,6 +158,7 @@ class PytorchEngineConfig: would be allocate according to current environment. adapters (dict): The path configs to lora adapters. max_prefill_token_num (int): tokens per iteration. + thread_safe (bool): thread safe engine instance. download_dir (str): Directory to download and load the weights, default to the default cache directory of huggingface. revision (str): The specific model version to use. @@ -176,6 +177,7 @@ class PytorchEngineConfig: num_gpu_blocks: int = 0 adapters: Dict[str, str] = None max_prefill_token_num: int = 8192 + thread_safe: bool = False download_dir: str = None revision: str = None diff --git a/lmdeploy/pytorch/check_env/__init__.py b/lmdeploy/pytorch/check_env/__init__.py index 5d12bacb0a..62ba7f11f7 100644 --- a/lmdeploy/pytorch/check_env/__init__.py +++ b/lmdeploy/pytorch/check_env/__init__.py @@ -61,7 +61,8 @@ def check_env(): check_env_triton() -def check_transformers_version(model_path: str): +def check_transformers_version(model_path: str, + trust_remote_code: bool = True): """check transformers version.""" from packaging import version logger = get_logger('lmdeploy') @@ -77,7 +78,8 @@ def check_transformers_version(model_path: str): model_trans_version = None try: from transformers import AutoConfig - config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + config = AutoConfig.from_pretrained( + model_path, trust_remote_code=trust_remote_code) model_trans_version = getattr(config, 'transformers_version') except Exception as e: message = ( @@ -97,7 +99,7 @@ def check_transformers_version(model_path: str): _handle_exception(e, 'transformers', logger, message=message) -def check_model(model_path: str): +def check_model(model_path: str, trust_remote_code: bool = True): """check model requirements.""" logger = get_logger('lmdeploy') logger.info('Checking model.') diff --git a/lmdeploy/pytorch/engine/cache_engine.py b/lmdeploy/pytorch/engine/cache_engine.py index 8df2cbdc27..20f516b2e0 100644 --- a/lmdeploy/pytorch/engine/cache_engine.py +++ b/lmdeploy/pytorch/engine/cache_engine.py @@ -201,7 +201,8 @@ def swap_out(self, src_to_dst: Dict[int, int]) -> None: @staticmethod def get_cache_block_size(block_size: int, - model_config: ModelConfig) -> int: + model_config: ModelConfig, + world_size: int = 1) -> int: """Get the required cache size of the model. Args: @@ -214,6 +215,8 @@ def get_cache_block_size(block_size: int, head_size = model_config.get_head_size() num_layers = model_config.num_layers num_heads = model_config.num_key_value_heads + if not model_config.multi_query_attention: + num_heads = num_heads // world_size key_cache_block = block_size * num_heads * head_size value_cache_block = key_cache_block diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index c2956b5804..e4b242fc7f 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -1,9 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. +import asyncio import os -import time from dataclasses import dataclass -from queue import Queue -from threading import Thread from typing import Any, Dict, List import torch @@ -16,12 +14,12 @@ from ..adapter.adapter import ADAPTER_MANAGER, SchedulerAdapter from ..check_env import check_env, check_model from ..config import CacheConfig, SchedulerConfig -from ..messages import (MessageStatus, SamplingParam, SchedulerSequence, - SchedulerSession) +from ..messages import MessageStatus, SamplingParam, SchedulerSequence from ..paging import Scheduler from .logits_process import FusedLogitsProcessor, SamplingInputs from .model_agent import AutoModelAgent, ModelInputs -from .request import Request, RequestManager, RequestType, Response +from .request import (Request, RequestManager, RequestSender, RequestType, + Response) logger = get_logger('lmdeploy') @@ -47,21 +45,6 @@ class InferOutput: logits: torch.Tensor = None -def _check_resp(resp: Response, state: ResponseType, warning_msg: str = None): - """check if response has state.""" - if isinstance(state, ResponseType): - state = [state] - ret = resp.type in state - if not ret and warning_msg is not None: - logger.warning(warning_msg) - return ret - - -def _check_resp_success(resp: Response, warning_msg: str = None): - """check if response success.""" - return _check_resp(resp, ResponseType.SUCCESS, warning_msg) - - def _paging_adapters(adapters: dict, model_agent: AutoModelAgent, scheduler: Scheduler): adapters = adapters or dict() @@ -89,6 +72,79 @@ def _get_adapter_ids(seqs: SeqList, adapters: AdapterList): return adapter_ids +def _check_resp(resp: Response, state: ResponseType, warning_msg: str = None): + """check if response has state.""" + if isinstance(state, ResponseType): + state = [state] + ret = resp.type in state + if not ret and warning_msg is not None: + logger.warning(warning_msg) + return ret + + +def _check_resp_success(resp: Response, warning_msg: str = None): + """check if response success.""" + return _check_resp(resp, ResponseType.SUCCESS, warning_msg) + + +async def async_try_add_session(req_sender: RequestSender, session_id: int): + """Add new session. + + Args: + session_id (int): The session id to add. + """ + resp = await req_sender.async_send(RequestType.ADD_SESSION, + dict(session_id=session_id)) + _check_resp(resp, [ResponseType.SUCCESS, ResponseType.SESSION_REPEAT], + (f'Can not add session {session_id} ' + f'with error: {resp.type}')) + + +async def async_end(req_sender: RequestSender, session_id: int): + """End the given session.""" + resp = await req_sender.async_send(RequestType.END_SESSION, + dict(session_id=session_id)) + _check_resp_success(resp, (f'Failed to end session: {session_id}. ' + f'Error: {resp.type}.')) + + +async def async_cancel(req_sender: RequestSender, session_id: int): + """Stop current streaming inference.""" + resp = await req_sender.async_send(RequestType.STOP_SESSION, + dict(session_id=session_id)) + _check_resp_success(resp, (f'Failed to cancel session: {session_id}. ' + f'Error: {resp.type}.')) + + +def try_add_session(req_sender: RequestSender, session_id: int): + """Add new session. + + Args: + session_id (int): The session id to add. + """ + resp = req_sender.send(RequestType.ADD_SESSION, + dict(session_id=session_id)) + _check_resp(resp, [ResponseType.SUCCESS, ResponseType.SESSION_REPEAT], + (f'Can not add session {session_id} ' + f'with error: {resp.type}')) + + +def end(req_sender: RequestSender, session_id: int): + """End the given session.""" + resp = req_sender.send(RequestType.END_SESSION, + dict(session_id=session_id)) + _check_resp_success(resp, (f'Failed to end session: {session_id}. ' + f'Error: {resp.type}.')) + + +def cancel(req_sender: RequestSender, session_id: int): + """Stop current streaming inference.""" + resp = req_sender.send(RequestType.STOP_SESSION, + dict(session_id=session_id)) + _check_resp_success(resp, (f'Failed to cancel session: {session_id}. ' + f'Error: {resp.type}.')) + + class Engine: """The inference engine of lmdeploy pytorch. @@ -100,10 +156,13 @@ class Engine: def __init__(self, model_path: str, - engine_config: PytorchEngineConfig, + engine_config: PytorchEngineConfig = None, trust_remote_code: bool = True) -> None: check_env() - check_model(model_path) + check_model(model_path, trust_remote_code) + + if engine_config is None: + engine_config = PytorchEngineConfig() self.engine_config = engine_config model_name = engine_config.model_name @@ -153,8 +212,8 @@ def __init__(self, self.req_manager = self._bind_request_manager() # create main thread - self.loop_threads = self._start_loop() - self.req_sender = self.req_manager.build_sender(self.loop_threads) + self._start_loop() + self.req_sender = self.req_manager.build_sender() self._create_buffers() self.tokenizer = Tokenizer(model_path) @@ -162,7 +221,7 @@ def __init__(self, @classmethod def from_pretrained(cls, pretrained_model_name_or_path: str, - engine_config: PytorchEngineConfig, + engine_config: PytorchEngineConfig = None, trust_remote_code: bool = True, **kwargs): """lmdeploy python inference engine. @@ -199,7 +258,7 @@ def _create_buffers(self): def _bind_request_manager(self): """bind request manager.""" - req_manager = RequestManager() + req_manager = RequestManager(self.engine_config.thread_safe) req_manager.bind_func(RequestType.ADD_SESSION, self._on_add_session) req_manager.bind_func(RequestType.STOP_SESSION, self._on_stop_session) req_manager.bind_func(RequestType.END_SESSION, self._on_end_session) @@ -208,9 +267,7 @@ def _bind_request_manager(self): def _start_loop(self): """start loop.""" - loop_threads = Thread(target=self.loop, daemon=True) - loop_threads.start() - return loop_threads + return self.req_manager.start_loop(self.async_loop) def _on_add_session(self, reqs: Request, **kwargs): """on add session callback.""" @@ -281,7 +338,9 @@ def __update_bad_words(msg): req.data['token_ids']) > 0, ('Empty input is not allowed.') sess.add_sequence(req.data['token_ids'], sampling_param=req.data['sampling_param'], - adapter_name=req.data['adapter_name']) + adapter_name=req.data['adapter_name'], + return_logits=req.data.get( + 'return_logits', False)) msg = next(iter(sess.sequences.values())) __update_bad_words(msg) self.scheduler.add_sequence(msg) @@ -290,6 +349,7 @@ def __update_bad_words(msg): msg.update_token_ids(req.data['token_ids']) msg.num_new_tokens = 0 msg.sampling_param = req.data['sampling_param'] + msg.return_logits = req.data.get('return_logits', False) msg.status = MessageStatus.WAITING __update_bad_words(msg) @@ -320,27 +380,29 @@ def create_instance(self, cuda_stream_id=0): """ return EngineInstance(self) + async def async_add_session(self, session_id: int): + """Add new session.""" + return await async_try_add_session(self.req_sender, session_id) + def add_session(self, session_id: int): """Add new session.""" - resp = self.req_sender.send(RequestType.ADD_SESSION, - dict(session_id=session_id)) - _check_resp(resp, [ResponseType.SUCCESS, ResponseType.SESSION_REPEAT], - (f'Can not add session {session_id} ' - f'with error: {resp.type}')) + return try_add_session(self.req_sender, session_id) - def stop_session(self, session_id: int): + async def async_stop_session(self, session_id: int): """Stop the given session.""" - resp = self.req_sender.send(RequestType.STOP_SESSION, - dict(session_id=session_id)) - _check_resp_success(resp, (f'Failed to cancel session: {session_id}. ' - f'Error: {resp.type}.')) + return await async_cancel(self.req_sender, session_id) - def end_session(self, session_id: int): + def stop_session(self, session_id: int): + """Add new session.""" + return cancel(self.req_sender, session_id) + + async def async_end_session(self, session_id: int): """End the given session.""" - resp = self.req_sender.send(RequestType.END_SESSION, - dict(session_id=session_id)) - _check_resp_success(resp, (f'Failed to end session: {session_id}. ' - f'Error: {resp.type}.')) + return await async_end(self.req_sender, session_id) + + def end_session(self, session_id: int): + """Add new session.""" + return end(self.req_sender, session_id) @torch.inference_mode() def create_model_inputs(self, messages: SeqList, adapters: AdapterList): @@ -465,8 +527,8 @@ def _check_session_len(msg, max_session_len): return True return False - def sampling_logits(self, logits: torch.Tensor, running: SeqList, - inputs: ModelInputs): + async def async_sampling_logits(self, logits: torch.Tensor, + running: SeqList, inputs: ModelInputs): """sampling logits.""" def _gather_history(seqs: SeqList, device: torch.device): @@ -499,8 +561,12 @@ def _gather_history(seqs: SeqList, device: torch.device): if sampling_inputs.repetition_penalty is not None: input_ids = _gather_history(running, split_logits.device) logits_processor = FusedLogitsProcessor(sampling_inputs) - logits = logits_processor(input_ids, split_logits) - next_token_ids = logits_processor.sampling(logits).cpu() + + with torch.inference_mode(), torch.cuda.stream(self.stream): + logits = logits_processor(input_ids, split_logits) + next_token_ids = logits_processor.sampling(logits) + self.stream.synchronize() + next_token_ids = next_token_ids.cpu() return next_token_ids, split_logits @@ -527,8 +593,8 @@ def _can_output_token(self, token: torch.Tensor, msg: SchedulerSequence): return True - def _model_forward(self, inputs: ModelInputs, swap_in_map: Dict, - swap_out_map: Dict): + async def _async_model_forward(self, inputs: ModelInputs, + swap_in_map: Dict, swap_out_map: Dict): """model forward.""" max_prefill_token_num = self.scheduler_config.max_prefill_token_num swap_done = False @@ -562,20 +628,18 @@ def get_logits(self): torch.cuda.synchronize() return self._out_logits - def __forward(inputs): + async def __forward(inputs): """forward.""" nonlocal swap_done, swap_in_map, swap_out_map if swap_done: - return self.model_agent.forward(inputs, - swap_in_map=dict(), - swap_out_map=dict()) + return await self.model_agent.async_forward( + inputs, swap_in_map=dict(), swap_out_map=dict()) else: swap_done = True - return self.model_agent.forward(inputs, - swap_in_map=swap_in_map, - swap_out_map=swap_out_map) + return await self.model_agent.async_forward( + inputs, swap_in_map=swap_in_map, swap_out_map=swap_out_map) - def __long_context_single_forward(inputs, index): + async def __long_context_single_forward(inputs, index): """one large sequence.""" new_input = inputs.slice(index, index + 1) max_seq_len = new_input.seq_length[0] @@ -584,17 +648,17 @@ def __long_context_single_forward(inputs, index): logits_gather = _LogitsGather(max_seq_len) for inp in new_inputs: - tmp_out = __forward(inp) + tmp_out = await __forward(inp) logits_gather.gather(tmp_out) tmp_out['logits'] = logits_gather.get_logits() return tmp_out - def __long_context_batched_forward(inputs, start, end): + async def __long_context_batched_forward(inputs, start, end): """batched.""" new_inputs = inputs.slice(start, end) - return __forward(new_inputs) + return await __forward(new_inputs) - def __long_context_forward(inputs): + async def __long_context_forward(inputs): """forward for long context.""" seq_len = inputs.seq_length max_seq_len = inputs.input_ids.size(1) @@ -607,13 +671,15 @@ def __long_context_forward(inputs): while idx < batch_size: slen = seq_len[idx] if token_count == 0 and slen > max_prefill_token_num: - tmp_out = __long_context_single_forward(inputs, idx) + tmp_out = await __long_context_single_forward(inputs, idx) logits_gather.gather(tmp_out) + tmp_out.pop('logits', None) idx += 1 elif token_count + slen > max_prefill_token_num: - tmp_out = __long_context_batched_forward( + tmp_out = await __long_context_batched_forward( inputs, indices[0], idx) logits_gather.gather(tmp_out) + tmp_out.pop('logits', None) indices = [] token_count = 0 else: @@ -622,27 +688,23 @@ def __long_context_forward(inputs): idx += 1 if token_count > 0: - tmp_out = __long_context_batched_forward( + tmp_out = await __long_context_batched_forward( inputs, indices[0], idx) logits_gather.gather(tmp_out) tmp_out['logits'] = logits_gather.get_logits() return tmp_out if inputs.input_ids.numel() < max_prefill_token_num: - return __forward(inputs) + return await __forward(inputs) else: - return __long_context_forward(inputs) + return await __long_context_forward(inputs) - def step(self, is_prefill: bool, return_logits: bool = False): + async def async_step(self, is_prefill: bool, return_logits: bool = False): """one step inference. Used to perform streaming chat. - Args: - return_logits (bool): Whether to return the output logits. - Returns: Dict[int, InferOutput]: The output of each session. """ - # schedule schedule_output = self.scheduler.schedule(is_prefill=is_prefill) @@ -656,14 +718,14 @@ def step(self, is_prefill: bool, return_logits: bool = False): inputs = self.create_model_inputs(running, adapters) # inference - output = self._model_forward(inputs, - swap_in_map=swap_in_map, - swap_out_map=swap_out_map) + output = await self._async_model_forward(inputs, + swap_in_map=swap_in_map, + swap_out_map=swap_out_map) custom_outputs = output['custom_outputs'] logits = output['logits'] logits = logits[0] # [bs, seq, prob] -> [seq, prob] - next_token_ids, split_logits = self.sampling_logits( + next_token_ids, _ = await self.async_sampling_logits( logits, running, inputs) self.update_running(running, next_token_ids, custom_outputs) @@ -671,7 +733,8 @@ def step(self, is_prefill: bool, return_logits: bool = False): # generate output outputs: Dict[int, InferOutput] = dict() - for msg, next_id in zip(running, next_token_ids): + for idx, msg in enumerate(running): + next_id = next_token_ids[idx] session_id = msg.session_id if self._can_output_token(next_id, msg): out_token_ids = [next_id.item()] @@ -686,17 +749,18 @@ def step(self, is_prefill: bool, return_logits: bool = False): ) outputs[session_id] = out - if return_logits: - for msg, msg_logit in zip(running, split_logits): - outputs[msg.session_id].logits = msg_logit + if msg.return_logits: + start = inputs.q_start_loc[idx] + seqlen = inputs.seq_length[idx] + outputs[msg.session_id].logits = logits[start:start + seqlen] return outputs - def batched_infer(self, - session_ids: List[int], - token_ids: List[List[int]] = None, - gen_config: EngineGenerationConfig = None, - adapter_names: List[str] = None, - keep_cache: bool = False): + async def async_batched_infer(self, + session_ids: List[int], + token_ids: List[List[int]] = None, + gen_config: EngineGenerationConfig = None, + adapter_names: List[str] = None, + keep_cache: bool = False): """Send inference request. Args: @@ -718,11 +782,11 @@ def batched_infer(self, else: adapter_names = [None for _ in range(batch_size)] - def _add_sessions(session_ids): + async def _add_sessions(session_ids): for session_id in session_ids: - self.add_session(session_id) + await self.async_add_session(session_id) - def _add_messages(session_ids, token_ids): + async def _add_messages(session_ids, token_ids): add_msgs = [] sampling_param = SamplingParam.from_gen_config(gen_config) for session_id, token_id, adapter_name in zip( @@ -733,12 +797,12 @@ def _add_messages(session_ids, token_ids): adapter_name=adapter_name) add_msgs.append(msg) req_types = [RequestType.ADD_MESSAGE] * batch_size - req_ids = self.req_sender.batched_send_async(req_types, - data=add_msgs) + req_ids = await self.req_sender.async_batched_send_async( + req_types, data=add_msgs) return req_ids - _add_sessions(session_ids) - req_ids = _add_messages(session_ids, token_ids) + await _add_sessions(session_ids) + req_ids = await _add_messages(session_ids, token_ids) # receive messages req_idx_map = dict(zip(req_ids, range(len(req_ids)))) @@ -746,12 +810,12 @@ def _add_messages(session_ids, token_ids): status = 0 finish_count = batch_size while finish_count: - if not self.loop_threads.is_alive(): + if not self.req_manager.is_loop_alive(): logger.error('Engine loop is not alive.') status = 1 break - resp = self.req_sender.recv_any() + resp = await self.req_sender.async_recv_any() if resp.req_id not in req_ids: continue idx = req_idx_map[resp.req_id] @@ -762,7 +826,7 @@ def _add_messages(session_ids, token_ids): token_ids += resp.data['token_ids'] if not keep_cache: session_id = session_ids[idx] - self.end_session(session_id=session_id) + await self.async_end_session(session_id=session_id) finish_count -= 1 else: logger.error(f'Unexpected response: {resp.type}') @@ -772,85 +836,123 @@ def _add_messages(session_ids, token_ids): output_token_len = [len(token_ids) for token_ids in output_token_ids] return (status, output_token_ids, output_token_len) - def decode(self, prompt_token_ids: List[List[int]]): - """Perform one step inference and get logits. + def batched_infer(self, + session_ids: List[int], + token_ids: List[List[int]] = None, + gen_config: EngineGenerationConfig = None, + adapter_names: List[str] = None, + keep_cache: bool = False): + """batched infer.""" + coro = self.async_batched_infer(session_ids, + token_ids, + gen_config=gen_config, + adapter_names=adapter_names, + keep_cache=keep_cache) + return self.req_sender.run_until_complete(coro) + + def decode(self, + input_ids, + steps: List[int] = None, + sequence_start: bool = True, + sequence_end: bool = True, + adapter_names: List[str] = None): + """Perform context decode on input tokens. Args: - prompt_token_ids (List[List[int]]): Input prompts. - - Returns: - List[Tensor]: The logits. + input_ids (numpy.ndarray): the batch of input token ids + steps (List[int]): the offset of the k/v cache + sequence_start (bool): indicator for starting a sequence + sequence_end (bool): indicator for ending a sequence + adapter_names (List[str]): The name of the adapters. """ - assert not self.scheduler.has_unfinished() + from torch.nn.utils.rnn import pad_sequence + logger.debug('Decoding logits.') + batch_size = len(input_ids) + + def __add_messages(session_ids, input_ids, adapter_names): + add_msgs = [] + sampling_param = SamplingParam(max_new_tokens=0) + for session_id, token_id, adapter_name in zip( + session_ids, input_ids, adapter_names): + msg = dict(token_ids=token_id, + session_id=session_id, + sampling_param=sampling_param, + adapter_name=adapter_name, + return_logits=True) + add_msgs.append(msg) + req_types = [RequestType.ADD_MESSAGE] * batch_size + req_ids = self.req_sender.batched_send_async(req_types, + data=add_msgs) + return req_ids - if len(self.scheduler.sessions) > 0: - logger.warning( - 'Unreleased session might leads to low performance.') + if steps is not None: + assert batch_size == len(steps) - session_id = 1 - sessions: List[SchedulerSession] = [] - while len(sessions) < len(prompt_token_ids): - while session_id in self.scheduler.sessions: - session_id += 1 - sess = SchedulerSession(session_id) - sessions.append(sess) - self.add_session(sess) + if adapter_names is None: + adapter_names = [None] * batch_size + assert batch_size == len(adapter_names) - msgs: SeqList = [] - for token_ids, sess in zip(prompt_token_ids, sessions): - msg = sess.add_sequence(token_ids=token_ids) - msgs.append(msg) - self.scheduler.add_sequence(msg) + session_ids = tuple(range(batch_size)) + if sequence_start: + for sid in session_ids: + self.req_sender.send(RequestType.END_SESSION, + dict(session_id=sid)) + self.add_session(sid) - outputs = self.step(return_logits=True) + req_ids = __add_messages(session_ids, input_ids, adapter_names) + req_idx_map = dict(zip(req_ids, range(len(req_ids)))) + + finish_count = batch_size + ret = [None] * batch_size + while finish_count > 0: + resp = self.req_sender.recv_any() + if resp.req_id not in req_ids: + continue - logits = dict((k, out.logits) for k, out in outputs.items()) + assert resp.type == ResponseType.FINISH + idx = req_idx_map[resp.req_id] + ret[idx] = resp.data['logits'] + finish_count -= 1 - for sess in sessions: - self.end_session(sess.session_id) + ret = pad_sequence(ret, True) - split_logits = [logits[sess.session_id] for sess in sessions] - pad_sequence = torch.nn.utils.rnn.pad_sequence - logits = pad_sequence(split_logits, True) + if sequence_end: + for sid in session_ids: + self.end_session(sid) - return logits + return ret - def loop(self): + async def async_loop(self): """Main loop of the engine. Each engine instance would communicate with the engine by queue. """ - send_resp_que = Queue() - def _send_resp(): + def _send_resp(step_tokens): """send response callback.""" - while True: - step_tokens = send_resp_que.get() - time.sleep(0.02) - for _, out in step_tokens.items(): - if out.finish: - resp_type = ResponseType.FINISH - else: - resp_type = ResponseType.SUCCESS - self.req_manager.response( - Response( - type=resp_type, - sender_id=out.sender_id, - req_id=out.req_id, - data=dict(token_ids=out.token_ids), - )) - - send_thread = Thread(target=_send_resp, daemon=True) - send_thread.start() + for _, out in step_tokens.items(): + if out.finish: + resp_type = ResponseType.FINISH + else: + resp_type = ResponseType.SUCCESS + self.req_manager.response( + Response( + type=resp_type, + sender_id=out.sender_id, + req_id=out.req_id, + data=dict(token_ids=out.token_ids, logits=out.logits), + )) + prefill_interval = self.scheduler_config.prefill_interval prefill_counter = prefill_interval while True: if not self.req_manager.has_requests( ) and not self.scheduler.has_unfinished(): - time.sleep(0.01) + await asyncio.sleep(0.01) continue + logger.debug('async_loop: RequestManager Step.') self.req_manager.step() # forward @@ -859,13 +961,17 @@ def _send_resp(): is_prefill = not prefill_counter or not has_running if is_prefill: prefill_counter = prefill_interval + logger.debug('async_loop: Engine Step - ' + f'prefilling: {is_prefill}') with torch.inference_mode(): - step_tokens: Dict[int, InferOutput] = self.step( - is_prefill=is_prefill) + step_tokens: Dict[int, + InferOutput] = await self.async_step( + is_prefill=is_prefill) prefill_counter -= 1 # send response - send_resp_que.put(step_tokens) + logger.debug('async_loop: Response.') + _send_resp(step_tokens) class EngineInstance: @@ -877,23 +983,27 @@ class EngineInstance: def __init__(self, engine: Engine): self.engine = engine - self.req_sender = engine.req_manager.build_sender(engine.loop_threads) + self.req_sender = engine.req_manager.build_sender() def __del__(self): """Destructor.""" self.engine.req_manager.senders.pop(self.req_sender.sender_id) + async def _async_try_add_session(self, session_id: int): + """Add new session. + + Args: + session_id (int): The session id to add. + """ + return await async_try_add_session(self.req_sender, session_id) + def _try_add_session(self, session_id: int): """Add new session. Args: session_id (int): The session id to add. """ - resp = self.req_sender.send(RequestType.ADD_SESSION, - dict(session_id=session_id)) - _check_resp(resp, [ResponseType.SUCCESS, ResponseType.SESSION_REPEAT], - (f'Can not add session {session_id} ' - f'with error: {resp.type}')) + return try_add_session(self.req_sender, session_id) async def async_stream_infer(self, session_id: int, @@ -914,47 +1024,68 @@ async def async_stream_infer(self, List[int]: The streaming output tokens. int: The number of the output tokens. """ - import asyncio gen_config = gen_config or EngineGenerationConfig() - request_output_len = gen_config.max_new_tokens sampling_param = SamplingParam.from_gen_config(gen_config=gen_config) - self._try_add_session(session_id) + await async_try_add_session(self.req_sender, session_id) msg = dict( token_ids=input_ids, session_id=session_id, - max_request_output_len=request_output_len, sampling_param=sampling_param, adapter_name=adapter_name, ) - - req_id = self.req_sender.send_async(RequestType.ADD_MESSAGE, msg) + req_id = await self.req_sender.async_send_async( + RequestType.ADD_MESSAGE, msg) token_ids = [] while True: - if not self.engine.loop_threads.is_alive(): + if not self.req_sender.is_loop_alive(): yield (ResponseType.ENGINE_STOP_ERROR, [], 0) break - resps = self.req_sender.recv_all(req_id) - if len(resps) == 0: - await asyncio.sleep(0.1) - continue + resp = await self.req_sender.async_recv(req_id) - resp_type = ResponseType.SUCCESS - for resp in resps: - resp_type = resp.type - if resp.type == ResponseType.SUCCESS: - token_ids += resp.data['token_ids'] - elif resp.type == ResponseType.FINISH: - token_ids += resp.data['token_ids'] - break - else: - token_ids = [] - break - yield (resp_type, token_ids, len(token_ids)) - if resp_type != ResponseType.SUCCESS: + if resp.req_id != req_id: + continue + if resp.type == ResponseType.SUCCESS: + token_ids += resp.data['token_ids'] + yield (resp.type, token_ids, len(token_ids)) + elif resp.type == ResponseType.FINISH: + token_ids += resp.data['token_ids'] + yield (resp.type, token_ids, len(token_ids)) + break + else: + yield (resp.type, [], 0) break + async def async_infer(self, + session_id: int, + input_ids: List[int] = None, + gen_config: EngineGenerationConfig = None, + **kwargs): + """Send inference request. + + Args: + session_id (int): The session id. + input_ids (List[int]): The input token ids. + gen_config (EngineGenerationConfig): The sampling parameters. + + Returns: + int: Error flags. 0 if success. + List[int]: The streaming output tokens. + int: The number of the output tokens. + """ + token_ids = [] + async for outputs in self.async_stream_infer(session_id, + input_ids, + gen_config=gen_config, + **kwargs): + status, tmp_ids, _ = outputs + if status not in [ResponseType.SUCCESS, ResponseType.FINISH]: + return (status, token_ids, len(token_ids)) + token_ids = tmp_ids + + return (0, token_ids, len(token_ids)) + def stream_infer(self, session_id: int, input_ids: List[int], @@ -975,10 +1106,25 @@ def stream_infer(self, int: The number of the output tokens. """ - # TODO: support input embedding, step + def __call_async(): + """call async.""" + coro_gen = self.async_stream_infer(session_id, input_ids, + gen_config, adapter_name, + **kwargs) + while True: + try: + yield self.req_sender.run_until_complete( + coro_gen.__anext__()) + except StopAsyncIteration: + break + + if not self.req_sender.is_thread_safe(): + yield from __call_async() + return + gen_config = gen_config or EngineGenerationConfig() sampling_param = SamplingParam.from_gen_config(gen_config=gen_config) - self._try_add_session(session_id) + try_add_session(self.req_sender, session_id) msg = dict( token_ids=input_ids, session_id=session_id, @@ -989,11 +1135,12 @@ def stream_infer(self, token_ids = [] while True: - if not self.engine.loop_threads.is_alive(): + if not self.req_sender.is_loop_alive(): yield (ResponseType.ENGINE_STOP_ERROR, [], 0) break + resp = self.req_sender.recv(req_id) - # avoid token decoding and scheduling simultaneously + if resp.req_id != req_id: continue if resp.type == ResponseType.SUCCESS: @@ -1036,28 +1183,39 @@ def infer(self, return (0, token_ids, len(token_ids)) + async def async_end(self, session_id: int): + """End the given session.""" + return await async_end(self.req_sender, session_id) + def end(self, session_id: int): """End the given session.""" - resp = self.req_sender.send(RequestType.END_SESSION, - dict(session_id=session_id)) - _check_resp_success(resp, (f'Failed to end session: {session_id}. ' - f'Error: {resp.type}.')) + return end(self.req_sender, session_id) + + async def async_cancel(self, session_id: int): + """Stop current streaming inference.""" + return await async_cancel(self.req_sender, session_id) def cancel(self, session_id: int): """Stop current streaming inference.""" - resp = self.req_sender.send(RequestType.STOP_SESSION, - dict(session_id=session_id)) - _check_resp_success(resp, (f'Failed to cancel session: {session_id}. ' - f'Error: {resp.type}.')) + return cancel(self.req_sender, session_id) - def decode(self, prompt_token_ids: List[List[int]]): - """Return logits of context decoding. + def decode(self, + input_ids, + steps: List[int] = None, + sequence_start: bool = True, + sequence_end: bool = True, + adapter_names: List[str] = None): + """Perform context decode on input tokens. Args: - prompt_token_ids: token ids of a batch prompts. - - Returns: - logits (numpy.ndarray) with shape - [batch, n_max_token_of_the_batch, vocab_size] + input_ids (numpy.ndarray): the batch of input token ids + steps (List[int]): the offset of the k/v cache + sequence_start (bool): indicator for starting a sequence + sequence_end (bool): indicator for ending a sequence + adapter_names (List[str]): The name of the adapters. """ - return self.engine.decode(prompt_token_ids) + return self.engine.decode(input_ids, + steps=steps, + sequence_start=sequence_start, + sequence_end=sequence_end, + adapter_names=adapter_names) diff --git a/lmdeploy/pytorch/engine/logits_process.py b/lmdeploy/pytorch/engine/logits_process.py index 4493dc432f..bb2895bb5e 100644 --- a/lmdeploy/pytorch/engine/logits_process.py +++ b/lmdeploy/pytorch/engine/logits_process.py @@ -12,6 +12,7 @@ def _process_temperature(scores: torch.Tensor, temperature: torch.Tensor, inplace: bool = True): """process temperature.""" + temperature = temperature.to(scores.dtype) if not inplace: scores = scores / temperature[:, None] else: @@ -42,6 +43,7 @@ def _process_repetition_penalty(scores: torch.Tensor, inplace: bool = True): """process repetition penalty.""" score = torch.gather(scores, 1, input_ids) + penalty = penalty.to(score.dtype) score = torch.where(score < 0, score * penalty[:, None], score / penalty[:, None]) if not inplace: diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py index 008b1da2b1..537d17c072 100644 --- a/lmdeploy/pytorch/engine/model_agent.py +++ b/lmdeploy/pytorch/engine/model_agent.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import asyncio import os from dataclasses import asdict, dataclass, field from typing import Any, Callable, Dict, List, Union @@ -58,7 +59,7 @@ def _update_cache_config(model_config: ModelConfig, gpu_mem = gpu_mem_physical_free * cache_config.cache_max_entry_count cpu_mem = host_mem_size cache_block_size = CacheEngine.get_cache_block_size( - cache_config.block_size, model_config) // world_size + cache_config.block_size, model_config, world_size) if cache_config.num_cpu_blocks == 0: cache_config.num_cpu_blocks = int(cpu_mem / cache_block_size) if cache_config.num_gpu_blocks == 0: @@ -343,7 +344,6 @@ def model_forward( use_origin=False, context=context, ) - stream.synchronize() return dict(logits=output['logits'], custom_outputs=context._outputs) @@ -405,6 +405,17 @@ def paging_adapters(self, weight_maps: List[AdapterWeightMap]): """paging adapter.""" raise NotImplementedError('Not implemented.') + async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, + swap_out_map: SwapMap): + """model forward. + + Args: + inputs (Dict): The input data comes from _make_inputs. + swap_in_map (SwapMap): Cache maps to swap in. + swap_out_map (SwapMap): Cache maps to swap out. + """ + raise NotImplementedError('Not implemented.') + def forward(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap): """model forward. @@ -507,16 +518,8 @@ def paging_adapters(self, weight_maps: List[AdapterWeightMap]): weight_map.cache_adapter(lora_linears, cpu_caches) update_lora_linears(lora_linears, weight_maps, device='cuda') - def forward(self, inputs: ModelInputs, swap_in_map: SwapMap, - swap_out_map: SwapMap): - """model forward. - - Args: - inputs (Dict): The input data comes from _make_inputs. - swap_in_map (SwapMap): Cache maps to swap in. - swap_out_map (SwapMap): Cache maps to swap out. - """ - + def _forward_impl(self, inputs: ModelInputs, swap_in_map: SwapMap, + swap_out_map: SwapMap): cache_swapping(self.cache_engine, swap_in_map=swap_in_map, swap_out_map=swap_out_map) @@ -530,6 +533,37 @@ def forward(self, inputs: ModelInputs, swap_in_map: SwapMap, ) return output + def forward(self, inputs: ModelInputs, swap_in_map: SwapMap, + swap_out_map: SwapMap): + """model forward. + + Args: + inputs (Dict): The input data comes from _make_inputs. + swap_in_map (SwapMap): Cache maps to swap in. + swap_out_map (SwapMap): Cache maps to swap out. + """ + output = self._forward_impl(inputs, + swap_in_map=swap_in_map, + swap_out_map=swap_out_map) + self.stream.synchronize() + return output + + async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, + swap_out_map: SwapMap): + """model forward. + + Args: + inputs (Dict): The input data comes from _make_inputs. + swap_in_map (SwapMap): Cache maps to swap in. + swap_out_map (SwapMap): Cache maps to swap out. + """ + output = self._forward_impl(inputs, + swap_in_map=swap_in_map, + swap_out_map=swap_out_map) + await asyncio.get_event_loop().run_in_executor(None, + self.stream.synchronize) + return output + @dataclass class TPResponse: @@ -886,6 +920,7 @@ def _tp_model_loop( world_size=world_size, stream=stream, ) + stream.synchronize() if rank == 0: resp_output = output out_que.put(TPResponse(0, None, resp_output)) @@ -922,25 +957,45 @@ def _start_tp_process(rank: int, raise e +def _check_context_alive(mp_context: mp.ProcessContext): + """check context alive.""" + procs = mp_context.processes + for idx, p in enumerate(procs): + if not p.is_alive(): + raise RuntimeError(f'Rank[{idx}] failed.') + + def _queue_get_response(que: mp.Queue, mp_context: mp.ProcessContext, interval: float = 1.0): """get response.""" from multiprocessing.queues import Empty + while True: + try: + return que.get(timeout=interval) + except Empty: + _check_context_alive(mp_context) - def __check_context_alive(): - """check context alive.""" - procs = mp_context.processes - for idx, p in enumerate(procs): - if not p.is_alive(): - raise RuntimeError(f'Rank[{idx}] failed.') - while True: +async def _async_queue_get_response(que: mp.Queue, + mp_context: mp.ProcessContext, + interval: float = 1.0): + """get response.""" + from multiprocessing.queues import Empty + + def __try_que_get(): + """try que get.""" try: return que.get(timeout=interval) except Empty: - pass - __check_context_alive() + return None + + while True: + ret = await asyncio.get_event_loop().run_in_executor( + None, __try_que_get) + if ret is not None: + return ret + _check_context_alive(mp_context) class TPModelAgent(AutoModelAgent): @@ -1059,6 +1114,24 @@ def forward(self, inputs: ModelInputs, swap_in_map: SwapMap, return resp.data + async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, + swap_out_map: SwapMap): + """model forward. + + Args: + inputs (Dict): The input data comes from _make_inputs. + swap_in_map (Dict[int, int]): Cache maps to swap in. + swap_out_map (Dict[int, int]): Cache maps to swap out. + """ + with torch.no_grad(): + self.tp_model_in_que.put((inputs, swap_in_map, swap_out_map)) + + resp: TPResponse = await _async_queue_get_response( + self.tp_model_out_que, self.mp_context) + if resp.ret_code != 0: + raise RuntimeError('tp forward failed.') + return resp.data + def build_model_agent(model_path: str, cache_config: CacheConfig, diff --git a/lmdeploy/pytorch/engine/request.py b/lmdeploy/pytorch/engine/request.py index 047ced1043..f71c8a8028 100644 --- a/lmdeploy/pytorch/engine/request.py +++ b/lmdeploy/pytorch/engine/request.py @@ -4,7 +4,7 @@ from dataclasses import dataclass, field from queue import Empty, Queue from threading import Lock, Thread -from typing import Any, Callable, ClassVar, Dict, List +from typing import Any, Awaitable, Callable, Dict, List from lmdeploy.messages import ResponseType from lmdeploy.utils import get_logger @@ -12,6 +12,26 @@ logger = get_logger('lmdeploy') +def _raise_exception_on_finish(task: asyncio.Task) -> None: + msg = ('Engine loop failed!') + try: + task.result() + except asyncio.CancelledError: + return + except Exception as exc: + raise RuntimeError(msg) from exc + + +def _ignore_exception_on_finish(task: asyncio.Task) -> None: + try: + task.result() + except asyncio.CancelledError: + return + except Exception as exc: + logger.info(f'task: {task.get_name()} ended.') + logger.debug(f'task: {task.get_name()} exception: {exc}') + + class RequestType(enum.Enum): """Request type.""" @@ -44,6 +64,21 @@ class Response: err_msg: str = '' +ReqList = List[Request] + + +def _run_until_complete(future: Awaitable): + """run untile complete.""" + try: + event_loop = asyncio.get_event_loop() + except Exception: + logger.warning('Can not found event loop in current thread.' + ' Create a new event loop.') + event_loop = asyncio.new_event_loop() + asyncio.set_event_loop(event_loop) + return event_loop.run_until_complete(future) + + @dataclass class RequestSender: """Request sender. @@ -53,53 +88,133 @@ class RequestSender: """ sender_id: int - req_que: Queue - resp_que: Queue = field(default_factory=Queue) + manager: 'RequestManager' resp_dict: Dict[int, List[Response]] = field(default_factory=dict) - THREAD_ALIVE_INTERVAL: ClassVar[float] = 1.0 _next_req_id: int = 0 - _thread: Thread = None + _resp_que: asyncio.Queue = None + _resp_thread_que: Queue = None @classmethod - def new(cls, sender_id: int, req_que: Queue, thread: Thread): - """new sender.""" - return cls(sender_id=sender_id, req_que=req_que, _thread=thread) - - def _resp_que_get(self, block: bool = True, timeout: float = None): - """warp of resp_que.get.""" - if not block: - return self.resp_que.get_nowait() - timeout_counter = timeout or float(1 << 30) - while timeout_counter > self.THREAD_ALIVE_INTERVAL: + def new(cls, sender_id: int, manager: 'RequestManager'): + """new.""" + return cls(sender_id=sender_id, manager=manager) + + @property + def resp_que(self): + """response queue.""" + if self.is_thread_safe(): + return self.manager.responses + if self.manager._loop_task is None and not self.is_thread_safe(): + self.manager.create_loop_task() + if self._resp_que is None: + self._resp_que = asyncio.Queue() + return self._resp_que + + @property + def req_que(self): + """request queue.""" + return self.manager.requests + + @property + def resp_thread_que(self): + """response threadsafe queue.""" + if self._resp_thread_que is None: + self._resp_thread_que = Queue() + return self._resp_thread_que + + @property + def req_thread_que(self): + """request threadsafe queue.""" + return self.manager.thread_requests + + @property + def event_loop(self): + """get event loop.""" + return self.manager.event_loop + + def is_thread_safe(self): + """is thread safe.""" + return self.manager.is_thread_safe() + + def is_loop_alive(self): + """is loop alive.""" + return self.manager.is_loop_alive() + + def run_until_complete(self, future: Awaitable): + """run untile complete.""" + return self.manager.run_until_complete(future) + + def _resp_get(self): + """resp_que.get.""" + timeout = 1 + + while True: + if not self.manager.is_loop_alive(): + logger.debug('Engine loop is not alive.') + exit(1) try: - return self.resp_que.get(timeout=self.THREAD_ALIVE_INTERVAL) + ret = self.resp_thread_que.get(timeout=timeout) + return ret except Empty: - timeout_counter -= self.THREAD_ALIVE_INTERVAL - if self._thread and not self._thread.is_alive(): - logger.error('Engine main loop stopped.') - exit(1) + continue + except Exception as e: + raise e + + async def _async_resp_get(self): + """get resp. + + Different behavior in threadsafe mode. + """ + timeout = 1 + + async def __no_threadsafe_get(): + while True: + if not self.manager.is_loop_alive(): + logger.debug('Engine loop is not alive.') + exit(1) + try: + return await asyncio.wait_for(self.resp_que.get(), timeout) + except asyncio.TimeoutError: + continue + except Exception as e: + raise e + + if self.is_thread_safe(): + ret = self._resp_get() + await asyncio.sleep(0) + return ret + else: + return await __no_threadsafe_get() - return self.resp_que.get(timeout=timeout_counter) - - async def _async_resp_que_get(self, - block: bool = True, - timeout: float = None): - """warp of resp_que.get.""" - if not block: - return self.resp_que.get_nowait() - timeout_counter = timeout or float(1 << 30) - while timeout_counter > self.THREAD_ALIVE_INTERVAL: - if self.resp_que.qsize() == 0: - await asyncio.sleep(self.THREAD_ALIVE_INTERVAL) - timeout_counter -= self.THREAD_ALIVE_INTERVAL - else: - return self.resp_que.get(block=False) - if self._thread and not self._thread.is_alive(): - logger.error('Engine main loop stopped.') - exit(1) + def _req_put(self, reqs: Any): + """req put.""" + self.req_thread_que.put(reqs) + + async def _async_req_put(self, reqs: Any): + """async rq_que put. - await asyncio.sleep(self.THREAD_ALIVE_INTERVAL) - return self.resp_que.get(block=False) + Different behavior in threadsafe mode. + """ + if self.is_thread_safe(): + self._req_put(reqs) + await asyncio.sleep(0) + else: + await self.req_que.put(reqs) + + def _prefetch_resps(self): + """prefetch from resp que. + + Different behavior in threadsafe mode. + """ + if self.is_thread_safe(): + resp_que = self.resp_thread_que + else: + resp_que = self.resp_que + num_resps = resp_que.qsize() + for _ in range(num_resps): + resp: Response = resp_que.get_nowait() + req_id = resp.req_id + self._push_resp(req_id, resp) def _push_resp(self, req_id: int, resp: Response): """push response.""" @@ -116,22 +231,11 @@ def _pop_resp(self, req_id: int, default: Any = None): self.resp_dict.pop(req_id) return ret - def _prefetch_resps(self): - """prefetch from resp que.""" - num_resps = self.resp_que.qsize() - for _ in range(num_resps): - resp: Response = self._resp_que_get(block=False) - req_id = resp.req_id - self._push_resp(req_id, resp) - - def is_thread_alive(self): - """is thread alive.""" - return self._thread and self._thread.is_alive() - - def batched_send_async(self, req_types: List[RequestType], - data: List[Any]) -> List[int]: - """Batched send request asynchronize.""" - if self._thread and not self._thread.is_alive(): + def _gather_request(self, req_types: List[RequestType], data: List[Any]): + """gather requests.""" + if self.manager._loop_task is None and not self.is_thread_safe(): + self.manager.create_loop_task() + if not self.is_loop_alive(): logger.error('Engine main loop stopped.') exit(1) assert len(req_types) == len(data) @@ -148,79 +252,120 @@ def batched_send_async(self, req_types: List[RequestType], data=rdata) for req_id, rtype, rdata in zip(req_ids, req_types, data) ] - self.req_que.put(reqs) + return req_ids, reqs + async def async_batched_send_async(self, req_types: List[RequestType], + data: List[Any]): + """Batched send request asynchronize.""" + req_ids, reqs = self._gather_request(req_types, data) + await self._async_req_put(reqs) + return req_ids + + async def async_send_async(self, req_type: RequestType, data: Any): + """send request asynchronize.""" + return (await self.async_batched_send_async(req_types=[req_type], + data=[data]))[0] + + def batched_send_async(self, req_types: List[RequestType], + data: List[Any]) -> List[int]: + """Batched send request asynchronize. + + Different behavior in threadsafe mode. + """ + if not self.is_thread_safe(): + coro = self.async_batched_send_async(req_types, data) + return self.run_until_complete(coro) + + req_ids, reqs = self._gather_request(req_types, data) + self._req_put(reqs) return req_ids def send_async(self, req_type: RequestType, data: Any) -> int: """send request asynchronize.""" return self.batched_send_async(req_types=[req_type], data=[data])[0] - def recv_any(self, que_timeout: float = None) -> Response: + async def async_recv_any(self, que_timeout: float = None) -> Response: """receive any response.""" - # check resp dict self._prefetch_resps() for req_id in self.resp_dict: ret = self._pop_resp(req_id, default=None) if ret is not None: return ret + return await self._async_resp_get() - # check resp que - return self._resp_que_get(timeout=que_timeout) + def recv_any(self, que_timeout: float = None) -> Response: + """receive any response.""" + coro = self.async_recv_any(que_timeout) + return self.run_until_complete(coro) - def recv_all(self, req_id: int): + def recv_all(self, req_id: int, block: bool = True): """revceive all response with req_id.""" self._prefetch_resps() resps = self.resp_dict.pop(req_id, []) return resps - def recv(self, req_id: int, que_timeout: float = None) -> Response: - """receive response of given request id.""" - # check resp dict + async def async_recv(self, + req_id: int, + que_timeout: float = None) -> Response: + """receive response of given request id async.""" ret = self._pop_resp(req_id, default=None) if ret is not None: return ret # check resp que while True: - resp: Response = self._resp_que_get(timeout=que_timeout) + resp: Response = await self._async_resp_get() if resp.req_id != req_id: self._push_resp(req_id, resp) else: return resp - async def async_recv(self, - req_id: int, - que_timeout: float = None) -> Response: - """receive response of given request id async.""" + def recv(self, req_id: int, que_timeout: float = None) -> Response: + """receive response of given request id. + + Different behavior in threadsafe mode. + """ + if not self.is_thread_safe(): + coro = self.async_recv(req_id, que_timeout) + return self.run_until_complete(coro) + ret = self._pop_resp(req_id, default=None) if ret is not None: return ret # check resp que while True: - resp: Response = await self._async_resp_que_get(timeout=que_timeout - ) + resp: Response = self._resp_get() if resp.req_id != req_id: self._push_resp(req_id, resp) else: return resp + async def async_send(self, + req_type: RequestType, + data: Any, + que_timeout: float = None): + """send and receive synchronize.""" + req_id = await self.async_send_async(req_type, data) + return await self.async_recv(req_id, que_timeout=que_timeout) + def send(self, req_type: RequestType, data: Any, que_timeout: float = None) -> Response: """send and receive synchronize.""" req_id = self.send_async(req_type, data) - return self.recv(req_id, que_timeout=que_timeout) + def response_callback(self, resp: Response): + """response callback.""" + self.resp_que.put_nowait(resp) + class RequestManager: """Request manager.""" - def __init__(self): - self._next_sender_id = 0 + def __init__(self, thread_safe: bool = False): self.senders: Dict[int, RequestSender] = dict() self.callbacks: Dict[RequestType, Callable] = dict() self.request_priority: List[RequestType] = [ @@ -228,47 +373,158 @@ def __init__(self): RequestType.END_SESSION, RequestType.ADD_SESSION, RequestType.ADD_MESSAGE ] - self.requests = Queue() - self.mutex = Lock() - - def build_sender(self, thread: Thread = None): + self.requests: asyncio.Queue = None + self._loop_task: asyncio.Future = None + self._loop_coro: Callable = None + self._thread_safe = thread_safe + self._next_sender_id = 0 + self._mutex = Lock() + self._loop_thread: Thread = None + + self.thread_requests: Queue = None + # every sender has it's own responses, this responses is + # only used in thread safe mode. + self.responses: asyncio.Queue = None + if thread_safe: + self.thread_requests = Queue() + + def create_loop_task(self): + """create coro task.""" + logger.debug('creating engine loop task.') + event_loop = asyncio.get_event_loop() + assert self._loop_coro is not None, ( + 'Please set loop task with manager.start_loop') + loop_unshielded = event_loop.create_task(self._loop_coro(), + name='EngineMainLoop') + loop_unshielded.add_done_callback(_raise_exception_on_finish) + self._loop_task = asyncio.shield(loop_unshielded) + self.requests = asyncio.Queue() + return self._loop_task + + @property + def event_loop(self): + """get event loop.""" + if self._loop_task is None: + return None + else: + return self._loop_task.get_loop() + + def is_thread_safe(self): + """is thread safe.""" + return self._thread_safe + + def start_loop(self, loop: asyncio.Task): + """start main loop.""" + self._loop_coro = loop + + def __get_thread_reqs(): + """get thread reqs.""" + num_reqs = self.thread_requests.qsize() + reqs = [] + for _ in range(num_reqs): + tmp_reqs = self.thread_requests.get_nowait() + if isinstance(tmp_reqs, Request): + tmp_reqs = [tmp_reqs] + reqs += tmp_reqs + return reqs + + async def __req_loop(): + """req loop.""" + while True: + # get reqs + reqs = __get_thread_reqs() + + if len(reqs) > 0: + await self.requests.put(reqs) + else: + await asyncio.sleep(0.02) + + def __put_thread_resps(resps: List[Response]): + """put thread resps.""" + for resp in resps: + sender = self.senders.get(resp.sender_id, None) + if sender is None: + continue + sender.resp_thread_que.put_nowait(resp) + + async def __resp_loop(): + """resp loop.""" + while True: + num_resps = self.responses.qsize() + resps = [] + for _ in range(num_resps): + resps.append(self.responses.get_nowait()) + if len(resps) > 0: + __put_thread_resps(resps) + else: + await asyncio.sleep(0.02) + + def __run_forever(event_loop: asyncio.BaseEventLoop): + """run forever.""" + logger.debug('start thread run forever.') + asyncio.set_event_loop(event_loop) + self.create_loop_task() + req_loop = event_loop.create_task(__req_loop(), + name='RunForeverReqLoop') + req_loop.add_done_callback(_ignore_exception_on_finish) + resp_loop = event_loop.create_task(__resp_loop(), + name='RunForeverRespLoop') + resp_loop.add_done_callback(_ignore_exception_on_finish) + self.event_loop.run_forever() + + if self.is_thread_safe(): + event_loop = asyncio.new_event_loop() + self.responses = asyncio.Queue() + self._loop_thread = Thread(target=__run_forever, + args=(event_loop, ), + daemon=True) + self._loop_thread.start() + + def is_loop_alive(self): + """check if main loop is alive.""" + + def __check_threadsafe(): + if self._loop_thread is None: + return False + if not self._loop_thread.is_alive(): + return False + if self._loop_task is None: + return False + return not self._loop_task.done() + + if self.is_thread_safe(): + return __check_threadsafe() + + if self._loop_task is None: + logger.debug('loop task has not been created.') + return False + if self._loop_task.get_loop() != asyncio.get_event_loop(): + logger.warning('Current event loop is different from' + ' the one bound to loop task!') + return False + return not self._loop_task.done() + + def build_sender(self): """create a new sender.""" - with self.mutex: + with self._mutex: sender_id = self._next_sender_id self._next_sender_id += 1 - new_sender = RequestSender.new(sender_id, self.requests, thread) + new_sender = RequestSender.new(sender_id, self) self.senders[sender_id] = new_sender return new_sender - def bind_func(self, req_type: RequestType, callback: Callable): - """bind handler for given request type.""" - self.callbacks[req_type] = callback - - def set_request_priority(self, priority: List[RequestType]): - """set the priority of request type.""" - self.request_priority = priority - def has_requests(self): """has unprocessed request.""" + if self.requests is None: + return False return not self.requests.empty() - def response(self, resp: Response, timeout: float = None): - """send response.""" - if resp.sender_id not in self.senders: - logger.warning(f'sender {resp.sender_id} not exist. ' - f'Send {resp} failed.') - return - resp_que = self.senders[resp.sender_id].resp_que - resp_que.put(resp, timeout=timeout) - def get_all_requests(self) -> Dict[RequestType, Request]: """get all requests in current queue.""" num_reqs = self.requests.qsize() - reqs: List[Request] = [] - tmp = num_reqs - while tmp: - tmp -= 1 - elem = self.requests.get() + reqs: ReqList = [] + for _ in range(num_reqs): + elem = self.requests.get_nowait() if isinstance(elem, Request): elem = [elem] reqs += elem @@ -280,7 +536,23 @@ def get_all_requests(self) -> Dict[RequestType, Request]: reqs_by_type[req.type].append(req) return reqs_by_type - def process_request(self, req_type, reqs, **kwargs): + def bind_func(self, req_type: RequestType, callback: Callable): + """bind handler for given request type.""" + self.callbacks[req_type] = callback + + def set_request_priority(self, priority: List[RequestType]): + """set the priority of request type.""" + self.request_priority = priority + + def response(self, resp: Response): + """send response.""" + if resp.sender_id not in self.senders: + logger.warning(f'sender {resp.sender_id} not exist. ' + f'Send {resp} failed.') + return + self.senders[resp.sender_id].response_callback(resp) + + def process_request(self, req_type: RequestType, reqs: ReqList, **kwargs): """process reqs with given req type.""" # get callback func = self.callbacks.get(req_type, None) @@ -297,7 +569,10 @@ def process_request(self, req_type, reqs, **kwargs): self.response(resp) def step(self, **kwargs): - """handle requests.""" + """handle requests. + + Should only be called in loop task. + """ reqs_by_type = self.get_all_requests() # handle requests @@ -306,5 +581,9 @@ def step(self, **kwargs): if req_type not in reqs_by_type or len(reqs_by_type) == 0: continue - reqs: List[Request] = reqs_by_type[req_type] + reqs: ReqList = reqs_by_type[req_type] self.process_request(req_type, reqs, **kwargs) + + def run_until_complete(self, future: Awaitable): + """run untile complete.""" + return _run_until_complete(future) diff --git a/lmdeploy/pytorch/kernels/__init__.py b/lmdeploy/pytorch/kernels/__init__.py index a1e2ead436..31d2e4c0d8 100644 --- a/lmdeploy/pytorch/kernels/__init__.py +++ b/lmdeploy/pytorch/kernels/__init__.py @@ -1,9 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from .alibi_pagedattention import alibi_paged_attention_fwd from .apply_rotary_pos_emb import apply_rotary_pos_emb -from .biased_pagedattention import biased_paged_attention_fwd from .fill_kv_cache import fill_kv_cache -from .flashattention_nopad import context_attention_fwd from .fused_rotary_emb import fused_rotary_emb from .multinomial_sampling import multinomial_sampling from .pagedattention import paged_attention_fwd @@ -11,8 +9,7 @@ from .rms_norm import rms_norm __all__ = [ - 'apply_rotary_pos_emb', 'context_attention_fwd', 'fused_rotary_emb', - 'paged_attention_fwd', 'biased_paged_attention_fwd', + 'apply_rotary_pos_emb', 'fused_rotary_emb', 'paged_attention_fwd', 'alibi_paged_attention_fwd', 'fill_kv_cache', 'multinomial_sampling', 'rms_norm', 'rerope_attention_fwd' ] diff --git a/lmdeploy/pytorch/kernels/biased_pagedattention.py b/lmdeploy/pytorch/kernels/biased_pagedattention.py deleted file mode 100644 index 1270c17e7b..0000000000 --- a/lmdeploy/pytorch/kernels/biased_pagedattention.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -# modify from: https://github.com/ModelTC/lightllm -import torch -import triton -import triton.language as tl -from torch import Tensor - -assert triton.__version__ >= '2.1.0' - -_NV_CAP = torch.cuda.get_device_capability() -if _NV_CAP[0] >= 8: - - @triton.jit - def _convert_pv(p, v): - """convert pv.""" - p = p.to(v.dtype) - return p, v -else: - - @triton.jit - def _convert_pv(p, v): - """convert pv.""" - v = v.to(p.dtype) - return p, v - - -@triton.jit -def _fwd_kernel( - Q, - K, - V, - Bias, - sm_scale, - B_Start_Loc, - B_Seqlen, - B_kvlen, - Block_offsets, - Out, - stride_qbs, - stride_qh, - stride_qd, - stride_kbs, - stride_kh, - stride_kd, - stride_vbs, - stride_vh, - stride_vd, - stride_biasbs, - stride_biash, - stride_biasq, - stride_biask, - stride_obs, - stride_oh, - stride_od, - stride_boffb, - kv_group_num, - BLOCK_M: tl.constexpr, - BLOCK_DMODEL: tl.constexpr, - BLOCK_N: tl.constexpr, -): - """biased paged attention kernel.""" - cur_batch = tl.program_id(0) - cur_head = tl.program_id(1) - start_m = tl.program_id(2) - - cur_kv_head = cur_head // kv_group_num - - cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) - cur_batch_kv_len = tl.load(B_kvlen + cur_batch) - cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) - - block_start_loc = BLOCK_M * start_m - - # initialize offsets - offs_n = tl.arange(0, BLOCK_N) - offs_d = tl.arange(0, BLOCK_DMODEL) - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - off_q = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + - cur_head * stride_qh + offs_d[None, :] * stride_qd) - off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + - offs_d[:, None] * stride_kd) - off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + - offs_d[None, :] * stride_vd) - off_bias = (cur_batch * stride_biasbs + cur_head * stride_biash + - offs_m[:, None] * stride_biasq + - offs_n[None, :] * stride_biask) - - q = tl.load(Q + off_q, mask=offs_m[:, None] < cur_batch_seq_len, other=0.0) - - k_ptrs = K + off_k - v_ptrs = V + off_v - bias_ptrs = Bias + off_bias - - block_offset_ptrs = Block_offsets + cur_batch * stride_boffb - - # initialize pointer to m and l - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf') - l_i = tl.zeros([BLOCK_M], dtype=tl.float32) - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - - block_mask = tl.where(block_start_loc < cur_batch_seq_len, 1, 0) - - for start_n in range(0, block_mask * cur_batch_kv_len, BLOCK_N): - start_n = tl.multiple_of(start_n, BLOCK_N) - - start_block_id = start_n // BLOCK_N - b_offset = tl.load(block_offset_ptrs + start_block_id) - - # -- compute qk ---- - k = tl.load( - k_ptrs + b_offset * BLOCK_N * stride_kbs, - mask=(start_n + offs_n[None, :]) < cur_batch_kv_len, - other=0.0, - ) - - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) - qk *= sm_scale - - bias = tl.load( - bias_ptrs + start_n, - mask=(start_n + offs_n[None, :]) < cur_batch_kv_len - and (offs_m[:, None] < cur_batch_seq_len), - other=-1e30, - ) - qk += bias - - # -- compute p, m_i and l_i - m_i_new = tl.maximum(m_i, tl.max(qk, 1)) - p = tl.exp(qk - m_i_new[:, None]) - alpha = tl.exp(m_i - m_i_new) - l_i_new = alpha * l_i + tl.sum(p, 1) - # -- update output accumulator -- - # scale acc - acc = acc * alpha[:, None] - # update acc - v = tl.load( - v_ptrs + b_offset * BLOCK_N * stride_vbs, - mask=(start_n + offs_n[:, None]) < cur_batch_kv_len, - other=0.0, - ) - - p, v = _convert_pv(p, v) - acc += tl.dot(p, v) - # update m_i and l_i - l_i = l_i_new - m_i = m_i_new - - acc = acc / l_i[:, None] - # initialize pointers to output - off_o = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + - cur_head * stride_oh + offs_d[None, :] * stride_od) - out_ptrs = Out + off_o - tl.store(out_ptrs, acc, mask=offs_m[:, None] < cur_batch_seq_len) - - -@torch.no_grad() -def biased_paged_attention_fwd( - q: Tensor, - k: Tensor, - v: Tensor, - bias: Tensor, - o: Tensor, - block_offsets: Tensor, - b_start_loc: Tensor, - b_seq_len: Tensor, - b_kv_seq_len: Tensor, - max_input_len: int, - BLOCK: int = 64, -): - """Paged attention forward with custom bias. - - Args: - q (Tensor): Query state. - k (Tensor): Key state caches. - v (Tensor): Value state caches. - bias (Tensor): Bias of the QK. - o (Tensor): Output state. - block_offsets (Tensor): The block offset of key and value. - b_start_loc (Tensor): Start token location of each data in batch. - b_seq_len (Tensor): Query length for each data in batch. - b_kv_seq_len (Tensor): Key/Value length for each data in batch. - max_input_len (int): The max input length. - BLOCK (int): The kernel block size. - """ - # shape constraints - Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] - assert Lq == Lk and Lk == Lv - assert Lk in {16, 32, 64, 128} - assert bias.dtype == torch.float32 - - if bias.dim() == 2: - bias = bias.unsqueeze(0) - - if bias.dim() == 3: - bias = bias.unsqueeze(1) - - sm_scale = 1.0 / (Lq**0.5) # 计算scale系数 - batch, head = b_seq_len.shape[0], q.shape[-2] - kv_group_num = q.shape[-2] // k[0].shape[-2] - - grid = (batch, head, triton.cdiv(max_input_len, BLOCK)) # batch, head, - bias_head_stride = 0 if bias.size(1) == 1 else bias.stride(-3) - - num_warps = 4 if Lk <= 64 else 8 - _fwd_kernel[grid]( - q, - k, - v, - bias, - sm_scale, - b_start_loc, - b_seq_len, - b_kv_seq_len, - block_offsets, - o, - q.stride(-3), - q.stride(-2), - q.stride(-1), - k.stride(-3), - k.stride(-2), - k.stride(-1), - v.stride(-3), - v.stride(-2), - v.stride(-1), - bias.stride(-4), - bias_head_stride, - bias.stride(-2), - bias.stride(-1), - o.stride(-3), - o.stride(-2), - o.stride(-1), - block_offsets.stride(0), - kv_group_num=kv_group_num, - BLOCK_M=BLOCK, - BLOCK_DMODEL=Lk, - BLOCK_N=BLOCK, - num_warps=num_warps, - num_stages=1, - ) diff --git a/lmdeploy/pytorch/kernels/flashattention_nopad.py b/lmdeploy/pytorch/kernels/flashattention_nopad.py deleted file mode 100644 index 567a744968..0000000000 --- a/lmdeploy/pytorch/kernels/flashattention_nopad.py +++ /dev/null @@ -1,199 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -# modify from: https://github.com/ModelTC/lightllm -import torch -import triton -import triton.language as tl -from torch import Tensor - -assert triton.__version__ >= '2.1.0' - - -@triton.jit -def _fwd_kernel( - Q, - K, - V, - sm_scale, - B_Start_Loc, - B_Seqlen, - B_kv_start_loc, - B_kvlen, - Out, - stride_qbs, - stride_qh, - stride_qd, - stride_kbs, - stride_kh, - stride_kd, - stride_vbs, - stride_vh, - stride_vd, - stride_obs, - stride_oh, - stride_od, - kv_group_num, - BLOCK_M: tl.constexpr, - BLOCK_DMODEL: tl.constexpr, - BLOCK_N: tl.constexpr, -): - """flash attention forward triton kernel.""" - cur_batch = tl.program_id(0) - cur_head = tl.program_id(1) - start_m = tl.program_id(2) - - cur_kv_head = cur_head // kv_group_num - - cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) - cur_batch_kv_len = tl.load(B_kvlen + cur_batch) - cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) - cur_batch_in_kv_start_index = tl.load(B_kv_start_loc + cur_batch) - history_len = cur_batch_kv_len - cur_batch_seq_len - - block_start_loc = BLOCK_M * start_m - - # initialize offsets - offs_n = tl.arange(0, BLOCK_N) - offs_d = tl.arange(0, BLOCK_DMODEL) - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - off_q = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + - cur_head * stride_qh + offs_d[None, :] * stride_qd) - off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + - offs_d[:, None] * stride_kd) - off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + - offs_d[None, :] * stride_vd) - - q = tl.load(Q + off_q, mask=offs_m[:, None] < cur_batch_seq_len, other=0.0) - - k_ptrs = K + off_k - v_ptrs = V + off_v - - # initialize pointer to m and l - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf') - l_i = tl.zeros([BLOCK_M], dtype=tl.float32) - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - - block_mask = tl.where(block_start_loc < cur_batch_seq_len, 1, 0) - - for start_n in range(0, block_mask * cur_batch_kv_len, BLOCK_N): - start_n = tl.multiple_of(start_n, BLOCK_N) - # -- compute qk ---- - k = tl.load( - k_ptrs + (cur_batch_in_kv_start_index + start_n) * stride_kbs, - mask=(start_n + offs_n[None, :]) < cur_batch_kv_len, - other=0.0, - ) - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) - qk *= sm_scale - # NOTE: inf - inf = nan, and nan will leads to error - qk = tl.where( - (history_len + offs_m[:, None]) >= (start_n + offs_n[None, :]), - qk, - float(-1e30), - ) - - # -- compute m_ij, p, l_ij - m_ij = tl.max(qk, 1) - p = tl.exp(qk - m_ij[:, None]) - l_ij = tl.sum(p, 1) - # -- update m_i and l_i - m_i_new = tl.maximum(m_i, m_ij) - alpha = tl.exp(m_i - m_i_new) - beta = tl.exp(m_ij - m_i_new) - l_i_new = alpha * l_i + beta * l_ij - # -- update output accumulator -- - # scale p - p_scale = beta / l_i_new - p = p * p_scale[:, None] - # scale acc - acc_scale = l_i / l_i_new * alpha - acc = acc * acc_scale[:, None] - # update acc - v = tl.load( - v_ptrs + (cur_batch_in_kv_start_index + start_n) * stride_vbs, - mask=(start_n + offs_n[:, None]) < cur_batch_kv_len, - other=0.0, - ) - - p = p.to(v.dtype) - acc += tl.dot(p, v) - # update m_i and l_i - l_i = l_i_new - m_i = m_i_new - # initialize pointers to output - off_o = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + - cur_head * stride_oh + offs_d[None, :] * stride_od) - out_ptrs = Out + off_o - tl.store(out_ptrs, acc, mask=offs_m[:, None] < cur_batch_seq_len) - - -@torch.no_grad() -def context_attention_fwd( - q: Tensor, - k: Tensor, - v: Tensor, - o: Tensor, - b_start_loc: Tensor, - b_seq_len: Tensor, - b_kv_start_loc: Tensor, - b_kv_seq_len: Tensor, - max_input_len: int, - BLOCK: int = 64, -): - """Context Attention forward. - - Args: - q (Tensor): Query state. - k (Tensor): Key state caches. - v (Tensor): Value state caches. - o (Tensor): Output state. - b_start_loc (Tensor): Start token location of each data in batch. - b_seq_len (Tensor): Query length for each data in batch. - b_kv_start_loc (Tensor): Start token location of kv in each data - in batch. - b_kv_seq_len (Tensor): Key/Value length for each data in batch. - max_input_len (int): The max input length. - BLOCK (int): The kernel block size. - """ - # shape constraints - Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] - assert Lq == Lk and Lk == Lv - assert Lk in {16, 32, 64, 128} - - sm_scale = 1.0 / (Lq**0.5) # 计算scale系数 - batch, head = b_seq_len.shape[0], q.shape[1] - kv_group_num = q.shape[1] // k.shape[1] - - grid = (batch, head, triton.cdiv(max_input_len, BLOCK)) # batch, head, - - num_warps = 4 if Lk <= 64 else 8 - _fwd_kernel[grid]( - q, - k, - v, - sm_scale, - b_start_loc, - b_seq_len, - b_kv_start_loc, - b_kv_seq_len, - o, - q.stride(0), - q.stride(1), - q.stride(2), - k.stride(0), - k.stride(1), - k.stride(2), - v.stride(0), - v.stride(1), - v.stride(2), - o.stride(0), - o.stride(1), - o.stride(2), - kv_group_num=kv_group_num, - BLOCK_M=BLOCK, - BLOCK_DMODEL=Lk, - BLOCK_N=BLOCK, - num_warps=num_warps, - num_stages=1, - ) - return diff --git a/lmdeploy/pytorch/kernels/multinomial_sampling.py b/lmdeploy/pytorch/kernels/multinomial_sampling.py index 9ad29c773b..44ed759052 100644 --- a/lmdeploy/pytorch/kernels/multinomial_sampling.py +++ b/lmdeploy/pytorch/kernels/multinomial_sampling.py @@ -21,8 +21,8 @@ def _multinomial_sampling_kernel(Scores, Seeds, Offsets, Indices, Outputs, offset = tl.load(Offsets + off, mask=off_mask).to(tl.int32) samp = tl.rand(seed, offset)[:, None] - acc = tl.zeros((BLOCK, ), dtype=Scores.dtype.element_ty) - output = tl.full((BLOCK, ), -1, dtype=tl.int64) + acc = tl.zeros((BLOCK, ), dtype=tl.float32) + output = tl.load(Indices + off * stride_ib, mask=off_mask) for b_idx in range(0, num_tokens, BLOCK_N): s_off = b_idx + n_off @@ -30,7 +30,7 @@ def _multinomial_sampling_kernel(Scores, Seeds, Offsets, Indices, Outputs, scores = tl.load(Scores + off[:, None] * stride_sb + s_off[None, :] * stride_st, mask=s_mask, - other=0.0) + other=0.0).to(acc.dtype) cum_scores = acc[:, None] + tl.cumsum(scores, 1) acc += tl.sum(scores, 1) @@ -75,7 +75,7 @@ def __kernel_meta(): assert indices.dim() == 2 assert indices.size() == scores.size() - outputs = indices.new_empty(batch_size) + outputs = indices[:, 0].clone() BLOCK = 32 BLOCK_N = 64 @@ -96,5 +96,5 @@ def __kernel_meta(): BLOCK=BLOCK, BLOCK_N=BLOCK_N, **kernel_meta) - torch.cuda.synchronize() + return outputs diff --git a/lmdeploy/pytorch/messages.py b/lmdeploy/pytorch/messages.py index b58415b65e..4bae383740 100644 --- a/lmdeploy/pytorch/messages.py +++ b/lmdeploy/pytorch/messages.py @@ -25,8 +25,8 @@ class SamplingParam: repetition_penalty: float = 1.0 ignore_eos: bool = False random_seed: int = None - stop_words: List[int] = None - bad_words: List[int] = None + stop_words: List[int] = field(default_factory=list) + bad_words: List[int] = field(default_factory=list) max_new_tokens: int = 512 min_new_tokens: int = 0 @@ -75,7 +75,7 @@ def from_gen_config(self, gen_config: EngineGenerationConfig): if max_new_tokens < 0: logger.warning('`max_new_tokens` has to be a strictly' f' positive value, but is {max_new_tokens}') - max_new_tokens = 0 + max_new_tokens = 512 if min_new_tokens < 0 or min_new_tokens > max_new_tokens: logger.warning('`min_new_tokens` has to be ' 'a int >=0 and <= `max_new_tokens`,' @@ -126,7 +126,8 @@ def __init__(self, session_id: int, block_size: int) -> None: def add_sequence(self, token_ids: Tensor, sampling_param: SamplingParam = None, - adapter_name: str = None) -> 'SchedulerSequence': + adapter_name: str = None, + return_logits: bool = False) -> 'SchedulerSequence': """Add a new message.""" if not isinstance(token_ids, Tensor): token_ids = torch.tensor(token_ids) @@ -143,7 +144,8 @@ def add_sequence(self, num_new_tokens=0, sampling_param=sampling_param, adapter_name=adapter_name, - arrive_time=time.time()) + arrive_time=time.time(), + return_logits=return_logits) self.sequences[seq.seq_id] = seq return seq @@ -174,6 +176,7 @@ def fork_sequence( adapter_name=seq.adapter_name, arrive_time=time.time(), meta=deepcopy(seq.meta), + return_logits=seq.return_logits, random_offsets=seq.random_offsets + 1) self.sequences[new_msg.seq_id] = new_msg @@ -198,6 +201,7 @@ class SchedulerSequence: adapter_name: str = None arrive_time: float = 0.0 meta: Any = None + return_logits: bool = False random_offsets: int = 0 @property diff --git a/lmdeploy/pytorch/models/falcon.py b/lmdeploy/pytorch/models/falcon.py index 60f663e485..aa63fb2625 100644 --- a/lmdeploy/pytorch/models/falcon.py +++ b/lmdeploy/pytorch/models/falcon.py @@ -12,71 +12,11 @@ from torch.distributed._tensor import DeviceMesh from transformers.modeling_outputs import \ BaseModelOutputWithPastAndCrossAttentions -from transformers.models.falcon.modeling_falcon import build_alibi_tensor from ..dist_utils import (colwise_parallelize_linear_fn, rowwise_parallelize_linear_fn) -from ..kernels import (alibi_paged_attention_fwd, fill_kv_cache, - paged_attention_fwd) - - -# rotary pos emb helpers -# (torch.jit.script does not seem to support staticmethod...) -def rotate_half(x): - x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=-1) - - -class PatchedFalconRotaryEmbedding(nn.Module): - """Implementation adapted from Huggingface transformers.""" - - def _patched_set_cos_sin_cache(self, seq_len, device, dtype): - self.seq_len_cached = seq_len - t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype) - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - emb = torch.cat((freqs, freqs), dim=-1).to(device) - - if dtype in [torch.float16, torch.bfloat16]: - emb = emb.float() - - self.cos_cached = emb.cos()[None, :, :] - self.sin_cached = emb.sin()[None, :, :] - - self.cos_cached = self.cos_cached.type(dtype) - self.sin_cached = self.sin_cached.type(dtype) - - def patched_cos_sin(self, - position_ids_1d: torch.Tensor, - device='cpu', - dtype=torch.bfloat16) -> torch.Tensor: - total_length = int(position_ids_1d.max().item()) + 1 - - if (self.seq_len_cached is None) or (total_length > - self.seq_len_cached): - self._patched_set_cos_sin_cache(total_length, device, dtype) - # position_ids.shape == [1, packed_seq_len] - # position_ids_1d.shape == [packed_seq_len] - return ( - self.cos_cached[:, position_ids_1d, None, :], - self.sin_cached[:, position_ids_1d, None, :], - ) - - def _contiguous_batching_forward(self, query: torch.Tensor, - key: torch.Tensor, - position_ids_1d: torch.Tensor): - # batch, seq_len, *_ = query.shape - cos, sin = self.patched_cos_sin(position_ids_1d, - device=query.device, - dtype=query.dtype) - return ( - (query * cos) + (rotate_half(query) * sin), - (key * cos) + (rotate_half(key) * sin), - ) - - def forward(self, query, key, position_ids_or_past_key_values_length=0): - """forward.""" - return self._contiguous_batching_forward( - query, key, position_ids_or_past_key_values_length) +from ..kernels import (alibi_paged_attention_fwd, apply_rotary_pos_emb, + fill_kv_cache, fused_rotary_emb, paged_attention_fwd) class PatchedFalconAttention(nn.Module): @@ -93,7 +33,7 @@ def _distribute_partition_fn(self, mod_name: str, mod: nn.Module, # e.g. 40b-instruct, GQA # split qkv across groups # no finer-grained partitioning - weight = mod.weight.reshape( + mod.weight.data = mod.weight.reshape( -1, # num groups (self.num_heads + self.num_kv_heads * 2) * self.head_dim, self.hidden_size, @@ -101,31 +41,22 @@ def _distribute_partition_fn(self, mod_name: str, mod: nn.Module, elif self.multi_query: # e.g. 7b-instruct, MQA # split to q, copy kv - weight = mod.weight.reshape( - -1, - self.head_dim, - self.hidden_size, - ) + weight = mod.weight.unflatten(0, (-1, self.head_dim)) q_weight = weight[:self.num_heads] - k_weight = weight[self.num_heads:self.num_heads + 1] - v_weight = weight[self.num_heads + 1:self.num_heads + 2] - q_weight_shards = torch.tensor_split(q_weight, - world_size, - dim=0) + kv_weight = weight[-2:] + q_weight_shards = q_weight.chunk(world_size, 0) weight_shards = [] for q in q_weight_shards: # only shard q heads but # copy single k/v head to all ranks weight_shards.append(q) - weight_shards.append(k_weight) - weight_shards.append(v_weight) + weight_shards.append(kv_weight) mod.weight.data = torch.cat(weight_shards, dim=0) # here we keep the weight to be 3D, # so that column parallel will split it # into integer-numbered heads # no bias for 7b-instruct and 40b-instruct - colwise_parallelize_linear_fn(mod, device_mesh=device_mesh, to_local=True) @@ -137,7 +68,7 @@ def _distribute_partition_fn(self, mod_name: str, mod: nn.Module, elif mod_name in ['dense']: if self.new_decoder_architecture: # e.g. 40b-instruct, GQA - weight = mod.weight.reshape( + mod.weight.data = mod.weight.reshape( self.hidden_size, -1, # num groups self.num_heads * self.head_dim, @@ -202,79 +133,80 @@ def _split_heads( 2, :] else: # e.g. 7b-instruct model - batch_size, seq_length, three_times_hidden_size = fused_qkv.shape - if not dist.is_initialized(): - num_head = self.num_heads - else: - # this trick will, for example, split 11 into [4, 4, 3] - # following the way column parallel linear splitting - # non-dividable dims - num_head = self.num_heads - dist.get_rank() - 1 - num_head = 1 + num_head // dist.get_world_size() - fused_qkv = fused_qkv.view(batch_size, seq_length, num_head + 2, - self.head_dim) - return fused_qkv[..., :-2, :], fused_qkv[..., [-2], :], fused_qkv[ - ..., [-1], :] + fused_qkv = fused_qkv.unflatten(-1, (-1, self.head_dim)) + split_shape = (fused_qkv.size(-2) - 2, 1, 1) + return fused_qkv.split(split_shape, dim=-2) def _contiguous_batching_forward( self, hidden_states: torch.Tensor, - position_ids: torch.Tensor, alibi: Optional[torch.Tensor], - attention_mask: torch.Tensor, layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - head_mask: Optional[torch.Tensor] = None, - use_cache: bool = False, output_attentions: bool = False, ): # prepare inputs for continuous batch forwarding context = self.context.context - - history_lengths = context.history_lengths q_start_loc = context.q_start_loc q_seq_length = context.q_seq_length - history_lengths = q_seq_length.new_tensor(history_lengths) - kv_seq_length = q_seq_length + history_lengths - max_seq_len = q_seq_length.max().item() - - fused_qkv = self.query_key_value( - hidden_states) # [batch_size, seq_length, 3 x hidden_size] + history_lengths = context.history_lengths + kv_seq_length = context.kv_seq_length + max_seq_length = context.max_seq_length + block_offsets = context.block_offsets + position_ids_1d = context.position_ids_1d + + def __maybe_rotary_fn(query_states, key_states, value_states): + scaling_factor = 1.0 + inv_freq = self.maybe_rotary.inv_freq + query_states, key_states = fused_rotary_emb( + query_states[None], + key_states[None], + position_ids_1d[None], + inv_freq=inv_freq, + scaling_factor=scaling_factor, + out_q=query_states[None], + out_k=key_states[None]) + return query_states[0], key_states[0], value_states + + def __rotary_emb_fn(query_states, key_states, value_states): + """rotary embedding func.""" + kv_seq_len = max_seq_length + max(history_lengths) + + cos, sin = self.rotary_emb(value_states.transpose(0, 1), + kv_seq_len) + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin, context.position_ids, + position_ids_1d) + return query_states, key_states, value_states + + fused_qkv = self.query_key_value(hidden_states) # 3 x [batch_size, seq_length, num_heads, head_dim] (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) - batch_size, query_length, _, _ = query_layer.shape - - if isinstance(self.maybe_rotary, nn.Module): - position_ids_1d = self.context.context.position_ids_1d - query_layer, key_layer = self.maybe_rotary(query_layer, key_layer, - position_ids_1d) - - if layer_past is not None: - past_key, past_value = layer_past - # concatenate along seq_length dimension: - # - key: [batch_size * self.num_heads, kv_length, head_dim] - # - value: [batch_size * self.num_heads, kv_length, head_dim] - - fill_kv_cache(key_layer.contiguous(), - value_layer.contiguous(), - past_key, - past_value, - q_start_loc, - q_seq_length, - block_offsets=context.block_offsets, - history_lengths=context.history_lengths, - context=context) - - if use_cache: - present = (key_layer, value_layer) - else: - present = None - - attn_output = torch.empty_like(query_layer) - block_offsets = context.block_offsets - - if alibi is None: + query_layer = query_layer.flatten(0, 1) + key_layer = key_layer.flatten(0, 1) + value_layer = value_layer.flatten(0, 1) + if hasattr(self, 'maybe_rotary'): + query_layer, key_layer, value_layer = __maybe_rotary_fn( + query_layer, key_layer, value_layer) + elif hasattr(self, 'rotary_emb'): + query_layer, key_layer, value_layer = __rotary_emb_fn( + query_layer, key_layer, value_layer) + + past_key, past_value = layer_past + fill_kv_cache(key_layer.contiguous(), + value_layer.contiguous(), + past_key, + past_value, + q_start_loc, + q_seq_length, + block_offsets=block_offsets, + history_lengths=history_lengths, + context=context) + + attn_output = query_layer + + if not alibi: paged_attention_fwd(q=query_layer, k=past_key, v=past_value, @@ -283,9 +215,15 @@ def _contiguous_batching_forward( q_start_loc=q_start_loc, q_seqlens=q_seq_length, kv_seqlens=kv_seq_length, - max_seqlen=max_seq_len) + max_seqlen=max_seq_length) else: + num_heads_full = self.num_heads + head_offset = 0 + if dist.is_initialized(): + world_size = dist.get_world_size() + rank = dist.get_rank() + head_offset = self.num_heads // world_size * rank alibi_paged_attention_fwd(q=query_layer, k=past_key, v=past_value, @@ -294,17 +232,18 @@ def _contiguous_batching_forward( b_start_loc=q_start_loc, b_seq_len=q_seq_length, b_kv_seq_len=kv_seq_length, - max_input_len=max_seq_len, + max_input_len=max_seq_length, + head_offset=head_offset, + num_heads=num_heads_full, alibi_scale=self.inv_norm_factor) - attn_output = attn_output.reshape(batch_size, query_length, -1) - + attn_output = attn_output[None].flatten(-2, -1) output_tensor = self.dense(attn_output) if output_attentions: - return output_tensor, present, None + return output_tensor, layer_past, None else: - return output_tensor, present + return output_tensor, layer_past def forward( self, @@ -316,11 +255,8 @@ def forward( use_cache: bool = False, output_attentions: bool = False, ): - position_ids = self.context.context.position_ids - return self._contiguous_batching_forward(hidden_states, position_ids, - alibi, attention_mask, - layer_past, head_mask, - use_cache, output_attentions) + return self._contiguous_batching_forward(hidden_states, alibi, + layer_past) class PatchedFalconMLP(nn.Module): @@ -350,41 +286,17 @@ class PatchedFalconModel(nn.Module): def _contiguous_batching_forward( self, input_ids: Optional[torch.LongTensor] = None, - # position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.LongTensor] = None, inputs_embeds: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: - # history_lengths = self.context.context.history_lengths - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions # noqa - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - use_cache = use_cache if use_cache is not None else self.config.use_cache # noqa - return_dict = return_dict if return_dict is not None else self.config.use_return_dict # noqa + output_attentions = False + use_cache = True use_alibi = getattr(self, 'use_alibi', getattr(self, 'alibi', False)) - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - 'You cannot specify both input_ids and inputs_embeds at the same time' # noqa - ) - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError( - 'You have to specify either input_ids or inputs_embeds') - # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape batch_size x num_heads x N x N @@ -397,63 +309,35 @@ def _contiguous_batching_forward( hidden_states = inputs_embeds - # presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - # Compute alibi tensor: check build_alibi_tensor documentation - if use_alibi: - alibi = build_alibi_tensor(attention_mask, - self.num_heads, - dtype=hidden_states.dtype) - else: - alibi = None for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) outputs = block( hidden_states, - # position_ids=position_ids, layer_past=layer_past, attention_mask=None, head_mask=head_mask[i], use_cache=use_cache, output_attentions=output_attentions, - alibi=alibi, + alibi=use_alibi, ) hidden_states = outputs[0] - if output_attentions: - all_self_attentions = all_self_attentions + ( - outputs[2 if use_cache else 1], ) - # Add last hidden state - hidden_states = self.ln_f(hidden_states) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - if not return_dict: - return tuple(v for v in [ - hidden_states, past_key_values, all_hidden_states, - all_self_attentions - ] if v is not None) - return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=past_key_values, - hidden_states=all_hidden_states, - attentions=all_self_attentions, + hidden_states=None, + attentions=None, ) def forward( self, input_ids: Optional[torch.LongTensor] = None, - # position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, attention_mask: Optional[torch.Tensor] = None, @@ -466,12 +350,7 @@ def forward( ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: return self._contiguous_batching_forward( - input_ids=input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict) + input_ids=input_ids, past_key_values=past_key_values) class PatchedFalconForCausalLM(nn.Module): diff --git a/lmdeploy/pytorch/models/gemma.py b/lmdeploy/pytorch/models/gemma.py index b40f142279..67366a2cce 100644 --- a/lmdeploy/pytorch/models/gemma.py +++ b/lmdeploy/pytorch/models/gemma.py @@ -100,7 +100,7 @@ def __rotary_emb_fn(query_states, key_states, value_states): scaling_factor=scaling_factor, out_q=query_states[None], out_k=key_states[None]) - return query_states, key_states, value_states + return query_states[0], key_states[0], value_states query_states, key_states, value_states = __qkv_proj(hidden_states) diff --git a/lmdeploy/pytorch/models/llama.py b/lmdeploy/pytorch/models/llama.py index 79037818a9..20e01402cd 100644 --- a/lmdeploy/pytorch/models/llama.py +++ b/lmdeploy/pytorch/models/llama.py @@ -245,7 +245,7 @@ def __rotary_emb_fn_438_fused(query_states, key_states, value_states): scaling_factor=scaling_factor, out_q=query_states[None], out_k=key_states[None]) - return query_states, key_states, value_states + return query_states[0], key_states[0], value_states def __rotary_emb_fn_438(query_states, key_states, value_states): rotary_name = type(self.rotary_emb).__name__ diff --git a/lmdeploy/pytorch/models/module_map.py b/lmdeploy/pytorch/models/module_map.py index ec12db19ac..30f463dccd 100644 --- a/lmdeploy/pytorch/models/module_map.py +++ b/lmdeploy/pytorch/models/module_map.py @@ -30,8 +30,6 @@ f'{LMDEPLOY_PYTORCH_MODEL_PATH}.falcon.PatchedFalconAttention', 'modeling_falcon.FalconModel': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.falcon.PatchedFalconModel', - 'modeling_falcon.FalconRotaryEmbedding': - f'{LMDEPLOY_PYTORCH_MODEL_PATH}.falcon.PatchedFalconRotaryEmbedding', 'modeling_falcon.FalconMLP': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.falcon.PatchedFalconMLP', 'modeling_falcon.FalconForCausalLM': diff --git a/lmdeploy/pytorch/models/patch.py b/lmdeploy/pytorch/models/patch.py index fb7c66607e..b4e9573718 100644 --- a/lmdeploy/pytorch/models/patch.py +++ b/lmdeploy/pytorch/models/patch.py @@ -212,6 +212,8 @@ def _register_hooks(): lambda mod, inputs, outputs: output_fn(outputs, device_mesh)) for name, child in model.named_children(): + if rank == 0: + logger.debug(f'Distribute module: <{name}>') new_child = _dist_model(child, rank, device_mesh) if new_child != child: model.register_module(name, child) diff --git a/lmdeploy/pytorch/supported_models.py b/lmdeploy/pytorch/supported_models.py new file mode 100644 index 0000000000..1941291573 --- /dev/null +++ b/lmdeploy/pytorch/supported_models.py @@ -0,0 +1,92 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from transformers import AutoConfig + +from lmdeploy.utils import get_logger + +logger = get_logger('lmdeploy') + +_SUPPORTED_ARCHS = dict( + # baichuan-7b + BaiChuanForCausalLM=False, + # baichuan2-7b, baichuan-13b, baichuan2-13b + BaichuanForCausalLM=True, + # chatglm2-6b, chatglm3-6b + ChatGLMModel=True, + # deepseek-moe + DeepseekForCausalLM=True, + # falcon-7b + FalconForCausalLM=True, + # gemma-7b + GemmaForCausalLM=True, + # internlm + InternLMForCausalLM=True, + # internlm2 + InternLM2ForCausalLM=True, + # internlm-xcomposer + InternLMXComposerForCausalLM=False, + # internlm2-xcomposer + InternLM2XComposerForCausalLM=False, + # llama, llama2, alpaca, vicuna, codellama, ultracm, yi, + # deepseek-coder, deepseek-llm + LlamaForCausalLM=True, + # Mistral-7B + MistralForCausalLM=True, + # Mixtral-8x7B + MixtralForCausalLM=True, + # Qwen 7B-72B, Qwen-VL-7B + QWenLMHeadModel=False, + # Qwen1.5 7B-72B + Qwen2ForCausalLM=True, +) + + +def is_supported(model_path: str): + """Check whether supported by pytorch engine. + + Args: + model_path (str): the path of a model. + It could be one of the following options: + - i) A local directory path of a turbomind model which is + converted by `lmdeploy convert` command or download from + ii) and iii). + - ii) The model_id of a lmdeploy-quantized model hosted + inside a model repo on huggingface.co, such as + "InternLM/internlm-chat-20b-4bit", + "lmdeploy/llama2-chat-70b-4bit", etc. + - iii) The model_id of a model hosted inside a model repo + on huggingface.co, such as "internlm/internlm-chat-7b", + "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" + and so on. + Returns: + support_by_torch (bool): Whether input model is supported by pytorch engine + """ # noqa: E501 + import os + + support_by_torch = False + + triton_model_path = os.path.join(model_path, 'triton_models') + if os.path.exists(triton_model_path): + logger.warning(f'{model_path} seems to be a turbomind workspace, ' + 'which can only be ran with turbomind engine.') + else: + cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + + if hasattr(cfg, 'architectures'): + arch = cfg.architectures[0] + elif hasattr(cfg, + 'auto_map') and 'AutoModelForCausalLM' in cfg.auto_map: + arch = cfg.auto_map['AutoModelForCausalLM'].split('.')[-1] + else: + raise RuntimeError( + f'Could not find model architecture from config: {cfg}') + + if arch in _SUPPORTED_ARCHS: + support_by_torch = _SUPPORTED_ARCHS[arch] + # special cases + if arch == 'BaichuanForCausalLM': + # baichuan-13B not supported by pytorch + if cfg.num_attention_heads == 40 and cfg.vocab_size == 64000: + support_by_torch = False + + return support_by_torch diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 53402b133c..3e622d604c 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -4,7 +4,7 @@ import os import random from argparse import ArgumentError -from contextlib import contextmanager +from contextlib import asynccontextmanager from queue import Empty, Queue from threading import Thread from typing import Dict, List, Literal, Optional, Union @@ -59,7 +59,7 @@ def _config_model_name(config): raise ArgumentError(None, f'Please set model_name for {model_path}') else: - logger.warning(f'Best matched chat template name: {model_name}') + logger.info(f'matched chat template name: {model_name}') return model_name @@ -111,9 +111,10 @@ def __init__(self, chat_template_config: Optional[ChatTemplateConfig] = None, tp: int = 1, **kwargs) -> None: - logger.info(f'AsyncEngine init with backend={backend}, backend_config' - f'={backend_config}, chat_template_config=' - f'{chat_template_config}') + logger.info( + f'input backend={backend}, backend_config={backend_config}') + logger.info(f'input chat_template_config={chat_template_config}') + self.model_name = deduce_a_name(model_path, model_name, backend_config, chat_template_config) # build chat template config @@ -122,6 +123,7 @@ def __init__(self, elif chat_template_config.model_name is None: chat_template_config.model_name = self.model_name self.chat_template = chat_template_config.chat_template + # prevent bc for k in list(kwargs.keys()): if hasattr(chat_template_config, k): @@ -129,26 +131,26 @@ def __init__(self, 'chat_template_config instead') v = kwargs.pop(k) setattr(chat_template_config, k, v) + logger.info(f'updated chat_template_onfig={chat_template_config}') # build backend engine if backend == 'turbomind': - logger.info('Running turbomind engine for pipeline.') self._build_turbomind(model_path=model_path, backend_config=backend_config, chat_template_config=chat_template_config, tp=tp, **kwargs) elif backend == 'pytorch': - logger.info('Running pytorch engine for pipeline.') self._build_pytorch(model_path=model_path, backend_config=backend_config, **kwargs) else: raise ValueError(f'unsupported backend {backend}') + logger.info(f'updated backend_config={self.backend_config}') + # parameters for member functions - self.session_len = backend_config.session_len - self.backend_config = backend_config + self.session_len = self.backend_config.session_len self.stop_words = _stop_words(self.chat_template.stop_words, self.engine.tokenizer) if self.stop_words is not None: @@ -187,6 +189,7 @@ def _build_turbomind( engine_config=backend_config, chat_template_config=chat_template_config, **kwargs) + self.backend_config = backend_config def _build_pytorch( self, @@ -205,6 +208,7 @@ def _build_pytorch( backend_config.session_len = self.chat_template.session_len self.engine = Engine(model_path=model_path, engine_config=backend_config) + self.backend_config = backend_config def __call__(self, prompts: Union[List[str], str, List[Dict], List[List[Dict]]], @@ -253,30 +257,30 @@ def __call__(self, do_preprocess=do_preprocess, **kwargs) - def stop_session(self, session_id: int): + async def stop_session(self, session_id: int): """Stop a session by a session_id.""" if str(session_id) in self.id2generator: - self.id2generator[str(session_id)].cancel(session_id) + await self.id2generator[str(session_id)].async_cancel(session_id) self.gens_set.add(self.id2generator[str(session_id)]) self.running_session_ids.discard(session_id) - def end_session(self, session_id: int): + async def end_session(self, session_id: int): """Clear a session by a session_id.""" if str(session_id) in self.id2generator: - self.id2generator[str(session_id)].end(session_id) + await self.id2generator[str(session_id)].async_end(session_id) self.id2step[str(session_id)] = 0 self.gens_set.add(self.id2generator[str(session_id)]) self.running_session_ids.discard(session_id) - @contextmanager - def safe_run(self, session_id: Optional[int] = None): + @asynccontextmanager + async def safe_run(self, session_id: Optional[int] = None): """A context manager to make sure server's safe running.""" try: yield except (Exception, asyncio.CancelledError) as e: # noqa - self.stop_session(session_id) + await self.stop_session(session_id) raise e if str(session_id) in self.id2generator: self.gens_set.add(self.id2generator[str(session_id)]) @@ -288,7 +292,7 @@ async def get_generator(self, stop: bool, session_id: int): return self.engine.create_instance() # waiting no generator is available or the same session_id is running while self.gens_set == set() or session_id in self.running_session_ids: - await asyncio.sleep(0) + await asyncio.sleep(0.1) generator = self.gens_set.pop() self.id2generator[str(session_id)] = generator self.running_session_ids.add(session_id) @@ -489,10 +493,10 @@ async def generate( yield GenOut('', self.id2step[str(session_id)], len(input_ids), 0, finish_reason) if sequence_end is True and sequence_start is False: - self.end_session(session_id) + await self.end_session(session_id) else: generator = await self.get_generator(False, session_id) - with self.safe_run(session_id): + async with self.safe_run(session_id): state = DetokenizeState() async for outputs in generator.async_stream_infer( session_id=session_id, @@ -528,4 +532,4 @@ async def generate( # manually end pytorch session # TODO modify pytorch or turbomind api if self.backend == 'pytorch' and sequence_end: - self.end_session(session_id) + await self.end_session(session_id) diff --git a/lmdeploy/serve/gradio/turbomind_coupled.py b/lmdeploy/serve/gradio/turbomind_coupled.py index 0371c0b11f..be522cb2b3 100644 --- a/lmdeploy/serve/gradio/turbomind_coupled.py +++ b/lmdeploy/serve/gradio/turbomind_coupled.py @@ -78,7 +78,7 @@ async def reset_local_func(instruction_txtbox: gr.Textbox, """ state_chatbot = [] # end the session - InterFace.async_engine.end_session(session_id) + await InterFace.async_engine.end_session(session_id) return (state_chatbot, state_chatbot, gr.Textbox.update(value='')) @@ -94,12 +94,12 @@ async def cancel_local_func(state_chatbot: Sequence, cancel_btn: gr.Button, session_id (int): the session id """ yield (state_chatbot, disable_btn, disable_btn) - InterFace.async_engine.stop_session(session_id) + await InterFace.async_engine.stop_session(session_id) # pytorch backend does not support resume chat history now if InterFace.async_engine.backend == 'pytorch': yield (state_chatbot, disable_btn, enable_btn) else: - InterFace.async_engine.end_session(session_id) + await InterFace.async_engine.end_session(session_id) messages = [] for qa in state_chatbot: messages.append(dict(role='user', content=qa[0])) diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 8b8969c45e..97ea2d9939 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -216,7 +216,8 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: async for res in result_generator: if await raw_request.is_disconnected(): # Abort the request if the client disconnects. - VariableInterface.async_engine.stop_session(request.session_id) + await VariableInterface.async_engine.stop_session( + request.session_id) return create_error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected') final_res = res @@ -376,7 +377,8 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: async for res in result_generator: if await raw_request.is_disconnected(): # Abort the request if the client disconnects. - VariableInterface.async_engine.stop_session(request.session_id) + await VariableInterface.async_engine.stop_session( + request.session_id) return create_error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected') final_res = res @@ -522,7 +524,8 @@ async def _inner_call(i, generator): async for res in generator: if await raw_request.is_disconnected(): # Abort the request if the client disconnects. - VariableInterface.async_engine.stop_session(request.session_id) + await VariableInterface.async_engine.stop_session( + request.session_id) return create_error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected') final_res = res @@ -688,7 +691,8 @@ async def _inner_call(i, generator): async for res in generator: if await raw_request.is_disconnected(): # Abort the request if the client disconnects. - VariableInterface.async_engine.stop_session(request.session_id) + await VariableInterface.async_engine.stop_session( + request.session_id) return create_error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected') final_res = res @@ -825,7 +829,8 @@ async def stream_results() -> AsyncGenerator[bytes, None]: async for out in generation: if await raw_request.is_disconnected(): # Abort the request if the client disconnects. - VariableInterface.qos_engine.stop_session(request.session_id) + await VariableInterface.qos_engine.stop_session( + request.session_id) return create_error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected') text += out.response @@ -870,7 +875,8 @@ async def chat_interactive_v1(request: GenerateRequest, """ if request.cancel: if request.session_id != -1: - VariableInterface.async_engine.stop_session(request.session_id) + await VariableInterface.async_engine.stop_session( + request.session_id) return { 'text': '', 'tokens': 0, @@ -879,7 +885,7 @@ async def chat_interactive_v1(request: GenerateRequest, 'finish_reason': 'stop' } else: - create_error_response( + return create_error_response( HTTPStatus.BAD_REQUEST, 'please set a session_id to cancel a request') if request.session_id == -1: @@ -931,7 +937,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]: async for out in generation: if await raw_request.is_disconnected(): # Abort the request if the client disconnects. - async_engine.stop_session(request.session_id) + await async_engine.stop_session(request.session_id) return create_error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected') text += out.response @@ -1046,6 +1052,11 @@ def serve(model_path: str, VariableInterface.qos_engine.start() except FileNotFoundError: VariableInterface.qos_engine = None + else: + # hide qos functions if not applied + for i in range(len(app.router.routes)): + if 'qos' in app.router.routes[i].path: + app.router.routes[i].include_in_schema = False for i in range(3): print( diff --git a/lmdeploy/serve/qos_engine/qos_engine.py b/lmdeploy/serve/qos_engine/qos_engine.py index bd6c8bf8a7..1a311bc0f2 100644 --- a/lmdeploy/serve/qos_engine/qos_engine.py +++ b/lmdeploy/serve/qos_engine/qos_engine.py @@ -65,9 +65,9 @@ def is_qos_enabled(self): """check while qos engine is enabled.""" return self.qos_config.is_qos_enabled - def stop_session(self, session_id: int): + async def stop_session(self, session_id: int): """Stop a session by a session_id.""" - self.engine.stop_session(session_id) + await self.engine.stop_session(session_id) async def generate(self, request): """entry of qos engine generate for three api.""" diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py new file mode 100644 index 0000000000..99dd4400c4 --- /dev/null +++ b/lmdeploy/turbomind/supported_models.py @@ -0,0 +1,88 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from transformers import AutoConfig + +from lmdeploy.utils import get_logger + +logger = get_logger('lmdeploy') + +_SUPPORTED_ARCHS = dict( + # baichuan-7b + BaiChuanForCausalLM=True, + # baichuan2-7b, baichuan-13b, baichuan2-13b + BaichuanForCausalLM=True, + # chatglm2-6b, chatglm3-6b + ChatGLMModel=False, + # deepseek-moe + DeepseekForCausalLM=False, + # falcon-7b + FalconForCausalLM=False, + # gemma-7b + GemmaForCausalLM=False, + # internlm + InternLMForCausalLM=True, + # internlm2 + InternLM2ForCausalLM=True, + # internlm-xcomposer + InternLMXComposerForCausalLM=True, + # internlm2-xcomposer + InternLM2XComposerForCausalLM=False, + # llama, llama2, alpaca, vicuna, codellama, ultracm, yi, + # deepseek-coder, deepseek-llm + LlamaForCausalLM=True, + # Mistral-7B + MistralForCausalLM=False, + # Mixtral-8x7B + MixtralForCausalLM=False, + # Qwen 7B-72B, Qwen-VL-7B + QWenLMHeadModel=True, + # Qwen1.5 7B-72B + Qwen2ForCausalLM=False) + + +def is_supported(model_path: str): + """Check whether supported by turbomind engine. + + Args: + model_path (str): the path of a model. + It could be one of the following options: + - i) A local directory path of a turbomind model which is + converted by `lmdeploy convert` command or download from + ii) and iii). + - ii) The model_id of a lmdeploy-quantized model hosted + inside a model repo on huggingface.co, such as + "InternLM/internlm-chat-20b-4bit", + "lmdeploy/llama2-chat-70b-4bit", etc. + - iii) The model_id of a model hosted inside a model repo + on huggingface.co, such as "internlm/internlm-chat-7b", + "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" + and so on. + Returns: + support_by_turbomind (bool): Whether input model is supported by turbomind engine + """ # noqa: E501 + import os + + support_by_turbomind = False + triton_model_path = os.path.join(model_path, 'triton_models') + if os.path.exists(triton_model_path): + support_by_turbomind = True + else: + cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + + if hasattr(cfg, 'architectures'): + arch = cfg.architectures[0] + elif hasattr(cfg, + 'auto_map') and 'AutoModelForCausalLM' in cfg.auto_map: + arch = cfg.auto_map['AutoModelForCausalLM'].split('.')[-1] + else: + raise RuntimeError( + f'Could not find model architecture from config: {cfg}') + + if arch in _SUPPORTED_ARCHS: + support_by_turbomind = _SUPPORTED_ARCHS[arch] + # special cases + if arch == 'BaichuanForCausalLM': + num_attn_head = cfg.num_attention_heads + if num_attn_head == 40: + # baichuan-13B, baichuan2-13B not supported by turbomind + support_by_turbomind = False + return support_by_turbomind diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index cc4281f723..77032e61d3 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -566,6 +566,11 @@ def end(self, session_id: int): sequence_end=True): pass + async def async_end(self, session_id: int): + """End the given session.""" + self.end(session_id) + await asyncio.sleep(0.002) + def cancel(self, session_id: int): """Stop current streaming inference.""" input_ids = [self.tm_model.tokenizer.eos_token_id] @@ -578,6 +583,11 @@ def cancel(self, session_id: int): stop=True): pass + async def async_cancel(self, session_id: int): + """End the given session.""" + self.cancel(session_id) + await asyncio.sleep(0.002) + def prepare_inputs(self, session_id, input_ids, diff --git a/lmdeploy/version.py b/lmdeploy/version.py index d45eba82e4..277e7502ec 100644 --- a/lmdeploy/version.py +++ b/lmdeploy/version.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from typing import Tuple -__version__ = '0.2.4' +__version__ = '0.2.5' short_version = __version__ diff --git a/requirements/test.txt b/requirements/test.txt index d06440a9d7..bcdf679df3 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,7 +1,7 @@ allure-pytest coverage pynvml -pytest +pytest==8.0.2 pytest-assume pytest-order pytest-rerunfailures diff --git a/tests/pytorch/engine/test_request.py b/tests/pytorch/engine/test_request.py index 66dbd55783..813a30e8e7 100644 --- a/tests/pytorch/engine/test_request.py +++ b/tests/pytorch/engine/test_request.py @@ -1,29 +1,59 @@ +import asyncio + import pytest from lmdeploy.pytorch.engine.request import (RequestManager, RequestType, - ResponseType) + Response, ResponseType) class TestRequestHander: @pytest.fixture - def manager(self): - yield RequestManager() + def event_loop(self): + old_loop = asyncio.get_event_loop() + new_loop = asyncio.new_event_loop() + yield new_loop + new_loop.stop() + asyncio.set_event_loop(old_loop) - def _stop_engine_callback(self, reqs, **kwargs): - raise RuntimeError('stop_engine') + @pytest.fixture + def thread_safe(self, request): + yield request.param - def test_bind(self, manager): + @pytest.fixture + def manager(self, thread_safe): + yield RequestManager(thread_safe=thread_safe) + + @pytest.mark.parametrize('thread_safe', [True, False]) + def test_bind(self, manager, event_loop): + + def __stop_engine_callback(reqs, **kwargs): + for req in reqs: + manager.response( + Response(type=ResponseType.SUCCESS, + sender_id=req.sender_id, + req_id=req.req_id, + data=f'{req.data} success')) + + async def __dummy_loop(): + while True: + manager.step() + await asyncio.sleep(0.1) + + asyncio.set_event_loop(event_loop) sender = manager.build_sender() + manager.start_loop(__dummy_loop) # test not bind req_id = sender.send_async(RequestType.STOP_ENGINE, None) - manager.step() resp = sender.recv(req_id) assert resp.type == ResponseType.HANDLER_NOT_EXIST + assert manager.is_loop_alive() + # test bind success sender.send_async(RequestType.STOP_ENGINE, None) - manager.bind_func(RequestType.STOP_ENGINE, self._stop_engine_callback) - with pytest.raises(RuntimeError): - manager.step() + manager.bind_func(RequestType.STOP_ENGINE, __stop_engine_callback) + req_id = sender.send_async(RequestType.STOP_ENGINE, 'test') + resp = sender.recv(req_id) + assert resp.data == 'test success' diff --git a/tests/pytorch/kernel/test_multinomial_sampling.py b/tests/pytorch/kernel/test_multinomial_sampling.py index 4512a34879..f0f594dde6 100644 --- a/tests/pytorch/kernel/test_multinomial_sampling.py +++ b/tests/pytorch/kernel/test_multinomial_sampling.py @@ -19,10 +19,15 @@ def batch_size(self, select_ids): yield len(select_ids) @pytest.fixture - def scores(self, num_tokens, batch_size, select_ids): + def dtype(self, request): + yield request.param + + @pytest.fixture + def scores(self, num_tokens, batch_size, select_ids, dtype): ret = torch.zeros(batch_size, num_tokens).cuda() batch_ids = torch.arange(batch_size).cuda() ret[batch_ids, select_ids] = 1 + ret = ret.to(dtype) yield ret @pytest.fixture @@ -45,6 +50,8 @@ def gt(self, batch_size, select_ids, indices): batch_ids = torch.arange(batch_size).cuda() yield indices[batch_ids, select_ids] + @pytest.mark.parametrize('dtype', + [torch.float32, torch.half, torch.bfloat16]) @pytest.mark.parametrize(['num_tokens', 'select_ids'], [ (8, (4, 2) * 30), (200, (50, 150)), diff --git a/tests/pytorch/kernel/test_paged_attention.py b/tests/pytorch/kernel/test_paged_attention.py index 55175aee5f..3cd208f052 100644 --- a/tests/pytorch/kernel/test_paged_attention.py +++ b/tests/pytorch/kernel/test_paged_attention.py @@ -248,36 +248,6 @@ def test_paged_attention(self, conti_q, blocked_kv, block_offsets, max_seqlen=max_seq_len) torch.testing.assert_close(out, conti_gt, atol=1e-3, rtol=1e-5) - @pytest.mark.parametrize(['num_heads_q', 'num_heads_k'], [(4, 2)], - indirect=True) - @pytest.mark.parametrize(['seq_lens', 'history_lens'], - [([30, 50, 70, 90], [50, 40, 30, 20])], - indirect=True) - @pytest.mark.parametrize('block_size', [16], indirect=True) - def test_biased_paged_attention(self, conti_q, blocked_kv, block_offsets, - start_loc, seq_lens, history_lens, - block_size, mask, conti_gt): - from lmdeploy.pytorch.kernels import biased_paged_attention_fwd - kv_seq_lens = seq_lens + history_lens - max_seq_len = seq_lens.max().item() - - blocked_k, blocked_v = blocked_kv - out = torch.empty_like(conti_q) - - biased_paged_attention_fwd(conti_q, - blocked_k, - blocked_v, - mask, - out, - block_offsets=block_offsets, - b_start_loc=start_loc, - b_seq_len=seq_lens, - b_kv_seq_len=kv_seq_lens, - max_input_len=max_seq_len, - BLOCK=block_size) - - torch.testing.assert_close(out, conti_gt) - @pytest.fixture def win_size(self, request): yield request.param diff --git a/tests/test_lmdeploy/test_auto_backend.py b/tests/test_lmdeploy/test_auto_backend.py new file mode 100644 index 0000000000..b96ca4e3cb --- /dev/null +++ b/tests/test_lmdeploy/test_auto_backend.py @@ -0,0 +1,77 @@ +import os +import tempfile + +import numpy as np +import pytest + + +class TestAutoBackend: + + @pytest.fixture + def turbomind_workspace(self): + workspace = tempfile.TemporaryDirectory( + 'internlm-chat-7b-turbomind').name + os.makedirs(os.path.join(workspace, 'triton_models'), exist_ok=True) + return workspace + + @pytest.fixture + def models(self): + # example models to test + # format (model_path, is_pytorch_supported, is_turbomind_supported) + models = [ + ('baichuan-inc/Baichuan-7B', False, True), + ('baichuan-inc/Baichuan2-7B-Chat', True, True), + ('baichuan-inc/Baichuan-13B-Chat', False, False), + ('baichuan-inc/Baichuan2-13B-Chat', True, False), + ('internlm/internlm-chat-7b', True, True), + ('internlm/internlm2-chat-7b', True, True), + ('internlm/internlm-xcomposer2-7b', False, False), + ('internlm/internlm-xcomposer-7b', False, True), + ('THUDM/chatglm2-6b', True, False), + ('THUDM/chatglm3-6b', True, False), + ('deepseek-ai/deepseek-moe-16b-chat', True, False), + ('tiiuae/falcon-7b-instruct', True, False), + ('01-ai/Yi-34B-Chat', True, True), + ('codellama/CodeLlama-7b-Instruct-hf', True, True), + ('mistralai/Mistral-7B-Instruct-v0.1', True, False), + ('mistralai/Mixtral-8x7B-Instruct-v0.1', True, False), + ('Qwen/Qwen-7B-Chat', False, True), + ('Qwen/Qwen-VL-Chat', False, True), + ('Qwen/Qwen1.5-4B-Chat', True, False), + ] + return models + + def test_pytorch_is_suppored(self, turbomind_workspace, models): + from lmdeploy.pytorch.supported_models import is_supported + assert is_supported(turbomind_workspace) is False + for m, flag, _ in models: + assert is_supported(m) is flag + + def test_turbomind_is_suppored(self, turbomind_workspace, models): + from lmdeploy.turbomind.supported_models import is_supported + assert is_supported(turbomind_workspace) is True + for m, _, flag in models: + assert is_supported(m) is flag + + def test_autoget_backend(self, turbomind_workspace, models): + from lmdeploy.archs import autoget_backend + assert autoget_backend(turbomind_workspace) == 'turbomind' + n = len(models) + choices = np.random.choice(n, n // 2, replace=False) + for i in choices: + model, is_support_pytorch, is_support_turbomind = models[i] + target = 'turbomind' if is_support_turbomind else 'pytorch' + backend = autoget_backend(model) + assert backend == target + + def test_autoget_backend_config(self, turbomind_workspace): + from lmdeploy.archs import autoget_backend_config + from lmdeploy.messages import (PytorchEngineConfig, + TurbomindEngineConfig) + assert type(autoget_backend_config( + turbomind_workspace)) is TurbomindEngineConfig + assert type(autoget_backend_config( + 'internlm/internlm-chat-7b')) is TurbomindEngineConfig + assert type( + autoget_backend_config( + 'mistralai/Mistral-7B-Instruct-v0.1')) is PytorchEngineConfig