diff --git a/.github/md-link-config.json b/.github/md-link-config.json index 469ac707a6..3b9bca0dcc 100644 --- a/.github/md-link-config.json +++ b/.github/md-link-config.json @@ -17,6 +17,15 @@ }, { "pattern": "^http://localhost" + }, + { + "pattern": "^https://twitter.com" + }, + { + "pattern": "^https://platform.openai.com" + }, + { + "pattern": "^http://0.0.0.0" } ], "httpHeaders": [ diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index afc94417f8..f2279536ad 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -2,8 +2,29 @@ name: daily_ete_test on: workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository organization name. Default is InternLM' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' + backend: + required: true + description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"' + type: string + default: "['turbomind', 'pytorch']" + model: + required: true + description: 'Set testcase module filter: chat, restful, pipeline, quantization. Default contains all models' + type: string + default: "['quantization','convert','pipeline','restful','chat','interface-pipeline']" schedule: - - cron: '00 18 * * *' + - cron: '00 21 * * *' env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache @@ -13,7 +34,7 @@ env: jobs: test_functions: runs-on: [self-hosted, linux-a100] - timeout-minutes: 420 + timeout-minutes: 240 env: REPORT_DIR: /nvme/qa_test_models/test-reports container: @@ -23,6 +44,7 @@ jobs: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/bigdisk/qa_test_models:/mnt/bigdisk/qa_test_models - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Setup systems @@ -33,7 +55,10 @@ jobs: dpkg -i /root/packages/allure_2.24.1-1_all.deb rm -rf /var/lib/apt/lists/* - name: Clone repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} - name: Install pytorch run: | python3 -m pip cache dir @@ -68,64 +93,89 @@ jobs: run: | python3 -m pip list lmdeploy check_env + rm -rf allure-results - name: Test lmdeploy - quantization w4a16 continue-on-error: true + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'quantization')) run: | pytest autotest/tools/quantization/test_quantization_w4a16.py -m 'not pr_test' -n 8 --alluredir=allure-results --clean-alluredir - name: Test lmdeploy - quantization kv int8 continue-on-error: true + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'quantization')) run: | pytest autotest/tools/quantization/test_quantization_kvint8.py -n 8 --alluredir=allure-results - name: Test lmdeploy - quantization w8a8 continue-on-error: true + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'quantization')) run: | pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=allure-results - name: Test lmdeploy - 
quantization kv int8 and w4a16 continue-on-error: true + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'quantization')) run: | pytest autotest/tools/quantization/test_quantization_kvint8_w4a16.py -n 8 --alluredir=allure-results - name: Test lmdeploy - convert continue-on-error: true + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'convert')) run: | - pytest autotest/tools/convert -m 'not pr_test' -n 6 --alluredir=allure-results --dist loadgroup - - name: Test lmdeploy - interface turbomind case + pytest autotest/tools/convert -m 'not pr_test' -n 8 --alluredir=allure-results + - name: Test lmdeploy - chat workspace continue-on-error: true + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'chat')) timeout-minutes: 20 run: | - pytest autotest/interface/pipeline/test_pipeline_turbomind_func.py -m 'not pr_test' --alluredir=allure-results - - name: Test lmdeploy - pipeline turbomind - continue-on-error: true - timeout-minutes: 45 - run: pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'not pr_test' --alluredir=allure-results - - name: Test lmdeploy - pipeline torch + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results + - name: Test lmdeploy - chat hf turbomind continue-on-error: true - timeout-minutes: 75 - run: pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'not pr_test' --alluredir=allure-results - - name: Test lmdeploy - restful turbomind + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'chat')) + timeout-minutes: 20 + run: | + pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results + pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results + - name: Test lmdeploy - chat hf torch continue-on-error: true - timeout-minutes: 60 - run: pytest autotest/tools/restful/test_restful_chat_turbomind.py -m 'not pr_test' --alluredir=allure-results - - name: Test lmdeploy - restful torch + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'chat')) + timeout-minutes: 20 + run: | + pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results + pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results + - name: Test lmdeploy - pipeline turbomind continue-on-error: true - timeout-minutes: 80 - run: pytest autotest/tools/restful/test_restful_chat_pytorch.py -m 'not pr_test' --alluredir=allure-results - - name: Test lmdeploy - chat workspace + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'pipeline')) + timeout-minutes: 25 + run: | + pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 
--alluredir=allure-results + pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results + - name: Test lmdeploy - restful turbomind continue-on-error: true + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'restful')) timeout-minutes: 30 run: | - pytest autotest/tools/chat/test_command_chat_workspace.py -m 'not pr_test' -n 4 --alluredir=allure-results - - name: Test lmdeploy - chat hf turbomind + pytest autotest/tools/restful/test_restful_chat_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results + pytest autotest/tools/restful/test_restful_chat_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results + - name: Test lmdeploy - interface pipeline turbomind case continue-on-error: true - timeout-minutes: 45 + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'interface-pipeline')) + timeout-minutes: 20 run: | - pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'not pr_test' -n 4 --alluredir=allure-results - - name: Test lmdeploy - chat hf torch + pytest autotest/interface/pipeline/test_pipeline_turbomind_func.py -m 'not pr_test' --alluredir=allure-results + - name: Test lmdeploy - pipeline torch + continue-on-error: true + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'pipeline')) + timeout-minutes: 25 + run: | + pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results + pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results + - name: Test lmdeploy - restful torch continue-on-error: true - timeout-minutes: 60 + if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'restful')) + timeout-minutes: 40 run: | - pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'not pr_test' -n 4 --alluredir=allure-results + pytest autotest/tools/restful/test_restful_chat_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results + pytest autotest/tools/restful/test_restful_chat_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results - name: Test lmdeploy - rerun all fail cases - timeout-minutes: 60 + timeout-minutes: 30 run: | pytest autotest --lf --alluredir=allure-results - name: Generate reports diff --git a/.github/workflows/pr_ete_test.yml b/.github/workflows/pr_ete_test.yml index a41e639f30..08bf24b4b7 100644 --- a/.github/workflows/pr_ete_test.yml +++ b/.github/workflows/pr_ete_test.yml @@ -34,6 +34,7 @@ jobs: - /nvme/share_data/github-actions/pip-cache:/root/.cache/pip - /nvme/share_data/github-actions/packages:/root/packages - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/bigdisk/qa_test_models:/mnt/bigdisk/qa_test_models - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Setup systems @@ -81,7 +82,7 @@ jobs: lmdeploy check_env - name: Test lmdeploy timeout-minutes: 120 - run: CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m pr_test --alluredir=allure-results --clean-alluredir + run: CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m pr_test -v -s --alluredir=allure-results --clean-alluredir - name: 
Generate reports
        if: always()
        run: |
diff --git a/README.md b/README.md
index 9f4f0d66b5..5cf1699902 100644
--- a/README.md
+++ b/README.md
@@ -1,20 +1,23 @@
-[![docs](https://img.shields.io/badge/docs-latest-blue)](https://lmdeploy.readthedocs.io/en/latest/)
-[![badge](https://github.com/InternLM/lmdeploy/workflows/lint/badge.svg)](https://github.com/InternLM/lmdeploy/actions)
 [![PyPI](https://img.shields.io/pypi/v/lmdeploy)](https://pypi.org/project/lmdeploy)
+![PyPI - Downloads](https://img.shields.io/pypi/dm/lmdeploy)
 [![license](https://img.shields.io/github/license/InternLM/lmdeploy.svg)](https://github.com/InternLM/lmdeploy/tree/main/LICENSE)
 [![issue resolution](https://img.shields.io/github/issues-closed-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues)
 [![open issues](https://img.shields.io/github/issues-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues)
+[📘Documentation](https://lmdeploy.readthedocs.io/en/latest/) |
+[🛠️Quick Start](https://lmdeploy.readthedocs.io/en/latest/get_started.html) |
+[🤔Reporting Issues](https://github.com/InternLM/lmdeploy/issues/new/choose)
+
 English | [简体中文](README_zh-CN.md)

-
+👋 join us on [![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=wechat&label=WeChat)](https://r.vansin.top/?r=internwx)
+[![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=twitter&label=Twitter)](https://twitter.com/intern_lm)
+[![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=discord&label=Discord)](https://discord.gg/xa29JuW87d)

-
- 👋 join us on Twitter, Discord and WeChat
-

+
 ______________________________________________________________________
@@ -23,6 +26,7 @@ ______________________________________________________________________
 2024

+- \[2024/02\] Support Qwen 1.5, Gemma, Mistral, Mixtral, Deepseek-MOE and so on.
 - \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) seamless integration with [LMDeploy Serving Service](./docs/en/serving/restful_api.md).
 - \[2024/01\] Support for multi-model, multi-machine, multi-card inference services. For usage instructions, please refer to [here](./docs/en/serving/proxy_server.md)
 - \[2024/01\] Support [PyTorch inference engine](./docs/en/inference/pytorch.md), developed entirely in Python, helping to lower the barriers for developers and enable rapid experimentation with new features and technologies.

diff --git a/README_zh-CN.md b/README_zh-CN.md
index 51155b819a..b7a0e61a69 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -1,20 +1,23 @@
-[![docs](https://img.shields.io/badge/docs-latest-blue)](https://lmdeploy.readthedocs.io/zh-cn/latest/)
-[![badge](https://github.com/InternLM/lmdeploy/workflows/lint/badge.svg)](https://github.com/InternLM/lmdeploy/actions)
 [![PyPI](https://img.shields.io/pypi/v/lmdeploy)](https://pypi.org/project/lmdeploy)
+![PyPI - Downloads](https://img.shields.io/pypi/dm/lmdeploy)
 [![license](https://img.shields.io/github/license/InternLM/lmdeploy.svg)](https://github.com/InternLM/lmdeploy/tree/main/LICENSE)
 [![issue resolution](https://img.shields.io/github/issues-closed-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues)
 [![open issues](https://img.shields.io/github/issues-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues)
+[📘Documentation](https://lmdeploy.readthedocs.io/zh-cn/latest/) |
+[🛠️Quick Start](https://lmdeploy.readthedocs.io/zh-cn/latest/get_started.html) |
+[🤔Reporting Issues](https://github.com/InternLM/lmdeploy/issues/new/choose)
+
 [English](README.md) | 简体中文

-
+👋 join us on [![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=wechat&label=WeChat)](https://r.vansin.top/?r=internwx)
+[![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=twitter&label=Twitter)](https://twitter.com/intern_lm)
+[![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=discord&label=Discord)](https://discord.gg/xa29JuW87d)

-
- 👋 join us on Twitter, Discord and WeChat
-

+
 ______________________________________________________________________
@@ -23,6 +26,7 @@ ______________________________________________________________________
2024 +- \[2024/02\] 支持 Qwen 1.5、Gemma、Mistral、Mixtral、Deepseek-MOE 等模型 - \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) 发布,支持无缝接入[LMDeploy Serving Service](./docs/zh_cn/serving/restful_api.md) - \[2024/01\] 支持多模型、多机、多卡推理服务。使用方法请参考[此处](./docs/zh_cn/serving/proxy_server.md) - \[2024/01\] 增加 [PyTorch 推理引擎](./docs/zh_cn/inference/pytorch.md),作为 TurboMind 引擎的补充。帮助降低开发门槛,和快速实验新特性、新技术 diff --git a/autotest/config.yaml b/autotest/config.yaml index 60100c6fa8..75988f2891 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -1,4 +1,4 @@ -model_path: /nvme/qa_test_models +model_path: /mnt/bigdisk/qa_test_models dst_path: /nvme/qa_test_models/autotest_model log_path: /nvme/qa_test_models/autotest_model/log dataset_path: /nvme/qa_test_models/...dataset @@ -13,67 +13,69 @@ tp_config: turbomind_model: - - llama-2-7b-chat - - internlm2-chat-1_8b - - internlm-chat-7b - - internlm-chat-20b - - internlm2-chat-7b - - internlm2-chat-20b - - Qwen-7B-Chat - - Qwen-14B-Chat - - llama2-chat-7b-w4 - - Baichuan2-7B-Chat - - Yi-6B-Chat - - internlm2-1_8b - - internlm2-20b - - CodeLlama-7b-Instruct-hf + - meta-llama/Llama-2-7b-chat + - internlm/internlm2-chat-1_8b + - internlm/internlm-chat-7b + - internlm/internlm-chat-20b + - internlm/internlm2-chat-7b + - internlm/internlm2-chat-20b + - internlm/internlm2-chat-7b-4bits + - internlm/internlm2-chat-20b-4bits + - Qwen/Qwen-7B-Chat + - Qwen/Qwen-14B-Chat + - lmdeploy/llama2-chat-7b-w4 + - baichuan-inc/Baichuan2-7B-Chat + - 01-ai/Yi-6B-Chat + - internlm/internlm2-1_8b + - internlm/internlm2-20b + - codellama/CodeLlama-7b-Instruct-hf pytorch_model: - - llama-2-7b-chat - - internlm-chat-7b - - internlm-chat-20b - - internlm2-chat-7b - - internlm2-chat-20b - - Baichuan2-7B-Chat - - Baichuan2-13B-Chat - - chatglm2-6b - - falcon-7b - - Yi-6B-Chat - - internlm2-1_8b - - internlm2-20b - - Qwen1.5-7B-Chat - - Mistral-7B-Instruct-v0.1 - - Mixtral-8x7B-Instruct-v0.1 - - gemma-7b-it - - deepseek-moe-16b-chat + - meta-llama/Llama-2-7b-chat + - internlm/internlm-chat-7b + - internlm/internlm-chat-20b + - internlm/internlm2-chat-7b + - internlm/internlm2-chat-20b + - baichuan-inc/Baichuan2-7B-Chat + - baichuan-inc/Baichuan2-13B-Chat + - THUDM/chatglm2-6b + - tiiuae/falcon-7b + - 01-ai/Yi-6B-Chat + - internlm/internlm2-1_8b + - internlm/internlm2-20b + - Qwen/Qwen1.5-7B-Chat + - mistralai/Mistral-7B-Instruct-v0.1 + - mistralai/Mixtral-8x7B-Instruct-v0.1 + - google/gemma-7b-it + - deepseek-ai/deepseek-moe-16b-chat quatization_case_config: w4a16: - - llama-2-7b-chat - - internlm-chat-20b - - Qwen-7B-Chat - - Qwen-14B-Chat - - internlm2-chat-20b - - Baichuan2-7B-Chat - - internlm2-20b + - meta-llama/Llama-2-7b-chat + - internlm/internlm-chat-20b + - Qwen/Qwen-7B-Chat + - Qwen/Qwen-14B-Chat + - internlm/internlm2-chat-20b + - baichuan-inc/Baichuan2-7B-Chat + - internlm/internlm2-20b kvint8: # more models are supported kvint8 quantization, but the chat response are not good, already removed - - llama-2-7b-chat - - internlm-chat-20b - - internlm2-chat-20b + - meta-llama/Llama-2-7b-chat + - internlm/internlm-chat-20b + - internlm/internlm2-chat-20b kvint8_w4a16: - - llama-2-7b-chat - - internlm-chat-20b - - internlm2-chat-20b - - internlm2-20b - - Qwen-7B-Chat - - Qwen-14B-Chat - - Baichuan2-7B-Chat + - meta-llama/Llama-2-7b-chat + - internlm/internlm-chat-20b + - internlm/internlm2-chat-20b + - internlm/internlm2-20b + - Qwen/Qwen-7B-Chat + - Qwen/Qwen-14B-Chat + - baichuan-inc/Baichuan2-7B-Chat w8a8: - - llama-2-7b-chat - - internlm-chat-20b - - internlm2-chat-20b - 
- internlm2-chat-7b - - Yi-6B-Chat - - internlm2-20b + - meta-llama/Llama-2-7b-chat + - internlm/internlm-chat-20b + - internlm/internlm2-chat-20b + - internlm/internlm2-chat-7b + - 01-ai/Yi-6B-Chat + - internlm/internlm2-20b diff --git a/autotest/interface/pipeline/test_pipeline_turbomind_func.py b/autotest/interface/pipeline/test_pipeline_turbomind_func.py index 8251fae347..64a07b3ddb 100644 --- a/autotest/interface/pipeline/test_pipeline_turbomind_func.py +++ b/autotest/interface/pipeline/test_pipeline_turbomind_func.py @@ -10,7 +10,7 @@ @pytest.mark.flaky(reruns=0) class TestPipelineTurbomindFuncRegression: - @pytest.mark.parametrize('model', ['internlm2-chat-20b']) + @pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def test_backend_config_tp(self, config, model): with pytest.raises(AssertionError, match='tp should be 2\\^n'): model_path = '/'.join([config.get('model_path'), model]) @@ -18,7 +18,7 @@ def test_backend_config_tp(self, config, model): pipe = pipeline(model_path, backend_config=backend_config) del pipe - @pytest.mark.parametrize('model', ['internlm2-chat-20b']) + @pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def test_backend_config_session_len(self, config, model): model_path = '/'.join([config.get('model_path'), model]) backend_config = TurbomindEngineConfig(session_len=10) @@ -29,7 +29,7 @@ def test_backend_config_session_len(self, config, model): assert response[i].finish_reason == 'length', str(response[i]) assert response[i].generate_token_len == 0, str(response[i]) - @pytest.mark.parametrize('model', ['internlm2-chat-20b']) + @pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def test_gen_config_test(self, config, model): model_path = '/'.join([config.get('model_path'), model]) pipe = pipeline(model_path) @@ -111,7 +111,7 @@ def test_gen_config_test(self, config, model): del pipe - @pytest.mark.parametrize('model', ['internlm2-chat-20b']) + @pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def future_test_backend_config_cache_max_entry_count(self, config, model): model_path = '/'.join([config.get('model_path'), model]) backend_config = TurbomindEngineConfig(cache_max_entry_count=-1) @@ -122,7 +122,7 @@ def future_test_backend_config_cache_max_entry_count(self, config, model): with assume: assert response[i].finish_reason == 'length', str(response[i]) - @pytest.mark.parametrize('model', ['internlm2-chat-20b']) + @pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def test_backend_config_max_batch_size2(self, config, model): model_path = '/'.join([config.get('model_path'), model]) backend_config = TurbomindEngineConfig(max_batch_size=-1) @@ -140,7 +140,7 @@ def test_backend_config_max_batch_size2(self, config, model): with assume: assert response[i].text == '', str(response[i]) - @pytest.mark.parametrize('model', ['internlm2-chat-20b']) + @pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def test_pipeline_batch_infer(self, config, model): model_path = '/'.join([config.get('model_path'), model]) pipe = pipeline(model_path) @@ -160,7 +160,7 @@ def test_pipeline_batch_infer(self, config, model): with assume: assert response[i].session_id == i - @pytest.mark.parametrize('model', ['internlm2-chat-20b']) + @pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def test_pipeline_stream_infer(self, config, model): model_path = '/'.join([config.get('model_path'), model]) pipe = pipeline(model_path) @@ -207,7 +207,7 @@ def test_pipeline_stream_infer(self, config, 
model): with assume: assert outputs_list[-1].finish_reason is not None, str(output) - @pytest.mark.parametrize('model', ['internlm2-chat-20b']) + @pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def test_pipeline_stream_infer2(self, config, model): model_path = '/'.join([config.get('model_path'), model]) pipe = pipeline(model_path) diff --git a/autotest/interface/pipeline/test_pipeline_turbomind_longtext_func.py b/autotest/interface/pipeline/test_pipeline_turbomind_longtext_func.py new file mode 100644 index 0000000000..13bfd8aff3 --- /dev/null +++ b/autotest/interface/pipeline/test_pipeline_turbomind_longtext_func.py @@ -0,0 +1,88 @@ +import pytest +from utils.get_run_config import get_tp_num + +from lmdeploy import TurbomindEngineConfig, pipeline + + +@pytest.mark.order(8) +@pytest.mark.pipeline_func +@pytest.mark.timeout(600) +class TestPipelineLongtextFunc: + + def test_long_test_chat_7b(self, config): + model = 'internlm/internlm2-chat-7b' + tp_config = get_tp_num(config, model) + model_path = '/'.join([config.get('model_path'), model]) + + backend_config = TurbomindEngineConfig(rope_scaling_factor=2.0, + session_len=210000, + tp=tp_config) + pipe = pipeline(model_path, backend_config=backend_config) + prompt = '今 天 心 ' * int(200000 / 6) + + # batch infer + pipe(prompt) + + # stream infer + for outputs in pipe.stream_infer(prompt): + continue + + prompts = ['今 天 心 ' * int(200000 / 6)] * 2 + # batch infer + pipe(prompts) + + # stream infer + for outputs in pipe.stream_infer(prompts): + continue + + def test_long_test_chat_20b(self, config): + model = 'internlm/internlm2-chat-20b' + tp_config = get_tp_num(config, model) + model_path = '/'.join([config.get('model_path'), model]) + + backend_config = TurbomindEngineConfig(rope_scaling_factor=2.0, + session_len=210000, + tp=tp_config) + pipe = pipeline(model_path, backend_config=backend_config) + prompt = '今 天 心 ' * int(200000 / 6) + + # batch infer + pipe(prompt) + + # stream infer + for outputs in pipe.stream_infer(prompt): + continue + + prompts = ['今 天 心 ' * int(200000 / 6)] * 2 + # batch infer + pipe(prompts) + + # stream infer + for outputs in pipe.stream_infer(prompts): + continue + + def test_long_test_20b(self, config): + model = 'internlm/internlm2-20b' + tp_config = get_tp_num(config, model) + model_path = '/'.join([config.get('model_path'), model]) + + backend_config = TurbomindEngineConfig(rope_scaling_factor=2.0, + session_len=210000, + tp=tp_config) + pipe = pipeline(model_path, backend_config=backend_config) + prompt = '今 天 心 ' * int(200000 / 6) + + # batch infer + pipe(prompt) + + # stream infer + for outputs in pipe.stream_infer(prompt): + continue + + prompts = ['今 天 心 ' * int(200000 / 6)] * 2 + # batch infer + pipe(prompts) + + # stream infer + for outputs in pipe.stream_infer(prompts): + continue diff --git a/autotest/prompt_case.yaml b/autotest/prompt_case.yaml index e1839ce3f2..ce5d174518 100644 --- a/autotest/prompt_case.yaml +++ b/autotest/prompt_case.yaml @@ -77,6 +77,9 @@ chinese_poem_case: - internlm2-20b: - len_g: 5 + - falcon: + - len_g: + 5 english_poem_case: - write a romantic English poem: - contain: @@ -110,6 +113,7 @@ emoji_case: - \u2714 - 赞 - emoji + - '!' 
traditional_chinese_case: - 使用繁體介紹香港維多利亞港: - contain: diff --git a/autotest/tools/chat/test_command_chat_hf_pytorch.py b/autotest/tools/chat/test_command_chat_hf_pytorch.py index 5854584122..f0f8e1c8b3 100644 --- a/autotest/tools/chat/test_command_chat_hf_pytorch.py +++ b/autotest/tools/chat/test_command_chat_hf_pytorch.py @@ -1,7 +1,8 @@ import allure import conftest import pytest -from utils.config_utils import get_torch_model_list +from utils.config_utils import (get_cuda_prefix_by_workerid, + get_torch_model_list) from utils.run_client_chat import hf_command_line_test conftest._init_cli_case_list() @@ -15,12 +16,40 @@ def getCaseList(): @pytest.mark.order(10) @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat +@pytest.mark.gpu_num_1 @pytest.mark.parametrize('usercase', getCaseList()) -@pytest.mark.parametrize('model', get_torch_model_list()) -def test_hf_pytorch_chat(config, model, cli_case_config, usercase): - result, chat_log, msg = hf_command_line_test(config, usercase, - cli_case_config.get(usercase), - model, 'torch') +@pytest.mark.parametrize('model', get_torch_model_list(tp_num=1)) +def test_hf_pytorch_chat_tp1(config, model, cli_case_config, usercase, + worker_id): + result, chat_log, msg = hf_command_line_test( + config, + usercase, + cli_case_config.get(usercase), + model, + 'torch', + cuda_prefix=get_cuda_prefix_by_workerid(worker_id)) + if chat_log is not None: + allure.attach.file(chat_log, + attachment_type=allure.attachment_type.TEXT) + + assert result, msg + + +@pytest.mark.order(10) +@pytest.mark.usefixtures('cli_case_config') +@pytest.mark.hf_pytorch_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('usercase', getCaseList()) +@pytest.mark.parametrize('model', get_torch_model_list(tp_num=2)) +def test_hf_pytorch_chat_tp2(config, model, cli_case_config, usercase, + worker_id): + result, chat_log, msg = hf_command_line_test( + config, + usercase, + cli_case_config.get(usercase), + model, + 'torch', + cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2)) if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -34,7 +63,7 @@ def test_hf_pytorch_chat(config, model, cli_case_config, usercase): @pytest.mark.pr_test @pytest.mark.xdist_group(name='pr_test') @pytest.mark.parametrize('usercase', getCaseList()) -@pytest.mark.parametrize('model', ['internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def test_hf_pytorch_chat_pr(config, model, cli_case_config, usercase): result, chat_log, msg = hf_command_line_test( config, diff --git a/autotest/tools/chat/test_command_chat_hf_turbomind.py b/autotest/tools/chat/test_command_chat_hf_turbomind.py index 3c889fd26d..3e763c0ef2 100644 --- a/autotest/tools/chat/test_command_chat_hf_turbomind.py +++ b/autotest/tools/chat/test_command_chat_hf_turbomind.py @@ -1,7 +1,8 @@ import allure import conftest import pytest -from utils.config_utils import get_turbomind_model_list +from utils.config_utils import (get_cuda_prefix_by_workerid, + get_turbomind_model_list) from utils.run_client_chat import hf_command_line_test conftest._init_cli_case_list() @@ -15,12 +16,41 @@ def getCaseList(): @pytest.mark.order(10) @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat +@pytest.mark.gpu_num_1 @pytest.mark.parametrize('usercase', getCaseList()) -@pytest.mark.parametrize('model', get_turbomind_model_list()) -def test_hf_turbomind_chat(config, model, cli_case_config, usercase): - result, chat_log, msg = 
hf_command_line_test(config, usercase, - cli_case_config.get(usercase), - model, 'turbomind') +@pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=1)) +def test_hf_turbomind_chat_tp1(config, model, cli_case_config, usercase, + worker_id): + result, chat_log, msg = hf_command_line_test( + config, + usercase, + cli_case_config.get(usercase), + model, + 'turbomind', + cuda_prefix=get_cuda_prefix_by_workerid(worker_id)) + + if chat_log is not None: + allure.attach.file(chat_log, + attachment_type=allure.attachment_type.TEXT) + + assert result, msg + + +@pytest.mark.order(10) +@pytest.mark.usefixtures('cli_case_config') +@pytest.mark.hf_turbomind_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('usercase', getCaseList()) +@pytest.mark.parametrize('model', get_turbomind_model_list(tp_num=2)) +def test_hf_turbomind_chat_tp2(config, model, cli_case_config, usercase, + worker_id): + result, chat_log, msg = hf_command_line_test( + config, + usercase, + cli_case_config.get(usercase), + model, + 'turbomind', + cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2)) if chat_log is not None: allure.attach.file(chat_log, @@ -36,7 +66,8 @@ def test_hf_turbomind_chat(config, model, cli_case_config, usercase): @pytest.mark.xdist_group(name='pr_test') @pytest.mark.parametrize('usercase', getCaseList()) @pytest.mark.parametrize( - 'model', ['internlm2-chat-20b', 'internlm2-chat-20b-inner-w4a16']) + 'model', + ['internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-inner-w4a16']) def test_hf_turbomind_chat_pr(config, model, cli_case_config, usercase): result, chat_log, msg = hf_command_line_test( config, diff --git a/autotest/tools/chat/test_command_chat_workspace.py b/autotest/tools/chat/test_command_chat_workspace.py index 34f0608783..26afeaf998 100644 --- a/autotest/tools/chat/test_command_chat_workspace.py +++ b/autotest/tools/chat/test_command_chat_workspace.py @@ -1,7 +1,8 @@ import allure import conftest import pytest -from utils.config_utils import get_turbomind_model_list +from utils.config_utils import (get_cuda_prefix_by_workerid, + get_turbomind_model_list) from utils.run_client_chat import command_line_test conftest._init_cli_case_list() @@ -12,9 +13,9 @@ def getPromptCaseList(): return prompt_list -def getModelList(): +def getModelList(tp_num): return [ - item for item in get_turbomind_model_list() + item for item in get_turbomind_model_list(tp_num) if 'kvint8' not in item.lower() ] @@ -22,12 +23,39 @@ def getModelList(): @pytest.mark.order(10) @pytest.mark.usefixtures('cli_case_config') @pytest.mark.command_chat +@pytest.mark.gpu_num_1 @pytest.mark.parametrize('usercase', getPromptCaseList()) -@pytest.mark.parametrize('model', getModelList()) -def test_workspace_chat(config, cli_case_config, usercase, model): - result, chat_log, msg = command_line_test(config, usercase, - cli_case_config.get(usercase), - model, 'turbomind', None) +@pytest.mark.parametrize('model', getModelList(tp_num=1)) +def test_workspace_chat_tp1(config, cli_case_config, usercase, model, + worker_id): + result, chat_log, msg = command_line_test( + config, + usercase, + cli_case_config.get(usercase), + model, + 'turbomind', + cuda_prefix=get_cuda_prefix_by_workerid(worker_id)) + if chat_log is not None: + allure.attach.file(chat_log, + attachment_type=allure.attachment_type.TEXT) + assert result, msg + + +@pytest.mark.order(10) +@pytest.mark.usefixtures('cli_case_config') +@pytest.mark.command_chat +@pytest.mark.gpu_num_2 +@pytest.mark.parametrize('usercase', getPromptCaseList()) 
+@pytest.mark.parametrize('model', getModelList(tp_num=2)) +def test_workspace_chat_tp2(config, cli_case_config, usercase, model, + worker_id): + result, chat_log, msg = command_line_test( + config, + usercase, + cli_case_config.get(usercase), + model, + 'turbomind', + cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2)) if chat_log is not None: allure.attach.file(chat_log, attachment_type=allure.attachment_type.TEXT) @@ -38,10 +66,10 @@ def test_workspace_chat(config, cli_case_config, usercase, model): @pytest.mark.usefixtures('cli_case_config') @pytest.mark.command_chat @pytest.mark.pr_test -@pytest.mark.xdist_group(name='pr_test') @pytest.mark.parametrize('usercase', getPromptCaseList()) @pytest.mark.parametrize( - 'model', ['internlm2-chat-20b', 'internlm2-chat-20b-inner-w4a16']) + 'model', + ['internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-inner-w4a16']) def test_workspace_chat_pr(config, cli_case_config, usercase, model): result, chat_log, msg = command_line_test( config, diff --git a/autotest/tools/convert/test_convert.py b/autotest/tools/convert/test_convert.py index 074ed8f93d..8fe8d30949 100644 --- a/autotest/tools/convert/test_convert.py +++ b/autotest/tools/convert/test_convert.py @@ -4,15 +4,16 @@ import allure import pytest -from utils.config_utils import get_turbomind_model_list +from utils.config_utils import (get_cuda_prefix_by_workerid, + get_turbomind_model_list) from utils.get_run_config import get_command_with_extra, get_model_name @pytest.mark.order(5) @pytest.mark.convert @pytest.mark.parametrize('model', get_turbomind_model_list()) -def test_convert(config, model): - convert(config, model) +def test_convert(config, model, worker_id): + convert(config, model, get_cuda_prefix_by_workerid(worker_id)) @pytest.mark.order(5) @@ -20,32 +21,40 @@ def test_convert(config, model): @pytest.mark.pr_test @pytest.mark.xdist_group(name='pr_test') @pytest.mark.parametrize( - 'model', ['internlm2-chat-20b', 'internlm2-chat-20b-inner-w4a16']) + 'model', + ['internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-inner-w4a16']) def test_convert_pr(config, model): - convert(config, model) + convert(config, model, 'CUDA_VISIBLE_DEVICES=5') -def convert(config, model_case): +def convert(config, model_case, cuda_prefix): origin_model_path = config.get('model_path') + '/' + model_case dst_path = config.get('dst_path') + '/workspace_' + model_case log_path = config.get('log_path') model_name = get_model_name(model_case) - if 'w4' in model_case: - cmd = get_command_with_extra( - ' '.join([ - 'lmdeploy convert', model_name, origin_model_path, - '--dst-path', dst_path, '--model-format awq --group-size 128' - ]), config, model_name, True) + if 'w4' in model_case or '4bits' in model_case: + cmd = get_command_with_extra(' '.join([ + 'lmdeploy convert', model_name, origin_model_path, '--dst-path', + dst_path, '--model-format awq --group-size 128' + ]), + config, + model_name, + True, + cuda_prefix=cuda_prefix) else: - cmd = get_command_with_extra( - ' '.join([ - 'lmdeploy convert', model_name, origin_model_path, - '--dst-path', dst_path - ]), config, model_name, True) + cmd = get_command_with_extra(' '.join([ + 'lmdeploy convert', model_name, origin_model_path, '--dst-path', + dst_path + ]), + config, + model_name, + True, + cuda_prefix=cuda_prefix) - convert_log = os.path.join(log_path, 'convert_' + model_case + '.log') + convert_log = os.path.join(log_path, + 'convert_' + model_case.split('/')[1] + '.log') print('reproduce command convert: ' + cmd + '\n') with 
open(convert_log, 'w') as f: # remove existing workspace diff --git a/autotest/tools/pipeline/pipeline_chat_script.py b/autotest/tools/pipeline/pipeline_chat_script.py index 70d4abcb36..f8a92d9b8f 100644 --- a/autotest/tools/pipeline/pipeline_chat_script.py +++ b/autotest/tools/pipeline/pipeline_chat_script.py @@ -30,7 +30,8 @@ def run_pipeline_chat_test(config, cases_info, model_case, tp, type): if 'pytorch' == type: backend_config = PytorchEngineConfig(tp=tp) else: - if 'kvint8' in model_case and 'w4' in model_case: + if 'kvint8' in model_case and ('w4' in model_case + or '4bits' in model_case): backend_config = TurbomindEngineConfig(tp=tp, model_format='awq', quant_policy=4) @@ -38,7 +39,7 @@ def run_pipeline_chat_test(config, cases_info, model_case, tp, type): backend_config = TurbomindEngineConfig(tp=tp, model_format='hf', quant_policy=4) - elif 'w4' in model_case: + elif 'w4' in model_case or '4bits' in model_case: backend_config = TurbomindEngineConfig(tp=tp, model_format='awq') else: backend_config = TurbomindEngineConfig(tp=tp) diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch.py index 5014f3a163..7e0318eebd 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch.py @@ -1,14 +1,15 @@ +import os from multiprocessing import Process import pytest -from utils.config_utils import get_torch_model_list +from utils.config_utils import get_cuda_id_by_workerid, get_torch_model_list from utils.pipeline_chat import (assert_pipeline_chat_log, run_pipeline_chat_test) -def getModelList(): +def getModelList(tp_num): return [ - item for item in get_torch_model_list() + item for item in get_torch_model_list(tp_num) if 'falcon' not in item.lower() and 'chatglm2' not in item.lower() ] @@ -16,9 +17,32 @@ def getModelList(): @pytest.mark.order(6) @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch +@pytest.mark.gpu_num_1 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('model', getModelList()) -def test_pipeline_chat_pytorch(config, common_case_config, model): +@pytest.mark.parametrize('model', getModelList(tp_num=1)) +def test_pipeline_chat_pytorch_tp1(config, common_case_config, model, + worker_id): + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + p = Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, 'pytorch')) + p.start() + p.join() + + # assert script + assert_pipeline_chat_log(config, common_case_config, model) + + +@pytest.mark.order(6) +@pytest.mark.usefixtures('common_case_config') +@pytest.mark.pipeline_chat_pytorch +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('model', getModelList(tp_num=2)) +def test_pipeline_chat_pytorch_tp2(config, common_case_config, model, + worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) p = Process(target=run_pipeline_chat_test, args=(config, common_case_config, model, 'pytorch')) p.start() @@ -33,7 +57,7 @@ def test_pipeline_chat_pytorch(config, common_case_config, model): @pytest.mark.pipeline_chat_pytorch @pytest.mark.flaky(reruns=0) @pytest.mark.pr_test -@pytest.mark.parametrize('model', ['internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def test_pipeline_chat_pytorch_pr(config, common_case_config, model): p = Process(target=run_pipeline_chat_test, args=(config, common_case_config, model, 
'pytorch')) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind.py index 90773d39bc..e12db44c1e 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind.py @@ -1,7 +1,8 @@ +import os from multiprocessing import Process import pytest -from utils.config_utils import get_turbomind_model_list +from utils.config_utils import get_all_model_list, get_cuda_id_by_workerid from utils.pipeline_chat import (assert_pipeline_chat_log, run_pipeline_chat_test) @@ -9,9 +10,29 @@ @pytest.mark.order(6) @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('model', get_turbomind_model_list()) -def test_pipeline_chat(config, common_case_config, model): +@pytest.mark.parametrize('model', get_all_model_list(tp_num=1)) +def test_pipeline_chat_tp1(config, common_case_config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id) + p = Process(target=run_pipeline_chat_test, + args=(config, common_case_config, model, 'turbomind')) + p.start() + p.join() + assert_pipeline_chat_log(config, common_case_config, model) + + +@pytest.mark.order(6) +@pytest.mark.usefixtures('common_case_config') +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('model', get_all_model_list(tp_num=2)) +def test_pipeline_chat_tp2(config, common_case_config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id, + tp_num=2) p = Process(target=run_pipeline_chat_test, args=(config, common_case_config, model, 'turbomind')) p.start() @@ -25,7 +46,8 @@ def test_pipeline_chat(config, common_case_config, model): @pytest.mark.flaky(reruns=0) @pytest.mark.pr_test @pytest.mark.parametrize( - 'model', ['internlm2-chat-20b', 'internlm2-chat-20b-inner-w4a16']) + 'model', + ['internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-inner-w4a16']) def test_pipeline_chat_pr(config, common_case_config, model): p = Process(target=run_pipeline_chat_test, args=(config, common_case_config, model, 'turbomind')) diff --git a/autotest/tools/quantization/test_quantization_kvint8.py b/autotest/tools/quantization/test_quantization_kvint8.py index 77957a676a..7c57d766e0 100644 --- a/autotest/tools/quantization/test_quantization_kvint8.py +++ b/autotest/tools/quantization/test_quantization_kvint8.py @@ -2,23 +2,23 @@ import allure import pytest +from utils.config_utils import get_cuda_prefix_by_workerid from utils.quantization_utils import quantization -model_list = [('llama-2-7b-chat', 'CUDA_VISIBLE_DEVICES=1'), - ('internlm-chat-20b', 'CUDA_VISIBLE_DEVICES=2'), - ('internlm2-chat-20b', 'CUDA_VISIBLE_DEVICES=3'), - ('Qwen-7B-Chat', 'CUDA_VISIBLE_DEVICES=4'), - ('Qwen-14B-Chat', 'CUDA_VISIBLE_DEVICES=5'), - ('internlm2-20b', 'CUDA_VISIBLE_DEVICES=6'), - ('Baichuan2-7B-Chat', 'CUDA_VISIBLE_DEVICES=7')] +model_list = [ + 'meta-llama/Llama-2-7b-chat', 'internlm/internlm-chat-20b', + 'internlm/internlm2-chat-20b', 'Qwen/Qwen-7B-Chat', 'Qwen/Qwen-14B-Chat', + 'internlm/internlm2-20b', 'baichuan-inc/Baichuan2-7B-Chat' +] @pytest.mark.order(1) @pytest.mark.quantization_kvint8 @pytest.mark.timeout(900) -@pytest.mark.parametrize('model, prefix', model_list) -def test_quantization_kvint8(config, model, prefix): - quantization_kvint8(config, model + '-inner-kvint8', 
model, prefix) +@pytest.mark.parametrize('model', model_list) +def test_quantization_kvint8(config, model, worker_id): + quantization_kvint8(config, model + '-inner-kvint8', model, + get_cuda_prefix_by_workerid(worker_id)) def quantization_kvint8(config, quantization_model_name, origin_model_name, @@ -29,9 +29,10 @@ def quantization_kvint8(config, quantization_model_name, origin_model_name, cuda_prefix) log_path = config.get('log_path') quantization_log = os.path.join( - log_path, - '_'.join(['quantization', quantization_type, quantization_model_name - ]) + '.log') + log_path, '_'.join([ + 'quantization', quantization_type, + quantization_model_name.split('/')[1] + ]) + '.log') allure.attach.file(quantization_log, attachment_type=allure.attachment_type.TEXT) diff --git a/autotest/tools/quantization/test_quantization_kvint8_w4a16.py b/autotest/tools/quantization/test_quantization_kvint8_w4a16.py index 9a1c5b6555..44dc1751fb 100644 --- a/autotest/tools/quantization/test_quantization_kvint8_w4a16.py +++ b/autotest/tools/quantization/test_quantization_kvint8_w4a16.py @@ -2,23 +2,26 @@ import allure import pytest +from utils.config_utils import get_cuda_prefix_by_workerid from utils.quantization_utils import quantization -model_list = [('llama-2-7b-chat-inner-kvint8', 'CUDA_VISIBLE_DEVICES=1'), - ('internlm-chat-20b-inner-kvint8', 'CUDA_VISIBLE_DEVICES=2'), - ('internlm2-chat-20b-inner-kvint8', 'CUDA_VISIBLE_DEVICES=3'), - ('Qwen-7B-Chat-inner-kvint8', 'CUDA_VISIBLE_DEVICES=4'), - ('Qwen-14B-Chat-inner-kvint8', 'CUDA_VISIBLE_DEVICES=5'), - ('internlm2-20b-inner-kvint8', 'CUDA_VISIBLE_DEVICES=6'), - ('Baichuan2-7B-Chat-inner-kvint8', 'CUDA_VISIBLE_DEVICES=7')] +model_list = [ + 'meta-llama/Llama-2-7b-chat-inner-kvint8', + 'internlm/internlm-chat-20b-inner-kvint8', + 'internlm/internlm2-chat-20b-inner-kvint8', + 'Qwen/Qwen-7B-Chat-inner-kvint8', 'Qwen/Qwen-14B-Chat-inner-kvint8', + 'internlm/internlm2-20b-inner-kvint8', + 'baichuan-inc/Baichuan2-7B-Chat-inner-kvint8' +] @pytest.mark.order(4) @pytest.mark.quantization_kvint8_w4a16 @pytest.mark.timeout(900) -@pytest.mark.parametrize('model, prefix', model_list) -def test_quantization_kvint8_w4a16(config, model, prefix): - quantization_kvint8(config, model + '-w4a16', model, prefix) +@pytest.mark.parametrize('model', model_list) +def test_quantization_kvint8_w4a16(config, model, worker_id): + quantization_kvint8(config, model + '-w4a16', model, + get_cuda_prefix_by_workerid(worker_id)) def quantization_kvint8(config, quantization_model_name, origin_model_name, @@ -29,9 +32,10 @@ def quantization_kvint8(config, quantization_model_name, origin_model_name, cuda_prefix) log_path = config.get('log_path') quantization_log = os.path.join( - log_path, - '_'.join(['quantization', quantization_type, quantization_model_name - ]) + '.log') + log_path, '_'.join([ + 'quantization', quantization_type, + quantization_model_name.split('/')[1] + ]) + '.log') allure.attach.file(quantization_log, attachment_type=allure.attachment_type.TEXT) diff --git a/autotest/tools/quantization/test_quantization_w4a16.py b/autotest/tools/quantization/test_quantization_w4a16.py index 15749ba70b..3bafadd494 100644 --- a/autotest/tools/quantization/test_quantization_w4a16.py +++ b/autotest/tools/quantization/test_quantization_w4a16.py @@ -2,32 +2,34 @@ import allure import pytest +from utils.config_utils import get_cuda_prefix_by_workerid from utils.quantization_utils import quantization -model_list = [('llama-2-7b-chat', 'CUDA_VISIBLE_DEVICES=0'), - ('internlm-chat-20b', 
'CUDA_VISIBLE_DEVICES=1'), - ('Qwen-7B-Chat', 'CUDA_VISIBLE_DEVICES=2'), - ('Qwen-14B-Chat', 'CUDA_VISIBLE_DEVICES=3'), - ('Qwen-VL', 'CUDA_VISIBLE_DEVICES=4'), - ('internlm2-chat-20b', 'CUDA_VISIBLE_DEVICES=5'), - ('internlm2-20b', 'CUDA_VISIBLE_DEVICES=6'), - ('Baichuan2-7B-Chat', 'CUDA_VISIBLE_DEVICES=7')] +model_list = [ + 'meta-llama/Llama-2-7b-chat', 'internlm/internlm-chat-20b', + 'Qwen/Qwen-7B-Chat', 'Qwen/Qwen-14B-Chat', 'Qwen/Qwen-VL', + 'internlm/internlm2-chat-20b', 'internlm/internlm2-20b', + 'baichuan-inc/Baichuan2-7B-Chat' +] @pytest.mark.order(3) @pytest.mark.quantization_w4a16 @pytest.mark.timeout(900) -@pytest.mark.parametrize('model, prefix', model_list) -def test_quantization_w4a16(config, model, prefix): - quantization_w4a16(config, model + '-inner-w4a16', model, prefix) +@pytest.mark.parametrize('model', model_list) +def test_quantization_w4a16(config, model, worker_id): + quantization_w4a16(config, model + '-inner-w4a16', model, + get_cuda_prefix_by_workerid(worker_id)) @pytest.mark.order(3) @pytest.mark.quantization_w4a16 @pytest.mark.pr_test +@pytest.mark.flaky(reruns=0) @pytest.mark.timeout(900) -@pytest.mark.parametrize('model, prefix', - [('internlm2-chat-20b', 'CUDA_VISIBLE_DEVICES=5')]) +@pytest.mark.parametrize( + 'model, prefix', + [('internlm/internlm2-chat-20b', 'CUDA_VISIBLE_DEVICES=5')]) def test_quantization_w4a16_pr(config, model, prefix): quantization_w4a16(config, model + '-inner-w4a16', model, prefix) @@ -40,9 +42,10 @@ def quantization_w4a16(config, quantization_model_name, origin_model_name, cuda_prefix) log_path = config.get('log_path') quantization_log = os.path.join( - log_path, - '_'.join(['quantization', quantization_type, quantization_model_name - ]) + '.log') + log_path, '_'.join([ + 'quantization', quantization_type, + quantization_model_name.split('/')[1] + ]) + '.log') allure.attach.file(quantization_log, attachment_type=allure.attachment_type.TEXT) diff --git a/autotest/tools/quantization/test_quantization_w8a8.py b/autotest/tools/quantization/test_quantization_w8a8.py index 7e6690d423..37a198c6d5 100644 --- a/autotest/tools/quantization/test_quantization_w8a8.py +++ b/autotest/tools/quantization/test_quantization_w8a8.py @@ -2,25 +2,23 @@ import allure import pytest +from utils.config_utils import get_cuda_prefix_by_workerid from utils.quantization_utils import quantization -model_list = [('llama-2-7b-chat', 'CUDA_VISIBLE_DEVICES=0'), - ('internlm-chat-20b', 'CUDA_VISIBLE_DEVICES=1'), - ('internlm2-chat-20b', 'CUDA_VISIBLE_DEVICES=2'), - ('internlm2-chat-7b', 'CUDA_VISIBLE_DEVICES=3'), - ('Yi-6B-Chat', 'CUDA_VISIBLE_DEVICES=4'), - ('internlm2-20b', 'CUDA_VISIBLE_DEVICES=5')] - -# ('Baichuan2-7B-Chat', 'CUDA_VISIBLE_DEVICES=6') -# ('Baichuan2-13B-Chat', 'CUDA_VISIBLE_DEVICES=7') +model_list = [ + 'meta-llama/Llama-2-7b-chat', 'internlm/internlm-chat-20b', + 'internlm/internlm2-chat-20b', 'internlm/internlm2-chat-7b', + '01-ai/Yi-6B-Chat', 'internlm/internlm2-20b' +] @pytest.mark.order(2) @pytest.mark.quantization_w8a8 @pytest.mark.timeout(900) -@pytest.mark.parametrize('model, prefix', model_list) -def test_quantization_w8a8(config, model, prefix): - quantization_w8a8(config, model + '-inner-w8a8', model, prefix) +@pytest.mark.parametrize('model', model_list) +def test_quantization_w8a8(config, model, worker_id): + quantization_w8a8(config, model + '-inner-w8a8', model, + get_cuda_prefix_by_workerid(worker_id)) def quantization_w8a8(config, quantization_model_name, origin_model_name, @@ -31,9 +29,10 @@ def quantization_w8a8(config, 
quantization_model_name, origin_model_name, cuda_prefix) log_path = config.get('log_path') quantization_log = os.path.join( - log_path, - '_'.join(['quantization', quantization_type, quantization_model_name - ]) + '.log') + log_path, '_'.join([ + 'quantization', quantization_type, + quantization_model_name.split('/')[1] + ]) + '.log') allure.attach.file(quantization_log, attachment_type=allure.attachment_type.TEXT) diff --git a/autotest/tools/restful/test_restful_chat_pytorch.py b/autotest/tools/restful/test_restful_chat_pytorch.py index 1b6fe5607a..6c5b33aa3f 100644 --- a/autotest/tools/restful/test_restful_chat_pytorch.py +++ b/autotest/tools/restful/test_restful_chat_pytorch.py @@ -5,35 +5,50 @@ import allure import pytest from pytest import assume -from utils.config_utils import get_torch_model_list +from utils.config_utils import (get_cuda_prefix_by_workerid, + get_torch_model_list, get_workerid) from utils.get_run_config import get_command_with_extra from utils.run_client_chat import command_line_test from utils.run_restful_chat import (get_model, health_check, interactive_test, open_chat_test) -HTTP_URL = 'http://localhost:23333' +BASE_HTTP_URL = 'http://localhost' +DEFAULT_PORT = 23333 @pytest.fixture(scope='function', autouse=True) -def prepare_environment(request, config): +def prepare_environment(request, config, worker_id): model_path = config.get('model_path') log_path = config.get('log_path') - model = request.param + param = request.param + model = param['model'] + cuda_prefix = param['cuda_prefix'] + tp_num = param['tp_num'] - cmd = ['lmdeploy serve api_server ' + model_path + '/' + model] + if cuda_prefix is None: + cuda_prefix = get_cuda_prefix_by_workerid(worker_id, tp_num=tp_num) + + worker_num = get_workerid(worker_id) + if worker_num is None: + port = DEFAULT_PORT + else: + port = DEFAULT_PORT + worker_num cmd = get_command_with_extra('lmdeploy serve api_server ' + model_path + - '/' + model + ' --backend pytorch', + '/' + model + ' --backend pytorch' + + ' --server-port ' + str(port), config, model, need_tp=True) - start_log = os.path.join(log_path, 'start_restful_' + model + '.log') + print('reproduce command restful: ' + cmd) + + start_log = os.path.join(log_path, + 'start_restful_' + model.split('/')[1] + '.log') with open(start_log, 'w') as f: f.writelines('reproduce command restful: ' + cmd + '\n') - print('reproduce command restful: ' + cmd) # convert convertRes = subprocess.Popen([cmd], @@ -45,7 +60,7 @@ def prepare_environment(request, config): pid = convertRes.pid allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT) - http_url = HTTP_URL + http_url = BASE_HTTP_URL + ':' + str(port) start_time = int(time()) sleep(5) for i in range(120): @@ -58,42 +73,69 @@ def prepare_environment(request, config): yield if pid > 0: - kill_log = os.path.join(log_path, 'kill_' + model + '.log') + kill_log = os.path.join(log_path, + 'kill_' + model.split('/')[1] + '.log') - subprocess.Popen([ - "ps -ef | grep multiprocessing | grep -v grep | awk '{print $2}' " - + '| xargs kill -9' - ], - shell=True, - text=True, - encoding='utf-8') with open(kill_log, 'w') as f: convertRes.kill() allure.attach.file(kill_log, attachment_type=allure.attachment_type.TEXT) -def getModelList(): - return [ - item for item in get_torch_model_list() if 'chat' in item.lower() - and 'falcon' not in item.lower() and 'chatglm2' not in item.lower() - ] +def getModelList(tp_num): + return [{ + 'model': item, + 'cuda_prefix': None, + 'tp_num': tp_num + } for item in 
get_torch_model_list(tp_num) if 'chat' in item.lower()] @pytest.mark.order(7) @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api_pytorch +@pytest.mark.gpu_num_1 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('prepare_environment', getModelList(), indirect=True) -def test_restful_chat(config, common_case_config): - run_all_step(config, common_case_config) +@pytest.mark.parametrize('prepare_environment', + getModelList(tp_num=1), + indirect=True) +def test_restful_chat_tp1(config, common_case_config, worker_id): + if get_workerid(worker_id) is None: + run_all_step(config, common_case_config) + else: + run_all_step(config, + common_case_config, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) -def run_all_step(config, cases_info): - http_url = HTTP_URL +@pytest.mark.order(7) +@pytest.mark.usefixtures('common_case_config') +@pytest.mark.restful_api_pytorch +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('prepare_environment', + getModelList(tp_num=2), + indirect=True) +def test_restful_chat_tp2(config, common_case_config, worker_id): + if get_workerid(worker_id) is None: + run_all_step(config, common_case_config) + else: + run_all_step(config, + common_case_config, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + +def run_all_step(config, + cases_info, + worker_id: str = 'default', + port: int = DEFAULT_PORT): + http_url = BASE_HTTP_URL + ':' + str(port) model = get_model(http_url) - print(model) + if model is None: + assert False, 'server not start correctly' + for case in cases_info.keys(): if (case == 'memory_test' or case == 'emoji_case') and 'chat' not in model.lower(): @@ -103,15 +145,17 @@ def run_all_step(config, cases_info): with allure.step(case + ' step1 - command chat regression'): chat_result, chat_log, msg = command_line_test( - config, case, case_info, model, 'api_client', http_url) - allure.attach.file(chat_log, - attachment_type=allure.attachment_type.TEXT) - with assume: - assert chat_result, msg + config, case, case_info, model + worker_id, 'api_client', + http_url) + if chat_log is not None: + allure.attach.file(chat_log, + attachment_type=allure.attachment_type.TEXT) + with assume: + assert chat_result, msg with allure.step(case + ' step2 - restful_test - openai chat'): restful_result, restful_log, msg = open_chat_test( - config, case_info, model, http_url) + config, case_info, model, http_url, worker_id) allure.attach.file(restful_log, attachment_type=allure.attachment_type.TEXT) with assume: @@ -119,7 +163,7 @@ def run_all_step(config, cases_info): with allure.step(case + ' step3 - restful_test - interactive chat'): active_result, interactive_log, msg = interactive_test( - config, case_info, model, http_url) + config, case_info, model, http_url, worker_id) allure.attach.file(interactive_log, attachment_type=allure.attachment_type.TEXT) diff --git a/autotest/tools/restful/test_restful_chat_turbomind.py b/autotest/tools/restful/test_restful_chat_turbomind.py index cad858333a..f442aec10a 100644 --- a/autotest/tools/restful/test_restful_chat_turbomind.py +++ b/autotest/tools/restful/test_restful_chat_turbomind.py @@ -5,41 +5,56 @@ import allure import pytest from pytest import assume -from utils.config_utils import get_turbomind_model_list +from utils.config_utils import (get_all_model_list, + get_cuda_prefix_by_workerid, get_workerid) from utils.get_run_config import get_command_with_extra from utils.run_client_chat import command_line_test from utils.run_restful_chat 
import (get_model, health_check, interactive_test, open_chat_test) -HTTP_URL = 'http://localhost:23333' +BASE_HTTP_URL = 'http://localhost' +DEFAULT_PORT = 23333 @pytest.fixture(scope='function', autouse=True) -def prepare_environment(request, config): +def prepare_environment(request, config, worker_id): model_path = config.get('model_path') log_path = config.get('log_path') param = request.param model = param['model'] cuda_prefix = param['cuda_prefix'] + tp_num = param['tp_num'] + + if cuda_prefix is None: + cuda_prefix = get_cuda_prefix_by_workerid(worker_id, tp_num=tp_num) + + worker_num = get_workerid(worker_id) + if worker_num is None: + port = DEFAULT_PORT + else: + port = DEFAULT_PORT + worker_num cmd = ['lmdeploy serve api_server ' + model_path + '/' + model] cmd = get_command_with_extra('lmdeploy serve api_server ' + model_path + - '/' + model, + '/' + model + ' --server-port ' + str(port), config, model, need_tp=True, cuda_prefix=cuda_prefix) - if 'kvint8' in model and 'w4' not in model: - cmd += ' --model-format hf --quant-policy 4' - if 'kvint8' in model and 'w4' in model: + if 'kvint8' in model: cmd += ' --quant-policy 4' - if 'w4' in model: + if 'w4' in model or '4bits' in model: + cmd += ' --model-format awq' + else: + cmd += ' --model-format hf' + if 'w4' in model or '4bits' in model: cmd += ' --model-format awq' - start_log = os.path.join(log_path, 'start_restful_' + model + '.log') + start_log = os.path.join(log_path, + 'start_restful_' + model.split('/')[1] + '.log') print('reproduce command restful: ' + cmd) @@ -56,7 +71,7 @@ def prepare_environment(request, config): pid = convertRes.pid allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT) - http_url = HTTP_URL + http_url = BASE_HTTP_URL + ':' + str(port) start_time = int(time()) sleep(5) for i in range(120): @@ -68,7 +83,8 @@ def prepare_environment(request, config): break yield if pid > 0: - kill_log = os.path.join(log_path, 'kill_' + model + '.log') + kill_log = os.path.join(log_path, + 'kill_' + model.split('/')[1] + '.log') with open(kill_log, 'w') as f: convertRes.kill() @@ -76,45 +92,79 @@ def prepare_environment(request, config): allure.attach.file(kill_log, attachment_type=allure.attachment_type.TEXT) -def getModelList(): +def getModelList(tp_num): return [{ 'model': item, - 'cuda_prefix': None - } for item in get_turbomind_model_list() if 'chat' in item.lower()] + 'cuda_prefix': None, + 'tp_num': tp_num + } for item in get_all_model_list(tp_num) if 'chat' in item.lower()] @pytest.mark.order(7) @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api +@pytest.mark.gpu_num_1 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('prepare_environment', getModelList(), indirect=True) -def test_restful_chat(config, common_case_config): - run_all_step(config, common_case_config) +@pytest.mark.parametrize('prepare_environment', + getModelList(tp_num=1), + indirect=True) +def test_restful_chat_tp1(request, config, common_case_config, worker_id): + if get_workerid(worker_id) is None: + run_all_step(config, common_case_config) + else: + run_all_step(config, + common_case_config, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) @pytest.mark.order(7) @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api +@pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) -@pytest.mark.pr_test @pytest.mark.parametrize('prepare_environment', - [{ - 'model': 'internlm2-chat-20b', - 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6' - }, { - 'model': 
'internlm2-chat-20b-inner-w4a16', - 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6' - }], + getModelList(tp_num=2), + indirect=True) +def test_restful_chat_tp2(config, common_case_config, worker_id): + if get_workerid(worker_id) is None: + run_all_step(config, common_case_config) + else: + run_all_step(config, + common_case_config, + worker_id=worker_id, + port=DEFAULT_PORT + get_workerid(worker_id)) + + +@pytest.mark.order(7) +@pytest.mark.usefixtures('common_case_config') +@pytest.mark.restful_api +@pytest.mark.flaky(reruns=0) +@pytest.mark.pr_test +@pytest.mark.parametrize('prepare_environment', [{ + 'model': 'internlm/internlm2-chat-20b', + 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6', + 'tp_num': 2 +}, { + 'model': 'internlm/internlm2-chat-20b-inner-w4a16', + 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6', + 'tp_num': 2 +}], indirect=True) def test_restful_chat_pr(config, common_case_config): run_all_step(config, common_case_config) -def run_all_step(config, cases_info): - http_url = HTTP_URL +def run_all_step(config, + cases_info, + worker_id: str = 'default', + port: int = DEFAULT_PORT): + http_url = BASE_HTTP_URL + ':' + str(port) model = get_model(http_url) - print(model) + + if model is None: + assert False, 'server not start correctly' for case in cases_info.keys(): if (case == 'memory_test' or case == 'emoji_case') and 'chat' not in model.lower(): diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index f811e94f22..48905a1063 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -1,11 +1,11 @@ import os import yaml +from utils.get_run_config import get_tp_num -def get_turbomind_model_list(): +def get_turbomind_model_list(tp_num: int = None): config_path = os.path.join('autotest/config.yaml') - print(config_path) with open(config_path) as f: config = yaml.load(f.read(), Loader=yaml.SafeLoader) @@ -18,12 +18,16 @@ def get_turbomind_model_list(): for key in quatization_case_config.get('kvint8_w4a16'): case_list.append(key + '-inner-kvint8-w4a16') + if tp_num is not None: + return [ + item for item in case_list if get_tp_num(config, item) == tp_num + ] + return case_list -def get_torch_model_list(): +def get_torch_model_list(tp_num: int = None): config_path = os.path.join('autotest/config.yaml') - print(config_path) with open(config_path) as f: config = yaml.load(f.read(), Loader=yaml.SafeLoader) @@ -32,4 +36,64 @@ def get_torch_model_list(): for key in quatization_case_config.get('w8a8'): case_list.append(key + '-inner-w8a8') + if tp_num is not None: + return [ + item for item in case_list if get_tp_num(config, item) == tp_num + ] + return case_list + + +def get_all_model_list(tp_num: int = None): + config_path = os.path.join('autotest/config.yaml') + with open(config_path) as f: + config = yaml.load(f.read(), Loader=yaml.SafeLoader) + + case_list = config.get('turbomind_model') + for key in config.get('pytorch_model'): + if key not in case_list: + case_list.append(key) + quatization_case_config = config.get('quatization_case_config') + for key in quatization_case_config.get('w4a16'): + case_list.append(key + '-inner-w4a16') + for key in quatization_case_config.get('kvint8'): + case_list.append(key + '-inner-kvint8') + for key in quatization_case_config.get('kvint8_w4a16'): + case_list.append(key + '-inner-kvint8-w4a16') + + if tp_num is not None: + return [ + item for item in case_list if get_tp_num(config, item) == tp_num + ] + + return case_list + + +def get_cuda_prefix_by_workerid(worker_id, tp_num: int = 1): + if worker_id is None or 
'gw' not in worker_id: + return None + else: + if tp_num == 1: + return 'CUDA_VISIBLE_DEVICES=' + worker_id.replace('gw', '') + elif tp_num == 2: + cuda_num = int(worker_id.replace('gw', '')) * 2 + return 'CUDA_VISIBLE_DEVICES=' + ','.join( + [str(cuda_num), str(cuda_num + 1)]) + + +def get_cuda_id_by_workerid(worker_id, tp_num: int = 1): + if worker_id is None or 'gw' not in worker_id: + return None + else: + if tp_num == 1: + return worker_id.replace('gw', '') + elif tp_num == 2: + cuda_num = int(worker_id.replace('gw', '')) * 2 + return ','.join([str(cuda_num), str(cuda_num + 1)]) + + +def get_workerid(worker_id): + if worker_id is None or 'gw' not in worker_id: + return None + else: + return int(worker_id.replace('gw', '')) diff --git a/autotest/utils/get_run_config.py b/autotest/utils/get_run_config.py index 03c53cf5d5..120446f47d 100644 --- a/autotest/utils/get_run_config.py +++ b/autotest/utils/get_run_config.py @@ -102,12 +102,21 @@ def _get_available_cude(): def _simple_model_name(model): - model_name = model.replace('-inner-w4a16', '') + if '/' in model: + model_name = model.split('/')[1] + else: + model_name = model + model_name = model_name.replace('-inner-w4a16', '') model_name = model_name.replace('-inner-w8a8', '') model_name = model_name.replace('-inner-kvint8', '') model_name = model_name.replace('-w4a16', '') return model_name +def _split_model_name(model): + model_name = model.split('/')[1] + return model_name + + if __name__ == '__main__': - print(_simple_model_name('Baichuan2-7B-Chat-inner-w4a16')) + print(_simple_model_name('baichuan-inc/Baichuan2-7B-Chat-inner-w4a16')) diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 274730d4a1..0d65ad6ae0 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -27,7 +27,8 @@ def run_pipeline_chat_test(config, cases_info, model_case, type): if 'pytorch' == type: backend_config = PytorchEngineConfig(tp=tp) else: - if 'kvint8' in model_case and 'w4' in model_case: + if 'kvint8' in model_case and ('w4' in model_case + or '4bits' in model_case): backend_config = TurbomindEngineConfig(tp=tp, model_format='awq', quant_policy=4) @@ -35,7 +36,7 @@ def run_pipeline_chat_test(config, cases_info, model_case, type): backend_config = TurbomindEngineConfig(tp=tp, model_format='hf', quant_policy=4) - elif 'w4' in model_case: + elif 'w4' in model_case or '4bits' in model_case: backend_config = TurbomindEngineConfig(tp=tp, model_format='awq') else: backend_config = TurbomindEngineConfig(tp=tp) @@ -43,6 +44,7 @@ def run_pipeline_chat_test(config, cases_info, model_case, type): # run testcases gen_config = GenerationConfig(temperature=0.01) + gen_config = GenerationConfig() for case in cases_info.keys(): if (case == 'memory_test' or case == 'emoji_case') and 'chat' not in model_case.lower(): @@ -50,7 +52,8 @@ def run_pipeline_chat_test(config, cases_info, model_case, type): case_info = cases_info.get(case) pipeline_chat_log = os.path.join( - log_path, 'pipeline_chat_' + model_case + '_' + case + '.log') + log_path, + 'pipeline_chat_' + model_case.split('/')[1] + '_' + case + '.log') file = open(pipeline_chat_log, 'w') @@ -94,7 +97,8 @@ def assert_pipeline_chat_log(config, cases_info, model_case): result = False with allure.step('case - ' + case): pipeline_chat_log = os.path.join( - log_path, 'pipeline_chat_' + model_case + '_' + case + '.log') + log_path, 'pipeline_chat_' + model_case.split('/')[1] + '_' + + case + '.log') with open(pipeline_chat_log, 'r') as f: lines = f.readlines() 
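For reference, the scheduling scheme implemented by the new `get_workerid` / `get_cuda_prefix_by_workerid` helpers above can be summarized with a small, self-contained sketch: each pytest-xdist worker `gwN` is pinned to its own slice of GPUs (`tp_num` devices per worker, with the real helpers handling `tp_num` 1 and 2) and to its own `api_server` port (`23333 + N`). The sketch is illustrative only and is not part of the diff; `worker_index` and `cuda_prefix` below are local stand-ins for the real helpers in `autotest/utils/config_utils.py`.

```python
# Illustrative sketch only; the real helpers live in autotest/utils/config_utils.py.
# pytest-xdist names its workers 'gw0', 'gw1', ...; a non-xdist run uses 'master'.
DEFAULT_PORT = 23333


def worker_index(worker_id):
    """Numeric xdist worker index, or None when not running under xdist."""
    if worker_id is None or 'gw' not in worker_id:
        return None
    return int(worker_id.replace('gw', ''))


def cuda_prefix(worker_id, tp_num=1):
    """CUDA_VISIBLE_DEVICES prefix mirroring get_cuda_prefix_by_workerid."""
    idx = worker_index(worker_id)
    if idx is None:
        return None
    first_device = idx * tp_num
    return 'CUDA_VISIBLE_DEVICES=' + ','.join(
        str(first_device + i) for i in range(tp_num))


assert worker_index('master') is None
assert worker_index('gw3') == 3
assert cuda_prefix('gw3') == 'CUDA_VISIBLE_DEVICES=3'
assert cuda_prefix('gw3', tp_num=2) == 'CUDA_VISIBLE_DEVICES=6,7'
assert DEFAULT_PORT + worker_index('gw3') == 23336  # per-worker server port
```

This is why the restful tests above append `--server-port` to the serve command and pass `port=DEFAULT_PORT + get_workerid(worker_id)` into `run_all_step`: parallel workers never collide on GPUs or ports.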
diff --git a/autotest/utils/quantization_utils.py b/autotest/utils/quantization_utils.py index 759f6a5169..d4b4272713 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -13,9 +13,10 @@ def quantization(config, origin_model_path = config.get('model_path') + '/' + origin_model_name quantization_model_path = model_path + '/' + quantization_model_name quantization_log = os.path.join( - log_path, - '_'.join(['quantization', quantization_type, quantization_model_name - ]) + '.log') + log_path, '_'.join([ + 'quantization', quantization_type, + quantization_model_name.split('/')[1] + ]) + '.log') if quantization_type == 'w4a16': quantization_cmd = ' '.join([ diff --git a/autotest/utils/run_client_chat.py b/autotest/utils/run_client_chat.py index cfdd2bfa54..334154a95a 100644 --- a/autotest/utils/run_client_chat.py +++ b/autotest/utils/run_client_chat.py @@ -10,7 +10,7 @@ def command_line_test(config, case_info, model_case, type, - extra, + extra: str = None, cuda_prefix: str = None): dst_path = config.get('dst_path') @@ -24,12 +24,16 @@ def command_line_test(config, config, model_case, cuda_prefix=cuda_prefix) - if 'kvint8' in model_case and 'w4' not in model_case: - cmd += ' --model-format hf --quant-policy 4' - if 'kvint8' in model_case and 'w4' in model_case: + if 'kvint8' in model_case: cmd += ' --quant-policy 4' - if 'w4' in model_case: + if 'w4' in model_case or '4bits' in model_case: + cmd += ' --model-format awq' + else: + cmd += ' --model-format hf' + elif 'w4' in model_case or '4bits' in model_case: cmd += ' --model-format awq' + if 'chat' not in model_case.lower(): + cmd += ' --cap completion' return command_test(config, [cmd], model_case, case, case_info, type == 'turbomind') @@ -48,11 +52,13 @@ def hf_command_line_test(config, need_tp=True, cuda_prefix=cuda_prefix) - if 'kvint8' in model_case and 'w4' not in model_case: - cmd += ' --model-format hf --quant-policy 4' - if 'kvint8' in model_case and 'w4' in model_case: + if 'kvint8' in model_case: cmd += ' --quant-policy 4' - if 'w4' in model_case: + if 'w4' in model_case or '4bits' in model_case: + cmd += ' --model-format awq' + else: + cmd += ' --model-format hf' + elif 'w4' in model_case or '4bits' in model_case: cmd += ' --model-format awq' return command_test(config, [cmd], model_case, '_'.join(['hf', type, case]), case_info, True) @@ -66,8 +72,12 @@ def command_test(config, cmd, model, case, case_info, need_extract_output): log_path = config.get('log_path') model_name = get_model_name(model) - chat_log = os.path.join(log_path, - 'chat_' + model + '_' + case + '.log') + if '/' in model: + chat_log = os.path.join( + log_path, 'chat_' + model.split('/')[1] + '_' + case + '.log') + else: + chat_log = os.path.join(log_path, + 'chat_' + model + '_' + case + '.log') file = open(chat_log, 'w') @@ -78,7 +88,7 @@ def command_test(config, cmd, model, case, case_info, need_extract_output): file.writelines('reproduce command chat: ' + ' '.join(cmd) + '\n') spliter = '\n\n' - if model == 'CodeLlama-7b-Instruct-hf': + if 'CodeLlama-7b-Instruct-hf' in model: spliter = '\n!!\n' # join prompt together prompt = '' @@ -136,15 +146,13 @@ def command_test(config, cmd, model, case, case_info, need_extract_output): # 从输出中解析模型输出的对话内容 def parse_dialogue(inputs: str, model: str): dialogues = inputs.strip() - if model == 'CodeLlama-7b-Instruct-hf': + if 'CodeLlama-7b-Instruct-hf' in model: sep = 'enter !! 
to end the input >>>' else: sep = 'double enter to end input >>>' dialogues = dialogues.strip() dialogues = dialogues.split(sep) dialogues = [d.strip() for d in dialogues] - if 'Llama' in model: - return dialogues return dialogues[1:-1] # 去除首尾无用字符 diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 6236ebe50c..8d82a4c9c4 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -7,10 +7,11 @@ from lmdeploy.serve.openai.api_client import APIClient -def open_chat_test(config, case_info, model, url): +def open_chat_test(config, case_info, model, url, worker_id: str = 'default'): log_path = config.get('log_path') - restful_log = os.path.join(log_path, 'restful_' + model + '.log') + restful_log = os.path.join(log_path, + 'restful_' + model + '_' + worker_id + '.log') file = open(restful_log, 'w') @@ -49,10 +50,15 @@ def open_chat_test(config, case_info, model, url): return result, restful_log, msg -def interactive_test(config, case_info, model, url): +def interactive_test(config, + case_info, + model, + url, + worker_id: str = 'default'): log_path = config.get('log_path') - interactive_log = os.path.join(log_path, 'interactive_' + model + '.log') + interactive_log = os.path.join( + log_path, 'interactive_' + model + '_' + worker_id + '.log') file = open(interactive_log, 'w') diff --git a/.readthedocs.yaml b/docs/en/.readthedocs.yaml similarity index 81% rename from .readthedocs.yaml rename to docs/en/.readthedocs.yaml index 05ec15cca3..525ef5f7a3 100644 --- a/.readthedocs.yaml +++ b/docs/en/.readthedocs.yaml @@ -7,6 +7,11 @@ build: tools: python: "3.8" + +sphinx: + configuration: docs/en/conf.py + + python: install: - requirements: requirements/docs.txt diff --git a/docs/en/get_started.md b/docs/en/get_started.md index 2b09aa3b74..084947095c 100644 --- a/docs/en/get_started.md +++ b/docs/en/get_started.md @@ -31,19 +31,11 @@ For more information on inference pipeline parameters, please refer to [here](./ ## Serving -LMDeploy's `api_server` enables models to be easily packed into services with a single command. The provided RESTful APIs are compatible with OpenAI's interfaces. Below are an example of service startup: +LMDeploy offers various serving methods, choosing one that best meet your requirements. -```shell -lmdeploy serve api_server internlm/internlm-chat-7b -``` - -The default port of `api_server` is `23333`. After the server is launched, you can communicate with server on terminal through `api_client`: - -```shell -lmdeploy serve api_client http://0.0.0.0:23333 -``` - -You can overview and try out `api_server` APIs online by swagger UI at `http://0.0.0.0:23333`, or you can read the API specification from [here](serving/restful_api.md). +- [Serving with openai compatible server](https://lmdeploy.readthedocs.io/en/latest/serving/restful_api.html) +- [Serving with docker](https://lmdeploy.readthedocs.io/en/latest/serving/restful_api.html#option-2-deploying-with-docker) +- [Serving with gradio](https://lmdeploy.readthedocs.io/en/latest/serving/gradio.html) ## Quantization diff --git a/docs/en/index.rst b/docs/en/index.rst index daccaae535..66e9c059b1 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -49,8 +49,8 @@ Welcome to LMDeploy's tutorials! :caption: serving serving/restful_api.md - serving/proxy_server.md serving/gradio.md + serving/proxy_server.md .. _quantization: .. 
toctree:: diff --git a/docs/en/serving/gradio.md b/docs/en/serving/gradio.md index 803dff50f5..7b223565ff 100644 --- a/docs/en/serving/gradio.md +++ b/docs/en/serving/gradio.md @@ -1,10 +1,25 @@ -# Steps to create a huggingface online demo +# Serving with Gradio -## create space +Starting an LLM model's gradio service with LMDeploy and interacting with the model on the WebUI is incredibly simple. + +```shell +pip install lmdeploy[serve] +lmdeploy serve gradio {model_path} +``` + +All it takes is one-line command, with the `{model_path}` replaced by the model ID from huggingface hub, such as `internlm/internlm2-chat-7b`, or the local path to the model. + +For detailed parameters of the command, please turn to `lmdeploy serve gradio -h` for help. + +## Create a huggingface demo + +If you want to create an online demo project for your model on huggingface, please follow the steps below. + +### Step 1: Create space First, register for a Hugging Face account. After successful registration, click on your profile picture in the upper right corner and select “New Space” to create one. Follow the Hugging Face guide to choose the necessary configurations, and you will have a blank demo space ready. -## A demo for LMDeploy +### Step 2: Develop demo's entrypoint `app.py` Replace the content of `app.py` in your space with the following code: @@ -12,7 +27,7 @@ Replace the content of `app.py` in your space with the following code: from lmdeploy.serve.gradio.turbomind_coupled import run_local from lmdeploy.messages import TurbomindEngineConfig -backend_config = TurbomindEngineConfig(max_batch_size=1, cache_max_entry_count=0.05) +backend_config = TurbomindEngineConfig(max_batch_size=8) model_path = 'internlm/internlm2-chat-7b' run_local(model_path, backend_config=backend_config, server_name="huggingface-space") ``` @@ -25,7 +40,7 @@ lmdeploy ## FAQs -- ZeroGPU compatibility issue. ZeroGPU is more suitable for inference methods similar to PyTorch, rather than Turbomind. You can switch to the PyTorch backend or enable standard GPUs. +- ZeroGPU compatibility issue. ZeroGPU is not suitable for LMDeploy turbomind engine. Please use the standard GPUs. Or, you can change the backend config in the above code to `PyTorchEngineConfig` to use the ZeroGPU. - Gradio version issue, versions above 4.0.0 are currently not supported. You can modify this in `app.py`, for example: ```python import os diff --git a/docs/en/serving/restful_api.md b/docs/en/serving/restful_api.md index de1ea9fa44..d092d9b288 100644 --- a/docs/en/serving/restful_api.md +++ b/docs/en/serving/restful_api.md @@ -1,34 +1,29 @@ -# Restful API +# Serving with OpenAI Compatible Server -## Launch Service +This article primarily discusses the deployment of a single LLM model across multiple GPUs on a single node, providing a service that is compatible with the OpenAI interface, as well as the usage of the service API. +For the sake of convenience, we refer to this service as `api_server`. Regarding parallel services with multiple models, please refer to the guide about [Request Distribution Server](./proxy_server.md). -The user can open the http url print by the following command in a browser. +In the following sections, we will first introduce two methods for starting the service, choosing the appropriate one based on your application scenario. 
-- **Please check the http url for the detailed api usage!!!** -- **Please check the http url for the detailed api usage!!!** -- **Please check the http url for the detailed api usage!!!** +Next, we focus on the definition of the service's RESTful API, explore the various ways to interact with the interface, and demonstrate how to try the service through the Swagger UI or LMDeploy CLI tools. -```shell -lmdeploy serve api_server ./workspace --server-name 0.0.0.0 --server-port ${server_port} --tp 1 -``` +Finally, we showcase how to integrate the service into a WebUI, providing you with a reference to easily set up a demo. -The parameters supported by api_server can be viewed through the command line `lmdeploy serve api_server -h`. +## Launch Service -We provide some RESTful APIs. Three of them are in OpenAI format. +Taking the [internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) model hosted on huggingface hub as an example, you can choose one of the following methods to start the service. -- /v1/chat/completions -- /v1/models -- /v1/completions +### Option 1: Launching with lmdeploy CLI -However, we recommend users try -our own api `/v1/chat/interactive` which provides more arguments for users to modify. The performance is comparatively better. +```shell +lmdeploy serve api_server internlm/internlm2-chat-7b --server-port 23333 +``` -**Note** please, if you want to launch multiple requests, you'd better set different `session_id` for both -`/v1/chat/completions` and `/v1/chat/interactive` apis. Or, we will set them random values. +The arguments of `api_server` can be viewed through the command `lmdeploy serve api_server -h`, for instance, `--tp` to set tensor parallelism, `--session-len` to specify the max length of the context window, and `--cache-max-entry-count` to adjust the GPU memory ratio for the k/v cache. -## Deploy http service with docker +### Option 2: Deploying with docker -LMDeploy offers [official docker image](https://hub.docker.com/r/openmmlab/lmdeploy/tags) for deployment. The image can be used to run OpenAI compatible server. +With the LMDeploy [official docker image](https://hub.docker.com/r/openmmlab/lmdeploy/tags), you can run an OpenAI compatible server as follows: ```shell docker run --runtime nvidia --gpus all \ @@ -40,11 +35,60 @@ docker run --runtime nvidia --gpus all \ lmdeploy serve api_server internlm/internlm2-chat-7b ``` -Just like the previous section, user can try the Swagger UI with a web browser. +The parameters of `api_server` are the same as those mentioned in the "[option 1](#option-1-launching-with-lmdeploy-cli)" section. + +## RESTful API + +LMDeploy's RESTful API is compatible with the following three OpenAI interfaces: + +- /v1/chat/completions +- /v1/models +- /v1/completions + +Additionally, LMDeploy defines `/v1/chat/interactive` to support interactive inference. The feature of interactive inference is that there is no need to pass the user conversation history as required by `v1/chat/completions`, since the conversation history is cached on the server side. This method offers excellent performance during multi-turn, long-context inference. + +After launching the service successfully, you can overview and try out the offered RESTful APIs through the Swagger UI at `http://0.0.0.0:23333`, as shown in the image below. + +![swagger_ui](https://github.com/InternLM/lmdeploy/assets/4560679/b891dd90-3ffa-4333-92b2-fb29dffa1459) + +Alternatively, you can use LMDeploy's built-in CLI tool to verify the service correctness right from the console.
+ +```shell +# api_server_url is what is printed in api_server.py, e.g. http://localhost:23333 +lmdeploy serve api_client ${api_server_url} +``` + +If you need to integrate the service into your own projects or products, we recommend the following approach: + +### Integrate with `OpenAI` + +Here is an example of interacting with the `v1/chat/completions` endpoint via the openai package. +Before running it, please install the openai package with `pip install openai`. + +```python +from openai import OpenAI +client = OpenAI( + api_key='YOUR_API_KEY', + base_url="http://0.0.0.0:23333/v1" +) + +response = client.chat.completions.create( + model="internlm2-chat-7b", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": " provide three suggestions about time management"}, + ], + temperature=0.8, + top_p=0.8 +) +print(response) +``` + +You can invoke other OpenAI interfaces using similar methods. For more detailed information, please refer to the [OpenAI API guide](https://platform.openai.com/docs/guides/text-generation). -## python +### Integrate with lmdeploy `APIClient` -We have integrated the client-side functionalities of these services into the `APIClient` class. Below are some examples demonstrating how to invoke the `api_server` service on the client side. +Below are some examples demonstrating how to access the service through `APIClient`. If you want to use the `/v1/chat/completions` endpoint, you can try the following code: @@ -57,7 +101,7 @@ for item in api_client.chat_completions_v1(model=model_name, messages=messages): print(item) ``` -For the `/v1/completions` endpoint. If you want to use the `/v1/completions` endpoint, you can try: +For the `/v1/completions` endpoint, you can try: ```python from lmdeploy.serve.openai.api_client import APIClient @@ -67,23 +111,29 @@ for item in api_client.completions_v1(model=model_name, prompt='hi'): print(item) ``` -Lmdeploy supports maintaining session histories on the server for `/v1/chat/interactive` api. We disable the -feature by default. +As for `/v1/chat/interactive`, we disable the feature by default. Please enable it by setting `interactive_mode = True`. If you don't, it falls back to the openai compatible interfaces. -- On interactive mode, the chat history is kept on the server. In a multiple rounds of conversation, you should set - `interactive_mode = True` and the same `session_id` (can't be -1, it's the default number) to `/v1/chat/interactive` for requests. -- On normal mode, no chat history is kept on the server. - -The interactive mode can be controlled by the `interactive_mode` boolean parameter. The following is an example of normal mode. If you want to experience the interactive mode, simply pass in `interactive_mode=True`. +Keep in mind that `session_id` identifies a sequence, and all requests belonging to the same sequence must share the same `session_id`. +For instance, in a sequence with 10 rounds of chat requests, the `session_id` in each request should be the same.
```python from lmdeploy.serve.openai.api_client import APIClient -api_client = APIClient('http://{server_ip}:{server_port}') -for item in api_client.chat_interactive_v1(prompt='hi'): - print(item) +api_client = APIClient(f'http://{server_ip}:{server_port}') +messages = [ + "hi, what's your name?", + "who developed you?", + "Tell me more about your developers", + "Summarize the information we've talked so far" +] +for message in messages: + for item in api_client.chat_interactive_v1(prompt=message, + session_id=1, + interactive_mode=True, + stream=False): + print(item) ``` -## Java/Golang/Rust +### Integrate with Java/Golang/Rust May use [openapi-generator-cli](https://github.com/OpenAPITools/openapi-generator-cli) to convert `http://{server_ip}:{server_port}/openapi.json` to java/rust/golang client. Here is an example: @@ -102,29 +152,17 @@ rust/src: apis lib.rs models ``` -## cURL +### Integrate with cURL -cURL is a tool for observing the output of the api. +cURL is a tool for observing the output of the RESTful APIs. -List Models: +- list served models `v1/models` ```bash curl http://{server_ip}:{server_port}/v1/models ``` -Interactive Chat: - -```bash -curl http://{server_ip}:{server_port}/v1/chat/interactive \ - -H "Content-Type: application/json" \ - -d '{ - "prompt": "Hello! How are you?", - "session_id": 1, - "interactive_mode": true - }' -``` - -Chat Completions: +- chat `v1/chat/completions` ```bash curl http://{server_ip}:{server_port}/v1/chat/completions \ @@ -135,7 +173,7 @@ curl http://{server_ip}:{server_port}/v1/chat/completions \ }' ``` -Text Completions: +- text completions `v1/completions` ```shell curl http://{server_ip}:{server_port}/v1/completions \ @@ -146,18 +184,23 @@ curl http://{server_ip}:{server_port}/v1/completions \ }' ``` -## CLI client - -There is a client script for restful api server. +- interactive chat `v1/chat/interactive` -```shell -# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 -lmdeploy serve api_client api_server_url +```bash +curl http://{server_ip}:{server_port}/v1/chat/interactive \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "Hello! How are you?", + "session_id": 1, + "interactive_mode": true + }' ``` -## webui through gradio +## Integrate with WebUI + +LMDeploy utilizes `gradio` or [OpenAOE](https://github.com/InternLM/OpenAOE) to integrate a web ui for `api_server` -You can also test restful-api through webui. +### Option 1: gradio ```shell # api_server_url is what printed in api_server.py, e.g. http://localhost:23333 @@ -166,9 +209,7 @@ You can also test restful-api through webui. lmdeploy serve gradio api_server_url --server-name ${gradio_ui_ip} --server-port ${gradio_ui_port} ``` -## webui through OpenAOE - -You can use [OpenAOE](https://github.com/InternLM/OpenAOE) for seamless integration with LMDeploy. +### Option 2: OpenAOE ```shell pip install -U openaoe @@ -191,7 +232,3 @@ Please refer to the [guidance](https://github.com/InternLM/OpenAOE/blob/main/doc 5. If you need to adjust other default parameters of the session, such as the content of fields like system. You can directly pass in the initialization parameters of the [dialogue template](https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/model.py). For example, for the internlm-chat-7b model, you can set the `--meta-instruction` parameter when starting the `api_server`. 6. Regarding the stop words, we only support characters that encode into a single index. 
Furthermore, there may be multiple indexes that decode into results containing the stop word. In such cases, if the number of these indexes is too large, we will only use the index encoded by the tokenizer. If you want use a stop symbol that encodes into multiple indexes, you may consider performing string matching on the streaming client side. Once a successful match is found, you can then break out of the streaming loop. - -## request distribution service - -Please refer to our [request distributor server](./proxy_server.md) diff --git a/docs/zh_cn/.readthedocs.yaml b/docs/zh_cn/.readthedocs.yaml new file mode 100644 index 0000000000..9f94662947 --- /dev/null +++ b/docs/zh_cn/.readthedocs.yaml @@ -0,0 +1,18 @@ +version: 2 + +formats: all + +build: + os: "ubuntu-22.04" + tools: + python: "3.8" + + +sphinx: + configuration: docs/zh_cn/conf.py + + +python: + install: + - requirements: requirements/docs.txt + - requirements: requirements/readthedocs.txt diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started.md index f5d6b1c13f..26fd61ab71 100644 --- a/docs/zh_cn/get_started.md +++ b/docs/zh_cn/get_started.md @@ -31,19 +31,11 @@ print(response) ## 推理服务 -LMDeploy `api_server` 支持把模型一键封装为服务,对外提供的 RESTful API 兼容 openai 的接口。以下为服务启动的示例: +LMDeploy 提供了多种部署模型推理服务的方式,总有一款适合你。 -```shell -lmdeploy serve api_server internlm/internlm-chat-7b -``` - -服务默认端口是23333。在 server 启动后,你可以在终端通过`api_client`与server进行对话: - -```shell -lmdeploy serve api_client http://0.0.0.0:23333 -``` - -除了`api_client`,你还可以通过 Swagger UI `http://0.0.0.0:23333` 在线阅读和试用 `api_server` 的各接口,也可直接查阅[文档](serving/restful_api.md),了解各接口的定义和使用方法。 +- [部署类 openai 的服务](https://lmdeploy.readthedocs.io/zh-cn/latest//serving/restful_api.html) +- [通过 docker 部署服务](https://lmdeploy.readthedocs.io/zh-cn/latest/serving/restful_api.html#docker) +- [部署 gradio 服务](https://lmdeploy.readthedocs.io/zh-cn/latest/serving/gradio.html) ## 模型量化 diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index e89bfa661b..265fb716d2 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -50,8 +50,8 @@ :caption: 服务 serving/restful_api.md - serving/proxy_server.md serving/gradio.md + serving/proxy_server.md .. 
_量化: diff --git a/docs/zh_cn/serving/gradio.md b/docs/zh_cn/serving/gradio.md index fe1e01af3f..3e70f68856 100644 --- a/docs/zh_cn/serving/gradio.md +++ b/docs/zh_cn/serving/gradio.md @@ -1,11 +1,26 @@ -# 从 LMDeploy 创建一个 huggingface 的在线 demo +# 部署 gradio 服务 -## 创建 space +通过 LMDeploy 启动 LLM 模型的 gradio 服务,并在 WebUI 上和模型对话特别简单,一条命令即可。 + +```shell +pip install lmdeploy[serve] +lmdeploy serve gradio {model_path} +``` + +把上面命令中的 `{model_path}` 换成 huggingface hub 上的模型 id,比如 internlm/internlm2-chat-7b,或者换成模型的本地路径就可以了。 + +关于命令的详细参数,请使用 `lmdeploy serve gradio --help` 查阅。 + +## 创建 huggingface demo + +如果想要在 huggingface 上创建模型的在线演示项目,请按以下步骤进行。 + +### 第一步:创建 space 首先,注册一个 huggingface 的账号,注册成功后,可以点击右上角头像,选择 New Space 创建。 根据 huggingface 的引导选择需要的配置,完成后即可得到一个空白的 demo。 -## 使用 LMDeploy 的 demo +### 第二步:编写 demo 入口代码 app.py 以 `internlm/internlm2-chat-7b` 模型为例,将 space 空间中的`app.py`内容填写为: @@ -13,7 +28,7 @@ from lmdeploy.serve.gradio.turbomind_coupled import run_local from lmdeploy.messages import TurbomindEngineConfig -backend_config = TurbomindEngineConfig(max_batch_size=1, cache_max_entry_count=0.05) +backend_config = TurbomindEngineConfig(max_batch_size=8) model_path = 'internlm/internlm2-chat-7b' run_local(model_path, backend_config=backend_config, server_name="huggingface-space") ``` @@ -26,7 +41,7 @@ lmdeploy ## FAQs -- ZeroGPU 适配问题。ZeroGPU 更适合类似 PyTorch 这样的推理方式,而非 Turbomind。可以改用 pytorch 后端,或者启用普通 GPU。 +- ZeroGPU 适配问题。ZeroGPU不适用 LMDeploy Turbomind 引擎,请选择普通 GPU,或者把上述代码中的 backend_config 改成 PyTorchEngineConfig,就可以用 ZeroGPU 了。 - gradio 版本问题,目前不支持 4.0.0 以上版本,可以在 `app.py` 中修改,类似: ```python import os diff --git a/docs/zh_cn/serving/restful_api.md b/docs/zh_cn/serving/restful_api.md index bb0e4da12d..76b76a0a63 100644 --- a/docs/zh_cn/serving/restful_api.md +++ b/docs/zh_cn/serving/restful_api.md @@ -1,31 +1,29 @@ -# Restful API +# 部署类 openai 服务 -## 启动服务 +本文主要介绍单个模型在单机多卡环境下,部署兼容 openai 接口服务的方式,以及服务接口的用法。为行文方便,我们把该服务名称为 `api_server`。对于多模型的并行服务,请阅读[请求分发服务器](./proxy_server.md)一文。 -用户将下面命令输出的 http url 复制到浏览器打开,详细查看所有的 API 及其使用方法。 -请一定查看`http://{server_ip}:{server_port}`!!! -请一定查看`http://{server_ip}:{server_port}`!!! -请一定查看`http://{server_ip}:{server_port}`!!! 
-重要的事情说三遍。 +在这篇文章中, 我们首先介绍服务启动的两种方法,你可以根据应用场景,选择合适的。 -```shell -lmdeploy serve api_server ./workspace --server-name 0.0.0.0 --server-port ${server_port} --tp 1 -``` +其次,我们重点介绍服务的 RESTful API 定义,以及接口使用的方式,并展示如何通过 Swagger UI、LMDeploy CLI 工具体验服务功能 -api_server 启动时支持的参数可以通过命令行`lmdeploy serve api_server -h`查看。 +最后,向大家演示把服务接入到 WebUI 的方式,你可以参考它简单搭建一个演示 demo。 -我们提供的 restful api,其中三个仿照 OpenAI 的形式。 +## 启动服务 -- /v1/chat/completions -- /v1/models -- /v1/completions +以 huggingface hub 上的 [internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) 模型为例,你可以任选以下方式之一,启动推理服务。 -不过,我们建议用户用我们提供的另一个 API: `/v1/chat/interactive`。 -它有更好的性能,提供更多的参数让用户自定义修改。 +### 方式一:使用 lmdeploy cli 工具 -## 用 docker 部署 http 服务 +```shell +lmdeploy serve api_server internlm/internlm2-chat-7b --server-port 23333 +``` + +api_server 启动时的参数可以通过命令行`lmdeploy serve api_server -h`查看。 +比如,`--tp` 设置张量并行,`--session-len` 设置推理的最大上下文窗口长度,`--cache-max-entry-count` 调整 k/v cache 的内存使用比例等等。 -LMDeploy 提供了官方[镜像](https://hub.docker.com/r/openmmlab/lmdeploy/tags)。使用这个镜像,可以运行兼容 OpenAI 的服务。下面是使用示例: +### 方式二:使用 docker + +使用 LMDeploy 官方[镜像](https://hub.docker.com/r/openmmlab/lmdeploy/tags),可以运行兼容 OpenAI 的服务。下面是使用示例: ```shell docker run --runtime nvidia --gpus all \ @@ -37,16 +35,64 @@ docker run --runtime nvidia --gpus all \ lmdeploy serve api_server internlm/internlm2-chat-7b ``` -然后像上面一样使用浏览器试用 Swagger UI 即可。 +在这个例子中,`lmdeploy server api_server` 的命令参数与方式一一致。 + +## RESTful API + +LMDeploy 的 RESTful API 兼容了 OpenAI 以下 3 个接口: + +- /v1/chat/completions +- /v1/models +- /v1/completions + +此外,LMDeploy 还定义了 `/v1/chat/interactive`,用来支持交互式推理。交互式推理的特点是不用像`v1/chat/completions`传入用户对话历史,因为对话历史会被缓存在服务端。 +这种方式在多轮次的长序列推理时,拥有很好的性能。 + +服务启动后,你可以在浏览器中打开网页 http://0.0.0.0:23333,通过 Swagger UI 查看接口的详细说明,并且也可以直接在网页上操作,体验每个接口的用法,如下图所示。 + +![swagger_ui](https://github.com/InternLM/lmdeploy/assets/4560679/b891dd90-3ffa-4333-92b2-fb29dffa1459) + +也可以使用 LMDeploy 自带的 CLI 工具,在控制台验证服务的正确性。 + +```shell +# restful_api_url is what printed in api_server.py, e.g. 
http://localhost:23333 +lmdeploy serve api_client ${api_server_url} +``` + +若需要把服务集成到自己的项目或者产品中,我们推荐以下用法: + +### 使用 openai 接口 + +以下代码是通过 openai 包使用 `v1/chat/completions` 服务的例子。运行之前,请先安装 openai 包: `pip install openai`。 + +```python +from openai import OpenAI +client = OpenAI( + api_key='YOUR_API_KEY', + base_url="http://0.0.0.0:23333/v1" +) + +response = client.chat.completions.create( + model="internlm2-chat-7b", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": " provide three suggestions about time management"}, + ], + temperature=0.8, + top_p=0.8 +) +print(response) +``` + +关于其他 openai 接口的调用,也可以如法炮制。详情请参考 openai 官方[文档](https://platform.openai.com/docs/guides/text-generation) -## python +### 使用 lmdeploy `APIClient` 接口 -我们将这些服务的客户端功能集成在 `APIClient` 类中。下面是一些例子,展示如何在客户端调用 `api_server` 服务。 如果你想用 `/v1/chat/completions` 接口,你可以尝试下面代码: ```python from lmdeploy.serve.openai.api_client import APIClient -api_client = APIClient('http://{server_ip}:{server_port}') +api_client = APIClient(f'http://{server_ip}:{server_port}') model_name = api_client.available_models[0] messages = [{"role": "user", "content": "Say this is a test!"}] for item in api_client.chat_completions_v1(model=model_name, messages=messages): @@ -57,28 +103,35 @@ for item in api_client.chat_completions_v1(model=model_name, messages=messages): ```python from lmdeploy.serve.openai.api_client import APIClient -api_client = APIClient('http://{server_ip}:{server_port}') +api_client = APIClient(f'http://{server_ip}:{server_port}') model_name = api_client.available_models[0] for item in api_client.completions_v1(model=model_name, prompt='hi'): print(item) ``` -LMDeploy 的 `/v1/chat/interactive` api 支持将对话内容管理在服务端,但是我们默认关闭。如果想尝试,请阅读以下介绍: +关于 `/v1/chat/interactive` 接口,我们默认是关闭的。在使用时,请设置`interactive_mode = True`打开它。否则,它会退化为 openai 接口。 -- 交互模式下,对话历史保存在 server。在一次完整的多轮对话中,所有请求设置`interactive_mode = True`, `session_id`保持相同 (不为 -1,这是缺省值)。 -- 非交互模式下,server 不保存历史记录。 - -交互模式可以通过 `interactive_mode` 布尔量参数控制。下面是一个普通模式的例子, -如果要体验交互模式,将 `interactive_mode=True` 传入即可。 +在交互式推理中,每个对话序列的 id 必须唯一,所有属于该独立的对话请求,必须使用相同的 id。这里的 id 对应与接口中的 `session_id`。 +比如,一个对话序列中,有 10 轮对话请求,那么每轮对话请求中的 `session_id` 都要相同。 ```python from lmdeploy.serve.openai.api_client import APIClient -api_client = APIClient('http://{server_ip}:{server_port}') -for item in api_client.chat_interactive_v1(prompt='hi'): - print(item) +api_client = APIClient(f'http://{server_ip}:{server_port}') +messages = [ + "hi, what's your name?", + "who developed you?", + "Tell me more about your developers", + "Summarize the information we've talked so far" +] +for message in messages: + for item in api_client.chat_interactive_v1(prompt=message, + session_id=1, + interactive_mode=True, + stream=False): + print(item) ``` -## Java/Golang/Rust +### 使用 Java/Golang/Rust 可以使用代码生成工具 [openapi-generator-cli](https://github.com/OpenAPITools/openapi-generator-cli) 将 `http://{server_ip}:{server_port}/openapi.json` 转成 java/rust/golang 客户端。 下面是一个使用示例: @@ -97,29 +150,17 @@ rust/src: apis lib.rs models ``` -## cURL +### 使用 cURL cURL 也可以用于查看 API 的输出结果 -查看模型列表: +- 查看模型列表 `v1/models` ```bash curl http://{server_ip}:{server_port}/v1/models ``` -Interactive Chat: - -```bash -curl http://{server_ip}:{server_port}/v1/chat/interactive \ - -H "Content-Type: application/json" \ - -d '{ - "prompt": "Hello! 
How are you?", - "session_id": 1, - "interactive_mode": true - }' -``` - -Chat Completions: +- 对话 `v1/chat/completions` ```bash curl http://{server_ip}:{server_port}/v1/chat/completions \ @@ -130,7 +171,7 @@ curl http://{server_ip}:{server_port}/v1/chat/completions \ }' ``` -Text Completions: +- 文本补全 `v1/completions` ```shell curl http://{server_ip}:{server_port}/v1/completions \ @@ -141,18 +182,23 @@ curl http://{server_ip}:{server_port}/v1/completions \ }' ``` -## CLI client +- 交互式对话 `v1/chat/interactive` -restful api 服务可以通过客户端测试,例如 - -```shell -# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 -lmdeploy serve api_client api_server_url +```bash +curl http://{server_ip}:{server_port}/v1/chat/interactive \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "Hello! How are you?", + "session_id": 1, + "interactive_mode": true + }' ``` -## webui through gradio +## 接入 WebUI + +LMDeploy 提供 gradio 和 [OpenAOE](https://github.com/InternLM/OpenAOE) 两种方式,为 api_server 接入 WebUI。 -也可以直接用 webui 测试使用 restful-api。 +### 方式一:通过 gradio 接入 ```shell # api_server_url 就是 api_server 产生的,比如 http://localhost:23333 @@ -161,9 +207,7 @@ lmdeploy serve api_client api_server_url lmdeploy serve gradio api_server_url --server-name ${gradio_ui_ip} --server-port ${gradio_ui_port} ``` -## webui through OpenAOE - -可以使用 [OpenAOE](https://github.com/InternLM/OpenAOE) 无缝接入restful api服务. +### 方式二:通过 OpenAOE 接入 ```shell pip install -U openaoe @@ -185,7 +229,3 @@ openaoe -f /path/to/your/config-template.yaml 5. 如需调整会话默认的其他参数,比如 system 等字段的内容,可以直接将[对话模板](https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/model.py)初始化参数传入。比如 internlm-chat-7b 模型,可以通过启动`api_server`时,设置`--meta-instruction`参数。 6. 关于停止符,我们只支持编码后为单个 index 的字符。此外,可能存在多种 index 都会解码出带有停止符的结果。对于这种情况,如果这些 index 数量太多,我们只会采用 tokenizer 编码出的 index。而如果你想要编码后为多个 index 的停止符,可以考虑在流式客户端做字符串匹配,匹配成功后跳出流式循环即可。 - -## 多机并行服务 - -请参考我们的 [请求分发服务器](./proxy_server.md) diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 2ecd1cdc96..97ea2d9939 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -1052,6 +1052,11 @@ def serve(model_path: str, VariableInterface.qos_engine.start() except FileNotFoundError: VariableInterface.qos_engine = None + else: + # hide qos functions if not applied + for i in range(len(app.router.routes)): + if 'qos' in app.router.routes[i].path: + app.router.routes[i].include_in_schema = False for i in range(3): print( diff --git a/lmdeploy/version.py b/lmdeploy/version.py index d45eba82e4..277e7502ec 100644 --- a/lmdeploy/version.py +++ b/lmdeploy/version.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from typing import Tuple -__version__ = '0.2.4' +__version__ = '0.2.5' short_version = __version__
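A brief note on the `lmdeploy/serve/openai/api_server.py` hunk above: when no QoS config is supplied, the QoS routes are flagged `include_in_schema = False`, so they no longer appear in the generated OpenAPI schema or the Swagger UI. A minimal FastAPI analogue of that pattern is sketched below; it is illustrative only, and the route paths are hypothetical rather than the ones lmdeploy actually registers.

```python
# Standalone illustration of hiding selected routes from the OpenAPI schema.
from fastapi import FastAPI

app = FastAPI()


@app.post('/v1/chat/completions')
def chat_completions():
    return {}


@app.post('/v1/chat/completions_qos')  # hypothetical QoS route path
def chat_completions_qos():
    return {}


qos_enabled = False  # in api_server.py this depends on the qos config file
if not qos_enabled:
    for route in app.router.routes:
        if 'qos' in getattr(route, 'path', ''):
            # The route stays registered; it is only hidden from /openapi.json.
            route.include_in_schema = False
```

Since FastAPI builds the schema lazily on the first request to `/openapi.json`, flipping the flag at startup is sufficient.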