diff --git a/.github/scripts/eval_chat_config.py b/.github/scripts/eval_chat_config.py
index 81872c0dd1..89ad20a533 100644
--- a/.github/scripts/eval_chat_config.py
+++ b/.github/scripts/eval_chat_config.py
@@ -174,7 +174,6 @@
         max_out_len=MAX_NEW_TOKENS,
         max_seq_len=MAX_SESSION_LEN,
         batch_size=128,
-        concurrency=128,
         meta_template=llama2_meta_template,
         run_cfg=dict(num_gpus=1),
         end_str='[INST]')
diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml
index 84fcaf5034..f03bbf4a50 100644
--- a/.github/workflows/daily_ete_test.yml
+++ b/.github/workflows/daily_ete_test.yml
@@ -199,9 +199,9 @@
           chmod -R 777 $workdir
 
   test_tools:
-    needs: test_quantization
     if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}}
     runs-on: [self-hosted, linux-a100]
+    needs: test_quantization
     timeout-minutes: 150
     strategy:
       fail-fast: false
diff --git a/.github/workflows/daily_ete_test_v100.yml b/.github/workflows/daily_ete_test_v100.yml
new file mode 100644
index 0000000000..8b32bab1f7
--- /dev/null
+++ b/.github/workflows/daily_ete_test_v100.yml
@@ -0,0 +1,667 @@
+name: daily_ete_test_v100
+
+on:
+  workflow_dispatch:
+    inputs:
+      repo_org:
+        required: false
+        description: 'Tested repository organization name. Default is InternLM'
+        type: string
+        default: 'InternLM/lmdeploy'
+      repo_ref:
+        required: false
+        description: 'Set branch or tag or commit id. Default is "main"'
+        type: string
+        default: 'main'
+      backend:
+        required: true
+        description: 'Set backend testcase filter: turbomind, pytorch or turbomind_vl. Default is "["turbomind", "pytorch", "turbomind_vl"]"'
+        type: string
+        default: "['turbomind', 'pytorch', 'turbomind_vl']"
+      model:
+        required: true
+        description: 'Set testcase module filter: chat, restful, pipeline, quantization. Default contains all models'
+        type: string
+        default: "['pipeline','restful','chat']"
+      offline_mode:
+        required: true
+        description: 'Whether to start in offline mode. If true, you should prepare the code and whl package yourself'
+        type: boolean
+        default: false
+      regression_func:
+        required: true
+        description: 'regression functions'
+        type: string
+        default: "['quant', 'tools','restful','pipeline','benchmark','evaluation']"
+  schedule:
+    - cron: '00 16 * * 0-4'
+
+env:
+  HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
+  HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
+  OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }}
+  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
+  REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }}
+  COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy
+  FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}}
+  TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy
+  OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy
+  OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt
+
+jobs:
+  linux-build:
+    if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}}
+    strategy:
+      matrix:
+        pyver: [py310]
+    runs-on: ubuntu-latest
+    env:
+      PYTHON_VERSION: ${{ matrix.pyver }}
+      PLAT_NAME: manylinux2014_x86_64
+      DOCKER_TAG: cuda12.1
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
+      - name: Build
+        run: |
+          echo ${PYTHON_VERSION}
+          echo ${PLAT_NAME}
+          echo ${DOCKER_TAG}
+          echo ${OUTPUT_FOLDER}
+          echo ${GITHUB_RUN_ID}
+          # remove -it
+          sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh
+          bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER}
+      - name: Upload Artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          if-no-files-found: error
+          path: builder/manywheel/${{ env.OUTPUT_FOLDER }}
+          retention-days: 1
+          name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }}
+
+
+  download_pkgs:
+    needs: linux-build
+    if: ${{!cancelled()}}
+    runs-on: linux-v100
+    timeout-minutes: 50
+    container:
+      image: openmmlab/lmdeploy:latest-cu12
+      options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
+      volumes:
+        - /nvme/qa_test_models:/nvme/qa_test_models
+        - /mnt/187:/mnt/187
+        - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
+    steps:
+      - name: Clone repository
+        uses: actions/checkout@v2
+        if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
+      - name: Copy repository
+        if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
+        run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}}
+      - name: Copy repository - offline
+        if: ${{inputs.offline_mode}}
+        run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/.
${{env.TEST_CODE_PATH}} + - name: Change testconfig on v100 + run: | + mv ${{env.TEST_CODE_PATH}}/autotest/config-v100.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + + test_quantization: + needs: download_pkgs + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} + runs-on: linux-v100 + timeout-minutes: 180 + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /root/modelscope_hub + MODELSCOPE_MODULES_CACHE: /root/modelscope_modules + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/modelscope_hub:/root/modelscope_hub + - /nvme/github-actions/modelscope_modules:/root/modelscope_modules + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /nvme/qa_test_models/lmdeploy/autotest:/local_case + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - quantization w4a16 + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') + run: | + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - convert + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') + run: | + pytest autotest/tools/convert -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_tools: + needs: test_quantization + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + runs-on: linux-v100 + timeout-minutes: 240 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend) || fromJSON('["turbomind", "pytorch", "turbomind_vl"]')}} + model: ${{ fromJSON(inputs.model) || fromJSON('["pipeline","restful","chat"]')}} + exclude: + - backend: turbomind_vl + model: chat + include: + - backend: turbomind + model: local_case + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /root/modelscope_hub + MODELSCOPE_MODULES_CACHE: /root/modelscope_modules + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/modelscope_hub:/root/modelscope_hub + - /nvme/github-actions/modelscope_modules:/root/modelscope_modules + - /nvme/github-actions/resources/lora:/root/lora + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /nvme/qa_test_models/lmdeploy/autotest:/local_case + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + cp -r /root/lora . + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - chat workspace + continue-on-error: true + if: matrix.backend == 'turbomind' && matrix.model == 'chat' + run: | + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - chat + continue-on-error: true + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'chat' + run: | + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - pipeline + continue-on-error: true + if: matrix.model == 'pipeline' + run: | + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - restful + continue-on-error: true + if: matrix.model == 'restful' + run: | + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - restful workspace + continue-on-error: true + if: matrix.backend == 'turbomind' && matrix.model == 'restful' + run: | + pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage 
${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - local testcase + if: matrix.backend == 'turbomind' && matrix.model == 'local_case' + run: | + pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}}|| true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_restful: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + runs-on: linux-v100 + needs: test_quantization + strategy: + fail-fast: false + matrix: + backend: ['turbomind', 'pytorch'] + timeout-minutes: 120 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Start restful api turbomind + if: matrix.backend == 'turbomind' + run: | + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm2_5-20b-chat --tp 2 > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 600s + - name: Start restful api pytorch + if: matrix.backend == 'pytorch' + run: | + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm2_5-20b-chat --tp 2 --backend pytorch --dtype float16 > restful.log 2>&1 & + echo "restful_pid=$!" 
>> "$GITHUB_ENV" + sleep 600s + - name: Test lmdeploy - restful api + timeout-minutes: 75 + run: | + pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}}' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Kill api server + if: always() + run: | + kill -15 "$restful_pid" + - name: Start restful api turbomind - base + if: matrix.backend == 'turbomind' + run: | + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm2_5-20b --tp 2 > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 600s + - name: Start restful api pytorch - base + if: matrix.backend == 'pytorch' + run: | + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm2_5-20b --tp 2 --backend pytorch --dtype float16 > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 600s + - name: Test lmdeploy - restful api - base + timeout-minutes: 40 + run: | + pytest autotest/interface/restful/test_restful_completions_v1.py -n 20 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Kill api server + if: always() + run: | + kill -15 "$restful_pid" + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_pipeline: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'pipeline'))}} + runs-on: linux-v100 + needs: test_quantization + timeout-minutes: 240 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - interface pipeline case + run: | + pytest autotest/interface/pipeline/test_pipeline_func.py -m 'not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + + test_benchmark: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} + runs-on: linux-v100 + needs: test_quantization + timeout-minutes: 120 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test benchmark script + run: | + pytest autotest/benchmark -n 4 --run_id ${{ github.run_id }} -m function ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + chmod -R 777 /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_evaluation: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} + runs-on: linux-v100 + needs: test_quantization + timeout-minutes: 120 + strategy: + fail-fast: false + matrix: + evaluate_type: ['chat', 'base'] + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources:/root/resources + - /nvme/github-actions/opencompass-data:/root/opencompass-data + - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Install opencompass + run: | + git clone --depth=1 https://github.com/open-compass/opencompass.git + cd opencompass + python3 -m pip install -e . 
+ echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Setup paths for evaluation + run: | + ln -s /root/opencompass-data ./data + python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models . + - name: Evaluate models + if: matrix.evaluate_type == 'chat' + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_4bits, turbomind_internlm2_5_7b_chat_kvint4, pt_internlm2_5_7b_chat, turbomind_internlm2_5_20b_chat, turbomind_internlm2_5_20b_chat_4bits, turbomind_internlm2_5_20b_chat_kvint4, pt_internlm2_5_20b_chat, turbomind_llama_3d1_8b_instruct, pt_llama_3d1_8b_instruct, turbomind_llama_3d1_8b_instruct_4bits, turbomind_llama_3d1_8b_instruct_kvint4, turbomind_qwen2_7b_instruct, turbomind_qwen2_7b_instruct_4bits, pt_qwen1_5_moe_2_7b_chat, pt_gemma_2_9b_it]" "[*race_datasets, *gsm8k_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true + - name: Evaluate base models + if: matrix.evaluate_type == 'base' + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]" "[*mmlu_datasets, *gsm8k_datasets]" /root/evaluation-reports/${{ github.run_id }} base true + - name: Clear workspace + if: always() + run: | + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + + get_benchmark_result: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} + needs: [test_benchmark] + timeout-minutes: 5 + runs-on: linux-v100 + env: + BENCHMARK_REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} + steps: + - name: Clone repository + uses: actions/checkout@v3 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Get overview + run: | + pip install pandas fire mmengine + python3 .github/scripts/action_tools.py generate_benchmark_report $BENCHMARK_REPORT_DIR + + + get_coverage_report: + if: ${{!cancelled()}} + runs-on: linux-v100 + needs: [test_tools, test_restful, test_pipeline, test_benchmark] + timeout-minutes: 5 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . 
+ - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Get coverage report + run: | + pip install coverage + coverage combine ${{env.REPORT_DIR}} + coverage xml -o ${{env.REPORT_DIR}}/coverage.xml + coverage report -m + mv .coverage ${{env.REPORT_DIR}}/.coverage + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + notify_to_feishu: + if: always() && !cancelled() && (github.ref_name == 'develop' || github.ref_name == 'main') + needs: [get_benchmark_result, get_coverage_report, test_evaluation] + timeout-minutes: 5 + runs-on: linux-v100 + steps: + - name: notify + if: contains(needs.*.result, 'failure') + run: | + curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Lmdeploy- Daily test finished!!!","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.FEISHU_USER_ID }}'"}]]}}}}' ${{ secrets.FEISHU_WEBHOOK_URL }} diff --git a/autotest/config-v100.yaml b/autotest/config-v100.yaml new file mode 100644 index 0000000000..172667ec0c --- /dev/null +++ b/autotest/config-v100.yaml @@ -0,0 +1,131 @@ +model_path: /nvme/qa_test_models +dst_path: /nvme/qa_test_models/autotest_model +log_path: /nvme/qa_test_models/autotest_model/log +benchmark_path: /nvme/qa_test_models/benchmark-reports +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json + +tp_config: + internlm-chat-20b: 2 + internlm2-chat-20b: 2 + Baichuan2-13B-Chat: 2 + Mixtral-8x7B-Instruct-v0.1: 2 + Qwen-VL-Chat: 2 + llava-v1.5-13b: 2 + internlm2_5-20b-chat: 2 + internlm2_5-20b: 2 + Meta-Llama-3-1-70B-Instruct: 4 + internlm2_5-7b-chat-1m: 4 + Qwen2-7B-Instruct-GPTQ-Int4: 2 + InternVL2-26B: 2 + InternVL2-40B: 2 + MiniCPM-V-2_6: 2 + +turbomind_chat_model: + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-1-8B-Instruct-AWQ + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-8B-Instruct-inner-4bits + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat + - internlm/internlm-xcomposer2d5-7b + - OpenGVLab/InternVL2-2B + - OpenGVLab/InternVL2-8B + - OpenGVLab/InternVL2-26B + - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 + - Qwen/Qwen2-7B-Instruct-AWQ + - Qwen/Qwen2-1.5B-Instruct + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 + - mistralai/Mistral-7B-Instruct-v0.3 + - THUDM/glm-4-9b-chat + + +pytorch_chat_model: + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-8B-Instruct + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat + - OpenGVLab/InternVL2-2B + - OpenGVLab/InternVL2-4B + - OpenGVLab/InternVL2-8B + - OpenGVLab/InternVL2-26B + - Qwen/Qwen2-1.5B-Instruct + - Qwen/Qwen1.5-MoE-A2.7B-Chat + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - google/gemma-2-9b-it + - mistralai/Mistral-7B-Instruct-v0.2 + - THUDM/glm-4v-9b + - THUDM/glm-4-9b-chat + - microsoft/Phi-3-mini-4k-instruct + - deepseek-ai/DeepSeek-V2-Lite-Chat + +turbomind_base_model: + - internlm/internlm2_5-7b + - internlm/internlm2_5-20b + +pytorch_base_model: + - internlm/internlm2_5-7b + - internlm/internlm2_5-20b + +vl_model: + - OpenGVLab/InternVL2-2B + - 
OpenGVLab/InternVL2-4B + - OpenGVLab/InternVL2-8B + - OpenGVLab/InternVL2-26B + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - internlm/internlm-xcomposer2d5-7b + - THUDM/glm-4v-9b + - microsoft/Phi-3-mini-4k-instruct + +turbomind_quatization: + no_awq: + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-8B-Instruct + - internlm/internlm-xcomposer2d5-7b + - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 + - mistralai/Mistral-7B-Instruct-v0.3 + - THUDM/glm-4-9b-chat + gptq: + - internlm/internlm2_5-7b-chat + no_kvint4: + - openbmb/MiniCPM-V-2_6 + no_kvint8: + - deepseek-ai/DeepSeek-V2-Lite-Chat + +pytorch_quatization: + awq: + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat + - Qwen/Qwen2-1.5B-Instruct + w8a8: + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-7b + no_kvint4: + - OpenGVLab/InternVL2-4B + - deepseek-ai/DeepSeek-V2-Lite-Chat + - microsoft/Phi-3-mini-4k-instruct + - microsoft/Phi-3-vision-128k-instruct + no_kvint8: + - deepseek-ai/DeepSeek-V2-Lite-Chat + + +longtext_model: + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-70B-Instruct + - internlm/internlm2_5-7b-chat-1m + - internlm/internlm2-chat-20b + +benchmark_model: + - meta-llama/Llama-2-7b-chat-hf + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-70B-Instruct + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat + - THUDM/glm-4-9b-chat + - mistralai/Mistral-7B-Instruct-v0.3 + - mistralai/Mixtral-8x7B-Instruct-v0.1 + - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/config.yaml b/autotest/config.yaml index 4e4b20f206..46b9bd9ce1 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -48,6 +48,7 @@ turbomind_chat_model: - mistralai/Mistral-7B-Instruct-v0.1 - mistralai/Mistral-7B-Instruct-v0.2 - mistralai/Mistral-7B-Instruct-v0.3 + - mistralai/Mixtral-8x7B-Instruct-v0.1 - lmdeploy/llama2-chat-7b-w4 - baichuan-inc/Baichuan2-7B-Chat - 01-ai/Yi-6B-Chat @@ -90,7 +91,6 @@ pytorch_chat_model: - mistralai/Mistral-7B-Instruct-v0.1 - mistralai/Mistral-7B-Instruct-v0.2 - mistralai/Mixtral-8x7B-Instruct-v0.1 - - mistralai/Mixtral-8x7B-Instruct-v0.1 - google/gemma-7b-it - google/gemma-2-9b-it - deepseek-ai/deepseek-moe-16b-chat diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index 096918b6b1..bd33ed33a0 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ b/autotest/interface/pipeline/test_pipeline_func.py @@ -15,6 +15,14 @@ from lmdeploy import (GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline) +from lmdeploy.utils import is_bf16_supported + + +def init_pipeline(model_path, backend_config): + if not is_bf16_supported() and isinstance(backend_config, + PytorchEngineConfig): + backend_config.dtype = 'float16' + return pipeline(model_path, backend_config=backend_config) @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @@ -26,7 +34,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = pipe('Hi, pls intro yourself') result, msg = assert_pipeline_single_return(response) save_pipeline_common_log(config, file_name, result, response, msg) @@ -56,7 +64,7 @@ def run_pipeline_testcase(config, 
model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = [] for item in pipe.stream_infer('Hi, pls intro yourself'): response.append(item) @@ -88,7 +96,7 @@ def run_pipeline_testcase_with_prompt(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = pipe(['Hi, pls intro yourself', 'Shanghai is']) result, msg = assert_pipeline_batch_return(response, 2) save_pipeline_common_log(config, file_name, result, response, msg) @@ -118,7 +126,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = [] for item in pipe.stream_infer(['Pls intro yourself', 'Shanghai is']): response.append(item) @@ -149,7 +157,7 @@ def test_return_with_message(config, model, backend, worker_id): def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) prompts = [[{'role': 'user', 'content': 'Hi, pls intro yourself'}]] response = pipe(prompts) print(response) @@ -180,7 +188,7 @@ def test_return_with_message_stream(config, model, backend, worker_id): def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) prompts = [[{'role': 'user', 'content': 'Hi, pls intro yourself'}]] response = [] for item in pipe.stream_infer(prompts): @@ -212,7 +220,7 @@ def test_return_with_message_batch(config, model, backend, worker_id): def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) prompts = [[{ 'role': 'user', 'content': 'Hi, pls intro yourself' @@ -249,7 +257,7 @@ def test_return_with_message_batch_stream(config, model, backend, worker_id): def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) prompts = [[{ 'role': 'user', 'content': 'Hi, pls intro yourself' @@ -287,7 +295,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) gen_config = GenerationConfig(logprobs=10, max_new_tokens=5, top_k=40, @@ -320,7 +328,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = 
'/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) gen_config = GenerationConfig(logprobs=10, max_new_tokens=5, top_k=40, @@ -358,7 +366,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(session_len=10, tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = pipe(['Hi, pls intro yourself', 'Shanghai is']) result = True @@ -392,7 +400,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test min_new_tokens gen_config = GenerationConfig(min_new_tokens=200, ignore_eos=True) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -428,7 +436,7 @@ def run_pipeline_testcase_stop_words(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test stop_words gen_config = GenerationConfig(stop_words=[' and', '浦', ' to']) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -467,7 +475,7 @@ def run_pipeline_testcase_bad_words(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test bad_words gen_config = GenerationConfig(bad_words=[' and', '浦', ' to']) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -502,7 +510,7 @@ def test_gen_config_special_words_false(config, model, backend, worker_id): def run_pipeline_testcase_special_words(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test special_words prompt = '<|im_start|>system\n当开启工具以及代码时,根据需求选择合适的工具进行调用\n' + \ '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' + \ @@ -543,7 +551,7 @@ def test_gen_config_special_words_true(config, model, backend, worker_id): def run_pipeline_testcase_special_words(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test special_words prompt = '<|im_start|>system\n当开启工具以及代码时,根据需求选择合适的工具进行调用\n' + \ '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' + \ @@ -587,7 +595,7 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test repetition_penalty gen_config = GenerationConfig(repetition_penalty=0.01, random_seed=1, @@ -626,7 +634,7 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, 
model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test repetition_penalty gen_config = GenerationConfig(repetition_penalty=1.2, random_seed=1) response = pipe('Shanghai is', gen_config=gen_config) @@ -658,7 +666,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test repetition_penalty gen_config = GenerationConfig(top_p=0.1, random_seed=1) response = pipe('Shanghai is', gen_config=gen_config) @@ -690,7 +698,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test repetition_penalty gen_config = GenerationConfig(top_k=1, max_new_tokens=20, @@ -727,7 +735,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response_list = [] for i in range(3): gen_config = GenerationConfig(random_seed=i, @@ -764,7 +772,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) gen_config = GenerationConfig(random_seed=1, top_k=40, do_sample=True) response_list = [] for i in range(3): @@ -798,7 +806,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) gen_config = GenerationConfig(temperature=1.0, top_k=40, do_sample=True) @@ -833,7 +841,7 @@ def run_pipeline_testcase_max_new_tokens(config, model, backend, model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test max_new_tokens gen_config = GenerationConfig(max_new_tokens=5) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -870,7 +878,7 @@ def run_pipeline_testcase_ignore_eos(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test max_new_tokens with ignore_eos gen_config = GenerationConfig(ignore_eos=True, max_new_tokens=256) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -907,7 +915,7 @@ def test_backend_config_input_validation(config, model, backend, worker_id): tp_num=2) model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, 
backend_config=backend_config) with pytest.raises(AssertionError): gen_config = GenerationConfig(top_p=0) pipe('Shanghai is', gen_config=gen_config) @@ -1018,7 +1026,7 @@ def test_backend_config_tp(config, model, backend, worker_id): worker_id, tp_num=2) model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=100) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) del pipe torch.cuda.empty_cache() if 'gw' in worker_id: diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py index 0ea643524f..c80dbe0dfc 100644 --- a/autotest/utils/benchmark_utils.py +++ b/autotest/utils/benchmark_utils.py @@ -7,6 +7,8 @@ from utils.config_utils import get_workerid from utils.run_restful_chat import health_check +from lmdeploy.utils import is_bf16_supported + DEFAULT_PORT = 23333 GENERATION_CONFIG = ' -c 8 256 -ct 128 128 2048 128 -pt 1 128 128 2048' GENERATION_LONGTEXT_CONFIG = ' -c 1 --session-len 200000 -ct 1024 -pt 198000' @@ -40,6 +42,8 @@ def generation_test(config, run_config = '' if backend == 'pytorch': command += ' --backend pytorch' + if not is_bf16_supported(): + command += ' --dtype float16' else: if '4bit' in model: command += ' --model-format awq' @@ -105,6 +109,8 @@ def throughput_test(config, run_config = '--num-prompts 3000' if backend == 'pytorch': command += ' --backend pytorch' + if not is_bf16_supported(): + command += ' --dtype float16' else: if '4bit' in model: command += ' --model-format awq' diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index ca041dc9a1..8aa5f933fb 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -4,6 +4,8 @@ import yaml from utils.get_run_config import get_tp_num +from lmdeploy.utils import is_bf16_supported + def get_turbomind_model_list(tp_num: int = None, model_type: str = 'chat_model', @@ -85,14 +87,16 @@ def get_torch_model_list(tp_num: int = None, def get_all_model_list(tp_num: int = None, quant_policy: int = None, model_type: str = 'chat_model'): + case_list = get_turbomind_model_list(tp_num=tp_num, model_type=model_type, quant_policy=quant_policy) - for case in get_torch_model_list(tp_num=tp_num, - quant_policy=quant_policy, - model_type=model_type): - if case not in case_list: - case_list.append(case) + if is_bf16_supported(): + for case in get_torch_model_list(tp_num=tp_num, + quant_policy=quant_policy, + model_type=model_type): + if case not in case_list: + case_list.append(case) return [x for x in case_list if 'w8a8' not in x] diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 33d65448ab..1ab34b23d5 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -10,6 +10,7 @@ from lmdeploy import pipeline from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig +from lmdeploy.utils import is_bf16_supported from lmdeploy.vl import load_image from lmdeploy.vl.constants import IMAGE_TOKEN @@ -32,6 +33,8 @@ def run_pipeline_chat_test(config, if 'pytorch' in type: backend_config = PytorchEngineConfig(tp=tp) + if not is_bf16_supported(): + backend_config.dtype = 'float16' else: backend_config = TurbomindEngineConfig(tp=tp) @@ -292,6 +295,10 @@ def run_pipeline_vl_chat_test(config, model_case, quant_policy: int = None): backend_config.model_format = 'awq' if quant_policy is not None: backend_config.quant_policy = quant_policy + + if not is_bf16_supported(): + 
backend_config.cache_max_entry_count = 0.5 + backend_config.dtype = 'float16' pipe = pipeline(hf_path, backend_config=backend_config) pipeline_chat_log = os.path.join( diff --git a/autotest/utils/quantization_utils.py b/autotest/utils/quantization_utils.py index 75b7319aeb..752168958a 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -2,6 +2,8 @@ import subprocess from subprocess import PIPE +from lmdeploy.utils import is_bf16_supported + def quantization(config, quantization_model_name, @@ -21,17 +23,17 @@ def quantization(config, if quantization_type == 'awq': quantization_cmd = ' '.join([ cuda_prefix, 'lmdeploy lite auto_awq', origin_model_path, - '--work-dir', quantization_model_path, '--batch-size 32' + '--work-dir', quantization_model_path ]) elif quantization_type == 'gptq': quantization_cmd = ' '.join([ cuda_prefix, 'lmdeploy lite auto_gptq', origin_model_path, - '--work-dir', quantization_model_path, '--batch-size 32' + '--work-dir', quantization_model_path ]) elif quantization_type == 'w8a8': quantization_cmd = ' '.join([ cuda_prefix, 'lmdeploy lite smooth_quant', origin_model_path, - '--work-dir', quantization_model_path, '--batch-size 32' + '--work-dir', quantization_model_path ]) else: return False, 'quantization type should in [awq, gptq, w8a8], \ @@ -40,6 +42,11 @@ def quantization(config, if 'llama-3' in origin_model_name.lower(): quantization_cmd += ' --search-scale True' + if not is_bf16_supported(): + quantization_cmd += ' --batch-size 8' + else: + quantization_cmd += ' --batch-size 32' + with open(quantization_log, 'w') as f: # remove existing folder subprocess.run([' '.join(['rm -rf', quantization_model_path])], diff --git a/autotest/utils/run_client_chat.py b/autotest/utils/run_client_chat.py index edc2268e30..529bf4a6a0 100644 --- a/autotest/utils/run_client_chat.py +++ b/autotest/utils/run_client_chat.py @@ -4,6 +4,8 @@ from utils.get_run_config import get_command_with_extra, get_model_name from utils.rule_condition_assert import assert_result +from lmdeploy.utils import is_bf16_supported + TEMPLATE = 'autotest/template.json' @@ -63,6 +65,9 @@ def hf_command_line_test(config, need_tp=True, cuda_prefix=cuda_prefix) + if type == 'pytorch': + if not is_bf16_supported(): + cmd += ' --dtype float16' if type == 'turbomind': if ('w4' in model_case or ('4bits' in model_case or 'awq' in model_case.lower())): diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 1eb84f1d93..c567db4d00 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -13,6 +13,7 @@ from utils.run_client_chat import command_line_test from lmdeploy.serve.openai.api_client import APIClient +from lmdeploy.utils import is_bf16_supported BASE_HTTP_URL = 'http://localhost' DEFAULT_PORT = 23333 @@ -60,12 +61,17 @@ def start_restful_api(config, param, model, model_path, backend_type, cmd += ' --model-format gptq' if backend_type == 'pytorch': cmd += ' --backend pytorch' + if not is_bf16_supported(): + cmd += ' --dtype float16' if 'llava' in model: cmd += ' --model-name vicuna' if 'quant_policy' in param.keys() and param['quant_policy'] is not None: quant_policy = param['quant_policy'] cmd += f' --quant-policy {quant_policy}' + if not is_bf16_supported(): + cmd += ' --cache-max-entry-count 0.5' + start_log = os.path.join( log_path, 'start_restful_' + model.split('/')[1] + worker_id + '.log') @@ -87,13 +93,18 @@ def start_restful_api(config, param, model, model_path, backend_type, content = 
file.read()
         print(content)
     start_time = int(time())
+
+    start_timeout = 300
+    if not is_bf16_supported():
+        start_timeout = 600
+
     sleep(5)
-    for i in range(300):
+    for i in range(start_timeout):
         sleep(1)
         end_time = int(time())
         total_time = end_time - start_time
         result = health_check(http_url)
-        if result or total_time >= 300:
+        if result or total_time >= start_timeout:
             break
     allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT)
     return pid, startRes
diff --git a/benchmark/profile_generation.py b/benchmark/profile_generation.py
index b28937dd4c..952de5d9f7 100644
--- a/benchmark/profile_generation.py
+++ b/benchmark/profile_generation.py
@@ -349,6 +349,7 @@
     session_len_act = ArgumentHelper.session_len(pt_group, default=2048)
     prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group)
     rope_scaling_factor_act = ArgumentHelper.rope_scaling_factor(pt_group)
+    dtype_act = ArgumentHelper.dtype(pt_group)
 
     # turbomind engine args
     tb_group = parser.add_argument_group('TurboMind engine argument')
@@ -358,6 +359,7 @@
     tb_group._group_actions.append(cache_block_seq_len_act)
     tb_group._group_actions.append(prefix_caching_act)
     tb_group._group_actions.append(rope_scaling_factor_act)
+    tb_group._group_actions.append(dtype_act)
     ArgumentHelper.model_format(tb_group, default='hf')
     args = parser.parse_args()
     return args
@@ -416,6 +418,7 @@
             rope_scaling_factor=args.rope_scaling_factor,
             tp=args.tp,
             enable_prefix_caching=args.enable_prefix_caching,
+            dtype=args.dtype,
         )
     elif args.backend == 'pytorch':
         engine_config = PytorchEngineConfig(
@@ -426,6 +429,7 @@
             thread_safe=True,
             eager_mode=args.eager_mode,
             enable_prefix_caching=args.enable_prefix_caching,
+            dtype=args.dtype,
         )
     gen_config = GenerationConfig(top_k=args.top_k,
                                   top_p=args.top_p,
diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py
index 9d573d51b1..58786d9c80 100644
--- a/benchmark/profile_throughput.py
+++ b/benchmark/profile_throughput.py
@@ -289,6 +289,7 @@
     cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group)
     prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group)
     quant_policy_act = ArgumentHelper.quant_policy(pt_group, default=0)
+    dtype_act = ArgumentHelper.dtype(pt_group)
 
     # turbomind engine args
     tb_group = parser.add_argument_group('TurboMind engine argument')
@@ -298,6 +299,8 @@
     tb_group._group_actions.append(cache_block_seq_len_act)
     tb_group._group_actions.append(prefix_caching_act)
     tb_group._group_actions.append(quant_policy_act)
+    tb_group._group_actions.append(dtype_act)
+
     ArgumentHelper.model_format(tb_group, default='hf')
     ArgumentHelper.num_tokens_per_iter(tb_group)
     ArgumentHelper.max_prefill_iters(tb_group)
@@ -321,6 +324,7 @@
         num_tokens_per_iter=args.num_tokens_per_iter,
         max_prefill_iters=args.max_prefill_iters,
         enable_prefix_caching=args.enable_prefix_caching,
+        dtype=args.dtype,
     )
     elif args.backend == 'pytorch':
         engine_config = PytorchEngineConfig(
@@ -333,6 +337,7 @@
         eager_mode=args.eager_mode,
         enable_prefix_caching=args.enable_prefix_caching,
         quant_policy=args.quant_policy,
+        dtype=args.dtype,
     )
 
     engine = Engine(args.model_path, engine_config, csv=args.csv)
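The recurring pattern in the autotest and benchmark changes above is a bf16 capability check: V100 (SM70) GPUs cannot run bfloat16, so PyTorch-engine runs are switched to float16. A minimal sketch of that pattern, using only the lmdeploy APIs already imported in this patch; the model path below is a placeholder, not a value from the patch:

from lmdeploy import PytorchEngineConfig, TurbomindEngineConfig, pipeline
from lmdeploy.utils import is_bf16_supported


def build_backend_config(backend: str, tp: int = 1):
    """Build an engine config, downgrading to float16 when bf16 is unsupported."""
    if backend == 'pytorch':
        config = PytorchEngineConfig(tp=tp)
        if not is_bf16_supported():
            # V100 has no bf16 support; run the PyTorch engine in fp16 instead.
            config.dtype = 'float16'
        return config
    return TurbomindEngineConfig(tp=tp)


if __name__ == '__main__':
    cfg = build_backend_config('pytorch', tp=2)
    # Placeholder model path; substitute a real local model directory.
    pipe = pipeline('/path/to/internlm2_5-7b-chat', backend_config=cfg)
    print(pipe('Hi, pls intro yourself'))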