diff --git a/.github/scripts/eval_chat_config.py b/.github/scripts/eval_chat_config.py
index 81872c0dd1..89ad20a533 100644
--- a/.github/scripts/eval_chat_config.py
+++ b/.github/scripts/eval_chat_config.py
@@ -174,7 +174,6 @@
         max_out_len=MAX_NEW_TOKENS,
         max_seq_len=MAX_SESSION_LEN,
         batch_size=128,
-        concurrency=128,
         meta_template=llama2_meta_template,
         run_cfg=dict(num_gpus=1),
         end_str='[INST]')
diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml
index 84fcaf5034..f03bbf4a50 100644
--- a/.github/workflows/daily_ete_test.yml
+++ b/.github/workflows/daily_ete_test.yml
@@ -199,9 +199,9 @@
           chmod -R 777 $workdir
 
   test_tools:
-    needs: test_quantization
     if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}}
     runs-on: [self-hosted, linux-a100]
+    needs: test_quantization
     timeout-minutes: 150
     strategy:
       fail-fast: false
diff --git a/.github/workflows/daily_ete_test_v100.yml b/.github/workflows/daily_ete_test_v100.yml
new file mode 100644
index 0000000000..8b32bab1f7
--- /dev/null
+++ b/.github/workflows/daily_ete_test_v100.yml
@@ -0,0 +1,667 @@
+name: daily_ete_test_v100
+
+on:
+  workflow_dispatch:
+    inputs:
+      repo_org:
+        required: false
+        description: 'Tested repository organization name. Default is InternLM'
+        type: string
+        default: 'InternLM/lmdeploy'
+      repo_ref:
+        required: false
+        description: 'Set branch or tag or commit id. Default is "main"'
+        type: string
+        default: 'main'
+      backend:
+        required: true
+        description: 'Set backend testcase filter: turbomind, pytorch or turbomind_vl. Default is "["turbomind", "pytorch", "turbomind_vl"]"'
+        type: string
+        default: "['turbomind', 'pytorch', 'turbomind_vl']"
+      model:
+        required: true
+        description: 'Set testcase module filter: chat, restful, pipeline, quantization. Default contains all models'
+        type: string
+        default: "['pipeline','restful','chat']"
+      offline_mode:
+        required: true
+        description: 'Whether to start in offline mode. If true, you should prepare the code and whl package yourself'
+        type: boolean
+        default: false
+      regression_func:
+        required: true
+        description: 'regression functions'
+        type: string
+        default: "['quant', 'tools','restful','pipeline','benchmark','evaluation']"
+  schedule:
+    - cron: '00 16 * * 0-4'
+
+env:
+  HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
+  HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
+  OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }}
+  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
+  REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }}
+  COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy
+  FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}}
+  TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy
+  OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy
+  OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt
+
+jobs:
+  linux-build:
+    if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}}
+    strategy:
+      matrix:
+        pyver: [py310]
+    runs-on: ubuntu-latest
+    env:
+      PYTHON_VERSION: ${{ matrix.pyver }}
+      PLAT_NAME: manylinux2014_x86_64
+      DOCKER_TAG: cuda12.1
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
+      - name: Build
+        run: |
+          echo ${PYTHON_VERSION}
+          echo ${PLAT_NAME}
+          echo ${DOCKER_TAG}
+          echo ${OUTPUT_FOLDER}
+          echo ${GITHUB_RUN_ID}
+          # remove -it
+          sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh
+          bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER}
+      - name: Upload Artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          if-no-files-found: error
+          path: builder/manywheel/${{ env.OUTPUT_FOLDER }}
+          retention-days: 1
+          name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }}
+
+
+  download_pkgs:
+    needs: linux-build
+    if: ${{!cancelled()}}
+    runs-on: linux-v100
+    timeout-minutes: 50
+    container:
+      image: openmmlab/lmdeploy:latest-cu12
+      options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
+      volumes:
+        - /nvme/qa_test_models:/nvme/qa_test_models
+        - /mnt/187:/mnt/187
+        - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
+    steps:
+      - name: Clone repository
+        uses: actions/checkout@v2
+        if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
+      - name: Copy repository
+        if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
+        run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}}
+      - name: Copy repository - offline
+        if: ${{inputs.offline_mode}}
+        run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/.
${{env.TEST_CODE_PATH}} + - name: Change testconfig on v100 + run: | + mv ${{env.TEST_CODE_PATH}}/autotest/config-v100.yaml ${{env.TEST_CODE_PATH}}/autotest/config.yaml + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + + test_quantization: + needs: download_pkgs + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} + runs-on: linux-v100 + timeout-minutes: 180 + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /root/modelscope_hub + MODELSCOPE_MODULES_CACHE: /root/modelscope_modules + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/modelscope_hub:/root/modelscope_hub + - /nvme/github-actions/modelscope_modules:/root/modelscope_modules + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /nvme/qa_test_models/lmdeploy/autotest:/local_case + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - quantization w4a16 + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') + run: | + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - convert + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') + run: | + pytest autotest/tools/convert -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_tools: + needs: test_quantization + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + runs-on: linux-v100 + timeout-minutes: 240 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend) || fromJSON('["turbomind", "pytorch", "turbomind_vl"]')}} + model: ${{ fromJSON(inputs.model) || fromJSON('["pipeline","restful","chat"]')}} + exclude: + - backend: turbomind_vl + model: chat + include: + - backend: turbomind + model: local_case + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /root/modelscope_hub + MODELSCOPE_MODULES_CACHE: /root/modelscope_modules + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/modelscope_hub:/root/modelscope_hub + - /nvme/github-actions/modelscope_modules:/root/modelscope_modules + - /nvme/github-actions/resources/lora:/root/lora + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /nvme/qa_test_models/lmdeploy/autotest:/local_case + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + cp -r /root/lora . + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - chat workspace + continue-on-error: true + if: matrix.backend == 'turbomind' && matrix.model == 'chat' + run: | + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - chat + continue-on-error: true + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'chat' + run: | + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - pipeline + continue-on-error: true + if: matrix.model == 'pipeline' + run: | + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - restful + continue-on-error: true + if: matrix.model == 'restful' + run: | + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - restful workspace + continue-on-error: true + if: matrix.backend == 'turbomind' && matrix.model == 'restful' + run: | + pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage 
${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - local testcase + if: matrix.backend == 'turbomind' && matrix.model == 'local_case' + run: | + pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}}|| true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_restful: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + runs-on: linux-v100 + needs: test_quantization + strategy: + fail-fast: false + matrix: + backend: ['turbomind', 'pytorch'] + timeout-minutes: 120 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Start restful api turbomind + if: matrix.backend == 'turbomind' + run: | + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm2_5-20b-chat --tp 2 > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 600s + - name: Start restful api pytorch + if: matrix.backend == 'pytorch' + run: | + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm2_5-20b-chat --tp 2 --backend pytorch --dtype float16 > restful.log 2>&1 & + echo "restful_pid=$!" 
>> "$GITHUB_ENV" + sleep 600s + - name: Test lmdeploy - restful api + timeout-minutes: 75 + run: | + pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}}' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Kill api server + if: always() + run: | + kill -15 "$restful_pid" + - name: Start restful api turbomind - base + if: matrix.backend == 'turbomind' + run: | + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm2_5-20b --tp 2 > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 600s + - name: Start restful api pytorch - base + if: matrix.backend == 'pytorch' + run: | + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/internlm/internlm2_5-20b --tp 2 --backend pytorch --dtype float16 > restful.log 2>&1 & + echo "restful_pid=$!" >> "$GITHUB_ENV" + sleep 600s + - name: Test lmdeploy - restful api - base + timeout-minutes: 40 + run: | + pytest autotest/interface/restful/test_restful_completions_v1.py -n 20 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Kill api server + if: always() + run: | + kill -15 "$restful_pid" + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_pipeline: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'pipeline'))}} + runs-on: linux-v100 + needs: test_quantization + timeout-minutes: 240 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - interface pipeline case + run: | + pytest autotest/interface/pipeline/test_pipeline_func.py -m 'not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + + test_benchmark: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} + runs-on: linux-v100 + needs: test_quantization + timeout-minutes: 120 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test benchmark script + run: | + pytest autotest/benchmark -n 4 --run_id ${{ github.run_id }} -m function ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + chmod -R 777 /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_evaluation: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'evaluation'))}} + runs-on: linux-v100 + needs: test_quantization + timeout-minutes: 120 + strategy: + fail-fast: false + matrix: + evaluate_type: ['chat', 'base'] + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources:/root/resources + - /nvme/github-actions/opencompass-data:/root/opencompass-data + - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/shared:/mnt/shared + - /mnt/187:/mnt/187 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy - dependency + run: | + # manually install flash attn + # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases + python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps + python3 -m pip install /root/packages/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl --no-deps + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Install opencompass + run: | + git clone --depth=1 https://github.com/open-compass/opencompass.git + cd opencompass + python3 -m pip install -e . 
+ echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV + - name: Check env + run: | + pip install triton==3.0.0 + pip uninstall -y nvidia-nccl-cu11 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + rm -rf /nvme/qa_test_models/autotest_model/log/* + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Setup paths for evaluation + run: | + ln -s /root/opencompass-data ./data + python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models . + - name: Evaluate models + if: matrix.evaluate_type == 'chat' + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_4bits, turbomind_internlm2_5_7b_chat_kvint4, pt_internlm2_5_7b_chat, turbomind_internlm2_5_20b_chat, turbomind_internlm2_5_20b_chat_4bits, turbomind_internlm2_5_20b_chat_kvint4, pt_internlm2_5_20b_chat, turbomind_llama_3d1_8b_instruct, pt_llama_3d1_8b_instruct, turbomind_llama_3d1_8b_instruct_4bits, turbomind_llama_3d1_8b_instruct_kvint4, turbomind_qwen2_7b_instruct, turbomind_qwen2_7b_instruct_4bits, pt_qwen1_5_moe_2_7b_chat, pt_gemma_2_9b_it]" "[*race_datasets, *gsm8k_datasets]" /root/evaluation-reports/${{ github.run_id }} chat true + - name: Evaluate base models + if: matrix.evaluate_type == 'base' + run: | + export LMDEPLOY_DIR=$(pwd) + + python3 .github/scripts/action_tools.py evaluate "[turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]" "[*mmlu_datasets, *gsm8k_datasets]" /root/evaluation-reports/${{ github.run_id }} base true + - name: Clear workspace + if: always() + run: | + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + + get_benchmark_result: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'benchmark'))}} + needs: [test_benchmark] + timeout-minutes: 5 + runs-on: linux-v100 + env: + BENCHMARK_REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} + steps: + - name: Clone repository + uses: actions/checkout@v3 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Get overview + run: | + pip install pandas fire mmengine + python3 .github/scripts/action_tools.py generate_benchmark_report $BENCHMARK_REPORT_DIR + + + get_coverage_report: + if: ${{!cancelled()}} + runs-on: linux-v100 + needs: [test_tools, test_restful, test_pipeline, test_benchmark] + timeout-minutes: 5 + container: + image: openmmlab/lmdeploy:latest-cu12 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . 
+ - name: Install lmdeploy + run: | + python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Get coverage report + run: | + pip install coverage + coverage combine ${{env.REPORT_DIR}} + coverage xml -o ${{env.REPORT_DIR}}/coverage.xml + coverage report -m + mv .coverage ${{env.REPORT_DIR}}/.coverage + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + notify_to_feishu: + if: always() && !cancelled() && (github.ref_name == 'develop' || github.ref_name == 'main') + needs: [get_benchmark_result, get_coverage_report, test_evaluation] + timeout-minutes: 5 + runs-on: linux-v100 + steps: + - name: notify + if: contains(needs.*.result, 'failure') + run: | + curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Lmdeploy- Daily test finished!!!","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.FEISHU_USER_ID }}'"}]]}}}}' ${{ secrets.FEISHU_WEBHOOK_URL }} diff --git a/autotest/config-v100.yaml b/autotest/config-v100.yaml new file mode 100644 index 0000000000..172667ec0c --- /dev/null +++ b/autotest/config-v100.yaml @@ -0,0 +1,131 @@ +model_path: /nvme/qa_test_models +dst_path: /nvme/qa_test_models/autotest_model +log_path: /nvme/qa_test_models/autotest_model/log +benchmark_path: /nvme/qa_test_models/benchmark-reports +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json + +tp_config: + internlm-chat-20b: 2 + internlm2-chat-20b: 2 + Baichuan2-13B-Chat: 2 + Mixtral-8x7B-Instruct-v0.1: 2 + Qwen-VL-Chat: 2 + llava-v1.5-13b: 2 + internlm2_5-20b-chat: 2 + internlm2_5-20b: 2 + Meta-Llama-3-1-70B-Instruct: 4 + internlm2_5-7b-chat-1m: 4 + Qwen2-7B-Instruct-GPTQ-Int4: 2 + InternVL2-26B: 2 + InternVL2-40B: 2 + MiniCPM-V-2_6: 2 + +turbomind_chat_model: + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-1-8B-Instruct-AWQ + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-8B-Instruct-inner-4bits + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat + - internlm/internlm-xcomposer2d5-7b + - OpenGVLab/InternVL2-2B + - OpenGVLab/InternVL2-8B + - OpenGVLab/InternVL2-26B + - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 + - Qwen/Qwen2-7B-Instruct-AWQ + - Qwen/Qwen2-1.5B-Instruct + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 + - mistralai/Mistral-7B-Instruct-v0.3 + - THUDM/glm-4-9b-chat + + +pytorch_chat_model: + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-8B-Instruct + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat + - OpenGVLab/InternVL2-2B + - OpenGVLab/InternVL2-4B + - OpenGVLab/InternVL2-8B + - OpenGVLab/InternVL2-26B + - Qwen/Qwen2-1.5B-Instruct + - Qwen/Qwen1.5-MoE-A2.7B-Chat + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - google/gemma-2-9b-it + - mistralai/Mistral-7B-Instruct-v0.2 + - THUDM/glm-4v-9b + - THUDM/glm-4-9b-chat + - microsoft/Phi-3-mini-4k-instruct + - deepseek-ai/DeepSeek-V2-Lite-Chat + +turbomind_base_model: + - internlm/internlm2_5-7b + - internlm/internlm2_5-20b + +pytorch_base_model: + - internlm/internlm2_5-7b + - internlm/internlm2_5-20b + +vl_model: + - OpenGVLab/InternVL2-2B + - 
OpenGVLab/InternVL2-4B + - OpenGVLab/InternVL2-8B + - OpenGVLab/InternVL2-26B + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - internlm/internlm-xcomposer2d5-7b + - THUDM/glm-4v-9b + - microsoft/Phi-3-mini-4k-instruct + +turbomind_quatization: + no_awq: + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-8B-Instruct + - internlm/internlm-xcomposer2d5-7b + - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 + - mistralai/Mistral-7B-Instruct-v0.3 + - THUDM/glm-4-9b-chat + gptq: + - internlm/internlm2_5-7b-chat + no_kvint4: + - openbmb/MiniCPM-V-2_6 + no_kvint8: + - deepseek-ai/DeepSeek-V2-Lite-Chat + +pytorch_quatization: + awq: + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat + - Qwen/Qwen2-1.5B-Instruct + w8a8: + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-7b + no_kvint4: + - OpenGVLab/InternVL2-4B + - deepseek-ai/DeepSeek-V2-Lite-Chat + - microsoft/Phi-3-mini-4k-instruct + - microsoft/Phi-3-vision-128k-instruct + no_kvint8: + - deepseek-ai/DeepSeek-V2-Lite-Chat + + +longtext_model: + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-70B-Instruct + - internlm/internlm2_5-7b-chat-1m + - internlm/internlm2-chat-20b + +benchmark_model: + - meta-llama/Llama-2-7b-chat-hf + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-70B-Instruct + - internlm/internlm2_5-7b-chat + - internlm/internlm2_5-20b-chat + - THUDM/glm-4-9b-chat + - mistralai/Mistral-7B-Instruct-v0.3 + - mistralai/Mixtral-8x7B-Instruct-v0.1 + - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/config.yaml b/autotest/config.yaml index 4e4b20f206..46b9bd9ce1 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -48,6 +48,7 @@ turbomind_chat_model: - mistralai/Mistral-7B-Instruct-v0.1 - mistralai/Mistral-7B-Instruct-v0.2 - mistralai/Mistral-7B-Instruct-v0.3 + - mistralai/Mixtral-8x7B-Instruct-v0.1 - lmdeploy/llama2-chat-7b-w4 - baichuan-inc/Baichuan2-7B-Chat - 01-ai/Yi-6B-Chat @@ -90,7 +91,6 @@ pytorch_chat_model: - mistralai/Mistral-7B-Instruct-v0.1 - mistralai/Mistral-7B-Instruct-v0.2 - mistralai/Mixtral-8x7B-Instruct-v0.1 - - mistralai/Mixtral-8x7B-Instruct-v0.1 - google/gemma-7b-it - google/gemma-2-9b-it - deepseek-ai/deepseek-moe-16b-chat diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index 096918b6b1..bd33ed33a0 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ b/autotest/interface/pipeline/test_pipeline_func.py @@ -15,6 +15,14 @@ from lmdeploy import (GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline) +from lmdeploy.utils import is_bf16_supported + + +def init_pipeline(model_path, backend_config): + if not is_bf16_supported() and isinstance(backend_config, + PytorchEngineConfig): + backend_config.dtype = 'float16' + return pipeline(model_path, backend_config=backend_config) @pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @@ -26,7 +34,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = pipe('Hi, pls intro yourself') result, msg = assert_pipeline_single_return(response) save_pipeline_common_log(config, file_name, result, response, msg) @@ -56,7 +64,7 @@ def run_pipeline_testcase(config, 
model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = [] for item in pipe.stream_infer('Hi, pls intro yourself'): response.append(item) @@ -88,7 +96,7 @@ def run_pipeline_testcase_with_prompt(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = pipe(['Hi, pls intro yourself', 'Shanghai is']) result, msg = assert_pipeline_batch_return(response, 2) save_pipeline_common_log(config, file_name, result, response, msg) @@ -118,7 +126,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = [] for item in pipe.stream_infer(['Pls intro yourself', 'Shanghai is']): response.append(item) @@ -149,7 +157,7 @@ def test_return_with_message(config, model, backend, worker_id): def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) prompts = [[{'role': 'user', 'content': 'Hi, pls intro yourself'}]] response = pipe(prompts) print(response) @@ -180,7 +188,7 @@ def test_return_with_message_stream(config, model, backend, worker_id): def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) prompts = [[{'role': 'user', 'content': 'Hi, pls intro yourself'}]] response = [] for item in pipe.stream_infer(prompts): @@ -212,7 +220,7 @@ def test_return_with_message_batch(config, model, backend, worker_id): def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) prompts = [[{ 'role': 'user', 'content': 'Hi, pls intro yourself' @@ -249,7 +257,7 @@ def test_return_with_message_batch_stream(config, model, backend, worker_id): def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) prompts = [[{ 'role': 'user', 'content': 'Hi, pls intro yourself' @@ -287,7 +295,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) gen_config = GenerationConfig(logprobs=10, max_new_tokens=5, top_k=40, @@ -320,7 +328,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = 
'/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) gen_config = GenerationConfig(logprobs=10, max_new_tokens=5, top_k=40, @@ -358,7 +366,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(session_len=10, tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response = pipe(['Hi, pls intro yourself', 'Shanghai is']) result = True @@ -392,7 +400,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test min_new_tokens gen_config = GenerationConfig(min_new_tokens=200, ignore_eos=True) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -428,7 +436,7 @@ def run_pipeline_testcase_stop_words(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test stop_words gen_config = GenerationConfig(stop_words=[' and', '浦', ' to']) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -467,7 +475,7 @@ def run_pipeline_testcase_bad_words(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test bad_words gen_config = GenerationConfig(bad_words=[' and', '浦', ' to']) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -502,7 +510,7 @@ def test_gen_config_special_words_false(config, model, backend, worker_id): def run_pipeline_testcase_special_words(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test special_words prompt = '<|im_start|>system\n当开启工具以及代码时,根据需求选择合适的工具进行调用\n' + \ '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' + \ @@ -543,7 +551,7 @@ def test_gen_config_special_words_true(config, model, backend, worker_id): def run_pipeline_testcase_special_words(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test special_words prompt = '<|im_start|>system\n当开启工具以及代码时,根据需求选择合适的工具进行调用\n' + \ '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' + \ @@ -587,7 +595,7 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test repetition_penalty gen_config = GenerationConfig(repetition_penalty=0.01, random_seed=1, @@ -626,7 +634,7 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, 
model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test repetition_penalty gen_config = GenerationConfig(repetition_penalty=1.2, random_seed=1) response = pipe('Shanghai is', gen_config=gen_config) @@ -658,7 +666,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test repetition_penalty gen_config = GenerationConfig(top_p=0.1, random_seed=1) response = pipe('Shanghai is', gen_config=gen_config) @@ -690,7 +698,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test repetition_penalty gen_config = GenerationConfig(top_k=1, max_new_tokens=20, @@ -727,7 +735,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) response_list = [] for i in range(3): gen_config = GenerationConfig(random_seed=i, @@ -764,7 +772,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) gen_config = GenerationConfig(random_seed=1, top_k=40, do_sample=True) response_list = [] for i in range(3): @@ -798,7 +806,7 @@ def run_pipeline_testcase(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) gen_config = GenerationConfig(temperature=1.0, top_k=40, do_sample=True) @@ -833,7 +841,7 @@ def run_pipeline_testcase_max_new_tokens(config, model, backend, model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test max_new_tokens gen_config = GenerationConfig(max_new_tokens=5) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -870,7 +878,7 @@ def run_pipeline_testcase_ignore_eos(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) # test max_new_tokens with ignore_eos gen_config = GenerationConfig(ignore_eos=True, max_new_tokens=256) response = pipe(['Hi, pls intro yourself', 'Shanghai is'], @@ -907,7 +915,7 @@ def test_backend_config_input_validation(config, model, backend, worker_id): tp_num=2) model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, 
backend_config=backend_config) with pytest.raises(AssertionError): gen_config = GenerationConfig(top_p=0) pipe('Shanghai is', gen_config=gen_config) @@ -1018,7 +1026,7 @@ def test_backend_config_tp(config, model, backend, worker_id): worker_id, tp_num=2) model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=100) - pipe = pipeline(model_path, backend_config=backend_config) + pipe = init_pipeline(model_path, backend_config=backend_config) del pipe torch.cuda.empty_cache() if 'gw' in worker_id: diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py index 0ea643524f..c80dbe0dfc 100644 --- a/autotest/utils/benchmark_utils.py +++ b/autotest/utils/benchmark_utils.py @@ -7,6 +7,8 @@ from utils.config_utils import get_workerid from utils.run_restful_chat import health_check +from lmdeploy.utils import is_bf16_supported + DEFAULT_PORT = 23333 GENERATION_CONFIG = ' -c 8 256 -ct 128 128 2048 128 -pt 1 128 128 2048' GENERATION_LONGTEXT_CONFIG = ' -c 1 --session-len 200000 -ct 1024 -pt 198000' @@ -40,6 +42,8 @@ def generation_test(config, run_config = '' if backend == 'pytorch': command += ' --backend pytorch' + if not is_bf16_supported(): + command += ' --dtype float16' else: if '4bit' in model: command += ' --model-format awq' @@ -105,6 +109,8 @@ def throughput_test(config, run_config = '--num-prompts 3000' if backend == 'pytorch': command += ' --backend pytorch' + if not is_bf16_supported(): + command += ' --dtype float16' else: if '4bit' in model: command += ' --model-format awq' diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index ca041dc9a1..8aa5f933fb 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -4,6 +4,8 @@ import yaml from utils.get_run_config import get_tp_num +from lmdeploy.utils import is_bf16_supported + def get_turbomind_model_list(tp_num: int = None, model_type: str = 'chat_model', @@ -85,14 +87,16 @@ def get_torch_model_list(tp_num: int = None, def get_all_model_list(tp_num: int = None, quant_policy: int = None, model_type: str = 'chat_model'): + case_list = get_turbomind_model_list(tp_num=tp_num, model_type=model_type, quant_policy=quant_policy) - for case in get_torch_model_list(tp_num=tp_num, - quant_policy=quant_policy, - model_type=model_type): - if case not in case_list: - case_list.append(case) + if is_bf16_supported(): + for case in get_torch_model_list(tp_num=tp_num, + quant_policy=quant_policy, + model_type=model_type): + if case not in case_list: + case_list.append(case) return [x for x in case_list if 'w8a8' not in x] diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 33d65448ab..1ab34b23d5 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -10,6 +10,7 @@ from lmdeploy import pipeline from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig +from lmdeploy.utils import is_bf16_supported from lmdeploy.vl import load_image from lmdeploy.vl.constants import IMAGE_TOKEN @@ -32,6 +33,8 @@ def run_pipeline_chat_test(config, if 'pytorch' in type: backend_config = PytorchEngineConfig(tp=tp) + if not is_bf16_supported(): + backend_config.dtype = 'float16' else: backend_config = TurbomindEngineConfig(tp=tp) @@ -292,6 +295,10 @@ def run_pipeline_vl_chat_test(config, model_case, quant_policy: int = None): backend_config.model_format = 'awq' if quant_policy is not None: backend_config.quant_policy = quant_policy + + if not is_bf16_supported(): + 
backend_config.cache_max_entry_count = 0.5 + backend_config.dtype = 'float16' pipe = pipeline(hf_path, backend_config=backend_config) pipeline_chat_log = os.path.join( diff --git a/autotest/utils/quantization_utils.py b/autotest/utils/quantization_utils.py index 75b7319aeb..752168958a 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -2,6 +2,8 @@ import subprocess from subprocess import PIPE +from lmdeploy.utils import is_bf16_supported + def quantization(config, quantization_model_name, @@ -21,17 +23,17 @@ def quantization(config, if quantization_type == 'awq': quantization_cmd = ' '.join([ cuda_prefix, 'lmdeploy lite auto_awq', origin_model_path, - '--work-dir', quantization_model_path, '--batch-size 32' + '--work-dir', quantization_model_path ]) elif quantization_type == 'gptq': quantization_cmd = ' '.join([ cuda_prefix, 'lmdeploy lite auto_gptq', origin_model_path, - '--work-dir', quantization_model_path, '--batch-size 32' + '--work-dir', quantization_model_path ]) elif quantization_type == 'w8a8': quantization_cmd = ' '.join([ cuda_prefix, 'lmdeploy lite smooth_quant', origin_model_path, - '--work-dir', quantization_model_path, '--batch-size 32' + '--work-dir', quantization_model_path ]) else: return False, 'quantization type should in [awq, gptq, w8a8], \ @@ -40,6 +42,11 @@ def quantization(config, if 'llama-3' in origin_model_name.lower(): quantization_cmd += ' --search-scale True' + if not is_bf16_supported(): + quantization_cmd += ' --batch-size 8' + else: + quantization_cmd += ' --batch-size 32' + with open(quantization_log, 'w') as f: # remove existing folder subprocess.run([' '.join(['rm -rf', quantization_model_path])], diff --git a/autotest/utils/run_client_chat.py b/autotest/utils/run_client_chat.py index edc2268e30..529bf4a6a0 100644 --- a/autotest/utils/run_client_chat.py +++ b/autotest/utils/run_client_chat.py @@ -4,6 +4,8 @@ from utils.get_run_config import get_command_with_extra, get_model_name from utils.rule_condition_assert import assert_result +from lmdeploy.utils import is_bf16_supported + TEMPLATE = 'autotest/template.json' @@ -63,6 +65,9 @@ def hf_command_line_test(config, need_tp=True, cuda_prefix=cuda_prefix) + if type == 'pytorch': + if not is_bf16_supported(): + cmd += ' --dtype float16' if type == 'turbomind': if ('w4' in model_case or ('4bits' in model_case or 'awq' in model_case.lower())): diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 1eb84f1d93..c567db4d00 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -13,6 +13,7 @@ from utils.run_client_chat import command_line_test from lmdeploy.serve.openai.api_client import APIClient +from lmdeploy.utils import is_bf16_supported BASE_HTTP_URL = 'http://localhost' DEFAULT_PORT = 23333 @@ -60,12 +61,17 @@ def start_restful_api(config, param, model, model_path, backend_type, cmd += ' --model-format gptq' if backend_type == 'pytorch': cmd += ' --backend pytorch' + if not is_bf16_supported(): + cmd += ' --dtype float16' if 'llava' in model: cmd += ' --model-name vicuna' if 'quant_policy' in param.keys() and param['quant_policy'] is not None: quant_policy = param['quant_policy'] cmd += f' --quant-policy {quant_policy}' + if not is_bf16_supported(): + cmd += ' --cache-max-entry-count 0.5' + start_log = os.path.join( log_path, 'start_restful_' + model.split('/')[1] + worker_id + '.log') @@ -87,13 +93,18 @@ def start_restful_api(config, param, model, model_path, backend_type, content = 
file.read()
         print(content)
     start_time = int(time())
+
+    start_timeout = 300
+    if not is_bf16_supported():
+        start_timeout = 600
+
     sleep(5)
-    for i in range(300):
+    for i in range(start_timeout):
         sleep(1)
         end_time = int(time())
         total_time = end_time - start_time
         result = health_check(http_url)
-        if result or total_time >= 300:
+        if result or total_time >= start_timeout:
             break
     allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT)
     return pid, startRes
diff --git a/benchmark/profile_generation.py b/benchmark/profile_generation.py
index b28937dd4c..952de5d9f7 100644
--- a/benchmark/profile_generation.py
+++ b/benchmark/profile_generation.py
@@ -349,6 +349,7 @@
     session_len_act = ArgumentHelper.session_len(pt_group, default=2048)
     prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group)
     rope_scaling_factor_act = ArgumentHelper.rope_scaling_factor(pt_group)
+    dtype_act = ArgumentHelper.dtype(pt_group)
 
     # turbomind engine args
     tb_group = parser.add_argument_group('TurboMind engine argument')
@@ -358,6 +359,7 @@
     tb_group._group_actions.append(cache_block_seq_len_act)
     tb_group._group_actions.append(prefix_caching_act)
     tb_group._group_actions.append(rope_scaling_factor_act)
+    tb_group._group_actions.append(dtype_act)
     ArgumentHelper.model_format(tb_group, default='hf')
     args = parser.parse_args()
     return args
@@ -416,6 +418,7 @@
             rope_scaling_factor=args.rope_scaling_factor,
             tp=args.tp,
             enable_prefix_caching=args.enable_prefix_caching,
+            dtype=args.dtype,
         )
     elif args.backend == 'pytorch':
         engine_config = PytorchEngineConfig(
@@ -426,6 +429,7 @@
             thread_safe=True,
             eager_mode=args.eager_mode,
             enable_prefix_caching=args.enable_prefix_caching,
+            dtype=args.dtype,
         )
     gen_config = GenerationConfig(top_k=args.top_k,
                                   top_p=args.top_p,
diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py
index 9d573d51b1..58786d9c80 100644
--- a/benchmark/profile_throughput.py
+++ b/benchmark/profile_throughput.py
@@ -289,6 +289,7 @@
     cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group)
     prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group)
     quant_policy_act = ArgumentHelper.quant_policy(pt_group, default=0)
+    dtype_act = ArgumentHelper.dtype(pt_group)
 
     # turbomind engine args
     tb_group = parser.add_argument_group('TurboMind engine argument')
@@ -298,6 +299,8 @@
     tb_group._group_actions.append(cache_block_seq_len_act)
     tb_group._group_actions.append(prefix_caching_act)
     tb_group._group_actions.append(quant_policy_act)
+    tb_group._group_actions.append(dtype_act)
+
     ArgumentHelper.model_format(tb_group, default='hf')
     ArgumentHelper.num_tokens_per_iter(tb_group)
     ArgumentHelper.max_prefill_iters(tb_group)
@@ -321,6 +324,7 @@
         num_tokens_per_iter=args.num_tokens_per_iter,
         max_prefill_iters=args.max_prefill_iters,
         enable_prefix_caching=args.enable_prefix_caching,
+        dtype=args.dtype,
     )
     elif args.backend == 'pytorch':
         engine_config = PytorchEngineConfig(
@@ -333,6 +337,7 @@
         eager_mode=args.eager_mode,
         enable_prefix_caching=args.enable_prefix_caching,
         quant_policy=args.quant_policy,
+        dtype=args.dtype,
     )
 
     engine = Engine(args.model_path, engine_config, csv=args.csv)
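The recurring pattern in the autotest and benchmark changes above is a bf16 capability check: V100 (SM70) GPUs cannot run bfloat16, so PyTorch-engine runs are switched to float16. A minimal sketch of that pattern, using only the lmdeploy APIs already imported in this patch; the model path below is a placeholder, not a value from the patch:

from lmdeploy import PytorchEngineConfig, TurbomindEngineConfig, pipeline
from lmdeploy.utils import is_bf16_supported


def build_backend_config(backend: str, tp: int = 1):
    """Build an engine config, downgrading to float16 when bf16 is unsupported."""
    if backend == 'pytorch':
        config = PytorchEngineConfig(tp=tp)
        if not is_bf16_supported():
            # V100 has no bf16 support; run the PyTorch engine in fp16 instead.
            config.dtype = 'float16'
        return config
    return TurbomindEngineConfig(tp=tp)


if __name__ == '__main__':
    cfg = build_backend_config('pytorch', tp=2)
    # Placeholder model path; substitute a real local model directory.
    pipe = pipeline('/path/to/internlm2_5-7b-chat', backend_config=cfg)
    print(pipe('Hi, pls intro yourself'))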