.github/workflows/daily_ete_test.yml

name: daily_ete_test

on:
  workflow_dispatch:
    inputs:
      repo_org:
        required: false
        description: 'Tested repository organization name. Default is InternLM'
        type: string
        default: 'InternLM/lmdeploy'
      repo_ref:
        required: false
        description: 'Set branch or tag or commit id. Default is "main"'
        type: string
        default: 'main'
      backend:
        required: true
        description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
        type: string
        default: "['turbomind', 'pytorch']"
      model:
        required: true
        description: 'Set testcase module filter: chat, restful, pipeline, quantization. Default contains all models'
        type: string
        default: "['quantization','convert','pipeline','restful','chat','interface-pipeline']"
  schedule:
    - cron:  '00 21 * * *'

env:
  HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
  HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai


jobs:
  test_functions:
    runs-on: [self-hosted, linux-a100]
    timeout-minutes: 240
    env:
      REPORT_DIR: /nvme/qa_test_models/test-reports
    container:
      image: nvcr.io/nvidia/tritonserver:22.12-py3
      options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip"
      volumes:
        - /nvme/github-actions/pip-cache:/root/.cache/pip
        - /nvme/github-actions/packages:/root/packages
        - /nvme/qa_test_models:/nvme/qa_test_models
        - /mnt/bigdisk/qa_test_models:/mnt/bigdisk/qa_test_models
        - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
    steps:
      - name: Setup systems
        run: |
          rm /etc/apt/sources.list.d/cuda*.list
          apt-get update && apt-get install -y --no-install-recommends rapidjson-dev \
              libgoogle-glog-dev libgl1 openjdk-8-jre-headless
          dpkg -i /root/packages/allure_2.24.1-1_all.deb
          rm -rf /var/lib/apt/lists/*
      - name: Clone repository
        uses: actions/checkout@v3
        with:
          repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
          ref: ${{github.event.inputs.repo_ref || 'main'}}
      - name: Install pytorch
        run: |
          python3 -m pip cache dir
          python3 -m pip install torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu118
      - name: Build lmdeploy
        run: |
          python3 -m pip install cmake
          python3 -m pip install -r requirements/build.txt
          mkdir build
          cd build
          cmake .. \
              -DCMAKE_BUILD_TYPE=RelWithDebInfo \
              -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
              -DCMAKE_INSTALL_PREFIX=/opt/tritonserver \
              -DBUILD_PY_FFI=ON \
              -DBUILD_MULTI_GPU=ON \
              -DCMAKE_CUDA_FLAGS="-lineinfo" \
              -DUSE_NVTX=ON \
              -DSM=80 \
              -DCMAKE_CUDA_ARCHITECTURES=80 \
              -DBUILD_TEST=OFF
          make -j$(nproc) && make install
      - name: Install lmdeploy
        run: |
          python3 -m pip install packaging protobuf transformers_stream_generator transformers datasets
          # manually install flash attn
          # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.0cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
          python3 -m pip install /root/packages/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
          python3 -m pip install -r requirements.txt -r requirements/test.txt
          python3 -m pip install .
      - name: Check env
        run: |
          python3 -m pip list
          lmdeploy check_env
          rm -rf allure-results
      - name: Test lmdeploy - quantization w4a16
        continue-on-error: true
        if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'quantization'))
        run: |
          pytest autotest/tools/quantization/test_quantization_w4a16.py -m 'not pr_test' -n 8 --alluredir=allure-results --clean-alluredir
      - name: Test lmdeploy - quantization kv int8
        continue-on-error: true
        if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'quantization'))
        run: |
          pytest autotest/tools/quantization/test_quantization_kvint8.py -n 8 --alluredir=allure-results
      - name: Test lmdeploy - quantization w8a8
        continue-on-error: true
        if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'quantization'))
        run: |
          pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=allure-results
      - name: Test lmdeploy - quantization kv int8 and w4a16
        continue-on-error: true
        if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'quantization'))
        run: |
          pytest autotest/tools/quantization/test_quantization_kvint8_w4a16.py -n 8 --alluredir=allure-results
      - name: Test lmdeploy - convert
        continue-on-error: true
        if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'convert'))
        run: |
          pytest autotest/tools/convert -m 'not pr_test' -n 8 --alluredir=allure-results
      - name: Test lmdeploy - chat workspace
        continue-on-error: true
        if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'chat'))
        timeout-minutes: 20
        run: |
          pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results
          pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results
      - name: Test lmdeploy - chat hf turbomind
        continue-on-error: true
        if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'chat'))
        timeout-minutes: 20
        run: |
          pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results
          pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results
      - name: Test lmdeploy - chat hf torch
        continue-on-error: true
        if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'chat'))
        timeout-minutes: 20
        run: |
          pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results
          pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results
      - name: Test lmdeploy - pipeline turbomind
        continue-on-error: true
        if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'pipeline'))
        timeout-minutes: 25
        run: |
          pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results
          pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results
      - name: Test lmdeploy - restful turbomind
        continue-on-error: true
        if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'restful'))
        timeout-minutes: 30
        run: |
          pytest autotest/tools/restful/test_restful_chat_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results
          pytest autotest/tools/restful/test_restful_chat_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results
      - name: Test lmdeploy - interface pipeline turbomind case
        continue-on-error: true
        if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'interface-pipeline'))
        timeout-minutes: 20
        run: |
          pytest autotest/interface/pipeline/test_pipeline_turbomind_func.py -m 'not pr_test' --alluredir=allure-results
      - name: Test lmdeploy - pipeline torch
        continue-on-error: true
        if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'pipeline'))
        timeout-minutes: 25
        run: |
          pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results
          pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results
      - name: Test lmdeploy - restful torch
        continue-on-error: true
        if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'restful'))
        timeout-minutes: 40
        run: |
          pytest autotest/tools/restful/test_restful_chat_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results
          pytest autotest/tools/restful/test_restful_chat_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results
      - name: Test lmdeploy - rerun all fail cases
        timeout-minutes: 30
        run: |
          pytest autotest --lf --alluredir=allure-results
      - name: Generate reports
        if: always()
        run: |
          export date_today="$(date +'%Y%m%d-%H%M%S')"
          export report_dir="$REPORT_DIR/$date_today"
          echo "Save report to $ALLURE_DIR"
          allure generate -c -o $report_dir
      - name: Clear workfile
        if: always()
        run: |
          export workdir=$(pwd)
          cd ..
          rm -rf $workdir
          mkdir $workdir
          chmod -R 777 $workdir

  test_triton:
    runs-on: [self-hosted, linux-a100]
    timeout-minutes: 30
    env:
      HF_MODEL: /nvme/qa_test_models/internlm-chat-20b
      WORKDIR: /nvme/qa_test_models/triton_workspace
      TB_MODEL: internlm-chat-20b-fp16-tp2
      GRPC_PORT: 33337
    steps:
      - name: Clone repository
        uses: actions/checkout@v2
      - name: Create test container
        run: |
          export CONTAINER_ID=$(docker create \
            --rm \
            --gpus='"device=4,5"' \
            --shm-size 16g \
            --cap-add=SYS_PTRACE \
            --cap-add=SYS_ADMIN \
            --security-opt seccomp=unconfined \
            --name "lmdeploy-ci-triton-$GITHUB_RUN_ID" \
            --workdir /__w/lmdeploy/lmdeploy \
            --env NCCL_LAUNCH_MODE=GROUP \
            -v $(pwd)/../../:/__w \
            -v ${HF_MODEL}:/root/workspace/hf_model \
            -v ${WORKDIR}:/root/workspace/workdir \
            -v ${HOST_PIP_CACHE_DIR}:/root/.cache/pip \
            -v ${HOST_LOCALTIME}:/etc/localtime:ro \
            openmmlab/lmdeploy:latest tail -f /dev/null \
             )
          docker start $CONTAINER_ID
          echo "CONTAINER_ID=$CONTAINER_ID"
          echo "CONTAINER_ID=$CONTAINER_ID"  >> $GITHUB_ENV
      - name: Build lmdeploy from source
        run: |
          docker exec $CONTAINER_ID mkdir build
          docker exec --workdir /__w/lmdeploy/lmdeploy/build \
            --env http_proxy=${{secrets.PROXY}} \
            --env https_proxy=${{secrets.PROXY}} \
            --env HTTP_PROXY=${{secrets.PROXY}} \
            --env HTTPS_PROXY=${{secrets.PROXY}} \
            --env no_proxy="localhost,127.0.0.1" \
            --env NO_PROXY="localhost,127.0.0.1" \
            $CONTAINER_ID  cmake .. \
               -DCMAKE_BUILD_TYPE=RelWithDebInfo \
               -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
               -DCMAKE_INSTALL_PREFIX=/opt/tritonserver \
               -DBUILD_PY_FFI=ON \
               -DBUILD_MULTI_GPU=ON \
               -DCMAKE_CUDA_FLAGS="-lineinfo" \
               -DUSE_NVTX=ON \
               -DSM=80 \
               -DCMAKE_CUDA_ARCHITECTURES=80 \
               -DBUILD_TEST=OFF
          docker exec --workdir /__w/lmdeploy/lmdeploy/build $CONTAINER_ID  make -j$(nproc)
          docker exec --workdir /__w/lmdeploy/lmdeploy/build $CONTAINER_ID  make install \
            --env http_proxy=${{secrets.PROXY}} \
            --env https_proxy=${{secrets.PROXY}} \
            --env HTTP_PROXY=${{secrets.PROXY}} \
            --env HTTPS_PROXY=${{secrets.PROXY}}
      - name: Install lmdeploy
        run: |
          docker exec \
            --env http_proxy=${{secrets.PROXY}} \
            --env https_proxy=${{secrets.PROXY}} \
            $CONTAINER_ID python3 -m pip install tritonclient[grpc] protobuf

          docker exec \
            --env http_proxy=${{secrets.PROXY}} \
            --env https_proxy=${{secrets.PROXY}} \
            $CONTAINER_ID python3 -m pip install -r requirements/test.txt

          docker exec \
            --env http_proxy=${{secrets.PROXY}} \
            --env https_proxy=${{secrets.PROXY}} \
            $CONTAINER_ID python3 -m pip install .

          docker exec $CONTAINER_ID lmdeploy check_env
      - name: Convert to turbomind model
        run: |
          docker exec $CONTAINER_ID \
            lmdeploy convert \
            internlm-chat-20b \
            /root/workspace/hf_model \
            --tp 2 \
            --dst-path /root/workspace/workdir/${TB_MODEL}
      - name: Start triton server service
        run: |
          docker exec --detach $CONTAINER_ID bash -c \
            "tritonserver \
            --model-repository=/root/workspace/workdir/${TB_MODEL}/model_repository \
            --allow-http=0 \
            --allow-grpc=1 \
            --grpc-port=${GRPC_PORT} \
            --log-verbose=0 \
            --allow-metrics=1 > run.log 2>&1 ; touch finish.txt"
          # wait for triton server to fully start up
          sleep 180s
          # print triton server log file
          cat run.log
          python3 -c 'import os; assert not os.path.exists("finish.txt"), "Failed to start tritonserver"'
      - name: Test triton server
        run: |
          docker exec \
            --env no_proxy="localhost,127.0.0.1" \
            --env NO_PROXY="localhost,127.0.0.1" \
            $CONTAINER_ID python3 .github/scripts/test_triton_server.py --port ${GRPC_PORT}
          # print triton server log file
          cat run.log
      - name: Clear workfile
        if: always()
        run: |
          export workdir=$(pwd)
          docker exec --workdir /__w/lmdeploy $CONTAINER_ID rm -rf lmdeploy
          mkdir $workdir
          chmod -R 777 $workdir
          docker exec --workdir /__w/lmdeploy $CONTAINER_ID rm -rf /root/workspace/workdir/${TB_MODEL}
          docker stop $CONTAINER_ID


  notify_to_feishu:
    if: always() && !cancelled() && (github.ref_name == 'develop' || github.ref_name == 'main')
    needs: [test_functions, test_triton]
    timeout-minutes: 5
    runs-on: [self-hosted, linux-a100]
    steps:
      - name: fail notify
        if: contains(needs.*.result, 'failure')
        run: |
          curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Lmdeploy- daily test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.FEISHU_USER_ID }}'"}]]}}}}'  ${{ secrets.FEISHU_WEBHOOK_URL }}