Commit
merge main
zhulin1 committed Mar 7, 2024
2 parents 50ca504 + e710c4c commit 60d9bfd
Showing 25 changed files with 780 additions and 298 deletions.
106 changes: 78 additions & 28 deletions .github/workflows/daily_ete_test.yml
@@ -2,8 +2,29 @@ name: daily_ete_test

on:
workflow_dispatch:
inputs:
repo_org:
required: false
description: 'Tested repository organization name. Default is InternLM'
type: string
default: 'InternLM/lmdeploy'
repo_ref:
required: false
description: 'Set branch or tag or commit id. Default is "main"'
type: string
default: 'main'
backend:
required: true
description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"
model:
required: true
description: 'Set testcase module filter: chat, restful, pipeline, quantization. Default contains all models'
type: string
default: "['quantization','convert','pipeline','restful','chat','interface-pipeline']"
schedule:
- cron: '00 18 * * *'
- cron: '00 21 * * *'

env:
HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
@@ -13,7 +34,7 @@ env:
jobs:
test_functions:
runs-on: [self-hosted, linux-a100]
timeout-minutes: 420
timeout-minutes: 240
env:
REPORT_DIR: /nvme/qa_test_models/test-reports
container:
@@ -23,6 +44,7 @@ jobs:
- /nvme/github-actions/pip-cache:/root/.cache/pip
- /nvme/github-actions/packages:/root/packages
- /nvme/qa_test_models:/nvme/qa_test_models
- /mnt/bigdisk/qa_test_models:/mnt/bigdisk/qa_test_models
- /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
steps:
- name: Setup systems
@@ -33,7 +55,10 @@ jobs:
dpkg -i /root/packages/allure_2.24.1-1_all.deb
rm -rf /var/lib/apt/lists/*
- name: Clone repository
uses: actions/checkout@v2
uses: actions/checkout@v3
with:
repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: Install pytorch
run: |
python3 -m pip cache dir
@@ -68,64 +93,89 @@ jobs:
run: |
python3 -m pip list
lmdeploy check_env
rm -rf allure-results
- name: Test lmdeploy - quantization w4a16
continue-on-error: true
if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'quantization'))
run: |
pytest autotest/tools/quantization/test_quantization_w4a16.py -m 'not pr_test' -n 8 --alluredir=allure-results --clean-alluredir
- name: Test lmdeploy - quantization kv int8
continue-on-error: true
if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'quantization'))
run: |
pytest autotest/tools/quantization/test_quantization_kvint8.py -n 8 --alluredir=allure-results
- name: Test lmdeploy - quantization w8a8
continue-on-error: true
if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'quantization'))
run: |
pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=allure-results
- name: Test lmdeploy - quantization kv int8 and w4a16
continue-on-error: true
if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'quantization'))
run: |
pytest autotest/tools/quantization/test_quantization_kvint8_w4a16.py -n 8 --alluredir=allure-results
- name: Test lmdeploy - convert
continue-on-error: true
if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'convert'))
run: |
pytest autotest/tools/convert -m 'not pr_test' -n 6 --alluredir=allure-results --dist loadgroup
- name: Test lmdeploy - interface turbomind case
pytest autotest/tools/convert -m 'not pr_test' -n 8 --alluredir=allure-results
- name: Test lmdeploy - chat workspace
continue-on-error: true
if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'chat'))
timeout-minutes: 20
run: |
pytest autotest/interface/pipeline/test_pipeline_turbomind_func.py -m 'not pr_test' --alluredir=allure-results
- name: Test lmdeploy - pipeline turbomind
continue-on-error: true
timeout-minutes: 45
run: pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'not pr_test' --alluredir=allure-results
- name: Test lmdeploy - pipeline torch
pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results
pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results
- name: Test lmdeploy - chat hf turbomind
continue-on-error: true
timeout-minutes: 75
run: pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'not pr_test' --alluredir=allure-results
- name: Test lmdeploy - restful turbomind
if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'chat'))
timeout-minutes: 20
run: |
pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results
pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results
- name: Test lmdeploy - chat hf torch
continue-on-error: true
timeout-minutes: 60
run: pytest autotest/tools/restful/test_restful_chat_turbomind.py -m 'not pr_test' --alluredir=allure-results
- name: Test lmdeploy - restful torch
if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'chat'))
timeout-minutes: 20
run: |
pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results
pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results
- name: Test lmdeploy - pipeline turbomind
continue-on-error: true
timeout-minutes: 80
run: pytest autotest/tools/restful/test_restful_chat_pytorch.py -m 'not pr_test' --alluredir=allure-results
- name: Test lmdeploy - chat workspace
if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'pipeline'))
timeout-minutes: 25
run: |
pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results
pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results
- name: Test lmdeploy - restful turbomind
continue-on-error: true
if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'restful'))
timeout-minutes: 30
run: |
pytest autotest/tools/chat/test_command_chat_workspace.py -m 'not pr_test' -n 4 --alluredir=allure-results
- name: Test lmdeploy - chat hf turbomind
pytest autotest/tools/restful/test_restful_chat_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results
pytest autotest/tools/restful/test_restful_chat_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results
- name: Test lmdeploy - interface pipeline turbomind case
continue-on-error: true
timeout-minutes: 45
if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'interface-pipeline'))
timeout-minutes: 20
run: |
pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'not pr_test' -n 4 --alluredir=allure-results
- name: Test lmdeploy - chat hf torch
pytest autotest/interface/pipeline/test_pipeline_turbomind_func.py -m 'not pr_test' --alluredir=allure-results
- name: Test lmdeploy - pipeline torch
continue-on-error: true
if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'pipeline'))
timeout-minutes: 25
run: |
pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results
pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results
- name: Test lmdeploy - restful torch
continue-on-error: true
timeout-minutes: 60
if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'restful'))
timeout-minutes: 40
run: |
pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'not pr_test' -n 4 --alluredir=allure-results
pytest autotest/tools/restful/test_restful_chat_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=allure-results
pytest autotest/tools/restful/test_restful_chat_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=allure-results
- name: Test lmdeploy - rerun all fail cases
timeout-minutes: 60
timeout-minutes: 30
run: |
pytest autotest --lf --alluredir=allure-results
- name: Generate reports
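For reference, the workflow_dispatch inputs added above (repo_org, repo_ref, backend, model) can be driven programmatically as well as from the Actions UI. The following is a minimal sketch against GitHub's standard workflow-dispatch REST endpoint; the token handling and the chosen input values are illustrative and not part of this commit:

# Sketch: trigger daily_ete_test.yml with the new dispatch inputs.
# Assumes GITHUB_TOKEN is a PAT with workflow/actions write access (placeholder).
import os
import requests

token = os.environ["GITHUB_TOKEN"]
repo = "InternLM/lmdeploy"  # matches the repo_org default in the workflow
url = f"https://api.github.com/repos/{repo}/actions/workflows/daily_ete_test.yml/dispatches"

payload = {
    "ref": "main",  # branch that carries the workflow file
    "inputs": {
        "repo_org": "InternLM/lmdeploy",
        "repo_ref": "main",
        "backend": "['turbomind']",       # same list-literal format as the workflow defaults
        "model": "['chat','restful']",    # run only the chat and restful modules
    },
}

resp = requests.post(
    url,
    json=payload,
    headers={
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {token}",
    },
    timeout=30,
)
resp.raise_for_status()  # a successful dispatch returns 204 No Content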
3 changes: 2 additions & 1 deletion .github/workflows/pr_ete_test.yml
@@ -34,6 +34,7 @@ jobs:
- /nvme/share_data/github-actions/pip-cache:/root/.cache/pip
- /nvme/share_data/github-actions/packages:/root/packages
- /nvme/qa_test_models:/nvme/qa_test_models
- /mnt/bigdisk/qa_test_models:/mnt/bigdisk/qa_test_models
- /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
steps:
- name: Setup systems
@@ -81,7 +82,7 @@ jobs:
lmdeploy check_env
- name: Test lmdeploy
timeout-minutes: 120
run: CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m pr_test --alluredir=allure-results --clean-alluredir
run: CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m pr_test -v -s --alluredir=allure-results --clean-alluredir
- name: Generate reports
if: always()
run: |
112 changes: 57 additions & 55 deletions autotest/config.yaml
@@ -1,4 +1,4 @@
model_path: /nvme/qa_test_models
model_path: /mnt/bigdisk/qa_test_models
dst_path: /nvme/qa_test_models/autotest_model
log_path: /nvme/qa_test_models/autotest_model/log
dataset_path: /nvme/qa_test_models/...dataset
@@ -13,67 +13,69 @@ tp_config:


turbomind_model:
- llama-2-7b-chat
- internlm2-chat-1_8b
- internlm-chat-7b
- internlm-chat-20b
- internlm2-chat-7b
- internlm2-chat-20b
- Qwen-7B-Chat
- Qwen-14B-Chat
- llama2-chat-7b-w4
- Baichuan2-7B-Chat
- Yi-6B-Chat
- internlm2-1_8b
- internlm2-20b
- CodeLlama-7b-Instruct-hf
- meta-llama/Llama-2-7b-chat
- internlm/internlm2-chat-1_8b
- internlm/internlm-chat-7b
- internlm/internlm-chat-20b
- internlm/internlm2-chat-7b
- internlm/internlm2-chat-20b
- internlm/internlm2-chat-7b-4bits
- internlm/internlm2-chat-20b-4bits
- Qwen/Qwen-7B-Chat
- Qwen/Qwen-14B-Chat
- lmdeploy/llama2-chat-7b-w4
- baichuan-inc/Baichuan2-7B-Chat
- 01-ai/Yi-6B-Chat
- internlm/internlm2-1_8b
- internlm/internlm2-20b
- codellama/CodeLlama-7b-Instruct-hf


pytorch_model:
- llama-2-7b-chat
- internlm-chat-7b
- internlm-chat-20b
- internlm2-chat-7b
- internlm2-chat-20b
- Baichuan2-7B-Chat
- Baichuan2-13B-Chat
- chatglm2-6b
- falcon-7b
- Yi-6B-Chat
- internlm2-1_8b
- internlm2-20b
- Qwen1.5-7B-Chat
- Mistral-7B-Instruct-v0.1
- Mixtral-8x7B-Instruct-v0.1
- gemma-7b-it
- deepseek-moe-16b-chat
- meta-llama/Llama-2-7b-chat
- internlm/internlm-chat-7b
- internlm/internlm-chat-20b
- internlm/internlm2-chat-7b
- internlm/internlm2-chat-20b
- baichuan-inc/Baichuan2-7B-Chat
- baichuan-inc/Baichuan2-13B-Chat
- THUDM/chatglm2-6b
- tiiuae/falcon-7b
- 01-ai/Yi-6B-Chat
- internlm/internlm2-1_8b
- internlm/internlm2-20b
- Qwen/Qwen1.5-7B-Chat
- mistralai/Mistral-7B-Instruct-v0.1
- mistralai/Mixtral-8x7B-Instruct-v0.1
- google/gemma-7b-it
- deepseek-ai/deepseek-moe-16b-chat


quatization_case_config:
w4a16:
- llama-2-7b-chat
- internlm-chat-20b
- Qwen-7B-Chat
- Qwen-14B-Chat
- internlm2-chat-20b
- Baichuan2-7B-Chat
- internlm2-20b
- meta-llama/Llama-2-7b-chat
- internlm/internlm-chat-20b
- Qwen/Qwen-7B-Chat
- Qwen/Qwen-14B-Chat
- internlm/internlm2-chat-20b
- baichuan-inc/Baichuan2-7B-Chat
- internlm/internlm2-20b
kvint8: # more models are supported kvint8 quantization, but the chat response are not good, already removed
- llama-2-7b-chat
- internlm-chat-20b
- internlm2-chat-20b
- meta-llama/Llama-2-7b-chat
- internlm/internlm-chat-20b
- internlm/internlm2-chat-20b
kvint8_w4a16:
- llama-2-7b-chat
- internlm-chat-20b
- internlm2-chat-20b
- internlm2-20b
- Qwen-7B-Chat
- Qwen-14B-Chat
- Baichuan2-7B-Chat
- meta-llama/Llama-2-7b-chat
- internlm/internlm-chat-20b
- internlm/internlm2-chat-20b
- internlm/internlm2-20b
- Qwen/Qwen-7B-Chat
- Qwen/Qwen-14B-Chat
- baichuan-inc/Baichuan2-7B-Chat
w8a8:
- llama-2-7b-chat
- internlm-chat-20b
- internlm2-chat-20b
- internlm2-chat-7b
- Yi-6B-Chat
- internlm2-20b
- meta-llama/Llama-2-7b-chat
- internlm/internlm-chat-20b
- internlm/internlm2-chat-20b
- internlm/internlm2-chat-7b
- 01-ai/Yi-6B-Chat
- internlm/internlm2-20b
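
The model lists above switch from bare directory names to Hugging Face style org/name identifiers, and model_path moves to /mnt/bigdisk/qa_test_models, so models are now expected on disk under <model_path>/<org>/<name>. Below is a small sketch of how the config is consumed, mirroring the path join used in the test file that follows; the file name and keys come from autotest/config.yaml:

# Sketch: resolve an org-prefixed model id against the updated model_path.
import yaml

with open("autotest/config.yaml") as f:
    config = yaml.safe_load(f)

model = "internlm/internlm2-chat-20b"  # one entry from turbomind_model
model_path = "/".join([config.get("model_path"), model])
print(model_path)
# -> /mnt/bigdisk/qa_test_models/internlm/internlm2-chat-20b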
16 changes: 8 additions & 8 deletions autotest/interface/pipeline/test_pipeline_turbomind_func.py
@@ -10,15 +10,15 @@
@pytest.mark.flaky(reruns=0)
class TestPipelineTurbomindFuncRegression:

@pytest.mark.parametrize('model', ['internlm2-chat-20b'])
@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b'])
def test_backend_config_tp(self, config, model):
with pytest.raises(AssertionError, match='tp should be 2\\^n'):
model_path = '/'.join([config.get('model_path'), model])
backend_config = TurbomindEngineConfig(tp=100)
pipe = pipeline(model_path, backend_config=backend_config)
del pipe

@pytest.mark.parametrize('model', ['internlm2-chat-20b'])
@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b'])
def test_backend_config_session_len(self, config, model):
model_path = '/'.join([config.get('model_path'), model])
backend_config = TurbomindEngineConfig(session_len=10)
@@ -29,7 +29,7 @@ def test_backend_config_session_len(self, config, model):
assert response[i].finish_reason == 'length', str(response[i])
assert response[i].generate_token_len == 0, str(response[i])

@pytest.mark.parametrize('model', ['internlm2-chat-20b'])
@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b'])
def test_gen_config_test(self, config, model):
model_path = '/'.join([config.get('model_path'), model])
pipe = pipeline(model_path)
@@ -111,7 +111,7 @@ def test_gen_config_test(self, config, model):

del pipe

@pytest.mark.parametrize('model', ['internlm2-chat-20b'])
@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b'])
def future_test_backend_config_cache_max_entry_count(self, config, model):
model_path = '/'.join([config.get('model_path'), model])
backend_config = TurbomindEngineConfig(cache_max_entry_count=-1)
@@ -122,7 +122,7 @@ def future_test_backend_config_cache_max_entry_count(self, config, model):
with assume:
assert response[i].finish_reason == 'length', str(response[i])

@pytest.mark.parametrize('model', ['internlm2-chat-20b'])
@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b'])
def test_backend_config_max_batch_size2(self, config, model):
model_path = '/'.join([config.get('model_path'), model])
backend_config = TurbomindEngineConfig(max_batch_size=-1)
@@ -140,7 +140,7 @@ def test_backend_config_max_batch_size2(self, config, model):
with assume:
assert response[i].text == '', str(response[i])

@pytest.mark.parametrize('model', ['internlm2-chat-20b'])
@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b'])
def test_pipeline_batch_infer(self, config, model):
model_path = '/'.join([config.get('model_path'), model])
pipe = pipeline(model_path)
@@ -160,7 +160,7 @@ def test_pipeline_batch_infer(self, config, model):
with assume:
assert response[i].session_id == i

@pytest.mark.parametrize('model', ['internlm2-chat-20b'])
@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b'])
def test_pipeline_stream_infer(self, config, model):
model_path = '/'.join([config.get('model_path'), model])
pipe = pipeline(model_path)
@@ -207,7 +207,7 @@ def test_pipeline_stream_infer(self, config, model):
with assume:
assert outputs_list[-1].finish_reason is not None, str(output)

@pytest.mark.parametrize('model', ['internlm2-chat-20b'])
@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b'])
def test_pipeline_stream_infer2(self, config, model):
model_path = '/'.join([config.get('model_path'), model])
pipe = pipeline(model_path)
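For context, here is a minimal sketch of the pipeline API these regression tests exercise, using one of the renamed model ids; the local path follows the config layout above and the generation parameters are illustrative:

# Sketch of lmdeploy pipeline usage as exercised by the tests above.
from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline

model_path = "/mnt/bigdisk/qa_test_models/internlm/internlm2-chat-20b"
backend_config = TurbomindEngineConfig(tp=2)  # tp must be a power of 2
pipe = pipeline(model_path, backend_config=backend_config)

prompts = ["Hi, please introduce yourself", "Shanghai is"]
responses = pipe(prompts, gen_config=GenerationConfig(max_new_tokens=128))

for i, resp in enumerate(responses):
    # Fields asserted by the tests: text, session_id, finish_reason, generate_token_len
    print(i, resp.session_id, resp.finish_reason, resp.text[:60])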
(The diffs for the remaining changed files in this commit are not shown here.)
