support ZhipuAI/cogagent-9b-20241220 (#2810)

modelscope · Dec 31, 2024 · 6b6825e · 6b6825e
1 parent 307dd05
commit 6b6825e
Show file tree

Hide file tree

Showing 7 changed files with 36 additions and 6 deletions.
diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh
@@ -26,7 +26,7 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
 
     # test with install
     pip install .
-    pip install auto_gptq bitsandbytes deepspeed -U -i https://mirrors.aliyun.com/pypi/simple/
+    pip install auto_gptq bitsandbytes deepspeed==0.14.* -U -i https://mirrors.aliyun.com/pypi/simple/
 else
     echo "Running case in release image, run case directly!"
 fi

diff --git a/docs/source/Instruction/ReleaseNote3.0.md b/docs/source/Instruction/ReleaseNote3.0.md
@@ -84,4 +84,4 @@
 1. RM/PPO能力3.0版本尚不支持，请使用2.6.1版本
 2. 自定义数据集评测3.0版本尚不支持，请使用2.6.1版本
 3. Megatron预训练能力3.0版本尚不支持，请使用2.6.1版本
-4. 文档和README，尤其是英文部分暂时未更新完整
+4. 文档和README暂时未更新完整
diff --git a/docs/source/Instruction/支持的模型和数据集.md b/docs/source/Instruction/支持的模型和数据集.md
@@ -490,7 +490,8 @@
 |[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)|
 |[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qvq|qvq|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)|
 |[AIDC-AI/Ovis1.6-Gemma2-9B](https://modelscope.cn/models/AIDC-AI/Ovis1.6-Gemma2-9B)|ovis1_6|ovis1_6|transformers>=4.42|vision|[AIDC-AI/Ovis1.6-Gemma2-9B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B)|
-|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b)|glm4v|glm4v|transformers>=4.42|-|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)|
+|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b)|glm4v|glm4v|-|-|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)|
+|[ZhipuAI/cogagent-9b-20241220](https://modelscope.cn/models/ZhipuAI/cogagent-9b-20241220)|glm4v|glm4v|transformers>=4.42|-|[THUDM/cogagent-9b-20241220](https://huggingface.co/THUDM/cogagent-9b-20241220)|
 |[ZhipuAI/glm-edge-v-2b](https://modelscope.cn/models/ZhipuAI/glm-edge-v-2b)|glm_edge_v|glm_edge_v|transformers>=4.46|vision|[THUDM/glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b)|
 |[ZhipuAI/glm-edge-4b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-4b-chat)|glm_edge_v|glm_edge_v|transformers>=4.46|vision|[THUDM/glm-edge-4b-chat](https://huggingface.co/THUDM/glm-edge-4b-chat)|
 |[ZhipuAI/cogvlm-chat](https://modelscope.cn/models/ZhipuAI/cogvlm-chat)|cogvlm|cogvlm|transformers<4.42|-|[THUDM/cogvlm-chat-hf](https://huggingface.co/THUDM/cogvlm-chat-hf)|

diff --git a/docs/source_en/Instruction/ReleaseNote3.0.md b/docs/source_en/Instruction/ReleaseNote3.0.md
@@ -97,4 +97,4 @@ The parameters marked as compatible in version 2.0 have been entirely removed.
 1. RM/PPO capabilities are not supported in version 3.0. Please use version 2.6.1.
 2. Custom dataset evaluation is not supported in version 3.0. Please use version 2.6.1.
 3. Megatron pre-training capabilities are not supported in version 3.0. Please use version 2.6.1.
-4. Documentation and README, especially the English portions, are temporarily incomplete and will be updated.
+4. Documentation and README are temporarily incomplete and will be updated.
diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md
@@ -490,7 +490,8 @@ The table below introduces the models integrated with ms-swift:
 |[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45, librosa|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)|
 |[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qvq|qvq|transformers>=4.45, qwen_vl_utils>=0.0.6, pyav, decord|vision, video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)|
 |[AIDC-AI/Ovis1.6-Gemma2-9B](https://modelscope.cn/models/AIDC-AI/Ovis1.6-Gemma2-9B)|ovis1_6|ovis1_6|transformers>=4.42|vision|[AIDC-AI/Ovis1.6-Gemma2-9B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B)|
-|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b)|glm4v|glm4v|transformers>=4.42|-|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)|
+|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b)|glm4v|glm4v|-|-|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)|
+|[ZhipuAI/cogagent-9b-20241220](https://modelscope.cn/models/ZhipuAI/cogagent-9b-20241220)|glm4v|glm4v|transformers>=4.42|-|[THUDM/cogagent-9b-20241220](https://huggingface.co/THUDM/cogagent-9b-20241220)|
 |[ZhipuAI/glm-edge-v-2b](https://modelscope.cn/models/ZhipuAI/glm-edge-v-2b)|glm_edge_v|glm_edge_v|transformers>=4.46|vision|[THUDM/glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b)|
 |[ZhipuAI/glm-edge-4b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-4b-chat)|glm_edge_v|glm_edge_v|transformers>=4.46|vision|[THUDM/glm-edge-4b-chat](https://huggingface.co/THUDM/glm-edge-4b-chat)|
 |[ZhipuAI/cogvlm-chat](https://modelscope.cn/models/ZhipuAI/cogvlm-chat)|cogvlm|cogvlm|transformers<4.42|-|[THUDM/cogvlm-chat-hf](https://huggingface.co/THUDM/cogvlm-chat-hf)|

diff --git a/swift/llm/model/model/glm.py b/swift/llm/model/model/glm.py
@@ -185,6 +185,12 @@ def get_model_tokenizer_glm4v(model_dir: str,
                 ],
                 requires=['transformers>=4.42,<4.45'],
             ),
+            ModelGroup(
+                [
+                    Model('ZhipuAI/cogagent-9b-20241220', 'THUDM/cogagent-9b-20241220'),
+                ],
+                requires=['transformers>=4.42'],
+            )
         ],
         TemplateType.glm4v,
         get_model_tokenizer_glm4v,

diff --git a/tests/test_align/test_template/test_vision.py b/tests/test_align/test_template/test_vision.py
@@ -81,6 +81,27 @@ def test_glm4v():
                          '主要是白色和棕色相间的花纹。背景模糊不清，但似乎是一个室内环境。')
 
 
+def test_cogagent():
+    pt_engine = PtEngine('ZhipuAI/cogagent-9b-20241220')
+    messages = [{
+        'role':
+        'user',
+        'content':
+        """<image>Task: I'm looking for a software to \"edit my photo with grounding\"
+History steps:
+(Platform: Mac)
+(Answer in Action-Operation-Sensitive format.)"""
+    }]
+    images = ['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/agent.png']
+    response = _infer_model(pt_engine, messages=messages, images=images)
+    pt_engine.default_template.template_backend = 'jinja'
+    response2 = _infer_model(pt_engine, messages=messages, images=images)
+    assert response == response2 == (
+        """Action: Click on the 'Adobe Photoshop 2023' icon located in the middle of the screen to open the application.
+Grounded Operation: CLICK(box=[[346,574,424,710]], element_type='卡片', element_info='Adobe Photoshop 2023')
+<<一般操作>>""")
+
+
 def test_minicpmv():
     pt_engine = PtEngine('OpenBMB/MiniCPM-V-2_6')
     _infer_model(pt_engine)
@@ -314,7 +335,8 @@ def test_doc_owl2():
     # test_deepseek_vl()
     # test_deepseek_vl2()
     # test_qwen_vl()
-    test_glm4v()
+    # test_glm4v()
+    test_cogagent()
     # test_minicpmv()
     # test_got_ocr()
     # test_paligemma()