0.31.0 +qwen2-vl
matatonic committed Sep 13, 2024
1 parent c615dcc commit e438d38
Showing 8 changed files with 211 additions and 126 deletions.
1 change: 1 addition & 0 deletions .github/workflows/build-docker.yml
@@ -5,6 +5,7 @@ on:
push:
branches:
- 'main'
- 'dev'
release:
types: [published]

2 changes: 1 addition & 1 deletion Dockerfile
@@ -10,7 +10,7 @@ RUN git clone https://github.com/togethercomputer/Dragonfly --single-branch /app

COPY requirements.txt .
ARG VERSION=latest
RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.41.2" >> requirements.txt; else echo "transformers>=4.44.2\nautoawq>=0.2.5" >> requirements.txt ; fi
RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.41.2" >> requirements.txt; else echo "git+https://github.com/huggingface/transformers\nautoawq>=0.2.5" >> requirements.txt ; fi
# TODO: nvidia apex wheel
RUN --mount=type=cache,target=/root/.cache/pip pip install -U -r requirements.txt

7 changes: 6 additions & 1 deletion README.md
@@ -83,7 +83,7 @@ Can't decide which to use? See the [OpenVLM Leaderboard](https://huggingface.co/
- - [ ] [InternVL2-2B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-2B-AWQ) (currently errors)
- - [X] [InternVL2-1B](https://huggingface.co/OpenGVLab/InternVL2-1B)
- - [X] [InternVL-Chat-V1-5](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5) (won't gpu split yet)
- - [X] [InternVL-Chat-V1-5-Int8](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5-Int8) (won't gpu split yet)
- - [ ] [InternVL-Chat-V1-5-AWQ](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5-AWQ) (won't gpu split yet)
- - [X] [Mini-InternVL-Chat-4B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-4B-V1-5) (alternate docker only)
- - [X] [Mini-InternVL-Chat-2B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-2B-V1-5)
- [X] [Salesforce](https://huggingface.co/Salesforce)
@@ -133,6 +133,11 @@ If you can't find your favorite model, you can [open a new issue](https://github.

## Recent updates

Version 0.31.0

- new model support: Qwen/Qwen2-VL family of models (video untested; GPTQ not working yet, but AWQ and BF16 work fine) — see the usage sketch below
- transformers is now installed from git (Qwen2-VL support is not yet in a released version)
- Regression: THUDM/glm-4v-9b is broken in this release

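To try the new Qwen2-VL support, point any OpenAI-compatible client at the server. A minimal sketch, assuming the server is running on its default port 5006 with one of the Qwen2-VL configs; the prompt and image URL are placeholders:

```python
from openai import OpenAI

# Assumes a running server, started with something like:
#   python vision.py --model Qwen/Qwen2-VL-7B-Instruct-AWQ -A flash_attention_2
# The api_key is not checked by the server; any string works.
client = OpenAI(base_url="http://localhost:5006/v1", api_key="skip")

response = client.chat.completions.create(
    model="Qwen/Qwen2-VL-7B-Instruct-AWQ",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.jpg"}},
        ],
    }],
    max_tokens=256,
)
print(response.choices[0].message.content)
```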
Version 0.30.0

83 changes: 83 additions & 0 deletions backend/qwen2-vl.py
@@ -0,0 +1,83 @@
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

import os
from vision_qna import *

# Qwen/Qwen2-VL-2B-Instruct-AWQ
# Qwen/Qwen2-VL-2B-Instruct
# Qwen/Qwen2-VL-7B-Instruct-AWQ
# Qwen/Qwen2-VL-7B-Instruct
# X Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4
# X Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8
# X Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4
# X Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8

class VisionQnA(VisionQnABase):
    model_name: str = "qwen2-vl"
    format: str = 'chatml'
    vision_layers: List[str] = ['visual']

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        self.processor = AutoProcessor.from_pretrained(model_id)

        # the model class is loaded directly, so trust_remote_code is not needed
        del self.params['trust_remote_code']
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(**self.params).eval()

        self.loaded_banner()

    async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
        # image_tag = '<|vision_start|><|image_pad|><|vision_end|>'

        messages = []

        for m in request.messages:
            if m.role == 'user':
                msg = { 'role': m.role, 'content': [] }
                for c in m.content:
                    if c.type == 'image_url':
                        # hack around https://github.com/QwenLM/Qwen2-VL/issues/202
                        # strip a stray ';charset=...' segment from data: URLs
                        if c.image_url.url.startswith('data:image'):
                            parts = c.image_url.url.split(';')
                            if parts[1].startswith('charset='):
                                c.image_url.url = parts[0] + ';' + parts[2]

                        msg['content'].append({'type': c.type, 'image': c.image_url.url})
                    elif c.type == 'text':
                        msg['content'].append({'type': c.type, 'text': c.text})
                    elif c.type == 'video': # not likely to work.
                        msg['content'].append({'type': c.type, 'video': c.image_url.url})
            else:
                # assumes non-user (system/assistant) messages are text-only; pass them as plain text
                msg = { 'role': m.role, 'content': "\n".join(c.text for c in m.content if c.type == 'text') }

            messages.append(msg)

        text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.device)

        params = self.get_generation_params(request, default_params={})

        generation_kwargs = dict(
            **inputs,
            **params,
        )

        for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs):
            end = new_text.find(self.processor.tokenizer.eos_token)
            if end == -1:
                yield new_text
            else:
                yield new_text[:end]
                break
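The charset workaround in `stream_chat_with_images` normalizes data URLs like `data:image/png;charset=utf-8;base64,...`, which the Qwen2-VL image loader rejects (see the linked issue). A standalone sketch of the same normalization, with made-up payloads:

```python
def strip_charset(url: str) -> str:
    """Drop a stray ';charset=...' segment from a data: image URL."""
    if url.startswith('data:image'):
        parts = url.split(';')
        if len(parts) > 2 and parts[1].startswith('charset='):
            return parts[0] + ';' + parts[2]
    return url

# 'data:image/png;charset=utf-8;base64,AAAA' -> 'data:image/png;base64,AAAA'
assert strip_charset('data:image/png;charset=utf-8;base64,AAAA') == 'data:image/png;base64,AAAA'
# URLs without a charset segment pass through unchanged.
assert strip_charset('data:image/png;base64,AAAA') == 'data:image/png;base64,AAAA'
```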
14 changes: 4 additions & 10 deletions model_conf_tests.json
@@ -24,13 +24,10 @@
["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--max-tiles", "40", "--load-in-4bit"],
["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--max-tiles", "40"],
["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0"],
["OpenGVLab/InternVL-Chat-V1-5-Int8", "--device-map", "cuda:0"],
["OpenGVLab/InternVL2-1B", "--device-map", "cuda:0", "--load-in-4bit"],
["OpenGVLab/InternVL2-1B", "--device-map", "cuda:0"],
["OpenGVLab/InternVL2-2B", "--device-map", "cuda:0", "--load-in-4bit"],
["OpenGVLab/InternVL2-2B", "--device-map", "cuda:0"],
["OpenGVLab/InternVL2-4B", "--device-map", "cuda:0", "--load-in-4bit"],
["OpenGVLab/InternVL2-4B", "--device-map", "cuda:0"],
["OpenGVLab/InternVL2-8B", "--device-map", "cuda:0", "--load-in-4bit"],
["OpenGVLab/InternVL2-8B", "--device-map", "cuda:0"],
["OpenGVLab/InternVL2-26B", "--device-map", "cuda:0", "--load-in-4bit"],
@@ -42,11 +39,12 @@
["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--max-tiles", "40", "--load-in-4bit"],
["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--max-tiles", "40"],
["OpenGVLab/Mini-InternVL-Chat-2B-V1-5"],
["OpenGVLab/Mini-InternVL-Chat-4B-V1-5", "--max-tiles", "40", "--load-in-4bit"],
["OpenGVLab/Mini-InternVL-Chat-4B-V1-5", "--load-in-4bit"],
["OpenGVLab/Mini-InternVL-Chat-4B-V1-5"],
["Qwen/Qwen-VL-Chat", "--load-in-4bit"],
["Qwen/Qwen-VL-Chat"],
["Qwen/Qwen2-VL-2B-Instruct-AWQ", "-A", "flash_attention_2"],
["Qwen/Qwen2-VL-2B-Instruct", "-A", "flash_attention_2"],
["Qwen/Qwen2-VL-7B-Instruct-AWQ", "-A", "flash_attention_2"],
["Qwen/Qwen2-VL-7B-Instruct", "-A", "flash_attention_2"],
["Salesforce/xgen-mm-phi3-mini-instruct-dpo-r-v1.5"],
["Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5"],
["Salesforce/xgen-mm-phi3-mini-instruct-singleimg-r-v1.5"],
@@ -69,14 +67,10 @@
["fancyfeast/joy-caption-pre-alpha", "-A", "flash_attention_2"],
["internlm/internlm-xcomposer2d5-7b", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"],
["internlm/internlm-xcomposer2d5-7b", "-A", "flash_attention_2", "--device-map", "cuda:0"],
["internlm/internlm-xcomposer2-4khd-7b", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"],
["internlm/internlm-xcomposer2-4khd-7b", "-A", "flash_attention_2", "--device-map", "cuda:0"],
["internlm/internlm-xcomposer2-7b", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"],
["internlm/internlm-xcomposer2-7b", "-A", "flash_attention_2", "--device-map", "cuda:0"],
["internlm/internlm-xcomposer2-7b-4bit", "-A", "flash_attention_2"],
["internlm/internlm-xcomposer2-vl-1_8b", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"],
["internlm/internlm-xcomposer2-vl-1_8b", "-A", "flash_attention_2", "--device-map", "cuda:0"],
["internlm/internlm-xcomposer2-vl-7b", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"],
["internlm/internlm-xcomposer2-vl-7b", "-A", "flash_attention_2", "--device-map", "cuda:0"],
["internlm/internlm-xcomposer2-vl-7b-4bit", "-A", "flash_attention_2"],
["llava-hf/llava-1.5-13b-hf", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"],
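Each entry in model_conf_tests.json is an argument list: a model id followed by the extra flags it needs (attention implementation, device map, quantization). A hedged sketch of how the new Qwen2-VL entries could be turned into launch commands, assuming the repo's `vision.py` entry point and `--model` flag:

```python
import json

with open('model_conf_tests.json') as f:
    configs = json.load(f)

for model_id, *flags in configs:
    if model_id.startswith('Qwen/Qwen2-VL'):
        # e.g. python vision.py --model Qwen/Qwen2-VL-2B-Instruct-AWQ -A flash_attention_2
        print('python vision.py --model', model_id, *flags)
```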
1 change: 1 addition & 0 deletions requirements.txt
@@ -41,6 +41,7 @@ matplotlib
optimum
tiktoken
transformers_stream_generator
qwen-vl-utils

# video
decord
(Diffs for the remaining two changed files are not shown.)
