0.38.0 +ovis1.6

matatonic · Oct 8, 2024 · 9f4dc20 · 9f4dc20
1 parent 0fdf839
commit 9f4dc20
Show file tree

Hide file tree

Showing 5 changed files with 224 additions and 120 deletions.
diff --git a/README.md b/README.md
@@ -15,6 +15,7 @@ Can't decide which to use? See the [OpenVLM Leaderboard](https://huggingface.co/
 <summary>Full list of supported models</summary>
 
 - [X] [AIDC-AI](https://huggingface.co/AIDC-AI)
+- - [X] [Ovis1.6-Gemma2-9B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B)
 - - [X] [Ovis1.5-Gemma2-9B](https://huggingface.co/AIDC-AI/Ovis1.5-Gemma2-9B)
 - - [X] [Ovis1.5-Llama3-8B](https://huggingface.co/AIDC-AI/Ovis1.5-Llama3-8B)
 - [X] [Ai2](https://huggingface.co/allenai)
@@ -158,6 +159,10 @@ If you can't find your favorite model, you can [open a new issue](https://github
 
 ## Recent updates
 
+Version 0.38.0
+
+- new model support: AIDC-AI/Ovis1.6-Gemma2-9B
+
 Version 0.37.0
 
 - new model support: nvidia/NVLM-D-72B

diff --git a/backend/ovis16.py b/backend/ovis16.py
@@ -0,0 +1,131 @@
+from transformers import AutoModelForCausalLM
+
+from vision_qna import *
+
+# AIDC-AI/Ovis1.6-Gemma2-9B
+
+IMAGE_TOKEN = "<image>"
+
+class VisionQnA(VisionQnABase):
+    """Chat backend for AIDC-AI/Ovis1.6-Gemma2-9B.
+
+    Converts an image chat request into the 'human'/'gpt' conversation
+    list passed to the model's preprocess_inputs() and streams back
+    the generated text.
+    """
+    model_name: str = "generic"
+    format: str = "custom"
+    visual_layers: List[str] = ['visual_tokenizer', 'vte']
+
+    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = None, format = None):
+        """Load the model and fetch its text and visual tokenizers.
+
+        extra_params defaults to None rather than {} so that no dict object is
+        shared between calls; None is treated as an empty dict.
+        """
+        if extra_params is None:
+            extra_params = {}
+
+        super().__init__(model_id, device, device_map, extra_params, format)
+
+        self.params['multimodal_max_length'] = 8192
+
+        self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval()
+
+        self.text_tokenizer = self.model.get_text_tokenizer()
+        self.visual_tokenizer = self.model.get_visual_tokenizer()
+
+        # bitsandbytes already moves the model to the device, so we don't need to do it again.
+        if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
+            self.model = self.model.to(self.device)
+
+        self.loaded_banner()
+
+    async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
+        """Yield the model's reply to ``request`` as text chunks.
+
+        Only 'user' and 'assistant' messages are used; every other role
+        (e.g. system) is skipped before any of its images are fetched, so an
+        ignored message cannot add an image that has no placeholder in the prompt.
+
+        Raises:
+            ValueError: if the request has no user/assistant message with content.
+        """
+        speakers = {'user': 'human', 'assistant': 'gpt'}
+
+        conversation = []
+        images = []
+        for m in request.messages:
+            speaker = speakers.get(m.role)
+            if speaker is None:
+                # system is ignored
+                continue
+
+            content = ''
+            for c in m.content:
+                if c.type == 'image_url':
+                    images.append(await url_to_image(c.image_url.url))
+                    # One placeholder line per image, placed ahead of the message text.
+                    content = IMAGE_TOKEN + '\n' + content
+                elif c.type == 'text':
+                    content += c.text
+
+            if content:
+                conversation.append({'from': speaker, 'value': content})
+
+        if not conversation:
+            # Fail with a clear message instead of an IndexError on conversation[0] below.
+            raise ValueError("request contains no user or assistant message content")
+
+        if not images:
+            # No image was supplied: fall back to the image at black_pixel_url and
+            # reference it from the first turn.
+            images = [ await url_to_image(black_pixel_url) ]
+            conversation[0]['value'] = IMAGE_TOKEN + '\n' + conversation[0]['value']
+
+        _prompt, input_ids, pixel_values = self.model.preprocess_inputs(conversation, images)
+        # Mask out positions holding the pad token.
+        attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id)
+        # Add a leading dimension of size 1 before moving to the model's device.
+        input_ids = input_ids.unsqueeze(0).to(device=self.model.device)
+        attention_mask = attention_mask.unsqueeze(0).to(device=self.model.device)
+        pixel_values = [pixel_values.to(dtype=self.visual_tokenizer.dtype, device=self.visual_tokenizer.device)]
+
+        _, inputs_embeds, _labels, attention_mask = self.model.merge_multimodal(
+            text_input_ids=input_ids,
+            text_attention_masks=attention_mask,
+            text_labels=None,
+            pixel_values=pixel_values,
+            left_padding=True
+        )
+
+        default_params = dict(
+            max_new_tokens=1024,
+            do_sample=False,
+            top_p=None,
+            top_k=None,
+            temperature=None,
+            repetition_penalty=None,
+            eos_token_id=self.model.generation_config.eos_token_id,
+            pad_token_id=self.text_tokenizer.pad_token_id,
+            use_cache=True,
+            num_beams=1,
+        )
+
+        params = self.get_generation_params(request, default_params=default_params)
+
+        generation_kwargs = dict(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            **params,
+        )
+
+        eos_token = self.text_tokenizer.eos_token
+        for new_text in threaded_streaming_generator(generate=self.model.llm.generate, tokenizer=self.text_tokenizer, generation_kwargs=generation_kwargs):
+            # Stop at the first EOS marker and drop it (and anything after it) from the output.
+            end = new_text.find(eos_token) if eos_token else -1
+            if end == -1:
+                yield new_text
+            else:
+                yield new_text[:end]
+                break
diff --git a/model_conf_tests.json b/model_conf_tests.json
@@ -1,4 +1,5 @@
 [
+  ["AIDC-AI/Ovis1.6-Gemma2-9B", "-A", "flash_attention_2"],
   ["AIDC-AI/Ovis1.5-Gemma2-9B", "-A", "flash_attention_2"],
   ["AIDC-AI/Ovis1.5-Llama3-8B", "-A", "flash_attention_2"],
   ["BAAI/Bunny-v1_0-2B-zh", "--load-in-4bit"],