
0.38.0 +ovis1.6
matatonic committed Oct 8, 2024
1 parent 0fdf839 commit 9f4dc20
Showing 5 changed files with 224 additions and 120 deletions.
README.md (5 additions, 0 deletions)
@@ -15,6 +15,7 @@ Can't decide which to use? See the [OpenVLM Leaderboard](https://huggingface.co/
<summary>Full list of supported models</summary>

- [X] [AIDC-AI](https://huggingface.co/AIDC-AI)
- - [X] [Ovis1.6-Gemma2-9B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B)
- - [X] [Ovis1.5-Gemma2-9B](https://huggingface.co/AIDC-AI/Ovis1.5-Gemma2-9B)
- - [X] [Ovis1.5-Llama3-8B](https://huggingface.co/AIDC-AI/Ovis1.5-Llama3-8B)
- [X] [Ai2](https://huggingface.co/allenai)
@@ -158,6 +159,10 @@ If you can't find your favorite model, you can [open a new issue](https://github

## Recent updates

Version 0.38.0

- new model support: AIDC-AI/Ovis1.6-Gemma2-9B

Version 0.37.0

- new model support: nvidia/NVLM-D-72B
backend/ovis16.py (new file, 95 additions, 0 deletions)
@@ -0,0 +1,95 @@
import torch
from transformers import AutoModelForCausalLM

from vision_qna import *

# AIDC-AI/Ovis1.6-Gemma2-9B

IMAGE_TOKEN = "<image>"

class VisionQnA(VisionQnABase):
model_name: str = "generic"
format: str = "custom"
visual_layers: List[str] = ['visual_tokenizer', 'vte']

def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
super().__init__(model_id, device, device_map, extra_params, format)

self.params['multimodal_max_length'] = 8192

self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval()

self.text_tokenizer = self.model.get_text_tokenizer()
self.visual_tokenizer = self.model.get_visual_tokenizer()

# bitsandbytes already moves the model to the device, so we don't need to do it again.
if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
self.model = self.model.to(self.device)

self.loaded_banner()

async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
conversation = []
images = []
for m in request.messages:
content = ''
for c in m.content:
if c.type == 'image_url':
image = await url_to_image(c.image_url.url)
images.extend([image])
content = IMAGE_TOKEN + '\n' + content
elif c.type == 'text':
content += c.text

if content:
if m.role == 'user':
conversation.extend([{'from': 'human', 'value': content }])
elif m.role == 'assistant':
conversation.extend([{'from': 'gpt', 'value': content }])
# system is ignored

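        # No images in the request: substitute a single black pixel and prepend the image token to the first turn.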
if len(images) < 1:
images = [ await url_to_image(black_pixel_url) ]
conversation[0]['value'] = IMAGE_TOKEN + '\n' + conversation[0]['value']

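        # Tokenize the conversation and convert the images to pixel values.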
_prompt, input_ids, pixel_values = self.model.preprocess_inputs(conversation, images)
attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id)
input_ids = input_ids.unsqueeze(0).to(device=self.model.device)
attention_mask = attention_mask.unsqueeze(0).to(device=self.model.device)
pixel_values = [pixel_values.to(dtype=self.visual_tokenizer.dtype, device=self.visual_tokenizer.device)]

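        # Fuse the text embeddings and visual embeddings into a single input sequence.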
_, inputs_embeds, labels, attention_mask = self.model.merge_multimodal(
text_input_ids=input_ids,
text_attention_masks=attention_mask,
text_labels=None,
pixel_values=pixel_values,
left_padding=True
)

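        # Greedy decoding by default; values from the request can override these.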
default_params = dict(
max_new_tokens=1024,
do_sample=False,
top_p=None,
top_k=None,
temperature=None,
repetition_penalty=None,
eos_token_id=self.model.generation_config.eos_token_id,
pad_token_id=self.text_tokenizer.pad_token_id,
use_cache=True,
num_beams=1,
)

params = self.get_generation_params(request, default_params=default_params)

generation_kwargs = dict(
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
**params,
)

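        # Stream the reply, stopping at the first EOS token.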
for new_text in threaded_streaming_generator(generate=self.model.llm.generate, tokenizer=self.text_tokenizer, generation_kwargs=generation_kwargs):
end = new_text.find(self.text_tokenizer.eos_token)
if end == -1:
yield new_text
else:
yield new_text[:end]
break
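
For context, a minimal client sketch that exercises the streaming path above through the server's OpenAI-compatible chat endpoint. The base URL, port, and image URL are illustrative assumptions, not values taken from this commit:

```python
# Sketch: stream a vision chat completion from the server.
# base_url and the image URL are assumptions; adjust for your deployment.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:5006/v1", api_key="skip")

stream = client.chat.completions.create(
    model="AIDC-AI/Ovis1.6-Gemma2-9B",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.jpg"}},
        ],
    }],
    max_tokens=256,
    stream=True,  # served by stream_chat_with_images above
)

for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```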
model_conf_tests.json (1 addition, 0 deletions)
@@ -1,4 +1,5 @@
[
["AIDC-AI/Ovis1.6-Gemma2-9B", "-A", "flash_attention_2"],
["AIDC-AI/Ovis1.5-Gemma2-9B", "-A", "flash_attention_2"],
["AIDC-AI/Ovis1.5-Llama3-8B", "-A", "flash_attention_2"],
["BAAI/Bunny-v1_0-2B-zh", "--load-in-4bit"],
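
Each test entry pairs a model id with the extra command-line arguments used when exercising it. A minimal sketch of consuming the file (purely illustrative; this is not the project's actual test harness):

```python
import json

# Each entry is [model_id, extra_args...], e.g.
# ["AIDC-AI/Ovis1.6-Gemma2-9B", "-A", "flash_attention_2"]
with open("model_conf_tests.json") as f:
    tests = json.load(f)

for model_id, *extra_args in tests:
    print(model_id, extra_args)
```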
(2 additional changed files not shown)
