diff --git a/README.md b/README.md index 427d959..2cfa6fb 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ Can't decide which to use? See the [OpenVLM Leaderboard](https://huggingface.co/ Full list of supported models - [X] [AIDC-AI](https://huggingface.co/AIDC-AI) +- - [X] [Ovis1.6-Gemma2-9B](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-9B) - - [X] [Ovis1.5-Gemma2-9B](https://huggingface.co/AIDC-AI/Ovis1.5-Gemma2-9B) - - [X] [Ovis1.5-Llama3-8B](https://huggingface.co/AIDC-AI/Ovis1.5-Llama3-8B) - [X] [Ai2](https://huggingface.co/allenai) @@ -158,6 +159,10 @@ If you can't find your favorite model, you can [open a new issue](https://github ## Recent updates +Version 0.38.0 + +- new model support: AIDC-AI/Ovis1.6-Gemma2-9B + Version 0.37.0 - new model support: nvidia/NVLM-D-72B diff --git a/backend/ovis16.py b/backend/ovis16.py new file mode 100644 index 0000000..b5cbb58 --- /dev/null +++ b/backend/ovis16.py @@ -0,0 +1,95 @@ +from transformers import AutoModelForCausalLM + +from vision_qna import * + +# AIDC-AI/Ovis1.6-Gemma2-9B + +IMAGE_TOKEN = "<image>" + +class VisionQnA(VisionQnABase): + model_name: str = "generic" + format: str = "custom" + visual_layers: List[str] = ['visual_tokenizer', 'vte'] + + def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): + super().__init__(model_id, device, device_map, extra_params, format) + + self.params['multimodal_max_length'] = 8192 + + self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval() + + self.text_tokenizer = self.model.get_text_tokenizer() + self.visual_tokenizer = self.model.get_visual_tokenizer() + + # bitsandbytes already moves the model to the device, so we don't need to do it again. 
+ if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)): + self.model = self.model.to(self.device) + + self.loaded_banner() + + async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]: + conversation = [] + images = [] + for m in request.messages: + content = '' + for c in m.content: + if c.type == 'image_url': + image = await url_to_image(c.image_url.url) + images.extend([image]) + content = IMAGE_TOKEN + '\n' + content + elif c.type == 'text': + content += c.text + + if content: + if m.role == 'user': + conversation.extend([{'from': 'human', 'value': content }]) + elif m.role == 'assistant': + conversation.extend([{'from': 'gpt', 'value': content }]) + # system is ignored + + if len(images) < 1: + images = [ await url_to_image(black_pixel_url) ] + conversation[0]['value'] = IMAGE_TOKEN + '\n' + conversation[0]['value'] + + _prompt, input_ids, pixel_values = self.model.preprocess_inputs(conversation, images) + attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id) + input_ids = input_ids.unsqueeze(0).to(device=self.model.device) + attention_mask = attention_mask.unsqueeze(0).to(device=self.model.device) + pixel_values = [pixel_values.to(dtype=self.visual_tokenizer.dtype, device=self.visual_tokenizer.device)] + + _, inputs_embeds, labels, attention_mask = self.model.merge_multimodal( + text_input_ids=input_ids, + text_attention_masks=attention_mask, + text_labels=None, + pixel_values=pixel_values, + left_padding=True + ) + + default_params = dict( + max_new_tokens=1024, + do_sample=False, + top_p=None, + top_k=None, + temperature=None, + repetition_penalty=None, + eos_token_id=self.model.generation_config.eos_token_id, + pad_token_id=self.text_tokenizer.pad_token_id, + use_cache=True, + num_beams=1, + ) + + params = self.get_generation_params(request, default_params=default_params) + + generation_kwargs = dict( + inputs_embeds=inputs_embeds, + 
attention_mask=attention_mask, + **params, + ) + + for new_text in threaded_streaming_generator(generate=self.model.llm.generate, tokenizer=self.text_tokenizer, generation_kwargs=generation_kwargs): + end = new_text.find(self.text_tokenizer.eos_token) + if end == -1: + yield new_text + else: + yield new_text[:end] + break diff --git a/model_conf_tests.json b/model_conf_tests.json index c9f499c..d3f8d18 100644 --- a/model_conf_tests.json +++ b/model_conf_tests.json @@ -1,4 +1,5 @@ [ + ["AIDC-AI/Ovis1.6-Gemma2-9B", "-A", "flash_attention_2"], ["AIDC-AI/Ovis1.5-Gemma2-9B", "-A", "flash_attention_2"], ["AIDC-AI/Ovis1.5-Llama3-8B", "-A", "flash_attention_2"], ["BAAI/Bunny-v1_0-2B-zh", "--load-in-4bit"], diff --git a/vision.sample.env b/vision.sample.env index b086afe..4324af5 100644 --- a/vision.sample.env +++ b/vision.sample.env @@ -6,123 +6,124 @@ HF_HUB_ENABLE_HF_TRANSFER=1 #CUDA_VISIBLE_DEVICES=1,0 #OPENEDAI_DEVICE_MAP="sequential" -#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.5-Gemma2-9B -A flash_attention_2" # test pass✅, time: 10.0s, mem: 23.2GB, 13/13 tests passed, (32/2.8s) 11.6 T/s -#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.5-Llama3-8B -A flash_attention_2" # test pass✅, time: 6.0s, mem: 19.2GB, 13/13 tests passed, (32/1.5s) 21.2 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh --load-in-4bit" # test pass✅, time: 7.8s, mem: 9.6GB, 13/13 tests passed, (39/1.7s) 23.0 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh" # test pass✅, time: 5.8s, mem: 10.8GB, 13/13 tests passed, (38/1.1s) 33.4 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B --load-in-4bit" # test pass✅, time: 11.3s, mem: 8.5GB, 13/13 tests passed, (59/3.0s) 19.8 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B" # test pass✅, time: 9.6s, mem: 12.0GB, 13/13 tests passed, (70/2.4s) 29.2 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B-zh" # test pass✅, time: 8.1s, mem: 12.7GB, 13/13 tests passed, (37/1.8s) 20.1 T/s -#CLI_COMMAND="python 
vision.py -m BAAI/Bunny-v1_0-4B --load-in-4bit" # test pass✅, time: 9.6s, mem: 5.1GB, 13/13 tests passed, (48/2.7s) 17.9 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B" # test pass✅, time: 7.4s, mem: 12.2GB, 13/13 tests passed, (44/2.0s) 22.1 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B --load-in-4bit" # test pass✅, time: 10.8s, mem: 5.8GB, 13/13 tests passed, (44/3.0s) 14.6 T/s -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B" # test pass✅, time: 8.7s, mem: 13.0GB, 13/13 tests passed, (35/2.3s) 15.1 T/s -#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --load-in-4bit" # test pass✅, time: 30.4s, mem: 29.2GB, 13/13 tests passed, (72/9.3s) 7.7 T/s -#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 21.9s, mem: 71.6GB, 13/13 tests passed, (77/6.4s) 11.9 T/s -#CLI_COMMAND="python vision.py -m BAAI/Emu3-Chat --load-in-4bit -A flash_attention_2" # test pass✅, time: 63.9s, mem: 65.7GB, 13/13 tests passed, (137/20.6s) 6.6 T/s -#CLI_COMMAND="python vision.py -m BAAI/Emu3-Chat -A flash_attention_2" # test pass✅, time: 68.1s, mem: 76.1GB, 13/13 tests passed, (159/21.9s) 7.3 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 23.5s, mem: 27.2GB, 13/13 tests passed, (60/7.2s) 8.3 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40 --load-in-4bit" # test pass✅, time: 29.8s, mem: 30.6GB, 13/13 tests passed, (58/9.3s) 6.2 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40" # test pass✅, time: 25.4s, mem: 55.5GB, 13/13 tests passed, (45/7.9s) 5.7 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 19.0s, mem: 52.4GB, 13/13 tests passed, (50/5.8s) 8.7 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 32.0s, mem: 
1.7GB, 13/13 tests passed, (271/10.2s) 26.5 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0" # test pass✅, time: 8.3s, mem: 2.4GB, 13/13 tests passed, (77/2.3s) 34.2 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 17.2s, mem: 5.1GB, 13/13 tests passed, (156/5.2s) 30.2 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0" # test pass✅, time: 8.6s, mem: 7.0GB, 13/13 tests passed, (90/2.3s) 38.3 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.9s, mem: 8.9GB, 13/13 tests passed, (43/2.8s) 15.4 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0" # test pass✅, time: 8.0s, mem: 18.6GB, 13/13 tests passed, (43/2.2s) 19.9 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 26.4s, mem: 27.2GB, 13/13 tests passed, (75/8.1s) 9.2 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0" # test pass✅, time: 20.3s, mem: 52.4GB, 13/13 tests passed, (59/6.4s) 9.3 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 38.6s, mem: 31.2GB, 13/13 tests passed, (82/11.8s) 6.9 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0" # test pass✅, time: 47.6s, mem: 76.2GB, 13/13 tests passed, (140/15.2s) 9.2 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-Llama3-76B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 40.6s, mem: 51.1GB, 13/13 tests passed, (40/12.8s) 3.1 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --load-in-4bit" # test pass✅, time: 6.9s, mem: 5.5GB, 13/13 tests passed, (42/1.8s) 23.3 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 
7.7s, mem: 7.8GB, 13/13 tests passed, (42/2.1s) 20.4 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40" # test pass✅, time: 6.8s, mem: 9.7GB, 13/13 tests passed, (48/1.8s) 27.0 T/s -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 5.8s, mem: 7.4GB, 13/13 tests passed, (48/1.5s) 32.0 T/s -#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat --load-in-4bit" # test pass✅, time: 10.0s, mem: 10.9GB, 13/13 tests passed -#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 7.0s, mem: 19.3GB, 13/13 tests passed -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-2B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 13.2s, mem: 6.6GB, 13/13 tests passed, (44/4.0s) 11.1 T/s -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-2B-Instruct -A flash_attention_2" # test pass✅, time: 14.9s, mem: 16.0GB, 13/13 tests passed, (36/4.4s) 8.2 T/s -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-7B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 19.1s, mem: 18.2GB, 13/13 tests passed, (36/5.7s) 6.3 T/s -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-7B-Instruct -A flash_attention_2" # test pass✅, time: 16.2s, mem: 27.3GB, 13/13 tests passed, (31/4.9s) 6.4 T/s -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-72B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 38.7s, mem: 45.0GB, 13/13 tests passed, (31/12.1s) 2.6 T/s -#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-dpo-r-v1.5" # test pass✅, time: 9.4s, mem: 9.1GB, 13/13 tests passed, (68/2.5s) 26.8 T/s -#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5" # test pass✅, time: 3.9s, mem: 9.1GB, 13/13 tests passed, (10/0.8s) 12.7 T/s -#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-singleimg-r-v1.5" # test pass✅, time: 9.7s, mem: 9.1GB, 13/13 tests passed, (73/2.6s) 28.1 T/s -#CLI_COMMAND="python vision.py -m 
Salesforce/xgen-mm-phi3-mini-instruct-r-v1" # test pass✅, time: 9.8s, mem: 9.6GB, 13/13 tests passed, (74/2.7s) 27.7 T/s -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.0s, mem: 11.2GB, 13/13 tests passed, (14/2.8s) 4.9 T/s -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 9.3s, mem: 20.5GB, 13/13 tests passed, (22/2.7s) 8.0 T/s -#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 23.3s, mem: 15.7GB, 13/13 tests passed, (92/7.1s) 12.9 T/s -#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 17.4s, mem: 25.7GB, 13/13 tests passed, (79/5.3s) 14.8 T/s -#CLI_COMMAND="python vision.py -m allenai/MolmoE-1B-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 127.4s, mem: 5.8GB, 13/13 tests passed, (104/41.8s) 2.5 T/s -#CLI_COMMAND="python vision.py -m allenai/MolmoE-1B-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 92.6s, mem: 6.1GB, 13/13 tests passed, (84/30.1s) 2.8 T/s -#CLI_COMMAND="python vision.py -m allenai/MolmoE-1B-0924 -A flash_attention_2" # test pass✅, time: 26.8s, mem: 15.0GB, 13/13 tests passed, (40/8.3s) 4.8 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-D-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 48.3s, mem: 7.6GB, 13/13 tests passed, (318/15.3s) 20.7 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-D-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 41.1s, mem: 7.9GB, 13/13 tests passed, (310/13.3s) 23.3 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-D-0924 -A flash_attention_2" # test pass✅, time: 31.6s, mem: 17.8GB, 13/13 tests passed, (302/10.1s) 29.9 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-O-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 41.9s, mem: 8.2GB, 13/13 tests passed, 
(214/13.3s) 16.1 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-O-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 37.2s, mem: 8.5GB, 13/13 tests passed, (214/11.7s) 18.2 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-O-0924 -A flash_attention_2" # test pass✅, time: 29.5s, mem: 18.1GB, 13/13 tests passed, (208/9.5s) 21.9 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-72B-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 130.7s, mem: 42.9GB, 13/13 tests passed, (285/42.4s) 6.7 T/s -#CLI_COMMAND="python vision.py -m allenai/Molmo-72B-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 111.7s, mem: 47.9GB, 13/13 tests passed, (271/36.5s) 7.4 T/s -#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat --load-in-4bit" # test pass✅, time: 15.4s, mem: 15.5GB, 13/13 tests passed, (49/4.6s) 10.6 T/s -#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 11.8s, mem: 21.5GB, 13/13 tests passed, (32/3.5s) 9.3 T/s -#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha -A flash_attention_2 --load-in-4bit" # test pass✅, time: 11.3s, mem: 7.2GB, 13/13 tests passed, (37/3.2s) 11.4 T/s -#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha -A flash_attention_2" # test pass✅, time: 9.8s, mem: 12.1GB, 13/13 tests passed, (37/2.6s) 14.0 T/s -#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-alpha-two --load-in-4bit -A flash_attention_2" # test pass✅, time: 51.4s, mem: 8.9GB, 13/13 tests passed, (157/14.3s) 11.0 T/s -#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-alpha-two -A flash_attention_2" # test pass✅, time: 29.8s, mem: 18.4GB, 13/13 tests passed, (144/8.6s) 16.7 T/s -#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-pre-alpha --load-in-4bit -A flash_attention_2" # test pass✅, time: 122.1s, mem: 8.2GB, 13/13 tests passed, (710/43.2s) 16.4 T/s -#CLI_COMMAND="python vision.py -m 
fancyfeast/joy-caption-pre-alpha -A flash_attention_2" # test pass✅, time: 65.1s, mem: 17.8GB, 13/13 tests passed, (680/20.5s) 33.2 T/s -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2d5-7b -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test fail❌, time: 3.1s, mem: 8.7GB, 1/13 tests passed -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2d5-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 20.4s, mem: 29.7GB, 13/13 tests passed, (33/6.6s) 5.0 T/s -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 22.4s, mem: 21.8GB, 13/13 tests passed, (49/7.3s) 6.7 T/s -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 27.9s, mem: 18.1GB, 13/13 tests passed, (61/8.3s) 7.4 T/s -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit -A flash_attention_2" # test pass✅, time: 64.2s, mem: 8.5GB, 13/13 tests passed, (84/23.4s) 3.6 T/s -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 8.5s, mem: 7.5GB, 13/13 tests passed, (23/2.3s) 10.0 T/s -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 23.4s, mem: 20.5GB, 13/13 tests passed, (54/7.8s) 6.9 T/s -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit -A flash_attention_2" # test pass✅, time: 60.6s, mem: 10.3GB, 13/13 tests passed, (61/16.0s) 3.8 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 15.7s, mem: 9.0GB, 13/13 tests passed, (58/4.6s) 12.7 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 11.3s, mem: 26.2GB, 13/13 tests passed, (59/3.2s) 18.7 T/s 
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 13.5s, mem: 5.3GB, 13/13 tests passed, (62/4.0s) 15.6 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 10.1s, mem: 14.0GB, 13/13 tests passed, (65/2.8s) 23.4 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 72.2s, mem: 22.0GB, 13/13 tests passed, (184/23.3s) 7.9 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf -A flash_attention_2" # test pass✅, time: 88.3s, mem: 67.1GB, 13/13 tests passed, (246/28.7s) 8.6 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 20.1s, mem: 12.2GB, 13/13 tests passed, (55/6.1s) 9.0 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf -A flash_attention_2" # test pass✅, time: 17.9s, mem: 29.2GB, 13/13 tests passed, (55/5.4s) 10.2 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 22.7s, mem: 7.6GB, 13/13 tests passed, (88/6.9s) 12.7 T/s -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf -A flash_attention_2" # test pass✅, time: 16.8s, mem: 16.4GB, 13/13 tests passed, (82/5.1s) 16.2 T/s -#CLI_COMMAND="python vision.py -m lmms-lab/llava-onevision-qwen2-0.5b-ov -A flash_attention_2" # test pass✅, time: 9.0s, mem: 23.3GB, 13/13 tests passed, (37/2.4s) 15.4 T/s -#CLI_COMMAND="python vision.py -m lmms-lab/llava-onevision-qwen2-7b-ov -A flash_attention_2" # test pass✅, time: 18.9s, mem: 36.4GB, 13/13 tests passed, (51/5.8s) 8.9 T/s -#CLI_COMMAND="python vision.py -m meta-llama/Llama-3.2-11B-Vision-Instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 40.4s, mem: 8.6GB, 13/13 tests passed, (73/5.5s) 13.2 T/s -#CLI_COMMAND="python vision.py -m 
meta-llama/Llama-3.2-11B-Vision-Instruct -A flash_attention_2" # test pass✅, time: 25.1s, mem: 22.3GB, 13/13 tests passed, (67/4.0s) 16.8 T/s -#CLI_COMMAND="python vision.py -m meta-llama/Llama-3.2-90B-Vision-Instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 70.3s, mem: 50.7GB, 13/13 tests passed, (95/18.4s) 5.2 T/s -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.6s, mem: 0.7GB, 13/13 tests passed -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 2.9s, mem: 1.0GB, 13/13 tests passed -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 4.2s, mem: 1.3GB, 13/13 tests passed -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 2.5s, mem: 2.1GB, 13/13 tests passed -#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 13.6s, mem: 7.2GB, 13/13 tests passed, (51/3.9s) 13.1 T/s -#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct -A flash_attention_2" # test pass✅, time: 9.6s, mem: 12.1GB, 13/13 tests passed, (37/2.6s) 14.0 T/s -#CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 9.6s, mem: 4.4GB, 13/13 tests passed, (37/2.6s) 14.1 T/s -#CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2" # test pass✅, time: 8.6s, mem: 9.3GB, 13/13 tests passed, (41/2.4s) 17.4 T/s -#CLI_COMMAND="python vision.py -m mistralai/Pixtral-12B-2409" # test pass✅, time: 16.0s, mem: 35.5GB, 13/13 tests passed (manual calc) 12.7 T/s -#CLI_COMMAND="python vision.py -m mx262/MiniMonkey -A flash_attention_2 --load-in-4bit" # test pass✅, time: 11.1s, mem: 
13.9GB, 13/13 tests passed, (37/3.1s) 11.7 T/s -#CLI_COMMAND="python vision.py -m mx262/MiniMonkey -A flash_attention_2" # test pass✅, time: 10.0s, mem: 16.3GB, 13/13 tests passed, (37/2.8s) 13.0 T/s -#CLI_COMMAND="python vision.py -m nvidia/NVLM-D-72B -A flash_attention_2 --load-in-4bit" # test pass✅, time: 62.0s, mem: 56.7GB, 13/13 tests passed, (66/19.7s) 3.3 T/s -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6-int4 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 19.0s, mem: 9.2GB, 13/13 tests passed, (93/5.2s) 18.0 T/s -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 15.8s, mem: 9.5GB, 13/13 tests passed, (99/4.4s) 22.5 T/s -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 13.3s, mem: 18.8GB, 13/13 tests passed, (101/3.4s) 30.1 T/s -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 28.2s, mem: 8.9GB, 13/13 tests passed, (76/8.8s) 8.6 T/s -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 25.7s, mem: 18.9GB, 13/13 tests passed, (75/7.7s) 9.8 T/s -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.0s, mem: 7.9GB, 13/13 tests passed, (49/2.2s) 22.6 T/s -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 7.1s, mem: 8.2GB, 13/13 tests passed, (52/1.5s) 35.4 T/s -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.9s, mem: 7.9GB, 13/13 tests passed, (44/1.9s) 22.9 T/s -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 9.0s, mem: 8.2GB, 
13/13 tests passed, (64/1.8s) 35.7 T/s -#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0 --load-in-4bit" # test pass✅, time: 13.9s, mem: 6.5GB, 13/13 tests passed, (80/4.3s) 18.5 T/s -#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 9.1s, mem: 16.4GB, 13/13 tests passed, (82/2.8s) 29.8 T/s -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1 --load-in-4bit" # test pass✅, time: 11.3s, mem: 7.8GB, 13/13 tests passed, (43/3.2s) 13.4 T/s -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1" # test pass✅, time: 9.7s, mem: 18.2GB, 13/13 tests passed, (51/2.6s) 19.3 T/s -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1 --load-in-4bit" # test pass✅, time: 14.0s, mem: 7.8GB, 13/13 tests passed, (61/4.1s) 15.1 T/s -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1" # test pass✅, time: 13.1s, mem: 18.2GB, 13/13 tests passed, (95/3.8s) 24.7 T/s -#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 6.5s, mem: 2.4GB, 13/13 tests passed, (63/1.8s) 34.6 T/s -#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 -A flash_attention_2" # test pass✅, time: 4.9s, mem: 4.3GB, 13/13 tests passed, (63/1.2s) 50.4 T/s +#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.6-Gemma2-9B -A flash_attention_2" # test pass✅, time: 32.8s, mem: 22.8GB, 13/13 tests passed, (133/10.4s) 12.8 T/s +#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.5-Gemma2-9B -A flash_attention_2" # test pass✅, time: 9.4s, mem: 23.2GB, 13/13 tests passed, (32/2.6s) 12.4 T/s +#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.5-Llama3-8B -A flash_attention_2" # test pass✅, time: 5.9s, mem: 19.2GB, 13/13 tests passed, (32/1.4s) 22.1 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh --load-in-4bit" # test pass✅, time: 8.5s, mem: 9.5GB, 13/13 tests passed, (39/1.6s) 24.4 T/s 
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh" # test pass✅, time: 5.5s, mem: 10.8GB, 13/13 tests passed, (38/1.1s) 34.8 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B --load-in-4bit" # test pass✅, time: 10.7s, mem: 8.5GB, 13/13 tests passed, (59/2.8s) 21.1 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B" # test pass✅, time: 9.1s, mem: 12.0GB, 13/13 tests passed, (70/2.2s) 32.0 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B-zh" # test pass✅, time: 7.9s, mem: 12.7GB, 13/13 tests passed, (37/1.8s) 20.9 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B --load-in-4bit" # test pass✅, time: 9.2s, mem: 5.1GB, 13/13 tests passed, (48/2.6s) 18.7 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B" # test pass✅, time: 7.3s, mem: 12.2GB, 13/13 tests passed, (44/1.8s) 23.9 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B --load-in-4bit" # test pass✅, time: 10.7s, mem: 5.8GB, 13/13 tests passed, (44/3.1s) 14.3 T/s +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B" # test pass✅, time: 8.4s, mem: 13.0GB, 13/13 tests passed, (35/2.3s) 15.4 T/s +#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --load-in-4bit" # test pass✅, time: 32.2s, mem: 29.2GB, 13/13 tests passed, (79/9.7s) 8.1 T/s +#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 20.8s, mem: 71.8GB, 13/13 tests passed, (71/5.8s) 12.2 T/s +#CLI_COMMAND="python vision.py -m BAAI/Emu3-Chat --load-in-4bit -A flash_attention_2" # test pass✅, time: 63.4s, mem: 65.8GB, 13/13 tests passed, (137/20.5s) 6.7 T/s +#CLI_COMMAND="python vision.py -m BAAI/Emu3-Chat -A flash_attention_2" # test pass✅, time: 67.8s, mem: 76.2GB, 13/13 tests passed, (159/21.8s) 7.3 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 23.7s, mem: 27.6GB, 13/13 tests passed, (60/7.3s) 8.2 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 
--max-tiles 40 --load-in-4bit" # test pass✅, time: 30.1s, mem: 31.0GB, 13/13 tests passed, (58/9.4s) 6.1 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40" # test pass✅, time: 25.5s, mem: 55.9GB, 13/13 tests passed, (45/7.9s) 5.7 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 19.1s, mem: 52.7GB, 13/13 tests passed, (50/5.9s) 8.5 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 31.8s, mem: 2.0GB, 13/13 tests passed, (271/10.2s) 26.5 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0" # test pass✅, time: 7.6s, mem: 2.7GB, 13/13 tests passed, (77/2.2s) 34.5 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 17.4s, mem: 5.5GB, 13/13 tests passed, (156/5.4s) 28.8 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0" # test pass✅, time: 8.3s, mem: 7.4GB, 13/13 tests passed, (90/2.4s) 36.7 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.1s, mem: 9.3GB, 13/13 tests passed, (43/2.9s) 14.8 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0" # test pass✅, time: 8.0s, mem: 19.1GB, 13/13 tests passed, (43/2.2s) 19.2 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 26.7s, mem: 27.7GB, 13/13 tests passed, (75/8.3s) 9.0 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0" # test pass✅, time: 20.0s, mem: 52.8GB, 13/13 tests passed, (59/6.3s) 9.4 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 39.0s, mem: 31.7GB, 13/13 tests passed, (82/12.3s) 6.7 T/s +#CLI_COMMAND="python vision.py -m 
OpenGVLab/InternVL2-40B --device-map cuda:0" # test pass✅, time: 47.5s, mem: 76.8GB, 13/13 tests passed, (140/15.3s) 9.1 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-Llama3-76B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 40.8s, mem: 51.5GB, 13/13 tests passed, (40/12.8s) 3.1 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --load-in-4bit" # test pass✅, time: 6.9s, mem: 6.0GB, 13/13 tests passed, (42/1.8s) 23.0 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 7.8s, mem: 8.2GB, 13/13 tests passed, (42/2.1s) 19.8 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40" # test pass✅, time: 6.8s, mem: 10.1GB, 13/13 tests passed, (48/1.8s) 26.5 T/s +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 6.1s, mem: 7.8GB, 13/13 tests passed, (48/1.6s) 30.7 T/s +#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat --load-in-4bit" # test pass✅, time: 10.2s, mem: 11.3GB, 13/13 tests passed +#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 6.9s, mem: 19.6GB, 13/13 tests passed +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-2B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 12.9s, mem: 7.0GB, 13/13 tests passed, (44/3.8s) 11.6 T/s +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-2B-Instruct -A flash_attention_2" # test pass✅, time: 18.5s, mem: 16.4GB, 13/13 tests passed, (36/5.6s) 6.4 T/s +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-7B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 22.5s, mem: 18.5GB, 13/13 tests passed, (36/6.7s) 5.4 T/s +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-7B-Instruct -A flash_attention_2" # test pass✅, time: 16.6s, mem: 27.5GB, 13/13 tests passed, (31/4.9s) 6.4 T/s +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-72B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 36.1s, mem: 45.2GB, 13/13 tests 
passed, (31/11.4s) 2.7 T/s +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-dpo-r-v1.5" # test pass✅, time: 9.0s, mem: 9.3GB, 13/13 tests passed, (68/2.4s) 28.2 T/s +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5" # test pass✅, time: 3.8s, mem: 9.3GB, 13/13 tests passed, (10/0.8s) 13.1 T/s +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-singleimg-r-v1.5" # test pass✅, time: 9.1s, mem: 9.3GB, 13/13 tests passed, (73/2.4s) 29.9 T/s +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-r-v1" # test pass✅, time: 9.5s, mem: 9.8GB, 13/13 tests passed, (74/2.6s) 28.0 T/s +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.7s, mem: 11.4GB, 13/13 tests passed, (14/2.8s) 5.0 T/s +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 8.9s, mem: 20.7GB, 13/13 tests passed, (22/2.7s) 8.3 T/s +#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 23.2s, mem: 15.8GB, 13/13 tests passed, (92/7.4s) 12.4 T/s +#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 17.3s, mem: 26.0GB, 13/13 tests passed, (79/5.3s) 14.8 T/s +#CLI_COMMAND="python vision.py -m allenai/MolmoE-1B-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 124.4s, mem: 6.1GB, 13/13 tests passed, (104/40.4s) 2.6 T/s +#CLI_COMMAND="python vision.py -m allenai/MolmoE-1B-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 95.7s, mem: 6.5GB, 13/13 tests passed, (84/31.6s) 2.7 T/s +#CLI_COMMAND="python vision.py -m allenai/MolmoE-1B-0924 -A flash_attention_2" # test pass✅, time: 26.6s, mem: 15.4GB, 13/13 tests passed, (40/8.4s) 4.8 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-D-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 49.6s, mem: 
8.1GB, 13/13 tests passed, (318/16.0s) 19.9 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-D-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 40.6s, mem: 8.4GB, 13/13 tests passed, (310/13.1s) 23.6 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-D-0924 -A flash_attention_2" # test pass✅, time: 31.9s, mem: 18.3GB, 13/13 tests passed, (302/10.3s) 29.5 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-O-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 43.7s, mem: 8.7GB, 13/13 tests passed, (214/14.0s) 15.3 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-O-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 36.8s, mem: 8.9GB, 13/13 tests passed, (214/11.8s) 18.1 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-7B-O-0924 -A flash_attention_2" # test pass✅, time: 29.7s, mem: 18.6GB, 13/13 tests passed, (208/9.5s) 21.9 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-72B-0924 -A flash_attention_2 --load-in-4bit --use-double-quant" # test pass✅, time: 130.7s, mem: 43.4GB, 13/13 tests passed, (285/42.8s) 6.7 T/s +#CLI_COMMAND="python vision.py -m allenai/Molmo-72B-0924 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 111.3s, mem: 48.2GB, 13/13 tests passed, (271/36.1s) 7.5 T/s +#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat --load-in-4bit" # test pass✅, time: 15.4s, mem: 15.8GB, 13/13 tests passed, (49/4.5s) 10.8 T/s +#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 11.7s, mem: 21.8GB, 13/13 tests passed, (32/3.4s) 9.4 T/s +#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha -A flash_attention_2 --load-in-4bit" # test pass✅, time: 11.1s, mem: 7.5GB, 13/13 tests passed, (37/3.1s) 11.9 T/s +#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha -A flash_attention_2" # test pass✅, time: 9.4s, mem: 12.5GB, 13/13 tests passed, (37/2.6s) 14.2 T/s +#CLI_COMMAND="python vision.py -m 
fancyfeast/joy-caption-alpha-two --load-in-4bit -A flash_attention_2" # test pass✅, time: 49.3s, mem: 9.3GB, 13/13 tests passed, (209/18.6s) 11.2 T/s +#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-alpha-two -A flash_attention_2" # test pass✅, time: 38.1s, mem: 18.8GB, 13/13 tests passed, (166/9.3s) 17.9 T/s +#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-pre-alpha --load-in-4bit -A flash_attention_2" # test pass✅, time: 116.5s, mem: 8.4GB, 13/13 tests passed, (658/37.8s) 17.4 T/s +#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-pre-alpha -A flash_attention_2" # test pass✅, time: 63.1s, mem: 18.1GB, 13/13 tests passed, (644/19.2s) 33.5 T/s +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2d5-7b -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test fail❌, time: 3.1s, mem: 9.0GB, 1/13 tests passed +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2d5-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 25.9s, mem: 30.0GB, 13/13 tests passed, (27/5.9s) 4.6 T/s +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 23.7s, mem: 22.1GB, 13/13 tests passed, (40/6.3s) 6.3 T/s +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 26.2s, mem: 18.4GB, 13/13 tests passed, (60/8.1s) 7.4 T/s +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit -A flash_attention_2" # test pass✅, time: 64.5s, mem: 8.8GB, 13/13 tests passed, (70/19.0s) 3.7 T/s +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 7.8s, mem: 7.7GB, 13/13 tests passed, (17/1.8s) 9.5 T/s +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 26.6s, mem: 20.8GB, 13/13 tests passed, (61/8.7s) 7.0 T/s 
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit -A flash_attention_2" # test pass✅, time: 69.0s, mem: 10.6GB, 13/13 tests passed, (85/23.0s) 3.7 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 15.2s, mem: 9.3GB, 13/13 tests passed, (58/4.5s) 13.0 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 10.8s, mem: 26.4GB, 13/13 tests passed, (59/3.0s) 19.3 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.9s, mem: 5.4GB, 13/13 tests passed, (62/3.8s) 16.5 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 9.2s, mem: 14.2GB, 13/13 tests passed, (65/2.8s) 23.6 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 70.6s, mem: 22.1GB, 13/13 tests passed, (184/23.0s) 8.0 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf -A flash_attention_2" # test pass✅, time: 88.1s, mem: 67.2GB, 13/13 tests passed, (246/28.7s) 8.6 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 19.7s, mem: 12.3GB, 13/13 tests passed, (55/6.0s) 9.2 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf -A flash_attention_2" # test pass✅, time: 17.8s, mem: 29.4GB, 13/13 tests passed, (55/5.4s) 10.3 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 22.2s, mem: 7.7GB, 13/13 tests passed, (88/6.9s) 12.7 T/s +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf -A flash_attention_2" # test pass✅, time: 16.4s, mem: 16.5GB, 13/13 tests passed, (82/4.9s) 16.7 T/s +#CLI_COMMAND="python 
vision.py -m lmms-lab/llava-onevision-qwen2-0.5b-ov -A flash_attention_2" # test pass✅, time: 8.8s, mem: 23.4GB, 13/13 tests passed, (37/2.4s) 15.7 T/s +#CLI_COMMAND="python vision.py -m lmms-lab/llava-onevision-qwen2-7b-ov -A flash_attention_2" # test pass✅, time: 18.8s, mem: 36.5GB, 13/13 tests passed, (51/5.7s) 8.9 T/s +#CLI_COMMAND="python vision.py -m meta-llama/Llama-3.2-11B-Vision-Instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 38.0s, mem: 8.7GB, 13/13 tests passed, (181/11.3s) 16.0 T/s +#CLI_COMMAND="python vision.py -m meta-llama/Llama-3.2-11B-Vision-Instruct -A flash_attention_2" # test pass✅, time: 34.7s, mem: 22.4GB, 13/13 tests passed, (129/6.8s) 18.9 T/s +#CLI_COMMAND="python vision.py -m meta-llama/Llama-3.2-90B-Vision-Instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 123.9s, mem: 50.9GB, 13/13 tests passed, (347/59.3s) 5.9 T/s +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.3s, mem: 1.0GB, 13/13 tests passed +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 2.4s, mem: 1.3GB, 13/13 tests passed +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.6s, mem: 1.5GB, 13/13 tests passed +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 2.5s, mem: 2.4GB, 13/13 tests passed +#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 13.0s, mem: 7.4GB, 13/13 tests passed, (51/3.8s) 13.5 T/s +#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct -A flash_attention_2" # test pass✅, time: 9.4s, mem: 12.4GB, 13/13 tests passed, (37/2.6s) 14.3 T/s +#CLI_COMMAND="python vision.py -m 
microsoft/Phi-3.5-vision-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 9.4s, mem: 4.7GB, 13/13 tests passed, (37/2.6s) 14.2 T/s +#CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2" # test pass✅, time: 8.1s, mem: 9.6GB, 13/13 tests passed, (41/2.2s) 18.6 T/s +#CLI_COMMAND="python vision.py -m mistralai/Pixtral-12B-2409" # test pass✅, time: 16.8s, mem: 35.8GB, 13/13 tests passed, (manual calc) 12.7 T/s +#CLI_COMMAND="python vision.py -m mx262/MiniMonkey -A flash_attention_2 --load-in-4bit" # test pass✅, time: 11.2s, mem: 14.3GB, 13/13 tests passed, (37/3.2s) 11.5 T/s +#CLI_COMMAND="python vision.py -m mx262/MiniMonkey -A flash_attention_2" # test pass✅, time: 10.1s, mem: 16.6GB, 13/13 tests passed, (37/2.9s) 12.9 T/s +#CLI_COMMAND="python vision.py -m nvidia/NVLM-D-72B -A flash_attention_2 --load-in-4bit" # test pass✅, time: 63.4s, mem: 56.8GB, 13/13 tests passed, (67/20.3s) 3.3 T/s +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6-int4 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 19.8s, mem: 9.5GB, 13/13 tests passed, (96/5.4s) 17.8 T/s +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 15.6s, mem: 9.8GB, 13/13 tests passed, (104/4.7s) 22.1 T/s +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 13.2s, mem: 19.2GB, 13/13 tests passed, (97/3.4s) 28.8 T/s +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 30.6s, mem: 9.3GB, 13/13 tests passed, (80/9.4s) 8.5 T/s +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 27.6s, mem: 19.3GB, 13/13 tests passed, (75/8.6s) 8.7 T/s +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # 
test pass✅, time: 11.0s, mem: 8.2GB, 13/13 tests passed, (69/2.7s) 25.7 T/s +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 6.7s, mem: 8.6GB, 13/13 tests passed, (52/1.4s) 36.0 T/s +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.6s, mem: 8.2GB, 13/13 tests passed, (37/1.7s) 22.1 T/s +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 7.2s, mem: 8.7GB, 13/13 tests passed, (63/1.8s) 34.5 T/s +#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0 --load-in-4bit" # test pass✅, time: 13.5s, mem: 6.8GB, 13/13 tests passed, (81/4.2s) 19.4 T/s +#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 8.6s, mem: 16.7GB, 13/13 tests passed, (69/2.3s) 29.6 T/s +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1 --load-in-4bit" # test pass✅, time: 11.4s, mem: 8.1GB, 13/13 tests passed, (43/3.2s) 13.4 T/s +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1" # test pass✅, time: 9.5s, mem: 18.6GB, 13/13 tests passed, (51/2.6s) 19.8 T/s +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1 --load-in-4bit" # test pass✅, time: 13.8s, mem: 8.1GB, 13/13 tests passed, (61/4.1s) 15.0 T/s +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1" # test pass✅, time: 12.9s, mem: 18.5GB, 13/13 tests passed, (95/3.8s) 24.7 T/s +#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 6.0s, mem: 2.7GB, 13/13 tests passed, (63/1.7s) 36.4 T/s +#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 -A flash_attention_2" # test pass✅, time: 4.8s, mem: 4.6GB, 13/13 tests passed, (63/1.2s) 51.2 T/s diff --git a/vision_qna.py b/vision_qna.py index 
9b64891..af1ab34 100644 --- a/vision_qna.py +++ b/vision_qna.py @@ -901,6 +901,8 @@ def guess_backend(model_name: str) -> str: return 'minigemini' if 'ovis' in model_id: + if '1.6' in model_id: + return 'ovis16' return 'ovis' if 'deepseek' in model_id: