diff --git a/README.md b/README.md
index 5e50a7a..deb8cf8 100644
--- a/README.md
+++ b/README.md
@@ -10,12 +10,13 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview`
## Model support
- [X] [OpenGVLab](https://huggingface.co/OpenGVLab)
+- - [X] [InternVL2-Llama3-76B](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B)
- - [X] [InternVL2-40B](https://huggingface.co/OpenGVLab/InternVL2-40B)
- - [X] [InternVL2-26B](https://huggingface.co/OpenGVLab/InternVL2-26B)
- - [X] [InternVL2-8B](https://huggingface.co/OpenGVLab/InternVL2-8B)
- - [X] [InternVL2-4B](https://huggingface.co/OpenGVLab/InternVL2-4B) (alternate docker only)
- - [X] [InternVL2-2B](https://huggingface.co/OpenGVLab/InternVL2-2B)
-- - [X] [InternVL2-2B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-2B-AWQ)
+- - [X] [InternVL2-2B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-2B-AWQ) (currently errors)
- - [X] [InternVL2-1B](https://huggingface.co/OpenGVLab/InternVL2-1B)
- - [X] [InternVL-Chat-V1-5](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5) (won't gpu split yet)
- - [X] [InternVL-Chat-V1-5-Int8](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5-Int8) (won't gpu split yet)
@@ -100,7 +101,9 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview`
- - [X] [MGM-34B-HD](https://huggingface.co/YanweiLi/MGM-34B-HD) (alternate docker only)
- - [X] [MGM-8x7B-HD](https://huggingface.co/YanweiLi/MGM-8x7B-HD) (alternate docker only)
- [X] [cognitivecomputations/dolphin-vision-72b](https://huggingface.co/cognitivecomputations/dolphin-vision-72b)
-- [X] [qnguyen3/nanoLLaVA](https://huggingface.co/qnguyen3/nanoLLaVA) (wont gpu split)
+- [X] [qnguyen3](https://huggingface.co/qnguyen3)
+- - [X] [nanoLLaVA](https://huggingface.co/qnguyen3/nanoLLaVA) (won't gpu split)
+- - [X] [nanoLLaVA-1.5](https://huggingface.co/qnguyen3/nanoLLaVA-1.5) (won't gpu split)
- [ ] [01-ai/Yi-VL](https://huggingface.co/01-ai)
- - [ ] [Yi-VL-6B](https://huggingface.co/01-ai/Yi-VL-6B) (currently errors)
- - [ ] [Yi-VL-34B](https://huggingface.co/01-ai/Yi-VL-34B) (currently errors)
@@ -113,9 +116,15 @@ See: [OpenVLM Leaderboard](https://huggingface.co/spaces/opencompass/open_vlm_le
## Recent updates
+Version 0.27.1
+
+- new model support: qnguyen3/nanoLLaVA-1.5
+- Complete support for chat *without* images (a 1x1 transparent or 8x8 black placeholder image is substituted where the model requires one; see the example below)
+- Require transformers==4.41.2 (4.42 breaks many models)
+
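+For example, a text-only request now works without attaching an image yourself. A minimal sketch (assuming the server is reachable at `http://localhost:5006/v1`; adjust the base URL and model name to your deployment):
+
+```python
+from openai import OpenAI
+
+# api_key: supply whatever your deployment expects (the client library requires a value)
+client = OpenAI(base_url="http://localhost:5006/v1", api_key="skip")
+
+# no image_url content part at all -- as of 0.27.1 the server substitutes a
+# placeholder image internally when the loaded model requires one
+response = client.chat.completions.create(
+    model="gpt-4-vision-preview",
+    messages=[{"role": "user", "content": "Write a one line haiku about vision."}],
+)
+print(response.choices[0].message.content)
+```
+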
Version 0.27.0
-- new model support: OpenGVLab/InternVL2 series of models (1B, 2B, 4B, 8B*, 26B*, 40B*) - *(current top open source models)
+- new model support: OpenGVLab/InternVL2 series of models (1B, 2B, 4B, 8B*, 26B*, 40B*, 76B*) - *(current top open source models)
Version 0.26.0
@@ -242,9 +251,9 @@ cp vision-alt.sample.env vision-alt.env
2) You can run the server via docker compose like so:
```shell
-# for OpenedAI Vision Server (transformers>=4.39.0)
+# for OpenedAI Vision Server
docker compose up
-# for OpenedAI Vision Server (alternate, for Mini-Gemini > 2B, used transformers==4.36.2)
+# for OpenedAI Vision Server (alternate, for Mini-Gemini > 2B, uses transformers==4.36.2)
docker compose -f docker-compose.alt.yml up
```
@@ -262,7 +271,7 @@ docker compose -f docker-compose.alt.yml pull
```shell
# install the python dependencies
-pip install -U -r requirements.txt "transformers>=4.41.2" "autoawq>=0.2.5"
+pip install -U -r requirements.txt "transformers==4.41.2" "autoawq>=0.2.5"
# OR install the python dependencies for the alt version
pip install -U -r requirements.txt "transformers==4.36.2"
# run the server with your chosen model
diff --git a/backend/florence.py b/backend/florence.py
index 9df169c..a084b30 100644
--- a/backend/florence.py
+++ b/backend/florence.py
@@ -40,6 +40,9 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p
async def chat_with_images(self, request: ImageChatRequest) -> str:
images, prompt = await prompt_from_messages(request.messages, self.format)
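+ # no image in the request: substitute the black placeholder so the processor call below has an image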
+ if len(images) < 1:
+ images = [ await url_to_image(black_pixel_url) ]
+
inputs = self.processor(text=prompt, images=images[0], return_tensors="pt").to(device=self.model.device, dtype=self.model.dtype)
default_params = {
diff --git a/backend/internvl-chat-v1-5.py b/backend/internvl-chat-v1-5.py
index 26cdb4d..384ec01 100644
--- a/backend/internvl-chat-v1-5.py
+++ b/backend/internvl-chat-v1-5.py
@@ -18,7 +18,7 @@
# OpenGVLab/InternVL2-8B
# OpenGVLab/InternVL2-26B
# OpenGVLab/InternVL2-40B (yi-34b nous-hermes-2)
-
+# OpenGVLab/InternVL2-Llama3-76B
MAX_TILES = 6
@@ -143,7 +143,7 @@ async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGener
for img in images:
image_tokens = '<img>' + '<IMG_CONTEXT>' * self.model.num_image_token * img.size(0) + '</img>'
prompt = prompt.replace('<image>', image_tokens, 1)
-
+
model_inputs = self.tokenizer(prompt, return_tensors='pt')
input_ids = model_inputs['input_ids'].cuda()
attention_mask = model_inputs['attention_mask'].cuda()
diff --git a/backend/llama3vision.py b/backend/llama3vision.py
index 5808beb..6859ddb 100644
--- a/backend/llama3vision.py
+++ b/backend/llama3vision.py
@@ -29,6 +29,10 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p
async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
images, prompt = await prompt_from_messages(request.messages, self.format)
+ if len(images) < 1:
+ images = [ await url_to_image(black_pixel_url) ]
+ prompt = '<image>\n' + prompt
+
input_ids = self.model.tokenizer_image_token(prompt, self.tokenizer, -200, return_tensors="pt").unsqueeze(0).to(self.device)
image_inputs = self.model.processor(
images=images,
diff --git a/backend/llava.py b/backend/llava.py
index e909ed3..16b179b 100644
--- a/backend/llava.py
+++ b/backend/llava.py
@@ -29,6 +29,10 @@ async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGener
images, prompt = await prompt_from_messages(request.messages, self.format)
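+ # no image in the request: use a black placeholder and prepend an <image> slot so the processor can bind it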
+ if len(images) < 1:
+ images = [ await url_to_image(black_pixel_url) ]
+ prompt = "<image>\n" + prompt
+
inputs = self.processor(prompt, images, return_tensors="pt").to(self.device)
params = self.get_generation_params(request)
diff --git a/backend/llavanext.py b/backend/llavanext.py
index 5fed1c7..b05c031 100644
--- a/backend/llavanext.py
+++ b/backend/llavanext.py
@@ -31,6 +31,10 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p
async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
images, prompt = await prompt_from_messages(request.messages, self.format)
+ if len(images) < 1:
+ images = [ await url_to_image(black_pixel_url) ]
+ prompt = "<image>\n" + prompt
+
inputs = self.processor(prompt, images, return_tensors="pt").to(self.model.device)
default_params = dict(
diff --git a/backend/minicpm.py b/backend/minicpm.py
index 2605069..dc26efb 100644
--- a/backend/minicpm.py
+++ b/backend/minicpm.py
@@ -40,7 +40,7 @@ async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGener
else:
msgs.extend([{ 'role': m.role, 'content': c.text }])
- if not image:
+ if image is None:
image = await url_to_image(transparent_pixel_url)
# default uses num_beams: 3, but if streaming/sampling is requested, switch the defaults.
diff --git a/backend/minigemini.py b/backend/minigemini.py
index c48f85a..46a741b 100644
--- a/backend/minigemini.py
+++ b/backend/minigemini.py
@@ -52,7 +52,11 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p
self.loaded_banner()
async def chat_with_images(self, request: ImageChatRequest) -> str:
- image_convert, prompt = await prompt_from_messages(request.messages, self.format)
+ images, prompt = await prompt_from_messages(request.messages, self.format)
+
+ if len(images) < 1:
+ images = [ await url_to_image(black_pixel_url) ]
+ prompt = '<image>\n' + prompt
if hasattr(self.model.config, 'image_size_aux'):
if not hasattr(self.image_processor, 'image_size_raw'):
@@ -61,7 +65,7 @@ async def chat_with_images(self, request: ImageChatRequest) -> str:
self.image_processor.crop_size['width'] = self.model.config.image_size_aux
self.image_processor.size['shortest_edge'] = self.model.config.image_size_aux
- image_tensor = process_images(image_convert, self.image_processor, self.model.config)
+ image_tensor = process_images(images, self.image_processor, self.model.config)
image_grid = getattr(self.model.config, 'image_grid', 1)
if hasattr(self.model.config, 'image_size_aux'):
diff --git a/backend/nanollava.py b/backend/nanollava.py
index e5037c4..d051020 100644
--- a/backend/nanollava.py
+++ b/backend/nanollava.py
@@ -8,7 +8,8 @@
transformers.logging.set_verbosity_error()
warnings.filterwarnings('ignore')
-# 'qnguyen3/nanoLLaVA'
+# qnguyen3/nanoLLaVA
+# qnguyen3/nanoLLaVA-1.5
def join_int_lists(int_lists, separator):
result = []
diff --git a/model_conf_tests.json b/model_conf_tests.json
index 21080cb..1dab289 100644
--- a/model_conf_tests.json
+++ b/model_conf_tests.json
@@ -29,6 +29,7 @@
["OpenGVLab/InternVL2-1B", "--device-map", "cuda:0"],
["OpenGVLab/InternVL2-2B", "--device-map", "cuda:0", "--load-in-4bit"],
["OpenGVLab/InternVL2-2B", "--device-map", "cuda:0"],
+ ["OpenGVLab/InternVL2-2B-AWQ", "--device-map", "cuda:0"],
["OpenGVLab/InternVL2-4B", "--device-map", "cuda:0", "--load-in-4bit"],
["OpenGVLab/InternVL2-4B", "--device-map", "cuda:0"],
["OpenGVLab/InternVL2-8B", "--device-map", "cuda:0", "--load-in-4bit"],
@@ -37,6 +38,7 @@
["OpenGVLab/InternVL2-26B", "--device-map", "cuda:0"],
["OpenGVLab/InternVL2-40B", "--device-map", "cuda:0", "--load-in-4bit"],
["OpenGVLab/InternVL2-40B", "--device-map", "cuda:0"],
+ ["OpenGVLab/InternVL2-Llama3-76B", "--device-map", "cuda:0", "--load-in-4bit"],
["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--load-in-4bit"],
["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--max-tiles", "40", "--load-in-4bit"],
["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--max-tiles", "40"],
@@ -74,6 +76,8 @@
["echo840/Monkey-Chat"],
["failspy/Phi-3-vision-128k-instruct-abliterated-alpha", "--use-flash-attn", "--load-in-4bit"],
["failspy/Phi-3-vision-128k-instruct-abliterated-alpha", "--use-flash-attn"],
+ ["internlm/internlm-xcomposer2d5-7b", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"],
+ ["internlm/internlm-xcomposer2d5-7b", "--use-flash-attn", "--device-map", "cuda:0"],
["internlm/internlm-xcomposer2-4khd-7b", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"],
["internlm/internlm-xcomposer2-4khd-7b", "--use-flash-attn", "--device-map", "cuda:0"],
["internlm/internlm-xcomposer2-7b", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"],
@@ -112,6 +116,8 @@
["qihoo360/360VL-8B", "--use-flash-attn"],
["qnguyen3/nanoLLaVA", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"],
["qnguyen3/nanoLLaVA", "--use-flash-attn", "--device-map", "cuda:0"],
+ ["qnguyen3/nanoLLaVA-1.5", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"],
+ ["qnguyen3/nanoLLaVA-1.5", "--use-flash-attn", "--device-map", "cuda:0"],
["qresearch/llama-3-vision-alpha-hf", "--device", "cuda:0", "--load-in-4bit"],
["qresearch/llama-3-vision-alpha-hf", "--device", "cuda:0"],
["tiiuae/falcon-11B-vlm", "--use-flash-attn", "--load-in-4bit"],
diff --git a/test_models.py b/test_models.py
index 4144282..38d4b0e 100755
--- a/test_models.py
+++ b/test_models.py
@@ -205,12 +205,11 @@ def generate_stream_response(image_url, prompt):
return answer
-
-
def single_round():
# XXX TODO: timeout
results = []
### Single round
+
# url tests
for name, url in urls.items():
answer = generate_response(url, "What is the subject of the image?")
@@ -244,8 +243,8 @@ def single_round():
else:
print(f"{name}[data_stream]: pass{', got: ' + answer if args.verbose else ''}")
- """
+ """
## OCR tests
quality_urls = {
'98.21': ('What is the total bill?', 'https://ocr.space/Content/Images/receipt-ocr-original.webp'),
@@ -262,6 +261,7 @@ def single_round():
break
else:
print(f"{name}[quality]: pass{', got: ' + answer if args.verbose else ''}")
+ """
# No image tests
no_image = {
@@ -287,7 +287,6 @@ def no_image_response(prompt):
else:
print(f"{name}[no_img]: pass{', got: ' + answer if args.verbose else ''}")
- """
return results
with open('model_conf_tests.json') as f:
diff --git a/vision.sample.env b/vision.sample.env
index 11f81a1..c70412b 100644
--- a/vision.sample.env
+++ b/vision.sample.env
@@ -4,126 +4,129 @@ HF_HOME=hf_home
HF_HUB_ENABLE_HF_TRANSFER=1
#HF_TOKEN=hf-...
#CUDA_VISIBLE_DEVICES=1,0
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 8.2s, mem: 8.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V" # test pass✅, time: 7.6s, mem: 19.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh --load-in-4bit" # test pass✅, time: 8.1s, mem: 9.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh" # test pass✅, time: 5.6s, mem: 10.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B --load-in-4bit" # test pass✅, time: 11.2s, mem: 8.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B" # test pass✅, time: 8.3s, mem: 11.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B-zh" # test pass✅, time: 7.3s, mem: 12.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B --load-in-4bit" # test pass✅, time: 8.7s, mem: 5.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B" # test pass✅, time: 6.6s, mem: 12.2GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B --load-in-4bit" # test pass✅, time: 9.2s, mem: 5.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B" # test pass✅, time: 8.2s, mem: 13.0GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 9.6s, mem: 9.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V" # test pass✅, time: 9.6s, mem: 19.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --load-in-4bit" # test pass✅, time: 25.0s, mem: 29.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 21.5s, mem: 71.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.7s, mem: 12.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 11.0s, mem: 22.5GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 12.7s, mem: 12.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 15.5s, mem: 12.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty --use-flash-attn --device-map cuda:0" # test pass✅, time: 13.4s, mem: 22.5GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 14.1s, mem: 12.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 19.9s, mem: 26.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40 --load-in-4bit" # test pass✅, time: 26.2s, mem: 30.0GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40" # test pass✅, time: 24.8s, mem: 54.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 17.9s, mem: 52.0GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 34.6s, mem: 31.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.6s, mem: 3.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0" # test pass✅, time: 6.6s, mem: 4.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 14.8s, mem: 4.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0" # test pass✅, time: 8.2s, mem: 6.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.6s, mem: 5.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0" # test pass✅, time: 7.7s, mem: 10.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.1s, mem: 8.5GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0" # test pass✅, time: 7.4s, mem: 18.2GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 21.4s, mem: 26.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0" # test pass✅, time: 19.9s, mem: 52.0GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 55.6s, mem: 32.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0" # test pass✅, time: 68.4s, mem: 77.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --load-in-4bit" # test pass✅, time: 6.1s, mem: 5.0GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 6.8s, mem: 6.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40" # test pass✅, time: 6.7s, mem: 8.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 5.7s, mem: 7.0GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --load-in-4bit" # test pass✅, time: 9.0s, mem: 6.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 9.3s, mem: 9.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40" # test pass✅, time: 8.4s, mem: 14.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5" # test pass✅, time: 7.4s, mem: 11.2GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat --load-in-4bit" # test pass✅, time: 8.7s, mem: 11.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 6.0s, mem: 19.5GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-r-v1" # test pass✅, time: 9.7s, mem: 10.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 24.9s, mem: 13.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 21.0s, mem: 37.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 25.9s, mem: 12.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 19.3s, mem: 36.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B --load-in-4bit" # test pass✅, time: 31.5s, mem: 22.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 31.8s, mem: 40.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B --load-in-4bit" # test pass✅, time: 135.0s, mem: 22.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B" # test pass✅, time: 99.3s, mem: 40.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 70.9s, mem: 16.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0" # test pass✅, time: 50.9s, mem: 27.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.1s, mem: 11.5GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 8.7s, mem: 20.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.1s, mem: 7.5GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 8.7s, mem: 17.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.9s, mem: 8.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.5s, mem: 18.0GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test pass✅, time: 5.3s, mem: 8.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 14.0s, mem: 15.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 18.6s, mem: 25.0GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-72b --use-flash-attn --load-in-4bit" # test pass✅, time: 48.0s, mem: 49.2GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m echo840/Monkey --load-in-4bit" # test pass✅, time: 9.3s, mem: 15.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 8.8s, mem: 21.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat --load-in-4bit" # test pass✅, time: 12.6s, mem: 15.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 11.2s, mem: 21.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha --use-flash-attn --load-in-4bit" # test pass✅, time: 10.5s, mem: 7.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha --use-flash-attn" # test pass✅, time: 9.0s, mem: 12.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 2.4s, mem: 7.2GB, 0/12 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 19.8s, mem: 20.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.4s, mem: 6.4GB, 0/12 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 25.7s, mem: 19.0GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn" # test pass✅, time: 13.7s, mem: 9.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.7s, mem: 3.0GB, 0/12 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.6s, mem: 7.2GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.5s, mem: 6.5GB, 0/12 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 24.1s, mem: 20.2GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn" # test pass✅, time: 13.5s, mem: 10.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.1s, mem: 9.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 9.2s, mem: 26.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.6s, mem: 5.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.9s, mem: 14.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 58.6s, mem: 23.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn" # test pass✅, time: 69.8s, mem: 68.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 20.7s, mem: 8.2GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn" # test pass✅, time: 18.6s, mem: 17.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 14.3s, mem: 17.2GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn" # test pass✅, time: 13.9s, mem: 33.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 13.0s, mem: 9.6GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn" # test pass✅, time: 12.6s, mem: 19.0GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.0s, mem: 1.0GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft --use-flash-attn --device-map cuda:0" # test pass✅, time: 2.5s, mem: 1.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.4s, mem: 1.5GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft --use-flash-attn --device-map cuda:0" # test pass✅, time: 2.7s, mem: 2.4GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct --use-flash-attn --load-in-4bit" # test pass✅, time: 10.2s, mem: 7.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct --use-flash-attn" # test pass✅, time: 8.5s, mem: 12.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 11.1s, mem: 9.5GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 --use-flash-attn --device-map cuda:0" # test pass✅, time: 10.5s, mem: 19.2GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.0s, mem: 4.8GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.2s, mem: 8.6GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 8.3s, mem: 8.7GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V" # test pass✅, time: 7.3s, mem: 19.6GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh --load-in-4bit" # test pass✅, time: 8.2s, mem: 9.2GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh" # test pass✅, time: 5.4s, mem: 10.8GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B --load-in-4bit" # test pass✅, time: 11.6s, mem: 8.4GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B" # test pass✅, time: 9.0s, mem: 11.9GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B-zh" # test pass✅, time: 8.5s, mem: 12.4GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B --load-in-4bit" # test pass✅, time: 9.2s, mem: 5.1GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B" # test pass✅, time: 6.4s, mem: 12.2GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B --load-in-4bit" # test pass✅, time: 8.9s, mem: 5.8GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B" # test pass✅, time: 7.9s, mem: 13.1GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 9.8s, mem: 9.3GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V" # test pass✅, time: 9.7s, mem: 19.6GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --load-in-4bit" # test pass✅, time: 25.6s, mem: 29.3GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 18.8s, mem: 71.8GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 11.0s, mem: 12.9GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 11.0s, mem: 22.5GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 12.8s, mem: 12.8GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 15.6s, mem: 12.9GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty --use-flash-attn --device-map cuda:0" # test pass✅, time: 13.6s, mem: 22.5GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 14.4s, mem: 12.8GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 20.3s, mem: 26.9GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40 --load-in-4bit" # test pass✅, time: 26.7s, mem: 30.0GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40" # test pass✅, time: 25.0s, mem: 54.6GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 18.0s, mem: 52.0GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 35.3s, mem: 31.1GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.8s, mem: 3.9GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0" # test pass✅, time: 6.6s, mem: 4.3GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 14.4s, mem: 4.7GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0" # test pass✅, time: 7.8s, mem: 6.6GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.3s, mem: 5.8GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0" # test pass✅, time: 7.3s, mem: 10.9GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.3s, mem: 8.5GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0" # test pass✅, time: 7.9s, mem: 18.2GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 21.6s, mem: 26.9GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0" # test pass✅, time: 20.0s, mem: 52.0GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 56.0s, mem: 32.3GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0" # test pass✅, time: 68.2s, mem: 77.1GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-Llama3-76B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 39.3s, mem: 53.3GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --load-in-4bit" # test pass✅, time: 5.6s, mem: 5.1GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 6.4s, mem: 6.8GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40" # test pass✅, time: 6.3s, mem: 8.8GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 5.4s, mem: 7.0GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --load-in-4bit" # test pass✅, time: 8.6s, mem: 6.2GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 9.0s, mem: 9.7GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40" # test pass✅, time: 8.0s, mem: 14.8GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5" # test pass✅, time: 6.9s, mem: 11.2GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat --load-in-4bit" # test pass✅, time: 9.0s, mem: 11.2GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 6.1s, mem: 19.5GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-r-v1" # test pass✅, time: 9.7s, mem: 10.1GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 25.5s, mem: 13.3GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 21.1s, mem: 37.6GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 26.3s, mem: 12.8GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 19.7s, mem: 36.4GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B --load-in-4bit" # test pass✅, time: 32.1s, mem: 22.3GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 31.9s, mem: 40.6GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B --load-in-4bit" # test pass✅, time: 134.7s, mem: 22.3GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B" # test pass✅, time: 98.9s, mem: 40.6GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 70.6s, mem: 16.4GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0" # test pass✅, time: 50.5s, mem: 27.9GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.2s, mem: 11.5GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 8.9s, mem: 20.3GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.5s, mem: 7.6GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 8.8s, mem: 17.4GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.5s, mem: 8.3GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.5s, mem: 18.0GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test pass✅, time: 5.9s, mem: 8.9GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 15.1s, mem: 15.9GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 18.7s, mem: 25.0GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-72b --use-flash-attn --load-in-4bit" # test pass✅, time: 48.2s, mem: 49.3GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m echo840/Monkey --load-in-4bit" # test pass✅, time: 9.0s, mem: 15.8GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 8.6s, mem: 21.8GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat --load-in-4bit" # test pass✅, time: 12.3s, mem: 15.8GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 10.9s, mem: 21.8GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha --use-flash-attn --load-in-4bit" # test pass✅, time: 10.5s, mem: 7.1GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha --use-flash-attn" # test pass✅, time: 8.7s, mem: 12.3GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 2.0s, mem: 7.2GB, 1/13 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 18.2s, mem: 20.6GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.4s, mem: 6.4GB, 1/13 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 26.0s, mem: 19.0GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn" # test pass✅, time: 15.1s, mem: 9.4GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.6s, mem: 3.0GB, 1/13 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.9s, mem: 7.2GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.5s, mem: 6.5GB, 1/13 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 22.2s, mem: 20.3GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn" # test pass✅, time: 15.9s, mem: 10.9GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.4s, mem: 9.6GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 9.4s, mem: 26.8GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.0s, mem: 5.7GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 8.1s, mem: 14.5GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 59.9s, mem: 23.4GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn" # test pass✅, time: 69.3s, mem: 68.7GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 21.4s, mem: 8.2GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn" # test pass✅, time: 19.0s, mem: 17.4GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 14.6s, mem: 17.2GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn" # test pass✅, time: 14.2s, mem: 33.8GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 12.8s, mem: 9.6GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn" # test pass✅, time: 12.5s, mem: 19.0GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 2.7s, mem: 1.0GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft --use-flash-attn --device-map cuda:0" # test pass✅, time: 2.5s, mem: 1.3GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.1s, mem: 1.5GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft --use-flash-attn --device-map cuda:0" # test pass✅, time: 2.4s, mem: 2.4GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct --use-flash-attn --load-in-4bit" # test pass✅, time: 10.5s, mem: 7.1GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct --use-flash-attn" # test pass✅, time: 8.7s, mem: 12.3GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.2s, mem: 9.6GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 --use-flash-attn --device-map cuda:0" # test pass✅, time: 10.1s, mem: 19.2GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.7s, mem: 4.8GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 5.7s, mem: 8.6GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 3.3s, mem: 3.5GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500}
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 11.7s, mem: 8.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn --load-in-4bit" # test pass✅, time: 9.4s, mem: 8.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn" # test pass✅, time: 7.6s, mem: 17.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.0s, mem: 7.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.5s, mem: 8.1GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0 --load-in-4bit" # test pass✅, time: 9.0s, mem: 7.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 8.8s, mem: 17.2GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn --load-in-4bit" # test pass✅, time: 14.6s, mem: 17.0GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn" # test pass✅, time: 16.3s, mem: 32.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1 --load-in-4bit" # test pass✅, time: 10.9s, mem: 7.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1" # test pass✅, time: 9.8s, mem: 17.2GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1 --load-in-4bit" # test pass✅, time: 9.6s, mem: 7.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1" # test pass✅, time: 12.7s, mem: 17.2GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn --load-in-4bit" # test pass✅, time: 6.2s, mem: 2.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 5.0s, mem: 4.6GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 9.7s, mem: 9.0GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn --load-in-4bit" # test pass✅, time: 9.8s, mem: 8.3GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn" # test pass✅, time: 8.0s, mem: 17.3GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.6s, mem: 7.7GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.3s, mem: 8.1GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 7.6s, mem: 7.7GB, 12/13 tests passed.
+#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.0s, mem: 8.1GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0 --load-in-4bit" # test pass✅, time: 9.7s, mem: 8.1GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 8.5s, mem: 17.6GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn --load-in-4bit" # test pass✅, time: 15.0s, mem: 17.0GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn" # test pass✅, time: 16.4s, mem: 32.3GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1 --load-in-4bit" # test pass✅, time: 10.7s, mem: 8.1GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1" # test pass✅, time: 9.7s, mem: 17.7GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1 --load-in-4bit" # test pass✅, time: 9.9s, mem: 8.1GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1" # test pass✅, time: 13.3s, mem: 17.7GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn --load-in-4bit" # test pass✅, time: 6.2s, mem: 2.9GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 4.9s, mem: 4.6GB, 13/13 tests passed.
diff --git a/vision_qna.py b/vision_qna.py
index 4c956bc..211f072 100644
--- a/vision_qna.py
+++ b/vision_qna.py
@@ -12,7 +12,8 @@
from loguru import logger
# When models require an image but no image given
-transparent_pixel_url = 'data:image/png;charset=utf-8;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABAQMAAAAl21bKAAAAA1BMVEUAAACnej3aAAAAAXRSTlMAQObYZgAAAApJREFUCNdjYAAAAAIAAeIhvDMAAAAASUVORK5CYII='
+black_pixel_url = 'data:image/png;charset=utf-8;base64,iVBORw0KGgoAAAANSUhEUgAAAAgAAAAICAIAAABLbSncAAAADElEQVQI12NgGB4AAADIAAF8Y2l9AAAAAElFTkSuQmCC'
+transparent_pixel_url = 'data:image/png;charset=utf-8;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAADElEQVQI12P4//8/AAX+Av7czFnnAAAAAElFTkSuQmCC'
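+# black_pixel_url is an 8x8 black PNG, transparent_pixel_url is a 1x1 transparent PNG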
class ImageURL(BaseModel):
url: str
@@ -89,7 +90,7 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p
torch.set_grad_enabled(False)
def loaded_banner(self):
- logger.info(f"Loaded {self._model_id} on device: {self.model.device} with dtype: {self.model.dtype} and template: {self.format}")
+ logger.info(f"Loaded {self._model_id} [ device: {self.model.device}, dtype: {self.model.dtype}, template: {self.format} ]")
def select_device(self):
return 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'