From 96cc4e33b69a3dac5b3a3268055c254ef13a857f Mon Sep 17 00:00:00 2001 From: matatonic Date: Sun, 22 Sep 2024 17:43:29 -0400 Subject: [PATCH] 0.33.0 --- Dockerfile | 2 +- README.md | 33 ++---- backend/qwen2-vl.py | 4 + model_conf_tests.alt.json | 6 + model_conf_tests.json | 6 - requirements.txt | 3 +- vision-alt.sample.env | 54 +++++---- vision.sample.env | 231 +++++++++++++++++++------------------- 8 files changed, 168 insertions(+), 171 deletions(-) diff --git a/Dockerfile b/Dockerfile index 04e2ef7..4c06d41 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,7 +10,7 @@ RUN git clone https://github.com/togethercomputer/Dragonfly --single-branch /app COPY requirements.txt . ARG VERSION=latest -RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.41.2" >> requirements.txt; else echo "git+https://github.com/huggingface/transformers\nautoawq>=0.2.5" >> requirements.txt ; fi +RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.41.2" >> requirements.txt; else echo "git+https://github.com/huggingface/transformers" >> requirements.txt ; fi # TODO: nvidia apex wheel RUN --mount=type=cache,target=/root/.cache/pip pip install -U -r requirements.txt diff --git a/README.md b/README.md index 3a70ef0..33fbc4c 100644 --- a/README.md +++ b/README.md @@ -38,10 +38,10 @@ Can't decide which to use? See the [OpenVLM Leaderboard](https://huggingface.co/ - [X] [fancyfeast/joy-caption-pre-alpha](https://huggingface.co/spaces/fancyfeast/joy-caption-pre-alpha) (caption only) - [X] [fuyu-8b](https://huggingface.co/adept/fuyu-8b) [pretrain] - [X] [HuggingFaceM4/idefics2](https://huggingface.co/HuggingFaceM4) -- - [X] [idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b) (wont gpu split) -- - [X] [idefics2-8b-AWQ](https://huggingface.co/HuggingFaceM4/idefics2-8b-AWQ) (wont gpu split) -- - [X] [idefics2-8b-chatty](https://huggingface.co/HuggingFaceM4/idefics2-8b-chatty) (wont gpu split) -- - [X] [idefics2-8b-chatty-AWQ](https://huggingface.co/HuggingFaceM4/idefics2-8b-chatty-AWQ) (wont gpu split) +- - [X] [idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b) (wont gpu split, alternate docker only) +- - [X] [idefics2-8b-AWQ](https://huggingface.co/HuggingFaceM4/idefics2-8b-AWQ) (wont gpu split, alternate docker only) +- - [X] [idefics2-8b-chatty](https://huggingface.co/HuggingFaceM4/idefics2-8b-chatty) (wont gpu split, alternate docker only) +- - [X] [idefics2-8b-chatty-AWQ](https://huggingface.co/HuggingFaceM4/idefics2-8b-chatty-AWQ) (wont gpu split, alternate docker only) - [X] [InternLM](https://huggingface.co/internlm/) - - [X] [XComposer2-2d5-7b](https://huggingface.co/internlm/internlm-xcomposer2d5-7b) (wont gpu split) - - [X] [XComposer2-4KHD-7b](https://huggingface.co/internlm/internlm-xcomposer2-4khd-7b) (wont gpu split) @@ -148,7 +148,9 @@ If you can't find your favorite model, you can [open a new issue](https://github Version 0.33.0 - new model support: mx262/MiniMonkey, thanks [@white2018](https://github.com/white2018) -- Fix qwen2-vl when used with qwen-agent and multiple system prompts (tools), thanks [@cedonley](https://github.com/cedonley) +- Fix Qwen2-VL when used with Qwen-Agent and multiple system prompts (tools), thanks [@cedonley](https://github.com/cedonley) +- idefics2-8b support moved to alt image +- pin Qwen2-VL-7B-Instruct-AWQ revision, [see note for info](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ/discussions/4) Version 0.32.0 @@ -359,7 +361,7 @@ docker compose -f docker-compose.alt.yml pull python -m venv .venv source .venv/bin/activate # install the 
python dependencies -pip install -U -r requirements.txt "git+https://github.com/huggingface/transformers" "autoawq>=0.2.5" +pip install -U -r requirements.txt "git+https://github.com/huggingface/transformers" # OR install the python dependencies for the alt version pip install -U -r requirements.txt "transformers==4.41.2" # run the server with your chosen model @@ -484,19 +486,6 @@ CUDA_VISIBLE_DEVICES=1,0 python vision.py -m llava-hf/llava-v1.6-34b-hf --device You can also use the environment variable: `OPENEDAI_DEVICE_MAP="sequential"` to specify the `--device-map` argument. -4. "My Nvidia GPU isn't detected when using docker." -- On Linux, you may need to specify the default runtime for your container environment (and perhaps install the nvidia-container-runtime), like so: -In /etc/docker/daemon.json: -```json -{ - "runtimes": { - "nvidia": { - "path": "nvidia-container-runtime", - "runtimeArgs": [] - } - }, - "default-runtime": "nvidia" -} -``` -- In Windows, be sure you have WSL2 installed and docker is configured to use it. Also make sure your nvidia drivers are up to date. - +4. "My Nvidia GPU isn't detected when using docker.", using Nvidia CUDA with docker. +- Linux: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html +- Windows: Use WSL2 with docker and nvidia drivers: https://docs.nvidia.com/cuda/wsl-user-guide/index.html diff --git a/backend/qwen2-vl.py b/backend/qwen2-vl.py index 528463d..f0f3b06 100644 --- a/backend/qwen2-vl.py +++ b/backend/qwen2-vl.py @@ -25,6 +25,10 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p self.processor = AutoProcessor.from_pretrained(model_id) del self.params['trust_remote_code'] + + if model_id == 'Qwen/Qwen2-VL-7B-Instruct-AWQ': + self.params['revision'] = '9d72ae62396aaa1817b006e07ddbbd121024f50d' # re: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ/discussions/4 + self.model = Qwen2VLForConditionalGeneration.from_pretrained(**self.params).eval() self.loaded_banner() diff --git a/model_conf_tests.alt.json b/model_conf_tests.alt.json index 96aebe3..c2722ad 100644 --- a/model_conf_tests.alt.json +++ b/model_conf_tests.alt.json @@ -1,4 +1,10 @@ [ + ["HuggingFaceM4/idefics2-8b", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], + ["HuggingFaceM4/idefics2-8b", "-A", "flash_attention_2", "--device-map", "cuda:0"], + ["HuggingFaceM4/idefics2-8b-AWQ", "-A", "flash_attention_2", "--device-map", "cuda:0"], + ["HuggingFaceM4/idefics2-8b-chatty", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], + ["HuggingFaceM4/idefics2-8b-chatty", "-A", "flash_attention_2", "--device-map", "cuda:0"], + ["HuggingFaceM4/idefics2-8b-chatty-AWQ", "-A", "flash_attention_2", "--device-map", "cuda:0"], ["OpenGVLab/InternVL2-4B", "--device-map", "cuda:0", "--load-in-4bit"], ["OpenGVLab/InternVL2-4B", "--device-map", "cuda:0"], ["OpenGVLab/Mini-InternVL-Chat-4B-V1-5", "--max-tiles", "40", "--load-in-4bit"], diff --git a/model_conf_tests.json b/model_conf_tests.json index 95a864a..e6b285a 100644 --- a/model_conf_tests.json +++ b/model_conf_tests.json @@ -16,12 +16,6 @@ ["BAAI/Bunny-v1_1-Llama-3-8B-V"], ["BAAI/Emu2-Chat", "--load-in-4bit"], ["BAAI/Emu2-Chat", "--max-memory=0:78GiB,1:20GiB"], - ["HuggingFaceM4/idefics2-8b", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], - ["HuggingFaceM4/idefics2-8b", "-A", "flash_attention_2", "--device-map", "cuda:0"], - ["HuggingFaceM4/idefics2-8b-AWQ", "-A", "flash_attention_2", "--device-map", 
"cuda:0"], - ["HuggingFaceM4/idefics2-8b-chatty", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], - ["HuggingFaceM4/idefics2-8b-chatty", "-A", "flash_attention_2", "--device-map", "cuda:0"], - ["HuggingFaceM4/idefics2-8b-chatty-AWQ", "-A", "flash_attention_2", "--device-map", "cuda:0"], ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--load-in-4bit"], ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--max-tiles", "40", "--load-in-4bit"], ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--max-tiles", "40"], diff --git a/requirements.txt b/requirements.txt index e03778e..8a2319b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ accelerate auto_gptq +autoawq bitsandbytes fastapi # See: https://github.com/bdashore3/flash-attention/releases for other windows flash_attn releases @@ -54,4 +55,4 @@ git+https://github.com/LLaVA-VL/LLaVA-NeXT.git # mistral mistral_inference>=1.4.0 -mistral_common>=1.4.0 +mistral_common[opencv]>=1.4.3 diff --git a/vision-alt.sample.env b/vision-alt.sample.env index 0233fc6..57f5f8d 100644 --- a/vision-alt.sample.env +++ b/vision-alt.sample.env @@ -4,27 +4,33 @@ HF_HOME=hf_home HF_HUB_ENABLE_HF_TRANSFER=1 #HF_TOKEN=hf-... #CUDA_VISIBLE_DEVICES=1,0 -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.3s, mem: 5.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0" # test pass✅, time: 8.5s, mem: 10.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 10.2s, mem: 9.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --load-in-4bit" # test pass✅, time: 9.7s, mem: 6.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5" # test pass✅, time: 7.9s, mem: 11.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 27.3s, mem: 13.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 23.3s, mem: 37.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 28.3s, mem: 12.9GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 20.5s, mem: 36.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B --load-in-4bit" # test pass✅, time: 33.1s, mem: 22.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 31.4s, mem: 40.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B --load-in-4bit" # test pass✅, time: 137.4s, mem: 22.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B" # test pass✅, time: 99.7s, mem: 40.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-72b --use-flash-attn --load-in-4bit --device-map cuda:0" # test pass✅, time: 52.7s, mem: 48.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-7b --use-flash-attn --load-in-4bit --device-map cuda:0" # test pass✅, time: 27.4s, mem: 8.8GB, 13/13 tests passed. 
-#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-7b --use-flash-attn --device-map cuda:0" # test fail❌, time: 8.4s, mem: 16.5GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 23.6s, mem: 8.0GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn" # test pass✅, time: 18.3s, mem: 17.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.2s, mem: 4.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.9s, mem: 8.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 3.3s, mem: 3.4GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 9.8s, mem: 8.9GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn --load-in-4bit" # test pass✅, time: 16.6s, mem: 16.9GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn" # test pass✅, time: 16.5s, mem: 32.1GB, 13/13 tests passed. \ No newline at end of file +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 24.5s, mem: 29.8GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 21.4s, mem: 39.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-AWQ -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 24.7s, mem: 29.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 30.1s, mem: 31.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 24.1s, mem: 40.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty-AWQ -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 26.3s, mem: 31.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.4s, mem: 7.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0" # test pass✅, time: 7.7s, mem: 11.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 9.9s, mem: 10.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --load-in-4bit" # test pass✅, time: 8.9s, mem: 6.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5" # test pass✅, time: 7.2s, mem: 11.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 30.3s, mem: 13.3GB, 13/13 tests passed. 
+#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 22.3s, mem: 37.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 27.3s, mem: 12.7GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 20.7s, mem: 36.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B --load-in-4bit" # test pass✅, time: 36.1s, mem: 22.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 30.5s, mem: 40.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B --load-in-4bit" # test pass✅, time: 118.3s, mem: 22.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B" # test pass✅, time: 123.8s, mem: 40.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-72b -A flash_attention_2 --load-in-4bit --device-map cuda:0" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (timeout). +#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-7b -A flash_attention_2 --load-in-4bit --device-map cuda:0" # test pass✅, time: 32.6s, mem: 8.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-7b -A flash_attention_2 --device-map cuda:0" # test fail❌, time: 6.4s, mem: 16.6GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 20.4s, mem: 8.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf -A flash_attention_2" # test pass✅, time: 16.7s, mem: 17.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.2s, mem: 5.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 6.4s, mem: 8.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.7s, mem: 3.5GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 9.0s, mem: 9.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm -A flash_attention_2 --load-in-4bit" # test pass✅, time: 16.2s, mem: 17.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm -A flash_attention_2" # test pass✅, time: 15.6s, mem: 32.3GB, 13/13 tests passed. diff --git a/vision.sample.env b/vision.sample.env index 5e5a638..e110915 100644 --- a/vision.sample.env +++ b/vision.sample.env @@ -4,120 +4,117 @@ HF_HOME=hf_home HF_HUB_ENABLE_HF_TRANSFER=1 #HF_TOKEN=hf-... #CUDA_VISIBLE_DEVICES=1,0 -#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.5-Gemma2-9B -A flash_attention_2" # test pass✅, time: 9.4s, mem: 23.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.5-Llama3-8B -A flash_attention_2" # test pass✅, time: 5.8s, mem: 19.6GB, 13/13 tests passed. 
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 10.6s, mem: 9.0GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V" # test pass✅, time: 8.0s, mem: 20.0GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh --load-in-4bit" # test pass✅, time: 7.1s, mem: 9.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh" # test pass✅, time: 5.4s, mem: 11.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B --load-in-4bit" # test pass✅, time: 10.2s, mem: 8.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B" # test pass✅, time: 9.1s, mem: 12.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B-zh" # test pass✅, time: 7.8s, mem: 12.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B --load-in-4bit" # test pass✅, time: 8.9s, mem: 5.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B" # test pass✅, time: 6.7s, mem: 12.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B --load-in-4bit" # test pass✅, time: 10.4s, mem: 6.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B" # test pass✅, time: 8.3s, mem: 13.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 11.8s, mem: 9.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V" # test pass✅, time: 10.7s, mem: 20.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --load-in-4bit" # test pass✅, time: 31.7s, mem: 29.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 21.4s, mem: 72.1GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 28.9s, mem: 30.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 23.9s, mem: 39.0GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-AWQ -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 29.6s, mem: 29.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 34.5s, mem: 30.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 27.4s, mem: 38.9GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty-AWQ -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 33.1s, mem: 29.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 23.3s, mem: 27.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40 --load-in-4bit" # test pass✅, time: 29.9s, mem: 30.4GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40" # test pass✅, time: 25.4s, mem: 54.9GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 19.0s, mem: 52.4GB, 13/13 tests passed. 
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 30.2s, mem: 2.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0" # test pass✅, time: 8.0s, mem: 3.0GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 17.4s, mem: 5.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0" # test pass✅, time: 8.7s, mem: 7.1GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.6s, mem: 9.1GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0" # test pass✅, time: 8.0s, mem: 18.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 25.9s, mem: 27.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0" # test pass✅, time: 20.3s, mem: 52.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 37.6s, mem: 31.9GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0" # test pass✅, time: 47.3s, mem: 76.9GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-Llama3-76B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 40.6s, mem: 51.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --load-in-4bit" # test pass✅, time: 6.7s, mem: 5.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 7.5s, mem: 7.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40" # test pass✅, time: 6.9s, mem: 9.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 6.0s, mem: 7.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat --load-in-4bit" # test pass✅, time: 9.8s, mem: 11.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 7.0s, mem: 19.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-2B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 12.4s, mem: 7.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-2B-Instruct -A flash_attention_2" # test pass✅, time: 17.3s, mem: 16.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-7B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 21.2s, mem: 18.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-7B-Instruct -A flash_attention_2" # test pass✅, time: 20.6s, mem: 27.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-72B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 35.2s, mem: 44.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-dpo-r-v1.5" # test pass✅, time: 8.9s, mem: 9.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5" # test pass✅, time: 3.8s, mem: 9.8GB, 13/13 tests passed. 
-#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-singleimg-r-v1.5" # test pass✅, time: 8.7s, mem: 9.4GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-r-v1" # test pass✅, time: 9.2s, mem: 10.0GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0 --load-in-4bit" # test fail❌, time: 3.1s, mem: 15.5GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} -#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0" # test fail❌, time: 2.9s, mem: 27.1GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.3s, mem: 11.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 8.8s, mem: 20.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 11.7s, mem: 8.0GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 8.7s, mem: 17.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.3s, mem: 8.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 6.8s, mem: 18.4GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 21.7s, mem: 16.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 16.7s, mem: 25.4GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat --load-in-4bit" # test pass✅, time: 15.0s, mem: 16.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 11.6s, mem: 21.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha -A flash_attention_2 --load-in-4bit" # test pass✅, time: 10.5s, mem: 7.1GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha -A flash_attention_2" # test pass✅, time: 9.2s, mem: 12.1GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-pre-alpha --load-in-4bit -A flash_attention_2" # test pass✅, time: 107.9s, mem: 9.1GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-pre-alpha -A flash_attention_2" # test pass✅, time: 60.2s, mem: 18.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2d5-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 27.9s, mem: 28.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 23.2s, mem: 21.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 30.0s, mem: 18.8GB, 13/13 tests passed. 
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit -A flash_attention_2" # test pass✅, time: 56.5s, mem: 9.0GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 9.1s, mem: 7.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 25.4s, mem: 20.0GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit -A flash_attention_2" # test pass✅, time: 41.9s, mem: 10.4GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 14.2s, mem: 9.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 10.6s, mem: 26.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.4s, mem: 5.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 9.5s, mem: 14.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 66.1s, mem: 22.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf -A flash_attention_2" # test pass✅, time: 87.8s, mem: 67.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 18.8s, mem: 12.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf -A flash_attention_2" # test pass✅, time: 17.6s, mem: 29.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 20.9s, mem: 8.0GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf -A flash_attention_2" # test pass✅, time: 16.2s, mem: 16.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m lmms-lab/llava-onevision-qwen2-0.5b-ov -A flash_attention_2" # test pass✅, time: 8.6s, mem: 15.4GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m lmms-lab/llava-onevision-qwen2-7b-ov -A flash_attention_2" # test pass✅, time: 18.5s, mem: 28.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.6s, mem: 1.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 3.1s, mem: 1.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.6s, mem: 1.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 2.9s, mem: 2.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 12.6s, mem: 6.8GB, 13/13 tests passed. 
-#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct -A flash_attention_2" # test pass✅, time: 9.0s, mem: 11.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 9.3s, mem: 4.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2" # test pass✅, time: 8.3s, mem: 9.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m mistralai/Pixtral-12B-2409" # test pass✅, time: 16.6s, mem: 35.9GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m omlab/omchat-v2.0-13B-single-beta_hf -A flash_attention_2" # test pass✅, time: 27.9s, mem: 41.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6-int4 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 15.9s, mem: 9.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 14.7s, mem: 9.9GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 12.8s, mem: 19.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 27.0s, mem: 9.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 25.2s, mem: 19.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B -A flash_attention_2 --load-in-4bit" # test pass✅, time: 13.2s, mem: 8.4GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B -A flash_attention_2" # test pass✅, time: 7.9s, mem: 17.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.6s, mem: 7.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 6.2s, mem: 8.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.6s, mem: 7.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 6.4s, mem: 8.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0 --load-in-4bit" # test pass✅, time: 12.0s, mem: 6.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 8.8s, mem: 16.9GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1 --load-in-4bit" # test pass✅, time: 10.8s, mem: 8.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1" # test pass✅, time: 9.4s, mem: 17.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1 --load-in-4bit" # test pass✅, time: 13.3s, mem: 8.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1" # test pass✅, time: 12.4s, mem: 17.8GB, 13/13 tests passed. 
-#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 6.0s, mem: 2.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 -A flash_attention_2" # test pass✅, time: 4.7s, mem: 4.7GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.5-Gemma2-9B -A flash_attention_2" # test pass✅, time: 9.6s, mem: 23.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.5-Llama3-8B -A flash_attention_2" # test pass✅, time: 5.9s, mem: 19.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 10.2s, mem: 8.7GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V" # test pass✅, time: 7.4s, mem: 19.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh --load-in-4bit" # test pass✅, time: 6.5s, mem: 9.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh" # test pass✅, time: 5.0s, mem: 10.8GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B --load-in-4bit" # test pass✅, time: 10.3s, mem: 8.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B" # test pass✅, time: 8.9s, mem: 11.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B-zh" # test pass✅, time: 7.8s, mem: 12.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B --load-in-4bit" # test pass✅, time: 8.5s, mem: 5.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B" # test pass✅, time: 6.4s, mem: 12.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B --load-in-4bit" # test pass✅, time: 9.8s, mem: 5.8GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B" # test pass✅, time: 7.8s, mem: 13.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 11.3s, mem: 9.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V" # test pass✅, time: 10.4s, mem: 19.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --load-in-4bit" # test pass✅, time: 32.1s, mem: 29.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 18.7s, mem: 71.8GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 21.9s, mem: 26.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40 --load-in-4bit" # test pass✅, time: 28.6s, mem: 30.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40" # test pass✅, time: 24.9s, mem: 54.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 18.4s, mem: 52.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 27.5s, mem: 1.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0" # test pass✅, time: 7.0s, mem: 2.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 15.5s, mem: 4.8GB, 13/13 tests passed. 
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0" # test pass✅, time: 8.0s, mem: 6.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.9s, mem: 8.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0" # test pass✅, time: 7.5s, mem: 18.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 24.5s, mem: 26.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0" # test pass✅, time: 19.9s, mem: 52.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 34.8s, mem: 31.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0" # test pass✅, time: 46.5s, mem: 76.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-Llama3-76B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 39.6s, mem: 51.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --load-in-4bit" # test pass✅, time: 5.9s, mem: 5.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 6.8s, mem: 6.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40" # test pass✅, time: 6.2s, mem: 8.7GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 5.4s, mem: 7.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat --load-in-4bit" # test pass✅, time: 9.0s, mem: 11.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 7.1s, mem: 19.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-2B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 11.7s, mem: 6.8GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-2B-Instruct -A flash_attention_2" # test pass✅, time: 14.5s, mem: 16.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-7B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 17.9s, mem: 18.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-7B-Instruct -A flash_attention_2" # test pass✅, time: 15.1s, mem: 27.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-72B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 34.8s, mem: 44.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-dpo-r-v1.5" # test pass✅, time: 7.9s, mem: 9.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5" # test pass✅, time: 3.3s, mem: 9.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-singleimg-r-v1.5" # test pass✅, time: 8.0s, mem: 9.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-r-v1" # test pass✅, time: 8.4s, mem: 9.8GB, 13/13 tests passed. 
+#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.5s, mem: 15.0GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} +#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0" # test fail❌, time: 1.6s, mem: 26.7GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.0s, mem: 11.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 8.3s, mem: 20.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.9s, mem: 7.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 8.0s, mem: 17.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.6s, mem: 8.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 5.9s, mem: 18.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 20.9s, mem: 15.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 15.8s, mem: 24.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat --load-in-4bit" # test pass✅, time: 14.3s, mem: 15.8GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 11.1s, mem: 21.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha -A flash_attention_2 --load-in-4bit" # test pass✅, time: 9.6s, mem: 6.7GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha -A flash_attention_2" # test pass✅, time: 8.3s, mem: 11.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-pre-alpha --load-in-4bit -A flash_attention_2" # test pass✅, time: 100.3s, mem: 8.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-pre-alpha -A flash_attention_2" # test pass✅, time: 58.7s, mem: 18.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2d5-7b -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test fail❌, time: 2.7s, mem: 8.9GB, 1/13 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2d5-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 28.4s, mem: 28.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 20.6s, mem: 21.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 26.2s, mem: 18.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit -A flash_attention_2" # test pass✅, time: 62.4s, mem: 8.5GB, 13/13 tests passed. 
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 7.3s, mem: 7.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 25.4s, mem: 19.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit -A flash_attention_2" # test pass✅, time: 47.9s, mem: 10.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 14.1s, mem: 9.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 10.2s, mem: 26.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.0s, mem: 5.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 8.5s, mem: 14.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 64.1s, mem: 22.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf -A flash_attention_2" # test pass✅, time: 87.3s, mem: 67.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 18.1s, mem: 12.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf -A flash_attention_2" # test pass✅, time: 17.3s, mem: 29.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 20.2s, mem: 7.7GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf -A flash_attention_2" # test pass✅, time: 15.7s, mem: 16.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m lmms-lab/llava-onevision-qwen2-0.5b-ov -A flash_attention_2" # test pass✅, time: 8.0s, mem: 15.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m lmms-lab/llava-onevision-qwen2-7b-ov -A flash_attention_2" # test pass✅, time: 18.1s, mem: 28.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 2.4s, mem: 0.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 1.9s, mem: 1.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 2.8s, mem: 1.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 2.1s, mem: 2.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 12.1s, mem: 6.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct -A flash_attention_2" # test pass✅, time: 8.5s, mem: 11.5GB, 13/13 tests passed. 
+#CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 8.4s, mem: 4.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2" # test pass✅, time: 7.6s, mem: 9.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m mistralai/Pixtral-12B-2409" # test pass✅, time: 17.2s, mem: 35.8GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m mx262/MiniMonkey -A flash_attention_2 --load-in-4bit" # test pass✅, time: 10.3s, mem: 12.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m mx262/MiniMonkey -A flash_attention_2" # test pass✅, time: 9.4s, mem: 14.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m omlab/omchat-v2.0-13B-single-beta_hf -A flash_attention_2" # test pass✅, time: 27.3s, mem: 41.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6-int4 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 16.6s, mem: 9.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 13.4s, mem: 9.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 11.7s, mem: 19.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 23.1s, mem: 9.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 22.0s, mem: 19.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B -A flash_attention_2 --load-in-4bit" # test pass✅, time: 12.5s, mem: 8.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B -A flash_attention_2" # test pass✅, time: 7.5s, mem: 17.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.6s, mem: 7.7GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 6.3s, mem: 8.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.9s, mem: 7.7GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 6.6s, mem: 8.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0 --load-in-4bit" # test pass✅, time: 11.3s, mem: 6.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 8.0s, mem: 16.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1 --load-in-4bit" # test pass✅, time: 10.3s, mem: 8.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1" # test pass✅, time: 8.9s, mem: 17.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1 --load-in-4bit" # test pass✅, time: 13.2s, mem: 8.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1" # test pass✅, time: 13.0s, mem: 17.6GB, 13/13 tests passed. 
+#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 6.1s, mem: 2.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 -A flash_attention_2" # test pass✅, time: 4.7s, mem: 4.5GB, 13/13 tests passed.