From 415f3552e9895a559c4785e661a117538a7dec40 Mon Sep 17 00:00:00 2001
From: matatonic
Date: Wed, 22 May 2024 19:38:38 -0400
Subject: [PATCH] 0.17.0

---
 README.md             |   2 +-
 vision-alt.sample.env |  93 ++++++++++++++++++-----------------
 vision.sample.env     | 111 +++++++++++++++++++++---------------------
 3 files changed, 104 insertions(+), 102 deletions(-)

diff --git a/README.md b/README.md
index 0965f6b..98b471b 100644
--- a/README.md
+++ b/README.md
@@ -60,7 +60,7 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview`
 - [X] [Moondream2](https://huggingface.co/vikhyatk/moondream2)
 - [X] [Moondream1](https://huggingface.co/vikhyatk/moondream1) (alternate docker only)
 - [X] [openbmb](https://huggingface.co/openbmb)
-- - [X] [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5)
+- - [X] [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) (main docker only)
 - - [X] [MiniCPM-V-2](https://huggingface.co/openbmb/MiniCPM-V-2)
 - - [X] [MiniCPM-V aka. OmniLMM-3B](https://huggingface.co/openbmb/MiniCPM-V)
 - - [ ] [OmniLMM-12B](https://huggingface.co/openbmb/OmniLMM-12B)
diff --git a/vision-alt.sample.env b/vision-alt.sample.env
index ac75d83..101c990 100644
--- a/vision-alt.sample.env
+++ b/vision-alt.sample.env
@@ -2,52 +2,53 @@
 # Copy this file to vision.env and uncomment the model of your choice.
 HF_HOME=hf_home
 #CUDA_VISIBLE_DEVICES=1,0
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 13.8s, mem: 52.0GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --load-in-4bit --device-map cuda:0" # test fail❌, time: 16.4s, mem: 18.2GB, 2/8 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 26.0s, mem: 31.3GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 14.3s, mem: 52.0GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --load-in-4bit --device-map cuda:0" # test fail❌, time: 17.1s, mem: 18.2GB, 2/8 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 27.0s, mem: 31.3GB, 8/8 tests passed.
 #CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 3.5s, mem: 7.0GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 4.2s, mem: 19.5GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 19.6s, mem: 12.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 15.0s, mem: 37.2GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 19.4s, mem: 12.3GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 13.8s, mem: 36.4GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 21.9s, mem: 40.8GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B" # test pass✅, time: 64.1s, mem: 40.8GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --load-in-4bit --use-flash-attn" # test pass✅, time: 37.7s, mem: 10.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --use-flash-attn" # test pass✅, time: 20.6s, mem: 27.8GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 24.1s, mem: 14.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --use-flash-attn" # test pass✅, time: 19.0s, mem: 31.9GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 4.5s, mem: 19.5GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 20.5s, mem: 12.1GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 14.7s, mem: 37.1GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 19.2s, mem: 12.2GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 13.2s, mem: 36.4GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 22.2s, mem: 40.8GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B" # test pass✅, time: 64.7s, mem: 40.8GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --load-in-4bit --use-flash-attn" # test pass✅, time: 31.6s, mem: 10.2GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --use-flash-attn" # test pass✅, time: 22.4s, mem: 27.9GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 26.4s, mem: 14.1GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --use-flash-attn" # test pass✅, time: 20.7s, mem: 31.9GB, 8/8 tests passed.
 #CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit).
 #CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --load-in-4bit --use-flash-attn" # test pass✅, time: 16.7s, mem: 21.7GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --use-flash-attn" # test pass✅, time: 10.6s, mem: 67.4GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 202.8s, mem: 24.4GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --use-flash-attn" # test pass✅, time: 174.7s, mem: 70.6GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --load-in-4bit --use-flash-attn" # test pass✅, time: 11.1s, mem: 6.9GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --use-flash-attn" # test pass✅, time: 5.0s, mem: 15.8GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 34.8s, mem: 10.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --use-flash-attn" # test pass✅, time: 14.7s, mem: 19.0GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --load-in-4bit --use-flash-attn" # test pass✅, time: 22.7s, mem: 26.5GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --use-flash-attn" # test pass✅, time: 15.2s, mem: 91.6GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 25.0s, mem: 29.7GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --use-flash-attn" # test pass✅, time: 18.6s, mem: 96.3GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 14.8s, mem: 25.2GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 6.0s, mem: 22.0GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 7.5s, mem: 22.0GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 18.0s, mem: 25.9GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 18.3s, mem: 19.4GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn --device cuda:0" # test pass✅, time: 10.7s, mem: 9.8GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.0s, mem: 7.5GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 14.9s, mem: 20.6GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn --device cuda:0" # test pass✅, time: 10.1s, mem: 11.3GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --load-in-4bit --use-flash-attn" # test fail❌, time: 2.3s, mem: 6.3GB, 0/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --use-flash-attn --device-map cuda:0" # test fail❌, time: 1.8s, mem: 16.0GB, 0/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 10.4s, mem: 9.4GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.4s, mem: 27.2GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 9.1s, mem: 5.9GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.0s, mem: 14.9GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.2s, mem: 8.0GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.0s, mem: 11.8GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn --load-in-4bit" # test pass✅, time: 7.7s, mem: 8.2GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn" # test pass✅, time: 5.1s, mem: 17.8GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m vikhyatk/moondream1" # test pass✅, time: 4.0s, mem: 5.2GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 3.8s, mem: 4.9GB, 8/8 tests passed.
\ No newline at end of file
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --use-flash-attn" # test pass✅, time: 11.0s, mem: 67.4GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 180.1s, mem: 24.4GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --use-flash-attn" # test pass✅, time: 118.0s, mem: 70.5GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --load-in-4bit --use-flash-attn" # test pass✅, time: 11.4s, mem: 6.7GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --use-flash-attn" # test pass✅, time: 5.5s, mem: 15.6GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 35.4s, mem: 9.7GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --use-flash-attn" # test pass✅, time: 15.4s, mem: 18.6GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --load-in-4bit --use-flash-attn" # test pass✅, time: 22.7s, mem: 26.1GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --use-flash-attn" # test pass✅, time: 14.7s, mem: 91.2GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 24.4s, mem: 29.3GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --use-flash-attn" # test pass✅, time: 18.4s, mem: 95.9GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 14.1s, mem: 24.8GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 6.1s, mem: 21.7GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 7.4s, mem: 21.6GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 17.9s, mem: 25.5GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 16.6s, mem: 18.8GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn --device cuda:0" # test pass✅, time: 11.2s, mem: 9.2GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 5.3s, mem: 7.1GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 16.8s, mem: 20.0GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn --device cuda:0" # test pass✅, time: 10.7s, mem: 10.7GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --load-in-4bit --use-flash-attn" # test fail❌, time: 2.3s, mem: 5.8GB, 0/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --use-flash-attn --device-map cuda:0" # test fail❌, time: 1.8s, mem: 15.5GB, 0/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 10.0s, mem: 8.8GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.7s, mem: 26.6GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 9.1s, mem: 5.3GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 5.5s, mem: 14.3GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 --use-flash-attn --device-map cuda:0" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit).
+#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 5.9s, mem: 7.4GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.6s, mem: 11.2GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn --load-in-4bit" # test pass✅, time: 7.7s, mem: 7.5GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn" # test pass✅, time: 5.2s, mem: 17.1GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m vikhyatk/moondream1" # test pass✅, time: 4.0s, mem: 4.6GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 4.1s, mem: 4.5GB, 8/8 tests passed.
\ No newline at end of file
diff --git a/vision.sample.env b/vision.sample.env
index 642e63e..4838c96 100644
--- a/vision.sample.env
+++ b/vision.sample.env
@@ -2,59 +2,60 @@
 # Copy this file to vision.env and uncomment the model of your choice.
 HF_HOME=hf_home
 #CUDA_VISIBLE_DEVICES=1,0
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V" # test pass✅, time: 6.7s, mem: 19.6GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 21.9s, mem: 78.5GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.9s, mem: 22.6GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 8.2s, mem: 12.8GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty --use-flash-attn --device-map cuda:0" # test pass✅, time: 8.5s, mem: 22.5GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 9.7s, mem: 12.8GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V" # test pass✅, time: 6.8s, mem: 19.6GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 21.5s, mem: 78.6GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.8s, mem: 22.6GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 8.1s, mem: 12.8GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty --use-flash-attn --device-map cuda:0" # test pass✅, time: 8.5s, mem: 22.6GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 9.7s, mem: 12.9GB, 8/8 tests passed.
 #CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 13.3s, mem: 52.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --load-in-4bit --device-map cuda:0" # test fail❌, time: 17.1s, mem: 18.3GB, 2/8 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 26.6s, mem: 32.2GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 3.5s, mem: 7.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 4.7s, mem: 19.6GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 20.1s, mem: 12.2GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 14.7s, mem: 37.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 19.5s, mem: 12.2GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 13.9s, mem: 36.6GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 22.3s, mem: 40.9GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B" # test pass✅, time: 64.4s, mem: 40.9GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 6.9s, mem: 20.6GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.3s, mem: 17.8GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 5.0s, mem: 18.4GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn --load-in-4bit" # test pass✅, time: 7.2s, mem: 6.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test pass✅, time: 4.2s, mem: 8.6GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 13.9s, mem: 25.3GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 6.1s, mem: 22.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 7.7s, mem: 22.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 18.1s, mem: 26.0GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 16.9s, mem: 19.4GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn" # test pass✅, time: 9.2s, mem: 9.7GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 5.3s, mem: 7.5GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 19.4s, mem: 20.4GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn" # test pass✅, time: 12.0s, mem: 11.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --load-in-4bit --use-flash-attn" # test fail❌, time: 2.3s, mem: 6.1GB, 0/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --use-flash-attn --device-map cuda:0" # test fail❌, time: 1.8s, mem: 15.9GB, 0/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 10.4s, mem: 9.3GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.4s, mem: 27.0GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 9.0s, mem: 5.8GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 5.8s, mem: 14.7GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 53.5s, mem: 26.3GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn" # test pass✅, time: 46.2s, mem: 72.6GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 18.8s, mem: 9.3GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn" # test pass✅, time: 12.7s, mem: 19.4GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 12.0s, mem: 15.0GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn" # test pass✅, time: 9.5s, mem: 32.7GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 13.0s, mem: 9.9GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn" # test pass✅, time: 8.1s, mem: 19.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct --use-flash-attn --load-in-4bit" # test pass✅, time: 7.9s, mem: 7.3GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct --use-flash-attn" # test pass✅, time: 6.3s, mem: 12.5GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.2s, mem: 7.8GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.1s, mem: 11.7GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m qihoo360/360VL-70B --use-flash-attn --load-in-4bit" # test fail❌, time: 4.1s, mem: 39.4GB, Test failed with Exception: Internal Server Error
-#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn --load-in-4bit" # test pass✅, time: 8.4s, mem: 8.0GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn" # test pass✅, time: 5.5s, mem: 17.6GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.4s, mem: 8.2GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --load-in-4bit --device-map cuda:0" # test pass✅, time: 9.7s, mem: 7.8GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 6.1s, mem: 19.4GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 3.9s, mem: 4.8GB, 8/8 tests passed.
\ No newline at end of file
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --load-in-4bit --device-map cuda:0" # test fail❌, time: 16.9s, mem: 18.3GB, 2/8 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 27.2s, mem: 32.0GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 3.8s, mem: 7.0GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 4.7s, mem: 19.5GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 20.4s, mem: 12.3GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 14.9s, mem: 37.2GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 19.2s, mem: 12.0GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 14.1s, mem: 36.2GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 22.2s, mem: 40.5GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B" # test pass✅, time: 64.5s, mem: 40.6GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 6.4s, mem: 20.2GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.3s, mem: 17.4GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 4.7s, mem: 18.0GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn --load-in-4bit" # test pass✅, time: 7.3s, mem: 5.6GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test pass✅, time: 4.0s, mem: 8.2GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 13.9s, mem: 24.9GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 6.1s, mem: 21.7GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 7.8s, mem: 21.7GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 18.0s, mem: 25.7GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 15.8s, mem: 19.0GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn" # test pass✅, time: 9.7s, mem: 9.4GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 5.0s, mem: 7.2GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 15.3s, mem: 20.2GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn" # test pass✅, time: 12.6s, mem: 11.0GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --load-in-4bit --use-flash-attn" # test fail❌, time: 2.2s, mem: 6.0GB, 0/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --use-flash-attn --device-map cuda:0" # test fail❌, time: 1.8s, mem: 15.7GB, 0/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 10.4s, mem: 9.0GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.7s, mem: 26.8GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 9.3s, mem: 5.7GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 5.9s, mem: 14.5GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 52.1s, mem: 26.1GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn" # test pass✅, time: 46.1s, mem: 72.4GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 19.3s, mem: 9.1GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn" # test pass✅, time: 12.6s, mem: 19.1GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 12.2s, mem: 14.8GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn" # test pass✅, time: 9.3s, mem: 32.5GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 13.6s, mem: 9.7GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn" # test pass✅, time: 8.9s, mem: 18.8GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct --use-flash-attn --load-in-4bit" # test pass✅, time: 8.0s, mem: 7.1GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct --use-flash-attn" # test pass✅, time: 6.4s, mem: 12.2GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 --use-flash-attn --device-map cuda:0" # test pass✅, time: 8.2s, mem: 22.0GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 5.9s, mem: 7.6GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.7s, mem: 11.4GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m qihoo360/360VL-70B --use-flash-attn --load-in-4bit" # test fail❌, time: 4.7s, mem: 39.2GB, Test failed with Exception: Internal Server Error
+#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn --load-in-4bit" # test pass✅, time: 8.6s, mem: 7.8GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn" # test pass✅, time: 5.7s, mem: 17.4GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.0s, mem: 8.0GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --load-in-4bit --device-map cuda:0" # test pass✅, time: 9.6s, mem: 7.6GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 6.4s, mem: 19.0GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 3.9s, mem: 4.4GB, 8/8 tests passed.
\ No newline at end of file
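
For reference, the sample files above are used exactly as their header comment says: copy the sample to vision.env and uncomment the model of your choice. A minimal shell sketch, assuming a checkout of this repo; the moondream2 entry is just one of the tested lines from the table above, not a recommendation:

    # create a working config from the sample, per the comment at the top of the file
    cp vision.sample.env vision.env

    # in vision.env, remove the leading '#' from exactly one CLI_COMMAND line, e.g.:
    CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn"

    # the quoted command can also be run directly from a checkout
    python vision.py -m vikhyatk/moondream2 --use-flash-attn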