diff --git a/README.md b/README.md
index bda994e..3816e0e 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview`
 - - [X] [idefics2-8b-AWQ](https://huggingface.co/HuggingFaceM4/idefics2-8b-AWQ) (main docker only, wont gpu split)
 - [X] [qihoo360](https://huggingface.co/qihoo360)
 - - [X] [360VL-8B](https://huggingface.co/qihoo360/360VL-8B)
-- - [X] [360VL-70B](https://huggingface.co/qihoo360/360VL-70B) (loading error, [see note](https://huggingface.co/qihoo360/360VL-70B/discussions/1))
+- - [X] [360VL-70B](https://huggingface.co/qihoo360/360VL-70B) (loading error, [see note](https://huggingface.co/qihoo360/360VL-70B/discussions/1), also too large for me to test)
 - [X] [LlavaNext](https://huggingface.co/llava-hf) (main docker only)
 - - [X] [llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) (main docker only)
 - - [X] [llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (main docker only)
@@ -81,7 +81,7 @@ See: [OpenVLM Leaderboard](https://huggingface.co/spaces/opencompass/open_vlm_le
 Version: 0.14.0
 
 - docker-compose.yml: Assume the runtime supports the device (ie. nvidia)
-- new model support: qihoo360/360VL-8B, qihoo360/360VL-70B (70B loading error, [see note](https://huggingface.co/qihoo360/360VL-70B/discussions/1))
+- new model support: qihoo360/360VL-8B, qihoo360/360VL-70B (70B loading error, [see note](https://huggingface.co/qihoo360/360VL-70B/discussions/1), also too large for me to test)
 - new model support: BAAI/Emu2-Chat, Can be slow to load, may need --max-memory option control the loading on multiple gpus
 - new model support: TIGER-Labs/Mantis: Mantis-8B-siglip-llama3, Mantis-8B-clip-llama3, Mantis-8B-Fuyu
 
diff --git a/model_conf_tests.alt.json b/model_conf_tests.alt.json
index d718e9e..794457f 100644
--- a/model_conf_tests.alt.json
+++ b/model_conf_tests.alt.json
@@ -16,7 +16,7 @@
   ["YanweiLi/MGM-34B-HD", "--use-flash-attn"],
   ["YanweiLi/MGM-8x7B", "--use-flash-attn"],
   ["YanweiLi/MGM-8x7B-HD", "--use-flash-attn"],
-  ["BAAI/Bunny-Llama-3-8B-V"],
+  ["qihoo360/360VL-8B", "--use-flash-attn"],
   ["adept/fuyu-8b", "--device-map", "cuda:0"],
   ["internlm/internlm-xcomposer2-4khd-7b", "--use-flash-attn", "--device-map", "cuda:0"],
   ["internlm/internlm-xcomposer2-7b", "--use-flash-attn", "--device-map", "cuda:0"],
@@ -31,6 +31,7 @@
   ["OpenGVLab/InternVL-Chat-V1-5", "--load-in-4bit", "--device-map", "cuda:0"],
   ["THUDM/cogvlm-chat-hf", "--load-in-4bit"],
   ["THUDM/cogagent-chat-hf", "--load-in-4bit"],
+  ["qihoo360/360VL-8B", "--use-flash-attn", "--load-in-4bit"],
   ["internlm/internlm-xcomposer2-7b-4bit", "--use-flash-attn", "--device", "cuda:0"],
   ["internlm/internlm-xcomposer2-vl-7b-4bit", "--use-flash-attn", "--device", "cuda:0"],
   ["llava-hf/bakLlava-v1-hf", "--load-in-4bit", "--use-flash-attn"],
diff --git a/model_conf_tests.json b/model_conf_tests.json
index 1db5a69..8d895d7 100644
--- a/model_conf_tests.json
+++ b/model_conf_tests.json
@@ -1,20 +1,15 @@
 [
-  ["qihoo360/360VL-8B", "--use-flash-attn"],
-  ["qihoo360/360VL-70B", "--use-flash-attn"],
-  ["BAAI/Emu2-Chat", "--max-memory=0:78GiB,1:20GiB"],
-  ["BAAI/Emu2-Chat", "--load-in-4bit", "--device-map", "cuda:0"],
   ["vikhyatk/moondream2", "--use-flash-attn"],
-  ["vikhyatk/moondream1"],
   ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0"],
   ["HuggingFaceM4/idefics2-8b", "--use-flash-attn", "--device-map", "cuda:0"],
   ["qihoo360/360VL-8B", "--use-flash-attn"],
-  ["qihoo360/360VL-70B", "--use-flash-attn"],
["qnguyen3/nanoLLaVA", "--use-flash-attn", "--device-map", "cuda:0"], ["echo840/Monkey"], ["echo840/Monkey-Chat"], ["THUDM/cogvlm-chat-hf"], ["THUDM/cogagent-chat-hf"], ["Qwen/Qwen-VL-Chat"], + ["BAAI/Emu2-Chat", "--max-memory=0:78GiB,1:20GiB"], ["BAAI/Bunny-Llama-3-8B-V"], ["qresearch/llama-3-vision-alpha-hf", "--device", "cuda:0"], ["TIGER-Lab/Mantis-8B-siglip-llama3", "--use-flash-attn", "--device-map", "cuda:0"], @@ -33,8 +28,6 @@ ["llava-hf/llava-v1.6-vicuna-7b-hf", "--use-flash-attn"], ["llava-hf/llava-v1.6-vicuna-13b-hf", "--use-flash-attn"], ["llava-hf/llava-v1.6-34b-hf", "--use-flash-attn"], - ["01-ai/Yi-VL-6B", "--use-flash-attn"], - ["01-ai/Yi-VL-34B", "--use-flash-attn"], ["YanweiLi/MGM-2B", "--use-flash-attn"], ["OpenGVLab/InternVL-Chat-V1-5-Int8", "--device-map", "cuda:0"], @@ -54,7 +47,5 @@ ["llava-hf/llava-v1.6-vicuna-7b-hf", "--load-in-4bit", "--use-flash-attn"], ["llava-hf/llava-v1.6-vicuna-13b-hf", "--load-in-4bit", "--use-flash-attn"], ["llava-hf/llava-v1.6-34b-hf", "--load-in-4bit", "--use-flash-attn"], - ["01-ai/Yi-VL-6B", "--load-in-4bit", "--use-flash-attn"], - ["01-ai/Yi-VL-34B", "--load-in-4bit", "--use-flash-attn"], ["YanweiLi/MGM-2B", "--use-flash-attn", "--load-in-4bit"] ] diff --git a/vision-alt.sample.env b/vision-alt.sample.env index b26a21b..0be64c6 100644 --- a/vision-alt.sample.env +++ b/vision-alt.sample.env @@ -2,46 +2,48 @@ # Copy this file to vision.env and uncomment the model of your choice. HF_HOME=hf_home #CUDA_VISIBLE_DEVICES=1,0 -#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 4.3s, mem: 4.6GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m vikhyatk/moondream1" # test pass✅, time: 4.2s, mem: 4.9GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 6.2s, mem: 21.8GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 4.8s, mem: 4.6GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m vikhyatk/moondream1" # test pass✅, time: 4.6s, mem: 4.9GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 6.3s, mem: 21.8GB, 8/8 tests passed. #CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 8.0s, mem: 21.8GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 13.5s, mem: 52.0GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 14.0s, mem: 36.2GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 15.0s, mem: 37.1GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 5.0s, mem: 19.5GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 13.4s, mem: 52.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 14.2s, mem: 36.3GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 15.2s, mem: 37.2GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 4.8s, mem: 19.5GB, 8/8 tests passed. #CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --use-flash-attn" # test pass✅, time: 5.7s, mem: 15.6GB, 8/8 tests passed. 
-#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --use-flash-attn" # test pass✅, time: 15.1s, mem: 18.8GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --use-flash-attn" # test pass✅, time: 21.7s, mem: 27.6GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --use-flash-attn" # test pass✅, time: 16.4s, mem: 31.8GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --use-flash-attn" # test pass✅, time: 11.2s, mem: 67.2GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --use-flash-attn" # test pass✅, time: 100.7s, mem: 70.4GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --use-flash-attn" # test pass✅, time: 15.1s, mem: 91.4GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --use-flash-attn" # test pass✅, time: 18.8s, mem: 96.1GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 14.6s, mem: 25.0GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 18.4s, mem: 25.7GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 16.8s, mem: 19.0GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 14.4s, mem: 20.2GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.9s, mem: 11.5GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.1s, mem: 7.6GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --use-flash-attn --device-map cuda:0" # test fail❌, time: 2.0s, mem: 15.7GB, 0/8 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 5.7s, mem: 14.5GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.8s, mem: 26.9GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 27.1s, mem: 31.3GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --load-in-4bit --device-map cuda:0" # test fail❌, time: 17.2s, mem: 18.2GB, 2/8 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 20.1s, mem: 12.1GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 21.4s, mem: 12.1GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn --device cuda:0" # test pass✅, time: 10.9s, mem: 9.4GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn --device cuda:0" # test pass✅, time: 11.6s, mem: 10.9GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --load-in-4bit --use-flash-attn" # test fail❌, time: 2.5s, mem: 6.0GB, 0/8 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 9.4s, mem: 5.6GB, 8/8 tests passed. 
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 10.7s, mem: 9.0GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --load-in-4bit --use-flash-attn" # test pass✅, time: 10.6s, mem: 6.6GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 35.9s, mem: 9.8GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --load-in-4bit --use-flash-attn" # test pass✅, time: 37.9s, mem: 10.0GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 23.5s, mem: 14.0GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --load-in-4bit --use-flash-attn" # test pass✅, time: 17.2s, mem: 21.5GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 161.9s, mem: 24.2GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --load-in-4bit --use-flash-attn" # test pass✅, time: 21.6s, mem: 26.3GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 25.5s, mem: 29.5GB, 8/8 tests passed. \ No newline at end of file +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --use-flash-attn" # test pass✅, time: 6.1s, mem: 15.6GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --use-flash-attn" # test pass✅, time: 14.9s, mem: 18.8GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --use-flash-attn" # test pass✅, time: 18.8s, mem: 27.6GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --use-flash-attn" # test pass✅, time: 18.2s, mem: 31.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --use-flash-attn" # test pass✅, time: 11.3s, mem: 67.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --use-flash-attn" # test pass✅, time: 196.6s, mem: 70.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --use-flash-attn" # test pass✅, time: 16.0s, mem: 91.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --use-flash-attn" # test pass✅, time: 19.7s, mem: 96.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn" # test pass✅, time: 5.6s, mem: 17.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 14.1s, mem: 25.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 18.8s, mem: 25.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 16.0s, mem: 19.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 17.3s, mem: 20.3GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.0s, mem: 11.6GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.5s, mem: 7.8GB, 8/8 tests passed. 
+#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --use-flash-attn --device-map cuda:0" # test fail❌, time: 2.0s, mem: 15.9GB, 0/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.0s, mem: 14.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.8s, mem: 27.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 27.5s, mem: 31.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --load-in-4bit --device-map cuda:0" # test fail❌, time: 17.5s, mem: 18.3GB, 2/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 20.2s, mem: 12.3GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 20.9s, mem: 12.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn --load-in-4bit" # test pass✅, time: 8.6s, mem: 7.8GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn --device cuda:0" # test pass✅, time: 11.2s, mem: 9.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn --device cuda:0" # test pass✅, time: 13.2s, mem: 10.9GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --load-in-4bit --use-flash-attn" # test fail❌, time: 2.4s, mem: 5.9GB, 0/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 9.9s, mem: 5.5GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 11.0s, mem: 9.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --load-in-4bit --use-flash-attn" # test pass✅, time: 10.4s, mem: 6.6GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 35.8s, mem: 9.8GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --load-in-4bit --use-flash-attn" # test pass✅, time: 37.2s, mem: 10.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 31.3s, mem: 14.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --load-in-4bit --use-flash-attn" # test pass✅, time: 17.3s, mem: 21.5GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 213.9s, mem: 24.2GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --load-in-4bit --use-flash-attn" # test pass✅, time: 23.8s, mem: 26.2GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 25.6s, mem: 29.6GB, 8/8 tests passed. \ No newline at end of file diff --git a/vision.py b/vision.py index 9d25560..9167d15 100644 --- a/vision.py +++ b/vision.py @@ -78,19 +78,6 @@ def parse_args(argv=None): if __name__ == "__main__": args = parse_args(sys.argv[1:]) - if args.model in ['01-ai/Yi-VL-34B', '01-ai/Yi-VL-6B']: - if False: - # 💩 fake wrapper for compatibility... but it doesn't work anyways? 
-            # OSError: Incorrect path_or_model_id: '01-ai/Yi-VL-6B/vit/clip-vit-H-14-laion2B-s32B-b79K-yi-vl-6B-448'. Please provide either the path to a local folder or the repo_id of a model on the Hub.
-
-            os.chdir("Yi/VL")
-            os.environ['PYTHONPATH'] = '.'
-            os.system(f"huggingface-cli download --quiet {args.model} --local-dir {args.model}")
-            os.execvp("python", ["python", "openai_api.py", "--model-path", args.model, "--port", f"{args.port}", "--host", args.host])
-            sys.exit(0) # not reached
-        else:
-            os.system(f"huggingface-cli download --quiet {args.model} --local-dir {args.model}")
-
     if not args.backend:
         args.backend = guess_backend(args.model)
 
diff --git a/vision.sample.env b/vision.sample.env
index e87cc25..4941c78 100644
--- a/vision.sample.env
+++ b/vision.sample.env
@@ -3,45 +3,49 @@
 HF_HOME=hf_home
 #CUDA_VISIBLE_DEVICES=1,0
 #CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 4.1s, mem: 4.6GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m vikhyatk/moondream1" # test fail❌, time: 3.6s, mem: 4.8GB, Test failed with Exception: Internal Server Error
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 13.5s, mem: 51.9GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.1s, mem: 22.3GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.9s, mem: 8.0GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 6.5s, mem: 21.7GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 8.2s, mem: 21.7GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 14.1s, mem: 36.2GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 15.3s, mem: 37.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 5.0s, mem: 19.4GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 14.4s, mem: 24.8GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 16.6s, mem: 18.9GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 16.5s, mem: 20.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.9s, mem: 11.4GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.1s, mem: 7.5GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --use-flash-attn --device-map cuda:0" # test fail❌, time: 2.0s, mem: 15.6GB, 0/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.1s, mem: 14.4GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.8s, mem: 26.7GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn" # test pass✅, time: 12.8s, mem: 19.1GB, 8/8 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn" # test pass✅, time: 8.6s, mem: 18.9GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn" # test pass✅, time: 9.6s, mem: 32.6GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 13.4s, mem: 52.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.9s, mem: 22.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn" # test pass✅, time: 5.7s, mem: 17.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.6s, mem: 8.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 6.2s, mem: 21.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 8.1s, mem: 21.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 14.4s, mem: 36.6GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 15.3s, mem: 37.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 4.9s, mem: 19.8GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 20.5s, mem: 78.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V" # test pass✅, time: 6.8s, mem: 19.9GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 6.8s, mem: 19.5GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 4.9s, mem: 18.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.4s, mem: 17.8GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 6.5s, mem: 20.6GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 13.2s, mem: 25.2GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 18.4s, mem: 26.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 17.3s, mem: 19.3GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 15.5s, mem: 20.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.1s, mem: 11.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.4s, mem: 7.8GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --use-flash-attn --device-map cuda:0" # test fail❌, time: 2.0s, mem: 15.9GB, 0/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.1s, mem: 14.8GB, 8/8 tests passed. 
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.6s, mem: 27.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn" # test pass✅, time: 12.9s, mem: 19.5GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn" # test pass✅, time: 8.9s, mem: 19.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn" # test pass✅, time: 9.5s, mem: 32.6GB, 8/8 tests passed. #CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn" # test pass✅, time: 46.2s, mem: 72.4GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-6B --use-flash-attn" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). -#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-34B --use-flash-attn" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test pass✅, time: 4.5s, mem: 8.3GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 27.6s, mem: 32.0GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --load-in-4bit --device-map cuda:0" # test fail❌, time: 17.0s, mem: 18.2GB, 2/8 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 8.2s, mem: 12.7GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --load-in-4bit --device-map cuda:0" # test pass✅, time: 11.2s, mem: 7.6GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 19.4s, mem: 12.1GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 20.6s, mem: 12.2GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn" # test pass✅, time: 11.2s, mem: 9.5GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn" # test pass✅, time: 11.3s, mem: 10.9GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --load-in-4bit --use-flash-attn" # test fail❌, time: 2.5s, mem: 5.9GB, 0/8 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 9.3s, mem: 5.7GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 10.2s, mem: 9.0GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 19.1s, mem: 9.1GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 13.6s, mem: 9.7GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 12.2s, mem: 14.8GB, 8/8 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 54.1s, mem: 26.1GB, 8/8 tests passed. 
-#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-6B --load-in-4bit --use-flash-attn" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). -#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-34B --load-in-4bit --use-flash-attn" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn --load-in-4bit" # test pass✅, time: 7.4s, mem: 5.7GB, 8/8 tests passed. \ No newline at end of file +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test pass✅, time: 4.4s, mem: 8.3GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 26.9s, mem: 32.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --load-in-4bit --device-map cuda:0" # test fail❌, time: 17.2s, mem: 18.2GB, 2/8 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 8.0s, mem: 12.6GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn --load-in-4bit" # test pass✅, time: 8.5s, mem: 7.8GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --load-in-4bit --device-map cuda:0" # test pass✅, time: 10.5s, mem: 7.6GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 19.5s, mem: 12.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 20.9s, mem: 12.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn" # test pass✅, time: 11.4s, mem: 9.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn" # test pass✅, time: 12.3s, mem: 10.9GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --load-in-4bit --use-flash-attn" # test fail❌, time: 2.4s, mem: 5.9GB, 0/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 9.7s, mem: 5.6GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 10.1s, mem: 9.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 19.0s, mem: 9.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 13.3s, mem: 9.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 12.5s, mem: 14.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 52.8s, mem: 26.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn --load-in-4bit" # test pass✅, time: 7.6s, mem: 5.7GB, 8/8 tests passed. \ No newline at end of file