From 96cc4e33b69a3dac5b3a3268055c254ef13a857f Mon Sep 17 00:00:00 2001 From: matatonic Date: Sun, 22 Sep 2024 17:43:29 -0400 Subject: [PATCH] 0.33.0 --- Dockerfile | 2 +- README.md | 33 ++---- backend/qwen2-vl.py | 4 + model_conf_tests.alt.json | 6 + model_conf_tests.json | 6 - requirements.txt | 3 +- vision-alt.sample.env | 54 +++++---- vision.sample.env | 231 +++++++++++++++++++------------------- 8 files changed, 168 insertions(+), 171 deletions(-) diff --git a/Dockerfile b/Dockerfile index 04e2ef7..4c06d41 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,7 +10,7 @@ RUN git clone https://github.com/togethercomputer/Dragonfly --single-branch /app COPY requirements.txt . ARG VERSION=latest -RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.41.2" >> requirements.txt; else echo "git+https://github.com/huggingface/transformers\nautoawq>=0.2.5" >> requirements.txt ; fi +RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.41.2" >> requirements.txt; else echo "git+https://github.com/huggingface/transformers" >> requirements.txt ; fi # TODO: nvidia apex wheel RUN --mount=type=cache,target=/root/.cache/pip pip install -U -r requirements.txt diff --git a/README.md b/README.md index 3a70ef0..33fbc4c 100644 --- a/README.md +++ b/README.md @@ -38,10 +38,10 @@ Can't decide which to use? See the [OpenVLM Leaderboard](https://huggingface.co/ - [X] [fancyfeast/joy-caption-pre-alpha](https://huggingface.co/spaces/fancyfeast/joy-caption-pre-alpha) (caption only) - [X] [fuyu-8b](https://huggingface.co/adept/fuyu-8b) [pretrain] - [X] [HuggingFaceM4/idefics2](https://huggingface.co/HuggingFaceM4) -- - [X] [idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b) (wont gpu split) -- - [X] [idefics2-8b-AWQ](https://huggingface.co/HuggingFaceM4/idefics2-8b-AWQ) (wont gpu split) -- - [X] [idefics2-8b-chatty](https://huggingface.co/HuggingFaceM4/idefics2-8b-chatty) (wont gpu split) -- - [X] [idefics2-8b-chatty-AWQ](https://huggingface.co/HuggingFaceM4/idefics2-8b-chatty-AWQ) (wont gpu split) +- - [X] [idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b) (wont gpu split, alternate docker only) +- - [X] [idefics2-8b-AWQ](https://huggingface.co/HuggingFaceM4/idefics2-8b-AWQ) (wont gpu split, alternate docker only) +- - [X] [idefics2-8b-chatty](https://huggingface.co/HuggingFaceM4/idefics2-8b-chatty) (wont gpu split, alternate docker only) +- - [X] [idefics2-8b-chatty-AWQ](https://huggingface.co/HuggingFaceM4/idefics2-8b-chatty-AWQ) (wont gpu split, alternate docker only) - [X] [InternLM](https://huggingface.co/internlm/) - - [X] [XComposer2-2d5-7b](https://huggingface.co/internlm/internlm-xcomposer2d5-7b) (wont gpu split) - - [X] [XComposer2-4KHD-7b](https://huggingface.co/internlm/internlm-xcomposer2-4khd-7b) (wont gpu split) @@ -148,7 +148,9 @@ If you can't find your favorite model, you can [open a new issue](https://github Version 0.33.0 - new model support: mx262/MiniMonkey, thanks [@white2018](https://github.com/white2018) -- Fix qwen2-vl when used with qwen-agent and multiple system prompts (tools), thanks [@cedonley](https://github.com/cedonley) +- Fix Qwen2-VL when used with Qwen-Agent and multiple system prompts (tools), thanks [@cedonley](https://github.com/cedonley) +- idefics2-8b support moved to alt image +- pin Qwen2-VL-7B-Instruct-AWQ revision, [see note for info](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ/discussions/4) Version 0.32.0 @@ -359,7 +361,7 @@ docker compose -f docker-compose.alt.yml pull python -m venv .venv source .venv/bin/activate # install the 
python dependencies -pip install -U -r requirements.txt "git+https://github.com/huggingface/transformers" "autoawq>=0.2.5" +pip install -U -r requirements.txt "git+https://github.com/huggingface/transformers" # OR install the python dependencies for the alt version pip install -U -r requirements.txt "transformers==4.41.2" # run the server with your chosen model @@ -484,19 +486,6 @@ CUDA_VISIBLE_DEVICES=1,0 python vision.py -m llava-hf/llava-v1.6-34b-hf --device You can also use the environment variable: `OPENEDAI_DEVICE_MAP="sequential"` to specify the `--device-map` argument. -4. "My Nvidia GPU isn't detected when using docker." -- On Linux, you may need to specify the default runtime for your container environment (and perhaps install the nvidia-container-runtime), like so: -In /etc/docker/daemon.json: -```json -{ - "runtimes": { - "nvidia": { - "path": "nvidia-container-runtime", - "runtimeArgs": [] - } - }, - "default-runtime": "nvidia" -} -``` -- In Windows, be sure you have WSL2 installed and docker is configured to use it. Also make sure your nvidia drivers are up to date. - +4. "My Nvidia GPU isn't detected when using docker.", using Nvidia CUDA with docker. +- Linux: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html +- Windows: Use WSL2 with docker and nvidia drivers: https://docs.nvidia.com/cuda/wsl-user-guide/index.html diff --git a/backend/qwen2-vl.py b/backend/qwen2-vl.py index 528463d..f0f3b06 100644 --- a/backend/qwen2-vl.py +++ b/backend/qwen2-vl.py @@ -25,6 +25,10 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p self.processor = AutoProcessor.from_pretrained(model_id) del self.params['trust_remote_code'] + + if model_id == 'Qwen/Qwen2-VL-7B-Instruct-AWQ': + self.params['revision'] = '9d72ae62396aaa1817b006e07ddbbd121024f50d' # re: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ/discussions/4 + self.model = Qwen2VLForConditionalGeneration.from_pretrained(**self.params).eval() self.loaded_banner() diff --git a/model_conf_tests.alt.json b/model_conf_tests.alt.json index 96aebe3..c2722ad 100644 --- a/model_conf_tests.alt.json +++ b/model_conf_tests.alt.json @@ -1,4 +1,10 @@ [ + ["HuggingFaceM4/idefics2-8b", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], + ["HuggingFaceM4/idefics2-8b", "-A", "flash_attention_2", "--device-map", "cuda:0"], + ["HuggingFaceM4/idefics2-8b-AWQ", "-A", "flash_attention_2", "--device-map", "cuda:0"], + ["HuggingFaceM4/idefics2-8b-chatty", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], + ["HuggingFaceM4/idefics2-8b-chatty", "-A", "flash_attention_2", "--device-map", "cuda:0"], + ["HuggingFaceM4/idefics2-8b-chatty-AWQ", "-A", "flash_attention_2", "--device-map", "cuda:0"], ["OpenGVLab/InternVL2-4B", "--device-map", "cuda:0", "--load-in-4bit"], ["OpenGVLab/InternVL2-4B", "--device-map", "cuda:0"], ["OpenGVLab/Mini-InternVL-Chat-4B-V1-5", "--max-tiles", "40", "--load-in-4bit"], diff --git a/model_conf_tests.json b/model_conf_tests.json index 95a864a..e6b285a 100644 --- a/model_conf_tests.json +++ b/model_conf_tests.json @@ -16,12 +16,6 @@ ["BAAI/Bunny-v1_1-Llama-3-8B-V"], ["BAAI/Emu2-Chat", "--load-in-4bit"], ["BAAI/Emu2-Chat", "--max-memory=0:78GiB,1:20GiB"], - ["HuggingFaceM4/idefics2-8b", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], - ["HuggingFaceM4/idefics2-8b", "-A", "flash_attention_2", "--device-map", "cuda:0"], - ["HuggingFaceM4/idefics2-8b-AWQ", "-A", "flash_attention_2", "--device-map", 
"cuda:0"], - ["HuggingFaceM4/idefics2-8b-chatty", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"], - ["HuggingFaceM4/idefics2-8b-chatty", "-A", "flash_attention_2", "--device-map", "cuda:0"], - ["HuggingFaceM4/idefics2-8b-chatty-AWQ", "-A", "flash_attention_2", "--device-map", "cuda:0"], ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--load-in-4bit"], ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--max-tiles", "40", "--load-in-4bit"], ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--max-tiles", "40"], diff --git a/requirements.txt b/requirements.txt index e03778e..8a2319b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ accelerate auto_gptq +autoawq bitsandbytes fastapi # See: https://github.com/bdashore3/flash-attention/releases for other windows flash_attn releases @@ -54,4 +55,4 @@ git+https://github.com/LLaVA-VL/LLaVA-NeXT.git # mistral mistral_inference>=1.4.0 -mistral_common>=1.4.0 +mistral_common[opencv]>=1.4.3 diff --git a/vision-alt.sample.env b/vision-alt.sample.env index 0233fc6..57f5f8d 100644 --- a/vision-alt.sample.env +++ b/vision-alt.sample.env @@ -4,27 +4,33 @@ HF_HOME=hf_home HF_HUB_ENABLE_HF_TRANSFER=1 #HF_TOKEN=hf-... #CUDA_VISIBLE_DEVICES=1,0 -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.3s, mem: 5.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0" # test pass✅, time: 8.5s, mem: 10.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 10.2s, mem: 9.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --load-in-4bit" # test pass✅, time: 9.7s, mem: 6.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5" # test pass✅, time: 7.9s, mem: 11.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 27.3s, mem: 13.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 23.3s, mem: 37.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 28.3s, mem: 12.9GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 20.5s, mem: 36.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B --load-in-4bit" # test pass✅, time: 33.1s, mem: 22.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 31.4s, mem: 40.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B --load-in-4bit" # test pass✅, time: 137.4s, mem: 22.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B" # test pass✅, time: 99.7s, mem: 40.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-72b --use-flash-attn --load-in-4bit --device-map cuda:0" # test pass✅, time: 52.7s, mem: 48.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-7b --use-flash-attn --load-in-4bit --device-map cuda:0" # test pass✅, time: 27.4s, mem: 8.8GB, 13/13 tests passed. 
-#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-7b --use-flash-attn --device-map cuda:0" # test fail❌, time: 8.4s, mem: 16.5GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 23.6s, mem: 8.0GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn" # test pass✅, time: 18.3s, mem: 17.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.2s, mem: 4.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.9s, mem: 8.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 3.3s, mem: 3.4GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 9.8s, mem: 8.9GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn --load-in-4bit" # test pass✅, time: 16.6s, mem: 16.9GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn" # test pass✅, time: 16.5s, mem: 32.1GB, 13/13 tests passed. \ No newline at end of file +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 24.5s, mem: 29.8GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 21.4s, mem: 39.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-AWQ -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 24.7s, mem: 29.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 30.1s, mem: 31.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 24.1s, mem: 40.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty-AWQ -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 26.3s, mem: 31.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.4s, mem: 7.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0" # test pass✅, time: 7.7s, mem: 11.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 9.9s, mem: 10.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --load-in-4bit" # test pass✅, time: 8.9s, mem: 6.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5" # test pass✅, time: 7.2s, mem: 11.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 30.3s, mem: 13.3GB, 13/13 tests passed. 
+#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 22.3s, mem: 37.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 27.3s, mem: 12.7GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 20.7s, mem: 36.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B --load-in-4bit" # test pass✅, time: 36.1s, mem: 22.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 30.5s, mem: 40.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B --load-in-4bit" # test pass✅, time: 118.3s, mem: 22.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B" # test pass✅, time: 123.8s, mem: 40.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-72b -A flash_attention_2 --load-in-4bit --device-map cuda:0" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (timeout). +#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-7b -A flash_attention_2 --load-in-4bit --device-map cuda:0" # test pass✅, time: 32.6s, mem: 8.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-7b -A flash_attention_2 --device-map cuda:0" # test fail❌, time: 6.4s, mem: 16.6GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 20.4s, mem: 8.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf -A flash_attention_2" # test pass✅, time: 16.7s, mem: 17.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.2s, mem: 5.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 6.4s, mem: 8.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.7s, mem: 3.5GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 9.0s, mem: 9.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm -A flash_attention_2 --load-in-4bit" # test pass✅, time: 16.2s, mem: 17.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm -A flash_attention_2" # test pass✅, time: 15.6s, mem: 32.3GB, 13/13 tests passed. diff --git a/vision.sample.env b/vision.sample.env index 5e5a638..e110915 100644 --- a/vision.sample.env +++ b/vision.sample.env @@ -4,120 +4,117 @@ HF_HOME=hf_home HF_HUB_ENABLE_HF_TRANSFER=1 #HF_TOKEN=hf-... #CUDA_VISIBLE_DEVICES=1,0 -#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.5-Gemma2-9B -A flash_attention_2" # test pass✅, time: 9.4s, mem: 23.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.5-Llama3-8B -A flash_attention_2" # test pass✅, time: 5.8s, mem: 19.6GB, 13/13 tests passed. 
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 10.6s, mem: 9.0GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V" # test pass✅, time: 8.0s, mem: 20.0GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh --load-in-4bit" # test pass✅, time: 7.1s, mem: 9.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh" # test pass✅, time: 5.4s, mem: 11.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B --load-in-4bit" # test pass✅, time: 10.2s, mem: 8.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B" # test pass✅, time: 9.1s, mem: 12.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B-zh" # test pass✅, time: 7.8s, mem: 12.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B --load-in-4bit" # test pass✅, time: 8.9s, mem: 5.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B" # test pass✅, time: 6.7s, mem: 12.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B --load-in-4bit" # test pass✅, time: 10.4s, mem: 6.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B" # test pass✅, time: 8.3s, mem: 13.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 11.8s, mem: 9.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V" # test pass✅, time: 10.7s, mem: 20.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --load-in-4bit" # test pass✅, time: 31.7s, mem: 29.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 21.4s, mem: 72.1GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 28.9s, mem: 30.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 23.9s, mem: 39.0GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-AWQ -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 29.6s, mem: 29.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 34.5s, mem: 30.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 27.4s, mem: 38.9GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty-AWQ -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 33.1s, mem: 29.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 23.3s, mem: 27.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40 --load-in-4bit" # test pass✅, time: 29.9s, mem: 30.4GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40" # test pass✅, time: 25.4s, mem: 54.9GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 19.0s, mem: 52.4GB, 13/13 tests passed. 
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 30.2s, mem: 2.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0" # test pass✅, time: 8.0s, mem: 3.0GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 17.4s, mem: 5.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0" # test pass✅, time: 8.7s, mem: 7.1GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.6s, mem: 9.1GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0" # test pass✅, time: 8.0s, mem: 18.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 25.9s, mem: 27.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0" # test pass✅, time: 20.3s, mem: 52.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 37.6s, mem: 31.9GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0" # test pass✅, time: 47.3s, mem: 76.9GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-Llama3-76B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 40.6s, mem: 51.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --load-in-4bit" # test pass✅, time: 6.7s, mem: 5.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 7.5s, mem: 7.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40" # test pass✅, time: 6.9s, mem: 9.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 6.0s, mem: 7.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat --load-in-4bit" # test pass✅, time: 9.8s, mem: 11.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 7.0s, mem: 19.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-2B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 12.4s, mem: 7.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-2B-Instruct -A flash_attention_2" # test pass✅, time: 17.3s, mem: 16.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-7B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 21.2s, mem: 18.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-7B-Instruct -A flash_attention_2" # test pass✅, time: 20.6s, mem: 27.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-72B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 35.2s, mem: 44.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-dpo-r-v1.5" # test pass✅, time: 8.9s, mem: 9.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5" # test pass✅, time: 3.8s, mem: 9.8GB, 13/13 tests passed. 
-#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-singleimg-r-v1.5" # test pass✅, time: 8.7s, mem: 9.4GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-r-v1" # test pass✅, time: 9.2s, mem: 10.0GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0 --load-in-4bit" # test fail❌, time: 3.1s, mem: 15.5GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} -#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0" # test fail❌, time: 2.9s, mem: 27.1GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.3s, mem: 11.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 8.8s, mem: 20.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 11.7s, mem: 8.0GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 8.7s, mem: 17.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.3s, mem: 8.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 6.8s, mem: 18.4GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 21.7s, mem: 16.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 16.7s, mem: 25.4GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat --load-in-4bit" # test pass✅, time: 15.0s, mem: 16.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 11.6s, mem: 21.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha -A flash_attention_2 --load-in-4bit" # test pass✅, time: 10.5s, mem: 7.1GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha -A flash_attention_2" # test pass✅, time: 9.2s, mem: 12.1GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-pre-alpha --load-in-4bit -A flash_attention_2" # test pass✅, time: 107.9s, mem: 9.1GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-pre-alpha -A flash_attention_2" # test pass✅, time: 60.2s, mem: 18.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2d5-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 27.9s, mem: 28.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 23.2s, mem: 21.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 30.0s, mem: 18.8GB, 13/13 tests passed. 
-#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit -A flash_attention_2" # test pass✅, time: 56.5s, mem: 9.0GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 9.1s, mem: 7.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 25.4s, mem: 20.0GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit -A flash_attention_2" # test pass✅, time: 41.9s, mem: 10.4GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 14.2s, mem: 9.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 10.6s, mem: 26.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.4s, mem: 5.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 9.5s, mem: 14.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 66.1s, mem: 22.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf -A flash_attention_2" # test pass✅, time: 87.8s, mem: 67.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 18.8s, mem: 12.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf -A flash_attention_2" # test pass✅, time: 17.6s, mem: 29.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 20.9s, mem: 8.0GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf -A flash_attention_2" # test pass✅, time: 16.2s, mem: 16.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m lmms-lab/llava-onevision-qwen2-0.5b-ov -A flash_attention_2" # test pass✅, time: 8.6s, mem: 15.4GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m lmms-lab/llava-onevision-qwen2-7b-ov -A flash_attention_2" # test pass✅, time: 18.5s, mem: 28.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.6s, mem: 1.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 3.1s, mem: 1.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.6s, mem: 1.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 2.9s, mem: 2.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 12.6s, mem: 6.8GB, 13/13 tests passed. 
-#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct -A flash_attention_2" # test pass✅, time: 9.0s, mem: 11.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 9.3s, mem: 4.7GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2" # test pass✅, time: 8.3s, mem: 9.5GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m mistralai/Pixtral-12B-2409" # test pass✅, time: 16.6s, mem: 35.9GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m omlab/omchat-v2.0-13B-single-beta_hf -A flash_attention_2" # test pass✅, time: 27.9s, mem: 41.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6-int4 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 15.9s, mem: 9.6GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 14.7s, mem: 9.9GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 12.8s, mem: 19.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 27.0s, mem: 9.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 25.2s, mem: 19.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B -A flash_attention_2 --load-in-4bit" # test pass✅, time: 13.2s, mem: 8.4GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B -A flash_attention_2" # test pass✅, time: 7.9s, mem: 17.3GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.6s, mem: 7.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 6.2s, mem: 8.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.6s, mem: 7.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 6.4s, mem: 8.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0 --load-in-4bit" # test pass✅, time: 12.0s, mem: 6.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 8.8s, mem: 16.9GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1 --load-in-4bit" # test pass✅, time: 10.8s, mem: 8.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1" # test pass✅, time: 9.4s, mem: 17.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1 --load-in-4bit" # test pass✅, time: 13.3s, mem: 8.2GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1" # test pass✅, time: 12.4s, mem: 17.8GB, 13/13 tests passed. 
-#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 6.0s, mem: 2.8GB, 13/13 tests passed. -#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 -A flash_attention_2" # test pass✅, time: 4.7s, mem: 4.7GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.5-Gemma2-9B -A flash_attention_2" # test pass✅, time: 9.6s, mem: 23.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m AIDC-AI/Ovis1.5-Llama3-8B -A flash_attention_2" # test pass✅, time: 5.9s, mem: 19.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 10.2s, mem: 8.7GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V" # test pass✅, time: 7.4s, mem: 19.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh --load-in-4bit" # test pass✅, time: 6.5s, mem: 9.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh" # test pass✅, time: 5.0s, mem: 10.8GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B --load-in-4bit" # test pass✅, time: 10.3s, mem: 8.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B" # test pass✅, time: 8.9s, mem: 11.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B-zh" # test pass✅, time: 7.8s, mem: 12.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B --load-in-4bit" # test pass✅, time: 8.5s, mem: 5.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B" # test pass✅, time: 6.4s, mem: 12.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B --load-in-4bit" # test pass✅, time: 9.8s, mem: 5.8GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B" # test pass✅, time: 7.8s, mem: 13.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 11.3s, mem: 9.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V" # test pass✅, time: 10.4s, mem: 19.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --load-in-4bit" # test pass✅, time: 32.1s, mem: 29.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 18.7s, mem: 71.8GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 21.9s, mem: 26.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40 --load-in-4bit" # test pass✅, time: 28.6s, mem: 30.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40" # test pass✅, time: 24.9s, mem: 54.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 18.4s, mem: 52.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 27.5s, mem: 1.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-1B --device-map cuda:0" # test pass✅, time: 7.0s, mem: 2.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 15.5s, mem: 4.8GB, 13/13 tests passed. 
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-2B --device-map cuda:0" # test pass✅, time: 8.0s, mem: 6.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.9s, mem: 8.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-8B --device-map cuda:0" # test pass✅, time: 7.5s, mem: 18.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 24.5s, mem: 26.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-26B --device-map cuda:0" # test pass✅, time: 19.9s, mem: 52.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 34.8s, mem: 31.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-40B --device-map cuda:0" # test pass✅, time: 46.5s, mem: 76.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-Llama3-76B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 39.6s, mem: 51.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --load-in-4bit" # test pass✅, time: 5.9s, mem: 5.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 6.8s, mem: 6.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40" # test pass✅, time: 6.2s, mem: 8.7GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 5.4s, mem: 7.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat --load-in-4bit" # test pass✅, time: 9.0s, mem: 11.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 7.1s, mem: 19.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-2B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 11.7s, mem: 6.8GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-2B-Instruct -A flash_attention_2" # test pass✅, time: 14.5s, mem: 16.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-7B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 17.9s, mem: 18.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-7B-Instruct -A flash_attention_2" # test pass✅, time: 15.1s, mem: 27.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen2-VL-72B-Instruct-AWQ -A flash_attention_2" # test pass✅, time: 34.8s, mem: 44.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-dpo-r-v1.5" # test pass✅, time: 7.9s, mem: 9.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5" # test pass✅, time: 3.3s, mem: 9.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-singleimg-r-v1.5" # test pass✅, time: 8.0s, mem: 9.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-r-v1" # test pass✅, time: 8.4s, mem: 9.8GB, 13/13 tests passed. 
+#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.5s, mem: 15.0GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} +#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0" # test fail❌, time: 1.6s, mem: 26.7GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500} +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.0s, mem: 11.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 8.3s, mem: 20.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.9s, mem: 7.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 8.0s, mem: 17.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.6s, mem: 8.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 5.9s, mem: 18.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 20.9s, mem: 15.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 15.8s, mem: 24.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat --load-in-4bit" # test pass✅, time: 14.3s, mem: 15.8GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 11.1s, mem: 21.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha -A flash_attention_2 --load-in-4bit" # test pass✅, time: 9.6s, mem: 6.7GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha -A flash_attention_2" # test pass✅, time: 8.3s, mem: 11.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-pre-alpha --load-in-4bit -A flash_attention_2" # test pass✅, time: 100.3s, mem: 8.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m fancyfeast/joy-caption-pre-alpha -A flash_attention_2" # test pass✅, time: 58.7s, mem: 18.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2d5-7b -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test fail❌, time: 2.7s, mem: 8.9GB, 1/13 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2d5-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 28.4s, mem: 28.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 20.6s, mem: 21.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 26.2s, mem: 18.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit -A flash_attention_2" # test pass✅, time: 62.4s, mem: 8.5GB, 13/13 tests passed. 
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 7.3s, mem: 7.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 25.4s, mem: 19.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit -A flash_attention_2" # test pass✅, time: 47.9s, mem: 10.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 14.1s, mem: 9.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 10.2s, mem: 26.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.0s, mem: 5.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 8.5s, mem: 14.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 64.1s, mem: 22.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf -A flash_attention_2" # test pass✅, time: 87.3s, mem: 67.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 18.1s, mem: 12.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf -A flash_attention_2" # test pass✅, time: 17.3s, mem: 29.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 20.2s, mem: 7.7GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf -A flash_attention_2" # test pass✅, time: 15.7s, mem: 16.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m lmms-lab/llava-onevision-qwen2-0.5b-ov -A flash_attention_2" # test pass✅, time: 8.0s, mem: 15.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m lmms-lab/llava-onevision-qwen2-7b-ov -A flash_attention_2" # test pass✅, time: 18.1s, mem: 28.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 2.4s, mem: 0.9GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 1.9s, mem: 1.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 2.8s, mem: 1.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 2.1s, mem: 2.3GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 12.1s, mem: 6.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct -A flash_attention_2" # test pass✅, time: 8.5s, mem: 11.5GB, 13/13 tests passed. 
+#CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 8.4s, mem: 4.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2" # test pass✅, time: 7.6s, mem: 9.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m mistralai/Pixtral-12B-2409" # test pass✅, time: 17.2s, mem: 35.8GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m mx262/MiniMonkey -A flash_attention_2 --load-in-4bit" # test pass✅, time: 10.3s, mem: 12.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m mx262/MiniMonkey -A flash_attention_2" # test pass✅, time: 9.4s, mem: 14.5GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m omlab/omchat-v2.0-13B-single-beta_hf -A flash_attention_2" # test pass✅, time: 27.3s, mem: 41.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6-int4 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 16.6s, mem: 9.4GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 13.4s, mem: 9.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 11.7s, mem: 19.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 23.1s, mem: 9.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 22.0s, mem: 19.1GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B -A flash_attention_2 --load-in-4bit" # test pass✅, time: 12.5s, mem: 8.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B -A flash_attention_2" # test pass✅, time: 7.5s, mem: 17.2GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.6s, mem: 7.7GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 6.3s, mem: 8.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.9s, mem: 7.7GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA-1.5 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 6.6s, mem: 8.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0 --load-in-4bit" # test pass✅, time: 11.3s, mem: 6.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 8.0s, mem: 16.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1 --load-in-4bit" # test pass✅, time: 10.3s, mem: 8.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1" # test pass✅, time: 8.9s, mem: 17.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1 --load-in-4bit" # test pass✅, time: 13.2s, mem: 8.0GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1" # test pass✅, time: 13.0s, mem: 17.6GB, 13/13 tests passed. 
+#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 -A flash_attention_2 --load-in-4bit" # test pass✅, time: 6.1s, mem: 2.6GB, 13/13 tests passed. +#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 -A flash_attention_2" # test pass✅, time: 4.7s, mem: 4.5GB, 13/13 tests passed.