Commit 96cc4e3: 0.33.0
matatonic committed Sep 22, 2024
1 parent 1cc0b8c
Showing 8 changed files with 168 additions and 171 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
@@ -10,7 +10,7 @@ RUN git clone https://github.com/togethercomputer/Dragonfly --single-branch /app

COPY requirements.txt .
ARG VERSION=latest
RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.41.2" >> requirements.txt; else echo "git+https://github.com/huggingface/transformers\nautoawq>=0.2.5" >> requirements.txt ; fi
RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.41.2" >> requirements.txt; else echo "git+https://github.com/huggingface/transformers" >> requirements.txt ; fi
# TODO: nvidia apex wheel
RUN --mount=type=cache,target=/root/.cache/pip pip install -U -r requirements.txt

33 changes: 11 additions & 22 deletions README.md
Expand Up @@ -38,10 +38,10 @@ Can't decide which to use? See the [OpenVLM Leaderboard](https://huggingface.co/
- [X] [fancyfeast/joy-caption-pre-alpha](https://huggingface.co/spaces/fancyfeast/joy-caption-pre-alpha) (caption only)
- [X] [fuyu-8b](https://huggingface.co/adept/fuyu-8b) [pretrain]
- [X] [HuggingFaceM4/idefics2](https://huggingface.co/HuggingFaceM4)
- - [X] [idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b) (won't gpu split)
- - [X] [idefics2-8b-AWQ](https://huggingface.co/HuggingFaceM4/idefics2-8b-AWQ) (won't gpu split)
- - [X] [idefics2-8b-chatty](https://huggingface.co/HuggingFaceM4/idefics2-8b-chatty) (won't gpu split)
- - [X] [idefics2-8b-chatty-AWQ](https://huggingface.co/HuggingFaceM4/idefics2-8b-chatty-AWQ) (won't gpu split)
- - [X] [idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b) (won't gpu split, alternate docker only)
- - [X] [idefics2-8b-AWQ](https://huggingface.co/HuggingFaceM4/idefics2-8b-AWQ) (won't gpu split, alternate docker only)
- - [X] [idefics2-8b-chatty](https://huggingface.co/HuggingFaceM4/idefics2-8b-chatty) (won't gpu split, alternate docker only)
- - [X] [idefics2-8b-chatty-AWQ](https://huggingface.co/HuggingFaceM4/idefics2-8b-chatty-AWQ) (won't gpu split, alternate docker only)
- [X] [InternLM](https://huggingface.co/internlm/)
- - [X] [XComposer2-2d5-7b](https://huggingface.co/internlm/internlm-xcomposer2d5-7b) (won't gpu split)
- - [X] [XComposer2-4KHD-7b](https://huggingface.co/internlm/internlm-xcomposer2-4khd-7b) (won't gpu split)
@@ -148,7 +148,9 @@ If you can't find your favorite model, you can [open a new issue](https://github
Version 0.33.0

- new model support: mx262/MiniMonkey, thanks [@white2018](https://github.com/white2018)
- Fix qwen2-vl when used with qwen-agent and multiple system prompts (tools), thanks [@cedonley](https://github.com/cedonley)
- Fix Qwen2-VL when used with Qwen-Agent and multiple system prompts (tools), thanks [@cedonley](https://github.com/cedonley)
- idefics2-8b support moved to alt image
- pin Qwen2-VL-7B-Instruct-AWQ revision, [see note for info](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ/discussions/4)

Version 0.32.0

@@ -359,7 +361,7 @@ docker compose -f docker-compose.alt.yml pull
python -m venv .venv
source .venv/bin/activate
# install the python dependencies
pip install -U -r requirements.txt "git+https://github.com/huggingface/transformers" "autoawq>=0.2.5"
pip install -U -r requirements.txt "git+https://github.com/huggingface/transformers"
# OR install the python dependencies for the alt version
pip install -U -r requirements.txt "transformers==4.41.2"
# run the server with your chosen model
@@ -484,19 +486,6 @@ CUDA_VISIBLE_DEVICES=1,0 python vision.py -m llava-hf/llava-v1.6-34b-hf --device

You can also use the environment variable `OPENEDAI_DEVICE_MAP="sequential"` to specify the `--device-map` argument (a minimal sketch of this pattern appears after this README diff).

4. "My Nvidia GPU isn't detected when using docker."
- On Linux, you may need to specify the default runtime for your container environment (and perhaps install the nvidia-container-runtime), like so:
In /etc/docker/daemon.json:
```json
{
  "runtimes": {
    "nvidia": {
      "path": "nvidia-container-runtime",
      "runtimeArgs": []
    }
  },
  "default-runtime": "nvidia"
}
```
- In Windows, be sure you have WSL2 installed and docker is configured to use it. Also make sure your nvidia drivers are up to date.

4. "My Nvidia GPU isn't detected when using docker.", using Nvidia CUDA with docker.
- Linux: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
- Windows: Use WSL2 with docker and nvidia drivers: https://docs.nvidia.com/cuda/wsl-user-guide/index.html
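
Regarding the `OPENEDAI_DEVICE_MAP` note earlier in this diff: the environment variable only supplies a default for the `--device-map` flag. A minimal sketch of that pattern (not vision.py's actual argument parsing, which is not part of this commit; only the flag and variable names come from the README):

```python
import argparse
import os

# Sketch only: OPENEDAI_DEVICE_MAP seeds the default for --device-map.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--device-map",
    default=os.environ.get("OPENEDAI_DEVICE_MAP", "auto"),
    help="Hugging Face device_map, e.g. 'auto', 'sequential', 'cuda:0'",
)
args = parser.parse_args()
print(f"device_map = {args.device_map}")
```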
4 changes: 4 additions & 0 deletions backend/qwen2-vl.py
@@ -25,6 +25,10 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p
self.processor = AutoProcessor.from_pretrained(model_id)

del self.params['trust_remote_code']

if model_id == 'Qwen/Qwen2-VL-7B-Instruct-AWQ':
self.params['revision'] = '9d72ae62396aaa1817b006e07ddbbd121024f50d' # re: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ/discussions/4

self.model = Qwen2VLForConditionalGeneration.from_pretrained(**self.params).eval()

self.loaded_banner()
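
The pinned revision above is simply forwarded to Hugging Face `from_pretrained`. A minimal standalone sketch of the same pin (the model id and revision hash are taken from the diff; the dtype and device settings here are assumptions):

```python
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct-AWQ"
PINNED_REVISION = "9d72ae62396aaa1817b006e07ddbbd121024f50d"  # pinned per the linked HF discussion

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    revision=PINNED_REVISION,  # load the known-good checkpoint revision
    device_map="auto",
    torch_dtype="auto",
).eval()
```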
6 changes: 6 additions & 0 deletions model_conf_tests.alt.json
@@ -1,4 +1,10 @@
[
["HuggingFaceM4/idefics2-8b", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"],
["HuggingFaceM4/idefics2-8b", "-A", "flash_attention_2", "--device-map", "cuda:0"],
["HuggingFaceM4/idefics2-8b-AWQ", "-A", "flash_attention_2", "--device-map", "cuda:0"],
["HuggingFaceM4/idefics2-8b-chatty", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"],
["HuggingFaceM4/idefics2-8b-chatty", "-A", "flash_attention_2", "--device-map", "cuda:0"],
["HuggingFaceM4/idefics2-8b-chatty-AWQ", "-A", "flash_attention_2", "--device-map", "cuda:0"],
["OpenGVLab/InternVL2-4B", "--device-map", "cuda:0", "--load-in-4bit"],
["OpenGVLab/InternVL2-4B", "--device-map", "cuda:0"],
["OpenGVLab/Mini-InternVL-Chat-4B-V1-5", "--max-tiles", "40", "--load-in-4bit"],
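
Each entry in this file is a model id followed by the extra CLI flags used for that test configuration. A hypothetical sketch of expanding such a file into commands (the project's actual test harness may differ):

```python
import json
import shlex

# Sketch only: treats each entry as "model id + extra vision.py flags".
with open("model_conf_tests.alt.json") as f:
    configs = json.load(f)

for model_id, *extra_args in configs:
    cmd = ["python", "vision.py", "-m", model_id, *extra_args]
    print(shlex.join(cmd))
```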
6 changes: 0 additions & 6 deletions model_conf_tests.json
@@ -16,12 +16,6 @@
["BAAI/Bunny-v1_1-Llama-3-8B-V"],
["BAAI/Emu2-Chat", "--load-in-4bit"],
["BAAI/Emu2-Chat", "--max-memory=0:78GiB,1:20GiB"],
["HuggingFaceM4/idefics2-8b", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"],
["HuggingFaceM4/idefics2-8b", "-A", "flash_attention_2", "--device-map", "cuda:0"],
["HuggingFaceM4/idefics2-8b-AWQ", "-A", "flash_attention_2", "--device-map", "cuda:0"],
["HuggingFaceM4/idefics2-8b-chatty", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"],
["HuggingFaceM4/idefics2-8b-chatty", "-A", "flash_attention_2", "--device-map", "cuda:0"],
["HuggingFaceM4/idefics2-8b-chatty-AWQ", "-A", "flash_attention_2", "--device-map", "cuda:0"],
["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--load-in-4bit"],
["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--max-tiles", "40", "--load-in-4bit"],
["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--max-tiles", "40"],
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,5 +1,6 @@
accelerate
auto_gptq
autoawq
bitsandbytes
fastapi
# See: https://github.com/bdashore3/flash-attention/releases for other windows flash_attn releases
@@ -54,4 +55,4 @@ git+https://github.com/LLaVA-VL/LLaVA-NeXT.git

# mistral
mistral_inference>=1.4.0
mistral_common>=1.4.0
mistral_common[opencv]>=1.4.3
54 changes: 30 additions & 24 deletions vision-alt.sample.env
@@ -4,27 +4,33 @@ HF_HOME=hf_home
HF_HUB_ENABLE_HF_TRANSFER=1
#HF_TOKEN=hf-...
#CUDA_VISIBLE_DEVICES=1,0
#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.3s, mem: 5.8GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0" # test pass✅, time: 8.5s, mem: 10.8GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 10.2s, mem: 9.8GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --load-in-4bit" # test pass✅, time: 9.7s, mem: 6.2GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5" # test pass✅, time: 7.9s, mem: 11.2GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 27.3s, mem: 13.3GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 23.3s, mem: 37.6GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 28.3s, mem: 12.9GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 20.5s, mem: 36.5GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B --load-in-4bit" # test pass✅, time: 33.1s, mem: 22.3GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 31.4s, mem: 40.6GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B --load-in-4bit" # test pass✅, time: 137.4s, mem: 22.2GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B" # test pass✅, time: 99.7s, mem: 40.6GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-72b --use-flash-attn --load-in-4bit --device-map cuda:0" # test pass✅, time: 52.7s, mem: 48.8GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-7b --use-flash-attn --load-in-4bit --device-map cuda:0" # test pass✅, time: 27.4s, mem: 8.8GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-7b --use-flash-attn --device-map cuda:0" # test fail❌, time: 8.4s, mem: 16.5GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500}
#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 23.6s, mem: 8.0GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn" # test pass✅, time: 18.3s, mem: 17.3GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.2s, mem: 4.6GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.9s, mem: 8.5GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 3.3s, mem: 3.4GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500}
#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 9.8s, mem: 8.9GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn --load-in-4bit" # test pass✅, time: 16.6s, mem: 16.9GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn" # test pass✅, time: 16.5s, mem: 32.1GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 24.5s, mem: 29.8GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 21.4s, mem: 39.4GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-AWQ -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 24.7s, mem: 29.6GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 30.1s, mem: 31.1GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 24.1s, mem: 40.1GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty-AWQ -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 26.3s, mem: 31.0GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.4s, mem: 7.1GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL2-4B --device-map cuda:0" # test pass✅, time: 7.7s, mem: 11.5GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 9.9s, mem: 10.4GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --load-in-4bit" # test pass✅, time: 8.9s, mem: 6.9GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5" # test pass✅, time: 7.2s, mem: 11.9GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 30.3s, mem: 13.3GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 22.3s, mem: 37.5GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 27.3s, mem: 12.7GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 20.7s, mem: 36.3GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B --load-in-4bit" # test pass✅, time: 36.1s, mem: 22.3GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 30.5s, mem: 40.6GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B --load-in-4bit" # test pass✅, time: 118.3s, mem: 22.3GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B" # test pass✅, time: 123.8s, mem: 40.6GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-72b -A flash_attention_2 --load-in-4bit --device-map cuda:0" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (timeout).
#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-7b -A flash_attention_2 --load-in-4bit --device-map cuda:0" # test pass✅, time: 32.6s, mem: 8.9GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m cognitivecomputations/dolphin-vision-7b -A flash_attention_2 --device-map cuda:0" # test fail❌, time: 6.4s, mem: 16.6GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500}
#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf -A flash_attention_2 --load-in-4bit" # test pass✅, time: 20.4s, mem: 8.1GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf -A flash_attention_2" # test pass✅, time: 16.7s, mem: 17.4GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 7.2s, mem: 5.4GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 6.4s, mem: 8.6GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.7s, mem: 3.5GB, Test failed with Exception: Error code: 500 - {'message': 'InternalServerError', 'code': 500}
#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 9.0s, mem: 9.0GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm -A flash_attention_2 --load-in-4bit" # test pass✅, time: 16.2s, mem: 17.0GB, 13/13 tests passed.
#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm -A flash_attention_2" # test pass✅, time: 15.6s, mem: 32.3GB, 13/13 tests passed.