From e1576c648504dd601b25abf7d108898825bd1e07 Mon Sep 17 00:00:00 2001
From: matatonic
Date: Fri, 13 Sep 2024 20:02:03 -0400
Subject: [PATCH] 0.31.1 minicpm v2.6-int4 fix, container rename

---
 README.md                | 5 +++++
 backend/minicpm-v-2_6.py | 6 ++++--
 docker-compose.alt.yml   | 7 ++-----
 docker-compose.yml       | 7 ++-----
 model_conf_tests.json    | 1 +
 vision.sample.env        | 1 +
 6 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 7314437..2955b32 100644
--- a/README.md
+++ b/README.md
@@ -69,6 +69,7 @@ Can't decide which to use? See the [OpenVLM Leaderboard](https://huggingface.co/
 - - [X] [Pixtral-12B](https://huggingface.co/mistralai/Pixtral-12B-2409)
 - [X] [openbmb](https://huggingface.co/openbmb)
 - - [X] [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) (video not supported yet)
+- - [X] [MiniCPM-V-2_6-int4](https://huggingface.co/openbmb/MiniCPM-V-2_6-int4)
 - - [X] [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5)
 - - [X] [MiniCPM-V-2](https://huggingface.co/openbmb/MiniCPM-V-2) (alternate docker only)
 - - [X] [MiniCPM-V aka. OmniLMM-3B](https://huggingface.co/openbmb/MiniCPM-V) (alternate docker only)
@@ -138,6 +139,10 @@ If you can't find your favorite model, you can [open a new issue](https://github.
 
 ## Recent updates
 
+Version 0.31.1
+
+- Fix support for openbmb/MiniCPM-V-2_6-int4
+
 Version 0.31.0
 
 - new model support: Qwen/Qwen2-VL family of models (video untested, GPTQ not working yet, but AWQ and BF16 are fine)
diff --git a/backend/minicpm-v-2_6.py b/backend/minicpm-v-2_6.py
index e2e63c5..a1a7972 100644
--- a/backend/minicpm-v-2_6.py
+++ b/backend/minicpm-v-2_6.py
@@ -5,6 +5,7 @@ from decord import VideoReader, cpu
 
 
 # openbmb/MiniCPM-V-2_6
+# openbmb/MiniCPM-V-2_6-int4
 
 MAX_NUM_FRAMES=64 # if cuda OOM set a smaller number
 
@@ -36,8 +37,9 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p
         self.model = AutoModel.from_pretrained(**self.params).eval()
 
         # bitsandbytes already moves the model to the device, so we don't need to do it again.
-        if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
-            self.model = self.model.to(dtype=self.params['torch_dtype'], device=self.device)
+        if '-int4' not in model_id:
+            if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
+                self.model = self.model.to(dtype=self.params['torch_dtype'], device=self.device)
 
         self.loaded_banner()
 
diff --git a/docker-compose.alt.yml b/docker-compose.alt.yml
index 0b337e5..689cfd5 100644
--- a/docker-compose.alt.yml
+++ b/docker-compose.alt.yml
@@ -1,17 +1,14 @@
 services:
-  server:
+  openedai-vision-alt:
     build:
       args:
         - VERSION=alt
       dockerfile: Dockerfile
-    tty: true
+    container_name: openedai-vision-alt
     image: ghcr.io/matatonic/openedai-vision-alt
     env_file: vision-alt.env # your settings go here
     volumes:
       - ./hf_home:/app/hf_home # for Hugginface model cache
-      # be sure to review and run prepare_minigemini.sh before starting a mini-gemini model
-      - ./model_zoo:/app/model_zoo # for MiniGemini
-      - ./YanweiLi:/app/YanweiLi # for MiniGemini
       - ./model_conf_tests.alt.json:/app/model_conf_tests.json
     ports:
       - 5006:5006
diff --git a/docker-compose.yml b/docker-compose.yml
index 073f62d..1e47538 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,17 +1,14 @@
 services:
-  server:
+  openedai-vision:
     build:
       args:
         - VERSION=latest
       dockerfile: Dockerfile
-    tty: true
+    container_name: openedai-vision
     image: ghcr.io/matatonic/openedai-vision
     env_file: vision.env # your settings go here
     volumes:
       - ./hf_home:/app/hf_home # for Hugginface model cache
-      # be sure to review and run prepare_minigemini.sh before starting a mini-gemini model
-      - ./model_zoo:/app/model_zoo # for MiniGemini
-      - ./YanweiLi:/app/YanweiLi # for MiniGemini
       - ./model_conf_tests.json:/app/model_conf_tests.json
     ports:
       - 5006:5006
diff --git a/model_conf_tests.json b/model_conf_tests.json
index c6c1e41..ee5e563 100644
--- a/model_conf_tests.json
+++ b/model_conf_tests.json
@@ -94,6 +94,7 @@
     ["microsoft/Phi-3.5-vision-instruct", "-A", "flash_attention_2", "--load-in-4bit"],
     ["microsoft/Phi-3.5-vision-instruct", "-A", "flash_attention_2"],
     ["mistralai/Pixtral-12B-2409"],
+    ["openbmb/MiniCPM-V-2_6-int4", "-A", "flash_attention_2", "--device-map", "cuda:0"],
     ["openbmb/MiniCPM-V-2_6", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"],
     ["openbmb/MiniCPM-V-2_6", "-A", "flash_attention_2", "--device-map", "cuda:0"],
     ["openbmb/MiniCPM-Llama3-V-2_5", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"],
diff --git a/vision.sample.env b/vision.sample.env
index 89ab412..c145e57 100644
--- a/vision.sample.env
+++ b/vision.sample.env
@@ -98,6 +98,7 @@ HF_HUB_ENABLE_HF_TRANSFER=1
 #CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2 --load-in-4bit" # test pass✅, time: 8.5s, mem: 4.4GB, 13/13 tests passed.
 #CLI_COMMAND="python vision.py -m microsoft/Phi-3.5-vision-instruct -A flash_attention_2" # test pass✅, time: 7.5s, mem: 9.3GB, 13/13 tests passed.
 #CLI_COMMAND="python vision.py -m mistralai/Pixtral-12B-2409" # test pass✅, time: 16.3s, mem: 35.6GB, 13/13 tests passed.
+#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6-int4 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 15.5s, mem: 9.5GB, 13/13 tests passed.
 #CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 13.6s, mem: 9.4GB, 13/13 tests passed.
 #CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2_6 -A flash_attention_2 --device-map cuda:0" # test pass✅, time: 12.1s, mem: 18.8GB, 13/13 tests passed.
 #CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 -A flash_attention_2 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 22.8s, mem: 9.0GB, 13/13 tests passed.
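
The backend change above amounts to one extra guard: the openbmb/MiniCPM-V-2_6-int4 checkpoint ships already quantized with bitsandbytes, so calling `.to()` on it after `from_pretrained()` is unnecessary, just as it is for `--load-in-4bit`/`--load-in-8bit` loads. A minimal sketch of that placement logic, outside the backend's wrapper class (the `load_minicpm` helper, its kwargs, and the default device are illustrative only, not part of the patch):

```python
import torch
from transformers import AutoModel

def load_minicpm(model_id: str, device: str = 'cuda:0', extra_params: dict | None = None):
    # Sketch only: the real backend assembles its load parameters in a base class;
    # here a minimal, plausible set of kwargs is passed directly.
    extra_params = extra_params or {}
    params = dict(trust_remote_code=True, torch_dtype=torch.bfloat16, **extra_params)
    model = AutoModel.from_pretrained(model_id, **params).eval()

    # The pre-quantized '-int4' checkpoint and any load_in_4bit/load_in_8bit load
    # are already placed on the device by bitsandbytes, so only plain checkpoints
    # get the explicit dtype/device move.
    if '-int4' not in model_id:
        if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
            model = model.to(dtype=params['torch_dtype'], device=device)

    return model

# e.g. model = load_minicpm('openbmb/MiniCPM-V-2_6-int4')
```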