
Commit: 0.12.0

matatonic committed Apr 28, 2024
1 parent adb8eee commit 1dc75c7
Showing 9 changed files with 206 additions and 112 deletions.
6 changes: 2 additions & 4 deletions Dockerfile
@@ -10,11 +10,9 @@ RUN git clone https://github.com/dvlab-research/MGM.git --single-branch /app/MGM
WORKDIR /app
COPY requirements.txt .
ARG VERSION=latest
-# transformers==4.36.2 supports most models except MGM-2B, llava-1.6, nanollava
-RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.36.2" >> requirements.txt; else echo "transformers>=4.39.0" >> requirements.txt ; fi
+RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.36.2" >> requirements.txt; else echo "transformers>=4.39.0\nautoawq" >> requirements.txt ; fi
# TODO: nvidia apex wheel
-RUN pip install --no-cache-dir -U -r requirements.txt \
-    https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.7/flash_attn-2.5.7+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
+RUN --mount=type=cache,target=/root/.cache/pip pip install -U -r requirements.txt

WORKDIR /app/MGM
RUN pip install --no-cache-dir --no-deps -e .
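For context, the `VERSION` build-arg above selects which transformers pin (and, in the default case, autoawq) gets appended to requirements.txt. A minimal sketch of building the two variants by hand; the image tags are illustrative, and the repo's docker-compose files remain the supported path:

```shell
# default image: transformers>=4.39.0 plus autoawq (illustrative tag name)
docker build -t openedai-vision .
# "alt" image: pinned to transformers==4.36.2
docker build --build-arg VERSION=alt -t openedai-vision:alt .
```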
12 changes: 10 additions & 2 deletions README.md
@@ -20,6 +20,9 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview`
- - [X] [XComposer2-7b-4bit](https://huggingface.co/internlm/internlm-xcomposer2-7b-4bit) (not recommended)
- - [X] [XComposer2-VL](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b) [pretrain] (wont gpu split)
- - [X] [XComposer2-VL-4bit](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b-4bit)
- [X] [HuggingFaceM4/idefics2](https://huggingface.co/HuggingFaceM4)
- - [X] [idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b) (main docker only, wont gpu split)
- - [X] [idefics2-8b-AWQ](https://huggingface.co/HuggingFaceM4/idefics2-8b-AWQ) (main docker only, wont gpu split)
- [X] [LlavaNext](https://huggingface.co/llava-hf) (main docker only)
- - [X] [llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) (main docker only)
- - [X] [llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (main docker only)
@@ -62,6 +65,11 @@ See: [OpenVLM Leaderboard](https://huggingface.co/spaces/opencompass/open_vlm_le

## Recent updates

Version: 0.12.0

- new model support: HuggingFaceM4/idefics2-8b, HuggingFaceM4/idefics2-8b-AWQ
- Fix: remove prompt from output of InternVL-Chat-V1-5

Version: 0.11.0

- new model support: OpenGVLab/InternVL-Chat-V1-5, up to 4k resolution, top opensource model
@@ -117,9 +125,9 @@ docker compose -f docker-compose.alt.yml pull

```shell
# install the python dependencies
-pip install -r requirements.txt "transformers>=4.39.0"
+pip install -U -r requirements.txt "transformers>=4.39.0" autoawq
# OR install the python dependencies for the alt version
-pip install -r requirements.txt "transformers==4.36.2"
+pip install -U -r requirements.txt "transformers==4.36.2"
# run the server with your chosen model
python vision.py --model vikhyatk/moondream2
```
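As a usage sketch (not part of this commit), the new idefics2 models can then be queried through the OpenAI-compatible API. The port (5006) and the `openai` python client usage below are assumptions about a default local setup; the image URL is borrowed from `test_models.py`:

```python
from openai import OpenAI

# Assumptions: server running locally on port 5006 (adjust to your setup);
# api_key is a placeholder for a local server.
client = OpenAI(base_url="http://localhost:5006/v1", api_key="skip")

response = client.chat.completions.create(
    model="HuggingFaceM4/idefics2-8b",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": "https://images.freeimages.com/images/large-previews/cd7/gingko-biloba-1058537.jpg"}},
        ],
    }],
    max_tokens=256,
)
print(response.choices[0].message.content)
```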
58 changes: 58 additions & 0 deletions backend/idefics2.py
@@ -0,0 +1,58 @@
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers import AwqConfig

from vision_qna import *

# "HuggingFaceM4/idefics2-8b"
# "HuggingFaceM4/idefics2-8b-AWQ"

class VisionQnA(VisionQnABase):
    model_name: str = "idefics2"

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        #do_image_splitting=False
        #size= {"longest_edge": 448, "shortest_edge": 378}
        self.processor = AutoProcessor.from_pretrained(model_id)

        if '-awq' in model_id.lower():
            """
            # This is from https://huggingface.co/HuggingFaceM4/idefics2-8b
            # It doesn't work
            quantization_config = AwqConfig(
                bits=4,
                fuse_max_seq_len=4096,
                modules_to_fuse={
                    "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
                    "mlp": ["gate_proj", "up_proj", "down_proj"],
                    "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
                    "use_alibi": False,
                    "num_attention_heads": 32,
                    "num_key_value_heads": 8,
                    "hidden_size": 4096,
                }
            )
            self.params['quantization_config'] = quantization_config
            """

            if self.params['torch_dtype'] == torch.bfloat16:
                self.params['torch_dtype'] = torch.float16

        self.model = AutoModelForVision2Seq.from_pretrained(**self.params).to(self.device)

        print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}")

    async def chat_with_images(self, request: ImageChatRequest) -> str:
        images, hfmessages = await images_hfmessages_from_messages(request.messages)

        prompt = self.processor.apply_chat_template(hfmessages, add_generation_prompt=True)
        inputs = self.processor(text=prompt, images=images, return_tensors="pt")
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

        # Generate
        params = self.get_generation_params(request)
        generated_ids = self.model.generate(**inputs, **params)
        generated_texts = self.processor.decode(generated_ids[0][inputs['input_ids'].size(1):].cpu(), skip_special_tokens=True)

        return generated_texts
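For readers unfamiliar with idefics2, a rough illustration (adapted from the HuggingFaceM4/idefics2-8b model card, not from this repo) of the kind of message structure that `images_hfmessages_from_messages` presumably produces for `apply_chat_template`; the image path is a placeholder:

```python
from transformers import AutoProcessor
from PIL import Image

processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")
image = Image.open("example.jpg")  # placeholder image path

# One user turn with one image and one text segment; the processor's chat
# template turns the {"type": "image"} entry into the model's image tokens.
hfmessages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What do we see in this image?"},
        ],
    }
]

prompt = processor.apply_chat_template(hfmessages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt")
```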
6 changes: 4 additions & 2 deletions model_conf_tests.json
@@ -2,7 +2,8 @@
["vikhyatk/moondream2", "--use-flash-attn"],
["vikhyatk/moondream1"],
["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0"],
["qnguyen3/nanoLLaVA", "--use-flash-attn"],
["HuggingFaceM4/idefics2-8b", "--use-flash-attn", "--device-map", "cuda:0"],
["qnguyen3/nanoLLaVA", "--use-flash-attn", "--device-map", "cuda:0"],
["echo840/Monkey"],
["echo840/Monkey-Chat"],
["THUDM/cogvlm-chat-hf"],
@@ -26,7 +27,8 @@

["OpenGVLab/InternVL-Chat-V1-5-Int8", "--device-map", "cuda:0"],
["OpenGVLab/InternVL-Chat-V1-5", "--load-in-4bit", "--device-map", "cuda:0"],
["qnguyen3/nanoLLaVA", "--use-flash-attn", "--load-in-4bit"],
["HuggingFaceM4/idefics2-8b-AWQ", "--use-flash-attn", "--device-map", "cuda:0"],
["qnguyen3/nanoLLaVA", "--use-flash-attn", "--load-in-4bit", "--device-map", "cuda:0"],
["THUDM/cogvlm-chat-hf", "--load-in-4bit"],
["THUDM/cogagent-chat-hf", "--load-in-4bit"],
["internlm/internlm-xcomposer2-7b-4bit", "--use-flash-attn"],
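Each entry above maps to a `vision.py` invocation; the commands below mirror the `#CLI_COMMAND` lines that `test_models.py` prints for the new idefics2 configs:

```shell
python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0
python vision.py -m HuggingFaceM4/idefics2-8b-AWQ --use-flash-attn --device-map cuda:0
```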
21 changes: 15 additions & 6 deletions requirements.txt
@@ -2,20 +2,26 @@ accelerate
auto_gptq
bitsandbytes
fastapi
-flash_attn
+# See: https://github.com/bdashore3/flash-attention/releases for other windows flash_attn releases
+# And: https://github.com/Dao-AILab/flash-attention/releases for linux.
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/bdashore3/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/bdashore3/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+flash_attn; python_version != "3.10" and python_version != "3.11"
openai
peft
protobuf
pydantic
python-datauri
requests
sentencepiece
-torch>=2.2.0
+torch==2.2.*
uvicorn
xformers

# moondream
-deepspeed==0.11.1
+deepspeed<0.14.0
einops
einops-exts
httpx
@@ -36,8 +42,11 @@ transformers_stream_generator
loguru
sse_starlette

-#latest
+# alt
+#transformers==4.36.2
+
+# latest
#transformers>=4.39.0
+# idefics2
+#autoawq

-#alt
-#transformers==4.36.2
12 changes: 6 additions & 6 deletions test_models.py
@@ -20,8 +20,8 @@
'leaf': 'https://images.freeimages.com/images/large-previews/cd7/gingko-biloba-1058537.jpg',
}

-green_pass = '\033[92mpass\033[0m'
-red_fail = '\033[91mfail\033[0m'
+green_pass = '\033[92mpass\033[0m'
+red_fail = '\033[91mfail\033[0m'


def data_url_from_url(img_url: str) -> str:
@@ -48,7 +48,7 @@ def record_result(cmd_args, results, t, mem, note):
'note': note
}])
result = all(results)
print(f"#CLI_COMMAND=\"python vision.py -m {' '.join(cmd_args)}\" # test {'pass' if result else 'fail'}, time: {t:.1f}s, mem: {mem:.1f}GB, {note}")
print(f"#CLI_COMMAND=\"python vision.py -m {' '.join(cmd_args)}\" # test {green_pass if result else red_fail}, time: {t:.1f}s, mem: {mem:.1f}GB, {note}")

torch_memory_baseline = 0

Expand Down Expand Up @@ -115,8 +115,8 @@ def test(cmd_args: list[str]) -> int:
mem = get_total_gpu_mem_used()

result = all(results)
-if result:
-    note = 'All tests passed.'
+if not note:
+    note = f'{results.count(True)}/{len(results)} tests passed.'

print(f"\n\n###\n\nTest complete.\nResult: {green_pass if result else red_fail}, time: {t:.1f}s")

@@ -233,4 +233,4 @@ def single_round():
for r in all_results:
cmdl = ' '.join(r['args'])
result = all(r['results'])
print(f"#CLI_COMMAND=\"python vision.py -m {cmdl}\" # test {'pass' if result else 'fail'}, time: {r['time']:.1f}s, mem: {r['mem']:.1f}GB, {r['note']}")
print(f"#CLI_COMMAND=\"python vision.py -m {cmdl}\" # test {green_pass if result else red_fail}, time: {r['time']:.1f}s, mem: {r['mem']:.1f}GB, {r['note']}")