From 1dc75c7584f8224dfaa1d74a22225472e3638e19 Mon Sep 17 00:00:00 2001 From: matatonic Date: Sun, 28 Apr 2024 18:55:57 -0400 Subject: [PATCH] 0.12.0 --- Dockerfile | 6 ++-- README.md | 12 +++++-- backend/idefics2.py | 58 ++++++++++++++++++++++++++++++ model_conf_tests.json | 6 ++-- requirements.txt | 21 +++++++---- test_models.py | 12 +++---- vision-alt.sample.env | 84 +++++++++++++++++++++---------------------- vision.sample.env | 84 ++++++++++++++++++++++--------------------- vision_qna.py | 35 +++++++++++++----- 9 files changed, 206 insertions(+), 112 deletions(-) create mode 100644 backend/idefics2.py diff --git a/Dockerfile b/Dockerfile index d70d47f..1c6e382 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,11 +10,9 @@ RUN git clone https://github.com/dvlab-research/MGM.git --single-branch /app/MGM WORKDIR /app COPY requirements.txt . ARG VERSION=latest -# transformers==4.36.2 supports most models except MGM-2B, llava-1.6, nanollava -RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.36.2" >> requirements.txt; else echo "transformers>=4.39.0" >> requirements.txt ; fi +RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.36.2" >> requirements.txt; else echo "transformers>=4.39.0\nautoawq" >> requirements.txt ; fi # TODO: nvidia apex wheel -RUN pip install --no-cache-dir -U -r requirements.txt \ - https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.7/flash_attn-2.5.7+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl +RUN --mount=type=cache,target=/root/.cache/pip pip install -U -r requirements.txt WORKDIR /app/MGM RUN pip install --no-cache-dir --no-deps -e . diff --git a/README.md b/README.md index 04fa502..05a1ac0 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,9 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview` - - [X] [XComposer2-7b-4bit](https://huggingface.co/internlm/internlm-xcomposer2-7b-4bit) (not recommended) - - [X] [XComposer2-VL](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b) [pretrain] (wont gpu split) - - [X] [XComposer2-VL-4bit](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b-4bit) +- [X] [HuggingFaceM4/idefics2](https://huggingface.co/HuggingFaceM4) +- - [X] [idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b) (main docker only, wont gpu split) +- - [X] [idefics2-8b-AWQ](https://huggingface.co/HuggingFaceM4/idefics2-8b-AWQ) (main docker only, wont gpu split) - [X] [LlavaNext](https://huggingface.co/llava-hf) (main docker only) - - [X] [llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) (main docker only) - - [X] [llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (main docker only) @@ -62,6 +65,11 @@ See: [OpenVLM Leaderboard](https://huggingface.co/spaces/opencompass/open_vlm_le ## Recent updates +Version: 0.12.0 + +- new model support: HuggingFaceM4/idefics2-8b, HuggingFaceM4/idefics2-8b-AWQ +- Fix: remove prompt from output of InternVL-Chat-V1-5 + Version: 0.11.0 - new model support: OpenGVLab/InternVL-Chat-V1-5, up to 4k resolution, top opensource model @@ -117,9 +125,9 @@ docker compose -f docker-compose.alt.yml pull ```shell # install the python dependencies -pip install -r requirements.txt "transformers>=4.39.0" +pip install -U -r requirements.txt "transformers>=4.39.0" autoawq # OR install the python dependencies for the alt version -pip install -r requirements.txt "transformers==4.36.2" +pip install -U -r requirements.txt "transformers==4.36.2" # run the server with your chosen model python vision.py 
--model vikhyatk/moondream2 ``` diff --git a/backend/idefics2.py b/backend/idefics2.py new file mode 100644 index 0000000..c547750 --- /dev/null +++ b/backend/idefics2.py @@ -0,0 +1,58 @@ +from transformers import AutoProcessor, AutoModelForVision2Seq +from transformers import AwqConfig + +from vision_qna import * + +# "HuggingFaceM4/idefics2-8b" +# "HuggingFaceM4/idefics2-8b-AWQ" + +class VisionQnA(VisionQnABase): + model_name: str = "idefics2" + + def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None): + super().__init__(model_id, device, device_map, extra_params, format) + + #do_image_splitting=False + #size= {"longest_edge": 448, "shortest_edge": 378} + self.processor = AutoProcessor.from_pretrained(model_id) + + if '-awq' in model_id.lower(): + """ + # This is from https://huggingface.co/HuggingFaceM4/idefics2-8b + # It doesn't work + quantization_config = AwqConfig( + bits=4, + fuse_max_seq_len=4096, + modules_to_fuse={ + "attention": ["q_proj", "k_proj", "v_proj", "o_proj"], + "mlp": ["gate_proj", "up_proj", "down_proj"], + "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"], + "use_alibi": False, + "num_attention_heads": 32, + "num_key_value_heads": 8, + "hidden_size": 4096, + } + ) + self.params['quantization_config'] = quantization_config + """ + + if self.params['torch_dtype'] == torch.bfloat16: + self.params['torch_dtype'] = torch.float16 + + self.model = AutoModelForVision2Seq.from_pretrained(**self.params).to(self.device) + + print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}") + + async def chat_with_images(self, request: ImageChatRequest) -> str: + images, hfmessages = await images_hfmessages_from_messages(request.messages) + + prompt = self.processor.apply_chat_template(hfmessages, add_generation_prompt=True) + inputs = self.processor(text=prompt, images=images, return_tensors="pt") + inputs = {k: v.to(self.model.device) for k, v in inputs.items()} + + # Generate + params = self.get_generation_params(request) + generated_ids = self.model.generate(**inputs, **params) + generated_texts = self.processor.decode(generated_ids[0][inputs['input_ids'].size(1):].cpu(), skip_special_tokens=True) + + return generated_texts \ No newline at end of file diff --git a/model_conf_tests.json b/model_conf_tests.json index 37fc754..74e7ba0 100644 --- a/model_conf_tests.json +++ b/model_conf_tests.json @@ -2,7 +2,8 @@ ["vikhyatk/moondream2", "--use-flash-attn"], ["vikhyatk/moondream1"], ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0"], - ["qnguyen3/nanoLLaVA", "--use-flash-attn"], + ["HuggingFaceM4/idefics2-8b", "--use-flash-attn", "--device-map", "cuda:0"], + ["qnguyen3/nanoLLaVA", "--use-flash-attn", "--device-map", "cuda:0"], ["echo840/Monkey"], ["echo840/Monkey-Chat"], ["THUDM/cogvlm-chat-hf"], @@ -26,7 +27,8 @@ ["OpenGVLab/InternVL-Chat-V1-5-Int8", "--device-map", "cuda:0"], ["OpenGVLab/InternVL-Chat-V1-5", "--load-in-4bit", "--device-map", "cuda:0"], - ["qnguyen3/nanoLLaVA", "--use-flash-attn", "--load-in-4bit"], + ["HuggingFaceM4/idefics2-8b-AWQ", "--use-flash-attn", "--device-map", "cuda:0"], + ["qnguyen3/nanoLLaVA", "--use-flash-attn", "--load-in-4bit", "--device-map", "cuda:0"], ["THUDM/cogvlm-chat-hf", "--load-in-4bit"], ["THUDM/cogagent-chat-hf", "--load-in-4bit"], ["internlm/internlm-xcomposer2-7b-4bit", "--use-flash-attn"], diff --git a/requirements.txt b/requirements.txt index a0d8feb..df290e0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,13 @@ accelerate 
auto_gptq bitsandbytes fastapi -flash_attn +# See: https://github.com/bdashore3/flash-attention/releases for other windows flash_attn releases +# And: https://github.com/Dao-AILab/flash-attention/releases for linux. +https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/bdashore3/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/bdashore3/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +flash_attn; python_version != "3.10" and python_version != "3.11" openai peft protobuf @@ -10,12 +16,12 @@ pydantic python-datauri requests sentencepiece -torch>=2.2.0 +torch==2.2.* uvicorn xformers # moondream -deepspeed==0.11.1 +deepspeed<0.14.0 einops einops-exts httpx @@ -36,8 +42,11 @@ transformers_stream_generator loguru sse_starlette -#latest +# alt +#transformers==4.36.2 + +# latest #transformers>=4.39.0 +# idefics2 +#autoawq -#alt -#transformers==4.36.2 diff --git a/test_models.py b/test_models.py index 5825861..b1864a4 100755 --- a/test_models.py +++ b/test_models.py @@ -20,8 +20,8 @@ 'leaf': 'https://images.freeimages.com/images/large-previews/cd7/gingko-biloba-1058537.jpg', } -green_pass = '\033[92mpass\033[0m' -red_fail = '\033[91mfail\033[0m' +green_pass = '\033[92mpass\033[0m✅' +red_fail = '\033[91mfail\033[0m❌' def data_url_from_url(img_url: str) -> str: @@ -48,7 +48,7 @@ def record_result(cmd_args, results, t, mem, note): 'note': note }]) result = all(results) - print(f"#CLI_COMMAND=\"python vision.py -m {' '.join(cmd_args)}\" # test {'pass' if result else 'fail'}, time: {t:.1f}s, mem: {mem:.1f}GB, {note}") + print(f"#CLI_COMMAND=\"python vision.py -m {' '.join(cmd_args)}\" # test {green_pass if result else red_fail}, time: {t:.1f}s, mem: {mem:.1f}GB, {note}") torch_memory_baseline = 0 @@ -115,8 +115,8 @@ def test(cmd_args: list[str]) -> int: mem = get_total_gpu_mem_used() result = all(results) - if result: - note = 'All tests passed.' + if not note: + note = f'{results.count(True)}/{len(results)} tests passed.' print(f"\n\n###\n\nTest complete.\nResult: {green_pass if result else red_fail}, time: {t:.1f}s") @@ -233,4 +233,4 @@ def single_round(): for r in all_results: cmdl = ' '.join(r['args']) result = all(r['results']) - print(f"#CLI_COMMAND=\"python vision.py -m {cmdl}\" # test {'pass' if result else 'fail'}, time: {r['time']:.1f}s, mem: {r['mem']:.1f}GB, {r['note']}") + print(f"#CLI_COMMAND=\"python vision.py -m {cmdl}\" # test {green_pass if result else red_fail}, time: {r['time']:.1f}s, mem: {r['mem']:.1f}GB, {r['note']}") diff --git a/vision-alt.sample.env b/vision-alt.sample.env index 0cd6628..fa4aa08 100644 --- a/vision-alt.sample.env +++ b/vision-alt.sample.env @@ -2,45 +2,45 @@ # Copy this file to vision.env and uncomment the model of your choice. 
HF_HOME=hf_home #CUDA_VISIBLE_DEVICES=1,0 -#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass, time: 4.8s, mem: 4.6GB, All tests passed. -#CLI_COMMAND="python vision.py -m vikhyatk/moondream1" # test pass, time: 4.1s, mem: 4.9GB, All tests passed. -#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass, time: 6.1s, mem: 21.9GB, All tests passed. -#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass, time: 7.7s, mem: 21.8GB, All tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass, time: 13.5s, mem: 52.2GB, All tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass, time: 13.8s, mem: 36.5GB, All tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass, time: 15.4s, mem: 37.4GB, All tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass, time: 4.7s, mem: 19.7GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test fail, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --use-flash-attn" # test pass, time: 5.6s, mem: 15.8GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --use-flash-attn" # test pass, time: 14.8s, mem: 19.1GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --use-flash-attn" # test pass, time: 22.1s, mem: 27.9GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --use-flash-attn" # test pass, time: 17.1s, mem: 31.9GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --use-flash-attn" # test pass, time: 11.2s, mem: 67.5GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --use-flash-attn" # test pass, time: 77.9s, mem: 70.6GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --use-flash-attn" # test pass, time: 14.9s, mem: 91.7GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --use-flash-attn" # test pass, time: 20.0s, mem: 96.4GB, All tests passed. -#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass, time: 15.9s, mem: 25.1GB, All tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass, time: 15.9s, mem: 19.2GB, All tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass, time: 17.6s, mem: 20.3GB, All tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass, time: 7.1s, mem: 11.6GB, All tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass, time: 6.6s, mem: 7.7GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --use-flash-attn --device-map cuda:0" # test fail, time: 2.1s, mem: 15.9GB, -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass, time: 6.2s, mem: 14.6GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass, time: 7.1s, mem: 27.0GB, All tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass, time: 28.2s, mem: 31.3GB, All tests passed. 
-#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --load-in-4bit --device-map cuda:0" # test fail, time: 18.3s, mem: 18.3GB, -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass, time: 21.5s, mem: 12.2GB, All tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass, time: 20.8s, mem: 12.2GB, All tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn --device cuda:0" # test pass, time: 10.4s, mem: 9.7GB, All tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn --device cuda:0" # test pass, time: 12.9s, mem: 11.1GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --load-in-4bit --use-flash-attn" # test fail, time: 2.6s, mem: 6.1GB, -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --load-in-4bit --use-flash-attn" # test pass, time: 10.0s, mem: 5.5GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --load-in-4bit --use-flash-attn" # test pass, time: 11.3s, mem: 8.9GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --load-in-4bit --use-flash-attn" # test pass, time: 12.5s, mem: 6.6GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --load-in-4bit --use-flash-attn" # test pass, time: 39.7s, mem: 9.8GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --load-in-4bit --use-flash-attn" # test pass, time: 41.2s, mem: 9.9GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --load-in-4bit --use-flash-attn" # test pass, time: 17.6s, mem: 13.9GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --load-in-4bit --use-flash-attn" # test pass, time: 18.2s, mem: 21.4GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --load-in-4bit --use-flash-attn" # test pass, time: 222.7s, mem: 24.4GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --load-in-4bit --use-flash-attn" # test pass, time: 22.6s, mem: 26.5GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --load-in-4bit --use-flash-attn" # test pass, time: 26.1s, mem: 29.7GB, All tests passed. \ No newline at end of file +#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 4.6s, mem: 4.6GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m vikhyatk/moondream1" # test pass✅, time: 4.3s, mem: 4.9GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 6.2s, mem: 21.8GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 7.8s, mem: 21.8GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 13.5s, mem: 52.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 14.4s, mem: 36.2GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 15.1s, mem: 37.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 5.0s, mem: 19.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --use-flash-attn" # test pass✅, time: 5.8s, mem: 15.5GB, 8/8 tests passed. 
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --use-flash-attn" # test pass✅, time: 16.0s, mem: 18.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --use-flash-attn" # test pass✅, time: 20.0s, mem: 27.5GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --use-flash-attn" # test pass✅, time: 18.4s, mem: 31.6GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --use-flash-attn" # test pass✅, time: 11.3s, mem: 67.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --use-flash-attn" # test pass✅, time: 96.4s, mem: 70.5GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --use-flash-attn" # test pass✅, time: 16.0s, mem: 91.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --use-flash-attn" # test pass✅, time: 18.6s, mem: 96.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 14.9s, mem: 25.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 16.8s, mem: 19.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 15.8s, mem: 20.2GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.0s, mem: 11.5GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.2s, mem: 7.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --use-flash-attn --device-map cuda:0" # test fail❌, time: 2.1s, mem: 15.7GB, 0/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.0s, mem: 14.5GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.1s, mem: 26.8GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 27.9s, mem: 31.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --load-in-4bit --device-map cuda:0" # test fail❌, time: 17.6s, mem: 18.0GB, 2/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 20.5s, mem: 12.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 21.2s, mem: 11.9GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn --device cuda:0" # test pass✅, time: 9.6s, mem: 9.2GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn --device cuda:0" # test pass✅, time: 12.4s, mem: 10.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --load-in-4bit --use-flash-attn" # test fail❌, time: 2.5s, mem: 5.8GB, 0/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 10.0s, mem: 5.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 10.9s, mem: 8.9GB, 8/8 tests passed. 
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --load-in-4bit --use-flash-attn" # test pass✅, time: 9.3s, mem: 6.5GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 37.7s, mem: 9.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --load-in-4bit --use-flash-attn" # test pass✅, time: 37.2s, mem: 9.9GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 22.0s, mem: 13.9GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --load-in-4bit --use-flash-attn" # test pass✅, time: 18.2s, mem: 21.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 224.7s, mem: 24.2GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --load-in-4bit --use-flash-attn" # test pass✅, time: 23.8s, mem: 26.3GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 25.1s, mem: 29.5GB, 8/8 tests passed. \ No newline at end of file diff --git a/vision.sample.env b/vision.sample.env index cc19dc1..e87cc25 100644 --- a/vision.sample.env +++ b/vision.sample.env @@ -2,44 +2,46 @@ # Copy this file to vision.env and uncomment the model of your choice. HF_HOME=hf_home #CUDA_VISIBLE_DEVICES=1,0 -#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass, time: 7.3s, mem: 4.6GB, All tests passed. -#CLI_COMMAND="python vision.py -m vikhyatk/moondream1" # test fail, time: 4.2s, mem: 4.9GB, Test failed with Exception: Internal Server Error -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass, time: 18.7s, mem: 52.0GB, All tests passed. -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn" # test pass, time: 11.4s, mem: 8.4GB, All tests passed. -#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass, time: 10.0s, mem: 21.8GB, All tests passed. -#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass, time: 13.2s, mem: 21.7GB, All tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass, time: 17.0s, mem: 36.1GB, All tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass, time: 19.9s, mem: 36.9GB, All tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass, time: 8.2s, mem: 19.3GB, All tests passed. -#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass, time: 18.9s, mem: 24.8GB, All tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass, time: 23.7s, mem: 18.8GB, All tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass, time: 20.7s, mem: 20.0GB, All tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass, time: 9.7s, mem: 11.3GB, All tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass, time: 12.7s, mem: 7.4GB, All tests passed. 
-#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --use-flash-attn --device-map cuda:0" # test fail, time: 6.8s, mem: 15.5GB, -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass, time: 8.5s, mem: 14.3GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass, time: 12.6s, mem: 26.6GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn" # test pass, time: 16.1s, mem: 19.0GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn" # test pass, time: 13.3s, mem: 18.7GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn" # test pass, time: 12.4s, mem: 32.3GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn" # test pass, time: 52.0s, mem: 72.3GB, All tests passed. -#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-6B --use-flash-attn" # test fail, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). -#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-34B --use-flash-attn" # test fail, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test pass, time: 7.9s, mem: 8.2GB, All tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass, time: 27.1s, mem: 31.8GB, All tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --load-in-4bit --device-map cuda:0" # test fail, time: 20.1s, mem: 18.0GB, -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --load-in-4bit" # test pass, time: 14.5s, mem: 7.8GB, All tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass, time: 21.3s, mem: 12.0GB, All tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass, time: 24.7s, mem: 12.0GB, All tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn" # test pass, time: 13.2s, mem: 9.2GB, All tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn" # test pass, time: 19.2s, mem: 10.7GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --load-in-4bit --use-flash-attn" # test fail, time: 5.0s, mem: 5.7GB, -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --load-in-4bit --use-flash-attn" # test pass, time: 11.0s, mem: 5.5GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --load-in-4bit --use-flash-attn" # test pass, time: 15.8s, mem: 8.9GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --load-in-4bit --use-flash-attn" # test pass, time: 26.6s, mem: 8.9GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --load-in-4bit --use-flash-attn" # test pass, time: 15.5s, mem: 9.6GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --load-in-4bit --use-flash-attn" # test pass, time: 13.3s, mem: 14.5GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --load-in-4bit --use-flash-attn" # test pass, time: 54.8s, mem: 26.0GB, All tests passed. 
-#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-6B --load-in-4bit --use-flash-attn" # test fail, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). -#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-34B --load-in-4bit --use-flash-attn" # test fail, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn --load-in-4bit" # test pass, time: 7.4s, mem: 5.6GB, All tests passed. \ No newline at end of file +#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 4.1s, mem: 4.6GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m vikhyatk/moondream1" # test fail❌, time: 3.6s, mem: 4.8GB, Test failed with Exception: Internal Server Error +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 13.5s, mem: 51.9GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.1s, mem: 22.3GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.9s, mem: 8.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 6.5s, mem: 21.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 8.2s, mem: 21.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 14.1s, mem: 36.2GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 15.3s, mem: 37.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 5.0s, mem: 19.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 14.4s, mem: 24.8GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 16.6s, mem: 18.9GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 16.5s, mem: 20.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.9s, mem: 11.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.1s, mem: 7.5GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --use-flash-attn --device-map cuda:0" # test fail❌, time: 2.0s, mem: 15.6GB, 0/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.1s, mem: 14.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.8s, mem: 26.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn" # test pass✅, time: 12.8s, mem: 19.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn" # test pass✅, time: 8.6s, mem: 18.9GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn" # test pass✅, time: 9.6s, mem: 32.6GB, 8/8 tests passed. 
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn" # test pass✅, time: 46.2s, mem: 72.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-6B --use-flash-attn" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). +#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-34B --use-flash-attn" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test pass✅, time: 4.5s, mem: 8.3GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 27.6s, mem: 32.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --load-in-4bit --device-map cuda:0" # test fail❌, time: 17.0s, mem: 18.2GB, 2/8 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 8.2s, mem: 12.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --load-in-4bit --device-map cuda:0" # test pass✅, time: 11.2s, mem: 7.6GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 19.4s, mem: 12.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 20.6s, mem: 12.2GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn" # test pass✅, time: 11.2s, mem: 9.5GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn" # test pass✅, time: 11.3s, mem: 10.9GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --load-in-4bit --use-flash-attn" # test fail❌, time: 2.5s, mem: 5.9GB, 0/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 9.3s, mem: 5.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 10.2s, mem: 9.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 19.1s, mem: 9.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 13.6s, mem: 9.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 12.2s, mem: 14.8GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 54.1s, mem: 26.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-6B --load-in-4bit --use-flash-attn" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). +#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-34B --load-in-4bit --use-flash-attn" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn --load-in-4bit" # test pass✅, time: 7.4s, mem: 5.7GB, 8/8 tests passed. 
\ No newline at end of file diff --git a/vision_qna.py b/vision_qna.py index c74e534..f2cfee5 100644 --- a/vision_qna.py +++ b/vision_qna.py @@ -7,6 +7,7 @@ from typing import Optional, List, Literal from pydantic import BaseModel from transformers import BitsAndBytesConfig +from transformers.image_utils import load_image class ImageURL(BaseModel): url: str @@ -116,14 +117,7 @@ def get_generation_params(self, request: ImageChatRequest, default_params = {}) return params async def url_to_image(img_url: str) -> Image.Image: - if img_url.startswith('http'): - response = requests.get(img_url) - - img_data = response.content - elif img_url.startswith('data:'): - img_data = DataURI(img_url).data - - return Image.open(io.BytesIO(img_data)).convert("RGB") + return load_image(img_url) async def url_to_file(img_url: str) -> str: if img_url.startswith('data:'): @@ -139,6 +133,25 @@ async def url_to_file(img_url: str) -> str: f.write(response.content) return filename +async def images_hfmessages_from_messages(messages: list[Message], url_handler = url_to_image): + hfmessages = [] + images = [] + + for m in messages: + content = [] + for c in m.content: + if c.type == 'image_url': + image = await url_handler(c.image_url.url) + images.extend([image]) + content.extend([{"type": "image"}]) + elif c.type == 'text': + content.extend([{'type': 'text', 'text': c.text}]) + + hfmessages.extend([{'role': m.role, 'content': content}]) + + return images, hfmessages + + async def phi15_prompt_from_messages(messages: list[Message], img_tok = "", img_end = ''): # prompt = '' images = [] @@ -464,4 +477,8 @@ def guess_backend(model_name: str) -> str: return 'fuyu' if 'internvl-chat-v1-5' in model_id: - return 'internvl-chat-v1-5' \ No newline at end of file + return 'internvl-chat-v1-5' + + if 'idefics2' in model_id: + return 'idefics2' + \ No newline at end of file
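
Below is a minimal sketch of how a client could exercise the new idefics2 backend once this patch is applied and the server is started (for example with `python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0`, matching the test config above). It uses the standard `openai` Python client against the server's OpenAI-compatible chat API. The `base_url` port and the placeholder `api_key` are assumptions for illustration, not values taken from this patch; adjust them to your deployment. The image URL is the `leaf` test image from `test_models.py`.

```python
# Minimal client-side check of the new idefics2 backend.
# Assumptions: the server is reachable at http://localhost:5006/v1 and does not
# validate the api_key; change base_url/api_key to match your deployment.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:5006/v1", api_key="skip")

response = client.chat.completions.create(
    model="HuggingFaceM4/idefics2-8b",  # whichever model the server was started with
    max_tokens=128,
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {
                "url": "https://images.freeimages.com/images/large-previews/cd7/gingko-biloba-1058537.jpg"}},
        ],
    }],
)
print(response.choices[0].message.content)
```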