From d087b6c275e73ea6551b9b1fc77f7793b2ed649c Mon Sep 17 00:00:00 2001
From: matatonic
Date: Wed, 19 Jun 2024 13:28:35 -0400
Subject: [PATCH] 0.25.0 +Florence

---
 README.md             |   7 ++
 backend/emu.py        |   4 +-
 backend/florence.py   |  63 +++++++++++++
 model_conf_tests.json |   4 +
 openedai.py           | 103 +++++++++++++++++++-
 requirements.txt      |   1 +
 vision.py             |   5 +-
 vision.sample.env     | 213 +++++++++++++++++++++---------------------
 vision_qna.py         |  22 ++++-
 9 files changed, 310 insertions(+), 112 deletions(-)
 create mode 100644 backend/florence.py

diff --git a/README.md b/README.md
index 4c5c2e7..2db5da9 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,8 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview`
 - - [X] [idefics2-8b-chatty-AWQ](https://huggingface.co/HuggingFaceM4/idefics2-8b-chatty-AWQ) (wont gpu split)
 - [X] [Microsoft](https://huggingface.co/microsoft/)
 - - [X] [Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)
+- - [X] [Florence-2-large-ft](https://huggingface.co/microsoft/Florence-2-large-ft) (wont gpu split)
+- - [X] [Florence-2-base-ft](https://huggingface.co/microsoft/Florence-2-base-ft) (wont gpu split)
 - [X] [failspy](https://huggingface.co/failspy)
 - - [X] [Phi-3-vision-128k-instruct-abliterated-alpha](https://huggingface.co/failspy/Phi-3-vision-128k-instruct-abliterated-alpha)
 - [X] [qihoo360](https://huggingface.co/qihoo360)
@@ -103,6 +105,11 @@ See: [OpenVLM Leaderboard](https://huggingface.co/spaces/opencompass/open_vlm_le
 ## Recent updates
 
+Version 0.25.0
+
+- New model support: microsoft/Florence family of models. Not a chat model, but simple questions are ok and all task commands are functional, e.g. "<CAPTION>", "<DETAILED_CAPTION>", "<OCR>", etc.
+- Improved error handling & logging
+
 Version 0.24.1
 
 - Compatibility: Support generation without images for most models.
 (llava based models still require an image)
diff --git a/backend/emu.py b/backend/emu.py
index 2e389e4..f914c8f 100644
--- a/backend/emu.py
+++ b/backend/emu.py
@@ -2,7 +2,7 @@
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
 from huggingface_hub import snapshot_download
-
+from loguru import logger
 from vision_qna import *
 
 # BAAI/Emu2-Chat
@@ -36,7 +36,7 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p
         self.model = load_checkpoint_and_dispatch(self.model, checkpoint=checkpoint, device_map=device_map).eval()
 
         # self.model.device/dtype are overloaded with some other object
-        print(f"Loaded {model_id} on device: {self.device} with dtype: {self.params['torch_dtype']}")
+        logger.info(f"Loaded {model_id} on device: {self.device} with dtype: {self.params['torch_dtype']}")
 
     async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
         images, prompt, system = await emu_images_prompt_system_from_messages(request.messages)
diff --git a/backend/florence.py b/backend/florence.py
new file mode 100644
index 0000000..9df169c
--- /dev/null
+++ b/backend/florence.py
@@ -0,0 +1,63 @@
+from transformers import AutoProcessor, AutoModelForCausalLM
+
+from vision_qna import *
+
+# microsoft/Florence-2-large-ft
+# microsoft/Florence-2-base-ft
+
+def select_task(prompt):
+    tasks = ["<CAPTION>", "<DETAILED_CAPTION>", "<MORE_DETAILED_CAPTION>", "<OCR>", # simple tasks
+             "<OCR_WITH_REGION>", "<OD>", "<DENSE_REGION_CAPTION>", "<REGION_PROPOSAL>",
+             "<CAPTION_TO_PHRASE_GROUNDING>", "<REFERRING_EXPRESSION_SEGMENTATION>", "<REGION_TO_SEGMENTATION>",
+             "<OPEN_VOCABULARY_DETECTION>", "<REGION_TO_CATEGORY>", "<REGION_TO_DESCRIPTION>"
+    ]
+    for task in tasks:
+        if task in prompt:
+            return task
+
+    return None
+
+class VisionQnA(VisionQnABase):
+    model_name: str = "florence"
+    format: str = "florence"
+    visual_layers: List[str] = ['vision_tower', 'image_proj_norm', 'image_pos_embed', 'visual_temporal_embed']
+
+    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
+        super().__init__(model_id, device, device_map, extra_params, format)
+
+        if not format:
+            self.format = guess_model_format(model_id)
+
+        self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
+        self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval()
+
+        # bitsandbytes already moves the model to the device, so we don't need to do it again.
+        if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
+            self.model = self.model.to(self.device)
+
+        self.loaded_banner()
+
+    async def chat_with_images(self, request: ImageChatRequest) -> str:
+        images, prompt = await prompt_from_messages(request.messages, self.format)
+
+        inputs = self.processor(text=prompt, images=images[0], return_tensors="pt").to(device=self.model.device, dtype=self.model.dtype)
+
+        default_params = {
+            'do_sample': False,
+            'num_beams': 3,
+        }
+
+        params = self.get_generation_params(request, default_params=default_params)
+
+        generation_kwargs = dict(
+            **inputs,
+            **params,
+        )
+
+        generated_ids = self.model.generate(**generation_kwargs)
+        generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+        parsed_answer = self.processor.post_process_generation(generated_text, task=select_task(prompt), image_size=(images[0].width, images[0].height))
+
+        for k, v in parsed_answer.items():
+            return str(v)
+
diff --git a/model_conf_tests.json b/model_conf_tests.json
index a4b9db3..af5167f 100644
--- a/model_conf_tests.json
+++ b/model_conf_tests.json
@@ -83,6 +83,10 @@
     ["llava-hf/llava-v1.6-vicuna-13b-hf", "--use-flash-attn"],
     ["llava-hf/llava-v1.6-vicuna-7b-hf", "--use-flash-attn", "--load-in-4bit"],
     ["llava-hf/llava-v1.6-vicuna-7b-hf", "--use-flash-attn"],
+    ["microsoft/Florence-2-base-ft", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"],
+    ["microsoft/Florence-2-base-ft", "--use-flash-attn", "--device-map", "cuda:0"],
+    ["microsoft/Florence-2-large-ft", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"],
+    ["microsoft/Florence-2-large-ft", "--use-flash-attn", "--device-map", "cuda:0"],
     ["microsoft/Phi-3-vision-128k-instruct", "--use-flash-attn", "--load-in-4bit"],
     ["microsoft/Phi-3-vision-128k-instruct", "--use-flash-attn"],
    ["openbmb/MiniCPM-Llama3-V-2_5", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"],
diff --git a/openedai.py b/openedai.py
index 36beca6..0fe1af7 100644
--- a/openedai.py
+++ b/openedai.py
@@ -1,13 +1,72 @@
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import PlainTextResponse
+from fastapi.responses import PlainTextResponse, JSONResponse
 from loguru import logger
 
+class OpenAIError(Exception):
+    pass
+
+class APIError(OpenAIError):
+    message: str
+    code: int = None
+    param: str = None
+    type: str = None
+
+    def __init__(self, message: str, code: int = 500, param: str = None, internal_message: str = ''):
+        super().__init__(message)
+        self.message = message
+        self.code = code
+        self.param = param
+        self.type = self.__class__.__name__
+        self.internal_message = internal_message
+
+    def __repr__(self):
+        return "%s(message=%r, code=%d, param=%s)" % (
+            self.__class__.__name__,
+            self.message,
+            self.code,
+            self.param,
+        )
+
+class InternalServerError(APIError):
+    pass
+
+class ServiceUnavailableError(APIError):
+    def __init__(self, message="Service unavailable, please try again later.", code=503, internal_message=''):
+        super().__init__(message, code, internal_message=internal_message)
+
+class APIStatusError(APIError):
+    status_code: int = 400
+
+    def __init__(self, message: str, param: str = None, internal_message: str = ''):
+        super().__init__(message, self.status_code, param, internal_message)
+
+class BadRequestError(APIStatusError):
+    status_code: int = 400
+
+class AuthenticationError(APIStatusError):
+    status_code: int = 401
+
+class PermissionDeniedError(APIStatusError):
+    status_code: 
int = 403 + +class NotFoundError(APIStatusError): + status_code: int = 404 + +class ConflictError(APIStatusError): + status_code: int = 409 + +class UnprocessableEntityError(APIStatusError): + status_code: int = 422 + +class RateLimitError(APIStatusError): + status_code: int = 429 + class OpenAIStub(FastAPI): def __init__(self, **kwargs) -> None: super().__init__(**kwargs) self.models = {} - + self.add_middleware( CORSMiddleware, allow_origins=["*"], @@ -16,6 +75,46 @@ def __init__(self, **kwargs) -> None: allow_headers=["*"] ) + @self.exception_handler(Exception) + def openai_exception_handler(request: Request, exc: Exception) -> JSONResponse: + # Generic server errors + #logger.opt(exception=exc).error("Logging exception traceback") + + return JSONResponse(status_code=500, content={ + 'message': 'InternalServerError', + 'code': 500, + }) + + @self.exception_handler(APIError) + def openai_apierror_handler(request: Request, exc: APIError) -> JSONResponse: + # Server error + logger.opt(exception=exc).error("Logging exception traceback") + + if exc.internal_message: + logger.info(exc.internal_message) + + return JSONResponse(status_code = exc.code, content={ + 'message': exc.message, + 'code': exc.code, + 'type': exc.__class__.__name__, + 'param': exc.param, + }) + + @self.exception_handler(APIStatusError) + def openai_statuserror_handler(request: Request, exc: APIStatusError) -> JSONResponse: + # Client side error + logger.info(repr(exc)) + + if exc.internal_message: + logger.info(exc.internal_message) + + return JSONResponse(status_code = exc.code, content={ + 'message': exc.message, + 'code': exc.code, + 'type': exc.__class__.__name__, + 'param': exc.param, + }) + @self.middleware("http") async def log_requests(request: Request, call_next): logger.debug(f"Request path: {request.url.path}") diff --git a/requirements.txt b/requirements.txt index 3541f2a..db351c9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,6 +11,7 @@ https://github.com/bdashore3/flash-attention/releases/download/v2.5.9.post1/flas flash_attn; python_version != "3.10" and python_version != "3.11" hf_transfer loguru +numpy<2 openai peft protobuf diff --git a/vision.py b/vision.py index d27905b..b058c11 100644 --- a/vision.py +++ b/vision.py @@ -108,8 +108,7 @@ async def streamer(): } } - if os.environ.get('OPENEDAI_DEBUG', False): - print(f'Response: {vis_chat_resp}') + logger.debug(f'Response: {vis_chat_resp}') return vis_chat_resp @@ -141,7 +140,7 @@ def parse_args(argv=None): if not args.backend: args.backend = guess_backend(args.model) - print(f"Loading VisionQnA[{args.backend}] with {args.model}") + logger.info(f"Loading VisionQnA[{args.backend}] with {args.model}") backend = importlib.import_module(f'backend.{args.backend}') extra_params = {} diff --git a/vision.sample.env b/vision.sample.env index d69c7a7..5cfded1 100644 --- a/vision.sample.env +++ b/vision.sample.env @@ -4,109 +4,114 @@ HF_HOME=hf_home HF_HUB_ENABLE_HF_TRANSFER=1 #HF_TOKEN=hf-... #CUDA_VISIBLE_DEVICES=1,0 -#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 8.4s, mem: 8.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V" # test pass✅, time: 7.8s, mem: 19.6GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh --load-in-4bit" # test pass✅, time: 8.8s, mem: 9.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh" # test pass✅, time: 5.4s, mem: 10.9GB, 12/12 tests passed. 
-#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B --load-in-4bit" # test pass✅, time: 12.9s, mem: 8.3GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B" # test pass✅, time: 9.1s, mem: 11.8GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B-zh" # test pass✅, time: 8.0s, mem: 12.2GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B --load-in-4bit" # test pass✅, time: 9.3s, mem: 5.2GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B" # test pass✅, time: 6.6s, mem: 12.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B --load-in-4bit" # test pass✅, time: 11.2s, mem: 5.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B" # test pass✅, time: 9.6s, mem: 12.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 11.8s, mem: 9.5GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V" # test pass✅, time: 9.8s, mem: 20.0GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --load-in-4bit" # test pass✅, time: 28.6s, mem: 29.5GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 19.6s, mem: 71.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 11.5s, mem: 13.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 10.8s, mem: 22.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 12.9s, mem: 12.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 16.9s, mem: 13.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty --use-flash-attn --device-map cuda:0" # test pass✅, time: 13.6s, mem: 22.8GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 15.1s, mem: 13.0GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 20.5s, mem: 27.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40 --load-in-4bit" # test pass✅, time: 27.1s, mem: 30.2GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40" # test pass✅, time: 25.9s, mem: 54.8GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 19.7s, mem: 52.2GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 40.2s, mem: 31.6GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --load-in-4bit" # test pass✅, time: 5.9s, mem: 5.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 6.9s, mem: 6.9GB, 12/12 tests passed. 
-#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40" # test pass✅, time: 6.5s, mem: 8.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 5.7s, mem: 7.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --load-in-4bit" # test pass✅, time: 10.6s, mem: 6.8GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 13.8s, mem: 12.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40" # test pass✅, time: 13.3s, mem: 15.8GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5" # test pass✅, time: 10.1s, mem: 11.8GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat --load-in-4bit" # test pass✅, time: 10.0s, mem: 11.3GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 6.6s, mem: 19.6GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-r-v1" # test pass✅, time: 10.7s, mem: 10.2GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 26.8s, mem: 13.2GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 21.8s, mem: 37.2GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 29.0s, mem: 12.8GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 20.1s, mem: 36.3GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B --load-in-4bit" # test pass✅, time: 33.3s, mem: 22.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 33.1s, mem: 40.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B --load-in-4bit" # test pass✅, time: 144.9s, mem: 22.4GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 8.5s, mem: 8.7GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-Llama-3-8B-V" # test pass✅, time: 7.5s, mem: 19.5GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh --load-in-4bit" # test pass✅, time: 8.3s, mem: 9.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-2B-zh" # test pass✅, time: 5.3s, mem: 10.7GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B --load-in-4bit" # test pass✅, time: 12.8s, mem: 8.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B" # test pass✅, time: 9.0s, mem: 11.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-3B-zh" # test pass✅, time: 8.0s, mem: 12.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B --load-in-4bit" # test pass✅, time: 9.9s, mem: 5.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_0-4B" # test pass✅, time: 7.0s, mem: 12.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B --load-in-4bit" # test pass✅, time: 11.8s, mem: 5.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-4B" # test pass✅, time: 9.4s, mem: 13.0GB, 12/12 tests passed. 
+#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V --load-in-4bit" # test pass✅, time: 10.1s, mem: 9.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Bunny-v1_1-Llama-3-8B-V" # test pass✅, time: 10.0s, mem: 19.6GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --load-in-4bit" # test pass✅, time: 26.7s, mem: 29.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m BAAI/Emu2-Chat --max-memory=0:78GiB,1:20GiB" # test pass✅, time: 22.1s, mem: 71.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 11.8s, mem: 12.9GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 11.5s, mem: 22.5GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 13.5s, mem: 12.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 16.6s, mem: 12.9GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty --use-flash-attn --device-map cuda:0" # test pass✅, time: 13.7s, mem: 22.5GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-chatty-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 15.7s, mem: 12.7GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --load-in-4bit" # test pass✅, time: 21.1s, mem: 26.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40 --load-in-4bit" # test pass✅, time: 27.5s, mem: 29.9GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0 --max-tiles 40" # test pass✅, time: 26.5s, mem: 54.5GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 20.3s, mem: 51.9GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 41.0s, mem: 31.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --load-in-4bit" # test pass✅, time: 6.2s, mem: 5.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 7.6s, mem: 6.7GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --max-tiles 40" # test pass✅, time: 7.1s, mem: 8.7GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-2B-V1-5" # test pass✅, time: 6.4s, mem: 6.9GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --load-in-4bit" # test pass✅, time: 11.3s, mem: 6.6GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40 --load-in-4bit" # test pass✅, time: 14.0s, mem: 11.9GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --max-tiles 40" # test pass✅, time: 13.5s, mem: 15.6GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/Mini-InternVL-Chat-4B-V1-5" # test pass✅, time: 10.3s, mem: 11.5GB, 12/12 tests passed. 
+#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat --load-in-4bit" # test pass✅, time: 9.6s, mem: 11.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 6.5s, mem: 19.4GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m Salesforce/xgen-mm-phi3-mini-instruct-r-v1" # test pass✅, time: 10.4s, mem: 10.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 28.2s, mem: 13.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 23.0s, mem: 37.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 28.6s, mem: 12.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 21.1s, mem: 36.2GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B --load-in-4bit" # test pass✅, time: 33.4s, mem: 22.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chat-19B" # test pass✅, time: 32.2s, mem: 40.6GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B --load-in-4bit" # test pass✅, time: 144.1s, mem: 22.2GB, 12/12 tests passed. #CLI_COMMAND="python vision.py -m THUDM/cogvlm2-llama3-chinese-chat-19B" # test pass✅, time: 105.9s, mem: 40.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 77.8s, mem: 16.5GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0" # test pass✅, time: 53.5s, mem: 28.0GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.1s, mem: 11.5GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 9.2s, mem: 20.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.3s, mem: 7.6GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 9.7s, mem: 17.5GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.1s, mem: 8.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.7s, mem: 18.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test pass✅, time: 5.9s, mem: 8.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 15.5s, mem: 15.8GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 19.1s, mem: 24.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m echo840/Monkey --load-in-4bit" # test pass✅, time: 8.9s, mem: 15.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 9.2s, mem: 21.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat --load-in-4bit" # test pass✅, time: 13.2s, mem: 15.6GB, 12/12 tests passed. 
-#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 11.4s, mem: 21.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha --use-flash-attn --load-in-4bit" # test pass✅, time: 11.6s, mem: 7.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha --use-flash-attn" # test pass✅, time: 9.1s, mem: 12.3GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 2.1s, mem: 7.2GB, 0/12 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 19.2s, mem: 20.6GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.5s, mem: 6.4GB, 0/12 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 25.2s, mem: 19.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn" # test pass✅, time: 14.5s, mem: 9.5GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 2.2s, mem: 3.0GB, 0/12 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.6s, mem: 7.2GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 2.1s, mem: 6.5GB, 0/12 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 25.1s, mem: 20.2GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn" # test pass✅, time: 17.0s, mem: 11.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.7s, mem: 9.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 9.4s, mem: 26.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.5s, mem: 5.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 9.0s, mem: 14.6GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 63.5s, mem: 23.5GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn" # test pass✅, time: 69.9s, mem: 68.9GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 23.3s, mem: 8.2GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn" # test pass✅, time: 19.1s, mem: 17.3GB, 12/12 tests passed. 
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 15.1s, mem: 17.2GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 77.0s, mem: 16.2GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/glm-4v-9b --device-map cuda:0" # test pass✅, time: 54.1s, mem: 27.8GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.5s, mem: 11.3GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-Fuyu --device-map cuda:0" # test pass✅, time: 8.9s, mem: 20.1GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.4s, mem: 7.4GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-clip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 9.4s, mem: 17.2GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.8s, mem: 8.1GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m TIGER-Lab/Mantis-8B-siglip-llama3 --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.5s, mem: 17.8GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test pass✅, time: 5.9s, mem: 8.1GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0 --load-in-4bit" # test pass✅, time: 15.0s, mem: 15.7GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 18.9s, mem: 24.8GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m echo840/Monkey --load-in-4bit" # test pass✅, time: 8.8s, mem: 15.5GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 8.6s, mem: 21.6GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat --load-in-4bit" # test pass✅, time: 12.6s, mem: 15.5GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 11.8s, mem: 21.6GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha --use-flash-attn --load-in-4bit" # test pass✅, time: 11.6s, mem: 7.0GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m failspy/Phi-3-vision-128k-instruct-abliterated-alpha --use-flash-attn" # test pass✅, time: 9.9s, mem: 12.2GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 2.6s, mem: 7.1GB, 0/12 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-4khd-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 19.3s, mem: 20.5GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.7s, mem: 6.3GB, 0/12 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 27.3s, mem: 18.9GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn" # test pass✅, time: 14.2s, mem: 9.4GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.6s, mem: 2.9GB, 0/12 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-1_8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.8s, mem: 7.1GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0 --load-in-4bit" # test fail❌, time: 1.9s, mem: 6.4GB, 0/12 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 25.6s, mem: 20.1GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn" # test pass✅, time: 15.4s, mem: 10.8GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 13.6s, mem: 9.5GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 9.8s, mem: 26.7GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 9.5s, mem: 5.7GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 8.2s, mem: 14.4GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 63.8s, mem: 23.3GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn" # test pass✅, time: 70.5s, mem: 68.6GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 23.5s, mem: 8.1GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn" # test pass✅, time: 19.3s, mem: 17.4GB, 12/12 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 15.2s, mem: 17.2GB, 12/12 tests passed.
 #CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn" # test pass✅, time: 13.7s, mem: 33.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 13.8s, mem: 9.5GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn" # test pass✅, time: 12.9s, mem: 18.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct --use-flash-attn --load-in-4bit" # test pass✅, time: 11.3s, mem: 7.0GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct --use-flash-attn" # test pass✅, time: 9.1s, mem: 12.5GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 12.0s, mem: 9.7GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 --use-flash-attn --device-map cuda:0" # test pass✅, time: 10.3s, mem: 19.3GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.4s, mem: 4.9GB, 12/12 tests passed.
-#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.0s, mem: 8.7GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.9s, mem: 5.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 9.3s, mem: 9.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn --load-in-4bit" # test pass✅, time: 10.4s, mem: 8.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn" # test pass✅, time: 8.2s, mem: 17.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 11.4s, mem: 7.8GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.8s, mem: 8.2GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0 --load-in-4bit" # test pass✅, time: 10.3s, mem: 8.0GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 9.7s, mem: 17.3GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn --load-in-4bit" # test pass✅, time: 15.9s, mem: 17.1GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn" # test pass✅, time: 16.5s, mem: 32.4GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1 --load-in-4bit" # test pass✅, time: 11.1s, mem: 8.0GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1" # test pass✅, time: 9.9s, mem: 17.3GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1 --load-in-4bit" # test pass✅, time: 10.0s, mem: 8.0GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1" # test pass✅, time: 13.4s, mem: 17.3GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn --load-in-4bit" # test pass✅, time: 6.7s, mem: 3.0GB, 12/12 tests passed. -#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 5.3s, mem: 4.7GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn --load-in-4bit" # test pass✅, time: 14.1s, mem: 9.5GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn" # test pass✅, time: 13.1s, mem: 19.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.6s, mem: 1.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-base-ft --use-flash-attn --device-map cuda:0" # test pass✅, time: 2.8s, mem: 1.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 3.8s, mem: 1.5GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Florence-2-large-ft --use-flash-attn --device-map cuda:0" # test pass✅, time: 2.9s, mem: 2.3GB, 12/12 tests passed. 
+#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct --use-flash-attn --load-in-4bit" # test pass✅, time: 11.7s, mem: 7.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m microsoft/Phi-3-vision-128k-instruct --use-flash-attn" # test pass✅, time: 9.7s, mem: 12.2GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 11.5s, mem: 9.5GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-Llama3-V-2_5 --use-flash-attn --device-map cuda:0" # test pass✅, time: 11.3s, mem: 19.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 8.0s, mem: 4.7GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.7s, mem: 8.6GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 11.2s, mem: 5.2GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 8.9s, mem: 8.9GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn --load-in-4bit" # test pass✅, time: 10.7s, mem: 8.2GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m qihoo360/360VL-8B --use-flash-attn" # test pass✅, time: 8.4s, mem: 17.2GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0 --load-in-4bit" # test pass✅, time: 10.8s, mem: 7.7GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.3s, mem: 8.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0 --load-in-4bit" # test pass✅, time: 9.7s, mem: 8.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m qresearch/llama-3-vision-alpha-hf --device cuda:0" # test pass✅, time: 10.1s, mem: 17.2GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn --load-in-4bit" # test pass✅, time: 16.2s, mem: 17.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m tiiuae/falcon-11B-vlm --use-flash-attn" # test pass✅, time: 16.0s, mem: 32.3GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1 --load-in-4bit" # test pass✅, time: 11.4s, mem: 8.0GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-Med-v1" # test pass✅, time: 9.9s, mem: 17.2GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1 --load-in-4bit" # test pass✅, time: 9.9s, mem: 7.8GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m togethercomputer/Llama-3-8B-Dragonfly-v1" # test pass✅, time: 13.6s, mem: 17.1GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn --load-in-4bit" # test pass✅, time: 6.7s, mem: 2.9GB, 12/12 tests passed. +#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 5.4s, mem: 4.5GB, 12/12 tests passed. 
\ No newline at end of file
diff --git a/vision_qna.py b/vision_qna.py
index e9c3a38..cef6a19 100644
--- a/vision_qna.py
+++ b/vision_qna.py
@@ -89,7 +89,7 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p
         torch.set_grad_enabled(False)
 
     def loaded_banner(self):
-        print(f"Loaded {self._model_id} on device: {self.model.device} with dtype: {self.model.dtype}")
+        logger.info(f"Loaded {self._model_id} on device: {self.model.device} with dtype: {self.model.dtype}")
 
     def select_device(self):
         return 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
@@ -664,11 +664,27 @@ async def glm4v_prompt_from_messages(messages: list[Message], img_tok = "<|begin
 
     return images, prompt
 
+async def florence_prompt_from_messages(messages: list[Message], url_handler = url_to_image):
+    prompt = '' # "<CAPTION>", "<DETAILED_CAPTION>", "<MORE_DETAILED_CAPTION>", "<OCR>"
+    images = []
+
+    for m in messages:
+        for c in m.content:
+            if c.type == 'image_url':
+                images.extend([ await url_handler(c.image_url.url) ])
+
+        for c in m.content:
+            if c.type == 'text':
+                prompt = c.text # only one command at a time
+
+    return images, prompt
+
 
 async def prompt_from_messages(messages: list[Message], format: str) -> str:
     known_formats = {
         'chatml': chatml_prompt_from_messages,
         'falcon': falcon_prompt_from_messages,
+        'florence': florence_prompt_from_messages,
         'fuyu': fuyu_prompt_from_messages,
         'gemma': gemma_prompt_from_messages,
         'glm4v': glm4v_prompt_from_messages,
@@ -693,6 +709,7 @@ def guess_model_format(model_name: str) -> str:
     model_format_match_map = {
         'chatml': ['34b', 'yi-6b', 'nanollava', 'internvl-chat-v1-5', 'internvl-chat-2b'],
         'falcon': ['falcon'],
+        'florence': ['florence'],
         'fuyu': ['fuyu'],
         'gemma': ['gemma', '-2b'],
         'glm4v': ['glm-4v'],
@@ -769,6 +786,9 @@ def guess_backend(model_name: str) -> str:
     if 'fuyu' in model_id:
         return 'fuyu'
 
+    if 'florence' in model_id:
+        return 'florence'
+
     if 'internvl-chat' in model_id and '-v1-5' in model_id:
         return 'internvl-chat-v1-5'
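
Usage note: the Florence backend is driven by a single task token (or a short question) per request; `florence_prompt_from_messages` keeps only the last text part as the prompt, and `select_task` maps it to a Florence-2 task for post-processing. Below is a minimal sketch of how a client might exercise the new backend through the OpenAI-compatible chat API. The base URL, port, API key, model name and image URL are assumptions for illustration, not part of this patch; adjust them to your deployment.

```python
# Minimal client sketch (not part of the patch). Assumes the server was started with a
# Florence model, e.g. `python vision.py -m microsoft/Florence-2-large-ft --device-map cuda:0`,
# and is listening on an assumed local endpoint.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:5006/v1", api_key="skip")  # assumed host/port

response = client.chat.completions.create(
    model="microsoft/Florence-2-large-ft",  # whichever Florence model the server loaded
    messages=[{
        "role": "user",
        "content": [
            # Florence is not a chat model: send one task token (or a simple question) per request.
            {"type": "text", "text": "<MORE_DETAILED_CAPTION>"},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.jpg"}},  # illustrative URL
        ],
    }],
    max_tokens=512,
)
print(response.choices[0].message.content)
```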
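The new `APIError` hierarchy and exception handlers in openedai.py turn raised errors into OpenAI-style JSON bodies (message, code, type, param): `APIStatusError` subclasses are logged at info level as client errors, while `APIError` subclasses are logged with a traceback as server errors. A hedged sketch of how a route could raise these classes follows; the route path, validation and the hypothetical `run_inference()` helper are illustrative only, not code from this patch.

```python
# Illustrative only: surfacing errors through the classes added to openedai.py.
from openedai import OpenAIStub, BadRequestError, InternalServerError

app = OpenAIStub()

@app.post("/v1/chat/completions")
async def chat_completions(request: dict):
    if not request.get("messages"):
        # APIStatusError subclass -> handled as HTTP 400 with message/code/type/param fields
        raise BadRequestError("messages is required", param="messages")
    try:
        return run_inference(request)  # hypothetical backend call
    except Exception as e:
        # APIError subclass -> handled as HTTP 500, traceback logged via loguru
        raise InternalServerError("inference failed", internal_message=str(e))
```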