Commit c8d0fe3: 0.14.0 +Mantis

matatonic committed May 4, 2024
1 parent 1338426

Showing 5 changed files with 56 additions and 17 deletions.
4 changes: 4 additions & 0 deletions Dockerfile
@@ -6,6 +6,7 @@ RUN pip install --no-cache-dir --upgrade pip
RUN mkdir -p /app
RUN git clone https://github.com/01-ai/Yi --single-branch /app/Yi
RUN git clone https://github.com/dvlab-research/MGM.git --single-branch /app/MGM
RUN git clone https://github.com/TIGER-AI-Lab/Mantis.git --single-branch /app/Mantis

WORKDIR /app
COPY requirements.txt .
@@ -17,6 +18,9 @@ RUN --mount=type=cache,target=/root/.cache/pip pip install -U -r requirements.tx
WORKDIR /app/MGM
RUN pip install --no-cache-dir --no-deps -e .

WORKDIR /app/Mantis
RUN pip install --no-cache-dir --no-deps -e .

WORKDIR /app

COPY *.py .
25 changes: 9 additions & 16 deletions README.md
@@ -40,6 +40,10 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview`
- - [X] [llama-3-vision-alpha-hf](https://huggingface.co/qresearch/llama-3-vision-alpha-hf) (main docker only, won't gpu split)
- [X] [BAAI](https://huggingface.co/BAAI/)
- - [X] [Bunny-Llama-3-8B-V](https://huggingface.co/BAAI/Bunny-Llama-3-8B-V) (main docker only)
- [X] [TIGER-Lab](https://huggingface.co/TIGER-Lab)
- - [X] [Mantis-8B-siglip-llama3](https://huggingface.co/TIGER-Lab/Mantis-8B-siglip-llama3) (main docker only, won't gpu split)
- - [X] [Mantis-8B-clip-llama3](https://huggingface.co/TIGER-Lab/Mantis-8B-clip-llama3) (main docker only, won't gpu split)
- - [X] [Mantis-8B-Fuyu](https://huggingface.co/TIGER-Lab/Mantis-8B-Fuyu) (main docker only, won't gpu split)
- [X] [fuyu-8b](https://huggingface.co/adept/fuyu-8b) [pretrain]
- [X] [Monkey-Chat](https://huggingface.co/echo840/Monkey-Chat)
- [X] [Monkey](https://huggingface.co/echo840/Monkey)
@@ -70,6 +74,11 @@ See: [OpenVLM Leaderboard](https://huggingface.co/spaces/opencompass/open_vlm_le

## Recent updates

Version: 0.14.0

- new model support: TIGER-Lab/Mantis: Mantis-8B-siglip-llama3, Mantis-8B-clip-llama3, Mantis-8B-Fuyu

Version: 0.13.0

- new model support: InternLM-XComposer2-4KHD
@@ -78,10 +87,6 @@ Version: 0.13.0

Version: 0.12.1

- Fix: data: urls, revert load_image change

Version: 0.12.0

- new model support: HuggingFaceM4/idefics2-8b, HuggingFaceM4/idefics2-8b-AWQ
- Fix: remove prompt from output of InternVL-Chat-V1-5

@@ -90,18 +95,6 @@ Version: 0.11.0
- new model support: OpenGVLab/InternVL-Chat-V1-5, up to 4k resolution, top opensource model
- MiniGemini renamed MGM upstream

Version: 0.10.0

- new model support: adept/fuyu-8b
- new model support: MiniCPM-V-2
- new model support: MiniGemini-7B -> MiniGemini-8x7B-HD, alternate docker.
- new openai_example.sh shell script for simple command line generation.
- new backend: include 01-ai/Yi-VL in main docker containers, currently errors.
- new backend: include THUDM/CogVLM in main docker containers.
- Fix: moondream1 (use alt container)
- Split images into main (transformers>=4.39.0) and alt (transformers==4.36.2)
- Big performance gains (10x) for some models, especially llava-v1.6-34B (`use_cache` missing from many models, all llava* models, more.)


## API Documentation

36 changes: 36 additions & 0 deletions backend/mantis.py
@@ -0,0 +1,36 @@
from mantis.models.mllava import chat_mllava, MLlavaProcessor, LlavaForConditionalGeneration
from mantis.models.mfuyu import MFuyuForCausalLM, MFuyuProcessor

from vision_qna import *

class VisionQnA(VisionQnABase):
model_name: str = "mantis"

def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
super().__init__(model_id, device, device_map, extra_params, format)

del self.params['trust_remote_code']

if '-fuyu' in model_id.lower():
self.processor = MFuyuProcessor.from_pretrained(model_id)
self.model = MFuyuForCausalLM.from_pretrained(**self.params)
else:
self.processor = MLlavaProcessor.from_pretrained(model_id)
self.model = LlavaForConditionalGeneration.from_pretrained(**self.params)

print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}")

async def chat_with_images(self, request: ImageChatRequest) -> str:
prompt, history, images, system = await prompt_history_images_system_from_messages(request.messages, img_tok = "<image>", url_handler = url_to_image)

default_params = {
'num_beams': 1,
'do_sample': False,
}

params = self.get_generation_params(request, default_params)

response, history = chat_mllava(prompt, images, self.model, self.processor, history=history, **params)

return response

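Since the server is OpenAI API compatible (per the README above) and accepts `data:` URLs (noted in the 0.12.1 changelog), a client can send an image to the new Mantis backend as an inline base64 data URL in a `gpt-4-vision-preview`-style request. A minimal sketch of building such a request body; the helper names are illustrative and not part of this commit:

```python
import base64

def image_to_data_url(image_bytes: bytes, mime: str = "image/png") -> str:
    """Encode raw image bytes as a data: URL, suitable for the image_url content part."""
    b64 = base64.b64encode(image_bytes).decode("ascii")
    return f"data:{mime};base64,{b64}"

def build_vision_request(model: str, question: str, image_bytes: bytes) -> dict:
    """Build an OpenAI-style chat completion body with one text part and one image part."""
    return {
        "model": model,
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url",
                 "image_url": {"url": image_to_data_url(image_bytes)}},
            ],
        }],
    }

# Example body for one of the models added in this commit (image bytes elided):
request_body = build_vision_request("TIGER-Lab/Mantis-8B-siglip-llama3",
                                    "What is in this image?", b"\x89PNG")
```

POSTing such a body to the server's chat completions endpoint would route through `chat_with_images` above.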
3 changes: 3 additions & 0 deletions model_conf_tests.json
@@ -1,4 +1,7 @@
[
["TIGER-Lab/Mantis-8B-siglip-llama3", "--use-flash-attn", "--device-map", "cuda:0"],
["TIGER-Lab/Mantis-8B-clip-llama3", "--use-flash-attn", "--device-map", "cuda:0"],
["TIGER-Lab/Mantis-8B-Fuyu", "--device-map", "cuda:0"],
["vikhyatk/moondream2", "--use-flash-attn"],
["vikhyatk/moondream1"],
["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0"],
5 changes: 4 additions & 1 deletion vision_qna.py
@@ -521,4 +521,7 @@ def guess_backend(model_name: str) -> str:
return 'llama3vision'

if 'bunny' in model_id:
return 'bunny'

if 'mantis' in model_id:
return 'mantis'
