Commit 0.7.0

matatonic committed Apr 8, 2024
1 parent 6a24f4f commit c203ab3
Showing 18 changed files with 322 additions and 117 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,5 +1,7 @@
#
hf_home/
model_zoo/
YanweiLi/

# Byte-compiled / optimized / DLL files
__pycache__/
19 changes: 14 additions & 5 deletions Dockerfile
@@ -1,15 +1,24 @@
FROM python:3.11-slim

RUN apt-get update && apt-get install -y git
RUN pip install --no-cache-dir --upgrade pip

RUN mkdir -p /app
WORKDIR /app

RUN pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
RUN pip install --no-cache-dir -U https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install --no-cache-dir -U -r requirements.txt

RUN git clone https://github.com/dvlab-research/MiniGemini.git --single-branch /app/MiniGemini

WORKDIR /app/MiniGemini
RUN pip install --no-cache-dir --no-deps -e .

WORKDIR /app
COPY requirements.*.txt .
RUN for r in requirements.*.txt ; do pip install --no-cache-dir -r $r; done
RUN for r in requirements.*.txt ; do pip install -U --no-cache-dir -r $r; done

COPY *.py .
COPY backend /app/backend
CMD python vision.py

CMD python vision.py
25 changes: 25 additions & 0 deletions Dockerfile.minigemini
@@ -0,0 +1,25 @@
FROM python:3.11-slim

RUN mkdir -p /app

RUN apt-get update && apt-get install -y git

RUN git clone https://github.com/dvlab-research/MiniGemini.git --single-branch /app/MiniGemini
WORKDIR /app/MiniGemini
RUN pip install --no-cache-dir --upgrade pip
RUN pip install --no-cache-dir --no-deps -e .

WORKDIR /app

COPY requirements.minigemini.txt .
RUN pip install --no-cache-dir -r requirements.minigemini.txt

COPY *.py .
COPY backend /app/backend

RUN pip show torch
RUN pip show torchvision
RUN pip show transformers
RUN pip show accelerate
CMD python vision.py

89 changes: 57 additions & 32 deletions README.md
@@ -8,29 +8,53 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview`
- Not affiliated with OpenAI in any way

Model support:
- [X] [InternLM-XComposer2](https://huggingface.co/internlm/internlm-xcomposer2-7b) [finetune] (multi-image chat model, lots of warnings on startup, but works fine)
- [X] [InternLM-XComposer2-VL](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b) [pretrain] *(only supports a single image, also lots of warnings)
- [X] [LlavaNext](https://huggingface.co/llava-hf) - (llava-v1.6-mistral-7b-hf, llava-v1.6-34b-hf - llava-v1.6-34b-hf is not working well yet) *(only supports a single image)
- [X] [Llava](https://huggingface.co/llava-hf) - (llava-v1.5-vicuna-7b-hf, llava-v1.5-vicuna-13b-hf, llava-v1.5-bakLlava-7b-hf) *(only supports a single image)
- [X] [InternLM-XComposer2](https://huggingface.co/internlm/internlm-xcomposer2-7b) [finetune] (multi-image chat model, lots of warnings on startup, won't split across GPUs)
- [X] [InternLM-XComposer2-VL](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b) [pretrain] *(only supports a single image, also lots of warnings, won't split across GPUs)
- [X] [LlavaNext](https://huggingface.co/llava-hf) *(only supports a single image)
- - [X] [llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf)
- - [X] [llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf)
- - [X] [llava-v1.6-vicuna-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-7b-hf)
- - [X] [llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)
- [X] [Llava](https://huggingface.co/llava-hf) *(only supports a single image)
- - [X] [llava-v1.5-vicuna-7b-hf](https://huggingface.co/llava-hf/llava-v1.5-vicuna-7b-hf)
- - [X] [llava-v1.5-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.5-vicuna-13b-hf)
- - [ ] [llava-v1.5-bakLlava-7b-hf](https://huggingface.co/llava-hf/llava-v1.5-bakLlava-7b-hf) (currently errors)
- [X] [Monkey-Chat](https://huggingface.co/echo840/Monkey-Chat)
- [X] [Monkey](https://huggingface.co/echo840/Monkey)
- [X] [Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)
- [X] Moondream2 - [vikhyatk/moondream2](https://huggingface.co/vikhyatk/moondream2) *(only supports a single image)
- [ ] Moondream1 - [vikhyatk/moondream1](https://huggingface.co/vikhyatk/moondream1)
- [ ] Deepseek-VL - [deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)
- [X] [openbmb/MiniCPM-V](https://huggingface.co/openbmb/MiniCPM-V) (aka. OmniLMM-3B) *(only supports a single image)
- [ ] [openbmb/OmniLMM-12B](https://huggingface.co/openbmb/OmniLMM-12B)
- [ ] [echo840/Monkey](https://huggingface.co/echo840/Monkey)
- [ ] [YanweiLi/MiniGemini](https://huggingface.co/collections/YanweiLi/)
- [X] [Moondream2](https://huggingface.co/vikhyatk/moondream2) *(only supports a single image)
- [X] [MiniCPM-V](https://huggingface.co/openbmb/MiniCPM-V) (aka. OmniLMM-3B) *(only supports a single image)
- [X] [MiniGemini](https://huggingface.co/collections/YanweiLi/) (more complex setup, see: `prepare_minigemini.sh`)
- - [X] [MiniGemini-2B](https://huggingface.co/YanweiLi/Mini-Gemini-2B)
- - [ ] [MiniGemini-7B](https://huggingface.co/YanweiLi/Mini-Gemini-7B) (currently errors)
- - [ ] [MiniGemini-13B](https://huggingface.co/YanweiLi/Mini-Gemini-13B) (currently errors)
- - [ ] [MiniGemini-34B](https://huggingface.co/YanweiLi/Mini-Gemini-34B) (currently errors)
- - [ ] [MiniGemini-8x7B](https://huggingface.co/YanweiLi/Mini-Gemini-8x7B) (currently errors)
- - [ ] [MiniGemini-7B-HD](https://huggingface.co/YanweiLi/Mini-Gemini-7B-HD) (currently errors)
- - [ ] [MiniGemini-13B-HD](https://huggingface.co/YanweiLi/Mini-Gemini-13B-HD) (currently errors)
- - [ ] [MiniGemini-34B-HD](https://huggingface.co/YanweiLi/Mini-Gemini-34B-HD) (currently errors)
- - [ ] [MiniGemini-8x7B-HD](https://huggingface.co/YanweiLi/Mini-Gemini-8x7B-HD) (currently errors)
- [ ] [OmniLMM-12B](https://huggingface.co/openbmb/OmniLMM-12B)
- [ ] [Moondream1](https://huggingface.co/vikhyatk/moondream1)
- [ ] [Deepseek-VL-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)
- [ ] [Deepseek-VL-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)
- [ ] [NousResearch/Obsidian-3B-V0.5](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
- [ ] ...


Some vision systems include their own OpenAI compatible API server. Also included are some pre-built images and docker-compose for them:
- [X] [THUDM/CogVLM](https://github.com/THUDM/CogVLM) ([cogvlm-chat-hf](https://huggingface.co/THUDM/cogvlm-chat-hf), [cogagent-chat-hf](https://huggingface.co/THUDM/cogagent-chat-hf)), `docker-compose.cogvlm.yml` **Recommended for 16GB-40GB GPU**s
- [X] [01-ai](https://huggingface.co/01-ai)/Yi-VL ([Yi-VL-6B](https://huggingface.co/01-ai/Yi-VL-6B), [Yi-VL-34B](https://huggingface.co/01-ai/Yi-VL-34B)), `docker-compose.yi-vl.yml`
Some vision systems include their own OpenAI compatible API server. Included are some pre-built images and docker-compose files for them (they must be run separately):
- [X] [THUDM/CogVLM](https://github.com/THUDM/CogVLM) `docker-compose.cogvlm.yml`
- - [X] [cogvlm-chat-hf](https://huggingface.co/THUDM/cogvlm-chat-hf)
- - [X] [cogagent-chat-hf](https://huggingface.co/THUDM/cogagent-chat-hf) **Recommended for 16GB-40GB GPU**
- [X] [01-ai](https://huggingface.co/01-ai)/Yi-VL `docker-compose.yi-vl.yml`
- - [X] [Yi-VL-6B](https://huggingface.co/01-ai/Yi-VL-6B)
- - [X] [Yi-VL-34B](https://huggingface.co/01-ai/Yi-VL-34B)

Version: 0.6.1
Version: 0.7.0

Recent updates:
- new model support: MiniGemini-2B (it's still a bit complex to use, see `prepare_minigemini.sh`)
- new model support: echo840/Monkey-Chat, echo840/Monkey
- AutoGPTQ support for internlm/internlm-xcomposer2-7b-4bit, internlm/internlm-xcomposer2-vl-7b-4bit
- Automatic selection of backend based on the model name
- Enable trust_remote_code by default
@@ -48,10 +72,23 @@ API Documentation

* [OpenAI Vision guide](https://platform.openai.com/docs/guides/vision)

Installation instructions
-------------------------

(**Docker Recommended**)
Docker support
--------------

1) Edit the docker-compose file to suit your needs.

2) You can run the server via docker like so:
```shell
docker compose up
# for CogVLM
docker compose -f docker-compose.cogvlm.yml up
# for Yi-VL
docker compose -f docker-compose.yi-vl.yml up
```

Manual Installation instructions
--------------------------------

```shell
# install the python dependencies
@@ -64,6 +101,8 @@ pip install .
python vision.py --model vikhyatk/moondream2
```

For MiniGemini support, the docker image is recommended. See `Dockerfile` and `requirements.minigemini.txt` for manual installation instructions.

Usage
-----

@@ -92,20 +131,6 @@ options:
--preload Preload model and exit. (default: False)
```

Docker support
--------------

1) Edit the docker-compose file to suit your needs.

2) You can run the server via docker like so:
```shell
docker compose up
# for CogVLM
docker compose -f docker-compose.cogvlm.yml up
# for Yi-VL
docker compose -f docker-compose.yi-vl.yml up
```

Sample API Usage
----------------

6 changes: 4 additions & 2 deletions backend/llava.py
@@ -16,6 +16,8 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None)
if not format:
self.format = guess_model_format(model_id)

del self.params['trust_remote_code']

self.processor = LlavaProcessor.from_pretrained(model_id)
self.model = LlavaForConditionalGeneration.from_pretrained(**self.params).eval()

@@ -29,6 +31,6 @@ async def chat_with_images(self, request: ImageChatRequest) -> str:
params = self.get_generation_params(request)

output = self.model.generate(**inputs, **params)
response = self.processor.decode(output[0], skip_special_tokens=True)
response = self.processor.decode(output[0][inputs['input_ids'].size(1):].cpu(), skip_special_tokens=True)

return answer_from_response(response, self.format)
return response
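
The decode change above keeps `generate()` from echoing the prompt back in the reply: the returned sequence is the prompt tokens followed by the newly generated tokens, so slicing from the prompt length leaves only the completion (the same slicing is applied in `backend/llavanext.py` below). A standalone sketch of the pattern; the helper name is hypothetical:

```python
import torch

def decode_completion(output_ids: torch.Tensor, input_ids: torch.Tensor, processor) -> str:
    """Decode only the newly generated tokens.

    generate() returns [prompt tokens | new tokens]; dropping the first
    input_ids.size(1) tokens removes the echoed prompt from the reply.
    """
    prompt_len = input_ids.size(1)                  # number of prompt tokens
    new_tokens = output_ids[0][prompt_len:].cpu()   # completion tokens only
    return processor.decode(new_tokens, skip_special_tokens=True)
```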
11 changes: 7 additions & 4 deletions backend/llavanext.py
@@ -1,10 +1,10 @@
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
from vision_qna import *

# model_id = "llava-hf/llava-v1.6-mistral-7b-hf" # llama2
# model_id = "llava-hf/llava-v1.6-34b-hf" # chatml
# model_id = "llava-hf/llava-v1.6-vicuna-13b-hf" # vicuna
# model_id = "llava-hf/llava-v1.6-vicuna-7b-hf" # vicuna
# model_id = "llava-hf/llava-v1.6-mistral-7b-hf" # llama2

class VisionQnA(VisionQnABase):
model_name: str = "llavanext"
@@ -16,7 +16,10 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None)
if not format:
self.format = guess_model_format(model_id)

self.processor = LlavaNextProcessor.from_pretrained(model_id)
del self.params['trust_remote_code']

use_fast = 'mistral' in model_id
self.processor = LlavaNextProcessor.from_pretrained(model_id, use_fast=use_fast)
self.model = LlavaNextForConditionalGeneration.from_pretrained(**self.params).eval()

print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}")
@@ -29,6 +32,6 @@ async def chat_with_images(self, request: ImageChatRequest) -> str:
params = self.get_generation_params(request)

output = self.model.generate(**inputs, **params)
response = self.processor.decode(output[0], skip_special_tokens=True)
response = self.processor.decode(output[0][inputs['input_ids'].size(1):].cpu(), skip_special_tokens=True)

return answer_from_response(response, self.format)
return response
88 changes: 75 additions & 13 deletions backend/minigemini.py
@@ -1,11 +1,24 @@
import re
from transformers import AutoTokenizer, AutoModelForCausalLM

from minigemini.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from minigemini.conversation import conv_templates, SeparatorStyle
from minigemini.model.builder import load_pretrained_model
from minigemini.mm_utils import process_images
from minigemini.utils import disable_torch_init
from minigemini.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria

from vision_qna import *

# YanweiLi/Mini-Gemini-2B
# YanweiLi/Mini-Gemini-7B
# YanweiLi/Mini-Gemini-7B-HD
# YanweiLi/Mini-Gemini-13B
# YanweiLi/Mini-Gemini-34B
# YanweiLi/Mini-Gemini-34B-HD
# YanweiLi/Mini-Gemini-13B-HD
# YanweiLi/Mini-Gemini-8x7B-HD
# YanweiLi/Mini-Gemini-8x7B

class VisionQnA(VisionQnABase):
model_name: str = "minigemini"
format: str = "llama2"
@@ -16,19 +29,72 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None)
if not format:
self.format = guess_model_format(model_id)

model_base, model_name = model_id.split('/', 1)
del self.params['low_cpu_mem_usage']
del self.params['pretrained_model_name_or_path']
del self.params['trust_remote_code']

self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit, device=args.device)
model_id, None, model_name, **self.params)


print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}")

async def chat_with_images(self, request: ImageChatRequest) -> str:
images, prompt = await prompt_from_messages(request.messages, self.format)
image_convert, prompt = await prompt_from_messages(request.messages, self.format)

if hasattr(self.model.config, 'image_size_aux'):
if not hasattr(self.image_processor, 'image_size_raw'):
self.image_processor.image_size_raw = self.image_processor.crop_size.copy()
self.image_processor.crop_size['height'] = self.model.config.image_size_aux
self.image_processor.crop_size['width'] = self.model.config.image_size_aux
self.image_processor.size['shortest_edge'] = self.model.config.image_size_aux

image_tensor = process_images(image_convert, self.image_processor, self.model.config)

image_grid = getattr(self.model.config, 'image_grid', 1)
if hasattr(self.model.config, 'image_size_aux'):
raw_shape = [self.image_processor.image_size_raw['height'] * image_grid,
self.image_processor.image_size_raw['width'] * image_grid]
image_tensor_aux = image_tensor
image_tensor = torch.nn.functional.interpolate(image_tensor,
size=raw_shape,
mode='bilinear',
align_corners=False)
else:
image_tensor_aux = []

if image_grid >= 2:
raw_image = image_tensor.reshape(3,
image_grid,
self.image_processor.image_size_raw['height'],
image_grid,
self.image_processor.image_size_raw['width'])
raw_image = raw_image.permute(1, 3, 0, 2, 4)
raw_image = raw_image.reshape(-1, 3,
self.image_processor.image_size_raw['height'],
self.image_processor.image_size_raw['width'])

if getattr(self.model.config, 'image_global', False):
global_image = image_tensor
if len(global_image.shape) == 3:
global_image = global_image[None]
global_image = torch.nn.functional.interpolate(global_image,
size=[self.image_processor.image_size_raw['height'],
self.image_processor.image_size_raw['width']],
mode='bilinear',
align_corners=False)
# [image_crops, image_global]
raw_image = torch.cat([raw_image, global_image], dim=0)
image_tensor = raw_image.contiguous()
image_tensor = image_tensor.unsqueeze(0)

if type(image_tensor) is list:
image_tensor = [image.to(self.model.device, dtype=torch.float16) for image in image_tensor]
image_tensor_aux = [image.to(self.model.device, dtype=torch.float16) for image in image_tensor_aux]
else:
image_tensor = image_tensor.to(self.model.device, dtype=torch.float16)
image_tensor_aux = image_tensor_aux.to(self.model.device, dtype=torch.float16)

#encoded_images = self.model.encode_image(images).to(self.device)
# square?
image_tensor = process_images(image_convert, image_processor, model.config)
image_processor(images, return_tensors='pt')['pixel_values']

input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.model.device)
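
`tokenizer_image_token` is the LLaVA-style helper bundled with MiniGemini; roughly, it tokenizes the text around each `<image>` placeholder and splices the special `IMAGE_TOKEN_INDEX` id in between, so the model can substitute image features at those positions. A simplified sketch, written as an assumption rather than the library's exact implementation:

```python
# Simplified sketch (assumption) of the LLaVA-style tokenizer_image_token helper.
def tokenizer_image_token_sketch(prompt: str, tokenizer, image_token_index: int = -200):
    # Tokenize the text on either side of each "<image>" placeholder.
    chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
    input_ids = list(chunks[0])
    for chunk in chunks[1:]:
        # Later chunks typically repeat the BOS token; drop it before splicing.
        input_ids += [image_token_index] + list(chunk[1:])
    return input_ids
```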

@@ -38,18 +104,14 @@ async def chat_with_images(self, request: ImageChatRequest) -> str:
output_ids = self.model.generate(
input_ids,
images=image_tensor,
images_aux=None,
images_aux=image_tensor_aux if len(image_tensor_aux)>0 else None,
bos_token_id=self.tokenizer.bos_token_id, # Begin of sequence token
eos_token_id=self.tokenizer.eos_token_id, # End of sequence token
pad_token_id=self.tokenizer.pad_token_id, # Pad token
use_cache=True,
**params,
)

answer = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

self.

return answer
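
The grid-cropping block added above splits the interpolated image into `image_grid × image_grid` tiles (optionally appending a resized global view) before handing them to the model. A standalone sketch of that reshape/permute with a dummy tensor; the grid size and 336-pixel crop size are assumed, illustrative values:

```python
import torch

# Assumed values for illustration: a 2x2 grid of 336x336 raw crops.
image_grid, raw = 2, 336

# One image after interpolation to (raw * image_grid) pixels per side: (C, H, W).
image_tensor = torch.randn(3, raw * image_grid, raw * image_grid)

# Split H and W into (grid, raw) blocks, move the two grid axes to the front,
# then flatten them into a batch of crops: (image_grid**2, C, raw, raw).
crops = image_tensor.reshape(3, image_grid, raw, image_grid, raw)
crops = crops.permute(1, 3, 0, 2, 4)
crops = crops.reshape(-1, 3, raw, raw)

print(crops.shape)  # torch.Size([4, 3, 336, 336])
```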

