Merge pull request #8 from matatonic/dev
0.27.1
matatonic authored Jul 16, 2024
2 parents cc8a8eb + 366dd3f commit b424f49
Showing 13 changed files with 178 additions and 140 deletions.
21 changes: 15 additions & 6 deletions README.md
@@ -10,12 +10,13 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview`
## Model support

- [X] [OpenGVLab](https://huggingface.co/OpenGVLab)
- - [X] [InternVL2-Llama3-76B](https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B)
- - [X] [InternVL2-40B](https://huggingface.co/OpenGVLab/InternVL2-40B)
- - [X] [InternVL2-26B](https://huggingface.co/OpenGVLab/InternVL2-26B)
- - [X] [InternVL2-8B](https://huggingface.co/OpenGVLab/InternVL2-8B)
- - [X] [InternVL2-4B](https://huggingface.co/OpenGVLab/InternVL2-4B) (alternate docker only)
- - [X] [InternVL2-2B](https://huggingface.co/OpenGVLab/InternVL2-2B)
- - [X] [InternVL2-2B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-2B-AWQ)
- - [X] [InternVL2-2B-AWQ](https://huggingface.co/OpenGVLab/InternVL2-2B-AWQ) (currently errors)
- - [X] [InternVL2-1B](https://huggingface.co/OpenGVLab/InternVL2-1B)
- - [X] [InternVL-Chat-V1-5](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5) (won't gpu split yet)
- - [X] [InternVL-Chat-V1-5-Int8](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5-Int8) (won't gpu split yet)
@@ -100,7 +101,9 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview`
- - [X] [MGM-34B-HD](https://huggingface.co/YanweiLi/MGM-34B-HD) (alternate docker only)
- - [X] [MGM-8x7B-HD](https://huggingface.co/YanweiLi/MGM-8x7B-HD) (alternate docker only)
- [X] [cognitivecomputations/dolphin-vision-72b](https://huggingface.co/cognitivecomputations/dolphin-vision-72b)
- [X] [qnguyen3/nanoLLaVA](https://huggingface.co/qnguyen3/nanoLLaVA) (won't gpu split)
- [X] [qnguyen3]
- - [X] [nanoLLaVA](https://huggingface.co/qnguyen3/nanoLLaVA) (won't gpu split)
- - [X] [nanoLLaVA-1.5](https://huggingface.co/qnguyen3/nanoLLaVA-1.5) (won't gpu split)
- [ ] [01-ai/Yi-VL](https://huggingface.co/01-ai)
- - [ ] [Yi-VL-6B](https://huggingface.co/01-ai/Yi-VL-6B) (currently errors)
- - [ ] [Yi-VL-34B](https://huggingface.co/01-ai/Yi-VL-34B) (currently errors)
@@ -113,9 +116,15 @@ See: [OpenVLM Leaderboard](https://huggingface.co/spaces/opencompass/open_vlm_le

## Recent updates

Version 0.27.1

- new model support: qnguyen3/nanoLLaVA-1.5
- Complete support for chat *without* images (using placeholder images where required, 1x1 clear or 8x8 black as necessary)
- Require transformers==4.41.2 (4.42 breaks many models)
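
A minimal sketch of exercising the no-image path through the OpenAI-compatible API; the base URL, port, API key, and model name below are assumptions, so adjust them to your deployment:

```python
from openai import OpenAI

# All connection details here are assumed examples, not the project's defaults.
client = OpenAI(base_url="http://localhost:5006/v1", api_key="sk-unused")

# No image_url content part is sent; as of 0.27.1 the server substitutes a
# placeholder image (1x1 transparent or 8x8 black) when the model requires one.
response = client.chat.completions.create(
    model="gpt-4-vision-preview",
    messages=[{"role": "user", "content": "In one sentence, what does a vision-language model do?"}],
)
print(response.choices[0].message.content)
```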

Version 0.27.0

- new model support: OpenGVLab/InternVL2 series of models (1B, 2B, 4B, 8B*, 26B*, 40B*) - *(current top open source models)
- new model support: OpenGVLab/InternVL2 series of models (1B, 2B, 4B, 8B*, 26B*, 40B*, 76B*) - *(current top open source models)

Version 0.26.0

@@ -242,9 +251,9 @@ cp vision-alt.sample.env vision-alt.env

2) You can run the server via docker compose like so:
```shell
# for OpenedAI Vision Server (transformers>=4.39.0)
# for OpenedAI Vision Server
docker compose up
# for OpenedAI Vision Server (alternate, for Mini-Gemini > 2B, used transformers==4.36.2)
# for OpenedAI Vision Server (alternate, for Mini-Gemini > 2B, uses transformers==4.36.2)
docker compose -f docker-compose.alt.yml up
```

Expand All @@ -262,7 +271,7 @@ docker compose -f docker-compose.alt.yml pull

```shell
# install the python dependencies
pip install -U -r requirements.txt "transformers>=4.41.2" "autoawq>=0.2.5"
pip install -U -r requirements.txt "transformers==4.41.2" "autoawq>=0.2.5"
# OR install the python dependencies for the alt version
pip install -U -r requirements.txt "transformers==4.36.2"
# run the server with your chosen model
3 changes: 3 additions & 0 deletions backend/florence.py
@@ -40,6 +40,9 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p
async def chat_with_images(self, request: ImageChatRequest) -> str:
images, prompt = await prompt_from_messages(request.messages, self.format)

if len(images) < 1:
images = [ await url_to_image(black_pixel_url) ]

inputs = self.processor(text=prompt, images=images[0], return_tensors="pt").to(device=self.model.device, dtype=self.model.dtype)

default_params = {
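
The fallback above depends on `url_to_image()` and `black_pixel_url` from the shared backend helpers, which this diff does not show. A rough, synchronous sketch of how such a placeholder data URL could be built and decoded; the repository's actual helpers may be async and differ in detail:

```python
import base64
import io

from PIL import Image

def make_black_pixel_url(size: int = 8) -> str:
    """Encode a size x size black PNG as a data URL (illustrative stand-in)."""
    buf = io.BytesIO()
    Image.new("RGB", (size, size), (0, 0, 0)).save(buf, format="PNG")
    return "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode()

def url_to_image_sketch(url: str) -> Image.Image:
    """Decode a base64 data URL into a PIL image (HTTP fetching omitted)."""
    b64 = url.split(",", 1)[1]
    return Image.open(io.BytesIO(base64.b64decode(b64))).convert("RGB")

placeholder = url_to_image_sketch(make_black_pixel_url())
print(placeholder.size)  # (8, 8)
```
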
4 changes: 2 additions & 2 deletions backend/internvl-chat-v1-5.py
@@ -18,7 +18,7 @@
# OpenGVLab/InternVL2-8B
# OpenGVLab/InternVL2-26B
# OpenGVLab/InternVL2-40B (yi-34- nous-hermes-2)

# OpenGVLab/InternVL2-Llama3-76B

MAX_TILES = 6

@@ -143,7 +143,7 @@ async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGener
for img in images:
image_tokens = '<img>' + '<IMG_CONTEXT>' * self.model.num_image_token * img.size(0) + '</img>'
prompt = prompt.replace('<image>', image_tokens, 1)

model_inputs = self.tokenizer(prompt, return_tensors='pt')
input_ids = model_inputs['input_ids'].cuda()
attention_mask = model_inputs['attention_mask'].cuda()
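
The loop above expands each `<image>` placeholder into the token run InternVL expects. A small worked example with made-up numbers; at runtime the values come from `self.model.num_image_token` and `img.size(0)` (the tile count):

```python
num_image_token = 256   # assumed tokens per tile
tiles = 2               # assumed number of preprocessed tiles for one image

image_tokens = '<img>' + '<IMG_CONTEXT>' * (num_image_token * tiles) + '</img>'
prompt = 'Describe the photo.\n<image>'
prompt = prompt.replace('<image>', image_tokens, 1)

# 512 <IMG_CONTEXT> tokens now sit between <img> and </img>; the model later
# replaces them with the visual features for that image.
print(prompt.count('<IMG_CONTEXT>'))  # 512
```
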
4 changes: 4 additions & 0 deletions backend/llama3vision.py
@@ -29,6 +29,10 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p
async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
images, prompt = await prompt_from_messages(request.messages, self.format)

if len(images) < 1:
images = [ await url_to_image(black_pixel_url) ]
prompt = '<image>\n' + prompt

input_ids = self.model.tokenizer_image_token(prompt, self.tokenizer, -200, return_tensors="pt").unsqueeze(0).to(self.device)
image_inputs = self.model.processor(
images=images,
4 changes: 4 additions & 0 deletions backend/llava.py
@@ -29,6 +29,10 @@ async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGener

images, prompt = await prompt_from_messages(request.messages, self.format)

if len(images) < 1:
images = [ await url_to_image(black_pixel_url) ]
prompt = "<image>\n" + prompt

inputs = self.processor(prompt, images, return_tensors="pt").to(self.device)

params = self.get_generation_params(request)
4 changes: 4 additions & 0 deletions backend/llavanext.py
@@ -31,6 +31,10 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p
async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
images, prompt = await prompt_from_messages(request.messages, self.format)

if len(images) < 1:
images = [ await url_to_image(black_pixel_url) ]
prompt = "<image>\n" + prompt

inputs = self.processor(prompt, images, return_tensors="pt").to(self.model.device)

default_params = dict(
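
The same no-image fallback recurs in llama3vision.py, llava.py, and llavanext.py: a placeholder image is supplied and an `<image>` tag is prepended so the prompt still matches the multimodal template. A compact sketch of the effect, with a plain string standing in for the image returned by `url_to_image(black_pixel_url)`:

```python
prompt = "What is the capital of France?"
images = []  # the request carried no images

if len(images) < 1:
    images = ["<8x8 black placeholder>"]  # stand-in for await url_to_image(black_pixel_url)
    prompt = "<image>\n" + prompt

print(prompt)
# <image>
# What is the capital of France?
```
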
2 changes: 1 addition & 1 deletion backend/minicpm.py
@@ -40,7 +40,7 @@ async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGener
else:
msgs.extend([{ 'role': m.role, 'content': c.text }])

if not image:
if image is None:
image = await url_to_image(transparent_pixel_url)

# default uses num_beams: 3, but if streaming/sampling is requested, switch the defaults.
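
The switch from `if not image:` to `if image is None:` narrows the fallback to the truly-missing case, so a value that is merely falsy no longer triggers the placeholder. A tiny illustration, using an empty list as a hypothetical falsy-but-present value:

```python
image = []            # hypothetical falsy-but-present value

print(not image)      # True  -> the old check would have swapped in the placeholder
print(image is None)  # False -> the new check keeps whatever was passed in

image = None
print(image is None)  # True  -> only now is the transparent placeholder fetched
```
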
8 changes: 6 additions & 2 deletions backend/minigemini.py
@@ -52,7 +52,11 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p
self.loaded_banner()

async def chat_with_images(self, request: ImageChatRequest) -> str:
image_convert, prompt = await prompt_from_messages(request.messages, self.format)
images, prompt = await prompt_from_messages(request.messages, self.format)

if len(images) < 1:
images = [ await url_to_image(black_pixel_url) ]
prompt = '<image>\n' + prompt

if hasattr(self.model.config, 'image_size_aux'):
if not hasattr(self.image_processor, 'image_size_raw'):
@@ -61,7 +65,7 @@ async def chat_with_images(self, request: ImageChatRequest) -> str:
self.image_processor.crop_size['width'] = self.model.config.image_size_aux
self.image_processor.size['shortest_edge'] = self.model.config.image_size_aux

image_tensor = process_images(image_convert, self.image_processor, self.model.config)
image_tensor = process_images(images, self.image_processor, self.model.config)

image_grid = getattr(self.model.config, 'image_grid', 1)
if hasattr(self.model.config, 'image_size_aux'):
3 changes: 2 additions & 1 deletion backend/nanollava.py
@@ -8,7 +8,8 @@
transformers.logging.set_verbosity_error()
warnings.filterwarnings('ignore')

# 'qnguyen3/nanoLLaVA'
# qnguyen3/nanoLLaVA
# qnguyen3/nanoLLaVA-1.5

def join_int_lists(int_lists, separator):
result = []
6 changes: 6 additions & 0 deletions model_conf_tests.json
@@ -29,6 +29,7 @@
["OpenGVLab/InternVL2-1B", "--device-map", "cuda:0"],
["OpenGVLab/InternVL2-2B", "--device-map", "cuda:0", "--load-in-4bit"],
["OpenGVLab/InternVL2-2B", "--device-map", "cuda:0"],
["OpenGVLab/InternVL2-2B-AWQ", "--device-map", "cuda:0"],
["OpenGVLab/InternVL2-4B", "--device-map", "cuda:0", "--load-in-4bit"],
["OpenGVLab/InternVL2-4B", "--device-map", "cuda:0"],
["OpenGVLab/InternVL2-8B", "--device-map", "cuda:0", "--load-in-4bit"],
@@ -37,6 +38,7 @@
["OpenGVLab/InternVL2-26B", "--device-map", "cuda:0"],
["OpenGVLab/InternVL2-40B", "--device-map", "cuda:0", "--load-in-4bit"],
["OpenGVLab/InternVL2-40B", "--device-map", "cuda:0"],
["OpenGVLab/InternVL2-Llama3-76B", "--device-map", "cuda:0", "--load-in-4bit"],
["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--load-in-4bit"],
["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--max-tiles", "40", "--load-in-4bit"],
["OpenGVLab/Mini-InternVL-Chat-2B-V1-5", "--max-tiles", "40"],
@@ -74,6 +76,8 @@
["echo840/Monkey-Chat"],
["failspy/Phi-3-vision-128k-instruct-abliterated-alpha", "--use-flash-attn", "--load-in-4bit"],
["failspy/Phi-3-vision-128k-instruct-abliterated-alpha", "--use-flash-attn"],
["internlm/internlm-xcomposer2d5-7b", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"],
["internlm/internlm-xcomposer2d5-7b", "--use-flash-attn", "--device-map", "cuda:0"],
["internlm/internlm-xcomposer2-4khd-7b", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"],
["internlm/internlm-xcomposer2-4khd-7b", "--use-flash-attn", "--device-map", "cuda:0"],
["internlm/internlm-xcomposer2-7b", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"],
@@ -112,6 +116,8 @@
["qihoo360/360VL-8B", "--use-flash-attn"],
["qnguyen3/nanoLLaVA", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"],
["qnguyen3/nanoLLaVA", "--use-flash-attn", "--device-map", "cuda:0"],
["qnguyen3/nanoLLaVA-1.5", "--use-flash-attn", "--device-map", "cuda:0", "--load-in-4bit"],
["qnguyen3/nanoLLaVA-1.5", "--use-flash-attn", "--device-map", "cuda:0"],
["qresearch/llama-3-vision-alpha-hf", "--device", "cuda:0", "--load-in-4bit"],
["qresearch/llama-3-vision-alpha-hf", "--device", "cuda:0"],
["tiiuae/falcon-11B-vlm", "--use-flash-attn", "--load-in-4bit"],
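
Each entry in `model_conf_tests.json` is a model id followed by the extra command-line arguments for that test run (the file is loaded by `test_models.py`, shown further down). A minimal sketch of iterating the entries; how the arguments reach the server is assumed rather than taken from this diff:

```python
import json

with open("model_conf_tests.json") as f:
    configs = json.load(f)

for model_id, *extra_args in configs:
    # e.g. "OpenGVLab/InternVL2-2B-AWQ" with ["--device-map", "cuda:0"]
    print(f"testing {model_id} with args {extra_args}")
```
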
7 changes: 3 additions & 4 deletions test_models.py
@@ -205,12 +205,11 @@ def generate_stream_response(image_url, prompt):

return answer



def single_round():
# XXX TODO: timeout
results = []
### Single round

# url tests
for name, url in urls.items():
answer = generate_response(url, "What is the subject of the image?")
@@ -244,8 +243,8 @@ def single_round():
else:
print(f"{name}[data_stream]: pass{', got: ' + answer if args.verbose else ''}")

"""

"""
## OCR tests
quality_urls = {
'98.21': ('What is the total bill?', 'https://ocr.space/Content/Images/receipt-ocr-original.webp'),
@@ -262,6 +261,7 @@ def single_round():
break
else:
print(f"{name}[quality]: pass{', got: ' + answer if args.verbose else ''}")
"""

# No image tests
no_image = {
@@ -287,7 +287,6 @@ def no_image_response(prompt):
else:
print(f"{name}[no_img]: pass{', got: ' + answer if args.verbose else ''}")

"""
return results

with open('model_conf_tests.json') as f: