0.11.0 InternVL-Chat-V1-5 support

matatonic committed Apr 28, 2024
1 parent c8d7fc7 commit bd60163

Showing 8 changed files with 160 additions and 7 deletions.
10 changes: 8 additions & 2 deletions README.md
@@ -9,6 +9,8 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview`

## Model support

- [X] [OpenGVLab](https://huggingface.co/OpenGVLab)
- - [X] [InternVL-Chat-V1-5](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5) (won't gpu split yet, 4bit not recommended)
- [X] [THUDM/CogVLM](https://github.com/THUDM/CogVLM)
- - [X] [cogvlm-chat-hf](https://huggingface.co/THUDM/cogvlm-chat-hf)
- - [X] [cogagent-chat-hf](https://huggingface.co/THUDM/cogagent-chat-hf)
@@ -57,10 +59,14 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview`

See: [OpenVLM Leaderboard](https://huggingface.co/spaces/opencompass/open_vlm_leaderboard)

Version: 0.10.0

## Recent updates

Version: 0.11.0

- new model support: OpenGVLab/InternVL-Chat-V1-5, up to 4k resolution, top open-source model

Version: 0.10.0

- new model support: adept/fuyu-8b
- new model support: MiniCPM-V-2
- new model support: MiniGemini-7B -> MiniGemini-8x7B-HD, alternate docker.
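
For orientation (not part of this commit): a minimal sketch of calling the OpenAI-compatible server from a client. The base URL, port, API key, and model name below are assumptions; adjust them to your deployment.

```python
# Hypothetical client sketch; base_url, api_key and model name are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:5006/v1", api_key="sk-none")

response = client.chat.completions.create(
    model="gpt-4-vision-preview",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is the subject of the image?"},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.jpg"}},
        ],
    }],
    max_tokens=512,
)
print(response.choices[0].message.content)
```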
138 changes: 138 additions & 0 deletions backend/internvl-chat-v1-5.py
@@ -0,0 +1,138 @@
import os
from transformers import AutoTokenizer, AutoModel
from vision_qna import *
import torch
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode

# OpenGVLab/InternVL-Chat-V1-5

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images


def load_image(image, input_size=448, max_num=6):
    #image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values


class VisionQnA(VisionQnABase):
    model_name: str = "internvl-chat-v1-5"
    format: str = "chatml"

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
        self.model = AutoModel.from_pretrained(**self.params).eval()

        self.model.img_context_token_id = self.tokenizer.convert_tokens_to_ids('<IMG_CONTEXT>')

        if self.tokenizer.convert_tokens_to_ids('<|im_end|>') != 0:
            self.eos_token_id = self.tokenizer.convert_tokens_to_ids('<|im_end|>') # 92542, InternLM2
        else:
            self.eos_token_id = self.tokenizer.eos_token_id

        print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}")

    async def chat_with_images(self, request: ImageChatRequest) -> str:
        images, prompt = await chatml_prompt_from_messages(request.messages, img_tok='')

        images = [load_image(image).to(self.model.dtype).cuda() for image in images]
        if len(images) > 1:
            pixel_values = torch.cat(images, dim=0)
        else:
            pixel_values = images[0]

        default_params = {
            'num_beams': 1,
            'max_new_tokens': 512,
            'do_sample': False,
            'eos_token_id': self.eos_token_id,
        }

        generation_config = self.get_generation_params(request, default_params)

        del generation_config['use_cache']

        image_tokens = '<img>' + '<IMG_CONTEXT>' * self.model.num_image_token * pixel_values.shape[0] + '</img>\n'
        model_inputs = self.tokenizer(image_tokens + prompt, return_tensors='pt')
        input_ids = model_inputs['input_ids'].cuda()
        attention_mask = model_inputs['attention_mask'].cuda()

        output = self.model.generate(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            **generation_config,
        )
        response = self.tokenizer.decode(output[0], skip_special_tokens=True)

        return response.split('<|im_end|>')[0].strip()
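
A short usage sketch of the tiling helpers above (illustration only, not part of the commit); the image size is arbitrary and the shapes follow from the code as written.

```python
# Illustration only: tile an arbitrary 1600x900 image with the helpers above.
from PIL import Image

img = Image.new('RGB', (1600, 900))

# aspect ratio ~1.78 -> closest grid is (2, 1): two 448x448 tiles,
# plus the square thumbnail appended because use_thumbnail=True.
tiles = dynamic_preprocess(img, image_size=448, use_thumbnail=True, max_num=6)
print(len(tiles))               # 3

pixel_values = load_image(img)  # normalized tensors, stacked along dim 0
print(pixel_values.shape)       # torch.Size([3, 3, 448, 448])
```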
3 changes: 2 additions & 1 deletion chat_with_image.py
@@ -26,7 +26,8 @@ def url_for_api(img_url: str = None, filename: str = None, always_data=False) ->

if __name__ == '__main__':
    # Initialize argparse
    parser = argparse.ArgumentParser(description='Test vision using OpenAI')
    parser = argparse.ArgumentParser(description='Test vision using OpenAI',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-s', '--system-prompt', type=str, default=None)
    parser.add_argument('-m', '--max-tokens', type=int, default=None)
    parser.add_argument('-t', '--temperature', type=float, default=None)
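
Aside (not from the repo): a standalone illustration of what `formatter_class=argparse.ArgumentDefaultsHelpFormatter` changes; it appends each option's default to its help text when a help string is given.

```python
import argparse

# Standalone example: with ArgumentDefaultsHelpFormatter, --help shows
# "(default: ...)" after each option that defines a help string.
parser = argparse.ArgumentParser(description='Demo',
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-m', '--max-tokens', type=int, default=512,
                    help='maximum tokens to generate')
parser.print_help()
# ...
#   -m MAX_TOKENS, --max-tokens MAX_TOKENS
#                         maximum tokens to generate (default: 512)
```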
2 changes: 2 additions & 0 deletions model_conf_tests.alt.json
@@ -3,6 +3,7 @@
["vikhyatk/moondream1"],
["echo840/Monkey"],
["echo840/Monkey-Chat"],
["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0"],
["THUDM/cogvlm-chat-hf"],
["THUDM/cogagent-chat-hf"],
["Qwen/Qwen-VL-Chat"],
@@ -24,6 +25,7 @@
["llava-hf/llava-1.5-7b-hf", "--use-flash-attn", "--device-map", "cuda:0"],
["llava-hf/llava-1.5-13b-hf", "--use-flash-attn", "--device-map", "cuda:0"],

["OpenGVLab/InternVL-Chat-V1-5", , "--load-in-4bit", "--device-map", "cuda:0"],
["THUDM/cogvlm-chat-hf", "--load-in-4bit"],
["THUDM/cogagent-chat-hf", "--load-in-4bit"],
["internlm/internlm-xcomposer2-7b-4bit", "--use-flash-attn", "--device", "cuda:0"],
2 changes: 2 additions & 0 deletions model_conf_tests.json
@@ -1,6 +1,7 @@
[
["vikhyatk/moondream2", "--use-flash-attn"],
["vikhyatk/moondream1"],
["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0"],
["qnguyen3/nanoLLaVA", "--use-flash-attn"],
["echo840/Monkey"],
["echo840/Monkey-Chat"],
@@ -23,6 +24,7 @@
["01-ai/Yi-VL-34B", "--use-flash-attn"],
["YanweiLi/Mini-Gemini-2B", "--use-flash-attn"],

["OpenGVLab/InternVL-Chat-V1-5", "--load-in-4bit", "--device-map", "cuda:0"],
["qnguyen3/nanoLLaVA", "--use-flash-attn", "--load-in-4bit"],
["THUDM/cogvlm-chat-hf", "--load-in-4bit"],
["THUDM/cogagent-chat-hf", "--load-in-4bit"],
4 changes: 2 additions & 2 deletions test_models.py
@@ -185,7 +185,7 @@ def single_round():
    for name, url in urls.items():
        answer = generate_response(url, "What is the subject of the image?")
        correct = name in answer.lower()
        results.extend([answer])
        results.extend([correct])
        if not correct:
            print(f"{name}[url]: fail, got: {answer}")
            if args.abort_on_fail:
@@ -196,7 +196,7 @@ def single_round():
        data_url = data_url_from_url(url)
        answer = generate_response(data_url, "What is the subject of the image?")
        correct = name in answer.lower()
        results.extend([answer])
        results.extend([correct])
        if not correct:
            print(f"{name}[data]: fail, got: {answer}")
            if args.abort_on_fail:
2 changes: 2 additions & 0 deletions vision.sample.env
@@ -4,6 +4,7 @@ HF_HOME=hf_home
#CUDA_VISIBLE_DEVICES=1,0
#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass, time: 4.4s, mem: 4.6GB, All tests passed.
#CLI_COMMAND="python vision.py -m vikhyatk/moondream1" # test fail, time: 3.6s, mem: 4.9GB, Test failed with Exception: Internal Server Error
#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass, time: 13.4s, mem: 52.0GB, All tests passed.
#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn" # test pass, time: 7.4s, mem: 8.5GB, All tests passed.
#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass, time: 6.2s, mem: 21.8GB, All tests passed.
#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass, time: 7.8s, mem: 21.7GB, All tests passed.
@@ -25,6 +26,7 @@ HF_HOME=hf_home
#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-6B --use-flash-attn" # test fail, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit).
#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-34B --use-flash-attn" # test fail, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit).
#CLI_COMMAND="python vision.py -m YanweiLi/Mini-Gemini-2B --use-flash-attn" # test pass, time: 4.2s, mem: 8.3GB, All tests passed.
#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --load-in-4bit --device-map cuda:0" # test fail, time: 17.2s, mem: 18.2GB,
#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --load-in-4bit" # test pass, time: 11.3s, mem: 8.0GB, All tests passed.
#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass, time: 19.5s, mem: 12.1GB, All tests passed.
#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass, time: 19.8s, mem: 12.2GB, All tests passed.
6 changes: 4 additions & 2 deletions vision_qna.py
@@ -404,7 +404,7 @@ def guess_model_format(model_name: str) -> str:
        'vicuna': ['vicuna', '13b'],
        'vicuna0': ['yi-vl'],
        'phi15': ['moondream1', 'moondream2', 'monkey'],
        'chatml': ['34b', 'yi-6b', 'nanollava'],
        'chatml': ['34b', 'yi-6b', 'nanollava', 'internvl-chat-v1-5'],
        'fuyu': ['fuyu'],
    }
    for format, options in model_format_match_map.items():
@@ -465,4 +465,6 @@ def guess_backend(model_name: str) -> str:

    if 'fuyu' in model_id:
        return 'fuyu'

    if 'internvl-chat-v1-5' in model_id:
        return 'internvl-chat-v1-5'
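
A minimal sketch (not from the repo) of the substring-based routing that the two hunks above extend; the simplified map and fallback below are illustrative assumptions.

```python
def guess_backend_sketch(model_name: str) -> str:
    # Simplified illustration of the dispatch above: lower-case the id and
    # return the first backend whose marker substring appears in it.
    model_id = model_name.lower()
    for marker, backend in {
        'internvl-chat-v1-5': 'internvl-chat-v1-5',
        'fuyu': 'fuyu',
    }.items():
        if marker in model_id:
            return backend
    return 'llava'  # assumed default, for illustration only

print(guess_backend_sketch('OpenGVLab/InternVL-Chat-V1-5'))  # internvl-chat-v1-5
```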
