0.11.0 InternVL-Chat-V1-5 support

matatonic committed Apr 28, 2024
1 parent c8d7fc7 commit bd60163

Showing 8 changed files with 160 additions and 7 deletions.
10 changes: 8 additions & 2 deletions README.md
@@ -9,6 +9,8 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview`

## Model support

- [X] [OpenGVLab](https://huggingface.co/OpenGVLab)
- - [X] [InternVL-Chat-V1-5](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5) (won't gpu split yet, 4bit not recommended)
- [X] [THUDM/CogVLM](https://github.com/THUDM/CogVLM)
- - [X] [cogvlm-chat-hf](https://huggingface.co/THUDM/cogvlm-chat-hf)
- - [X] [cogagent-chat-hf](https://huggingface.co/THUDM/cogagent-chat-hf)
@@ -57,10 +59,14 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview`

See: [OpenVLM Leaderboard](https://huggingface.co/spaces/opencompass/open_vlm_leaderboard)

Version: 0.10.0

## Recent updates

Version: 0.11.0

- new model support: OpenGVLab/InternVL-Chat-V1-5, up to 4k resolution, top open-source model

Version: 0.10.0

- new model support: adept/fuyu-8b
- new model support: MiniCPM-V-2
- new model support: MiniGemini-7B -> MiniGemini-8x7B-HD, alternate docker.
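
For orientation (not part of this commit): a minimal sketch of calling the OpenAI-compatible server from a client. The base URL, port, API key, and model name below are assumptions; adjust them to your deployment.

```python
# Hypothetical client sketch; base_url, api_key and model name are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:5006/v1", api_key="sk-none")

response = client.chat.completions.create(
    model="gpt-4-vision-preview",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is the subject of the image?"},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.jpg"}},
        ],
    }],
    max_tokens=512,
)
print(response.choices[0].message.content)
```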
138 changes: 138 additions & 0 deletions backend/internvl-chat-v1-5.py
@@ -0,0 +1,138 @@
import os
from transformers import AutoTokenizer, AutoModel
from vision_qna import *
import torch
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode

# OpenGVLab/InternVL-Chat-V1-5

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images


def load_image(image, input_size=448, max_num=6):
    #image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values


class VisionQnA(VisionQnABase):
    model_name: str = "internvl-chat-v1-5"
    format: str = "chatml"

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
        self.model = AutoModel.from_pretrained(**self.params).eval()

        self.model.img_context_token_id = self.tokenizer.convert_tokens_to_ids('<IMG_CONTEXT>')

        if self.tokenizer.convert_tokens_to_ids('<|im_end|>') != 0:
            self.eos_token_id = self.tokenizer.convert_tokens_to_ids('<|im_end|>') # 92542, InternLM2
        else:
            self.eos_token_id = self.tokenizer.eos_token_id

        print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}")

    async def chat_with_images(self, request: ImageChatRequest) -> str:
        images, prompt = await chatml_prompt_from_messages(request.messages, img_tok='')

        images = [load_image(image).to(self.model.dtype).cuda() for image in images]
        if len(images) > 1:
            pixel_values = torch.cat(images, dim=0)
        else:
            pixel_values = images[0]

        default_params = {
            'num_beams': 1,
            'max_new_tokens': 512,
            'do_sample': False,
            'eos_token_id': self.eos_token_id,
        }

        generation_config = self.get_generation_params(request, default_params)

        del generation_config['use_cache']

        image_tokens = '<img>' + '<IMG_CONTEXT>' * self.model.num_image_token * pixel_values.shape[0] + '</img>\n'
        model_inputs = self.tokenizer(image_tokens + prompt, return_tensors='pt')
        input_ids = model_inputs['input_ids'].cuda()
        attention_mask = model_inputs['attention_mask'].cuda()

        output = self.model.generate(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            **generation_config,
        )
        response = self.tokenizer.decode(output[0], skip_special_tokens=True)

        return response.split('<|im_end|>')[0].strip()
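
A short usage sketch of the tiling helpers above (illustration only, not part of the commit); the image size is arbitrary and the shapes follow from the code as written.

```python
# Illustration only: tile an arbitrary 1600x900 image with the helpers above.
from PIL import Image

img = Image.new('RGB', (1600, 900))

# aspect ratio ~1.78 -> closest grid is (2, 1): two 448x448 tiles,
# plus the square thumbnail appended because use_thumbnail=True.
tiles = dynamic_preprocess(img, image_size=448, use_thumbnail=True, max_num=6)
print(len(tiles))               # 3

pixel_values = load_image(img)  # normalized tensors, stacked along dim 0
print(pixel_values.shape)       # torch.Size([3, 3, 448, 448])
```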
3 changes: 2 additions & 1 deletion chat_with_image.py
@@ -26,7 +26,8 @@ def url_for_api(img_url: str = None, filename: str = None, always_data=False) ->

if __name__ == '__main__':
    # Initialize argparse
    parser = argparse.ArgumentParser(description='Test vision using OpenAI')
    parser = argparse.ArgumentParser(description='Test vision using OpenAI',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-s', '--system-prompt', type=str, default=None)
    parser.add_argument('-m', '--max-tokens', type=int, default=None)
    parser.add_argument('-t', '--temperature', type=float, default=None)
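
Aside (not from the repo): a standalone illustration of what `formatter_class=argparse.ArgumentDefaultsHelpFormatter` changes; it appends each option's default to its help text when a help string is given.

```python
import argparse

# Standalone example: with ArgumentDefaultsHelpFormatter, --help shows
# "(default: ...)" after each option that defines a help string.
parser = argparse.ArgumentParser(description='Demo',
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-m', '--max-tokens', type=int, default=512,
                    help='maximum tokens to generate')
parser.print_help()
# ...
#   -m MAX_TOKENS, --max-tokens MAX_TOKENS
#                         maximum tokens to generate (default: 512)
```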
2 changes: 2 additions & 0 deletions model_conf_tests.alt.json
@@ -3,6 +3,7 @@
["vikhyatk/moondream1"],
["echo840/Monkey"],
["echo840/Monkey-Chat"],
["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0"],
["THUDM/cogvlm-chat-hf"],
["THUDM/cogagent-chat-hf"],
["Qwen/Qwen-VL-Chat"],
@@ -24,6 +25,7 @@
["llava-hf/llava-1.5-7b-hf", "--use-flash-attn", "--device-map", "cuda:0"],
["llava-hf/llava-1.5-13b-hf", "--use-flash-attn", "--device-map", "cuda:0"],

["OpenGVLab/InternVL-Chat-V1-5", , "--load-in-4bit", "--device-map", "cuda:0"],
["THUDM/cogvlm-chat-hf", "--load-in-4bit"],
["THUDM/cogagent-chat-hf", "--load-in-4bit"],
["internlm/internlm-xcomposer2-7b-4bit", "--use-flash-attn", "--device", "cuda:0"],
2 changes: 2 additions & 0 deletions model_conf_tests.json
@@ -1,6 +1,7 @@
[
["vikhyatk/moondream2", "--use-flash-attn"],
["vikhyatk/moondream1"],
["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0"],
["qnguyen3/nanoLLaVA", "--use-flash-attn"],
["echo840/Monkey"],
["echo840/Monkey-Chat"],
@@ -23,6 +24,7 @@
["01-ai/Yi-VL-34B", "--use-flash-attn"],
["YanweiLi/Mini-Gemini-2B", "--use-flash-attn"],

["OpenGVLab/InternVL-Chat-V1-5", "--load-in-4bit", "--device-map", "cuda:0"],
["qnguyen3/nanoLLaVA", "--use-flash-attn", "--load-in-4bit"],
["THUDM/cogvlm-chat-hf", "--load-in-4bit"],
["THUDM/cogagent-chat-hf", "--load-in-4bit"],
4 changes: 2 additions & 2 deletions test_models.py
@@ -185,7 +185,7 @@ def single_round():
    for name, url in urls.items():
        answer = generate_response(url, "What is the subject of the image?")
        correct = name in answer.lower()
        results.extend([answer])
        results.extend([correct])
        if not correct:
            print(f"{name}[url]: fail, got: {answer}")
            if args.abort_on_fail:
@@ -196,7 +196,7 @@ def single_round():
        data_url = data_url_from_url(url)
        answer = generate_response(data_url, "What is the subject of the image?")
        correct = name in answer.lower()
        results.extend([answer])
        results.extend([correct])
        if not correct:
            print(f"{name}[data]: fail, got: {answer}")
            if args.abort_on_fail:
2 changes: 2 additions & 0 deletions vision.sample.env
@@ -4,6 +4,7 @@ HF_HOME=hf_home
#CUDA_VISIBLE_DEVICES=1,0
#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass, time: 4.4s, mem: 4.6GB, All tests passed.
#CLI_COMMAND="python vision.py -m vikhyatk/moondream1" # test fail, time: 3.6s, mem: 4.9GB, Test failed with Exception: Internal Server Error
#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass, time: 13.4s, mem: 52.0GB, All tests passed.
#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn" # test pass, time: 7.4s, mem: 8.5GB, All tests passed.
#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass, time: 6.2s, mem: 21.8GB, All tests passed.
#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass, time: 7.8s, mem: 21.7GB, All tests passed.
@@ -25,6 +26,7 @@ HF_HOME=hf_home
#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-6B --use-flash-attn" # test fail, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit).
#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-34B --use-flash-attn" # test fail, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit).
#CLI_COMMAND="python vision.py -m YanweiLi/Mini-Gemini-2B --use-flash-attn" # test pass, time: 4.2s, mem: 8.3GB, All tests passed.
#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --load-in-4bit --device-map cuda:0" # test fail, time: 17.2s, mem: 18.2GB,
#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --load-in-4bit" # test pass, time: 11.3s, mem: 8.0GB, All tests passed.
#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass, time: 19.5s, mem: 12.1GB, All tests passed.
#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass, time: 19.8s, mem: 12.2GB, All tests passed.
6 changes: 4 additions & 2 deletions vision_qna.py
@@ -404,7 +404,7 @@ def guess_model_format(model_name: str) -> str:
        'vicuna': ['vicuna', '13b'],
        'vicuna0': ['yi-vl'],
        'phi15': ['moondream1', 'moondream2', 'monkey'],
        'chatml': ['34b', 'yi-6b', 'nanollava'],
        'chatml': ['34b', 'yi-6b', 'nanollava', 'internvl-chat-v1-5'],
        'fuyu': ['fuyu'],
    }
    for format, options in model_format_match_map.items():
@@ -465,4 +465,6 @@ def guess_backend(model_name: str) -> str:

    if 'fuyu' in model_id:
        return 'fuyu'

    if 'internvl-chat-v1-5' in model_id:
        return 'internvl-chat-v1-5'
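
A minimal sketch (not from the repo) of the substring-based routing that the two hunks above extend; the simplified map and fallback below are illustrative assumptions.

```python
def guess_backend_sketch(model_name: str) -> str:
    # Simplified illustration of the dispatch above: lower-case the id and
    # return the first backend whose marker substring appears in it.
    model_id = model_name.lower()
    for marker, backend in {
        'internvl-chat-v1-5': 'internvl-chat-v1-5',
        'fuyu': 'fuyu',
    }.items():
        if marker in model_id:
            return backend
    return 'llava'  # assumed default, for illustration only

print(guess_backend_sketch('OpenGVLab/InternVL-Chat-V1-5'))  # internvl-chat-v1-5
```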
