Hey guys, I hope you are all doing well. I have my predict.py and cog.yaml ready in my project repository, but when I run "cog predict -i image=@https://cdn.shopify.com/s/files/1/0057/3728/3618/products/a-man-called-otto_ezrjr0pm_480x.progressive.jpg" I get an error and the Docker image fails to build.

I'm on an A100 instance from Lambda Labs. Here is my predict.py code:
from cog import BasePredictor, Input, Path
import os
from io import BytesIO

import requests
import torch
from PIL import Image, ImageOps
from transformers import AutoTokenizer, CLIPVisionModel, CLIPImageProcessor

from llava.conversation import conv_templates, SeparatorStyle
from llava.utils import disable_torch_init
from llava.model import *
from llava.model.utils import KeywordsStoppingCriteria

DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"

def load_image(image_file):
    if image_file.startswith('http') or image_file.startswith('https'):
        response = requests.get(image_file)
        image = Image.open(BytesIO(response.content)).convert('RGB')
    else:
        image = Image.open(image_file).convert('RGB')
    return image

def resize_image(image, target_size):
    width, height = image.size
    aspect_ratio = width / height
    if aspect_ratio > 1:
        new_width = target_size[0]
        new_height = int(new_width / aspect_ratio)
    else:
        new_height = target_size[1]
        new_width = int(new_height * aspect_ratio)
    image = image.resize((new_width, new_height))
    width_diff = target_size[0] - image.size[0]
    height_diff = target_size[1] - image.size[1]
    left_padding = 0
    top_padding = 0
    right_padding = width_diff - left_padding
    bottom_padding = height_diff - top_padding
    padded_image = ImageOps.expand(
        image,
        border=(left_padding, top_padding, right_padding, bottom_padding),
        fill=0,
    )
    return padded_image

class Predictor(BasePredictor):
    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""
        self.model_name = "llava_v1"  # Predefined model name
        self.conv_mode = None  # Predefined conversation mode
        disable_torch_init()
        self.model_name = os.path.expanduser(self.model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        if "mpt" in self.model_name.lower():
            self.model = LlavaMPTForCausalLM.from_pretrained(
                self.model_name, low_cpu_mem_usage=True, torch_dtype=torch.float16, use_cache=True
            ).cuda()
        else:
            self.model = LlavaLlamaForCausalLM.from_pretrained(
                self.model_name, low_cpu_mem_usage=True, torch_dtype=torch.float16, use_cache=True
            ).cuda()
        self.image_processor = CLIPImageProcessor.from_pretrained(
            self.model.config.mm_vision_tower, torch_dtype=torch.float16
        )
        mm_use_im_start_end = getattr(self.model.config, "mm_use_im_start_end", False)
        self.tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
        if mm_use_im_start_end:
            self.tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
        self.vision_tower = self.model.get_model().vision_tower[0]
        if self.vision_tower.device.type == 'meta':
            self.vision_tower = CLIPVisionModel.from_pretrained(
                self.vision_tower.config._name_or_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
            ).cuda()
            self.model.get_model().vision_tower[0] = self.vision_tower
        else:
            self.vision_tower.to(device='cuda', dtype=torch.float16)
        self.vision_config = self.vision_tower.config
        self.vision_config.im_patch_token = self.tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_PATCH_TOKEN])[0]
        self.vision_config.use_im_start_end = mm_use_im_start_end
        if mm_use_im_start_end:
            self.vision_config.im_start_token, self.vision_config.im_end_token = \
                self.tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])

    def predict(
        self,
        image: Path = Input(description="Input image"),
        query: str = Input(description="Query"),
    ) -> str:
        """Run a single prediction on the model"""
        image_token_len = (self.vision_config.image_size // self.vision_config.patch_size) ** 2
        qs = query
        if self.vision_config.use_im_start_end:
            qs = qs + '\n' + DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN * image_token_len + DEFAULT_IM_END_TOKEN
        else:
            qs = qs + '\n' + DEFAULT_IMAGE_PATCH_TOKEN * image_token_len
        if "v1" in self.model_name.lower():
            conv_mode = "llava_v1"
        elif "mpt" in self.model_name.lower():
            conv_mode = "mpt_multimodal"
        else:
            conv_mode = "multimodal"
        if self.conv_mode is not None and conv_mode != self.conv_mode:
            print('[WARNING] the auto-inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(
                conv_mode, self.conv_mode, self.conv_mode))
        else:
            self.conv_mode = conv_mode
        conv = conv_templates[self.conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()
        inputs = self.tokenizer([prompt])
        # cog passes a pathlib-style Path; convert to str so load_image's startswith() checks work
        image = load_image(str(image))
        image = resize_image(image, (336, 336))
        image_tensor = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
        input_ids = torch.as_tensor(inputs.input_ids).cuda()
        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids)
        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids,
                images=image_tensor.unsqueeze(0).half().cuda(),
                do_sample=True,
                temperature=0.2,
                max_new_tokens=1024,
                stopping_criteria=[stopping_criteria],
            )
        input_token_len = input_ids.shape[1]
        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
        if n_diff_input_output > 0:
            print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
        outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
        outputs = outputs.strip()
        if outputs.endswith(stop_str):
            outputs = outputs[:-len(stop_str)]
        outputs = outputs.strip()
        return outputs

Here is my cog.yaml file:
# Configuration for Cog ⚙️
# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md

build:
  # set to true if your model requires a GPU
  gpu: true

  # a list of ubuntu apt packages to install
  system_packages:
    - "libgl1-mesa-glx"
    - "libglib2.0-0"

  # python version in the form '3.11' or '3.11.4'
  python_version: "3.10"

  # a list of packages in the format <package-name>==<version>
  python_requirements: requirements.txt

  # commands run after the environment is setup
  run:
    - "echo env is ready!"
    - "echo another command if needed"

# predict.py defines how predictions are run on your model
predict: "predict.py:Predictor"
I'm really hoping this can be fixed easily! Thanks in advance!
This is a known bug in v0.8.x.
One quick fix is to use v0.7.2 for now. Or set gpu: false to build the image and then run it with docker --gpus all ... yourself :(
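Roughly, the workaround looks like this (a sketch only; the v0.7.2 download URL follows the install pattern from the cog README, and my-llava is a placeholder image name, so adjust both to your setup):

# Option 1: pin the cog CLI to v0.7.2 until the v0.8.x GPU build bug is fixed
sudo curl -o /usr/local/bin/cog -L "https://github.com/replicate/cog/releases/download/v0.7.2/cog_$(uname -s)_$(uname -m)"
sudo chmod +x /usr/local/bin/cog

# Option 2: set gpu: false in cog.yaml, build the image, then run it with GPU access yourself
cog build -t my-llava
docker run --rm --gpus all -p 5000:5000 my-llava
curl http://localhost:5000/predictions -X POST \
    -H "Content-Type: application/json" \
    -d '{"input": {"image": "https://example.com/some-image.jpg", "query": "What is in this picture?"}}'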