[Fix, Feat] Solve llava wild issue when task num can't divide, add fuyu ppl (EvolvingLMMs-Lab#37)

* Add fuyu ppl, fix llava-bench gather issue

* Add gpt4V

* Black lint
kcz358 authored Feb 3, 2024
1 parent 596649b commit 8119e58
Showing 5 changed files with 215 additions and 11 deletions.
1 change: 1 addition & 0 deletions lmms_eval/models/__init__.py
@@ -2,3 +2,4 @@
from .otterhd import OtterHD
from .qwen_vl import Qwen_VL
from .fuyu import Fuyu
from .gpt4v import GPT4V
39 changes: 38 additions & 1 deletion lmms_eval/models/fuyu.py
@@ -8,6 +8,10 @@
from lmms_eval.api.instance import Instance
from tqdm import tqdm

import warnings

warnings.filterwarnings("ignore")


@register_model("fuyu")
class Fuyu(lmms):
@@ -118,7 +122,40 @@ def _collate(x):

    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
        # TODO
        assert False, "We have not implemented this function for llava yet"
        res = []
        pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")

        for contexts, doc_to_target, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
            # encode, pad, and truncate contexts for this batch
            continuation = doc_to_target(self.task_dict[task][split][doc_id])
            visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
            visuals = self.flatten(visuals)
            formatted_contexts = [f"{contexts}\n"]
            formatted_continuation = [f"{contexts}\n{continuation}"]
            model_inputs = self.processor(text=formatted_continuation, images=visuals, device=self.device)
            for k, v in model_inputs.items():
                model_inputs[k] = v.to(self.device, non_blocking=True) if isinstance(v, torch.Tensor) else [vv.to(self.device, non_blocking=True) for vv in v]

            for index in range(len(model_inputs["image_patches"])):
                model_inputs["image_patches"][index] = model_inputs["image_patches"][index].to(dtype=next(self.model.parameters()).dtype)

            labels = model_inputs["input_ids"].clone()
            contxt_id = self.processor(text=formatted_contexts, return_tensors="pt")["input_ids"]
            labels[:, : contxt_id.shape[1]] = -100  # ignore context tokens in the loss
            with torch.inference_mode():
                outputs = self.model(**model_inputs, labels=labels)
            loss = outputs["loss"]
            # loss = torch.exp(loss)
            logits = outputs["logits"]
            greedy_tokens = logits.argmax(dim=-1)
            cont_toks = model_inputs["input_ids"][:, contxt_id.shape[1] :]  # [1, seq]
            greedy_tokens = greedy_tokens[:, contxt_id.shape[1] : model_inputs["input_ids"].shape[1]]  # [1, seq]
            max_equal = (greedy_tokens == cont_toks).all()
            res.append((float(loss.item()), bool(max_equal)))
            pbar.update(1)

        pbar.close()
        return res

    def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
        # TODO
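For orientation, the new Fuyu loglikelihood path scores only the continuation: it clones input_ids into labels, masks the context positions with -100 (the ignore index of the cross-entropy loss), and separately checks whether greedy decoding reproduces the continuation exactly. A minimal sketch of that pattern with dummy tensors (shapes and values are illustrative only; this is not the Fuyu model or processor):

import torch
import torch.nn.functional as F

# toy stand-ins: one sequence of 6 tokens over a 10-token vocab; the first 3 are "context"
vocab_size, ctx_len = 10, 3
input_ids = torch.tensor([[2, 5, 7, 1, 4, 9]])  # [1, seq]
logits = torch.randn(1, 6, vocab_size)          # pretend model output

# mask the context positions so the loss is computed over the continuation only
labels = input_ids.clone()
labels[:, :ctx_len] = -100  # -100 is the ignore index of cross_entropy

# causal-LM loss: logits at position t predict the token at t + 1
shift_logits = logits[:, :-1, :].reshape(-1, vocab_size)
shift_labels = labels[:, 1:].reshape(-1)
loss = F.cross_entropy(shift_logits, shift_labels, ignore_index=-100)

# greedy exact-match over the continuation, mirroring the (loss, max_equal) pair appended to res above
greedy_tokens = logits.argmax(dim=-1)[:, ctx_len:]
cont_toks = input_ids[:, ctx_len:]
print(float(loss.item()), bool((greedy_tokens == cont_toks).all()))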
108 changes: 108 additions & 0 deletions lmms_eval/models/gpt4v.py
@@ -0,0 +1,108 @@
from io import BytesIO
import os
import base64
from typing import List, Tuple
from tqdm import tqdm
import requests as url_requests
import time
import logging

from lmms_eval.api.instance import Instance
from lmms_eval.api.model import lmms
from lmms_eval.api.registry import register_model
from lmms_eval import utils

from PIL import Image

API_TYPE = os.getenv("API_TYPE", "openai")
NUM_SECONDS_TO_SLEEP = 5
eval_logger = logging.getLogger("lmms-eval")

if API_TYPE == "openai":
    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
elif API_TYPE == "azure":
    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
    headers = {
        "api-key": API_KEY,
        "Content-Type": "application/json",
    }


@register_model("gpt4V")
class GPT4V(lmms):
    def __init__(self, **kwargs) -> None:
        super().__init__()

    # Function to encode the image
    def encode_image(self, image: Image):
        output_buffer = BytesIO()
        image.save(output_buffer, format="JPEG")
        byte_data = output_buffer.getvalue()
        base64_str = base64.b64encode(byte_data).decode("utf-8")
        return base64_str

    def flatten(self, input):
        new_list = []
        for i in input:
            for j in i:
                new_list.append(j)
        return new_list

    def generate_until(self, requests) -> List[str]:
        res = []
        pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")

        for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
            # encode, pad, and truncate contexts for this batch
            visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
            visuals = self.flatten(visuals)

            payload = {"model": "gpt-4-vision-preview", "messages": [{"role": "user", "content": []}]}
            payload["messages"][0]["content"].append({"type": "text", "text": contexts})

            for visual in visuals:
                img = self.encode_image(visual)
                payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}})
            if "max_new_tokens" not in gen_kwargs:
                gen_kwargs["max_new_tokens"] = 1024
            if "temperature" not in gen_kwargs:
                gen_kwargs["temperature"] = 0
            if "top_p" not in gen_kwargs:
                gen_kwargs["top_p"] = None
            if "num_beams" not in gen_kwargs:
                gen_kwargs["num_beams"] = 1

            # payload["max_tokens"] = gen_kwargs["max_new_tokens"]
            # payload["temperature"] = gen_kwargs["temperature"]

            for attempt in range(5):
                try:
                    response = url_requests.post(API_URL, headers=headers, json=payload)
                    response_data = response.json()

                    content = response_data["choices"][0]["message"]["content"].strip()
                    break  # If successful, break out of the loop

                except Exception as e:
                    eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}")
                    if attempt < 5 - 1:  # If we have retries left, sleep and then continue to next attempt
                        time.sleep(NUM_SECONDS_TO_SLEEP)
                    else:  # If this was the last attempt, log and return empty
                        eval_logger.error(f"All 5 attempts failed. Last error message: {str(e)}")
                        content = ""
            res.append(content)
        return res

    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
        # TODO
        assert False, "GPT4V does not support loglikelihood"

    def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
        # TODO
        assert False, "GPT4V does not support loglikelihood_rolling"
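The GPT4V wrapper above sends each visual as a base64-encoded JPEG data URL inside an OpenAI chat-completions payload, picking the endpoint and key from OPENAI_API_URL/OPENAI_API_KEY or AZURE_ENDPOINT/AZURE_API_KEY according to API_TYPE, and retries a failed request up to five times with a fixed sleep in between. A standalone sketch of the payload it assembles, using a synthetic PIL image and a made-up prompt; nothing is sent to the API here:

import base64
from io import BytesIO

from PIL import Image

image = Image.new("RGB", (64, 64), color="red")  # stand-in for a doc_to_visual output
buffer = BytesIO()
image.save(buffer, format="JPEG")
b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")

payload = {
    "model": "gpt-4-vision-preview",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe the image."},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
            ],
        }
    ],
}
print(payload["messages"][0]["content"][0])  # the text part; the image part follows as a data URL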
40 changes: 35 additions & 5 deletions lmms_eval/tasks/llava-bench-coco/utils.py
@@ -8,10 +8,13 @@
import time
import yaml
from pathlib import Path
from copy import deepcopy

eval_logger = logging.getLogger("lmms-eval")
NUM_SECONDS_TO_SLEEP = 0.5

LLAVA_W_METRICS = ["gpt_eval_llava_conv", "gpt_eval_llava_detail", "gpt_eval_llava_complex"]

rule_dict = json.load(open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "rule.json"), "r"))

with open(Path(__file__).parent / "llava-bench-coco.yaml", "r") as f:
@@ -24,16 +27,30 @@

    config = yaml.safe_load("".join(safe_data))

API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]

GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]

API_TYPE = os.getenv("API_TYPE", "openai")

def get_eval(content: str, max_tokens: int, retries: int = 3):
if API_TYPE == "openai":
    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
elif API_TYPE == "azure":
    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
    headers = {
        "api-key": API_KEY,
        "Content-Type": "application/json",
    }


def get_eval(content: str, max_tokens: int, retries: int = 3):
    global headers

    messages = [
        {
@@ -125,7 +142,7 @@ def llava_process_results(doc, result):
        scores = [-1, -1]

    metric = f"gpt_eval_llava_{doc.get('category', 'unknown')}"
    review_dict = {
    category_review_dict = {
        "question": question,
        "ans1": ans1,
        "ans2": ans2,
@@ -136,8 +153,19 @@
        "eval_model": model_name,
    }

    non_category_review_dict = deepcopy(category_review_dict)
    non_category_review_dict["scores"] = [-999, -999]

    data_dict = {}
    for m in LLAVA_W_METRICS:
        if m == metric:
            data_dict[m] = category_review_dict
        else:
            data_dict[m] = non_category_review_dict
    data_dict["gpt_eval_llava_all"] = category_review_dict

    # return {"gpt_eval_llava_all": review_dict}
    return {metric: review_dict, "gpt_eval_llava_all": review_dict}
    return data_dict


def llava_conv_aggregation(results):
@@ -160,6 +188,8 @@ def llava_aggregation(results, category):
    try:
        scores = []
        for result in results:
            if -999 in result["scores"]:
                continue
            scores.append(result["scores"])

        stats = np.asarray(scores).mean(0).tolist()
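The gather fix above works by having llava_process_results emit the same set of metric keys for every document: the document's own category carries the real review, every other category gets a placeholder whose scores are [-999, -999], and llava_aggregation drops those placeholders before averaging, so per-metric result lists stay aligned even when the document count does not divide evenly across processes. The same change is applied to llava-in-the-wild in the next file. A small self-contained illustration of the pattern (toy scores; the process/aggregate helpers here are stand-ins, not the repo's functions):

from copy import deepcopy

import numpy as np

METRICS = ["gpt_eval_llava_conv", "gpt_eval_llava_detail", "gpt_eval_llava_complex"]

def process(category, scores):
    # every doc emits every metric key, so gathered per-metric lists stay the same length
    real = {"scores": scores}
    placeholder = deepcopy(real)
    placeholder["scores"] = [-999, -999]
    return {m: (real if m == f"gpt_eval_llava_{category}" else placeholder) for m in METRICS}

docs = [process("conv", [8, 7]), process("detail", [6, 9]), process("conv", [9, 9])]

def aggregate(results):
    # skip the placeholder entries, exactly like the new llava_aggregation filter
    kept = [r["scores"] for r in results if -999 not in r["scores"]]
    return np.asarray(kept).mean(0).tolist()

print(aggregate([d["gpt_eval_llava_conv"] for d in docs]))    # [8.5, 8.0] from the two conv docs
print(aggregate([d["gpt_eval_llava_detail"] for d in docs]))  # [6.0, 9.0] from the single detail doc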
38 changes: 33 additions & 5 deletions lmms_eval/tasks/llava-in-the-wild/utils.py
@@ -8,10 +8,13 @@
import time
import yaml
from pathlib import Path
from copy import deepcopy

eval_logger = logging.getLogger("lmms-eval")
NUM_SECONDS_TO_SLEEP = 0.5

LLAVA_W_METRICS = ["gpt_eval_llava_conv", "gpt_eval_llava_detail", "gpt_eval_llava_complex"]

rule_dict = json.load(open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "rule.json"), "r"))

with open(Path(__file__).parent / "llava-in-the-wild.yaml", "r") as f:
@@ -24,16 +27,28 @@

    config = yaml.safe_load("".join(safe_data))

API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]

API_TYPE = os.getenv("API_TYPE", "openai")

def get_eval(content: str, max_tokens: int, retries: int = 3):
if API_TYPE == "openai":
    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
elif API_TYPE == "azure":
    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
    headers = {
        "api-key": API_KEY,
        "Content-Type": "application/json",
    }


def get_eval(content: str, max_tokens: int, retries: int = 3):
    global headers

    messages = [
        {
@@ -125,7 +140,7 @@ def llava_process_results(doc, result):
        scores = [-1, -1]

    metric = f"gpt_eval_llava_{doc.get('category', 'all')}"
    review_dict = {
    category_review_dict = {
        "question": question,
        "ans1": ans1,
        "ans2": ans2,
@@ -136,8 +151,19 @@
        "eval_model": model_name,
    }

    non_category_review_dict = deepcopy(category_review_dict)
    non_category_review_dict["scores"] = [-999, -999]

    data_dict = {}
    for m in LLAVA_W_METRICS:
        if m == metric:
            data_dict[m] = category_review_dict
        else:
            data_dict[m] = non_category_review_dict
    data_dict["gpt_eval_llava_all"] = category_review_dict

    # return {"gpt_eval_llava_all": review_dict}
    return {metric: review_dict, "gpt_eval_llava_all": review_dict}
    return data_dict


def llava_conv_aggregation(results):
@@ -160,6 +186,8 @@ def llava_aggregation(results, category):
    try:
        scores = []
        for result in results:
            if -999 in result["scores"]:
                continue
            scores.append(result["scores"])

        stats = np.asarray(scores).mean(0).tolist()
