From 9ceb6ecbb29f33d01481286050514bf1aaa2266b Mon Sep 17 00:00:00 2001
From: Soumik Rakshit <19soumik.rakshit96@gmail.com>
Date: Wed, 30 Oct 2024 16:16:37 +0000
Subject: [PATCH] update: data structure in OpenAIJudge.frame_question

---
 hemm/metrics/__init__.py                      |  7 +-
 hemm/metrics/image_quality/lpips.py           |  3 +-
 .../vqa/judges/mmllm_judges/openai_judge.py   | 71 ++++++++++---------
 .../test_2d_spatial_relationship_eval.py      |  4 +-
 hemm/tests/test_prompt_alignment_eval.py      |  3 +-
 5 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/hemm/metrics/__init__.py b/hemm/metrics/__init__.py
index 380ab11..3b3e6a4 100644
--- a/hemm/metrics/__init__.py
+++ b/hemm/metrics/__init__.py
@@ -1,4 +1,7 @@
-from .prompt_alignment import (BLIPScoreMertric, CLIPImageQualityScoreMetric,
-                               CLIPScoreMetric)
+from .prompt_alignment import (
+    BLIPScoreMertric,
+    CLIPImageQualityScoreMetric,
+    CLIPScoreMetric,
+)
 
 __all__ = ["BLIPScoreMertric", "CLIPImageQualityScoreMetric", "CLIPScoreMetric"]
diff --git a/hemm/metrics/image_quality/lpips.py b/hemm/metrics/image_quality/lpips.py
index 45e5b2b..fe67f95 100644
--- a/hemm/metrics/image_quality/lpips.py
+++ b/hemm/metrics/image_quality/lpips.py
@@ -5,8 +5,7 @@
 import torch
 import weave
 from PIL import Image
-from torchmetrics.functional.image import \
-    learned_perceptual_image_patch_similarity
+from torchmetrics.functional.image import learned_perceptual_image_patch_similarity
 
 from ...utils import base64_encode_image
 from .base import BaseImageQualityMetric, ComputeMetricOutput
diff --git a/hemm/metrics/vqa/judges/mmllm_judges/openai_judge.py b/hemm/metrics/vqa/judges/mmllm_judges/openai_judge.py
index 2e8ec35..e8d7512 100644
--- a/hemm/metrics/vqa/judges/mmllm_judges/openai_judge.py
+++ b/hemm/metrics/vqa/judges/mmllm_judges/openai_judge.py
@@ -1,6 +1,6 @@
 import os
 import subprocess
-from typing import List
+from typing import Dict, List
 
 import spacy
 import weave
@@ -9,8 +9,7 @@
 from pydantic import BaseModel
 
 from .....utils import base64_encode_image
-from .commons import (JudgeMent, JudgeQuestion, PromptCategory,
-                      TaggedPromptParts)
+from .commons import JudgeMent, JudgeQuestion, PromptCategory, TaggedPromptParts
 
 
 class OpenAIJudgeMent(BaseModel):
@@ -92,7 +91,7 @@ def extract_prompt_parts(self, prompt: str) -> List[TaggedPromptParts]:
         return tagged_prompt_parts
 
     @weave.op()
-    def frame_question(self, prompt: str, image: Image.Image) -> List[JudgeQuestion]:
+    def frame_question(self, prompt: str, image: Image.Image) -> List[Dict[str, str]]:
         """Frame the question corresponding to the given prompt and image
         for the chain-of-thought system of judgement.
 
@@ -101,21 +100,21 @@ def frame_question(self, prompt: str, image: Image.Image) -> List[JudgeQuestion]
             image (Image.Image): The image to frame the question for.
 
         Returns:
-            List[JudgeQuestion]: List of questions to ask for the given prompt.
+            List[Dict[str, str]]: List of questions to ask for the given prompt.
         """
         prompt = str(prompt)
         if self.prompt_property in [PromptCategory.spatial, PromptCategory.spatial_3d]:
             self._total_score = 5
-            question = JudgeQuestion(
-                image_desciption_system_prompt="""
+            question = {
+                "image_desciption_system_prompt": """
 You are a helpful assistant meant to describe images is detail.
 You should pay special attention to the objects and their spatial layout in the image.
                 """,
-                judgement_question_system_prompt="""
+                "judgement_question_system_prompt": """
 You are a helpful assistant meant to identify objects and their spatial layout in the image.
 You have to extract the question, the score, and the explanation from the user's response.
                 """,
-                judgement_question=f"""
+                "judgement_question": f"""
 Looking at the image and given a detailed description of the image, evaluate if the text \"{prompt}\" is correctly portrayed in the image.
 Give a score from 1 to 5, according to the following criteria:
 
 After scoring the image, please also evaluate if the score should be deducted based on the following criteria:
 1. The objects in the image should not be deformed. If the objects are deformed, you should deduct 1 point from the score.
 2. The attributes of the objects should be consistent with the text prompt. You should deduct 1 point from the score if the attributes of the objects in the image are not consistent with the text prompt.
 3. The spatial layout of the objects in the image should be consistent with the text prompt. You should deduct 1 point from the score if the spatial layout of the objects in the image is not consistent with the text prompt.
                 """,
-            )
+            }
             return [(question, image)]
         elif self.prompt_property == PromptCategory.action:
             self._total_score = 5
-            question = JudgeQuestion(
-                image_desciption_system_prompt="""
+            question = {
+                "image_desciption_system_prompt": """
 You are a helpful assistant meant to describe images is detail.
 You should pay special attention to the the actions, events, objects and their relationships in the image.
                 """,
-                judgement_question_system_prompt="""
+                "judgement_question_system_prompt": """
 You are a helpful assistant meant to identify the actions, events, objects and their relationships in the image.
 You have to extract the question, the score, and the explanation from the user's response.
                 """,
-                judgement_question=f"""
+                "judgement_question": f"""
 Looking at the image and given a detailed description of the image, evaluate if the text \"{prompt}\" is correctly portrayed in the image.
 Give a score from 1 to 5, according to the following criteria:
 
 After scoring the image, please also evaluate if the score should be deducted based on the following criteria:
 1. The objects in the image should not be deformed. If the objects are deformed, you should deduct 1 point from the score.
 2. The attributes of the objects should be consistent with the text prompt. You should deduct 1 point from the score if the attributes of the objects in the image are not consistent with the text prompt.
 3. The spatial layout of the objects in the image should be consistent with the text prompt. You should deduct 1 point from the score if the spatial layout of the objects in the image is not consistent with the text prompt.
                 """,
-            )
+            }
             return [(question, image)]
         elif self.prompt_property == PromptCategory.numeracy:
             self._total_score = 5
-            question = JudgeQuestion(
-                image_desciption_system_prompt="""
+            question = {
+                "image_desciption_system_prompt": """
 You are a helpful assistant meant to describe images is detail.
 You should pay special attention to the objects and their quantities in the image.
                 """,
-                judgement_question_system_prompt="""
+                "judgement_question_system_prompt": """
 You are a helpful assistant meant to identify objects and their quantities in the image.
 You have to extract the question, the score, and the explanation from the user's response.
                 """,
-                judgement_question=f"""
+                "judgement_question": f"""
 Looking at the image and given a detailed description of the image, evaluate how well the image aligns with the text prompt: \"{prompt}\"
 Give a score from 1 to 5, according to the following criteria:
 
 After scoring the image, please also evaluate if the score should be deducted based on the following criteria:
 1. The objects in the image should not be deformed. If the objects are deformed, you should deduct 1 point from the score.
 2. The attributes of the objects should be consistent with the text prompt. You should deduct 1 point from the score if the attributes of the objects in the image are not consistent with the text prompt.
 3. The spatial layout of the objects in the image should be consistent with the text prompt. You should deduct 1 point from the score if the spatial layout of the objects in the image is not consistent with the text prompt.
                 """,
-            )
+            }
             return [(question, image)]
         elif self.prompt_property == PromptCategory.complex:
             self._total_score = 5
-            question = JudgeQuestion(
-                image_desciption_system_prompt="""
+            question = {
+                "image_desciption_system_prompt": """
 You are a helpful assistant meant to describe images is detail.
 You should pay special attention to the objects in the image and their attributes (such as color, shape, texture), spatial layout and action relationships.
                 """,
-                judgement_question_system_prompt="""
+                "judgement_question_system_prompt": """
 You are a helpful assistant meant to evaluate the correspondence of the image to a given text prompt.
 Focus on the objects in the image and their attributes (such as color, shape, texture), spatial layout and action relationships.
 You have to extract the question, the score, and the explanation from the user's response.
                 """,
-                judgement_question=f"""
+                "judgement_question": f"""
 Looking at the image and given a detailed description of the image, evaluate how well the image aligns with the text prompt: \"{prompt}\"
 Give a score from 1 to 5, according to the following criteria:
 
 After scoring the image, please also evaluate if the score should be deducted based on the following criteria:
 1. The objects in the image should not be deformed. If the objects are deformed, you should deduct 1 point from the score.
 2. The attributes of the objects should be consistent with the text prompt. You should deduct 1 point from the score if the attributes of the objects in the image are not consistent with the text prompt.
 3. The spatial layout of the objects in the image should be consistent with the text prompt. You should deduct 1 point from the score if the spatial layout of the objects in the image is not consistent with the text prompt.
                 """,
-            )
+            }
             return [(question, image)]
         tagged_prompt_parts = self.extract_prompt_parts(prompt)
         questions: List[str] = []
         for tagged_prompt_part in tagged_prompt_parts:
-            question = JudgeQuestion(
-                image_desciption_system_prompt=f"""
+            question = {
+                "image_desciption_system_prompt": f"""
 You are a helpful assistant meant to describe images is detail.
 You should pay special attention to any objects and their {self.prompt_property.name} in the given image.
                 """,
-                judgement_question_system_prompt=f"""
+                "judgement_question_system_prompt": f"""
 You are a helpful assistant meant to identify any objects and their {self.prompt_property.name} in the given image.
 You have to extract the question, the score, and the explanation from the user's response.
                 """,
-                judgement_question=f"""
+                "judgement_question": f"""
 Looking at the image and given a detailed description of the image, evaluate if there is a {tagged_prompt_part.entity} in the image.
 Give a score from 1 to 4, according to the following criteria:
 
 After scoring the image, please also evaluate if the score should be deducted based on the following criteria:
 1. The objects in the image should not be deformed. If the objects are deformed, you should deduct 1 point from the score.
 2. The attributes of the objects should be consistent with the text prompt. You should deduct 1 point from the score if the attributes of the objects in the image are not consistent with the text prompt.
 3. The spatial layout of the objects in the image should be consistent with the text prompt. You should deduct 1 point from the score if the spatial layout of the objects in the image is not consistent with the text prompt.
""", - ) + } questions.append((question, image)) return questions @weave.op def execute_chain_of_thought( - self, question: JudgeQuestion, image: Image.Image + self, question: Dict[str, str], image: Image.Image ) -> OpenAIJudgeMent: image_description_explanation = ( self._openai_client.chat.completions.create( @@ -285,7 +284,7 @@ def execute_chain_of_thought( messages=[ { "role": "system", - "content": question.image_desciption_system_prompt, + "content": question["image_desciption_system_prompt"], }, { "role": "user", @@ -301,7 +300,9 @@ def execute_chain_of_thought( .choices[0] .message.content ) - question.judgement_question += f""" + question[ + "judgement_question" + ] += f""" Here is a detailed explanation of the image: --- @@ -318,12 +319,12 @@ def execute_chain_of_thought( messages=[ { "role": "system", - "content": question.judgement_question_system_prompt, + "content": question["judgement_question_system_prompt"], }, { "role": "user", "content": [ - {"type": "text", "text": question.judgement_question}, + {"type": "text", "text": question["judgement_question"]}, { "type": "image_url", "image_url": {"url": base64_encode_image(image)}, diff --git a/hemm/tests/test_2d_spatial_relationship_eval.py b/hemm/tests/test_2d_spatial_relationship_eval.py index 86a2009..4cd3f2d 100644 --- a/hemm/tests/test_2d_spatial_relationship_eval.py +++ b/hemm/tests/test_2d_spatial_relationship_eval.py @@ -6,7 +6,9 @@ from hemm.eval_pipelines import BaseDiffusionModel, EvaluationPipeline from hemm.metrics.spatial_relationship import SpatialRelationshipMetric2D from hemm.metrics.spatial_relationship.judges import ( - DETRSpatialRelationShipJudge, RTDETRSpatialRelationShipJudge) + DETRSpatialRelationShipJudge, + RTDETRSpatialRelationShipJudge, +) class Test2DSpatialRelationshipEval(unittest.TestCase): diff --git a/hemm/tests/test_prompt_alignment_eval.py b/hemm/tests/test_prompt_alignment_eval.py index 148884c..a876455 100644 --- a/hemm/tests/test_prompt_alignment_eval.py +++ b/hemm/tests/test_prompt_alignment_eval.py @@ -4,8 +4,7 @@ import wandb from hemm.eval_pipelines import BaseDiffusionModel, EvaluationPipeline -from hemm.metrics.prompt_alignment import (CLIPImageQualityScoreMetric, - CLIPScoreMetric) +from hemm.metrics.prompt_alignment import CLIPImageQualityScoreMetric, CLIPScoreMetric class TestPromptAlignmentEvaluation(unittest.TestCase):