
Commit

update: data structure in OpenAIJudge.frame_question
soumik12345 committed Oct 30, 2024
1 parent dc9579e commit 9ceb6ec
Showing 5 changed files with 46 additions and 42 deletions.
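
In short, this commit swaps the pydantic JudgeQuestion model built inside OpenAIJudge.frame_question for a plain dict, so downstream code moves from attribute access to key access. A minimal runnable sketch of the shape change (abbreviated prompt text, not lines from the diff; key spelling as in the code):

prompt = "a cat to the left of a dog"

# Before (old code): pydantic model with attribute access
#     question = JudgeQuestion(judgement_question=f"... {prompt} ...")
#     text = question.judgement_question

# After (this commit): plain dict with key access
question = {"judgement_question": f"Evaluate the image against: {prompt}"}
text = question["judgement_question"]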
hemm/metrics/__init__.py (7 changes: 5 additions & 2 deletions)
@@ -1,4 +1,7 @@
-from .prompt_alignment import (BLIPScoreMertric, CLIPImageQualityScoreMetric,
-                               CLIPScoreMetric)
+from .prompt_alignment import (
+    BLIPScoreMertric,
+    CLIPImageQualityScoreMetric,
+    CLIPScoreMetric,
+)

__all__ = ["BLIPScoreMertric", "CLIPImageQualityScoreMetric", "CLIPScoreMetric"]
hemm/metrics/image_quality/lpips.py (3 changes: 1 addition & 2 deletions)
@@ -5,8 +5,7 @@
import torch
import weave
from PIL import Image
-from torchmetrics.functional.image import \
-    learned_perceptual_image_patch_similarity
+from torchmetrics.functional.image import learned_perceptual_image_patch_similarity

from ...utils import base64_encode_image
from .base import BaseImageQualityMetric, ComputeMetricOutput
hemm/metrics/vqa/judges/mmllm_judges/openai_judge.py (71 changes: 36 additions & 35 deletions)
@@ -1,6 +1,6 @@
import os
import subprocess
-from typing import List
+from typing import Dict, List

import spacy
import weave
@@ -9,8 +9,7 @@
from pydantic import BaseModel

from .....utils import base64_encode_image
-from .commons import (JudgeMent, JudgeQuestion, PromptCategory,
-                      TaggedPromptParts)
+from .commons import JudgeMent, JudgeQuestion, PromptCategory, TaggedPromptParts


class OpenAIJudgeMent(BaseModel):
@@ -92,7 +91,7 @@ def extract_prompt_parts(self, prompt: str) -> List[TaggedPromptParts]:
        return tagged_prompt_parts

    @weave.op()
-    def frame_question(self, prompt: str, image: Image.Image) -> List[JudgeQuestion]:
+    def frame_question(self, prompt: str, image: Image.Image) -> List[Dict[str, str]]:
        """Frame the question corresponding to the given prompt and image for
        the chain-of-thought system of judgement.
@@ -101,21 +100,21 @@ def frame_question(self, prompt: str, image: Image.Image) -> List[JudgeQuestion]
            image (Image.Image): The image to frame the question for.
        Returns:
-            List[JudgeQuestion]: List of questions to ask for the given prompt.
+            List[Dict[str, str]]: List of questions to ask for the given prompt.
        """
        prompt = str(prompt)
        if self.prompt_property in [PromptCategory.spatial, PromptCategory.spatial_3d]:
            self._total_score = 5
-            question = JudgeQuestion(
-                image_desciption_system_prompt="""
+            question = {
+                "image_desciption_system_prompt": """
                You are a helpful assistant meant to describe images in detail.
                You should pay special attention to the objects and their spatial layout in the image.
                """,
-                judgement_question_system_prompt="""
+                "judgement_question_system_prompt": """
                You are a helpful assistant meant to identify objects and their spatial layout in the image.
                You have to extract the question, the score, and the explanation from the user's response.
                """,
-                judgement_question=f"""
+                "judgement_question": f"""
                Looking at the image and given a detailed description of the image, evaluate if the text \"{prompt}\" is correctly portrayed in the image.
                Give a score from 1 to 5, according to the following criteria:
@@ -135,20 +134,20 @@ def frame_question(self, prompt: str, image: Image.Image) -> List[JudgeQuestion]
                3. The spatial layout of the objects in the image should be consistent with the text prompt. You should deduct 1 point from the score if the
                spatial layout of the objects in the image is not consistent with the text prompt.
                """,
-            )
+            }
            return [(question, image)]
        elif self.prompt_property == PromptCategory.action:
            self._total_score = 5
-            question = JudgeQuestion(
-                image_desciption_system_prompt="""
+            question = {
+                "image_desciption_system_prompt": """
                You are a helpful assistant meant to describe images in detail.
                You should pay special attention to the actions, events, objects and their relationships in the image.
                """,
-                judgement_question_system_prompt="""
+                "judgement_question_system_prompt": """
                You are a helpful assistant meant to identify the actions, events, objects and their relationships in the image.
                You have to extract the question, the score, and the explanation from the user's response.
                """,
-                judgement_question=f"""
+                "judgement_question": f"""
                Looking at the image and given a detailed description of the image, evaluate if the text \"{prompt}\" is correctly portrayed in the image.
                Give a score from 1 to 5, according to the following criteria:
@@ -168,20 +167,20 @@ def frame_question(self, prompt: str, image: Image.Image) -> List[JudgeQuestion]
                3. The spatial layout of the objects in the image should be consistent with the text prompt. You should deduct 1 point from the score if the
                spatial layout of the objects in the image is not consistent with the text prompt.
                """,
-            )
+            }
            return [(question, image)]
        elif self.prompt_property == PromptCategory.numeracy:
            self._total_score = 5
-            question = JudgeQuestion(
-                image_desciption_system_prompt="""
+            question = {
+                "image_desciption_system_prompt": """
                You are a helpful assistant meant to describe images in detail.
                You should pay special attention to the objects and their quantities in the image.
                """,
-                judgement_question_system_prompt="""
+                "judgement_question_system_prompt": """
                You are a helpful assistant meant to identify objects and their quantities in the image.
                You have to extract the question, the score, and the explanation from the user's response.
                """,
-                judgement_question=f"""
+                "judgement_question": f"""
                Looking at the image and given a detailed description of the image, evaluate how well the image aligns with the text prompt: \"{prompt}\"
                Give a score from 1 to 5, according to the following criteria:
@@ -201,23 +200,23 @@ def frame_question(self, prompt: str, image: Image.Image) -> List[JudgeQuestion]
                3. The spatial layout of the objects in the image should be consistent with the text prompt. You should deduct 1 point from the score if the
                spatial layout of the objects in the image is not consistent with the text prompt.
                """,
-            )
+            }
            return [(question, image)]
        elif self.prompt_property == PromptCategory.complex:
            self._total_score = 5
-            question = JudgeQuestion(
-                image_desciption_system_prompt="""
+            question = {
+                "image_desciption_system_prompt": """
                You are a helpful assistant meant to describe images in detail.
                You should pay special attention to the objects in the image and their attributes
                (such as color, shape, texture), spatial layout and action relationships.
                """,
-                judgement_question_system_prompt="""
+                "judgement_question_system_prompt": """
                You are a helpful assistant meant to evaluate the correspondence of the image to a given text prompt.
                Focus on the objects in the image and their attributes (such as color, shape, texture),
                spatial layout and action relationships. You have to extract the question, the score, and the
                explanation from the user's response.
                """,
-                judgement_question=f"""
+                "judgement_question": f"""
                Looking at the image and given a detailed description of the image, evaluate how well the image aligns with the text prompt: \"{prompt}\"
                Give a score from 1 to 5, according to the following criteria:
@@ -237,21 +236,21 @@ def frame_question(self, prompt: str, image: Image.Image) -> List[JudgeQuestion]
                3. The spatial layout of the objects in the image should be consistent with the text prompt. You should deduct 1 point from the score if the
                spatial layout of the objects in the image is not consistent with the text prompt.
                """,
-            )
+            }
            return [(question, image)]
        tagged_prompt_parts = self.extract_prompt_parts(prompt)
        questions: List[str] = []
        for tagged_prompt_part in tagged_prompt_parts:
-            question = JudgeQuestion(
-                image_desciption_system_prompt=f"""
+            question = {
+                "image_desciption_system_prompt": f"""
                You are a helpful assistant meant to describe images in detail.
                You should pay special attention to any objects and their {self.prompt_property.name} in the given image.
                """,
-                judgement_question_system_prompt=f"""
+                "judgement_question_system_prompt": f"""
                You are a helpful assistant meant to identify any objects and their {self.prompt_property.name}
                in the given image. You have to extract the question, the score, and the explanation from the user's response.
                """,
-                judgement_question=f"""
+                "judgement_question": f"""
                Looking at the image and given a detailed description of the image, evaluate if there is a {tagged_prompt_part.entity} in the image.
                Give a score from 1 to 4, according to the following criteria:
@@ -270,13 +269,13 @@ def frame_question(self, prompt: str, image: Image.Image) -> List[JudgeQuestion]
                3. The spatial layout of the objects in the image should be consistent with the text prompt. You should deduct 1 point from the score if the
                spatial layout of the objects in the image is not consistent with the text prompt.
                """,
-            )
+            }
            questions.append((question, image))
        return questions

    @weave.op
    def execute_chain_of_thought(
-        self, question: JudgeQuestion, image: Image.Image
+        self, question: Dict[str, str], image: Image.Image
    ) -> OpenAIJudgeMent:
        image_description_explanation = (
            self._openai_client.chat.completions.create(
@@ -285,7 +284,7 @@ def execute_chain_of_thought(
                messages=[
                    {
                        "role": "system",
-                        "content": question.image_desciption_system_prompt,
+                        "content": question["image_desciption_system_prompt"],
                    },
                    {
                        "role": "user",
@@ -301,7 +300,9 @@ def execute_chain_of_thought(
            .choices[0]
            .message.content
        )
-        question.judgement_question += f"""
+        question[
+            "judgement_question"
+        ] += f"""
Here is a detailed explanation of the image:
---
@@ -318,12 +319,12 @@ def execute_chain_of_thought(
                messages=[
                    {
                        "role": "system",
-                        "content": question.judgement_question_system_prompt,
+                        "content": question["judgement_question_system_prompt"],
                    },
                    {
                        "role": "user",
                        "content": [
-                            {"type": "text", "text": question.judgement_question},
+                            {"type": "text", "text": question["judgement_question"]},
                            {
                                "type": "image_url",
                                "image_url": {"url": base64_encode_image(image)},
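
Taken together, the openai_judge.py changes mean frame_question now builds each question as a plain dict whose keys execute_chain_of_thought indexes directly. A runnable sketch of the new data flow (prompt text abbreviated and the OpenAI call omitted; only the three dict keys, including the misspelled "image_desciption_system_prompt", and the (question, image) pairing come from this diff):

from typing import Dict, List, Tuple

from PIL import Image

# Question structure introduced by this commit (keys spelled as in the diff).
question: Dict[str, str] = {
    "image_desciption_system_prompt": "You are a helpful assistant ...",
    "judgement_question_system_prompt": "You are a helpful assistant ...",
    "judgement_question": "Evaluate how well the image aligns with the prompt.",
}

image = Image.new("RGB", (64, 64))

# frame_question returns (question, image) pairs, even though the new
# annotation reads List[Dict[str, str]].
questions: List[Tuple[Dict[str, str], Image.Image]] = [(question, image)]

for q, img in questions:
    # execute_chain_of_thought first appends the model-generated image
    # description to the judgement question, then sends it for judgement.
    q["judgement_question"] += "\nHere is a detailed explanation of the image:\n---\n..."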
hemm/tests/test_2d_spatial_relationship_eval.py (4 changes: 3 additions & 1 deletion)
@@ -6,7 +6,9 @@
from hemm.eval_pipelines import BaseDiffusionModel, EvaluationPipeline
from hemm.metrics.spatial_relationship import SpatialRelationshipMetric2D
from hemm.metrics.spatial_relationship.judges import (
-    DETRSpatialRelationShipJudge, RTDETRSpatialRelationShipJudge)
+    DETRSpatialRelationShipJudge,
+    RTDETRSpatialRelationShipJudge,
+)


class Test2DSpatialRelationshipEval(unittest.TestCase):
hemm/tests/test_prompt_alignment_eval.py (3 changes: 1 addition & 2 deletions)
@@ -4,8 +4,7 @@

import wandb
from hemm.eval_pipelines import BaseDiffusionModel, EvaluationPipeline
-from hemm.metrics.prompt_alignment import (CLIPImageQualityScoreMetric,
-                                           CLIPScoreMetric)
+from hemm.metrics.prompt_alignment import CLIPImageQualityScoreMetric, CLIPScoreMetric


class TestPromptAlignmentEvaluation(unittest.TestCase):
