From 9ceb6ecbb29f33d01481286050514bf1aaa2266b Mon Sep 17 00:00:00 2001
From: Soumik Rakshit <19soumik.rakshit96@gmail.com>
Date: Wed, 30 Oct 2024 16:16:37 +0000
Subject: [PATCH] update: data structure in OpenAIJudge.frame_question

---
 hemm/metrics/__init__.py                      |  7 +-
 hemm/metrics/image_quality/lpips.py           |  3 +-
 .../vqa/judges/mmllm_judges/openai_judge.py   | 71 ++++++++++---------
 .../test_2d_spatial_relationship_eval.py      |  4 +-
 hemm/tests/test_prompt_alignment_eval.py      |  3 +-
 5 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/hemm/metrics/__init__.py b/hemm/metrics/__init__.py
index 380ab11..3b3e6a4 100644
--- a/hemm/metrics/__init__.py
+++ b/hemm/metrics/__init__.py
@@ -1,4 +1,7 @@
-from .prompt_alignment import (BLIPScoreMertric, CLIPImageQualityScoreMetric,
-                               CLIPScoreMetric)
+from .prompt_alignment import (
+    BLIPScoreMertric,
+    CLIPImageQualityScoreMetric,
+    CLIPScoreMetric,
+)
 
 __all__ = ["BLIPScoreMertric", "CLIPImageQualityScoreMetric", "CLIPScoreMetric"]
diff --git a/hemm/metrics/image_quality/lpips.py b/hemm/metrics/image_quality/lpips.py
index 45e5b2b..fe67f95 100644
--- a/hemm/metrics/image_quality/lpips.py
+++ b/hemm/metrics/image_quality/lpips.py
@@ -5,8 +5,7 @@
 import torch
 import weave
 from PIL import Image
-from torchmetrics.functional.image import \
-    learned_perceptual_image_patch_similarity
+from torchmetrics.functional.image import learned_perceptual_image_patch_similarity
 
 from ...utils import base64_encode_image
 from .base import BaseImageQualityMetric, ComputeMetricOutput
diff --git a/hemm/metrics/vqa/judges/mmllm_judges/openai_judge.py b/hemm/metrics/vqa/judges/mmllm_judges/openai_judge.py
index 2e8ec35..e8d7512 100644
--- a/hemm/metrics/vqa/judges/mmllm_judges/openai_judge.py
+++ b/hemm/metrics/vqa/judges/mmllm_judges/openai_judge.py
@@ -1,6 +1,6 @@
 import os
 import subprocess
-from typing import List
+from typing import Dict, List
 
 import spacy
 import weave
@@ -9,8 +9,7 @@
 from pydantic import BaseModel
 
 from .....utils import base64_encode_image
-from .commons import (JudgeMent, JudgeQuestion, PromptCategory,
-                      TaggedPromptParts)
+from .commons import JudgeMent, JudgeQuestion, PromptCategory, TaggedPromptParts
 
 
 class OpenAIJudgeMent(BaseModel):
@@ -92,7 +91,7 @@ def extract_prompt_parts(self, prompt: str) -> List[TaggedPromptParts]:
         return tagged_prompt_parts
 
     @weave.op()
-    def frame_question(self, prompt: str, image: Image.Image) -> List[JudgeQuestion]:
+    def frame_question(self, prompt: str, image: Image.Image) -> List[Dict[str, str]]:
         """Frame the question corresponding to the given prompt and image
         for the chain-of-thought system of judgement.
 
@@ -101,21 +100,21 @@ def frame_question(self, prompt: str, image: Image.Image) -> List[JudgeQuestion]
             image (Image.Image): The image to frame the question for.
 
         Returns:
-            List[JudgeQuestion]: List of questions to ask for the given prompt.
+            List[Dict[str, str]]: List of questions to ask for the given prompt.
         """
         prompt = str(prompt)
         if self.prompt_property in [PromptCategory.spatial, PromptCategory.spatial_3d]:
             self._total_score = 5
-            question = JudgeQuestion(
-                image_desciption_system_prompt="""
+            question = {
+                "image_desciption_system_prompt": """
 You are a helpful assistant meant to describe images is detail.
 You should pay special attention to the objects and their spatial layout in the image.
                 """,
-                judgement_question_system_prompt="""
+                "judgement_question_system_prompt": """
 You are a helpful assistant meant to identify objects and their spatial layout in the image.
 You have to extract the question, the score, and the explanation from the user's response.
                 """,
-                judgement_question=f"""
+                "judgement_question": f"""
 Looking at the image and given a detailed description of the image, evaluate if the text \"{prompt}\" is correctly portrayed in the image.
 Give a score from 1 to 5, according to the following criteria:
 
 After scoring the image, please also evaluate if the score should be deducted based on the following criteria:
 1. The objects in the image should not be deformed. If the objects are deformed, you should deduct 1 point from the score.
 2. The attributes of the objects should be consistent with the text prompt. You should deduct 1 point from the score if the attributes of the objects in the image are not consistent with the text prompt.
 3. The spatial layout of the objects in the image should be consistent with the text prompt. You should deduct 1 point from the score if the spatial layout of the objects in the image is not consistent with the text prompt.
                 """,
-            )
+            }
             return [(question, image)]
         elif self.prompt_property == PromptCategory.action:
             self._total_score = 5
-            question = JudgeQuestion(
-                image_desciption_system_prompt="""
+            question = {
+                "image_desciption_system_prompt": """
 You are a helpful assistant meant to describe images is detail.
 You should pay special attention to the the actions, events, objects and their relationships in the image.
                 """,
-                judgement_question_system_prompt="""
+                "judgement_question_system_prompt": """
 You are a helpful assistant meant to identify the actions, events, objects and their relationships in the image.
 You have to extract the question, the score, and the explanation from the user's response.
                 """,
-                judgement_question=f"""
+                "judgement_question": f"""
 Looking at the image and given a detailed description of the image, evaluate if the text \"{prompt}\" is correctly portrayed in the image.
 Give a score from 1 to 5, according to the following criteria:
 
 After scoring the image, please also evaluate if the score should be deducted based on the following criteria:
 1. The objects in the image should not be deformed. If the objects are deformed, you should deduct 1 point from the score.
 2. The attributes of the objects should be consistent with the text prompt. You should deduct 1 point from the score if the attributes of the objects in the image are not consistent with the text prompt.
 3. The spatial layout of the objects in the image should be consistent with the text prompt. You should deduct 1 point from the score if the spatial layout of the objects in the image is not consistent with the text prompt.
                 """,
-            )
+            }
             return [(question, image)]
         elif self.prompt_property == PromptCategory.numeracy:
             self._total_score = 5
-            question = JudgeQuestion(
-                image_desciption_system_prompt="""
+            question = {
+                "image_desciption_system_prompt": """
 You are a helpful assistant meant to describe images is detail.
 You should pay special attention to the objects and their quantities in the image.
                 """,
-                judgement_question_system_prompt="""
+                "judgement_question_system_prompt": """
 You are a helpful assistant meant to identify objects and their quantities in the image.
 You have to extract the question, the score, and the explanation from the user's response.
                 """,
-                judgement_question=f"""
+                "judgement_question": f"""
 Looking at the image and given a detailed description of the image, evaluate how well the image aligns with the text prompt: \"{prompt}\"
 Give a score from 1 to 5, according to the following criteria:
 
 After scoring the image, please also evaluate if the score should be deducted based on the following criteria:
 1. The objects in the image should not be deformed. If the objects are deformed, you should deduct 1 point from the score.
 2. The attributes of the objects should be consistent with the text prompt. You should deduct 1 point from the score if the attributes of the objects in the image are not consistent with the text prompt.
 3. The spatial layout of the objects in the image should be consistent with the text prompt. You should deduct 1 point from the score if the spatial layout of the objects in the image is not consistent with the text prompt.
                 """,
-            )
+            }
             return [(question, image)]
         elif self.prompt_property == PromptCategory.complex:
             self._total_score = 5
-            question = JudgeQuestion(
-                image_desciption_system_prompt="""
+            question = {
+                "image_desciption_system_prompt": """
 You are a helpful assistant meant to describe images is detail.
 You should pay special attention to the objects in the image and their attributes (such as color, shape, texture), spatial layout and action relationships.
                 """,
-                judgement_question_system_prompt="""
+                "judgement_question_system_prompt": """
 You are a helpful assistant meant to evaluate the correspondence of the image to a given text prompt.
 Focus on the objects in the image and their attributes (such as color, shape, texture), spatial layout and action relationships.
 You have to extract the question, the score, and the explanation from the user's response.
                 """,
-                judgement_question=f"""
+                "judgement_question": f"""
 Looking at the image and given a detailed description of the image, evaluate how well the image aligns with the text prompt: \"{prompt}\"
 Give a score from 1 to 5, according to the following criteria:
 
 After scoring the image, please also evaluate if the score should be deducted based on the following criteria:
 1. The objects in the image should not be deformed. If the objects are deformed, you should deduct 1 point from the score.
 2. The attributes of the objects should be consistent with the text prompt. You should deduct 1 point from the score if the attributes of the objects in the image are not consistent with the text prompt.
 3. The spatial layout of the objects in the image should be consistent with the text prompt. You should deduct 1 point from the score if the spatial layout of the objects in the image is not consistent with the text prompt.
                 """,
-            )
+            }
             return [(question, image)]
         tagged_prompt_parts = self.extract_prompt_parts(prompt)
         questions: List[str] = []
         for tagged_prompt_part in tagged_prompt_parts:
-            question = JudgeQuestion(
-                image_desciption_system_prompt=f"""
+            question = {
+                "image_desciption_system_prompt": f"""
 You are a helpful assistant meant to describe images is detail.
 You should pay special attention to any objects and their {self.prompt_property.name} in the given image.
                 """,
-                judgement_question_system_prompt=f"""
+                "judgement_question_system_prompt": f"""
 You are a helpful assistant meant to identify any objects and their {self.prompt_property.name} in the given image.
 You have to extract the question, the score, and the explanation from the user's response.
                 """,
-                judgement_question=f"""
+                "judgement_question": f"""
 Looking at the image and given a detailed description of the image, evaluate if there is a {tagged_prompt_part.entity} in the image.
 Give a score from 1 to 4, according to the following criteria:
 
 After scoring the image, please also evaluate if the score should be deducted based on the following criteria:
 1. The objects in the image should not be deformed. If the objects are deformed, you should deduct 1 point from the score.
 2. The attributes of the objects should be consistent with the text prompt. You should deduct 1 point from the score if the attributes of the objects in the image are not consistent with the text prompt.
 3. The spatial layout of the objects in the image should be consistent with the text prompt. You should deduct 1 point from the score if the spatial layout of the objects in the image is not consistent with the text prompt.
""", - ) + } questions.append((question, image)) return questions @weave.op def execute_chain_of_thought( - self, question: JudgeQuestion, image: Image.Image + self, question: Dict[str, str], image: Image.Image ) -> OpenAIJudgeMent: image_description_explanation = ( self._openai_client.chat.completions.create( @@ -285,7 +284,7 @@ def execute_chain_of_thought( messages=[ { "role": "system", - "content": question.image_desciption_system_prompt, + "content": question["image_desciption_system_prompt"], }, { "role": "user", @@ -301,7 +300,9 @@ def execute_chain_of_thought( .choices[0] .message.content ) - question.judgement_question += f""" + question[ + "judgement_question" + ] += f""" Here is a detailed explanation of the image: --- @@ -318,12 +319,12 @@ def execute_chain_of_thought( messages=[ { "role": "system", - "content": question.judgement_question_system_prompt, + "content": question["judgement_question_system_prompt"], }, { "role": "user", "content": [ - {"type": "text", "text": question.judgement_question}, + {"type": "text", "text": question["judgement_question"]}, { "type": "image_url", "image_url": {"url": base64_encode_image(image)}, diff --git a/hemm/tests/test_2d_spatial_relationship_eval.py b/hemm/tests/test_2d_spatial_relationship_eval.py index 86a2009..4cd3f2d 100644 --- a/hemm/tests/test_2d_spatial_relationship_eval.py +++ b/hemm/tests/test_2d_spatial_relationship_eval.py @@ -6,7 +6,9 @@ from hemm.eval_pipelines import BaseDiffusionModel, EvaluationPipeline from hemm.metrics.spatial_relationship import SpatialRelationshipMetric2D from hemm.metrics.spatial_relationship.judges import ( - DETRSpatialRelationShipJudge, RTDETRSpatialRelationShipJudge) + DETRSpatialRelationShipJudge, + RTDETRSpatialRelationShipJudge, +) class Test2DSpatialRelationshipEval(unittest.TestCase): diff --git a/hemm/tests/test_prompt_alignment_eval.py b/hemm/tests/test_prompt_alignment_eval.py index 148884c..a876455 100644 --- a/hemm/tests/test_prompt_alignment_eval.py +++ b/hemm/tests/test_prompt_alignment_eval.py @@ -4,8 +4,7 @@ import wandb from hemm.eval_pipelines import BaseDiffusionModel, EvaluationPipeline -from hemm.metrics.prompt_alignment import (CLIPImageQualityScoreMetric, - CLIPScoreMetric) +from hemm.metrics.prompt_alignment import CLIPImageQualityScoreMetric, CLIPScoreMetric class TestPromptAlignmentEvaluation(unittest.TestCase):