
Commit

fix llm as judge warnings (#173)
* commit

* fixes

* fix style

* fixes

* make style

* Fix import error detection for open ai package (llm as a judge metric)

---------

Co-authored-by: Nathan Habib <[email protected]>
Co-authored-by: Clémentine Fourrier <[email protected]>
3 people authored Jul 4, 2024
1 parent 7fcaab3 commit 3a80833
Showing 7 changed files with 48 additions and 21 deletions.
File renamed without changes.
14 changes: 11 additions & 3 deletions src/lighteval/metrics/llm_as_judge.py
@@ -27,9 +27,8 @@
import time
from typing import Optional

from openai import OpenAI

from lighteval.logging.hierarchical_logger import hlog_warn
from lighteval.utils import NO_OPENAI_ERROR_MSG, is_openai_available


class JudgeOpenAI:
@@ -70,7 +69,8 @@ def __init__(
openai_api_key: str,
multi_turn: bool = False,
):
self.client = OpenAI(api_key=openai_api_key)
self.client = None # loaded lazily
self.openai_api_key = openai_api_key
self.model = model
self.seed = seed
self.temperature = temperature
@@ -112,6 +112,14 @@ def evaluate_answer(
Raises:
Exception: If an error occurs during the API call.
"""
if self.client is None:
if not is_openai_available():
raise ImportError(NO_OPENAI_ERROR_MSG)

from openai import OpenAI

self.client = OpenAI(api_key=self.openai_api_key)

prompts = [
self.__get_prompts_single_turn(
questions[0], answers[0], references[0] if references is not None and len(references) > 0 else None
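For readers unfamiliar with the pattern above: the client is now created on first use instead of at construction, so importing or instantiating the judge no longer requires `openai` to be installed. A minimal, self-contained sketch of that lazy-initialization idea (MiniJudge, _ensure_client and the message text are illustrative stand-ins, not lighteval's actual API):

import importlib.util
from typing import Optional


class MiniJudge:
    """Illustrative stand-in for a judge whose OpenAI client is created lazily."""

    def __init__(self, model: str, api_key: Optional[str]):
        self.model = model
        self.api_key = api_key
        self.client = None  # nothing imported or validated yet

    def _ensure_client(self):
        if self.client is None:
            # Check availability first so the failure is an actionable ImportError.
            if importlib.util.find_spec("openai") is None:
                raise ImportError("`openai` is not installed; install it with pip to use LLM-as-judge metrics.")
            from openai import OpenAI  # deferred import, only runs on first evaluation

            self.client = OpenAI(api_key=self.api_key)
        return self.client

    def evaluate(self, prompt: str) -> str:
        client = self._ensure_client()
        response = client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content

Constructing MiniJudge never touches `openai`; the ImportError surfaces only on the first call to evaluate, which is what lets the calling code drop its defensive try/except (see metrics_sample.py below).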
10 changes: 6 additions & 4 deletions src/lighteval/metrics/metrics.py
@@ -20,6 +20,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os

import numpy as np
from aenum import Enum

@@ -225,29 +227,29 @@ class Metrics(Enum):
corpus_level_fn=np.mean,
higher_is_better=True,
)
llm_judge_multi_turn = SampleLevelMetricGrouping(
llm_judge_multi_turn_openai = SampleLevelMetricGrouping(
metric=["single_turn", "multi_turn"],
higher_is_better=True,
category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
use_case=MetricUseCase.SUMMARIZATION,
sample_level_fn=JudgeLLM(
judge_model_name="gpt-3.5-turbo",
template_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl",
template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
multi_turn=True,
).compute,
corpus_level_fn={
"single_turn": np.mean,
"multi_turn": np.mean,
},
)
llm_judge = SampleLevelMetricGrouping(
llm_judge_openai = SampleLevelMetricGrouping(
metric=["judge_score"],
higher_is_better=True,
category=MetricCategory.LLM_AS_JUDGE,
use_case=MetricUseCase.SUMMARIZATION,
sample_level_fn=JudgeLLM(
judge_model_name="gpt-3.5-turbo",
template_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl",
template_path=os.path.join(os.path.dirname(__file__), "", "judge_prompts.jsonl"),
multi_turn=False,
).compute,
corpus_level_fn={
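The template_path change swaps a path relative to the repository checkout for one anchored at the metrics module itself, so judge_prompts.jsonl is still found when lighteval is installed as a package rather than run from source. A rough sketch of the difference, assuming the file now ships next to the module (load_templates is a hypothetical helper, not lighteval code):

import json
import os

# Fragile: resolved against the current working directory, so it only works
# when running from the repository root.
repo_relative = "src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl"

# Robust: resolved against the module that ships the resource.
module_relative = os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl")


def load_templates(path: str) -> list[dict]:
    """Hypothetical loader for a .jsonl file: one JSON object per non-empty line."""
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]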
20 changes: 8 additions & 12 deletions src/lighteval/metrics/metrics_sample.py
@@ -631,18 +631,14 @@ def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool =
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
self.multi_turn = multi_turn

try:
self.judge = JudgeOpenAI(
model=judge_model_name,
seed=42,
temperature=0.0,
templates_path=template_path,
openai_api_key=OPENAI_API_KEY,
multi_turn=multi_turn,
)
except Exception as e:
print(f"Could not initialize the JudgeOpenAI model:\n{e}")
self.judge = None
self.judge = JudgeOpenAI(
model=judge_model_name,
seed=42,
temperature=0.0,
templates_path=template_path,
openai_api_key=OPENAI_API_KEY,
multi_turn=multi_turn,
)

def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]:
"""
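Dropping the try/except follows from the lazy client above: constructing JudgeOpenAI can no longer fail on a missing `openai` package, so swallowing exceptions here only hid real configuration errors. A small self-contained illustration of that construction path (LazyClientHolder is a hypothetical stand-in):

import os
from typing import Optional


class LazyClientHolder:
    """Hypothetical stand-in mirroring the new JudgeOpenAI construction path."""

    def __init__(self, openai_api_key: Optional[str]):
        self.openai_api_key = openai_api_key
        self.client = None  # no import, no network call, no validation yet


judge = LazyClientHolder(openai_api_key=os.getenv("OPENAI_API_KEY"))
assert judge.client is None  # construction always succeeds; errors surface on first use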
2 changes: 1 addition & 1 deletion src/lighteval/tasks/extended/mt_bench/main.py
@@ -45,7 +45,7 @@
evaluation_splits=["train"],
few_shots_split="",
few_shots_select="random",
metric=["llm_judge_multi_turn"],
metric=["llm_judge_multi_turn_openai"],
generation_size=1024,
stop_sequence=[],
)
16 changes: 15 additions & 1 deletion src/lighteval/tasks/lighteval_task.py
@@ -21,6 +21,7 @@
# SOFTWARE.

import collections
import os
import random
from dataclasses import dataclass
from multiprocessing import Pool
@@ -53,7 +54,7 @@
RequestType,
TaskExampleId,
)
from lighteval.utils import as_list
from lighteval.utils import NO_OPENAI_ERROR_MSG, as_list, is_openai_available

from . import tasks_prompt_formatting

@@ -200,8 +201,21 @@ def __init__( # noqa: C901
self.metrics = as_list(cfg.metric)
self.suite = as_list(cfg.suite)
ignored = [metric for metric in self.metrics if Metrics[metric].value.category == MetricCategory.IGNORED]

if len(ignored) > 0:
hlog_warn(f"[WARNING] Not implemented yet: ignoring the metric {' ,'.join(ignored)} for task {self.name}.")

if any(
Metrics[metric].value.category in [MetricCategory.LLM_AS_JUDGE, MetricCategory.LLM_AS_JUDGE_MULTI_TURN]
for metric in self.metrics
):
if not is_openai_available():
raise ImportError(NO_OPENAI_ERROR_MSG)
if os.getenv("OPENAI_API_KEY") is None:
raise ValueError(
"Using llm as judge metric but no OPEN_API_KEY were found, please set it with: export OPEN_API_KEY={yourkey}"
)

current_categories = [Metrics[metric].value.category for metric in self.metrics]
self.has_metric_category = {category: (category in current_categories) for category in MetricCategory}
# Sub-optimal system - we might want to store metric parametrisation in a yaml conf for example
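The new guard in LightevalTask.__init__ validates both prerequisites when the task is built rather than partway through an evaluation. A condensed sketch of the same check with the metric-category plumbing stripped out (the function name is hypothetical; the first message is the one added to utils.py below):

import importlib.util
import os

NO_OPENAI_ERROR_MSG = (
    "You are trying to use an Open AI LLM as a judge, for which you need `openai`, "
    "which is not available in your environment. Please install it using pip."
)


def check_llm_judge_requirements(uses_llm_judge_metric: bool) -> None:
    """Fail fast at task-construction time if an LLM-as-judge metric cannot run."""
    if not uses_llm_judge_metric:
        return
    if importlib.util.find_spec("openai") is None:
        raise ImportError(NO_OPENAI_ERROR_MSG)
    if os.getenv("OPENAI_API_KEY") is None:
        raise ValueError(
            "Using an LLM-as-judge metric but no OPENAI_API_KEY was found; "
            "please set it with: export OPENAI_API_KEY={yourkey}"
        )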
7 changes: 7 additions & 0 deletions src/lighteval/utils.py
@@ -191,6 +191,13 @@ def is_peft_available() -> bool:
NO_PEFT_ERROR_MSG = "You are trying to use adapter weights models, for which you need `peft`, which is not available in your environment. Please install it using pip."


def is_openai_available() -> bool:
return importlib.util.find_spec("openai") is not None


NO_OPENAI_ERROR_MSG = "You are trying to use an Open AI LLM as a judge, for which you need `openai`, which is not available in your environment. Please install it using pip."


def can_load_extended_tasks() -> bool:
imports = []
for package in ["langdetect"]:
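importlib.util.find_spec only asks the import machinery whether the package can be located, without importing it, which keeps this availability check cheap. A standalone sketch of the helper-plus-message pattern used throughout utils.py, extended with a hypothetical require_openai guard:

import importlib.util


def is_openai_available() -> bool:
    # find_spec returns None when the package cannot be located; the package
    # itself is never imported by this check.
    return importlib.util.find_spec("openai") is not None


NO_OPENAI_ERROR_MSG = (
    "You are trying to use an Open AI LLM as a judge, for which you need `openai`, "
    "which is not available in your environment. Please install it using pip."
)


def require_openai() -> None:
    """Hypothetical convenience guard built on the helper above."""
    if not is_openai_available():
        raise ImportError(NO_OPENAI_ERROR_MSG)


if __name__ == "__main__":
    print("openai available:", is_openai_available())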
