Commit 3a80833

NathanHB (Nathan Habib) and clefourrier authored
fix llm as judge warnings (#173)
* commit
* fixes
* fix style
* fixes
* make style
* Fix import error detection for open ai package (llm as a judge metric)

---------

Co-authored-by: Nathan Habib <[email protected]>
Co-authored-by: Clémentine Fourrier <[email protected]>
1 parent 7fcaab3 commit 3a80833

File tree: 7 files changed, +48 −21 lines


src/lighteval/metrics/llm_as_judge.py

Lines changed: 11 additions & 3 deletions
@@ -27,9 +27,8 @@
 import time
 from typing import Optional
 
-from openai import OpenAI
-
 from lighteval.logging.hierarchical_logger import hlog_warn
+from lighteval.utils import NO_OPENAI_ERROR_MSG, is_openai_available
 
 
 class JudgeOpenAI:
@@ -70,7 +69,8 @@ def __init__(
         openai_api_key: str,
         multi_turn: bool = False,
     ):
-        self.client = OpenAI(api_key=openai_api_key)
+        self.client = None  # loaded lazily
+        self.openai_api_key = openai_api_key
         self.model = model
         self.seed = seed
         self.temperature = temperature
@@ -112,6 +112,14 @@ def evaluate_answer(
         Raises:
             Exception: If an error occurs during the API call.
         """
+        if self.client is None:
+            if not is_openai_available():
+                raise ImportError(NO_OPENAI_ERROR_MSG)
+
+            from openai import OpenAI
+
+            self.client = OpenAI(api_key=self.openai_api_key)
+
         prompts = [
             self.__get_prompts_single_turn(
                 questions[0], answers[0], references[0] if references is not None and len(references) > 0 else None
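
The heart of the fix is in this file: the module no longer imports `openai` at the top level, and the client is only constructed the first time `evaluate_answer` runs. Below is a minimal standalone sketch of that lazy-initialization pattern, assuming the `is_openai_available` / `NO_OPENAI_ERROR_MSG` helpers this commit adds to `lighteval.utils`; the `LazyJudgeClient` class and `get_client` method are illustrative names, not the actual `JudgeOpenAI` API.

from lighteval.utils import NO_OPENAI_ERROR_MSG, is_openai_available


class LazyJudgeClient:
    """Illustrative sketch only: defer the openai import and client creation until first use."""

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.client = None  # nothing imported or constructed yet

    def get_client(self):
        if self.client is None:
            if not is_openai_available():
                # Fail with a clear message instead of an opaque ModuleNotFoundError.
                raise ImportError(NO_OPENAI_ERROR_MSG)
            from openai import OpenAI  # imported only when a judge call actually happens

            self.client = OpenAI(api_key=self.api_key)
        return self.client

With this pattern, importing the judge module no longer requires `openai` to be installed; the dependency is only needed once a judge metric actually runs, which is what the commit's "fix import error detection" refers to.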

src/lighteval/metrics/metrics.py

Lines changed: 6 additions & 4 deletions
@@ -20,6 +20,8 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import os
+
 import numpy as np
 from aenum import Enum
 
@@ -225,29 +227,29 @@ class Metrics(Enum):
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
-    llm_judge_multi_turn = SampleLevelMetricGrouping(
+    llm_judge_multi_turn_openai = SampleLevelMetricGrouping(
         metric=["single_turn", "multi_turn"],
         higher_is_better=True,
         category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
         use_case=MetricUseCase.SUMMARIZATION,
         sample_level_fn=JudgeLLM(
             judge_model_name="gpt-3.5-turbo",
-            template_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl",
+            template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
             multi_turn=True,
         ).compute,
         corpus_level_fn={
             "single_turn": np.mean,
             "multi_turn": np.mean,
         },
     )
-    llm_judge = SampleLevelMetricGrouping(
+    llm_judge_openai = SampleLevelMetricGrouping(
         metric=["judge_score"],
         higher_is_better=True,
         category=MetricCategory.LLM_AS_JUDGE,
         use_case=MetricUseCase.SUMMARIZATION,
         sample_level_fn=JudgeLLM(
             judge_model_name="gpt-3.5-turbo",
-            template_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl",
+            template_path=os.path.join(os.path.dirname(__file__), "", "judge_prompts.jsonl"),
            multi_turn=False,
         ).compute,
         corpus_level_fn={
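
Both judge metrics are renamed with an `_openai` suffix, and `template_path` is now built relative to the metrics module rather than hard-coded from the repository root, presumably so the prompt file resolves regardless of the working directory lighteval is launched from. A short illustration of the difference; the paths below are examples, not a description of the library layout.

import os

# Only works if the process is started from the repository root:
template_path = "src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl"

# Resolves relative to the file defining the metric, independent of the current working directory:
template_path = os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl")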

src/lighteval/metrics/metrics_sample.py

Lines changed: 8 additions & 12 deletions
@@ -631,18 +631,14 @@ def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool =
         OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
         self.multi_turn = multi_turn
 
-        try:
-            self.judge = JudgeOpenAI(
-                model=judge_model_name,
-                seed=42,
-                temperature=0.0,
-                templates_path=template_path,
-                openai_api_key=OPENAI_API_KEY,
-                multi_turn=multi_turn,
-            )
-        except Exception as e:
-            print(f"Could not initialize the JudgeOpenAI model:\n{e}")
-            self.judge = None
+        self.judge = JudgeOpenAI(
+            model=judge_model_name,
+            seed=42,
+            temperature=0.0,
+            templates_path=template_path,
+            openai_api_key=OPENAI_API_KEY,
+            multi_turn=multi_turn,
+        )
 
     def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]:
         """

src/lighteval/tasks/extended/mt_bench/main.py

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@
    evaluation_splits=["train"],
    few_shots_split="",
    few_shots_select="random",
-    metric=["llm_judge_multi_turn"],
+    metric=["llm_judge_multi_turn_openai"],
    generation_size=1024,
    stop_sequence=[],
)

src/lighteval/tasks/lighteval_task.py

Lines changed: 15 additions & 1 deletion
@@ -21,6 +21,7 @@
 # SOFTWARE.
 
 import collections
+import os
 import random
 from dataclasses import dataclass
 from multiprocessing import Pool
@@ -53,7 +54,7 @@
     RequestType,
     TaskExampleId,
 )
-from lighteval.utils import as_list
+from lighteval.utils import NO_OPENAI_ERROR_MSG, as_list, is_openai_available
 
 from . import tasks_prompt_formatting
 
@@ -200,8 +201,21 @@ def __init__( # noqa: C901
         self.metrics = as_list(cfg.metric)
         self.suite = as_list(cfg.suite)
         ignored = [metric for metric in self.metrics if Metrics[metric].value.category == MetricCategory.IGNORED]
+
         if len(ignored) > 0:
             hlog_warn(f"[WARNING] Not implemented yet: ignoring the metric {' ,'.join(ignored)} for task {self.name}.")
+
+        if any(
+            Metrics[metric].value.category in [MetricCategory.LLM_AS_JUDGE, MetricCategory.LLM_AS_JUDGE_MULTI_TURN]
+            for metric in self.metrics
+        ):
+            if not is_openai_available():
+                raise ImportError(NO_OPENAI_ERROR_MSG)
+            if os.getenv("OPENAI_API_KEY") is None:
+                raise ValueError(
+                    "Using llm as judge metric but no OPEN_API_KEY were found, please set it with: export OPEN_API_KEY={yourkey}"
+                )
+
         current_categories = [Metrics[metric].value.category for metric in self.metrics]
         self.has_metric_category = {category: (category in current_categories) for category in MetricCategory}
         # Sub-optimal system - we might want to store metric parametrisation in a yaml conf for example
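
The effect of this block is that a task declaring an LLM-as-judge metric now fails at construction time, before any generations are produced, rather than partway through scoring. A condensed sketch of the precondition check, using a hypothetical `check_judge_prerequisites` helper and an illustrative error message rather than the actual `LightevalTask.__init__` code:

import os

from lighteval.utils import NO_OPENAI_ERROR_MSG, is_openai_available


def check_judge_prerequisites() -> None:
    """Fail early if the OpenAI judge cannot possibly run."""
    if not is_openai_available():
        # openai is an optional dependency, only needed for judge metrics
        raise ImportError(NO_OPENAI_ERROR_MSG)
    if os.getenv("OPENAI_API_KEY") is None:
        raise ValueError("Set OPENAI_API_KEY before using an LLM-as-judge metric.")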

src/lighteval/utils.py

Lines changed: 7 additions & 0 deletions
@@ -191,6 +191,13 @@ def is_peft_available() -> bool:
 NO_PEFT_ERROR_MSG = "You are trying to use adapter weights models, for which you need `peft`, which is not available in your environment. Please install it using pip."
 
 
+def is_openai_available() -> bool:
+    return importlib.util.find_spec("openai") is not None
+
+
+NO_OPENAI_ERROR_MSG = "You are trying to use an Open AI LLM as a judge, for which you need `openai`, which is not available in your environment. Please install it using pip."
+
+
 def can_load_extended_tasks() -> bool:
     imports = []
     for package in ["langdetect"]:
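
`importlib.util.find_spec` reports whether a package is installed without importing it, so the probe itself cannot trigger import-time errors or warnings. A quick self-contained illustration of the pattern these helpers use; `is_package_available` is a generalized, hypothetical name, not a lighteval function.

import importlib.util


def is_package_available(name: str) -> bool:
    # True if the top-level package could be imported, without actually importing it.
    return importlib.util.find_spec(name) is not None


if __name__ == "__main__":
    print("openai available:", is_package_available("openai"))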
