
Commit

updated inference endpoint system
clefourrier committed Jul 25, 2024
1 parent fd9dc34 commit 4067ee4
Showing 4 changed files with 11 additions and 12 deletions.
src/lighteval/metrics/llm_as_judge.py (1 change: 1 addition & 0 deletions)
@@ -37,6 +37,7 @@ class JudgeEndpoint:
     Args:
         model (str): The name of the model to use.
+        url (str): Endpoint to go to (open ai or inference endpoint)
         seed (int): The seed value for generating random responses.
         temperature (float): The temperature value for controlling the randomness of the responses.
         templates_path (str): The path to the JSON file containing the templates for prompts.
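For context, a minimal sketch of how the documented arguments fit together; the constructor signature is not visible in this hunk, so the keyword names below are taken from the docstring and are an assumption:

    from lighteval.metrics.llm_as_judge import JudgeEndpoint

    # Sketch only: keyword names copied from the docstring above; the real signature may differ.
    judge = JudgeEndpoint(
        model="gpt-3.5-turbo",                 # name of the judge model
        url=None,                              # None -> OpenAI API; otherwise an inference endpoint URL
        seed=42,                               # seed for generating responses
        temperature=0.0,                       # randomness of the judge's responses
        templates_path="judge_prompts.jsonl",  # JSON file with the prompt templates
    )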
src/lighteval/metrics/metrics.py (10 changes: 5 additions & 5 deletions)
@@ -228,13 +228,13 @@ class Metrics(Enum):
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
-    llm_judge_multi_turn_openai = SampleLevelMetricGrouping(
+    llm_judge_multi_turn_gpt3p5 = SampleLevelMetricGrouping(
         metric_name=["single_turn", "multi_turn"],
         higher_is_better=True,
         category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
         use_case=MetricUseCase.SUMMARIZATION,
         sample_level_fn=JudgeLLM(
-            judge_model_name_or_url="gpt-3.5-turbo",
+            judge_model_name="gpt-3.5-turbo",
             template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
             multi_turn=True,
         ).compute,
@@ -243,13 +243,13 @@ class Metrics(Enum):
             "multi_turn": np.mean,
         },
     )
-    llm_judge_multi_turn_local_endpoint = SampleLevelMetricGrouping(
+    llm_judge_multi_turn_llama3_405 = SampleLevelMetricGrouping(
         metric_name=["single_turn", "multi_turn"],
         higher_is_better=True,
         category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
         use_case=MetricUseCase.SUMMARIZATION,
         sample_level_fn=JudgeLLM(
-            judge_model_name_or_url="http://localhost:3000/v1",  # replace with your endpoint url if needed
+            judge_model_name="meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
             template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
             multi_turn=True,
         ).compute,
@@ -264,7 +264,7 @@ class Metrics(Enum):
         category=MetricCategory.LLM_AS_JUDGE,
         use_case=MetricUseCase.SUMMARIZATION,
         sample_level_fn=JudgeLLM(
-            judge_model_name_or_url="gpt-3.5-turbo",
+            judge_model_name="gpt-3.5-turbo",
             template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
             multi_turn=False,
         ).compute,
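The net effect of these three hunks is that a judge is now selected by model name alone: llm_judge_multi_turn_gpt3p5 keeps the GPT-3.5 judge, while llm_judge_multi_turn_llama3_405 replaces the previous self-hosted endpoint with Llama 3.1 405B behind the Hugging Face inference API (see the metrics_sample.py hunk below for the routing). A small sketch of the credentials each backend reads, based on the environment variables used in that hunk:

    import os

    # Sketch only: which token each judge metric is expected to read.
    os.environ.setdefault("OPENAI_API_KEY", "sk-...")  # GPT-3.5 judge metrics
    os.environ.setdefault("HF_TOKEN", "hf_...")        # llm_judge_multi_turn_llama3_405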
src/lighteval/metrics/metrics_sample.py (10 changes: 4 additions & 6 deletions)
@@ -624,19 +624,17 @@ def edit_similarity(self, s1, s2):
 class JudgeLLM:
     available_models_openai = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]
 
-    def __init__(self, judge_model_name_or_url: str, template_path: str, multi_turn: bool = False):
-        if judge_model_name_or_url in self.available_models_openai:
+    def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = False):
+        if judge_model_name in self.available_models_openai:
             API_KEY = os.getenv("OPENAI_API_KEY")
             url = None
-            model = judge_model_name_or_url
         else:
             API_KEY = os.getenv("HF_TOKEN")
-            url = judge_model_name_or_url
-            model = "tgi"
+            url = "https://api-inference.huggingface.co/v1/"
 
         self.multi_turn = multi_turn
         self.judge = JudgeEndpoint(
-            model=model,
+            model=judge_model_name,
             url=url,
             seed=42,
             temperature=0.0,
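A short usage sketch of the new routing, assuming JudgeLLM is imported from lighteval.metrics.metrics_sample and that the caller resolves the template path (metrics.py above builds it with os.path.join):

    from lighteval.metrics.metrics_sample import JudgeLLM

    # Known OpenAI model name -> url=None, authenticated with OPENAI_API_KEY.
    openai_judge = JudgeLLM(
        judge_model_name="gpt-3.5-turbo",
        template_path="judge_prompts.jsonl",  # placeholder path for this sketch
        multi_turn=True,
    )

    # Any other model name -> https://api-inference.huggingface.co/v1/, authenticated with HF_TOKEN.
    hf_judge = JudgeLLM(
        judge_model_name="meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
        template_path="judge_prompts.jsonl",
        multi_turn=True,
    )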
src/lighteval/tasks/extended/mt_bench/main.py (2 changes: 1 addition & 1 deletion)
@@ -55,7 +55,7 @@ def mt_bench_prompt(line, task_name: str = None):
     evaluation_splits=["train"],
     few_shots_split="",
     few_shots_select="random",
-    metric=["llm_judge_multi_turn_openai"],
+    metric=["llm_judge_multi_turn_gpt3p5", "llm_judge_multi_turn_llama3_405"],
     generation_size=1024,
     stop_sequence=[],
 )
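With both metric names listed, an MT-Bench run now scores every sample with the GPT-3.5 judge and the Llama 3.1 405B judge, so the two backends can be compared on the same generations; trimming the list back to a single entry should be enough if only one of the API keys is available.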
