Improve F1 scorer to accept an answer extraction function
dragonstyle committed Sep 4, 2024
1 parent 51676ab commit 09c4351
Showing 1 changed file with 11 additions and 4 deletions: src/inspect_ai/scorer/_classification.py
--- a/src/inspect_ai/scorer/_classification.py
+++ b/src/inspect_ai/scorer/_classification.py
@@ -1,6 +1,6 @@
 import re
 import string
-from typing import List
+from typing import Callable, List
 
 from inspect_ai.solver._task_state import TaskState
 
@@ -11,19 +11,26 @@
 
 
 @scorer(metrics=[mean(), stderr()])
-def f1() -> Scorer:
+def f1(
+    answer_fn: Callable[[str], str] | None = None,
+) -> Scorer:
     """Scorer which produces an F1 score
 
     Computes the `F1` score for the answer (which balances recall and precision by taking their harmonic mean).
     """
 
     async def score(state: TaskState, target: Target) -> Score:
         # Get generated answer and extract relevant answer text
        answer = state.output.completion
+        answer = (
+            answer_fn(state.output.completion) if answer_fn else state.output.completion
+        )
         targets = target.target
 
         f1_score = max_f1_score(answer, targets)
-        return Score(value=f1_score, answer=answer)
+        return Score(
+            value=f1_score,
+            answer=answer,
+        )
 
     return score
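The new `answer_fn` parameter lets callers strip surrounding text (for example chain-of-thought or an "ANSWER:" prefix) from the model completion before F1 is computed. A minimal usage sketch, assuming `f1` is exported from `inspect_ai.scorer`; the `extract_answer` helper and its "ANSWER:" convention are illustrative, not part of this commit:

```python
from inspect_ai.scorer import f1


def extract_answer(completion: str) -> str:
    # Hypothetical helper: keep only the text after an "ANSWER:" marker,
    # falling back to the full completion when no marker is present.
    _, _, answer = completion.partition("ANSWER:")
    return answer.strip() if answer else completion.strip()


# Score completions on F1 after extracting the final answer text.
scorer = f1(answer_fn=extract_answer)
```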

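For reference, the docstring's "harmonic mean of recall and precision" typically refers to token-overlap F1 in QA-style scoring. The sketch below illustrates that general computation; it is an assumption about what `max_f1_score` does per target, not code from this commit:

```python
from collections import Counter


def token_f1(answer: str, target: str) -> float:
    # Token-overlap F1: precision and recall over shared tokens,
    # combined via their harmonic mean.
    answer_tokens = answer.lower().split()
    target_tokens = target.lower().split()
    if not answer_tokens or not target_tokens:
        return float(answer_tokens == target_tokens)
    common = sum((Counter(answer_tokens) & Counter(target_tokens)).values())
    if common == 0:
        return 0.0
    precision = common / len(answer_tokens)
    recall = common / len(target_tokens)
    return 2 * precision * recall / (precision + recall)


# max_f1_score(answer, targets) would then plausibly take the best score
# across targets: max(token_f1(answer, t) for t in targets)
```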
