From 6c6a03a3750b1a895ede85492a27856c06f932d0 Mon Sep 17 00:00:00 2001
From: jjallaire-aisi
Date: Thu, 10 Oct 2024 05:25:14 -0400
Subject: [PATCH] Add `fail_on_error` option for `eval_retry()` and `inspect eval-retry` (#682)

Co-authored-by: aisi-inspect <166920645+aisi-inspect@users.noreply.github.com>
---
 CHANGELOG.md                 |  3 ++-
 src/inspect_ai/_cli/eval.py  | 31 +++++++++++++++++++++++++++++--
 src/inspect_ai/_eval/eval.py | 17 ++++++++++++++++-
 tests/test_fail_on_error.py  | 19 ++++++++++++++++++-
 4 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cdcd30216..45cc37cdd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,7 +12,8 @@
 - Allow resolution of any sandbox name when there is only a single environment.
 - Introduce `--log-level-transcript` option for separate control of log entries recorded in the eval log file
 - Improve mime type detection for image content encoding (fixes issues w/ webp images).
-- Fix memory leak in Inspect View worker-based JSON parsing
+- Fix memory leak in Inspect View worker-based JSON parsing.
+- Add `fail_on_error` option for `eval_retry()` and `inspect eval-retry`.
 
 ## v0.3.40 (6 October 2024)
 
diff --git a/src/inspect_ai/_cli/eval.py b/src/inspect_ai/_cli/eval.py
index ef88ccfe0..dffccad1b 100644
--- a/src/inspect_ai/_cli/eval.py
+++ b/src/inspect_ai/_cli/eval.py
@@ -29,7 +29,9 @@
     "Maximum number of subprocesses to run in parallel (default is os.cpu_count())"
 )
 NO_SANDBOX_CLEANUP_HELP = "Do not cleanup sandbox environments after task completes"
+FAIL_ON_ERROR_HELP = "Threshold of sample errors to tolerate (by default, evals fail when any error occurs). Value between 0 and 1 to set a proportion; value greater than 1 to set a count."
 NO_LOG_SAMPLES_HELP = "Do not include samples in the log file."
+NO_FAIL_ON_ERROR_HELP = "Do not fail the eval if errors occur within samples (instead, continue running other samples)"
 LOG_IMAGES_HELP = (
     "Include base64 encoded versions of filename or URL based images in the log file."
 )
@@ -175,7 +177,7 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         type=float,
         is_flag=False,
         flag_value=0.0,
-        help="Threshold of sample errors to tolerage (by default, evals fail when any error occurs). Value between 0 to 1 to set a proportion; value greater than 1 to set a count.",
+        help=FAIL_ON_ERROR_HELP,
         envvar="INSPECT_EVAL_FAIL_ON_ERROR",
     )
     @click.option(
@@ -183,7 +185,7 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         type=bool,
         is_flag=True,
         default=False,
-        help="Do not fail the eval if errors occur within samples (instead, continue running other samples)",
+        help=NO_FAIL_ON_ERROR_HELP,
         envvar="INSPECT_EVAL_NO_FAIL_ON_ERROR",
     )
     @click.option(
@@ -755,6 +757,22 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
     help="Trace message interactions with evaluated model to terminal.",
     envvar="INSPECT_EVAL_TRACE",
 )
+@click.option(
+    "--fail-on-error",
+    type=float,
+    is_flag=False,
+    flag_value=0.0,
+    help=FAIL_ON_ERROR_HELP,
+    envvar="INSPECT_EVAL_FAIL_ON_ERROR",
+)
+@click.option(
+    "--no-fail-on-error",
+    type=bool,
+    is_flag=True,
+    default=False,
+    help=NO_FAIL_ON_ERROR_HELP,
+    envvar="INSPECT_EVAL_NO_FAIL_ON_ERROR",
+)
 @click.option(
     "--no-log-samples",
     type=bool,
@@ -798,6 +816,8 @@ def eval_retry_command(
     max_subprocesses: int | None,
     no_sandbox_cleanup: bool | None,
     trace: bool | None,
+    fail_on_error: bool | float | None,
+    no_fail_on_error: bool | None,
     no_log_samples: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
@@ -817,6 +837,12 @@ def eval_retry_command(
     log_images = True if log_images else None
     score = False if no_score else True
 
+    # resolve fail_on_error
+    if no_fail_on_error is True:
+        fail_on_error = False
+    elif fail_on_error == 0.0:
+        fail_on_error = True
+
     # resolve log file
     retry_log_files = [
         log_file_info(filesystem(log_file).info(log_file)) for log_file in log_files
@@ -833,6 +859,7 @@ def eval_retry_command(
         max_subprocesses=max_subprocesses,
         sandbox_cleanup=sandbox_cleanup,
         trace=trace,
+        fail_on_error=fail_on_error,
         debug_errors=kwargs["debug_errors"],
         log_samples=log_samples,
         log_images=log_images,
diff --git a/src/inspect_ai/_eval/eval.py b/src/inspect_ai/_eval/eval.py
index 0591d4b17..242bc4a12 100644
--- a/src/inspect_ai/_eval/eval.py
+++ b/src/inspect_ai/_eval/eval.py
@@ -402,6 +402,7 @@ def eval_retry(
     max_subprocesses: int | None = None,
     sandbox_cleanup: bool | None = None,
     trace: bool | None = None,
+    fail_on_error: bool | float | None = None,
     debug_errors: bool | None = None,
     log_samples: bool | None = None,
     log_images: bool | None = None,
@@ -430,6 +431,10 @@ def eval_retry(
         sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
           (defaults to True)
         trace (bool | None): Trace message interactions with evaluated model to terminal.
+        fail_on_error (bool | float | None): `True` to fail on first sample error
+          (default); `False` to never fail on sample errors; Value between 0 and 1
+          to fail if a proportion of total samples fails. Value greater than 1 to fail
+          eval if a count of samples fails.
         debug_errors (bool | None): Raise task errors (rather than logging them)
           so they can be debugged (defaults to False).
log_samples: (bool | None): Log detailed samples and scores (defaults to True) @@ -461,6 +466,7 @@ def eval_retry( max_subprocesses=max_subprocesses, sandbox_cleanup=sandbox_cleanup, trace=trace, + fail_on_error=fail_on_error, debug_errors=debug_errors, log_samples=log_samples, log_images=log_images, @@ -483,6 +489,7 @@ async def eval_retry_async( max_subprocesses: int | None = None, sandbox_cleanup: bool | None = None, trace: bool | None = None, + fail_on_error: bool | float | None = None, debug_errors: bool | None = None, log_samples: bool | None = None, log_images: bool | None = None, @@ -511,6 +518,10 @@ async def eval_retry_async( sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes (defaults to True) trace (bool | None): Trace message interactions with evaluated model to terminal. + fail_on_error (bool | float | None): `True` to fail on first sample error + (default); `False` to never fail on sample errors; Value between 0 and 1 + to fail if a proportion of total samples fails. Value greater than 1 to fail + eval if a count of samples fails. debug_errors (bool | None): Raise task errors (rather than logging them) so they can be debugged (defaults to False). log_samples: (bool | None): Log detailed samples and scores (defaults to True) @@ -588,7 +599,6 @@ async def eval_retry_async( ) trace = eval_log.eval.config.trace or trace approval = eval_log.eval.config.approval - fail_on_error = eval_log.eval.config.fail_on_error message_limit = eval_log.eval.config.message_limit token_limit = eval_log.eval.config.token_limit max_samples = max_samples or eval_log.eval.config.max_samples @@ -599,6 +609,11 @@ async def eval_retry_async( if sandbox_cleanup is not None else eval_log.eval.config.sandbox_cleanup ) + fail_on_error = ( + fail_on_error + if fail_on_error is not None + else eval_log.eval.config.fail_on_error + ) log_samples = ( log_samples if log_samples is not None else eval_log.eval.config.log_samples ) diff --git a/tests/test_fail_on_error.py b/tests/test_fail_on_error.py index b5d8add4a..b2245a8d3 100644 --- a/tests/test_fail_on_error.py +++ b/tests/test_fail_on_error.py @@ -8,7 +8,7 @@ @solver -def failing_solver(fail: Callable[[TaskState], bool]): +def failing_solver(fail: Callable[[TaskState], bool] = lambda state: True): async def solve(state: TaskState, generate: Generate): if fail(state): raise ValueError("Eval failed!") @@ -112,3 +112,20 @@ def test_fail_on_error_retry(): ): log = eval_retry(log)[0] assert log.eval.task_id == task_id + + +@task +def always_fails(): + return Task( + solver=[failing_solver(), generate()], + ) + + +def test_fail_on_error_retry_override(): + # fail the first time + log = eval(always_fails(), model="mockllm/model")[0] + assert log.status == "error" + + # try again with fail_on_error = False + log = eval_retry(log, fail_on_error=False)[0] + assert log.status == "success"
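
Usage note: for reference, a minimal sketch of the new option in use, modeled on `test_fail_on_error_retry_override` above. The `always_raises` solver and `flaky_task` task are illustrative stand-ins (mirroring the test's `failing_solver` and `always_fails`), not part of this patch:

    from inspect_ai import Task, eval, eval_retry, task
    from inspect_ai.solver import Generate, TaskState, generate, solver

    # illustrative solver that raises on every sample, mirroring failing_solver() above
    @solver
    def always_raises():
        async def solve(state: TaskState, generate: Generate):
            raise ValueError("Eval failed!")

        return solve

    @task
    def flaky_task():
        return Task(solver=[always_raises(), generate()])

    # with the default fail_on_error=True, the first sample error fails the eval
    log = eval(flaky_task(), model="mockllm/model")[0]
    assert log.status == "error"

    # retrying with fail_on_error=False records sample errors in the log and
    # lets the eval complete; a float such as fail_on_error=0.1 would instead
    # fail the retried eval once that proportion of samples has errored
    log = eval_retry(log, fail_on_error=False)[0]
    assert log.status == "success"

The corresponding CLI flags added by this patch are `--fail-on-error` and `--no-fail-on-error` on `inspect eval-retry`.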