From 6c6a03a3750b1a895ede85492a27856c06f932d0 Mon Sep 17 00:00:00 2001
From: jjallaire-aisi
Date: Thu, 10 Oct 2024 05:25:14 -0400
Subject: [PATCH] Add `fail_on_error` option for `eval_retry()` and `inspect eval-retry` (#682)

Co-authored-by: aisi-inspect <166920645+aisi-inspect@users.noreply.github.com>
---
 CHANGELOG.md                 |  3 ++-
 src/inspect_ai/_cli/eval.py  | 31 +++++++++++++++++++++++++++++--
 src/inspect_ai/_eval/eval.py | 17 ++++++++++++++++-
 tests/test_fail_on_error.py  | 19 ++++++++++++++++++-
 4 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cdcd30216..45cc37cdd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,7 +12,8 @@
 - Allow resolution of any sandbox name when there is only a single environment.
 - Introduce `--log-level-transcript` option for separate control of log entries recorded in the eval log file
 - Improve mime type detection for image content encoding (fixes issues w/ webp images).
-- Fix memory leak in Inspect View worker-based JSON parsing
+- Fix memory leak in Inspect View worker-based JSON parsing.
+- Add `fail_on_error` option for `eval_retry()` and `inspect eval-retry`.
 
 ## v0.3.40 (6 October 2024)
 
diff --git a/src/inspect_ai/_cli/eval.py b/src/inspect_ai/_cli/eval.py
index ef88ccfe0..dffccad1b 100644
--- a/src/inspect_ai/_cli/eval.py
+++ b/src/inspect_ai/_cli/eval.py
@@ -29,7 +29,9 @@
     "Maximum number of subprocesses to run in parallel (default is os.cpu_count())"
 )
 NO_SANDBOX_CLEANUP_HELP = "Do not cleanup sandbox environments after task completes"
+FAIL_ON_ERROR_HELP = "Threshold of sample errors to tolerate (by default, evals fail when any error occurs). Value between 0 and 1 to set a proportion; value greater than 1 to set a count."
 NO_LOG_SAMPLES_HELP = "Do not include samples in the log file."
+NO_FAIL_ON_ERROR_HELP = "Do not fail the eval if errors occur within samples (instead, continue running other samples)"
 LOG_IMAGES_HELP = (
     "Include base64 encoded versions of filename or URL based images in the log file."
 )
@@ -175,7 +177,7 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         type=float,
         is_flag=False,
         flag_value=0.0,
-        help="Threshold of sample errors to tolerage (by default, evals fail when any error occurs). Value between 0 to 1 to set a proportion; value greater than 1 to set a count.",
+        help=FAIL_ON_ERROR_HELP,
         envvar="INSPECT_EVAL_FAIL_ON_ERROR",
     )
     @click.option(
@@ -183,7 +185,7 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         type=bool,
         is_flag=True,
         default=False,
-        help="Do not fail the eval if errors occur within samples (instead, continue running other samples)",
+        help=NO_FAIL_ON_ERROR_HELP,
         envvar="INSPECT_EVAL_NO_FAIL_ON_ERROR",
     )
     @click.option(
@@ -755,6 +757,22 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
     help="Trace message interactions with evaluated model to terminal.",
     envvar="INSPECT_EVAL_TRACE",
 )
+@click.option(
+    "--fail-on-error",
+    type=float,
+    is_flag=False,
+    flag_value=0.0,
+    help=FAIL_ON_ERROR_HELP,
+    envvar="INSPECT_EVAL_FAIL_ON_ERROR",
+)
+@click.option(
+    "--no-fail-on-error",
+    type=bool,
+    is_flag=True,
+    default=False,
+    help=NO_FAIL_ON_ERROR_HELP,
+    envvar="INSPECT_EVAL_NO_FAIL_ON_ERROR",
+)
 @click.option(
     "--no-log-samples",
     type=bool,
@@ -798,6 +816,8 @@ def eval_retry_command(
     max_subprocesses: int | None,
     no_sandbox_cleanup: bool | None,
     trace: bool | None,
+    fail_on_error: bool | float | None,
+    no_fail_on_error: bool | None,
     no_log_samples: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
@@ -817,6 +837,12 @@ def eval_retry_command(
     log_images = True if log_images else None
     score = False if no_score else True
 
+    # resolve fail_on_error
+    if no_fail_on_error is True:
+        fail_on_error = False
+    elif fail_on_error == 0.0:
+        fail_on_error = True
+
     # resolve log file
     retry_log_files = [
         log_file_info(filesystem(log_file).info(log_file)) for log_file in log_files
@@ -833,6 +859,7 @@ def eval_retry_command(
         max_subprocesses=max_subprocesses,
         sandbox_cleanup=sandbox_cleanup,
         trace=trace,
+        fail_on_error=fail_on_error,
         debug_errors=kwargs["debug_errors"],
         log_samples=log_samples,
         log_images=log_images,
diff --git a/src/inspect_ai/_eval/eval.py b/src/inspect_ai/_eval/eval.py
index 0591d4b17..242bc4a12 100644
--- a/src/inspect_ai/_eval/eval.py
+++ b/src/inspect_ai/_eval/eval.py
@@ -402,6 +402,7 @@ def eval_retry(
     max_subprocesses: int | None = None,
     sandbox_cleanup: bool | None = None,
     trace: bool | None = None,
+    fail_on_error: bool | float | None = None,
     debug_errors: bool | None = None,
     log_samples: bool | None = None,
     log_images: bool | None = None,
@@ -430,6 +431,10 @@ def eval_retry(
         sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
           (defaults to True)
         trace (bool | None): Trace message interactions with evaluated model to terminal.
+        fail_on_error (bool | float | None): `True` to fail on first sample error
+          (default); `False` to never fail on sample errors; Value between 0 and 1
+          to fail if a proportion of total samples fails. Value greater than 1 to fail
+          eval if a count of samples fails.
         debug_errors (bool | None): Raise task errors (rather than logging them)
           so they can be debugged (defaults to False).
log_samples: (bool | None): Log detailed samples and scores (defaults to True) @@ -461,6 +466,7 @@ def eval_retry( max_subprocesses=max_subprocesses, sandbox_cleanup=sandbox_cleanup, trace=trace, + fail_on_error=fail_on_error, debug_errors=debug_errors, log_samples=log_samples, log_images=log_images, @@ -483,6 +489,7 @@ async def eval_retry_async( max_subprocesses: int | None = None, sandbox_cleanup: bool | None = None, trace: bool | None = None, + fail_on_error: bool | float | None = None, debug_errors: bool | None = None, log_samples: bool | None = None, log_images: bool | None = None, @@ -511,6 +518,10 @@ async def eval_retry_async( sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes (defaults to True) trace (bool | None): Trace message interactions with evaluated model to terminal. + fail_on_error (bool | float | None): `True` to fail on first sample error + (default); `False` to never fail on sample errors; Value between 0 and 1 + to fail if a proportion of total samples fails. Value greater than 1 to fail + eval if a count of samples fails. debug_errors (bool | None): Raise task errors (rather than logging them) so they can be debugged (defaults to False). log_samples: (bool | None): Log detailed samples and scores (defaults to True) @@ -588,7 +599,6 @@ async def eval_retry_async( ) trace = eval_log.eval.config.trace or trace approval = eval_log.eval.config.approval - fail_on_error = eval_log.eval.config.fail_on_error message_limit = eval_log.eval.config.message_limit token_limit = eval_log.eval.config.token_limit max_samples = max_samples or eval_log.eval.config.max_samples @@ -599,6 +609,11 @@ async def eval_retry_async( if sandbox_cleanup is not None else eval_log.eval.config.sandbox_cleanup ) + fail_on_error = ( + fail_on_error + if fail_on_error is not None + else eval_log.eval.config.fail_on_error + ) log_samples = ( log_samples if log_samples is not None else eval_log.eval.config.log_samples ) diff --git a/tests/test_fail_on_error.py b/tests/test_fail_on_error.py index b5d8add4a..b2245a8d3 100644 --- a/tests/test_fail_on_error.py +++ b/tests/test_fail_on_error.py @@ -8,7 +8,7 @@ @solver -def failing_solver(fail: Callable[[TaskState], bool]): +def failing_solver(fail: Callable[[TaskState], bool] = lambda state: True): async def solve(state: TaskState, generate: Generate): if fail(state): raise ValueError("Eval failed!") @@ -112,3 +112,20 @@ def test_fail_on_error_retry(): ): log = eval_retry(log)[0] assert log.eval.task_id == task_id + + +@task +def always_fails(): + return Task( + solver=[failing_solver(), generate()], + ) + + +def test_fail_on_error_retry_override(): + # fail the first time + log = eval(always_fails(), model="mockllm/model")[0] + assert log.status == "error" + + # try again with fail_on_error = False + log = eval_retry(log, fail_on_error=False)[0] + assert log.status == "success"
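
Usage note: for reference, a minimal sketch of the new option in use, modeled on `test_fail_on_error_retry_override` above. The `always_raises` solver and `flaky_task` task are illustrative stand-ins (mirroring the test's `failing_solver` and `always_fails`), not part of this patch:

    from inspect_ai import Task, eval, eval_retry, task
    from inspect_ai.solver import Generate, TaskState, generate, solver

    # illustrative solver that raises on every sample, mirroring failing_solver() above
    @solver
    def always_raises():
        async def solve(state: TaskState, generate: Generate):
            raise ValueError("Eval failed!")

        return solve

    @task
    def flaky_task():
        return Task(solver=[always_raises(), generate()])

    # with the default fail_on_error=True, the first sample error fails the eval
    log = eval(flaky_task(), model="mockllm/model")[0]
    assert log.status == "error"

    # retrying with fail_on_error=False records sample errors in the log and
    # lets the eval complete; a float such as fail_on_error=0.1 would instead
    # fail the retried eval once that proportion of samples has errored
    log = eval_retry(log, fail_on_error=False)[0]
    assert log.status == "success"

The corresponding CLI flags added by this patch are `--fail-on-error` and `--no-fail-on-error` on `inspect eval-retry`.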