Add fail_on_error option for eval_retry() and inspect eval-retry (#682)

Co-authored-by: aisi-inspect <[email protected]>
jjallaire-aisi and aisi-inspect authored Oct 10, 2024
1 parent cfc4c22 commit 6c6a03a
Showing 4 changed files with 65 additions and 5 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -12,7 +12,8 @@
- Allow resolution of any sandbox name when there is only a single environment.
- Introduce `--log-level-transcript` option for separate control of log entries recorded in the eval log file
- Improve mime type detection for image content encoding (fixes issues w/ webp images).
- Fix memory leak in Inspect View worker-based JSON parsing
- Fix memory leak in Inspect View worker-based JSON parsing.
- Add `fail_on_error` option for `eval_retry()` and `inspect eval-retry`.

## v0.3.40 (6 October 2024)

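For readers of the changelog entry, a minimal usage sketch (not part of the diff) showing the new option from Python. It mirrors the pattern of the new test at the bottom of this commit; the always-erroring solver, `demo_task`, and the `mockllm/model` model are placeholders for illustration:

from inspect_ai import Task, eval, eval_retry, task
from inspect_ai.solver import Generate, TaskState, generate, solver

# Placeholder solver that raises on every sample (illustration only).
@solver
def always_error():
    async def solve(state: TaskState, generate: Generate):
        raise ValueError("Eval failed!")

    return solve

@task
def demo_task():
    return Task(solver=[always_error(), generate()])

# The first run fails because a sample errored...
log = eval(demo_task(), model="mockllm/model")[0]
assert log.status == "error"

# ...but the retry can now tolerate sample errors via fail_on_error=False.
retried = eval_retry(log, fail_on_error=False)[0]
assert retried.status == "success"
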
31 changes: 29 additions & 2 deletions src/inspect_ai/_cli/eval.py
@@ -29,7 +29,9 @@
"Maximum number of subprocesses to run in parallel (default is os.cpu_count())"
)
NO_SANDBOX_CLEANUP_HELP = "Do not cleanup sandbox environments after task completes"
FAIL_ON_ERROR_HELP = "Threshold of sample errors to tolerate (by default, evals fail when any error occurs). Value between 0 and 1 to set a proportion; value greater than 1 to set a count."
NO_LOG_SAMPLES_HELP = "Do not include samples in the log file."
NO_FAIL_ON_ERROR_HELP = "Do not fail the eval if errors occur within samples (instead, continue running other samples)"
LOG_IMAGES_HELP = (
"Include base64 encoded versions of filename or URL based images in the log file."
)
@@ -175,15 +177,15 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
type=float,
is_flag=False,
flag_value=0.0,
help="Threshold of sample errors to tolerage (by default, evals fail when any error occurs). Value between 0 to 1 to set a proportion; value greater than 1 to set a count.",
help=FAIL_ON_ERROR_HELP,
envvar="INSPECT_EVAL_FAIL_ON_ERROR",
)
@click.option(
"--no-fail-on-error",
type=bool,
is_flag=True,
default=False,
help="Do not fail the eval if errors occur within samples (instead, continue running other samples)",
help=NO_FAIL_ON_ERROR_HELP,
envvar="INSPECT_EVAL_NO_FAIL_ON_ERROR",
)
@click.option(
@@ -755,6 +757,22 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
help="Trace message interactions with evaluated model to terminal.",
envvar="INSPECT_EVAL_TRACE",
)
@click.option(
"--fail-on-error",
type=float,
is_flag=False,
flag_value=0.0,
help=FAIL_ON_ERROR_HELP,
envvar="INSPECT_EVAL_FAIL_ON_ERROR",
)
@click.option(
"--no-fail-on-error",
type=bool,
is_flag=True,
default=False,
help=NO_FAIL_ON_ERROR_HELP,
envvar="INSPECT_EVAL_NO_FAIL_ON_ERROR",
)
@click.option(
"--no-log-samples",
type=bool,
Expand Down Expand Up @@ -798,6 +816,8 @@ def eval_retry_command(
max_subprocesses: int | None,
no_sandbox_cleanup: bool | None,
trace: bool | None,
fail_on_error: bool | float | None,
no_fail_on_error: bool | None,
no_log_samples: bool | None,
log_images: bool | None,
log_buffer: int | None,
@@ -817,6 +837,12 @@ def eval_retry_command(
log_images = True if log_images else None
score = False if no_score else True

# resolve fail_on_error
if no_fail_on_error is True:
fail_on_error = False
elif fail_on_error == 0.0:
fail_on_error = True

# resolve log file
retry_log_files = [
log_file_info(filesystem(log_file).info(log_file)) for log_file in log_files
@@ -833,6 +859,7 @@
max_subprocesses=max_subprocesses,
sandbox_cleanup=sandbox_cleanup,
trace=trace,
fail_on_error=fail_on_error,
debug_errors=kwargs["debug_errors"],
log_samples=log_samples,
log_images=log_images,
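A minimal standalone sketch (not part of the diff) of how the two new `inspect eval-retry` flags map onto the single `fail_on_error` value handed to `eval_retry()`, mirroring the resolution block added to `eval_retry_command()` above; the helper name is hypothetical:

def resolve_fail_on_error(
    fail_on_error: float | None, no_fail_on_error: bool
) -> bool | float | None:
    # --no-fail-on-error: never fail the eval on sample errors
    if no_fail_on_error is True:
        return False
    # bare --fail-on-error (click supplies flag_value=0.0): fail on first error
    elif fail_on_error == 0.0:
        return True
    # --fail-on-error 0.1 or --fail-on-error 5: pass the threshold through
    # (None when neither flag was given, deferring to the retried log's config)
    return fail_on_error

assert resolve_fail_on_error(None, True) is False
assert resolve_fail_on_error(0.0, False) is True
assert resolve_fail_on_error(0.1, False) == 0.1
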
17 changes: 16 additions & 1 deletion src/inspect_ai/_eval/eval.py
@@ -402,6 +402,7 @@ def eval_retry(
max_subprocesses: int | None = None,
sandbox_cleanup: bool | None = None,
trace: bool | None = None,
fail_on_error: bool | float | None = None,
debug_errors: bool | None = None,
log_samples: bool | None = None,
log_images: bool | None = None,
@@ -430,6 +431,10 @@
sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
(defaults to True)
trace (bool | None): Trace message interactions with evaluated model to terminal.
fail_on_error (bool | float | None): `True` to fail on first sample error
(default); `False` to never fail on sample errors; a value between 0 and 1
to fail if that proportion of total samples errors; a value greater than 1
to fail the eval if that many samples error.
debug_errors (bool | None): Raise task errors (rather than logging them)
so they can be debugged (defaults to False).
log_samples: (bool | None): Log detailed samples and scores (defaults to True)
@@ -461,6 +466,7 @@
max_subprocesses=max_subprocesses,
sandbox_cleanup=sandbox_cleanup,
trace=trace,
fail_on_error=fail_on_error,
debug_errors=debug_errors,
log_samples=log_samples,
log_images=log_images,
@@ -483,6 +489,7 @@ async def eval_retry_async(
max_subprocesses: int | None = None,
sandbox_cleanup: bool | None = None,
trace: bool | None = None,
fail_on_error: bool | float | None = None,
debug_errors: bool | None = None,
log_samples: bool | None = None,
log_images: bool | None = None,
@@ -511,6 +518,10 @@
sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
(defaults to True)
trace (bool | None): Trace message interactions with evaluated model to terminal.
fail_on_error (bool | float | None): `True` to fail on first sample error
(default); `False` to never fail on sample errors; a value between 0 and 1
to fail if that proportion of total samples errors; a value greater than 1
to fail the eval if that many samples error.
debug_errors (bool | None): Raise task errors (rather than logging them)
so they can be debugged (defaults to False).
log_samples: (bool | None): Log detailed samples and scores (defaults to True)
@@ -588,7 +599,6 @@ async def eval_retry_async(
)
trace = eval_log.eval.config.trace or trace
approval = eval_log.eval.config.approval
fail_on_error = eval_log.eval.config.fail_on_error
message_limit = eval_log.eval.config.message_limit
token_limit = eval_log.eval.config.token_limit
max_samples = max_samples or eval_log.eval.config.max_samples
@@ -599,6 +609,11 @@
if sandbox_cleanup is not None
else eval_log.eval.config.sandbox_cleanup
)
fail_on_error = (
fail_on_error
if fail_on_error is not None
else eval_log.eval.config.fail_on_error
)
log_samples = (
log_samples if log_samples is not None else eval_log.eval.config.log_samples
)
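The hunks above also change where `fail_on_error` comes from on retry: previously it was always read from the retried log's config, whereas now an explicit argument to `eval_retry()` takes precedence and the stored config is only a fallback. A standalone sketch of that precedence rule (the function name is illustrative, not from the source):

def retry_fail_on_error(
    explicit: bool | float | None, logged: bool | float | None
) -> bool | float | None:
    # an explicit eval_retry(..., fail_on_error=...) argument wins;
    # otherwise reuse whatever the original eval recorded in its config
    return explicit if explicit is not None else logged

# Override a log whose original eval failed on the first error:
assert retry_fail_on_error(False, True) is False
# No explicit value: keep the original eval's setting (here a 0.1 proportion).
assert retry_fail_on_error(None, 0.1) == 0.1
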
19 changes: 18 additions & 1 deletion tests/test_fail_on_error.py
@@ -8,7 +8,7 @@


@solver
def failing_solver(fail: Callable[[TaskState], bool]):
def failing_solver(fail: Callable[[TaskState], bool] = lambda state: True):
async def solve(state: TaskState, generate: Generate):
if fail(state):
raise ValueError("Eval failed!")
@@ -112,3 +112,20 @@ def test_fail_on_error_retry():
):
log = eval_retry(log)[0]
assert log.eval.task_id == task_id


@task
def always_fails():
return Task(
solver=[failing_solver(), generate()],
)


def test_fail_on_error_retry_override():
# fail the first time
log = eval(always_fails(), model="mockllm/model")[0]
assert log.status == "error"

# try again with fail_on_error = False
log = eval_retry(log, fail_on_error=False)[0]
assert log.status == "success"
