Add --no-score-display option (#1027)
* Add `--no-display-metrics` option

This option will stop the computation and display of metrics during execution of evaluation tasks.

* tweaks to no-score-display

* Update CHANGELOG.md

* remove unused struct member

* update changelog

---------

Co-authored-by: J.J. Allaire <[email protected]>
dragonstyle and jjallaire authored Dec 22, 2024
1 parent d036962 commit ccd6a9d
Showing 8 changed files with 80 additions and 7 deletions.
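
For orientation, a minimal usage sketch of the option this commit adds: it can be passed on the CLI (`inspect eval ... --no-score-display`, or via the `INSPECT_EVAL_SCORE_DISPLAY` environment variable) or as the new `score_display` argument to `eval()`. The task file and model below are illustrative placeholders, not part of this commit.

```python
# Sketch only: disable realtime scoring metrics from the Python API.
# "security_guide.py" and the model name are placeholders.
from inspect_ai import eval

logs = eval(
    "security_guide.py",    # placeholder task file
    model="openai/gpt-4o",  # placeholder model
    score_display=False,    # Python equivalent of --no-score-display
)
```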
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -2,6 +2,7 @@

## Unreleased

- Task display: Added `--no-score-display` option to disable realtime scoring metrics.
- Bugfix: Fix failure to fully clone samples that have message lists as input.

## v0.3.53 (20 December 2024)
25 changes: 25 additions & 0 deletions src/inspect_ai/_cli/eval.py
@@ -42,6 +42,7 @@
NO_SCORE_HELP = (
"Do not score model output (use the inspect score command to score output later)"
)
NO_SCORE_DISPLAY = "Do not display scoring metrics in realtime."
MAX_CONNECTIONS_HELP = f"Maximum number of concurrent connections to Model API (defaults to {DEFAULT_MAX_CONNECTIONS})"
MAX_RETRIES_HELP = (
f"Maximum number of times to retry request (defaults to {DEFAULT_MAX_RETRIES})"
@@ -257,6 +258,13 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
help=NO_SCORE_HELP,
envvar="INSPECT_EVAL_NO_SCORE",
)
@click.option(
"--no-score-display",
type=bool,
is_flag=True,
help=NO_SCORE_DISPLAY,
envvar="INSPECT_EVAL_SCORE_DISPLAY",
)
@click.option(
"--max-tokens",
type=int,
@@ -446,6 +454,7 @@ def eval_command(
log_images: bool | None,
log_buffer: int | None,
no_score: bool | None,
no_score_display: bool | None,
log_format: Literal["eval", "json"] | None,
**common: Unpack[CommonOptions],
) -> None:
@@ -495,6 +504,7 @@
log_images=log_images,
log_buffer=log_buffer,
no_score=no_score,
no_score_display=no_score_display,
is_eval_set=False,
**config,
)
@@ -603,6 +613,7 @@ def eval_set_command(
log_images: bool | None,
log_buffer: int | None,
no_score: bool | None,
no_score_display: bool | None,
bundle_dir: str | None,
bundle_overwrite: bool | None,
log_format: Literal["eval", "json"] | None,
@@ -654,6 +665,7 @@ def eval_set_command(
log_images=log_images,
log_buffer=log_buffer,
no_score=no_score,
no_score_display=no_score_display,
is_eval_set=True,
retry_attempts=retry_attempts,
retry_wait=retry_wait,
@@ -706,6 +718,7 @@ def eval_exec(
log_images: bool | None,
log_buffer: int | None,
no_score: bool | None,
no_score_display: bool | None,
is_eval_set: bool = False,
retry_attempts: int | None = None,
retry_wait: int | None = None,
@@ -746,6 +759,7 @@ def eval_exec(
log_images = False if log_images is False else None
trace = True if trace else None
score = False if no_score else True
score_display = False if no_score_display else None

# build params
params: dict[str, Any] = (
@@ -781,6 +795,7 @@
log_images=log_images,
log_buffer=log_buffer,
score=score,
score_display=score_display,
)
| kwargs
)
@@ -915,6 +930,13 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
help=NO_SCORE_HELP,
envvar="INSPECT_EVAL_SCORE",
)
@click.option(
"--no-score-display",
type=bool,
is_flag=True,
help=NO_SCORE_DISPLAY,
envvar="INSPECT_EVAL_SCORE_DISPLAY",
)
@click.option(
"--max-connections",
type=int,
@@ -940,6 +962,7 @@ def eval_retry_command(
log_images: bool | None,
log_buffer: int | None,
no_score: bool | None,
no_score_display: bool | None,
max_connections: int | None,
max_retries: int | None,
timeout: int | None,
@@ -954,6 +977,7 @@ def eval_retry_command(
log_samples = False if no_log_samples else None
log_images = False if log_images is False else None
score = False if no_score else True
score_display = False if no_score_display else None

# resolve fail_on_error
if no_fail_on_error is True:
@@ -984,6 +1008,7 @@
log_images=log_images,
log_buffer=log_buffer,
score=score,
score_display=score_display,
max_retries=max_retries,
timeout=timeout,
max_connections=max_connections,
6 changes: 5 additions & 1 deletion src/inspect_ai/_display/textual/app.py
@@ -197,7 +197,11 @@ def task_display(self, profile: TaskProfile) -> Iterator[TaskDisplay]:

# add task
try:
yield self.query_one(TasksView).add_task(task)
task_view = self.query_one(TasksView)
task_view.set_display_metrics(
profile.eval_config.score_display is not False
)
yield task_view.add_task(task)
finally:
pass

16 changes: 13 additions & 3 deletions src/inspect_ai/_display/textual/widgets/tasks.py
@@ -72,6 +72,7 @@ def __init__(self) -> None:
self.description_width = MAX_DESCRIPTION_WIDTH
self.model_name_width = MAX_MODEL_NAME_WIDTH
self.sample_count_width = 0
self.display_metrics = True

def init_tasks(self, tasks: list[TaskSpec]) -> None:
# clear existing tasks
@@ -89,14 +90,21 @@ def init_tasks(self, tasks: list[TaskSpec]) -> None:
def add_task(self, task: TaskWithResult) -> TaskDisplay:
self.update_count_width(task.profile.samples)
task_display = TaskProgressView(
task, self.description_width, self.model_name_width, self.sample_count_width
task,
self.description_width,
self.model_name_width,
self.sample_count_width,
self.display_metrics,
)
self.tasks.mount(task_display)
self.tasks.scroll_to_widget(task_display)
self.update_progress_widths()

return task_display

def set_display_metrics(self, display_metrics: bool) -> None:
self.display_metrics = display_metrics

def update_count_width(self, samples: int) -> None:
sample_count_str = progress_count(samples, samples, self.sample_count_width)
self.sample_count_width = min(
@@ -174,6 +182,7 @@ def __init__(
description_width: int,
model_name_width: int,
sample_count_width: int,
display_metrics: bool,
) -> None:
super().__init__()
self.t = task
@@ -190,6 +199,7 @@ def __init__(
self.task_detail = TaskDetail(id="task-detail", classes="hidden")

self.sample_count_width: int = sample_count_width
self.display_metrics = display_metrics

metrics: reactive[list[TaskDisplayMetric] | None] = reactive(None)
metrics_width: reactive[int | None] = reactive(None)
@@ -198,7 +208,7 @@ def __init__(
samples_total: reactive[int] = reactive(0)

def compose(self) -> ComposeResult:
yield self.toggle
yield (self.toggle if self.display_metrics else Static())
yield TaskStatusIcon()
yield Static(
progress_description(self.t.profile, self.description_width, pad=True)
@@ -274,7 +284,7 @@ def refresh_count(self) -> None:

def update_metrics_label(self) -> None:
# compute the label (with a min size)
if self.metrics is not None:
if self.metrics is not None and self.metrics_display is not None:
metric_label = task_metric(self.metrics, self.metrics_width)
self.metrics_width = len(metric_label)
self.metrics_display.update(metric_label)
17 changes: 17 additions & 0 deletions src/inspect_ai/_eval/eval.py
@@ -76,6 +76,7 @@ def eval(
log_images: bool | None = None,
log_buffer: int | None = None,
score: bool = True,
score_display: bool | None = None,
**kwargs: Unpack[GenerateConfigArgs],
) -> list[EvalLog]:
r"""Evaluate tasks using a Model.
@@ -139,6 +140,7 @@
If not specified, an appropriate default for the format and filesystem is
chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
score (bool): Score output (defaults to True)
score_display (bool | None): Show scoring metrics in realtime (defaults to True)
**kwargs (GenerateConfigArgs): Model generation options.
Returns:
@@ -183,6 +185,7 @@
log_images=log_images,
log_buffer=log_buffer,
score=score,
score_display=score_display,
**kwargs,
)
)
@@ -220,6 +223,7 @@ async def eval_async(
log_images: bool | None = None,
log_buffer: int | None = None,
score: bool = True,
score_display: bool | None = None,
**kwargs: Unpack[GenerateConfigArgs],
) -> list[EvalLog]:
r"""Evaluate tasks using a Model (async).
@@ -282,6 +286,7 @@
If not specified, an appropriate default for the format and filesystem is
chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
score (bool): Score output (defaults to True)
score_display (bool | None): Show scoring metrics in realtime (defaults to True)
**kwargs (GenerateConfigArgs): Model generation options.
Returns:
@@ -380,6 +385,7 @@ async def eval_async(
log_samples=log_samples,
log_images=log_images,
log_buffer=log_buffer,
score_display=score_display,
)

# run tasks - 2 codepaths, one for the traditional task at a time
@@ -467,6 +473,7 @@ def eval_retry(
log_images: bool | None = None,
log_buffer: int | None = None,
score: bool = True,
score_display: bool | None = None,
max_retries: int | None = None,
timeout: int | None = None,
max_connections: int | None = None,
@@ -507,6 +514,7 @@
If not specified, an appropriate default for the format and filesystem is
chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
score (bool): Score output (defaults to True)
score_display (bool | None): Show scoring metrics in realtime (defaults to True)
max_retries (int | None):
Maximum number of times to retry request.
timeout: (int | None):
@@ -541,6 +549,7 @@ def eval_retry(
log_images=log_images,
log_buffer=log_buffer,
score=score,
score_display=score_display,
max_retries=max_retries,
timeout=timeout,
max_connections=max_connections,
@@ -565,6 +574,7 @@ async def eval_retry_async(
log_images: bool | None = None,
log_buffer: int | None = None,
score: bool = True,
score_display: bool | None = None,
max_retries: int | None = None,
timeout: int | None = None,
max_connections: int | None = None,
@@ -603,6 +613,7 @@ async def eval_retry_async(
If not specified, an appropriate default for the format and filesystem is
chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
score (bool): Score output (defaults to True)
score_display (bool | None): Show scoring metrics in realtime (defaults to True)
max_retries (int | None):
Maximum number of times to retry request.
timeout: (int | None):
@@ -699,6 +710,11 @@ async def eval_retry_async(
log_buffer = (
log_buffer if log_buffer is not None else eval_log.eval.config.log_buffer
)
score_display = (
score_display
if score_display is not None
else eval_log.eval.config.score_display
)

config = eval_log.plan.config
config.max_retries = max_retries or config.max_retries
@@ -740,6 +756,7 @@ async def eval_retry_async(
log_images=log_images,
log_buffer=log_buffer,
score=score,
score_display=score_display,
**dict(config),
)
)[0]
18 changes: 15 additions & 3 deletions src/inspect_ai/_eval/task/run.py
@@ -217,7 +217,9 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
log_location=log_location,
)

with display().task(profile) as td:
with display().task(
profile,
) as td:
try:
# start the log
await log_start(logger, plan, generate_config)
@@ -252,7 +254,10 @@ async def generate(

# track when samples complete and update progress as we go
progress_results: list[dict[str, SampleScore]] = []
update_metrics_display = update_metrics_display_fn(td)
update_metrics_display = update_metrics_display_fn(
td,
display_metrics=profile.eval_config.score_display is not False,
)

def sample_complete(sample_score: dict[str, SampleScore]) -> None:
# Capture the result
@@ -400,7 +405,10 @@ def sample_complete(sample_score: dict[str, SampleScore]) -> None:


def update_metrics_display_fn(
td: TaskDisplay, initial_interval: float = 0, min_interval: float = 0.9
td: TaskDisplay,
initial_interval: float = 0,
min_interval: float = 0.9,
display_metrics: bool = True,
) -> Callable[
[
int,
@@ -420,6 +428,10 @@ def compute(
reducers: ScoreReducer | list[ScoreReducer] | None,
metrics: list[Metric] | dict[str, list[Metric]] | None,
) -> None:
# Don't compute metrics if they are not being displayed
if not display_metrics:
return None

nonlocal next_compute_time
time_start = time.perf_counter()
if time_start >= next_compute_time:
1 change: 1 addition & 0 deletions src/inspect_ai/_util/display.py
@@ -14,6 +14,7 @@

def init_display_type(display: str | None = None) -> DisplayType:
global _display_type
global _display_metrics
display = (
display or os.environ.get("INSPECT_DISPLAY", DEFAULT_DISPLAY).lower().strip()
)
3 changes: 3 additions & 0 deletions src/inspect_ai/log/_log.py
@@ -94,6 +94,9 @@ class EvalConfig(BaseModel):
log_buffer: int | None = Field(default=None)
"""Number of samples to buffer before writing log file."""

score_display: bool | None = Field(default=None)
"""Display scoring metrics realtime."""

@property
def max_messages(self) -> int | None:
"""Deprecated max_messages property."""
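
Because `score_display` is now a field on `EvalConfig`, the setting is persisted in the eval log and re-read by `eval_retry_async` above. A hedged sketch of checking it on a stored log; the log path is a placeholder, and `read_eval_log` is the existing reader in `inspect_ai.log`:

```python
# Sketch only: inspect the persisted score_display setting of a prior run.
from inspect_ai.log import read_eval_log

log = read_eval_log("./logs/example-task.eval")  # placeholder log path
# None by default; False when the run used --no-score-display
print(log.eval.config.score_display)
```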
