From ccd6a9d483ed58884071a64b466cd6c13868dc4d Mon Sep 17 00:00:00 2001
From: Charles Teague
Date: Sun, 22 Dec 2024 09:09:38 -0500
Subject: [PATCH] =?UTF-8?q?Add=20`=E2=80=94no-score-display`=20option=20(#?=
 =?UTF-8?q?1027)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add `--no-display-metrics` option

This option will stop the computation and display of metrics during
execution of evaluation tasks.

* tweaks to no-score-display

* Update CHANGELOG.md

* remove unused struct member

* update changelog

---------

Co-authored-by: J.J. Allaire
---
 CHANGELOG.md                                |  1 +
 src/inspect_ai/_cli/eval.py                 | 25 +++++++++++++++++++
 src/inspect_ai/_display/textual/app.py      |  6 ++++-
 .../_display/textual/widgets/tasks.py       | 16 +++++++++---
 src/inspect_ai/_eval/eval.py                | 17 +++++++++++++
 src/inspect_ai/_eval/task/run.py            | 18 ++++++++++---
 src/inspect_ai/_util/display.py             |  1 +
 src/inspect_ai/log/_log.py                  |  3 +++
 8 files changed, 80 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f2dfcfab7..f18555ecd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 ## Unreleased
 
+- Task display: Added `--no-score-display` option to disable realtime scoring metrics.
 - Bugfix: Fix failure to fully clone samples that have message lists as input.
 
 ## v0.3.53 (20 December 2024)

diff --git a/src/inspect_ai/_cli/eval.py b/src/inspect_ai/_cli/eval.py
index 7490d17b6..81953c731 100644
--- a/src/inspect_ai/_cli/eval.py
+++ b/src/inspect_ai/_cli/eval.py
@@ -42,6 +42,7 @@ NO_SCORE_HELP = (
     "Do not score model output (use the inspect score command to score output later)"
 )
+NO_SCORE_DISPLAY = "Do not display scoring metrics in realtime."
 MAX_CONNECTIONS_HELP = f"Maximum number of concurrent connections to Model API (defaults to {DEFAULT_MAX_CONNECTIONS})"
 MAX_RETRIES_HELP = (
     f"Maximum number of times to retry request (defaults to {DEFAULT_MAX_RETRIES})"
 )
@@ -257,6 +258,13 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help=NO_SCORE_HELP,
         envvar="INSPECT_EVAL_NO_SCORE",
     )
+    @click.option(
+        "--no-score-display",
+        type=bool,
+        is_flag=True,
+        help=NO_SCORE_DISPLAY,
+        envvar="INSPECT_EVAL_SCORE_DISPLAY",
+    )
     @click.option(
         "--max-tokens",
         type=int,
@@ -446,6 +454,7 @@ def eval_command(
     log_images: bool | None,
     log_buffer: int | None,
     no_score: bool | None,
+    no_score_display: bool | None,
     log_format: Literal["eval", "json"] | None,
     **common: Unpack[CommonOptions],
 ) -> None:
@@ -495,6 +504,7 @@ def eval_command(
         log_images=log_images,
         log_buffer=log_buffer,
         no_score=no_score,
+        no_score_display=no_score_display,
         is_eval_set=False,
         **config,
     )
@@ -603,6 +613,7 @@ def eval_set_command(
     log_images: bool | None,
     log_buffer: int | None,
     no_score: bool | None,
+    no_score_display: bool | None,
     bundle_dir: str | None,
     bundle_overwrite: bool | None,
     log_format: Literal["eval", "json"] | None,
@@ -654,6 +665,7 @@ def eval_set_command(
         log_images=log_images,
         log_buffer=log_buffer,
         no_score=no_score,
+        no_score_display=no_score_display,
         is_eval_set=True,
         retry_attempts=retry_attempts,
         retry_wait=retry_wait,
@@ -706,6 +718,7 @@ def eval_exec(
     log_images: bool | None,
     log_buffer: int | None,
     no_score: bool | None,
+    no_score_display: bool | None,
     is_eval_set: bool = False,
     retry_attempts: int | None = None,
     retry_wait: int | None = None,
@@ -746,6 +759,7 @@ def eval_exec(
     log_images = False if log_images is False else None
     trace = True if trace else None
     score = False if no_score else True
+    score_display = False if no_score_display else None
 
     # build params
     params: dict[str, Any] = (
@@ -781,6 +795,7 @@ def eval_exec(
             log_images=log_images,
             log_buffer=log_buffer,
             score=score,
+            score_display=score_display,
         )
         | kwargs
     )
@@ -915,6 +930,13 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
     help=NO_SCORE_HELP,
     envvar="INSPECT_EVAL_SCORE",
 )
+@click.option(
+    "--no-score-display",
+    type=bool,
+    is_flag=True,
+    help=NO_SCORE_DISPLAY,
+    envvar="INSPECT_EVAL_SCORE_DISPLAY",
+)
 @click.option(
     "--max-connections",
     type=int,
@@ -940,6 +962,7 @@ def eval_retry_command(
     log_images: bool | None,
     log_buffer: int | None,
     no_score: bool | None,
+    no_score_display: bool | None,
     max_connections: int | None,
     max_retries: int | None,
     timeout: int | None,
@@ -954,6 +977,7 @@ def eval_retry_command(
     log_samples = False if no_log_samples else None
     log_images = False if log_images is False else None
     score = False if no_score else True
+    score_display = False if no_score_display else None
 
     # resolve fail_on_error
     if no_fail_on_error is True:
@@ -984,6 +1008,7 @@ def eval_retry_command(
         log_images=log_images,
         log_buffer=log_buffer,
         score=score,
+        score_display=score_display,
         max_retries=max_retries,
         timeout=timeout,
         max_connections=max_connections,
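
The flag is tri-state by the time it reaches the eval config. For illustration only (the helper name below is hypothetical, not part of the patch), the mapping used in `eval_exec` and `eval_retry_command` above behaves like this:

    def resolve_score_display(no_score_display: bool | None) -> bool | None:
        # passing --no-score-display yields False; omitting the flag yields None
        # ("not specified"), which downstream code treats as "display metrics"
        return False if no_score_display else None

    assert resolve_score_display(True) is False   # --no-score-display passed
    assert resolve_score_display(None) is None    # flag omitted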
diff --git a/src/inspect_ai/_display/textual/app.py b/src/inspect_ai/_display/textual/app.py
index 88d8eeb39..5ac516946 100644
--- a/src/inspect_ai/_display/textual/app.py
+++ b/src/inspect_ai/_display/textual/app.py
@@ -197,7 +197,11 @@ def task_display(self, profile: TaskProfile) -> Iterator[TaskDisplay]:
 
         # add task
         try:
-            yield self.query_one(TasksView).add_task(task)
+            task_view = self.query_one(TasksView)
+            task_view.set_display_metrics(
+                profile.eval_config.score_display is not False
+            )
+            yield task_view.add_task(task)
         finally:
             pass

diff --git a/src/inspect_ai/_display/textual/widgets/tasks.py b/src/inspect_ai/_display/textual/widgets/tasks.py
index d65dff484..a25a3c29f 100644
--- a/src/inspect_ai/_display/textual/widgets/tasks.py
+++ b/src/inspect_ai/_display/textual/widgets/tasks.py
@@ -72,6 +72,7 @@ def __init__(self) -> None:
         self.description_width = MAX_DESCRIPTION_WIDTH
         self.model_name_width = MAX_MODEL_NAME_WIDTH
         self.sample_count_width = 0
+        self.display_metrics = True
 
     def init_tasks(self, tasks: list[TaskSpec]) -> None:
         # clear existing tasks
@@ -89,7 +90,11 @@ def init_tasks(self, tasks: list[TaskSpec]) -> None:
     def add_task(self, task: TaskWithResult) -> TaskDisplay:
         self.update_count_width(task.profile.samples)
         task_display = TaskProgressView(
-            task, self.description_width, self.model_name_width, self.sample_count_width
+            task,
+            self.description_width,
+            self.model_name_width,
+            self.sample_count_width,
+            self.display_metrics,
         )
         self.tasks.mount(task_display)
         self.tasks.scroll_to_widget(task_display)
@@ -97,6 +102,9 @@ def add_task(self, task: TaskWithResult) -> TaskDisplay:
 
         return task_display
 
+    def set_display_metrics(self, display_metrics: bool) -> None:
+        self.display_metrics = display_metrics
+
     def update_count_width(self, samples: int) -> None:
         sample_count_str = progress_count(samples, samples, self.sample_count_width)
         self.sample_count_width = min(
@@ -174,6 +182,7 @@ def __init__(
         description_width: int,
         model_name_width: int,
         sample_count_width: int,
+        display_metrics: bool,
     ) -> None:
         super().__init__()
         self.t = task
@@ -190,6 +199,7 @@ def __init__(
         self.task_detail = TaskDetail(id="task-detail", classes="hidden")
 
         self.sample_count_width: int = sample_count_width
+        self.display_metrics = display_metrics
 
     metrics: reactive[list[TaskDisplayMetric] | None] = reactive(None)
     metrics_width: reactive[int | None] = reactive(None)
@@ -198,7 +208,7 @@ def __init__(
     samples_total: reactive[int] = reactive(0)
 
     def compose(self) -> ComposeResult:
-        yield self.toggle
+        yield (self.toggle if self.display_metrics else Static())
         yield TaskStatusIcon()
         yield Static(
             progress_description(self.t.profile, self.description_width, pad=True)
         )
@@ -274,7 +284,7 @@ def refresh_count(self) -> None:
 
     def update_metrics_label(self) -> None:
         # compute the label (with a min size)
-        if self.metrics is not None:
+        if self.metrics is not None and self.metrics_display is not None:
             metric_label = task_metric(self.metrics, self.metrics_width)
             self.metrics_width = len(metric_label)
             self.metrics_display.update(metric_label)
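
A minimal sketch of the conditional-compose pattern used in `TaskProgressView.compose` above (the widget and names here are assumed for illustration, not taken from the patch): yielding an empty `Static()` placeholder keeps the row's cell count stable when metrics are hidden.

    from textual.app import ComposeResult
    from textual.widget import Widget
    from textual.widgets import Static

    class MetricsRow(Widget):
        # illustrative row that hides its metrics toggle when display_metrics is False
        def __init__(self, display_metrics: bool) -> None:
            super().__init__()
            self.display_metrics = display_metrics
            self.toggle = Static("v")  # stand-in for the real toggle widget

        def compose(self) -> ComposeResult:
            # yield a placeholder so the layout keeps the same number of cells
            yield (self.toggle if self.display_metrics else Static())
            yield Static("task description")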
diff --git a/src/inspect_ai/_eval/eval.py b/src/inspect_ai/_eval/eval.py
index b09484ff3..00bfacb6d 100644
--- a/src/inspect_ai/_eval/eval.py
+++ b/src/inspect_ai/_eval/eval.py
@@ -76,6 +76,7 @@ def eval(
     log_images: bool | None = None,
     log_buffer: int | None = None,
     score: bool = True,
+    score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
 ) -> list[EvalLog]:
     r"""Evaluate tasks using a Model.
@@ -139,6 +140,7 @@ def eval(
             If not specified, an appropriate default for the format and filesystem
             is chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
         score (bool): Score output (defaults to True)
+        score_display (bool | None): Show scoring metrics in realtime (defaults to True)
         **kwargs (GenerateConfigArgs): Model generation options.
 
     Returns:
@@ -183,6 +185,7 @@ def eval(
             log_images=log_images,
             log_buffer=log_buffer,
             score=score,
+            score_display=score_display,
             **kwargs,
         )
     )
@@ -220,6 +223,7 @@ async def eval_async(
     log_images: bool | None = None,
     log_buffer: int | None = None,
     score: bool = True,
+    score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
 ) -> list[EvalLog]:
     r"""Evaluate tasks using a Model (async).
@@ -282,6 +286,7 @@ async def eval_async(
             If not specified, an appropriate default for the format and filesystem
             is chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
         score (bool): Score output (defaults to True)
+        score_display (bool | None): Show scoring metrics in realtime (defaults to True)
         **kwargs (GenerateConfigArgs): Model generation options.
 
     Returns:
@@ -380,6 +385,7 @@ async def eval_async(
             log_samples=log_samples,
             log_images=log_images,
             log_buffer=log_buffer,
+            score_display=score_display,
         )
 
         # run tasks - 2 codepaths, one for the traditional task at a time
@@ -467,6 +473,7 @@ def eval_retry(
     log_images: bool | None = None,
     log_buffer: int | None = None,
     score: bool = True,
+    score_display: bool | None = None,
     max_retries: int | None = None,
     timeout: int | None = None,
     max_connections: int | None = None,
@@ -507,6 +514,7 @@ def eval_retry(
             If not specified, an appropriate default for the format and filesystem
             is chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
         score (bool): Score output (defaults to True)
+        score_display (bool | None): Show scoring metrics in realtime (defaults to True)
         max_retries (int | None):
             Maximum number of times to retry request.
         timeout: (int | None):
@@ -541,6 +549,7 @@ def eval_retry(
             log_images=log_images,
             log_buffer=log_buffer,
             score=score,
+            score_display=score_display,
             max_retries=max_retries,
             timeout=timeout,
             max_connections=max_connections,
@@ -565,6 +574,7 @@ async def eval_retry_async(
     log_images: bool | None = None,
     log_buffer: int | None = None,
     score: bool = True,
+    score_display: bool | None = None,
     max_retries: int | None = None,
     timeout: int | None = None,
     max_connections: int | None = None,
@@ -603,6 +613,7 @@ async def eval_retry_async(
             If not specified, an appropriate default for the format and filesystem
             is chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
         score (bool): Score output (defaults to True)
+        score_display (bool | None): Show scoring metrics in realtime (defaults to True)
         max_retries (int | None):
             Maximum number of times to retry request.
         timeout: (int | None):
@@ -699,6 +710,11 @@ async def eval_retry_async(
         log_buffer = (
             log_buffer if log_buffer is not None else eval_log.eval.config.log_buffer
         )
+        score_display = (
+            score_display
+            if score_display is not None
+            else eval_log.eval.config.score_display
+        )
 
         config = eval_log.plan.config
         config.max_retries = max_retries or config.max_retries
@@ -740,6 +756,7 @@ async def eval_retry_async(
                 log_images=log_images,
                 log_buffer=log_buffer,
                 score=score,
+                score_display=score_display,
                 **dict(config),
             )
         )[0]
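
The same setting is available through the Python API documented above. A minimal usage sketch (the task file and model below are placeholders):

    from inspect_ai import eval

    # run without realtime metric display; scoring itself still runs (score defaults to True)
    logs = eval("theory_of_mind.py", model="openai/gpt-4o", score_display=False)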
diff --git a/src/inspect_ai/_eval/task/run.py b/src/inspect_ai/_eval/task/run.py
index f5d0d8207..336bb4dab 100644
--- a/src/inspect_ai/_eval/task/run.py
+++ b/src/inspect_ai/_eval/task/run.py
@@ -217,7 +217,9 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
         log_location=log_location,
     )
 
-    with display().task(profile) as td:
+    with display().task(
+        profile,
+    ) as td:
         try:
             # start the log
             await log_start(logger, plan, generate_config)
@@ -252,7 +254,10 @@ async def generate(
 
             # track when samples complete and update progress as we go
             progress_results: list[dict[str, SampleScore]] = []
-            update_metrics_display = update_metrics_display_fn(td)
+            update_metrics_display = update_metrics_display_fn(
+                td,
+                display_metrics=profile.eval_config.score_display is not False,
+            )
 
             def sample_complete(sample_score: dict[str, SampleScore]) -> None:
                 # Capture the result
@@ -400,7 +405,10 @@ def sample_complete(sample_score: dict[str, SampleScore]) -> None:
 
 
 def update_metrics_display_fn(
-    td: TaskDisplay, initial_interval: float = 0, min_interval: float = 0.9
+    td: TaskDisplay,
+    initial_interval: float = 0,
+    min_interval: float = 0.9,
+    display_metrics: bool = True,
 ) -> Callable[
     [
         int,
@@ -420,6 +428,10 @@ def compute(
         reducers: ScoreReducer | list[ScoreReducer] | None,
         metrics: list[Metric] | dict[str, list[Metric]] | None,
     ) -> None:
+        # Don't compute metrics if they are not being displayed
+        if not display_metrics:
+            return None
+
         nonlocal next_compute_time
         time_start = time.perf_counter()
         if time_start >= next_compute_time:

diff --git a/src/inspect_ai/_util/display.py b/src/inspect_ai/_util/display.py
index 100732b13..538d744e6 100644
--- a/src/inspect_ai/_util/display.py
+++ b/src/inspect_ai/_util/display.py
@@ -14,6 +14,7 @@ def init_display_type(display: str | None = None) -> DisplayType:
     global _display_type
+    global _display_metrics
     display = (
         display or os.environ.get("INSPECT_DISPLAY", DEFAULT_DISPLAY).lower().strip()
     )
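
A simplified sketch (not the actual implementation) of the guard added to `update_metrics_display_fn` above: when display is disabled the returned closure exits immediately, otherwise it recomputes at most once per interval.

    import time
    from typing import Callable

    def make_compute(display_metrics: bool = True, min_interval: float = 0.9) -> Callable[[], None]:
        next_compute_time = time.perf_counter()

        def compute() -> None:
            nonlocal next_compute_time
            if not display_metrics:
                return  # metrics are neither computed nor rendered
            now = time.perf_counter()
            if now >= next_compute_time:
                ...  # recompute metrics and update the task display here
                next_compute_time = now + min_interval

        return compute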
diff --git a/src/inspect_ai/log/_log.py b/src/inspect_ai/log/_log.py
index 4a2d075e7..a18c34c63 100644
--- a/src/inspect_ai/log/_log.py
+++ b/src/inspect_ai/log/_log.py
@@ -94,6 +94,9 @@ class EvalConfig(BaseModel):
     log_buffer: int | None = Field(default=None)
     """Number of samples to buffer before writing log file."""
 
+    score_display: bool | None = Field(default=None)
+    """Display scoring metrics in realtime."""
+
     @property
     def max_messages(self) -> int | None:
         """Deprecated max_messages property."""
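
Because the setting is persisted on `EvalConfig`, a retried evaluation can inherit it from the original log (see `eval_retry_async` above). A small sketch, assuming `EvalConfig` is re-exported from `inspect_ai.log`:

    from inspect_ai.log import EvalConfig

    config = EvalConfig(score_display=False)
    assert config.score_display is False       # explicitly disabled
    assert EvalConfig().score_display is None  # unset; None is treated as display-on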