Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add additional goosebench evals #1571

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions crates/goose-bench/src/assets/squirrel-data.csv

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions crates/goose-bench/src/eval_suites/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
mod core;
mod evaluation;
mod factory;
mod small_models;

pub use evaluation::*;
pub use factory::{register_evaluation, EvaluationSuiteFactory};
57 changes: 57 additions & 0 deletions crates/goose-bench/src/eval_suites/small_models/blog_summary.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation;
use async_trait::async_trait;

/// Evaluation that asks the agent to summarize a blog post as a
/// five-item Markdown numbered list and scores the response format.
pub struct BlogSummary {}

impl BlogSummary {
    /// Creates a new `BlogSummary` evaluation.
    pub fn new() -> Self {
        BlogSummary {}
    }

    /// Returns true when `text` contains a Markdown numbered list with all
    /// of the markers `1.` through `5.`.
    ///
    /// Each marker must start a (whitespace-trimmed) line and be followed by
    /// whitespace. A bare substring search (`text.contains("1.")`) would
    /// incorrectly match things like "21." or "v1.2" anywhere in the prose.
    fn check_markdown_numbered_list(&self, text: &str) -> bool {
        (1..=5).all(|n| {
            let marker = format!("{}.", n);
            text.lines().any(|line| {
                line.trim_start()
                    .strip_prefix(&marker)
                    .is_some_and(|rest| rest.starts_with(char::is_whitespace))
            })
        })
    }
}

#[async_trait]
impl Evaluation for BlogSummary {
    /// Prompts the agent for a five-point Markdown summary of the linked
    /// blog post and records whether the final reply is formatted as a
    /// numbered list.
    async fn run(
        &self,
        mut agent: Box<dyn BenchAgent>,
        _: &mut BenchmarkWorkDir,
    ) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
        println!("BlogSummary - run");

        let response = agent.prompt("What are the top 5 most counterintuitive insights from this blog post? Format your response in Markdown with 5 numbered points (1. 2. 3. 4. 5.) https://huyenchip.com/2025/01/07/agents.html".to_string()).await?;

        // Only the last message is scored; absence of any message counts as
        // an invalid format.
        let has_markdown_list = response
            .last()
            .map_or(false, |msg| self.check_markdown_numbered_list(&msg.as_concat_text()));

        Ok(vec![(
            "valid_markdown_format".to_string(),
            EvaluationMetric::Boolean(has_markdown_list),
        )])
    }

    /// Evaluation identifier used in reports.
    fn name(&self) -> &str {
        "blog_summary"
    }

    /// Requires the built-in `developer` extension plus an external fetch
    /// MCP server to retrieve the blog post.
    fn required_extensions(&self) -> ExtensionRequirements {
        ExtensionRequirements {
            builtin: vec!["developer".to_string()],
            external: vec!["uvx mcp-server-fetch".to_string()],
        }
    }
}

register_evaluation!("small_models_fetch", BlogSummary);
105 changes: 105 additions & 0 deletions crates/goose-bench/src/eval_suites/small_models/flappy_bird.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation;
use async_trait::async_trait;
use goose::message::MessageContent;
use mcp_core::role::Role;
use serde_json::{self, Value};
use std::fs;

/// Evaluation that asks the agent to write a pygame Flappy Bird clone to
/// `flappy_bird.py` and scores both the tool usage and the generated code.
pub struct FlappyBird {}

impl FlappyBird {
    /// Creates a new `FlappyBird` evaluation.
    pub fn new() -> Self {
        FlappyBird {}
    }

    /// Heuristic check that `content` looks like a runnable pygame program:
    /// imports and initializes pygame, has a loop with event handling,
    /// defines a `main` function, and uses the `__main__` guard.
    fn check_python_implementation(&self, content: &str) -> bool {
        // Python accepts either quote style in the main guard; requiring
        // single quotes only would reject otherwise-correct programs that
        // write `if __name__ == "__main__":`.
        let has_main_guard = content.contains("if __name__ == '__main__'")
            || content.contains("if __name__ == \"__main__\"");

        content.contains("import pygame")
            && content.contains("pygame.init()")
            && content.contains("while") // Game loop
            && content.contains("pygame.event.get()") // Event handling
            && content.contains("def main") // Main function
            && has_main_guard // Main guard
    }
}

#[async_trait]
impl Evaluation for FlappyBird {
    /// Runs the evaluation: prompts the agent to build the game, then scores
    /// (1) whether the agent wrote `flappy_bird.py` through the
    /// `developer__text_editor` tool and (2), only if it did, whether the
    /// written file passes the pygame implementation heuristics.
    async fn run(
        &self,
        mut agent: Box<dyn BenchAgent>,
        work_dir: &mut BenchmarkWorkDir,
    ) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
        println!("FlappyBird - run");
        let mut metrics = Vec::new();

        let messages = agent.prompt("Create a Flappy Bird game in Python. Structure the code with a main function and use the if __name__ == '__main__': idiom. You must use pygame. The background color should be a light blue color. Pressing SPACE multiple times will accelerate the bird. The bird's shape should be a red circle. Place on the bottom some land colored as dark yellow chosen. Make a score shown on the top right side. Increment if you pass pipes and don't hit them. Make randomly spaced dark green pipes with enough space. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again. The final game should be written to a file named flappy_bird.py.".to_string()).await?;

        // Check if the agent used the text editor tool correctly: scan every
        // assistant message for a tool request that writes to flappy_bird.py.
        let valid_tool_call = messages.iter().any(|msg| {
            msg.role == Role::Assistant
                && msg.content.iter().any(|content| {
                    if let MessageContent::ToolRequest(tool_req) = content {
                        // tool_call is a Result; a failed tool request is
                        // treated the same as no tool request.
                        if let Ok(tool_call) = tool_req.tool_call.as_ref() {
                            // Check tool name and basic parameters
                            if tool_call.name != "developer__text_editor" {
                                return false;
                            }

                            // Parse the arguments as JSON
                            if let Ok(args) =
                                serde_json::from_value::<Value>(tool_call.arguments.clone())
                            {
                                // Only check command is write and correct filename;
                                // the path match is a substring test so any
                                // directory prefix is accepted.
                                args.get("command").and_then(Value::as_str) == Some("write")
                                    && args
                                        .get("path")
                                        .and_then(Value::as_str)
                                        .is_some_and(|s| s.contains("flappy_bird.py"))
                            } else {
                                false
                            }
                        } else {
                            false
                        }
                    } else {
                        false
                    }
                })
        });

        metrics.push((
            "used_write_tool".to_string(),
            EvaluationMetric::Boolean(valid_tool_call),
        ));

        // If tool was used correctly, check the actual file content.
        // Read failures are silently skipped: the metric is simply absent
        // rather than recorded as false.
        if valid_tool_call {
            if let Ok(file_path) = work_dir.fs_get("flappy_bird.py".to_string()) {
                if let Ok(content) = fs::read_to_string(file_path) {
                    let valid_implementation = self.check_python_implementation(&content);
                    metrics.push((
                        "valid_implementation".to_string(),
                        EvaluationMetric::Boolean(valid_implementation),
                    ));
                }
            }
        }

        Ok(metrics)
    }

    /// Evaluation identifier used in reports.
    fn name(&self) -> &str {
        "flappy_bird"
    }

    /// Only the built-in `developer` extension is required.
    fn required_extensions(&self) -> ExtensionRequirements {
        ExtensionRequirements {
            builtin: vec!["developer".to_string()],
            external: Vec::new(),
        }
    }
}

register_evaluation!("small_models", FlappyBird);
83 changes: 83 additions & 0 deletions crates/goose-bench/src/eval_suites/small_models/goose_wiki.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation;
use async_trait::async_trait;
use goose::message::MessageContent;
use mcp_core::role::Role;
use serde_json::{self, Value};

/// Evaluation that asks the agent to author a Wikipedia-style `index.html`
/// page about Goose and checks the produced HTML structure.
pub struct GooseWiki {}

impl GooseWiki {
    /// Creates a new `GooseWiki` evaluation.
    pub fn new() -> Self {
        Self {}
    }
}

#[async_trait]
impl Evaluation for GooseWiki {
    /// Runs the evaluation: prompts the agent to create a Wikipedia-style
    /// `index.html` about Goose, then records a single boolean metric that is
    /// true only when an assistant message contains a `developer__text_editor`
    /// write to index.html whose `file_text` has the basic html/head/body
    /// structure.
    async fn run(
        &self,
        mut agent: Box<dyn BenchAgent>,
        _: &mut BenchmarkWorkDir,
    ) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
        println!("GooseWiki - run");
        let mut metrics = Vec::new();

        let messages = agent.prompt("Create a Wikipedia-style web page about Goose (Block's AI agent) in a new index.html file. The page should be a complete, well-structured HTML document with proper head and body sections. Use heading tags (h1, h2, h3) to organize the content into clear sections. Include comprehensive information about Goose organized in a way similar to how Wikipedia presents technical topics.".to_string()).await?;

        // Check if the agent used the text editor tool to create index.html.
        // Only assistant-authored tool requests are considered.
        let valid_tool_call = messages.iter().any(|msg| {
            msg.role == Role::Assistant &&
            msg.content.iter().any(|content| {
                if let MessageContent::ToolRequest(tool_req) = content {
                    // A failed tool request (Err) counts as no tool call.
                    if let Ok(tool_call) = tool_req.tool_call.as_ref() {
                        // Check tool name is correct
                        if tool_call.name != "developer__text_editor" {
                            return false;
                        }

                        // Parse the arguments as JSON
                        if let Ok(args) = serde_json::from_value::<Value>(tool_call.arguments.clone()) {
                            // Check command is write and path contains index.html
                            args.get("command").and_then(Value::as_str) == Some("write") &&
                            args.get("path").and_then(Value::as_str).is_some_and(|s| s.contains("index.html")) &&
                            // Verify file_text contains basic HTML structure.
                            // Opening tags are matched without the closing '>'
                            // so attributes (e.g. <html lang="en">) still match.
                            args.get("file_text").and_then(Value::as_str).is_some_and(|s| {
                                s.contains("<html") && s.contains("</html>") &&
                                s.contains("<head") && s.contains("</head>") &&
                                s.contains("<body") && s.contains("</body>")
                            })
                        } else {
                            false
                        }
                    } else {
                        false
                    }
                } else {
                    false
                }
            })
        });

        metrics.push((
            "created_valid_html".to_string(),
            EvaluationMetric::Boolean(valid_tool_call),
        ));

        Ok(metrics)
    }

    /// Evaluation identifier used in reports.
    fn name(&self) -> &str {
        "goose_wiki"
    }

    /// Only the built-in `developer` extension is required.
    fn required_extensions(&self) -> ExtensionRequirements {
        ExtensionRequirements {
            builtin: vec!["developer".to_string()],
            external: Vec::new(),
        }
    }
}

register_evaluation!("small_models", GooseWiki);
5 changes: 5 additions & 0 deletions crates/goose-bench/src/eval_suites/small_models/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
mod blog_summary;
mod flappy_bird;
mod goose_wiki;
mod restaurant_research;
mod squirrel_census;
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
use crate::bench_work_dir::BenchmarkWorkDir;
use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric, ExtensionRequirements};
use crate::register_evaluation;
use async_trait::async_trait;

/// Evaluation that asks the agent to research Sichuanese restaurants and
/// respond as a Markdown bulleted list, scoring the response format.
pub struct RestaurantResearch {}

impl RestaurantResearch {
    /// Creates a new `RestaurantResearch` evaluation.
    pub fn new() -> Self {
        RestaurantResearch {}
    }

    /// Returns true when the text contains at least one Markdown bullet:
    /// a line whose (trimmed) content starts with "- " or "* ".
    fn check_markdown_bullets(&self, text: &str) -> bool {
        self.count_bullet_points(text) > 0
    }

    /// Counts Markdown bullet points: lines starting (after leading
    /// whitespace) with "- " or "* ".
    ///
    /// Anchoring to line starts avoids counting a hyphen or asterisk that
    /// merely appears mid-sentence ("well - known", "2 * 3"), which a raw
    /// `matches("- ")` substring count would inflate.
    fn count_bullet_points(&self, text: &str) -> i64 {
        text.lines()
            .filter(|line| {
                let trimmed = line.trim_start();
                trimmed.starts_with("- ") || trimmed.starts_with("* ")
            })
            .count() as i64
    }
}

#[async_trait]
impl Evaluation for RestaurantResearch {
    /// Prompts the agent for a bulleted Markdown restaurant list and records
    /// whether the final reply uses bullets plus how many bullets it has.
    async fn run(
        &self,
        mut agent: Box<dyn BenchAgent>,
        _: &mut BenchmarkWorkDir,
    ) -> anyhow::Result<Vec<(String, EvaluationMetric)>> {
        println!("RestaurantResearch - run");

        let response = agent.prompt("Search for and provide a current, detailed list of the best Sichuanese restaurants specifically in the East Village neighborhood of NYC. Format your response in Markdown using bullet points (either - or *) for each restaurant. For each restaurant include:
- Restaurant name and what they're known for
- Signature dishes
- Atmosphere/setting
- Any relevant details about reservations or dining experience
- What distinguishes them from others

Present the information in order of significance or quality. Focus specifically on Sichuanese establishments, not general Chinese restaurants.".to_string()).await?;

        // Only the final message is scored; when no message came back, no
        // metrics are emitted at all.
        let mut metrics = Vec::new();
        if let Some(last_msg) = response.last() {
            let text = last_msg.as_concat_text();
            metrics.push((
                "valid_markdown_format".to_string(),
                EvaluationMetric::Boolean(self.check_markdown_bullets(&text)),
            ));
            metrics.push((
                "bullet_point_count".to_string(),
                EvaluationMetric::Integer(self.count_bullet_points(&text)),
            ));
        }

        Ok(metrics)
    }

    /// Evaluation identifier used in reports.
    fn name(&self) -> &str {
        "restaurant_research"
    }

    /// Requires the built-in `developer` extension plus an external fetch
    /// MCP server for web research.
    fn required_extensions(&self) -> ExtensionRequirements {
        ExtensionRequirements {
            builtin: vec!["developer".to_string()],
            external: vec!["uvx mcp-server-fetch".to_string()],
        }
    }
}

register_evaluation!("small_models_fetch", RestaurantResearch);
Loading
Loading