diff --git a/README.md b/README.md
index f136449..bc1ace1 100644
--- a/README.md
+++ b/README.md
@@ -259,6 +259,15 @@ Add this snippet to your Trae `mcp_settings.json`:
 - `pg_backups` - Manage database backups and schedules.
 - `pg_upgrade` - Upgrade PostgreSQL to a newer version.
 
+
+## Running evals
+
+The evals package loads an MCP client that runs the `index.ts` file directly, so there is no need to rebuild between tests. Environment variables can be passed by prefixing the `npx` command, as shown below. Full documentation is available [here](https://www.mcpevals.io/docs).
+
+```bash
+OPENAI_API_KEY=your-key npx mcp-eval src/evals/evals.ts src/tools/data.ts
+```
+
 ## Debugging
 
 You can use the [MCP inspector](https://modelcontextprotocol.io/docs/tools/inspector) or the
diff --git a/package.json b/package.json
index f56ebe1..38c83c2 100644
--- a/package.json
+++ b/package.json
@@ -14,7 +14,8 @@
     "jsonschema": "^1.5.0",
     "tar-stream": "^3.1.7",
     "zod": "^3.24.2",
-    "zod-to-json-schema": "^3.24.5"
+    "zod-to-json-schema": "^3.24.5",
+    "mcp-evals": "^1.0.18"
   },
   "devDependencies": {
     "@eslint/eslintrc": "^3.3.1",
@@ -83,4 +84,4 @@
     "exec-perms": "shx chmod +x dist/*.js",
     "test:watch": "nyc mocha --watch --forbid-only"
   }
-}
+}
\ No newline at end of file
diff --git a/src/evals/evals.ts b/src/evals/evals.ts
new file mode 100644
index 0000000..708e0ef
--- /dev/null
+++ b/src/evals/evals.ts
@@ -0,0 +1,59 @@
+// evals.ts
+
+import { EvalConfig } from 'mcp-evals';
+import { openai } from "@ai-sdk/openai";
+import { grade, EvalFunction } from "mcp-evals";
+
+const pg_psqlEval: EvalFunction = {
+  name: "pg_psql Tool Evaluation",
+  description: "Evaluates the functionality of the pg_psql tool",
+  run: async () => {
+    const result = await grade(openai("gpt-4"), "How can I create a new table named 'users' with two columns id and name, then retrieve all rows using pg_psql?");
+    return JSON.parse(result);
+  }
+};
+
+const pg_infoEval: EvalFunction = {
+  name: 'PG Info Tool Evaluation',
+  description: 'Evaluates the database status retrieval from the PG Info tool',
+  run: async () => {
+    const result = await grade(openai("gpt-4"), "Show me the status of the 'mydb' database in the 'myapp' app using the pg_info tool.");
+    return JSON.parse(result);
+  }
+};
+
+const pg_psEval: EvalFunction = {
+  name: 'pg_ps tool evaluation',
+  description: 'Evaluates the monitoring of active queries: progress, resources, performance',
+  run: async () => {
+    const result = await grade(openai("gpt-4"), "Show me the currently running queries on the 'mydb' database with verbose output so I can monitor resource usage on Heroku");
+    return JSON.parse(result);
+  }
+};
+
+const pgLocksEval: EvalFunction = {
+  name: 'pg_locks Tool Evaluation',
+  description: 'Evaluates the functionality of the pg_locks tool',
+  run: async () => {
+    const result = await grade(openai("gpt-4"), "Please analyze the locks for the database 'mydb' to check for blocked queries or deadlocks.");
+    return JSON.parse(result);
+  }
+};
+
+const pg_outliersEval: EvalFunction = {
+  name: 'pg_outliers Tool Evaluation',
+  description: 'Evaluates the tool for finding resource-heavy queries in Postgres',
+  run: async () => {
+    const result = await grade(openai("gpt-4"), "Which queries in my Postgres database are the most resource-intensive and how can I optimize them?");
+    return JSON.parse(result);
+  }
+};
+
+const config: EvalConfig = {
+  model: openai("gpt-4"),
+  evals: [pg_psqlEval, pg_infoEval, pg_psEval, pgLocksEval, pg_outliersEval]
+};
+
+export default config;
+
+export const evals = [pg_psqlEval, pg_infoEval, pg_psEval, pgLocksEval, pg_outliersEval];
\ No newline at end of file