From 05296d7e74de8d140ba8e41764a8694099ec1eaa Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 27 Oct 2024 16:01:48 -0700 Subject: [PATCH] Fixed retina resolution scaling issues --- example/google.ts | 10 ++++++---- package-lock.json | 14 ++------------ package.json | 14 ++++++++++++-- src/browser.ts | 6 +++--- src/planners/anthropic.ts | 40 +++++++++++++++++++++++++-------------- 5 files changed, 49 insertions(+), 35 deletions(-) diff --git a/example/google.ts b/example/google.ts index 6cd9e57..3db802e 100644 --- a/example/google.ts +++ b/example/google.ts @@ -1,13 +1,13 @@ import { Builder, Browser } from 'selenium-webdriver'; -import { ServiceBuilder } from 'selenium-webdriver/firefox'; +// import { ServiceBuilder } from 'selenium-webdriver/firefox'; import { AnthropicPlanner, BrowserAgent, pauseForInput } from 'cerebellum-ai'; (async function example() { let driver = await new Builder() - .forBrowser(Browser.FIREFOX) - .setFirefoxService(new ServiceBuilder('/snap/bin/geckodriver')) // Necessary for snap based firefox installs + .forBrowser(Browser.CHROME) + // .setFirefoxService(new ServiceBuilder('/snap/bin/geckodriver')) // Necessary for snap based firefox installs .build(); try { @@ -18,7 +18,9 @@ import { AnthropicPlanner, BrowserAgent, pauseForInput } from 'cerebellum-ai'; // Create the Cerebellum browser agent const planner = new AnthropicPlanner(process.env.ANTHROPIC_API_KEY as string); - const agent = new BrowserAgent(driver, planner, goal); + const agent = new BrowserAgent(driver, planner, goal, { + pauseAfterEachAction: true, + }); // Have Cerebellum takeover website navigation await agent.start(); diff --git a/package-lock.json b/package-lock.json index 3a846ab..51035dc 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,16 +1,15 @@ { "name": "cerebellum-ai", - "version": "0.0.1", + "version": "0.0.4", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "cerebellum-ai", - "version": "0.0.1", + "version": "0.0.4", "license": "MIT", "dependencies": { "@anthropic-ai/sdk": "^0.30.1", - "npmrc": "1.1.1", "selenium-webdriver": "^4.25.0", "sharp": "^0.33.5" }, @@ -1223,15 +1222,6 @@ } } }, - "node_modules/npmrc": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/npmrc/-/npmrc-1.1.1.tgz", - "integrity": "sha512-uBbR8YNnIvKhMCDyz27Ffvnm2bLntMesFcwiz6Yp0DYSQB+JqgM1MocewFwMlItKCex+1ytii/+vmLhigO1qhg==", - "license": "BSD", - "bin": { - "npmrc": "npmrc.js" - } - }, "node_modules/pako": { "version": "1.0.11", "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz", diff --git a/package.json b/package.json index 1e80df3..e4921e0 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "cerebellum-ai", - "version": "0.0.4", + "version": "0.0.5", "description": "LLM based browser automation", "type": "module", "main": "dist/index.js", @@ -12,6 +12,7 @@ }, "scripts": { "build": "tsc", + "watch": "tsc -w", "prepublish": "npm run build", "test": "echo \"Error: no test specified\" && exit 1" }, @@ -27,5 +28,14 @@ "@anthropic-ai/sdk": "^0.30.1", "selenium-webdriver": "^4.25.0", "sharp": "^0.33.5" - } + }, + "keywords": [ + "ai", + "automation", + "browser", + "selenium", + "webdriver", + "llm", + "claude" + ] } diff --git a/src/browser.ts b/src/browser.ts index 65fe9cc..c380c48 100644 --- a/src/browser.ts +++ b/src/browser.ts @@ -79,7 +79,7 @@ export class BrowserAgent { } public async getState(): Promise { - const size = await this.driver.manage().window().getSize(); + const size = await this.driver.executeScript('return { x: window.innerWidth, y: window.innerHeight }') as Coordinate; const screenshot = await this.driver.takeScreenshot(); const url = await this.driver.getCurrentUrl(); @@ -88,8 +88,8 @@ export class BrowserAgent { return { screenshot: screenshot, - height: size.height, - width: size.width, + height: size.y, + width: size.x, url: url, mouse: mousePosition, }; diff --git a/src/planners/anthropic.ts b/src/planners/anthropic.ts index 7dfacde..1a6b930 100644 --- a/src/planners/anthropic.ts +++ b/src/planners/anthropic.ts @@ -3,6 +3,7 @@ import { ActionPlanner, BrowserAction, BrowserState, BrowserStep, Coordinate } f import sharp from 'sharp'; import { BetaMessageParam } from '@anthropic-ai/sdk/resources/beta/messages/messages'; +import fs from 'fs'; interface ScalingRatio { ratio: Coordinate; oldSize: Coordinate; @@ -121,7 +122,17 @@ ${additionalContext} public async resizeScreenshot(screenshot: string): Promise { const screenshotBuffer = Buffer.from(screenshot, 'base64'); const sharpImage = sharp(screenshotBuffer); - const resizedImg = await sharpImage.resize(1280, 800, { fit: 'inside' }) + const resizedImg = await sharpImage.resize(1280, 800, { fit: 'inside' }); + const imgBuffer = await resizedImg.toBuffer(); + + const imgStr = imgBuffer.toString('base64'); + return imgStr; + } + + public async resizeImageToDimensions(image: string, newDim: Coordinate): Promise { + const screenshotBuffer = Buffer.from(image, 'base64'); + const sharpImage = sharp(screenshotBuffer); + const resizedImg = await sharpImage.resize(newDim.x, newDim.y, { fit: 'fill' }); const imgBuffer = await resizedImg.toBuffer(); const imgStr = imgBuffer.toString('base64'); @@ -171,21 +182,23 @@ ${additionalContext} const contentSubMsg: (Anthropic.Beta.Messages.BetaTextBlockParam | Anthropic.Beta.Messages.BetaImageBlockParam)[] = []; if (options.mousePosition) { - const imgDim = await this.getDimensions(currentState.screenshot); - const scaling = this.getScalingRatio(imgDim) - const scaledCoord = this.browserToLLMCoordinates(currentState.mouse, scaling) - resultText += `After action mouse cursor is at X: ${scaledCoord.x}, Y: ${scaledCoord.y}\n\n` + const imgDim = {x: currentState.width, y: currentState.height}; + const scaling = this.getScalingRatio(imgDim); + const scaledCoord = this.browserToLLMCoordinates(currentState.mouse, scaling); + resultText += `After action mouse cursor is at X: ${scaledCoord.x}, Y: ${scaledCoord.y}\n\n`; } if (options.url) { - resultText += `After action, the tab's URL is ${currentState.url}\n\n` + resultText += `After action, the tab's URL is ${currentState.url}\n\n`; } if (options.screenshot) { resultText += 'Here is a screenshot of the browswer after the action was performed.\n\n'; - - const markedImage = await this.markScreenshotWithCursor(currentState.screenshot, currentState.mouse); + const viewportImage = await this.resizeImageToDimensions(currentState.screenshot, {x: currentState.width, y: currentState.height}); + const markedImage = await this.markScreenshotWithCursor(viewportImage, currentState.mouse); const resized = await this.resizeScreenshot(markedImage); + + fs.writeFileSync('tmp.png', resized, 'base64'); contentSubMsg.push({ type: 'image', @@ -194,11 +207,11 @@ ${additionalContext} media_type: 'image/png', data: resized } - }) + }); } if (resultText === '') { // Put a generic text explaination for no URL or result - resultText += 'Action was performed.' + resultText += 'Action was performed.'; } contentSubMsg.unshift({ @@ -386,8 +399,7 @@ ${additionalContext} currentState: BrowserState, sessionHistory: BrowserStep[]): Promise { const systemPrompt = this.formatSystemPrompt(goal, additionalContext, additionalInstructions); const messages = await this.formatIntoMessages(goal, currentState, sessionHistory); - const imgDim = await this.getDimensions(currentState.screenshot); - const scaling = this.getScalingRatio(imgDim) + const scaling = this.getScalingRatio({ x: currentState.width, y: currentState.height }); const response = await this.client.beta.messages.create({ model: "claude-3-5-sonnet-20241022", @@ -397,8 +409,8 @@ ${additionalContext} { type: "computer_20241022", name: "computer", - display_width_px: scaling.newSize.x, - display_height_px: scaling.newSize.y, + display_width_px: currentState.width, + display_height_px: currentState.height, display_number: 1 }, {