Skip to content

Commit

Permalink
Fixed retina resolution scaling issues
Browse files Browse the repository at this point in the history
  • Loading branch information
paladinCTO committed Oct 27, 2024
1 parent 8cc7608 commit 05296d7
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 35 deletions.
10 changes: 6 additions & 4 deletions example/google.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import { Builder, Browser } from 'selenium-webdriver';
import { ServiceBuilder } from 'selenium-webdriver/firefox';
// import { ServiceBuilder } from 'selenium-webdriver/firefox';

import { AnthropicPlanner, BrowserAgent, pauseForInput } from 'cerebellum-ai';


(async function example() {
let driver = await new Builder()
.forBrowser(Browser.FIREFOX)
.setFirefoxService(new ServiceBuilder('/snap/bin/geckodriver')) // Necessary for snap based firefox installs
.forBrowser(Browser.CHROME)
// .setFirefoxService(new ServiceBuilder('/snap/bin/geckodriver')) // Necessary for snap based firefox installs
.build();

try {
Expand All @@ -18,7 +18,9 @@ import { AnthropicPlanner, BrowserAgent, pauseForInput } from 'cerebellum-ai';

// Create the Cerebellum browser agent
const planner = new AnthropicPlanner(process.env.ANTHROPIC_API_KEY as string);
const agent = new BrowserAgent(driver, planner, goal);
const agent = new BrowserAgent(driver, planner, goal, {
pauseAfterEachAction: true,
});

// Have Cerebellum take over website navigation
await agent.start();
Expand Down
14 changes: 2 additions & 12 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 12 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "cerebellum-ai",
"version": "0.0.4",
"version": "0.0.5",
"description": "LLM based browser automation",
"type": "module",
"main": "dist/index.js",
Expand All @@ -12,6 +12,7 @@
},
"scripts": {
"build": "tsc",
"watch": "tsc -w",
"prepublish": "npm run build",
"test": "echo \"Error: no test specified\" && exit 1"
},
Expand All @@ -27,5 +28,14 @@
"@anthropic-ai/sdk": "^0.30.1",
"selenium-webdriver": "^4.25.0",
"sharp": "^0.33.5"
}
},
"keywords": [
"ai",
"automation",
"browser",
"selenium",
"webdriver",
"llm",
"claude"
]
}
6 changes: 3 additions & 3 deletions src/browser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ export class BrowserAgent {
}

public async getState(): Promise<BrowserState> {
const size = await this.driver.manage().window().getSize();
const size = await this.driver.executeScript('return { x: window.innerWidth, y: window.innerHeight }') as Coordinate;
const screenshot = await this.driver.takeScreenshot();
const url = await this.driver.getCurrentUrl();

Expand All @@ -88,8 +88,8 @@ export class BrowserAgent {

return {
screenshot: screenshot,
height: size.height,
width: size.width,
height: size.y,
width: size.x,
url: url,
mouse: mousePosition,
};
Expand Down
40 changes: 26 additions & 14 deletions src/planners/anthropic.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { ActionPlanner, BrowserAction, BrowserState, BrowserStep, Coordinate } f
import sharp from 'sharp';
import { BetaMessageParam } from '@anthropic-ai/sdk/resources/beta/messages/messages';

import fs from 'fs';
interface ScalingRatio {
ratio: Coordinate;
oldSize: Coordinate;
Expand Down Expand Up @@ -121,7 +122,17 @@ ${additionalContext}
public async resizeScreenshot(screenshot: string): Promise<string> {
const screenshotBuffer = Buffer.from(screenshot, 'base64');
const sharpImage = sharp(screenshotBuffer);
const resizedImg = await sharpImage.resize(1280, 800, { fit: 'inside' })
const resizedImg = await sharpImage.resize(1280, 800, { fit: 'inside' });
const imgBuffer = await resizedImg.toBuffer();

const imgStr = imgBuffer.toString('base64');
return imgStr;
}

public async resizeImageToDimensions(image: string, newDim: Coordinate): Promise<string> {
const screenshotBuffer = Buffer.from(image, 'base64');
const sharpImage = sharp(screenshotBuffer);
const resizedImg = await sharpImage.resize(newDim.x, newDim.y, { fit: 'fill' });
const imgBuffer = await resizedImg.toBuffer();

const imgStr = imgBuffer.toString('base64');
Expand Down Expand Up @@ -171,21 +182,23 @@ ${additionalContext}
const contentSubMsg: (Anthropic.Beta.Messages.BetaTextBlockParam | Anthropic.Beta.Messages.BetaImageBlockParam)[] = [];

if (options.mousePosition) {
const imgDim = await this.getDimensions(currentState.screenshot);
const scaling = this.getScalingRatio(imgDim)
const scaledCoord = this.browserToLLMCoordinates(currentState.mouse, scaling)
resultText += `After action mouse cursor is at X: ${scaledCoord.x}, Y: ${scaledCoord.y}\n\n`
const imgDim = {x: currentState.width, y: currentState.height};
const scaling = this.getScalingRatio(imgDim);
const scaledCoord = this.browserToLLMCoordinates(currentState.mouse, scaling);
resultText += `After action mouse cursor is at X: ${scaledCoord.x}, Y: ${scaledCoord.y}\n\n`;
}

if (options.url) {
resultText += `After action, the tab's URL is ${currentState.url}\n\n`
resultText += `After action, the tab's URL is ${currentState.url}\n\n`;
}

if (options.screenshot) {
resultText += 'Here is a screenshot of the browswer after the action was performed.\n\n';

const markedImage = await this.markScreenshotWithCursor(currentState.screenshot, currentState.mouse);
const viewportImage = await this.resizeImageToDimensions(currentState.screenshot, {x: currentState.width, y: currentState.height});
const markedImage = await this.markScreenshotWithCursor(viewportImage, currentState.mouse);
const resized = await this.resizeScreenshot(markedImage);

fs.writeFileSync('tmp.png', resized, 'base64');

contentSubMsg.push({
type: 'image',
Expand All @@ -194,11 +207,11 @@ ${additionalContext}
media_type: 'image/png',
data: resized
}
})
});
}

if (resultText === '') { // Put a generic text explanation for no URL or result
resultText += 'Action was performed.'
resultText += 'Action was performed.';
}

contentSubMsg.unshift({
Expand Down Expand Up @@ -386,8 +399,7 @@ ${additionalContext}
currentState: BrowserState, sessionHistory: BrowserStep[]): Promise<BrowserAction> {
const systemPrompt = this.formatSystemPrompt(goal, additionalContext, additionalInstructions);
const messages = await this.formatIntoMessages(goal, currentState, sessionHistory);
const imgDim = await this.getDimensions(currentState.screenshot);
const scaling = this.getScalingRatio(imgDim)
const scaling = this.getScalingRatio({ x: currentState.width, y: currentState.height });

const response = await this.client.beta.messages.create({
model: "claude-3-5-sonnet-20241022",
Expand All @@ -397,8 +409,8 @@ ${additionalContext}
{
type: "computer_20241022",
name: "computer",
display_width_px: scaling.newSize.x,
display_height_px: scaling.newSize.y,
display_width_px: currentState.width,
display_height_px: currentState.height,
display_number: 1
},
{
Expand Down

0 comments on commit 05296d7

Please sign in to comment.