Skip to content

Commit

Permalink
fix(action-parser): box coordinates normalization (#161)
Browse files: browse the repository at this point in the history
  • Loading branch information
ulivz authored Feb 28, 2025
1 parent 9c5f8f4 commit 2845023
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 4 deletions.
8 changes: 4 additions & 4 deletions packages/action-parser/src/actionParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -109,10 +109,10 @@ export function parseActionVlm(
const numbers = oriBox.replace(/[()[\]]/g, '').split(',');

// Convert to float and scale
const floatNumbers = numbers.map(
(num: string, idx) =>
Number.parseFloat(num) / (factors[idx] || factors[0]),
);
const floatNumbers = numbers.map((num, idx) => {
const factorIndex = idx % 2;
return Number.parseFloat(num) / factors[factorIndex];
});

if (floatNumbers.length === 2) {
floatNumbers.push(floatNumbers[0], floatNumbers[1]);
Expand Down
43 changes: 43 additions & 0 deletions packages/action-parser/test/actionParser.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -172,4 +172,47 @@ Action: click(start_box='(100,200)')
]);
});
});

// Regression suite for four-value boxes ([x1, y1, x2, y2]) passed to parseActionVlm.
// Each case feeds a model response plus a two-element array as the second argument
// and checks that the x values (indices 0 and 2) are divided by the first element
// and the y values (indices 1 and 3) by the second.
// NOTE(review): the second argument is presumably the [width, height] scaling
// factors — confirm against the parseActionVlm signature.
describe('Box coordinates normalization', () => {
// Case 1: second argument [1366, 768]; all four coordinates must be scaled per axis.
it('should correctly normalize box with four coordinates using custom factors', () => {
// The template literal below is the raw model output; its exact text (including
// the line break between "Thought" and "Action") is part of the test input.
const input = `Thought: I need to click on this element
Action: click(start_box='[348, 333, 928, 365]')`;

const result = parseActionVlm(input, [1366, 768]);

// A single parsed action is expected, with start_box serialized as a
// bracketed, comma-separated string of the normalized values.
expect(result).toEqual([
{
reflection: null,
thought: 'I need to click on this element',
action_type: 'click',
action_inputs: {
// Verify that x1, y1, x2, y2 are all normalized correctly
// x1 = 348/1366, y1 = 333/768, x2 = 928/1366, y2 = 365/768
start_box:
'[0.2547584187408492,0.43359375,0.6793557833089312,0.4752604166666667]',
},
},
]);
});

// Case 2: same check with [1280, 800] as the second argument, chosen so that
// every quotient has a short exact decimal representation.
it('should handle real-world screen dimensions with four coordinates', () => {
// Raw model output; the multi-line string content is significant.
const input = `Thought: I need to click on this element in the browser
Action: click(start_box='[287, 111, 313, 124]')`;

const result = parseActionVlm(input, [1280, 800]);

expect(result).toEqual([
{
reflection: null,
thought: 'I need to click on this element in the browser',
action_type: 'click',
action_inputs: {
// Verify the normalized results at the actual screen size
// x1 = 287/1280, y1 = 111/800, x2 = 313/1280, y2 = 124/800
start_box: '[0.22421875,0.13875,0.24453125,0.155]',
},
},
]);
});
});
});

0 comments on commit 2845023

Please sign in to comment.