Skip to content

Commit

Permalink
add strategies/visual_browser.py
Browse files Browse the repository at this point in the history
  • Loading branch information
abrichr committed Oct 8, 2024
1 parent 1d0e7c9 commit 8808b15
Show file tree
Hide file tree
Showing 6 changed files with 1,014 additions and 227 deletions.
64 changes: 64 additions & 0 deletions chrome_extension/content.js
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,7 @@ function handleUserEvent(event) {
timestamp: timestamp,
visibleHTMLString,
visibleHTMLDuration,
devicePixelRatio,
};

if (event instanceof KeyboardEvent) {
Expand Down Expand Up @@ -573,3 +574,66 @@ function setupScrollAndResizeListeners() {
window.addEventListener('scroll', handleScrollEvent, { passive: true });
window.addEventListener('resize', handleResizeEvent, { passive: true });
}

/* Debugging */

// Developer-only switch: when true, visible elements are outlined with
// red overlay boxes so their bounding rectangles can be inspected.
const DEBUG_DRAW = false;

// Launch the continuous overlay loop only when the switch above is on.
if (DEBUG_DRAW) startDrawingBoundingBoxes();

/**
 * Kick off a perpetual redraw loop that outlines every visible element.
 *
 * Each pass removes the overlays left by the previous pass, walks every
 * element in the document, draws an overlay for those that `isVisible`
 * accepts, and then schedules the next pass with `requestAnimationFrame`.
 * The loop has no stop condition, and every pass touches the whole DOM,
 * so this is intended for debugging only.
 */
function startDrawingBoundingBoxes() {
  const renderPass = () => {
    // Drop the previous pass's overlays first so boxes never pile up
    cleanUpBoundingBoxes();

    // Visit every element; only those reported visible get an overlay
    for (const node of document.querySelectorAll('*')) {
      if (!isVisible(node)) {
        continue;
      }
      drawBoundingBoxForElement(node);
    }

    // Re-arm so the next pass runs on the following animation frame
    requestAnimationFrame(renderPass);
  };

  // The first pass runs synchronously
  renderPass();
}

/**
 * Draw a red outline over the given element.
 *
 * The element's rectangle is read in client (viewport) coordinates and
 * shifted by the current scroll offsets, because the overlay is absolutely
 * positioned. The overlay is tagged with a `data-debug-bbox` attribute
 * (carrying the element's `data-id`, or an empty string) so it can be found
 * and removed later.
 *
 * Does nothing if `document.body` is not available yet.
 * @param {HTMLElement} element - The DOM element to draw the bounding box for.
 */
function drawBoundingBoxForElement(element) {
  const host = document.body;
  if (!host) {
    // Nothing to attach the overlay to (e.g. the body has not been parsed yet)
    return;
  }

  const { top, left, bottom, right } = element.getBoundingClientRect();

  // Create and style the overlay to represent the bounding box
  const bboxOverlay = document.createElement('div');
  const style = bboxOverlay.style;
  style.position = 'absolute';
  style.border = '2px solid red';
  // Keep the border inside width/height; with the default content-box sizing
  // the overlay would be 4px wider and taller than the element's rectangle.
  style.boxSizing = 'border-box';
  style.top = `${top + window.scrollY}px`; // Adjust for scrolling
  style.left = `${left + window.scrollX}px`; // Adjust for scrolling
  style.width = `${right - left}px`;
  style.height = `${bottom - top}px`;
  style.pointerEvents = 'none'; // Prevent interference with normal element interactions
  style.zIndex = '9999'; // Ensure it's drawn on top
  bboxOverlay.setAttribute('data-debug-bbox', element.getAttribute('data-id') || '');

  // Append the overlay to the body
  host.appendChild(bboxOverlay);
}

/**
 * Remove every debug overlay currently in the document.
 * Overlays are identified by the presence of the `data-debug-bbox` attribute.
 */
function cleanUpBoundingBoxes() {
  const staleOverlays = document.querySelectorAll('[data-debug-bbox]');
  for (const box of staleOverlays) {
    box.remove();
  }
}
86 changes: 43 additions & 43 deletions openadapt/browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def set_browser_mode(
websocket.send(message)


def add_screen_tlbr(browser_events: list[models.BrowserEvent]) -> None:
def add_screen_tlbr(browser_events: list[models.BrowserEvent], target_element_only: bool = False) -> None:
"""Computes and adds the 'data-tlbr-screen' attribute for each element.
Uses coordMappings provided by JavaScript events. If 'data-tlbr-screen' already
Expand All @@ -102,6 +102,8 @@ def add_screen_tlbr(browser_events: list[models.BrowserEvent]) -> None:
Args:
browser_events (list[models.BrowserEvent]): list of browser events to process.
target_element_only (bool): if True, only process the target element. If False,
process all elements with the 'data-tlbr-client' property.
"""
# Initialize variables to store the most recent valid mappings
latest_valid_x_mappings = None
Expand All @@ -115,9 +117,6 @@ def add_screen_tlbr(browser_events: list[models.BrowserEvent]) -> None:
logger.warning(exc)
continue

if not target_element:
continue

# Extract coordMappings from the message
message = event.message
coord_mappings = message.get("coordMappings", {})
Expand All @@ -137,7 +136,7 @@ def add_screen_tlbr(browser_events: list[models.BrowserEvent]) -> None:
# Reuse the most recent valid mappings
if latest_valid_x_mappings is None or latest_valid_y_mappings is None:
logger.warning(
f"No valid coordinate mappings available for element: {target_id}"
f"No valid coordinate mappings available for element: {target_element.get('id')}"
)
continue # No valid mappings available, skip this event

Expand All @@ -152,50 +151,51 @@ def add_screen_tlbr(browser_events: list[models.BrowserEvent]) -> None:
y_mappings["client"], y_mappings["screen"]
)

# Only process the screen coordinates
tlbr_attr = "data-tlbr-screen"
try:
# Get existing screen coordinates if present
existing_screen_coords = (
target_element[tlbr_attr] if tlbr_attr in target_element.attrs else None
)
except KeyError:
existing_screen_coords = None
# Define function to process element
def process_element(element):
# Compute the 'data-tlbr-screen' attribute
tlbr_attr = "data-tlbr-screen"
existing_screen_coords = element.get(tlbr_attr, None)

# Compute screen coordinates
client_coords = target_element.get("data-tlbr-client")
if not client_coords:
logger.warning(
f"Missing client coordinates for element with id: {target_id}"
client_coords = element.get("data-tlbr-client")
if not client_coords:
logger.warning(f"Missing client coordinates for element with id: {element.get('id')}")
return

# Extract client coordinates
client_top, client_left, client_bottom, client_right = map(
float, client_coords.split(",")
)
continue

# Extract client coordinates
client_top, client_left, client_bottom, client_right = map(
float, client_coords.split(",")
)
# Calculate screen coordinates using the computed scale and offset
screen_top = sy_scale * client_top + sy_offset
screen_left = sx_scale * client_left + sx_offset
screen_bottom = sy_scale * client_bottom + sy_offset
screen_right = sx_scale * client_right + sx_offset

# New computed screen coordinates
new_screen_coords = f"{screen_top},{screen_left},{screen_bottom},{screen_right}"
logger.info(f"{client_coords=} {existing_screen_coords=} {new_screen_coords=}")

# Check for existing data-tlbr-screen attribute
if existing_screen_coords:
assert existing_screen_coords == new_screen_coords, (
"Mismatch in computed and existing screen coordinates:"
f" {existing_screen_coords} != {new_screen_coords}"
)

# Calculate screen coordinates using the computed scale and offset
screen_top = sy_scale * client_top + sy_offset
screen_left = sx_scale * client_left + sx_offset
screen_bottom = sy_scale * client_bottom + sy_offset
screen_right = sx_scale * client_right + sx_offset

# New computed screen coordinates
new_screen_coords = f"{screen_top},{screen_left},{screen_bottom},{screen_right}"
logger.info(f"{client_coords=} {existing_screen_coords=} {new_screen_coords=}")

# Check for existing data-tlbr-screen attribute
if existing_screen_coords:
assert existing_screen_coords == new_screen_coords, (
"Mismatch in computed and existing screen coordinates:"
f" {existing_screen_coords} != {new_screen_coords}"
)
# Update the attribute with the new value
element["data-tlbr-screen"] = new_screen_coords

# Update the attribute with the new value
target_element["data-tlbr-screen"] = new_screen_coords
# Process elements based on target_element_only flag
if target_element_only:
process_element(target_element)
else:
elements_to_process = soup.find_all(attrs={"data-tlbr-client": True})
for element in elements_to_process:
process_element(element)

# Write the updated element back to the message
# Write the updated elements back to the message
message["visibleHTMLString"] = str(soup)

logger.info("Finished processing all browser events for screen coordinates.")
Expand Down
33 changes: 25 additions & 8 deletions openadapt/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,9 @@ def available_segment_descriptions(self, value: list[str]) -> None:
)

@property
def active_browser_element(self) -> BeautifulSoup:
def active_browser_element(self) -> BeautifulSoup | None:
if not self._active_browser_element:
return None
return utils.parse_html(self._active_browser_element)

@active_browser_element.setter
Expand All @@ -232,10 +234,12 @@ def active_browser_element(self, value: BeautifulSoup) -> None:
self._active_browser_element = str(value)

@property
def available_browser_elements(self) -> BeautifulSoup:
def available_browser_elements(self) -> BeautifulSoup | None:
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/#navigating-the-tree
# The value True matches every tag it can. This code finds all the tags in the
# document, but none of the text strings
if not self._available_browser_elements:
return None
return utils.parse_html(self._available_browser_elements)

@available_browser_elements.setter
Expand Down Expand Up @@ -535,6 +539,8 @@ def to_prompt_dict(self) -> dict[str, Any]:
Returns:
dictionary containing relevant properties from the ActionEvent.
"""
if self.active_browser_element:
import ipdb; ipdb.set_trace()
action_dict = deepcopy(
{
key: val
Expand All @@ -550,10 +556,22 @@ def to_prompt_dict(self) -> dict[str, Any]:
for key in ("mouse_x", "mouse_y", "mouse_dx", "mouse_dy"):
if key in action_dict:
del action_dict[key]
# TODO XXX: add target_segment_description?

# Manually add properties to the dictionary
if self.available_segment_descriptions:
action_dict["available_segment_descriptions"] = (
self.available_segment_descriptions
)
if self.active_browser_element:
action_dict["active_browser_element"] = str(self.active_browser_element)
if self.available_browser_elements:
# TODO XXX: available browser_elements contains raw HTML. We need to
# prompt to convert into descriptions.
action_dict["available_browser_elements"] = str(self.available_browser_elements)

if self.active_browser_element:
import ipdb; ipdb.set_trace()
return action_dict


Expand Down Expand Up @@ -725,8 +743,7 @@ def parse(self) -> tuple[BeautifulSoup, BeautifulSoup | None]:
"""Parses the visible HTML and optionally extracts the target element.
This method processes the browser event to parse the visible HTML and,
if the event type is "click", extracts the target HTML element that was
clicked.
if the event has a targetId, extracts the target HTML element.
Returns:
A tuple containing:
Expand All @@ -743,14 +760,14 @@ def parse(self) -> tuple[BeautifulSoup, BeautifulSoup | None]:
assert visible_html_string, "Cannot parse without visibleHTMLstring"

# Parse the visible HTML using BeautifulSoup
soup = BeautifulSoup(visible_html_string, "html.parser")
soup = utils.parse_html(visible_html_string)

event_type = message.get("eventType")
target_element = None

if event_type == "click":
# Fetch the target element using its data-id
target_id = message.get("targetId")
# Fetch the target element using its data-id
target_id = message.get("targetId")
if target_id:
target_element = soup.find(attrs={"data-id": target_id})
assert target_element, f"No target element found for targetId: {target_id}"

Expand Down
1 change: 1 addition & 0 deletions openadapt/strategies/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from openadapt.strategies.base import BaseReplayStrategy
from openadapt.strategies.browser import BrowserReplayStrategy
from openadapt.strategies.visual_browser import VisualBrowserReplayStrategy

# disabled because importing is expensive
# from openadapt.strategies.demo import DemoReplayStrategy
Expand Down
Loading

0 comments on commit 8808b15

Please sign in to comment.