add strategies/visual_browser.py

OpenAdaptAI · Oct 8, 2024 · 8808b15 · 8808b15
1 parent 1d0e7c9
commit 8808b15
Show file tree

Hide file tree

Showing 6 changed files with 1,014 additions and 227 deletions.
diff --git a/chrome_extension/content.js b/chrome_extension/content.js
@@ -410,6 +410,7 @@ function handleUserEvent(event) {
     timestamp: timestamp,
     visibleHTMLString,
     visibleHTMLDuration,
+    devicePixelRatio,
   };
 
   if (event instanceof KeyboardEvent) {
@@ -573,3 +574,66 @@ function setupScrollAndResizeListeners() {
   window.addEventListener('scroll', handleScrollEvent, { passive: true });
   window.addEventListener('resize', handleResizeEvent, { passive: true });
 }
+
+/* Debugging */
+
+const DEBUG_DRAW = false;  // Set to true to overlay red bounding boxes on visible elements
+
+// Start continuous drawing if DEBUG_DRAW is enabled (the function declaration below is hoisted)
+if (DEBUG_DRAW) {
+  startDrawingBoundingBoxes();
+}
+
+/**
+ * Debug aid: redraw bounding boxes for all visible elements on every animation frame.
+ */
+function startDrawingBoundingBoxes() {
+  function drawBoundingBoxesLoop() {
+    // Clean up existing bounding boxes before drawing new ones
+    cleanUpBoundingBoxes();
+
+    // Walk every element in the document and draw a box for each one isVisible() accepts
+    document.querySelectorAll('*').forEach(element => {
+      if (isVisible(element)) {
+        drawBoundingBoxForElement(element);
+      }
+    });
+
+    // Reschedule for the next frame; this rescans the whole DOM each frame, so it is for debugging only
+    requestAnimationFrame(drawBoundingBoxesLoop);
+  }
+
+  // Kick off the loop
+  drawBoundingBoxesLoop();
+}
+
+/**
+ * Draw a red outline overlay matching the given element's bounding box.
+ * Reads client (viewport) coordinates and offsets them by the scroll position.
+ * @param {HTMLElement} element - The DOM element to draw the bounding box for.
+ */
+function drawBoundingBoxForElement(element) {
+  const { top, left, bottom, right } = element.getBoundingClientRect();
+
+  // Create and style the overlay to represent the bounding box
+  let bboxOverlay = document.createElement('div');
+  bboxOverlay.style.position = 'absolute';
+  bboxOverlay.style.border = '2px solid red';
+  bboxOverlay.style.top = `${top + window.scrollY}px`;  // Adjust for scrolling
+  bboxOverlay.style.left = `${left + window.scrollX}px`;  // Adjust for scrolling
+  bboxOverlay.style.width = `${right - left}px`;
+  bboxOverlay.style.height = `${bottom - top}px`;
+  bboxOverlay.style.pointerEvents = 'none';  // Prevent interference with normal element interactions
+  bboxOverlay.style.zIndex = '9999';  // Ensure it's drawn on top
+  bboxOverlay.setAttribute('data-debug-bbox', element.getAttribute('data-id') || '');  // Marker attribute that cleanUpBoundingBoxes selects on
+
+  // Append the overlay to the body
+  document.body.appendChild(bboxOverlay);
+}
+
+/**
+ * Remove every overlay carrying the data-debug-bbox attribute so stale boxes do not linger between redraws.
+ */
+function cleanUpBoundingBoxes() {
+  document.querySelectorAll('[data-debug-bbox]').forEach(overlay => overlay.remove());
+}
diff --git a/openadapt/browser.py b/openadapt/browser.py
@@ -92,7 +92,7 @@ def set_browser_mode(
     websocket.send(message)
 
 
-def add_screen_tlbr(browser_events: list[models.BrowserEvent]) -> None:
+def add_screen_tlbr(browser_events: list[models.BrowserEvent], target_element_only: bool = False) -> None:
     """Computes and adds the 'data-tlbr-screen' attribute for each element.
 
     Uses coordMappings provided by JavaScript events. If 'data-tlbr-screen' already
@@ -102,6 +102,8 @@ def add_screen_tlbr(browser_events: list[models.BrowserEvent]) -> None:
 
     Args:
         browser_events (list[models.BrowserEvent]): list of browser events to process.
+        target_element_only (bool): if True, only process the target element. If False,
+            process all elements with the 'data-tlbr-client' property.
     """
     # Initialize variables to store the most recent valid mappings
     latest_valid_x_mappings = None
@@ -115,9 +117,6 @@ def add_screen_tlbr(browser_events: list[models.BrowserEvent]) -> None:
             logger.warning(exc)
             continue
 
-        if not target_element:
-            continue
-
         # Extract coordMappings from the message
         message = event.message
         coord_mappings = message.get("coordMappings", {})
@@ -137,7 +136,7 @@ def add_screen_tlbr(browser_events: list[models.BrowserEvent]) -> None:
             # Reuse the most recent valid mappings
             if latest_valid_x_mappings is None or latest_valid_y_mappings is None:
                 logger.warning(
-                    f"No valid coordinate mappings available for element: {target_id}"
+                    f"No valid coordinate mappings available for element: {target_element.get('id')}"
                 )
                 continue  # No valid mappings available, skip this event
 
@@ -152,50 +151,51 @@ def add_screen_tlbr(browser_events: list[models.BrowserEvent]) -> None:
             y_mappings["client"], y_mappings["screen"]
         )
 
-        # Only process the screen coordinates
-        tlbr_attr = "data-tlbr-screen"
-        try:
-            # Get existing screen coordinates if present
-            existing_screen_coords = (
-                target_element[tlbr_attr] if tlbr_attr in target_element.attrs else None
-            )
-        except KeyError:
-            existing_screen_coords = None
+        # Define function to process element
+        def process_element(element):
+            # Compute the 'data-tlbr-screen' attribute
+            tlbr_attr = "data-tlbr-screen"
+            existing_screen_coords = element.get(tlbr_attr, None)
 
-        # Compute screen coordinates
-        client_coords = target_element.get("data-tlbr-client")
-        if not client_coords:
-            logger.warning(
-                f"Missing client coordinates for element with id: {target_id}"
+            client_coords = element.get("data-tlbr-client")
+            if not client_coords:
+                logger.warning(f"Missing client coordinates for element with id: {element.get('id')}")
+                return
+
+            # Extract client coordinates
+            client_top, client_left, client_bottom, client_right = map(
+                float, client_coords.split(",")
             )
-            continue
 
-        # Extract client coordinates
-        client_top, client_left, client_bottom, client_right = map(
-            float, client_coords.split(",")
-        )
+            # Calculate screen coordinates using the computed scale and offset
+            screen_top = sy_scale * client_top + sy_offset
+            screen_left = sx_scale * client_left + sx_offset
+            screen_bottom = sy_scale * client_bottom + sy_offset
+            screen_right = sx_scale * client_right + sx_offset
+
+            # New computed screen coordinates
+            new_screen_coords = f"{screen_top},{screen_left},{screen_bottom},{screen_right}"
+            logger.info(f"{client_coords=} {existing_screen_coords=} {new_screen_coords=}")
+
+            # Check for existing data-tlbr-screen attribute
+            if existing_screen_coords:
+                assert existing_screen_coords == new_screen_coords, (
+                    "Mismatch in computed and existing screen coordinates:"
+                    f" {existing_screen_coords} != {new_screen_coords}"
+                )
 
-        # Calculate screen coordinates using the computed scale and offset
-        screen_top = sy_scale * client_top + sy_offset
-        screen_left = sx_scale * client_left + sx_offset
-        screen_bottom = sy_scale * client_bottom + sy_offset
-        screen_right = sx_scale * client_right + sx_offset
-
-        # New computed screen coordinates
-        new_screen_coords = f"{screen_top},{screen_left},{screen_bottom},{screen_right}"
-        logger.info(f"{client_coords=} {existing_screen_coords=} {new_screen_coords=}")
-
-        # Check for existing data-tlbr-screen attribute
-        if existing_screen_coords:
-            assert existing_screen_coords == new_screen_coords, (
-                "Mismatch in computed and existing screen coordinates:"
-                f" {existing_screen_coords} != {new_screen_coords}"
-            )
+            # Update the attribute with the new value
+            element["data-tlbr-screen"] = new_screen_coords
 
-        # Update the attribute with the new value
-        target_element["data-tlbr-screen"] = new_screen_coords
+        # Process elements based on target_element_only flag
+        if target_element_only:
+            process_element(target_element)
+        else:
+            elements_to_process = soup.find_all(attrs={"data-tlbr-client": True})
+            for element in elements_to_process:
+                process_element(element)
 
-        # Write the updated element back to the message
+        # Write the updated elements back to the message
         message["visibleHTMLString"] = str(soup)
 
     logger.info("Finished processing all browser events for screen coordinates.")

diff --git a/openadapt/models.py b/openadapt/models.py
@@ -221,7 +221,9 @@ def available_segment_descriptions(self, value: list[str]) -> None:
         )
 
     @property
-    def active_browser_element(self) -> BeautifulSoup:
+    def active_browser_element(self) -> BeautifulSoup | None:
+        if not self._active_browser_element:
+            return None
         return utils.parse_html(self._active_browser_element)
 
     @active_browser_element.setter
@@ -232,10 +234,12 @@ def active_browser_element(self, value: BeautifulSoup) -> None:
         self._active_browser_element = str(value)
 
     @property
-    def available_browser_elements(self) -> BeautifulSoup:
+    def available_browser_elements(self) -> BeautifulSoup | None:
         # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#navigating-the-tree
         # The value True matches every tag it can. This code finds all the tags in the
         # document, but none of the text strings
+        if not self._available_browser_elements:
+            return None
         return utils.parse_html(self._available_browser_elements)
 
     @available_browser_elements.setter
@@ -535,6 +539,8 @@ def to_prompt_dict(self) -> dict[str, Any]:
         Returns:
             dictionary containing relevant properties from the ActionEvent.
         """
+        if self.active_browser_element:
+            import ipdb; ipdb.set_trace()
         action_dict = deepcopy(
             {
                 key: val
@@ -550,10 +556,22 @@ def to_prompt_dict(self) -> dict[str, Any]:
             for key in ("mouse_x", "mouse_y", "mouse_dx", "mouse_dy"):
                 if key in action_dict:
                     del action_dict[key]
+        # TODO XXX: add target_segment_description?
+
+        # Manually add properties to the dictionary
         if self.available_segment_descriptions:
             action_dict["available_segment_descriptions"] = (
                 self.available_segment_descriptions
             )
+        if self.active_browser_element:
+            action_dict["active_browser_element"] = str(self.active_browser_element)
+        if self.available_browser_elements:
+            # TODO XXX: available browser_elements contains raw HTML. We need to
+            # prompt to convert into descriptions.
+            action_dict["available_browser_elements"] = str(self.available_browser_elements)
+
+        if self.active_browser_element:
+            import ipdb; ipdb.set_trace()
         return action_dict
 
 
@@ -725,8 +743,7 @@ def parse(self) -> tuple[BeautifulSoup, BeautifulSoup | None]:
         """Parses the visible HTML and optionally extracts the target element.
 
         This method processes the browser event to parse the visible HTML and,
-        if the event type is "click", extracts the target HTML element that was
-        clicked.
+        if the event has a targetId, extracts the target HTML element.
 
         Returns:
             A tuple containing:
@@ -743,14 +760,14 @@ def parse(self) -> tuple[BeautifulSoup, BeautifulSoup | None]:
         assert visible_html_string, "Cannot parse without visibleHTMLstring"
 
         # Parse the visible HTML using BeautifulSoup
-        soup = BeautifulSoup(visible_html_string, "html.parser")
+        soup = utils.parse_html(visible_html_string)
 
         event_type = message.get("eventType")
         target_element = None
 
-        if event_type == "click":
-            # Fetch the target element using its data-id
-            target_id = message.get("targetId")
+        # Fetch the target element using its data-id
+        target_id = message.get("targetId")
+        if target_id:
             target_element = soup.find(attrs={"data-id": target_id})
             assert target_element, f"No target element found for targetId: {target_id}"
 

diff --git a/openadapt/strategies/__init__.py b/openadapt/strategies/__init__.py
@@ -6,6 +6,7 @@
 
 from openadapt.strategies.base import BaseReplayStrategy
 from openadapt.strategies.browser import BrowserReplayStrategy
+from openadapt.strategies.visual_browser import VisualBrowserReplayStrategy
 
 # disabled because importing is expensive
 # from openadapt.strategies.demo import DemoReplayStrategy