Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion chrome_extension/background.js
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ function handleServerMessage(data) {
* Handle SET_MODE message from server
*/
function handleSetMode(message) {
const newMode = message.payload?.mode || 'idle';
const newMode = message.mode || message.payload?.mode || 'idle';
currentMode = newMode;
console.log('[OpenAdapt] Mode set to:', currentMode);

Expand Down Expand Up @@ -231,6 +231,7 @@ chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
const tabId = sender.tab?.id;

switch (message.type) {
case 'USER_EVENT':
case 'DOM_EVENT':
// Add tab ID and relay to server
message.tabId = tabId;
Expand Down
41 changes: 31 additions & 10 deletions openadapt_capture/browser_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,22 +18,27 @@


class BrowserEventType(str, Enum):
    """Browser event type identifiers.

    Values match the raw DOM event names sent by the Chrome extension
    content script (e.g. "click", "keydown"), consistent with legacy OpenAdapt.

    Each member is defined exactly once: ``Enum`` raises ``TypeError`` when a
    member name is reused, so the "browser."-prefixed values cannot be kept
    alongside the raw names under the same member names.
    """

    # User interaction events
    CLICK = "click"
    KEYDOWN = "keydown"
    KEYUP = "keyup"
    SCROLL = "scroll"
    INPUT = "input"
    MOUSEMOVE = "mousemove"
    FOCUS = "focus"
    BLUR = "blur"

    # Navigation events
    NAVIGATE = "navigate"

    # Unknown/generic
    UNKNOWN = "unknown"


class NavigationType(str, Enum):
Expand Down Expand Up @@ -242,6 +247,21 @@ class BrowserNavigationEvent(BaseBrowserEvent):
# =============================================================================


class BrowserMouseMoveEvent(BaseBrowserEvent):
    """Pointer movement reported from inside a browser tab.

    Carries the pointer position and, optionally, a reference to the
    element the event targeted.
    """

    type: Literal[BrowserEventType.MOUSEMOVE] = BrowserEventType.MOUSEMOVE

    # Pointer position: the viewport pair has no default, while the
    # screen pair defaults to 0.
    client_x: float = Field(description="Viewport X")
    client_y: float = Field(description="Viewport Y")
    screen_x: float = Field(0, description="Screen X")
    screen_y: float = Field(0, description="Screen Y")

    # Element the event targeted, when one was reported.
    element: SemanticElementRef | None = None


class BrowserFocusEvent(BaseBrowserEvent):
"""Element focus/blur event in browser."""

Expand Down Expand Up @@ -292,5 +312,6 @@ class DOMSnapshot(BaseModel):
| BrowserScrollEvent
| BrowserInputEvent
| BrowserNavigationEvent
| BrowserMouseMoveEvent
| BrowserFocusEvent
)
213 changes: 213 additions & 0 deletions openadapt_capture/capture.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,20 @@
from pathlib import Path
from typing import TYPE_CHECKING, Iterator

from openadapt_capture.browser_events import (
BoundingBox,
BrowserClickEvent,
BrowserEventType,
BrowserFocusEvent,
BrowserInputEvent,
BrowserKeyEvent,
BrowserMouseMoveEvent,
BrowserNavigationEvent,
BrowserScrollEvent,
ElementState,
NavigationType,
SemanticElementRef,
)
from openadapt_capture.events import (
ActionEvent as PydanticActionEvent,
)
Expand All @@ -27,6 +41,8 @@
if TYPE_CHECKING:
from PIL import Image

from openadapt_capture.browser_events import BrowserEvent


def _convert_action_event(db_event) -> PydanticActionEvent | None:
"""Convert a SQLAlchemy ActionEvent to a Pydantic event.
Expand Down Expand Up @@ -99,6 +115,182 @@ def _convert_action_event(db_event) -> PydanticActionEvent | None:
return None


def _parse_element_ref(raw: dict | None) -> SemanticElementRef | None:
    """Parse a raw element dict into a SemanticElementRef.

    Handles field name variations between the content-script format
    (e.g. ``dataId``, ``tagName``, ``classList``) and snake_case alternatives.

    Args:
        raw: Element description taken from a browser event message, or None.

    Returns:
        The parsed element reference, or None when ``raw`` is empty or is
        not a dict.
    """
    if not raw or not isinstance(raw, dict):
        return None

    # A missing, null, or non-dict "bbox" degrades to a zero-sized box rather
    # than raising AttributeError on ``.get`` -- the same tolerance the
    # "state" field gets below.
    bbox_raw = raw.get("bbox")
    if not isinstance(bbox_raw, dict):
        bbox_raw = {}
    bbox = BoundingBox(
        x=bbox_raw.get("x", 0),
        y=bbox_raw.get("y", 0),
        width=bbox_raw.get("width", 0),
        height=bbox_raw.get("height", 0),
    )

    state_raw = raw.get("state", {})
    if isinstance(state_raw, dict):
        state = ElementState(
            enabled=state_raw.get("enabled", True),
            focused=state_raw.get("focused", False),
            visible=state_raw.get("visible", True),
            checked=state_raw.get("checked"),
            selected=state_raw.get("selected"),
            expanded=state_raw.get("expanded"),
            value=state_raw.get("value"),
        )
    else:
        # Non-dict state: fall back to the model's own defaults.
        state = ElementState()

    return SemanticElementRef(
        role=raw.get("role") or "",
        name=raw.get("name") or "",
        bbox=bbox,
        # "dataId" stands in for the xpath when no xpath was supplied.
        xpath=raw.get("xpath") or raw.get("dataId") or "",
        css_selector=raw.get("cssSelector") or raw.get("css_selector") or "",
        state=state,
        tag_name=raw.get("tagName") or raw.get("tag_name") or "",
        id=raw.get("id"),
        class_list=raw.get("classList") or raw.get("class_list") or [],
    )


def _convert_browser_event(db_event) -> "BrowserEvent | None":
    """Convert a SQLAlchemy BrowserEvent to a typed Pydantic browser event.

    The DB stores browser events as JSON in the `message` field. The recorder
    wraps each raw WebSocket message as ``{"message": <raw_event>}``.

    Handles both flat (content-script) and payload-wrapped message formats.

    Args:
        db_event: SQLAlchemy BrowserEvent instance.

    Returns:
        Typed browser event, or None if the message is malformed, has an
        unrecognised ``eventType``, lacks a required element, has a type
        with no dedicated model (e.g. ``UNKNOWN``), or fails model
        construction.
    """
    msg = db_event.message
    if not isinstance(msg, dict):
        return None

    # Unwrap the recorder's {"message": <raw>} wrapper
    inner = msg.get("message", msg)
    if not isinstance(inner, dict):
        return None

    # Support both flat (content-script) and payload-wrapped (browser_bridge) formats
    payload = inner.get("payload", inner)
    if not isinstance(payload, dict):
        # A null or scalar "payload" cannot be read with .get(); treat it as
        # malformed like the guards above instead of raising AttributeError.
        return None

    raw_type = payload.get("eventType", inner.get("eventType", ""))
    try:
        event_type = BrowserEventType(raw_type)
    except ValueError:
        return None

    timestamp = db_event.timestamp or 0
    url = payload.get("url", inner.get("url", ""))
    tab_id = inner.get("tabId", payload.get("tab_id", 0))

    try:
        if event_type == BrowserEventType.CLICK:
            elem = _parse_element_ref(payload.get("element"))
            if elem is None:
                return None
            return BrowserClickEvent(
                timestamp=timestamp,
                url=url,
                tab_id=tab_id,
                client_x=payload.get("clientX", 0),
                client_y=payload.get("clientY", 0),
                # Page coordinates fall back to the viewport coordinates.
                page_x=payload.get("pageX", payload.get("clientX", 0)),
                page_y=payload.get("pageY", payload.get("clientY", 0)),
                button=payload.get("button", 0),
                click_count=payload.get("clickCount", 1),
                element=elem,
            )
        elif event_type in (BrowserEventType.KEYDOWN, BrowserEventType.KEYUP):
            # The element is optional for key events.
            element = _parse_element_ref(payload.get("element"))
            return BrowserKeyEvent(
                timestamp=timestamp,
                type=event_type,
                url=url,
                tab_id=tab_id,
                key=payload.get("key", ""),
                code=payload.get("code", ""),
                key_code=payload.get("keyCode", 0),
                shift_key=payload.get("shiftKey", False),
                ctrl_key=payload.get("ctrlKey", False),
                alt_key=payload.get("altKey", False),
                meta_key=payload.get("metaKey", False),
                element=element,
            )
        elif event_type == BrowserEventType.SCROLL:
            return BrowserScrollEvent(
                timestamp=timestamp,
                url=url,
                tab_id=tab_id,
                scroll_x=payload.get("scrollX", 0),
                scroll_y=payload.get("scrollY", 0),
                delta_x=payload.get("deltaX", payload.get("scrollDeltaX", 0)),
                delta_y=payload.get("deltaY", payload.get("scrollDeltaY", 0)),
            )
        elif event_type == BrowserEventType.INPUT:
            elem = _parse_element_ref(payload.get("element"))
            if elem is None:
                return None
            return BrowserInputEvent(
                timestamp=timestamp,
                url=url,
                tab_id=tab_id,
                input_type=payload.get("inputType", ""),
                data=payload.get("data"),
                value=payload.get("value", ""),
                element=elem,
            )
        elif event_type == BrowserEventType.NAVIGATE:
            nav_type = payload.get("navigationType", "link")
            valid = [e.value for e in NavigationType]
            return BrowserNavigationEvent(
                timestamp=timestamp,
                url=url,
                tab_id=tab_id,
                previous_url=payload.get("previousUrl", ""),
                # Unrecognised navigation types are recorded as LINK.
                navigation_type=(
                    NavigationType(nav_type)
                    if nav_type in valid
                    else NavigationType.LINK
                ),
            )
        elif event_type == BrowserEventType.MOUSEMOVE:
            # The element is optional for mouse moves.
            element = _parse_element_ref(payload.get("element"))
            return BrowserMouseMoveEvent(
                timestamp=timestamp,
                url=url,
                tab_id=tab_id,
                client_x=payload.get("clientX", 0),
                client_y=payload.get("clientY", 0),
                screen_x=payload.get("screenX", 0),
                screen_y=payload.get("screenY", 0),
                element=element,
            )
        elif event_type in (BrowserEventType.FOCUS, BrowserEventType.BLUR):
            elem = _parse_element_ref(payload.get("element"))
            if elem is None:
                return None
            return BrowserFocusEvent(
                timestamp=timestamp,
                type=event_type,
                url=url,
                tab_id=tab_id,
                element=elem,
            )
    except Exception as e:
        # Best-effort parsing: a malformed event is dropped (and logged at
        # debug level) rather than aborting the caller's iteration.
        import logging
        logging.getLogger(__name__).debug("Failed to parse browser event: %s", e)
        return None

    # Recognised event types without a dedicated model (e.g. UNKNOWN).
    return None


@dataclass
class Action:
"""A processed action event with associated screenshot.
Expand Down Expand Up @@ -385,6 +577,27 @@ def actions(self, include_moves: bool = False) -> Iterator[Action]:
continue
yield Action(event=event, _capture=self)

def browser_events(self) -> list["BrowserEvent"]:
    """Return the recording's browser events as typed Pydantic models.

    Every stored BrowserEvent is passed through ``_convert_browser_event``;
    rows for which it returns ``None`` are left out of the result.

    Returns:
        List of typed browser events, in the order the recording yields
        its stored rows.
    """
    converted = (
        _convert_browser_event(row) for row in self._recording.browser_events
    )
    return [event for event in converted if event is not None]

@property
def browser_event_count(self) -> int:
    """Count of browser event rows stored for this capture."""
    stored = self._recording.browser_events
    return len(stored)

def get_frame_at(self, timestamp: float, tolerance: float = 0.5) -> "Image" | None:
"""Get the screen frame closest to a timestamp.

Expand Down
17 changes: 17 additions & 0 deletions openadapt_capture/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def record(
video: bool = True,
audio: bool = False,
images: bool = False,
browser_events: bool = False,
send_profile: bool = False,
) -> None:
"""Record GUI interactions.
Expand All @@ -27,6 +28,9 @@ def record(
video: Capture video (default: True).
audio: Capture audio (default: False).
images: Save screenshots as PNGs (default: False).
browser_events: Capture browser DOM events via Chrome extension (default: False).
Requires the openadapt-capture Chrome extension to be installed and
connects via WebSocket on localhost:8765.
send_profile: Send profiling data via wormhole after recording (default: False).
"""
import time
Expand All @@ -36,6 +40,9 @@ def record(
output_dir = str(Path(output_dir).resolve())

print(f"Recording to: {output_dir}")
if browser_events:
print("Browser event capture enabled (WebSocket on localhost:8765)")
print("Make sure the openadapt-capture Chrome extension is installed.")
print("Press Ctrl+C or type stop sequence to stop recording...")
print()

Expand All @@ -45,6 +52,7 @@ def record(
capture_video=video,
capture_audio=audio,
capture_images=images,
capture_browser_events=browser_events,
send_profile=send_profile,
) as recorder:
recorder.wait_for_ready()
Expand Down Expand Up @@ -129,6 +137,7 @@ def info(capture_dir: str) -> None:
# Count events
actions = list(capture.actions())
print(f"Actions: {len(actions)}")
print(f"Browser events: {capture.browser_event_count}")

# Event type breakdown
from collections import Counter
Expand All @@ -138,6 +147,14 @@ def info(capture_dir: str) -> None:
for event_type, count in types.most_common():
print(f" {event_type}: {count}")

# Browser event breakdown
if capture.browser_event_count > 0:
browser_events = capture.browser_events()
btypes = Counter(type(e).__name__ for e in browser_events)
print("Browser event types:")
for btype, count in btypes.most_common():
print(f" {btype}: {count}")


def transcribe(
capture_dir: str,
Expand Down
Loading