Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 119 additions & 1 deletion openadapt_evals/adapters/waa/live.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,7 @@ class WAALiveConfig:
waa_image_version: str | None = None
strict_setup_readiness: bool = False
setup_readiness_retries: int = 3
focus_check_method: str = "win32" # "win32", "a11y", or "both"


class WAALiveAdapter(BenchmarkAdapter):
Expand Down Expand Up @@ -1997,7 +1998,7 @@ def _try_activate_patterns(
)
if resp.status_code == 200:
time.sleep(0.5)
if self._check_foreground_matches(patterns, requests_module):
if self._check_foreground_dispatch(patterns, requests_module):
logger.info(
"Post-setup focus: activated '%s' on attempt %d",
pattern,
Expand All @@ -2022,6 +2023,123 @@ def _try_activate_patterns(
time.sleep(delay)
return False

# Known-bad foreground window titles that indicate the app is not ready.
# Entries are lower-case substrings: _check_foreground_win32 compares them
# against the lower-cased foreground title and reports a mismatch as soon as
# one is contained in it, before any expected pattern is consulted.
_BAD_FOREGROUND_TITLES = [
"document recovery",
"libreoffice start center",
]

def _check_foreground_win32(self, patterns: list[str]) -> bool:
    """Check the foreground window title using the Win32 API.

    Runs a minimal PowerShell script that calls ``GetForegroundWindow()``
    and ``GetWindowText()`` via P/Invoke, takes the last non-empty line of
    the output as the window title, stores it on
    ``self._last_foreground_title``, and checks whether it contains any of
    the expected patterns (case-insensitive substring match).

    A blank title, or a title containing one of
    ``self._BAD_FOREGROUND_TITLES``, is reported as a mismatch before any
    pattern is consulted. Any ``Exception`` raised along the way is logged
    at debug level and likewise reported as a mismatch.

    Args:
        patterns: Window title substrings to match (case-insensitive).
            Empty entries are ignored, because an empty substring is
            contained in every title and would always "match".

    Returns:
        True if the foreground window title contains any non-empty
        pattern; False otherwise (including on any failure).
    """
    # The script lines are deliberately left unindented: PowerShell only
    # recognises the here-string terminator ``"@`` at the start of a line.
    script = r"""
Add-Type -TypeDefinition @"
using System;
using System.Runtime.InteropServices;
using System.Text;
public static class FgWin {
[DllImport("user32.dll")] public static extern IntPtr GetForegroundWindow();
[DllImport("user32.dll", CharSet=CharSet.Unicode)]
public static extern int GetWindowText(IntPtr hWnd, StringBuilder text, int count);
public static string GetTitle() {
var sb = new StringBuilder(512);
GetWindowText(GetForegroundWindow(), sb, sb.Capacity);
return sb.ToString();
}
}
"@
[FgWin]::GetTitle()
"""
    try:
        output = self.run_powershell(script).strip()
        # Take the last non-empty line (PowerShell may emit warnings
        # before the actual title).
        title = next(
            (
                stripped
                for stripped in (
                    raw.strip() for raw in reversed(output.splitlines())
                )
                if stripped
            ),
            "",
        )

        self._last_foreground_title = title

        # Detect known-bad foreground states.
        if not title:
            logger.warning(
                "Win32 foreground check: window title is empty/blank"
            )
            return False

        title_lower = title.lower()
        for bad in self._BAD_FOREGROUND_TITLES:
            if bad in title_lower:
                logger.warning(
                    "Win32 foreground check: detected known-bad title '%s'",
                    title[:120],
                )
                return False

        for pattern in patterns:
            # Skip empty/None patterns: "" is a substring of every title
            # and would turn this check into an unconditional success.
            if pattern and pattern.lower() in title_lower:
                logger.debug(
                    "Win32 foreground check: matched '%s' in '%s'",
                    pattern,
                    title[:100],
                )
                return True
        logger.debug(
            "Win32 foreground check: no pattern matched in '%s'",
            title[:100],
        )
    except Exception as e:
        # Best-effort probe: a failed check is treated as "not focused"
        # so the caller's retry logic can take over.
        logger.debug("Win32 foreground check failed: %s", e)
    return False

def _check_foreground_dispatch(
    self,
    patterns: list[str],
    requests_module: Any,
) -> bool:
    """Run the foreground check selected by ``config.focus_check_method``.

    ``"win32"`` uses the Win32 title check, ``"a11y"`` uses
    ``_check_foreground_matches``, and ``"both"`` tries Win32 first and
    only consults a11y when Win32 reports no match. Any other value logs
    a warning and is handled like ``"win32"``.

    Args:
        patterns: Window title substrings to match (case-insensitive).
        requests_module: The ``requests`` module, forwarded to the a11y
            check when that path is taken.

    Returns:
        True if the selected check reports that the foreground window
        matches one of the patterns.
    """
    selected = self.config.focus_check_method

    if selected == "a11y":
        return self._check_foreground_matches(patterns, requests_module)

    if selected == "both":
        # Fast Win32 probe first; a11y is only the fallback.
        if self._check_foreground_win32(patterns):
            return True
        logger.debug("Win32 foreground check negative; falling back to a11y")
        return self._check_foreground_matches(patterns, requests_module)

    if selected != "win32":
        # Unrecognised setting: warn, then behave as the default method.
        logger.warning(
            "Unknown focus_check_method '%s'; defaulting to win32",
            selected,
        )

    return self._check_foreground_win32(patterns)

def _check_foreground_matches(
self,
patterns: list[str],
Expand Down
9 changes: 9 additions & 0 deletions openadapt_evals/benchmarks/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,7 @@ def cmd_run(args: argparse.Namespace) -> int:
clean_desktop=getattr(args, "clean_desktop", False),
force_tray_icons=getattr(args, "force_tray_icons", False),
waa_image_version=getattr(args, "waa_image_version", None),
focus_check_method=getattr(args, "focus_check_method", "win32"),
)
adapter = WAALiveAdapter(config)

Expand Down Expand Up @@ -551,6 +552,7 @@ def cmd_live(args: argparse.Namespace) -> int:
clean_desktop=getattr(args, "clean_desktop", False),
force_tray_icons=getattr(args, "force_tray_icons", False),
waa_image_version=getattr(args, "waa_image_version", None),
focus_check_method=getattr(args, "focus_check_method", "win32"),
)
adapter = WAALiveAdapter(config)

Expand Down Expand Up @@ -961,6 +963,7 @@ def patch_evaluate_endpoint() -> bool:
clean_desktop=getattr(args, "clean_desktop", False),
force_tray_icons=getattr(args, "force_tray_icons", False),
waa_image_version=getattr(args, "waa_image_version", None),
focus_check_method=getattr(args, "focus_check_method", "win32"),
)
)

Expand Down Expand Up @@ -2426,6 +2429,9 @@ def main() -> int:
help="Max times to override premature 'done' (default: 3)")
run_parser.add_argument("--done-gate-threshold", type=float, default=1.0,
help="Minimum score to accept 'done' (default: 1.0)")
run_parser.add_argument("--focus-check-method", type=str, default="win32",
choices=["win32", "a11y", "both"],
help="Method for foreground window check: win32 (fast, default), a11y, or both")

# Live evaluation (full control)
live_parser = subparsers.add_parser("live", help="Run live evaluation against WAA server (full control)")
Expand Down Expand Up @@ -2460,6 +2466,9 @@ def main() -> int:
help="Max times to override premature 'done' (default: 3)")
live_parser.add_argument("--done-gate-threshold", type=float, default=1.0,
help="Minimum score to accept 'done' (default: 1.0)")
live_parser.add_argument("--focus-check-method", type=str, default="win32",
choices=["win32", "a11y", "both"],
help="Method for foreground window check: win32 (fast, default), a11y, or both")

# Probe server
probe_parser = subparsers.add_parser("probe", help="Check if WAA server is reachable")
Expand Down
51 changes: 21 additions & 30 deletions tests/test_setup_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -597,9 +597,9 @@ def test_calls_activate_window_with_patterns(self):
adapter = self._make_adapter()

# Simulate activate_window success (/setup) and foreground check success
# via accessibility endpoint (/accessibility).
# via win32 API (default focus_check_method).
setup_calls = []
a11y_calls = []
ps_calls = []

def _fake_post(url, **kwargs):
setup_calls.append(url)
Expand All @@ -609,30 +609,27 @@ def _fake_post(url, **kwargs):
resp.text = '{"results": [{"type": "activate_window", "status": "ok"}]}'
return resp

def _fake_get(url, **kwargs):
a11y_calls.append(url)
resp = MagicMock()
resp.status_code = 200
resp.json.return_value = {"AT": {"name": "LibreOffice Calc - data.xlsx"}}
return resp
def _fake_powershell(script, **kwargs):
ps_calls.append(script)
return "LibreOffice Calc - data.xlsx"

with patch("requests.post", side_effect=_fake_post), \
patch("requests.get", side_effect=_fake_get), \
patch.object(adapter, "run_powershell", side_effect=_fake_powershell), \
patch("time.sleep"):
adapter._ensure_app_focused({
"related_apps": ["libreoffice_calc"],
})

# Should have called activate_window at least once and check at least once
# Should have called activate_window at least once and win32 check at least once
assert len(setup_calls) >= 1
assert len(a11y_calls) >= 1
assert len(ps_calls) >= 1

def test_retries_on_foreground_mismatch(self):
"""Retries when foreground check does not match expected pattern."""
adapter = self._make_adapter()

setup_calls = []
a11y_calls = []
ps_calls = []

def _fake_post(url, **kwargs):
setup_calls.append(url)
Expand All @@ -642,24 +639,21 @@ def _fake_post(url, **kwargs):
resp.text = '{"results": [{"type": "activate_window", "status": "ok"}]}'
return resp

def _fake_get(url, **kwargs):
a11y_calls.append(url)
resp = MagicMock()
resp.status_code = 200
def _fake_powershell(script, **kwargs):
ps_calls.append(script)
# Always report desktop as foreground (mismatch).
resp.json.return_value = {"AT": {"name": "Desktop"}}
return resp
return "Desktop"

with patch("requests.post", side_effect=_fake_post), \
patch("requests.get", side_effect=_fake_get), \
patch.object(adapter, "run_powershell", side_effect=_fake_powershell), \
patch("time.sleep"):
adapter._ensure_app_focused({
"related_apps": ["notepad"],
})

# Should have tried multiple setup activations and foreground checks.
# Should have tried multiple setup activations and win32 foreground checks.
assert len(setup_calls) >= 3 # At least 3 retry attempts
assert len(a11y_calls) >= 3 # At least 3 foreground checks
assert len(ps_calls) >= 3 # At least 3 foreground checks

def test_succeeds_on_second_attempt(self):
"""If first attempt fails but second succeeds, returns after second."""
Expand All @@ -674,20 +668,17 @@ def _fake_post(url, **kwargs):
resp.text = '{"results": []}'
return resp

def _fake_get(url, **kwargs):
def _fake_powershell(script, **kwargs):
attempt_count[0] += 1
resp = MagicMock()
resp.status_code = 200
if attempt_count[0] <= 2:
# First attempt: wrong window
resp.json.return_value = {"AT": {"name": "Desktop"}}
# First attempts: wrong window
return "Desktop"
else:
# Second attempt: correct window
resp.json.return_value = {"AT": {"name": "LibreOffice Calc"}}
return resp
# Later attempt: correct window
return "LibreOffice Calc"

with patch("requests.post", side_effect=_fake_post), \
patch("requests.get", side_effect=_fake_get), \
patch.object(adapter, "run_powershell", side_effect=_fake_powershell), \
patch("time.sleep"):
adapter._ensure_app_focused({
"related_apps": ["libreoffice_calc"],
Expand Down