Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .beads/issues.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@
{"id":"openadapt-evals-hvm","title":"VL model fix PR #18 ready to merge","notes":"2026-02-08: openadapt-ml PR #18 was already merged on 2026-01-29. VL model fix is done.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-29T16:17:03.491938-05:00","created_by":"Richard Abrich","updated_at":"2026-02-08T12:55:19.233249-05:00","closed_at":"2026-02-08T12:55:19.233249-05:00","close_reason":"PR #18 already merged 2026-01-29"}
{"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","notes":"wright repo (OpenAdaptAI/wright) scaffolding underway. Herald + consilium repos transferred to OpenAdaptAI org. Wright will be the orchestration layer for eval pipeline.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-03-02T00:08:08.422633-05:00"}
{"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"}
{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-06: Core4 Trial 1 launched with --controller --done-gate --max-steps 30 (first ever run with both features). Prior 7 trials showed DC=14% vs ZS=18% — no lift. Root causes: (1) --controller was NEVER used, (2) no done-gate existed. PRs merged this session: #107 (readiness), #109 (core4 lane), #110 (done-gate). Results will be in benchmark_results/repeat_core4_trial1_20260306_154155/","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-06T15:42:14.015601-05:00"}
{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-06: Core4 Trial 1 launched with --controller --done-gate --max-steps 30 (first ever run with both features). Prior 7 trials showed DC=14% vs ZS=18% — no lift. Root causes: (1) --controller was NEVER used, (2) no done-gate existed. PRs merged this session: #107 (readiness), #109 (core4 lane), #110 (done-gate). Results will be in benchmark_results/repeat_core4_trial1_20260306_154155/","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-07T01:44:43.380289-05:00"}
{"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"}
14 changes: 7 additions & 7 deletions openadapt_evals/benchmarks/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2252,7 +2252,7 @@ def cmd_eval_suite(args: argparse.Namespace) -> int:
tunnel_mgr = SSHTunnelManager()
tunnel_mgr.start_tunnels_for_vm(vm_ip=worker_ip)
server_url = "http://localhost:5001"
evaluate_url = "http://localhost:5050"
evaluate_url = None # use server_url for /evaluate
# Give tunnels a moment to establish
import time
time.sleep(3)
Expand Down Expand Up @@ -2389,8 +2389,8 @@ def main() -> int:
)
run_parser.add_argument("--server", type=str, default="http://localhost:5001",
help="WAA server URL (default: localhost:5001 for SSH tunnel)")
run_parser.add_argument("--evaluate-url", type=str, default="http://localhost:5050",
help="Evaluate server URL (default: localhost:5050)")
run_parser.add_argument("--evaluate-url", type=str, default=None,
help="Evaluate server URL (default: same as --server)")
run_parser.add_argument("--agent", type=str, default="api-openai",
help="Agent type: noop, mock, api-claude, api-openai, api-claude-cu, qwen3vl, smol")
run_parser.add_argument("--task", type=str,
Expand Down Expand Up @@ -2437,8 +2437,8 @@ def main() -> int:
live_parser = subparsers.add_parser("live", help="Run live evaluation against WAA server (full control)")
live_parser.add_argument("--server", type=str, default="http://localhost:5001",
help="WAA server URL (default: localhost:5001 for SSH tunnel)")
live_parser.add_argument("--evaluate-url", type=str, default="http://localhost:5050",
help="Evaluate server URL (default: localhost:5050)")
live_parser.add_argument("--evaluate-url", type=str, default=None,
help="Evaluate server URL (default: same as --server)")
live_parser.add_argument("--agent", type=str, default="mock",
help="Agent type: mock, noop, api-claude, api-openai, api-claude-cu, qwen3vl, smol, retrieval-claude, retrieval-openai")
live_parser.add_argument("--demo", type=str, help="Demo trajectory file for ApiAgent")
Expand Down Expand Up @@ -2791,8 +2791,8 @@ def main() -> int:
help="WAA server URL (used with --no-pool-create)",
)
suite_parser.add_argument(
"--evaluate-url", type=str, default="http://localhost:5050",
help="Evaluate server URL (used with --no-pool-create)",
"--evaluate-url", type=str, default=None,
help="Evaluate server URL (default: same as --server)",
)

args = parser.parse_args()
Expand Down
11 changes: 11 additions & 0 deletions openadapt_evals/benchmarks/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,17 @@ def _run_single_task(
done = True
break

# If evaluate endpoint is unreachable, accept "done"
# rather than forcing the agent to continue pointlessly
if gate_result.error_type == "infrastructure":
logger.warning(
f"Step {steps}: Done-gate skipped — evaluate "
f"returned infrastructure error: {gate_result.reason}. "
"Accepting 'done'."
)
done = True
break

if gate_score >= config.done_gate_threshold:
logger.info(
f"Step {steps}: Done-gate PASSED "
Expand Down
7 changes: 4 additions & 3 deletions scripts/core4_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,13 +79,13 @@ def cmd_run(args: argparse.Namespace) -> int:
str(out),
"--server",
args.server,
"--evaluate-url",
args.evaluate_url,
"--agent",
args.agent,
"--vm-user",
args.vm_user,
]
if args.evaluate_url:
cmd.extend(["--evaluate-url", args.evaluate_url])
if args.vm_ip:
cmd.extend(["--vm-ip", args.vm_ip])
if args.start_from > 0:
Expand Down Expand Up @@ -181,7 +181,8 @@ def build_parser() -> argparse.ArgumentParser:
run.add_argument("--max-steps", type=int, default=15)
run.add_argument("--output-root", default="benchmark_results")
run.add_argument("--server", default="http://localhost:5001")
run.add_argument("--evaluate-url", default="http://localhost:5050")
run.add_argument("--evaluate-url", default=None,
help="Evaluate server URL (default: same as --server)")
run.add_argument("--agent", default="api-claude-cu")
run.add_argument("--vm-ip", default=None)
run.add_argument("--vm-user", default="azureuser")
Expand Down
6 changes: 3 additions & 3 deletions scripts/core4_lane.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,13 @@ def _build_eval_cmd(args: argparse.Namespace, trial: TrialConfig) -> list[str]:
trial.output_arg(),
"--server",
args.server,
"--evaluate-url",
args.evaluate_url,
"--vm-user",
args.vm_user,
"--transport-error-threshold",
str(args.transport_error_threshold),
]
if args.evaluate_url:
cmd.extend(["--evaluate-url", args.evaluate_url])
if args.vm_ip:
cmd.extend(["--vm-ip", args.vm_ip])
if args.controller:
Expand Down Expand Up @@ -181,7 +181,7 @@ def _common_args(parser: argparse.ArgumentParser) -> None:
parser.add_argument("--agent", default="api-openai", help="Agent passed to run_dc_eval")
parser.add_argument("--max-steps", type=int, default=15, help="Max steps per task")
parser.add_argument("--server", default="http://localhost:5001", help="WAA server URL")
parser.add_argument("--evaluate-url", default="http://localhost:5050", help="Evaluate server URL")
parser.add_argument("--evaluate-url", default=None, help="Evaluate server URL (default: same as --server)")
parser.add_argument("--vm-ip", default=None, help="VM IP (optional)")
parser.add_argument("--vm-user", default="azureuser", help="VM SSH user")
parser.add_argument(
Expand Down
53 changes: 7 additions & 46 deletions scripts/run_dc_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ def _start_tunnel(vm_user: str, vm_ip: str) -> bool:
"-o", "TCPKeepAlive=yes",
"-o", "ExitOnForwardFailure=yes",
"-L", "5001:localhost:5000",
"-L", "5050:localhost:5051",
"-L", "8006:localhost:8006",
f"{vm_user}@{vm_ip}",
]
Expand All @@ -69,43 +68,6 @@ def _probe(server: str, timeout: int = 10) -> bool:
return False


def _setup_eval_proxy(vm_user: str, vm_ip: str) -> bool:
    """(Re-)establish socat proxy for the evaluate server on the VM.

    Docker port forwarding for port 5050 is broken due to QEMU's custom
    bridge networking (--cap-add NET_ADMIN). Work around it by restarting
    the socat-waa-evaluate systemd service on the VM host. The service is
    installed during pool creation (see DOCKER_SETUP_SCRIPT in pool.py).
    The SSH tunnel maps local 5050 -> VM 5051.

    Falls back to the legacy nohup socat approach if the systemd service
    is not installed (e.g. on older VMs provisioned before this change).

    Args:
        vm_user: SSH user name on the VM host.
        vm_ip: Address of the VM host to SSH into.

    Returns:
        True if the remote setup script exited with status 0. False if it
        exited non-zero, if the SSH command timed out, or if the ssh
        client could not be launched at all.
    """
    # Try systemd service first (preferred: auto-restarts on failure)
    script = (
        "if systemctl list-unit-files socat-waa-evaluate.service "
        "| grep -q socat-waa-evaluate; then "
        " sudo systemctl restart socat-waa-evaluate.service; "
        "else "
        " killall socat 2>/dev/null || true; sleep 1; "
        " which socat >/dev/null 2>&1 "
        " || sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq socat; "
        " nohup socat TCP-LISTEN:5051,fork,reuseaddr "
        " 'EXEC:docker exec -i winarena socat - TCP\\:127.0.0.1\\:5050' "
        " </dev/null >/dev/null 2>&1 & "
        "fi"
    )
    try:
        result = subprocess.run(
            ["ssh", "-o", "StrictHostKeyChecking=no", f"{vm_user}@{vm_ip}", script],
            capture_output=True, timeout=30,
        )
    except (subprocess.TimeoutExpired, OSError) as exc:
        # A hung SSH session or a missing ssh binary is a setup failure,
        # not a crash: report it through the bool return like any other.
        print(f" socat proxy setup failed: {exc}")
        return False
    if result.returncode != 0:
        # Remote stderr is not guaranteed to be valid UTF-8; never let the
        # error report itself raise.
        stderr_text = result.stderr.decode(errors="replace")
        print(f" socat proxy setup failed: {stderr_text}")
        return False
    print(" socat proxy for evaluate server established (VM:5051 -> container:5050)")
    return True


def _restart_container(vm_user: str, vm_ip: str) -> bool:
"""Restart Windows via QEMU monitor reset, falling back to docker restart.

Expand All @@ -122,8 +84,7 @@ def _restart_container(vm_user: str, vm_ip: str) -> bool:
if mgr.is_qemu_monitor_reachable():
print(" Resetting Windows via QEMU monitor (system_reset)...")
if mgr.reset_windows():
print(" QEMU reset sent, re-establishing evaluate proxy...")
_setup_eval_proxy(vm_user, vm_ip)
print(" QEMU reset sent.")
return True
print(" QEMU reset command failed, falling back to docker restart...")
else:
Expand All @@ -139,8 +100,7 @@ def _restart_container(vm_user: str, vm_ip: str) -> bool:
if result.returncode != 0:
print(f" Container restart failed: {result.stderr.decode()}")
return False
print(" Container restarted, re-establishing evaluate proxy...")
_setup_eval_proxy(vm_user, vm_ip)
print(" Container restarted.")
return True


Expand All @@ -164,11 +124,10 @@ def ensure_waa_ready(
if _probe(server) and (evaluate_url is None or _probe(evaluate_url)):
return True

# Step 2: Reconnect tunnel + ensure socat proxy
# Step 2: Reconnect tunnel
print(" WAA unreachable, reconnecting tunnel...")
_kill_tunnels()
time.sleep(1)
_setup_eval_proxy(vm_user, vm_ip)
if _start_tunnel(vm_user, vm_ip):
time.sleep(3)
if _probe(server) and (evaluate_url is None or _probe(evaluate_url)):
Expand Down Expand Up @@ -210,7 +169,8 @@ def main() -> int:
parser.add_argument("--agent", default="api-claude-cu", help="Agent type")
parser.add_argument("--demo-dir", default="annotated_demos", help="Demo directory")
parser.add_argument("--server", default="http://localhost:5001")
parser.add_argument("--evaluate-url", default="http://localhost:5050")
parser.add_argument("--evaluate-url", default=None,
help="Evaluate server URL (default: same as --server)")
parser.add_argument("--max-steps", type=int, default=15)
parser.add_argument("--output", default="benchmark_results")
parser.add_argument(
Expand Down Expand Up @@ -357,11 +317,12 @@ def main() -> int:
"--agent", args.agent,
"--tasks", tid,
"--server", args.server,
"--evaluate-url", args.evaluate_url,
"--max-steps", str(args.max_steps),
"--output", str(output_dir),
"--run-name", run_name,
]
if args.evaluate_url:
cmd.extend(["--evaluate-url", args.evaluate_url])
if demo_path:
cmd.extend(["--demo", str(demo_path.resolve())])
if args.controller and demo_path:
Expand Down
22 changes: 13 additions & 9 deletions scripts/run_eval_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ def _setup_connectivity(

def _wait_waa_ready(
server: str = "http://localhost:5001",
evaluate_url: str = "http://localhost:5050",
evaluate_url: str | None = None,
timeout: int = 1200,
) -> bool:
"""Wait for WAA server and evaluate server to respond."""
Expand All @@ -381,13 +381,15 @@ def _wait_waa_ready(
except Exception:
waa_ok = False

try:
eval_ok = requests.get(f"{evaluate_url}/probe", timeout=10).ok
except Exception:
eval_ok = False
eval_ok = True # default to True when no separate evaluate URL
if evaluate_url:
try:
eval_ok = requests.get(f"{evaluate_url}/probe", timeout=10).ok
except Exception:
eval_ok = False

if waa_ok and eval_ok:
print(f"[waa] WAA + evaluate server ready after {elapsed}s")
print(f"[waa] WAA server ready after {elapsed}s")
return True
if waa_ok and not eval_ok:
# WAA is up but evaluate isn't — acceptable for ZS-only runs
Expand Down Expand Up @@ -434,7 +436,7 @@ def _run_eval(
conditions: list[tuple[str, str, Path | None]],
agent: str,
server: str,
evaluate_url: str,
evaluate_url: str | None,
max_steps: int,
output_dir: Path,
vm_ip: str,
Expand Down Expand Up @@ -481,11 +483,12 @@ def _run_eval(
"--agent", agent,
"--tasks", tid,
"--server", server,
"--evaluate-url", evaluate_url,
"--max-steps", str(max_steps),
"--output", str(output_dir),
"--run-name", run_name,
]
if evaluate_url:
cmd.extend(["--evaluate-url", evaluate_url])
if clean_desktop:
cmd.append("--clean-desktop")
if force_tray_icons:
Expand Down Expand Up @@ -606,7 +609,8 @@ def build_parser() -> argparse.ArgumentParser:
help="Pinned WAA image version label to record in run metadata",
)
parser.add_argument("--server", default="http://localhost:5001")
parser.add_argument("--evaluate-url", default="http://localhost:5050")
parser.add_argument("--evaluate-url", default=None,
help="Evaluate server URL (default: same as --server)")
parser.add_argument(
"--vm-name", default=DEFAULT_VM_NAME, help="VM name",
)
Expand Down