diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index 79f6ccb..9655e64 100644 --- a/.beads/issues.jsonl +++ b/.beads/issues.jsonl @@ -13,5 +13,5 @@ {"id":"openadapt-evals-hvm","title":"VL model fix PR #18 ready to merge","notes":"2026-02-08: openadapt-ml PR #18 was already merged on 2026-01-29. VL model fix is done.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-29T16:17:03.491938-05:00","created_by":"Richard Abrich","updated_at":"2026-02-08T12:55:19.233249-05:00","closed_at":"2026-02-08T12:55:19.233249-05:00","close_reason":"PR #18 already merged 2026-01-29"} {"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","notes":"wright repo (OpenAdaptAI/wright) scaffolding underway. Herald + consilium repos transferred to OpenAdaptAI org. Wright will be the orchestration layer for eval pipeline.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-03-02T00:08:08.422633-05:00"} {"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"} -{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-06: Core4 Trial 1 launched with --controller --done-gate --max-steps 30 (first ever run with both features). Prior 7 trials showed DC=14% vs ZS=18% — no lift. Root causes: (1) --controller was NEVER used, (2) no done-gate existed. PRs merged this session: #107 (readiness), #109 (core4 lane), #110 (done-gate). Results will be in benchmark_results/repeat_core4_trial1_20260306_154155/","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-06T15:42:14.015601-05:00"} +{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-06: Core4 Trial 1 launched with --controller --done-gate --max-steps 30 (first ever run with both features). Prior 7 trials showed DC=14% vs ZS=18% — no lift. Root causes: (1) --controller was NEVER used, (2) no done-gate existed. PRs merged this session: #107 (readiness), #109 (core4 lane), #110 (done-gate). Results will be in benchmark_results/repeat_core4_trial1_20260306_154155/","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-07T01:44:43.380289-05:00"} {"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"} diff --git a/openadapt_evals/benchmarks/cli.py b/openadapt_evals/benchmarks/cli.py index d0dc994..3f1d604 100644 --- a/openadapt_evals/benchmarks/cli.py +++ b/openadapt_evals/benchmarks/cli.py @@ -2252,7 +2252,7 @@ def cmd_eval_suite(args: argparse.Namespace) -> int: tunnel_mgr = SSHTunnelManager() tunnel_mgr.start_tunnels_for_vm(vm_ip=worker_ip) server_url = "http://localhost:5001" - evaluate_url = "http://localhost:5050" + evaluate_url = None # use server_url for /evaluate # Give tunnels a moment to establish import time time.sleep(3) @@ -2389,8 +2389,8 @@ def main() -> int: ) run_parser.add_argument("--server", type=str, default="http://localhost:5001", help="WAA server URL (default: localhost:5001 for SSH tunnel)") - run_parser.add_argument("--evaluate-url", type=str, default="http://localhost:5050", - help="Evaluate server URL (default: localhost:5050)") + run_parser.add_argument("--evaluate-url", type=str, default=None, + help="Evaluate server URL (default: same as --server)") run_parser.add_argument("--agent", type=str, default="api-openai", help="Agent type: noop, mock, api-claude, api-openai, api-claude-cu, qwen3vl, smol") run_parser.add_argument("--task", type=str, @@ -2437,8 +2437,8 @@ def main() -> int: live_parser = subparsers.add_parser("live", help="Run live evaluation against WAA server (full control)") live_parser.add_argument("--server", type=str, default="http://localhost:5001", help="WAA server URL (default: localhost:5001 for SSH tunnel)") - live_parser.add_argument("--evaluate-url", type=str, default="http://localhost:5050", - help="Evaluate server URL (default: localhost:5050)") + live_parser.add_argument("--evaluate-url", type=str, default=None, + help="Evaluate server URL (default: same as --server)") live_parser.add_argument("--agent", type=str, default="mock", help="Agent type: mock, noop, api-claude, api-openai, api-claude-cu, qwen3vl, smol, retrieval-claude, retrieval-openai") live_parser.add_argument("--demo", type=str, help="Demo trajectory file for ApiAgent") @@ -2791,8 +2791,8 @@ def main() -> int: help="WAA server URL (used with --no-pool-create)", ) suite_parser.add_argument( - "--evaluate-url", type=str, default="http://localhost:5050", - help="Evaluate server URL (used with --no-pool-create)", + "--evaluate-url", type=str, default=None, + help="Evaluate server URL (default: same as --server)", ) args = parser.parse_args() diff --git a/openadapt_evals/benchmarks/runner.py b/openadapt_evals/benchmarks/runner.py index 399ee87..52403b8 100644 --- a/openadapt_evals/benchmarks/runner.py +++ b/openadapt_evals/benchmarks/runner.py @@ -400,6 +400,17 @@ def _run_single_task( done = True break + # If evaluate endpoint is unreachable, accept "done" + # rather than forcing the agent to continue pointlessly + if gate_result.error_type == "infrastructure": + logger.warning( + f"Step {steps}: Done-gate skipped — evaluate " + f"returned infrastructure error: {gate_result.reason}. " + "Accepting 'done'." + ) + done = True + break + if gate_score >= config.done_gate_threshold: logger.info( f"Step {steps}: Done-gate PASSED " diff --git a/scripts/core4_eval.py b/scripts/core4_eval.py index ad330b7..dee5d46 100644 --- a/scripts/core4_eval.py +++ b/scripts/core4_eval.py @@ -79,13 +79,13 @@ def cmd_run(args: argparse.Namespace) -> int: str(out), "--server", args.server, - "--evaluate-url", - args.evaluate_url, "--agent", args.agent, "--vm-user", args.vm_user, ] + if args.evaluate_url: + cmd.extend(["--evaluate-url", args.evaluate_url]) if args.vm_ip: cmd.extend(["--vm-ip", args.vm_ip]) if args.start_from > 0: @@ -181,7 +181,8 @@ def build_parser() -> argparse.ArgumentParser: run.add_argument("--max-steps", type=int, default=15) run.add_argument("--output-root", default="benchmark_results") run.add_argument("--server", default="http://localhost:5001") - run.add_argument("--evaluate-url", default="http://localhost:5050") + run.add_argument("--evaluate-url", default=None, + help="Evaluate server URL (default: same as --server)") run.add_argument("--agent", default="api-claude-cu") run.add_argument("--vm-ip", default=None) run.add_argument("--vm-user", default="azureuser") diff --git a/scripts/core4_lane.py b/scripts/core4_lane.py index ff683d9..9642f72 100644 --- a/scripts/core4_lane.py +++ b/scripts/core4_lane.py @@ -55,13 +55,13 @@ def _build_eval_cmd(args: argparse.Namespace, trial: TrialConfig) -> list[str]: trial.output_arg(), "--server", args.server, - "--evaluate-url", - args.evaluate_url, "--vm-user", args.vm_user, "--transport-error-threshold", str(args.transport_error_threshold), ] + if args.evaluate_url: + cmd.extend(["--evaluate-url", args.evaluate_url]) if args.vm_ip: cmd.extend(["--vm-ip", args.vm_ip]) if args.controller: @@ -181,7 +181,7 @@ def _common_args(parser: argparse.ArgumentParser) -> None: parser.add_argument("--agent", default="api-openai", help="Agent passed to run_dc_eval") parser.add_argument("--max-steps", type=int, default=15, help="Max steps per task") parser.add_argument("--server", default="http://localhost:5001", help="WAA server URL") - parser.add_argument("--evaluate-url", default="http://localhost:5050", help="Evaluate server URL") + parser.add_argument("--evaluate-url", default=None, help="Evaluate server URL (default: same as --server)") parser.add_argument("--vm-ip", default=None, help="VM IP (optional)") parser.add_argument("--vm-user", default="azureuser", help="VM SSH user") parser.add_argument( diff --git a/scripts/run_dc_eval.py b/scripts/run_dc_eval.py index 4978d76..ea7bf17 100644 --- a/scripts/run_dc_eval.py +++ b/scripts/run_dc_eval.py @@ -53,7 +53,6 @@ def _start_tunnel(vm_user: str, vm_ip: str) -> bool: "-o", "TCPKeepAlive=yes", "-o", "ExitOnForwardFailure=yes", "-L", "5001:localhost:5000", - "-L", "5050:localhost:5051", "-L", "8006:localhost:8006", f"{vm_user}@{vm_ip}", ] @@ -69,43 +68,6 @@ def _probe(server: str, timeout: int = 10) -> bool: return False -def _setup_eval_proxy(vm_user: str, vm_ip: str) -> bool: - """(Re-)establish socat proxy for the evaluate server on the VM. - - Docker port forwarding for port 5050 is broken due to QEMU's custom - bridge networking (--cap-add NET_ADMIN). Work around it by restarting - the socat-waa-evaluate systemd service on the VM host. The service is - installed during pool creation (see DOCKER_SETUP_SCRIPT in pool.py). - The SSH tunnel maps local 5050 -> VM 5051. - - Falls back to the legacy nohup socat approach if the systemd service - is not installed (e.g. on older VMs provisioned before this change). - """ - # Try systemd service first (preferred: auto-restarts on failure) - script = ( - "if systemctl list-unit-files socat-waa-evaluate.service " - "| grep -q socat-waa-evaluate; then " - " sudo systemctl restart socat-waa-evaluate.service; " - "else " - " killall socat 2>/dev/null || true; sleep 1; " - " which socat >/dev/null 2>&1 " - " || sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq socat; " - " nohup socat TCP-LISTEN:5051,fork,reuseaddr " - " 'EXEC:docker exec -i winarena socat - TCP\\:127.0.0.1\\:5050' " - " /dev/null 2>&1 & " - "fi" - ) - result = subprocess.run( - ["ssh", "-o", "StrictHostKeyChecking=no", f"{vm_user}@{vm_ip}", script], - capture_output=True, timeout=30, - ) - if result.returncode != 0: - print(f" socat proxy setup failed: {result.stderr.decode()}") - return False - print(" socat proxy for evaluate server established (VM:5051 -> container:5050)") - return True - - def _restart_container(vm_user: str, vm_ip: str) -> bool: """Restart Windows via QEMU monitor reset, falling back to docker restart. @@ -122,8 +84,7 @@ def _restart_container(vm_user: str, vm_ip: str) -> bool: if mgr.is_qemu_monitor_reachable(): print(" Resetting Windows via QEMU monitor (system_reset)...") if mgr.reset_windows(): - print(" QEMU reset sent, re-establishing evaluate proxy...") - _setup_eval_proxy(vm_user, vm_ip) + print(" QEMU reset sent.") return True print(" QEMU reset command failed, falling back to docker restart...") else: @@ -139,8 +100,7 @@ def _restart_container(vm_user: str, vm_ip: str) -> bool: if result.returncode != 0: print(f" Container restart failed: {result.stderr.decode()}") return False - print(" Container restarted, re-establishing evaluate proxy...") - _setup_eval_proxy(vm_user, vm_ip) + print(" Container restarted.") return True @@ -164,11 +124,10 @@ def ensure_waa_ready( if _probe(server) and (evaluate_url is None or _probe(evaluate_url)): return True - # Step 2: Reconnect tunnel + ensure socat proxy + # Step 2: Reconnect tunnel print(" WAA unreachable, reconnecting tunnel...") _kill_tunnels() time.sleep(1) - _setup_eval_proxy(vm_user, vm_ip) if _start_tunnel(vm_user, vm_ip): time.sleep(3) if _probe(server) and (evaluate_url is None or _probe(evaluate_url)): @@ -210,7 +169,8 @@ def main() -> int: parser.add_argument("--agent", default="api-claude-cu", help="Agent type") parser.add_argument("--demo-dir", default="annotated_demos", help="Demo directory") parser.add_argument("--server", default="http://localhost:5001") - parser.add_argument("--evaluate-url", default="http://localhost:5050") + parser.add_argument("--evaluate-url", default=None, + help="Evaluate server URL (default: same as --server)") parser.add_argument("--max-steps", type=int, default=15) parser.add_argument("--output", default="benchmark_results") parser.add_argument( @@ -357,11 +317,12 @@ def main() -> int: "--agent", args.agent, "--tasks", tid, "--server", args.server, - "--evaluate-url", args.evaluate_url, "--max-steps", str(args.max_steps), "--output", str(output_dir), "--run-name", run_name, ] + if args.evaluate_url: + cmd.extend(["--evaluate-url", args.evaluate_url]) if demo_path: cmd.extend(["--demo", str(demo_path.resolve())]) if args.controller and demo_path: diff --git a/scripts/run_eval_pipeline.py b/scripts/run_eval_pipeline.py index aa56021..b0fd6fc 100644 --- a/scripts/run_eval_pipeline.py +++ b/scripts/run_eval_pipeline.py @@ -359,7 +359,7 @@ def _setup_connectivity( def _wait_waa_ready( server: str = "http://localhost:5001", - evaluate_url: str = "http://localhost:5050", + evaluate_url: str | None = None, timeout: int = 1200, ) -> bool: """Wait for WAA server and evaluate server to respond.""" @@ -381,13 +381,15 @@ def _wait_waa_ready( except Exception: waa_ok = False - try: - eval_ok = requests.get(f"{evaluate_url}/probe", timeout=10).ok - except Exception: - eval_ok = False + eval_ok = True # default to True when no separate evaluate URL + if evaluate_url: + try: + eval_ok = requests.get(f"{evaluate_url}/probe", timeout=10).ok + except Exception: + eval_ok = False if waa_ok and eval_ok: - print(f"[waa] WAA + evaluate server ready after {elapsed}s") + print(f"[waa] WAA server ready after {elapsed}s") return True if waa_ok and not eval_ok: # WAA is up but evaluate isn't — acceptable for ZS-only runs @@ -434,7 +436,7 @@ def _run_eval( conditions: list[tuple[str, str, Path | None]], agent: str, server: str, - evaluate_url: str, + evaluate_url: str | None, max_steps: int, output_dir: Path, vm_ip: str, @@ -481,11 +483,12 @@ def _run_eval( "--agent", agent, "--tasks", tid, "--server", server, - "--evaluate-url", evaluate_url, "--max-steps", str(max_steps), "--output", str(output_dir), "--run-name", run_name, ] + if evaluate_url: + cmd.extend(["--evaluate-url", evaluate_url]) if clean_desktop: cmd.append("--clean-desktop") if force_tray_icons: @@ -606,7 +609,8 @@ def build_parser() -> argparse.ArgumentParser: help="Pinned WAA image version label to record in run metadata", ) parser.add_argument("--server", default="http://localhost:5001") - parser.add_argument("--evaluate-url", default="http://localhost:5050") + parser.add_argument("--evaluate-url", default=None, + help="Evaluate server URL (default: same as --server)") parser.add_argument( "--vm-name", default=DEFAULT_VM_NAME, help="VM name", )