Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .beads/issues.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@
{"id":"openadapt-evals-hvm","title":"VL model fix PR #18 ready to merge","notes":"2026-02-08: openadapt-ml PR #18 was already merged on 2026-01-29. VL model fix is done.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-29T16:17:03.491938-05:00","created_by":"Richard Abrich","updated_at":"2026-02-08T12:55:19.233249-05:00","closed_at":"2026-02-08T12:55:19.233249-05:00","close_reason":"PR #18 already merged 2026-01-29"}
{"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","notes":"wright repo (OpenAdaptAI/wright) scaffolding underway. Herald + consilium repos transferred to OpenAdaptAI org. Wright will be the orchestration layer for eval pipeline.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-03-02T00:08:08.422633-05:00"}
{"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"}
{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-01: GPU grant applications reviewed and rewritten (11 files). Writing done, blocked on eval results (DC signal on harder tasks). Detailed status tracked in openadapt-internal (private repo).","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-02T17:11:02.757913-05:00"}
{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-06: Core4 Trial 1 launched with --controller --done-gate --max-steps 30 (first ever run with both features). Prior 7 trials showed DC=14% vs ZS=18% — no lift. Root causes: (1) --controller was NEVER used, (2) no done-gate existed. PRs merged this session: #107 (readiness), #109 (core4 lane), #110 (done-gate). Results will be in benchmark_results/repeat_core4_trial1_20260306_154155/","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-06T15:42:14.015601-05:00"}
{"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"}
25 changes: 13 additions & 12 deletions scripts/core4_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,14 +85,6 @@ def cmd_run(args: argparse.Namespace) -> int:
args.agent,
"--vm-user",
args.vm_user,
"--transport-error-threshold",
str(args.transport_error_threshold),
"--health-samples",
str(args.health_samples),
"--health-min-success",
str(args.health_min_success),
"--health-sample-delay",
str(args.health_sample_delay),
]
if args.vm_ip:
cmd.extend(["--vm-ip", args.vm_ip])
Expand All @@ -112,6 +104,16 @@ def cmd_run(args: argparse.Namespace) -> int:
str(args.max_replans),
]
)
if args.done_gate:
cmd.extend(
[
"--done-gate",
"--done-gate-max-overrides",
str(args.done_gate_max_overrides),
"--done-gate-threshold",
str(args.done_gate_threshold),
]
)

print(f"\n=== Trial {t} -> {out} ===")
rc = _run_cmd(cmd, cwd=repo_root, dry_run=args.dry_run)
Expand Down Expand Up @@ -184,15 +186,14 @@ def build_parser() -> argparse.ArgumentParser:
run.add_argument("--vm-ip", default=None)
run.add_argument("--vm-user", default="azureuser")
run.add_argument("--start-from", type=int, default=0)
run.add_argument("--transport-error-threshold", type=int, default=8)
run.add_argument("--health-samples", type=int, default=3)
run.add_argument("--health-min-success", type=int, default=2)
run.add_argument("--health-sample-delay", type=float, default=1.5)
run.add_argument("--zs-only", action="store_true")
run.add_argument("--dc-only", action="store_true")
run.add_argument("--controller", action="store_true")
run.add_argument("--max-retries", type=int, default=2)
run.add_argument("--max-replans", type=int, default=2)
run.add_argument("--done-gate", action="store_true")
run.add_argument("--done-gate-max-overrides", type=int, default=3)
run.add_argument("--done-gate-threshold", type=float, default=1.0)
run.add_argument("--continue-on-fail", action="store_true")
run.add_argument("--dry-run", action="store_true")
run.set_defaults(func=cmd_run)
Expand Down