diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index a91c229..79f6ccb 100644 --- a/.beads/issues.jsonl +++ b/.beads/issues.jsonl @@ -13,5 +13,5 @@ {"id":"openadapt-evals-hvm","title":"VL model fix PR #18 ready to merge","notes":"2026-02-08: openadapt-ml PR #18 was already merged on 2026-01-29. VL model fix is done.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-29T16:17:03.491938-05:00","created_by":"Richard Abrich","updated_at":"2026-02-08T12:55:19.233249-05:00","closed_at":"2026-02-08T12:55:19.233249-05:00","close_reason":"PR #18 already merged 2026-01-29"} {"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","notes":"wright repo (OpenAdaptAI/wright) scaffolding underway. Herald + consilium repos transferred to OpenAdaptAI org. Wright will be the orchestration layer for eval pipeline.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-03-02T00:08:08.422633-05:00"} {"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"} -{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-01: GPU grant applications reviewed and rewritten (11 files). Writing done, blocked on eval results (DC signal on harder tasks). Detailed status tracked in openadapt-internal (private repo).","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-02T17:11:02.757913-05:00"} +{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-06: Core4 Trial 1 launched with --controller --done-gate --max-steps 30 (first ever run with both features). Prior 7 trials showed DC=14% vs ZS=18% — no lift. Root causes: (1) --controller was NEVER used, (2) no done-gate existed. PRs merged this session: #107 (readiness), #109 (core4 lane), #110 (done-gate). Results will be in benchmark_results/repeat_core4_trial1_20260306_154155/","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-06T15:42:14.015601-05:00"} {"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"} diff --git a/openadapt_evals/adapters/waa/live.py b/openadapt_evals/adapters/waa/live.py index 8c11840..cdd0b3e 100644 --- a/openadapt_evals/adapters/waa/live.py +++ b/openadapt_evals/adapters/waa/live.py @@ -257,62 +257,84 @@ def _build_type_commands(text: str) -> str: _LIBREOFFICE_RECOVERY_CLEANUP_SCRIPT = r""" -import os, re, shutil +import os, re, shutil, glob home = os.path.expanduser("~") -lo_user = os.path.join(home, "AppData", "Roaming", "LibreOffice", "4", "user") - -backup_dir = os.path.join(lo_user, "backup") -if os.path.exists(backup_dir): - files = os.listdir(backup_dir) - if files: - shutil.rmtree(backup_dir) - os.makedirs(backup_dir) - print(f"Cleared {len(files)} backup file(s)") - else: - print("Backup dir empty") -else: - print("No backup dir") - -xcu = os.path.join(lo_user, "registrymodifications.xcu") -if os.path.exists(xcu): - with open(xcu, "r", encoding="utf-8") as f: - content = f.read() - changed = False - if "RecoveryList" in content: - content = re.sub( - r'.*?', - "", - content, - flags=re.DOTALL, - ) - changed = True - print("Removed RecoveryList entries") - autosave_line = ( - '' - 'false' - ) - if "AutoSave" not in content: - content = content.replace("", autosave_line + "\n") - changed = True - print("Added AutoSave=false") - elif ">true<" in content.split("AutoSave")[1].split("")[0]: - content = re.sub( - r'.*?', - autosave_line, - content, - flags=re.DOTALL, +lo_base = os.path.join(home, "AppData", "Roaming", "LibreOffice") + +# Search ALL profile paths: LibreOffice/4/user, LibreOffice/user, etc. +user_dirs = [] +if os.path.isdir(lo_base): + for entry in os.listdir(lo_base): + candidate = os.path.join(lo_base, entry, "user") + if os.path.isdir(candidate): + user_dirs.append(candidate) + # Also check LibreOffice/user directly + direct = os.path.join(lo_base, "user") + if os.path.isdir(direct) and direct not in user_dirs: + user_dirs.append(direct) + +if not user_dirs: + print(f"No LibreOffice user dirs found under {lo_base}") + +for lo_user in user_dirs: + print(f"Cleaning {lo_user}") + + # Clear backup directory + backup_dir = os.path.join(lo_user, "backup") + if os.path.exists(backup_dir): + files = os.listdir(backup_dir) + if files: + shutil.rmtree(backup_dir) + os.makedirs(backup_dir) + print(f" Cleared {len(files)} backup file(s)") + + # Clear .~lock.* files that block re-opening + for lockfile in glob.glob(os.path.join(lo_user, ".~lock.*")): + os.remove(lockfile) + print(f" Removed lock file: {os.path.basename(lockfile)}") + + # Edit registrymodifications.xcu to remove recovery entries and disable autosave + xcu = os.path.join(lo_user, "registrymodifications.xcu") + if os.path.exists(xcu): + with open(xcu, "r", encoding="utf-8") as f: + content = f.read() + changed = False + if "RecoveryList" in content: + content = re.sub( + r'.*?', + "", + content, + flags=re.DOTALL, + ) + changed = True + print(" Removed RecoveryList entries") + autosave_line = ( + '' + 'false' ) - changed = True - print("Changed AutoSave to false") - else: - print("AutoSave already disabled") - if changed: - with open(xcu, "w", encoding="utf-8") as f: - f.write(content) - print("Updated registrymodifications.xcu") -else: - print(f"No xcu found at {xcu}") + if "AutoSave" not in content: + content = content.replace("", autosave_line + "\n") + changed = True + print(" Added AutoSave=false") + elif ">true<" in content.split("AutoSave")[1].split("")[0]: + content = re.sub( + r'.*?', + autosave_line, + content, + flags=re.DOTALL, + ) + changed = True + print(" Changed AutoSave to false") + if changed: + with open(xcu, "w", encoding="utf-8") as f: + f.write(content) + +# Also clear lock files from common download locations +for d in [os.path.join(home, "Downloads"), os.path.join(home, "Documents")]: + for lockfile in glob.glob(os.path.join(d, ".~lock.*")): + os.remove(lockfile) + print(f"Removed download lock: {lockfile}") """ diff --git a/scripts/core4_eval.py b/scripts/core4_eval.py index d3841ed..ad330b7 100644 --- a/scripts/core4_eval.py +++ b/scripts/core4_eval.py @@ -85,14 +85,6 @@ def cmd_run(args: argparse.Namespace) -> int: args.agent, "--vm-user", args.vm_user, - "--transport-error-threshold", - str(args.transport_error_threshold), - "--health-samples", - str(args.health_samples), - "--health-min-success", - str(args.health_min_success), - "--health-sample-delay", - str(args.health_sample_delay), ] if args.vm_ip: cmd.extend(["--vm-ip", args.vm_ip]) @@ -112,6 +104,16 @@ def cmd_run(args: argparse.Namespace) -> int: str(args.max_replans), ] ) + if args.done_gate: + cmd.extend( + [ + "--done-gate", + "--done-gate-max-overrides", + str(args.done_gate_max_overrides), + "--done-gate-threshold", + str(args.done_gate_threshold), + ] + ) print(f"\n=== Trial {t} -> {out} ===") rc = _run_cmd(cmd, cwd=repo_root, dry_run=args.dry_run) @@ -184,15 +186,14 @@ def build_parser() -> argparse.ArgumentParser: run.add_argument("--vm-ip", default=None) run.add_argument("--vm-user", default="azureuser") run.add_argument("--start-from", type=int, default=0) - run.add_argument("--transport-error-threshold", type=int, default=8) - run.add_argument("--health-samples", type=int, default=3) - run.add_argument("--health-min-success", type=int, default=2) - run.add_argument("--health-sample-delay", type=float, default=1.5) run.add_argument("--zs-only", action="store_true") run.add_argument("--dc-only", action="store_true") run.add_argument("--controller", action="store_true") run.add_argument("--max-retries", type=int, default=2) run.add_argument("--max-replans", type=int, default=2) + run.add_argument("--done-gate", action="store_true") + run.add_argument("--done-gate-max-overrides", type=int, default=3) + run.add_argument("--done-gate-threshold", type=float, default=1.0) run.add_argument("--continue-on-fail", action="store_true") run.add_argument("--dry-run", action="store_true") run.set_defaults(func=cmd_run)