OpenAdaptAI · abrichr · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026
diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl
@@ -13,5 +13,5 @@
 {"id":"openadapt-evals-hvm","title":"VL model fix PR #18 ready to merge","notes":"2026-02-08: openadapt-ml PR #18 was already merged on 2026-01-29. VL model fix is done.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-29T16:17:03.491938-05:00","created_by":"Richard Abrich","updated_at":"2026-02-08T12:55:19.233249-05:00","closed_at":"2026-02-08T12:55:19.233249-05:00","close_reason":"PR #18 already merged 2026-01-29"}
 {"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","notes":"wright repo (OpenAdaptAI/wright) scaffolding underway. Herald + consilium repos transferred to OpenAdaptAI org. Wright will be the orchestration layer for eval pipeline.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-03-02T00:08:08.422633-05:00"}
 {"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"}
-{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-01: GPU grant applications reviewed and rewritten (11 files). Writing done, blocked on eval results (DC signal on harder tasks). Detailed status tracked in openadapt-internal (private repo).","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-02T17:11:02.757913-05:00"}
+{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-06: Core4 Trial 1 launched with --controller --done-gate --max-steps 30 (first ever run with both features). Prior 7 trials showed DC=14% vs ZS=18% — no lift. Root causes: (1) --controller was NEVER used, (2) no done-gate existed. PRs merged this session: #107 (readiness), #109 (core4 lane), #110 (done-gate). Results will be in benchmark_results/repeat_core4_trial1_20260306_154155/","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-06T15:42:14.015601-05:00"}
 {"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"}
diff --git a/openadapt_evals/adapters/waa/live.py b/openadapt_evals/adapters/waa/live.py
@@ -257,62 +257,104 @@ def _build_type_commands(text: str) -> str:
 
 
 _LIBREOFFICE_RECOVERY_CLEANUP_SCRIPT = r"""
-import os, re, shutil
+# Clear LibreOffice crash-recovery state (backups, lock files, RecoveryList
+# entries) and disable AutoSave in every profile under AppData/Roaming/LibreOffice.
+# Removal failures are reported and skipped so one busy file cannot abort the run.
+import os, re, shutil, glob
 
 home = os.path.expanduser("~")
-lo_user = os.path.join(home, "AppData", "Roaming", "LibreOffice", "4", "user")
-
-backup_dir = os.path.join(lo_user, "backup")
-if os.path.exists(backup_dir):
-    files = os.listdir(backup_dir)
-    if files:
-        shutil.rmtree(backup_dir)
-        os.makedirs(backup_dir)
-        print(f"Cleared {len(files)} backup file(s)")
-    else:
-        print("Backup dir empty")
-else:
-    print("No backup dir")
-
-xcu = os.path.join(lo_user, "registrymodifications.xcu")
-if os.path.exists(xcu):
-    with open(xcu, "r", encoding="utf-8") as f:
-        content = f.read()
-    changed = False
-    if "RecoveryList" in content:
-        content = re.sub(
-            r'<item oor:path="/org.openoffice.Office.Recovery/RecoveryList">.*?</item>',
-            "",
-            content,
-            flags=re.DOTALL,
-        )
-        changed = True
-        print("Removed RecoveryList entries")
-    autosave_line = (
-        '<item oor:path="/org.openoffice.Office.Recovery/AutoSave">'
-        '<prop oor:name="Enabled" oor:op="fuse"><value>false</value></prop></item>'
-    )
-    if "AutoSave" not in content:
-        content = content.replace("</oor:items>", autosave_line + "\n</oor:items>")
-        changed = True
-        print("Added AutoSave=false")
-    elif ">true<" in content.split("AutoSave")[1].split("</item>")[0]:
-        content = re.sub(
-            r'<item oor:path="/org.openoffice.Office.Recovery/AutoSave">.*?</item>',
-            autosave_line,
-            content,
-            flags=re.DOTALL,
+lo_base = os.path.join(home, "AppData", "Roaming", "LibreOffice")
+
+# Search ALL profile paths: LibreOffice/4/user, LibreOffice/user, etc.
+user_dirs = []
+if os.path.isdir(lo_base):
+    for entry in os.listdir(lo_base):
+        candidate = os.path.join(lo_base, entry, "user")
+        if os.path.isdir(candidate):
+            user_dirs.append(candidate)
+    # Also check LibreOffice/user directly
+    direct = os.path.join(lo_base, "user")
+    if os.path.isdir(direct) and direct not in user_dirs:
+        user_dirs.append(direct)
+
+if not user_dirs:
+    print(f"No LibreOffice user dirs found under {lo_base}")
+
+for lo_user in user_dirs:
+    print(f"Cleaning {lo_user}")
+
+    # Clear backup directory
+    backup_dir = os.path.join(lo_user, "backup")
+    if os.path.exists(backup_dir):
+        files = os.listdir(backup_dir)
+        if files:
+            try:
+                shutil.rmtree(backup_dir)
+                os.makedirs(backup_dir)
+            except OSError as exc:
+                print(f"  Could not clear backup dir: {exc}")
+            else:
+                print(f"  Cleared {len(files)} backup file(s)")
+
+    # Clear .~lock.* files that block re-opening
+    for lockfile in glob.glob(os.path.join(lo_user, ".~lock.*")):
+        try:
+            os.remove(lockfile)
+        except OSError as exc:
+            # e.g. the file is still in use; keep going so the xcu edit below still runs.
+            print(f"  Could not remove lock file {os.path.basename(lockfile)}: {exc}")
+            continue
+        print(f"  Removed lock file: {os.path.basename(lockfile)}")
+
+    # Edit registrymodifications.xcu to remove recovery entries and disable autosave
+    xcu = os.path.join(lo_user, "registrymodifications.xcu")
+    if os.path.exists(xcu):
+        with open(xcu, "r", encoding="utf-8") as f:
+            content = f.read()
+        changed = False
+        if "RecoveryList" in content:
+            content, removed = re.subn(
+                r'<item oor:path="/org.openoffice.Office.Recovery/RecoveryList">.*?</item>',
+                "",
+                content,
+                flags=re.DOTALL,
+            )
+            # Only report and rewrite the file when an entry was actually stripped.
+            if removed:
+                changed = True
+                print("  Removed RecoveryList entries")
+        autosave_line = (
+            '<item oor:path="/org.openoffice.Office.Recovery/AutoSave">'
+            '<prop oor:name="Enabled" oor:op="fuse"><value>false</value></prop></item>'
         )
-        changed = True
-        print("Changed AutoSave to false")
-    else:
-        print("AutoSave already disabled")
-    if changed:
-        with open(xcu, "w", encoding="utf-8") as f:
-            f.write(content)
-        print("Updated registrymodifications.xcu")
-else:
-    print(f"No xcu found at {xcu}")
+        # NOTE(review): this tests for the bare substring "AutoSave" anywhere in the
+        # file, not specifically the Recovery/AutoSave item - confirm no other key has it.
+        if "AutoSave" not in content:
+            content = content.replace("</oor:items>", autosave_line + "\n</oor:items>")
+            changed = True
+            print("  Added AutoSave=false")
+        elif ">true<" in content.split("AutoSave")[1].split("</item>")[0]:
+            content = re.sub(
+                r'<item oor:path="/org.openoffice.Office.Recovery/AutoSave">.*?</item>',
+                autosave_line,
+                content,
+                flags=re.DOTALL,
+            )
+            changed = True
+            print("  Changed AutoSave to false")
+        if changed:
+            with open(xcu, "w", encoding="utf-8") as f:
+                f.write(content)
+
+# Also clear lock files from the user's Downloads and Documents folders
+for d in [os.path.join(home, "Downloads"), os.path.join(home, "Documents")]:
+    for lockfile in glob.glob(os.path.join(d, ".~lock.*")):
+        try:
+            os.remove(lockfile)
+        except OSError as exc:
+            print(f"Could not remove download lock {lockfile}: {exc}")
+            continue
+        print(f"Removed download lock: {lockfile}")
 """
 
 

diff --git a/scripts/core4_eval.py b/scripts/core4_eval.py
@@ -85,14 +85,6 @@ def cmd_run(args: argparse.Namespace) -> int:
             args.agent,
             "--vm-user",
             args.vm_user,
-            "--transport-error-threshold",
-            str(args.transport_error_threshold),
-            "--health-samples",
-            str(args.health_samples),
-            "--health-min-success",
-            str(args.health_min_success),
-            "--health-sample-delay",
-            str(args.health_sample_delay),
         ]
         if args.vm_ip:
             cmd.extend(["--vm-ip", args.vm_ip])
@@ -112,6 +104,16 @@ def cmd_run(args: argparse.Namespace) -> int:
                     str(args.max_replans),
                 ]
             )
+        if args.done_gate:
+            cmd.extend(
+                [
+                    "--done-gate",
+                    "--done-gate-max-overrides",
+                    str(args.done_gate_max_overrides),
+                    "--done-gate-threshold",
+                    str(args.done_gate_threshold),
+                ]
+            )
 
         print(f"\n=== Trial {t} -> {out} ===")
         rc = _run_cmd(cmd, cwd=repo_root, dry_run=args.dry_run)
@@ -184,15 +186,14 @@ def build_parser() -> argparse.ArgumentParser:
     run.add_argument("--vm-ip", default=None)
     run.add_argument("--vm-user", default="azureuser")
     run.add_argument("--start-from", type=int, default=0)
-    run.add_argument("--transport-error-threshold", type=int, default=8)
-    run.add_argument("--health-samples", type=int, default=3)
-    run.add_argument("--health-min-success", type=int, default=2)
-    run.add_argument("--health-sample-delay", type=float, default=1.5)
     run.add_argument("--zs-only", action="store_true")
     run.add_argument("--dc-only", action="store_true")
     run.add_argument("--controller", action="store_true")
     run.add_argument("--max-retries", type=int, default=2)
     run.add_argument("--max-replans", type=int, default=2)
+    run.add_argument("--done-gate", action="store_true")
+    run.add_argument("--done-gate-max-overrides", type=int, default=3)
+    run.add_argument("--done-gate-threshold", type=float, default=1.0)
     run.add_argument("--continue-on-fail", action="store_true")
     run.add_argument("--dry-run", action="store_true")
     run.set_defaults(func=cmd_run)