Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .beads/issues.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@
{"id":"openadapt-evals-hvm","title":"VL model fix PR #18 ready to merge","notes":"2026-02-08: openadapt-ml PR #18 was already merged on 2026-01-29. VL model fix is done.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-29T16:17:03.491938-05:00","created_by":"Richard Abrich","updated_at":"2026-02-08T12:55:19.233249-05:00","closed_at":"2026-02-08T12:55:19.233249-05:00","close_reason":"PR #18 already merged 2026-01-29"}
{"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","notes":"wright repo (OpenAdaptAI/wright) scaffolding underway. Herald + consilium repos transferred to OpenAdaptAI org. Wright will be the orchestration layer for eval pipeline.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-03-02T00:08:08.422633-05:00"}
{"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"}
{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-01: GPU grant applications reviewed and rewritten (11 files). Writing done, blocked on eval results (DC signal on harder tasks). Detailed status tracked in openadapt-internal (private repo).","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-02T17:11:02.757913-05:00"}
{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-06: Core4 Trial 1 launched with --controller --done-gate --max-steps 30 (first ever run with both features). Prior 7 trials showed DC=14% vs ZS=18% — no lift. Root causes: (1) --controller was NEVER used, (2) no done-gate existed. PRs merged this session: #107 (readiness), #109 (core4 lane), #110 (done-gate). Results will be in benchmark_results/repeat_core4_trial1_20260306_154155/","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-06T15:42:14.015601-05:00"}
{"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"}
128 changes: 75 additions & 53 deletions openadapt_evals/adapters/waa/live.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,62 +257,84 @@ def _build_type_commands(text: str) -> str:


# Source text of a standalone Python script, kept as a raw string so it can be
# handed to a separate interpreter. When run, the script resets LibreOffice's
# crash-recovery state for the current user:
#   * empties the "backup" directory of every LibreOffice user profile found
#     under ~/AppData/Roaming/LibreOffice (e.g. LibreOffice/4/user),
#   * deletes ".~lock.*" files in those profiles and in ~/Downloads and
#     ~/Documents,
#   * edits registrymodifications.xcu to drop RecoveryList items and to set
#     Recovery/AutoSave "Enabled" to false.
# Every step is best-effort: a file that cannot be removed is reported on
# stdout and the remaining steps still run.
# NOTE(review): the code that ships and executes this string is outside this
# view; the AppData/Roaming layout suggests a Windows guest - confirm against
# the caller before relying on that.
_LIBREOFFICE_RECOVERY_CLEANUP_SCRIPT = r"""
import os, re, shutil, glob

home = os.path.expanduser("~")
lo_base = os.path.join(home, "AppData", "Roaming", "LibreOffice")

# Item patterns are anchored on the full oor:path so that unrelated settings
# whose names merely contain "AutoSave" (for example the AutoSave prop under
# /org.openoffice.Office.Common/Save/Document) are never mistaken for the
# recovery switch.
recovery_list_item = re.compile(
    r'<item oor:path="/org.openoffice.Office.Recovery/RecoveryList">.*?</item>',
    re.DOTALL,
)
autosave_item = re.compile(
    r'<item oor:path="/org.openoffice.Office.Recovery/AutoSave">.*?</item>',
    re.DOTALL,
)
autosave_line = (
    '<item oor:path="/org.openoffice.Office.Recovery/AutoSave">'
    '<prop oor:name="Enabled" oor:op="fuse"><value>false</value></prop></item>'
)
enabled_marker = 'oor:name="Enabled"'

# Search ALL profile paths: LibreOffice/4/user, LibreOffice/user, etc.
user_dirs = []
if os.path.isdir(lo_base):
    for entry in os.listdir(lo_base):
        candidate = os.path.join(lo_base, entry, "user")
        if os.path.isdir(candidate):
            user_dirs.append(candidate)
    # Also check LibreOffice/user directly (the loop above only looks one
    # level down, i.e. LibreOffice/<entry>/user).
    direct = os.path.join(lo_base, "user")
    if os.path.isdir(direct) and direct not in user_dirs:
        user_dirs.append(direct)

if not user_dirs:
    print(f"No LibreOffice user dirs found under {lo_base}")

for lo_user in user_dirs:
    print(f"Cleaning {lo_user}")

    # Clear backup directory
    backup_dir = os.path.join(lo_user, "backup")
    if os.path.exists(backup_dir):
        files = os.listdir(backup_dir)
        if files:
            try:
                shutil.rmtree(backup_dir)
                os.makedirs(backup_dir, exist_ok=True)
            except OSError as exc:
                # A file still held open must not abort the rest of the cleanup.
                print(f" Could not clear backup dir: {exc}")
            else:
                print(f" Cleared {len(files)} backup file(s)")

    # Clear .~lock.* files that block re-opening
    for lockfile in glob.glob(os.path.join(lo_user, ".~lock.*")):
        try:
            os.remove(lockfile)
        except OSError as exc:
            print(f" Could not remove lock file {os.path.basename(lockfile)}: {exc}")
        else:
            print(f" Removed lock file: {os.path.basename(lockfile)}")

    # Edit registrymodifications.xcu to remove recovery entries and disable autosave
    xcu = os.path.join(lo_user, "registrymodifications.xcu")
    if os.path.exists(xcu):
        with open(xcu, "r", encoding="utf-8") as f:
            content = f.read()
        changed = False

        # Only report (and rewrite the file) when an item was really removed.
        content, removed = recovery_list_item.subn("", content)
        if removed:
            changed = True
            print(" Removed RecoveryList entries")

        # Look only at Recovery/AutoSave items that carry the Enabled prop.
        enabled_items = [
            item for item in autosave_item.findall(content) if enabled_marker in item
        ]
        if not enabled_items:
            content = content.replace("</oor:items>", autosave_line + "\n</oor:items>")
            changed = True
            print(" Added AutoSave=false")
        elif any(">true<" in item for item in enabled_items):
            # Rewrite just the Enabled item(s); sibling AutoSave items
            # (other props under the same path) are left untouched.
            content = autosave_item.sub(
                lambda m: autosave_line if enabled_marker in m.group(0) else m.group(0),
                content,
            )
            changed = True
            print(" Changed AutoSave to false")

        if changed:
            with open(xcu, "w", encoding="utf-8") as f:
                f.write(content)

# Also clear lock files from common download locations
for d in [os.path.join(home, "Downloads"), os.path.join(home, "Documents")]:
    for lockfile in glob.glob(os.path.join(d, ".~lock.*")):
        try:
            os.remove(lockfile)
        except OSError as exc:
            print(f"Could not remove download lock {lockfile}: {exc}")
        else:
            print(f"Removed download lock: {lockfile}")
"""


Expand Down
25 changes: 13 additions & 12 deletions scripts/core4_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,14 +85,6 @@ def cmd_run(args: argparse.Namespace) -> int:
args.agent,
"--vm-user",
args.vm_user,
"--transport-error-threshold",
str(args.transport_error_threshold),
"--health-samples",
str(args.health_samples),
"--health-min-success",
str(args.health_min_success),
"--health-sample-delay",
str(args.health_sample_delay),
]
if args.vm_ip:
cmd.extend(["--vm-ip", args.vm_ip])
Expand All @@ -112,6 +104,16 @@ def cmd_run(args: argparse.Namespace) -> int:
str(args.max_replans),
]
)
if args.done_gate:
cmd.extend(
[
"--done-gate",
"--done-gate-max-overrides",
str(args.done_gate_max_overrides),
"--done-gate-threshold",
str(args.done_gate_threshold),
]
)

print(f"\n=== Trial {t} -> {out} ===")
rc = _run_cmd(cmd, cwd=repo_root, dry_run=args.dry_run)
Expand Down Expand Up @@ -184,15 +186,14 @@ def build_parser() -> argparse.ArgumentParser:
run.add_argument("--vm-ip", default=None)
run.add_argument("--vm-user", default="azureuser")
run.add_argument("--start-from", type=int, default=0)
run.add_argument("--transport-error-threshold", type=int, default=8)
run.add_argument("--health-samples", type=int, default=3)
run.add_argument("--health-min-success", type=int, default=2)
run.add_argument("--health-sample-delay", type=float, default=1.5)
run.add_argument("--zs-only", action="store_true")
run.add_argument("--dc-only", action="store_true")
run.add_argument("--controller", action="store_true")
run.add_argument("--max-retries", type=int, default=2)
run.add_argument("--max-replans", type=int, default=2)
run.add_argument("--done-gate", action="store_true")
run.add_argument("--done-gate-max-overrides", type=int, default=3)
run.add_argument("--done-gate-threshold", type=float, default=1.0)
run.add_argument("--continue-on-fail", action="store_true")
run.add_argument("--dry-run", action="store_true")
run.set_defaults(func=cmd_run)
Expand Down