From 5945135d7ad51459eff9620ff630ed5058d6a676 Mon Sep 17 00:00:00 2001 From: engineer Date: Wed, 18 Feb 2026 09:37:11 -0800 Subject: [PATCH 1/8] fix(reflection): gate user action on explicit blockers --- reflection-3.ts | 7 +- test/reflection-3.unit.test.ts | 220 +++++++++++++++++++++++++++++++++ 2 files changed, 224 insertions(+), 3 deletions(-) diff --git a/reflection-3.ts b/reflection-3.ts index ef3e978..4924513 100644 --- a/reflection-3.ts +++ b/reflection-3.ts @@ -1096,7 +1096,7 @@ Rules: - Tests cannot be skipped or marked as flaky/not important. - Direct pushes to main/master are not allowed; require a PR instead. - If stuck, propose an alternate approach. -- If you need user action (auth, 2FA, credentials), list it in needs_user_action. +- If you need user action (auth, 2FA, credentials, access requests, uploads, approvals), list it in needs_user_action. - PLANNING LOOP CHECK: If the task requires code changes (fix, implement, add, create, build, refactor, update) but the "Tool Commands Run" section shows ONLY read operations (read, glob, grep, git log, git status, git diff, webfetch, task/explore) and NO write operations (edit, write, bash with build/test/commit, github_create_pull_request, etc.), then the task is NOT complete. Set status to "in_progress", set stuck to true, and list "Implement the actual code changes" in remaining_work. Analyzing and recommending changes is not the same as making them. - If you are repeating the same actions (deploy, test, build) without making progress, set "stuck": true. - Do not retry the same failing approach more than twice — try something different or report stuck.` @@ -1221,12 +1221,13 @@ function evaluateSelfAssessment(assessment: SelfAssessment, context: TaskContext addMissing("Rethink approach", "Propose an alternate approach and continue") } - const requiresHumanAction = needsUserAction.length > 0 + const explicitUserAction = needsUserAction.filter(item => /auth|2fa|credential|token|secret|permission|approve|merge|run|execute|access|provide|upload|share|login|invite/i.test(item)) + const requiresHumanAction = explicitUserAction.length > 0 // Agent should continue if there are missing items beyond what only the user can do. // Even when user action is needed (e.g. "merge PR"), the agent may still have // actionable work (e.g. uncommitted changes, missing tests) it can complete first. const agentActionableMissing = missing.filter(item => - !needsUserAction.some(ua => item.toLowerCase().includes(ua.toLowerCase()) || ua.toLowerCase().includes(item.toLowerCase())) + !explicitUserAction.some(ua => item.toLowerCase().includes(ua.toLowerCase()) || ua.toLowerCase().includes(item.toLowerCase())) ) const shouldContinue = agentActionableMissing.length > 0 || (!requiresHumanAction && missing.length > 0) const complete = status === "complete" && missing.length === 0 && confidence >= 0.8 && !requiresHumanAction diff --git a/test/reflection-3.unit.test.ts b/test/reflection-3.unit.test.ts index 5ee72cb..e1f2042 100644 --- a/test/reflection-3.unit.test.ts +++ b/test/reflection-3.unit.test.ts @@ -345,6 +345,35 @@ describe("reflection-3 unit", () => { assert.strictEqual(analysis.shouldContinue, false) }) + it("treats run-test request as user action", () => { + const assessment = { + status: "in_progress" as const, + confidence: 0.6, + remaining_work: ["Run tests", "Create PR"], + needs_user_action: ["Run tests"] + } + const analysis = evaluateSelfAssessment(assessment, { + taskSummary: "Implement feature", + taskType: "coding", + agentMode: "build", + humanMessages: ["Implement feature"], + toolsSummary: "(none)", + detectedSignals: [], + recentCommands: [], + pushedToDefaultBranch: false, + requiresTests: true, + requiresBuild: false, + requiresPR: true, + requiresCI: true, + requiresLocalTests: true, + requiresLocalTestsEvidence: true + }) + + assert.strictEqual(analysis.requiresHumanAction, true) + assert.strictEqual(analysis.shouldContinue, true) + assert.ok(analysis.missing.length > 0) + }) + it("ops tasks do not require PR or CI", () => { // For ops tasks, PR and CI should not be enforced const assessment = { @@ -899,3 +928,194 @@ describe("buildSelfAssessmentPrompt attempt awareness", () => { assert.ok(result.includes("Do not retry the same failing approach")) }) }) + +describe("isPlanMode", () => { + // Helper to create a message with given role and text parts + function msg(role: string, ...texts: string[]) { + return { + info: { role }, + parts: texts.map(t => ({ type: "text", text: t })) + } + } + + describe("system/developer message detection", () => { + it("detects 'Plan Mode' in system message", () => { + const messages = [msg("system", "# Plan Mode - System Reminder")] + assert.strictEqual(isPlanMode(messages), true) + }) + + it("detects 'plan mode ACTIVE' in developer message", () => { + const messages = [msg("developer", "CRITICAL: plan mode ACTIVE - you are in READ-ONLY phase")] + assert.strictEqual(isPlanMode(messages), true) + }) + + it("detects 'read-only mode' in system message", () => { + const messages = [msg("system", "You are in read-only mode")] + assert.strictEqual(isPlanMode(messages), true) + }) + + it("detects 'READ-ONLY phase' in system message", () => { + const messages = [msg("system", "you are in READ-ONLY phase")] + assert.strictEqual(isPlanMode(messages), true) + }) + + it("detects 'plan mode is active' in system message", () => { + const messages = [msg("system", "plan mode is active. Do not edit files.")] + assert.strictEqual(isPlanMode(messages), true) + }) + }) + + describe("system-reminder detection (OpenCode actual format)", () => { + it("detects default plan.txt system-reminder in user message", () => { + const reminder = ` + # Plan Mode - System Reminder + + CRITICAL: Plan mode ACTIVE - you are in READ-ONLY phase. STRICTLY FORBIDDEN: + ANY file edits, modifications, or system changes. + ` + const messages = [msg("user", "Help me plan", reminder)] + assert.strictEqual(isPlanMode(messages), true) + }) + + it("detects experimental plan mode system-reminder", () => { + const reminder = ` + Plan mode is active. The user indicated that they do not want you to execute yet -- + you MUST NOT make any edits. + ` + const messages = [msg("user", "Design the architecture", reminder)] + assert.strictEqual(isPlanMode(messages), true) + }) + + it("detects plan mode system-reminder even in older messages", () => { + const reminder = ` + Plan mode is active. READ-ONLY phase. + ` + const messages = [ + msg("user", "First message", reminder), + msg("assistant", "Here is my plan..."), + msg("user", "Thanks, looks good") + ] + assert.strictEqual(isPlanMode(messages), true) + }) + + it("detects READ-ONLY phase in system-reminder", () => { + const reminder = ` + CRITICAL: you are in READ-ONLY phase. Do not modify files. + ` + const messages = [msg("user", "Analyze the code", reminder)] + assert.strictEqual(isPlanMode(messages), true) + }) + + it("does NOT trigger on system-reminder without plan mode keywords", () => { + const reminder = ` + You have access to these tools: read, write, edit. + ` + const messages = [msg("user", "Fix the bug", reminder)] + assert.strictEqual(isPlanMode(messages), false) + }) + + it("does NOT trigger on plan mode keywords outside system-reminder", () => { + // The user says "plan mode" literally -> detected via user message check, not system-reminder + const messages = [msg("user", "Enable plan mode")] + assert.strictEqual(isPlanMode(messages), true) // detected via user keyword check + }) + }) + + describe("user message keyword detection", () => { + it("detects 'plan mode' in user message (case insensitive)", () => { + const messages = [msg("user", "Switch to Plan Mode")] + assert.strictEqual(isPlanMode(messages), true) + }) + + it("detects 'plan' at start of user message", () => { + const messages = [msg("user", "plan the architecture for the new feature")] + assert.strictEqual(isPlanMode(messages), true) + }) + + it("detects 'create a plan' pattern", () => { + const messages = [msg("user", "create a plan for the refactoring")] + assert.strictEqual(isPlanMode(messages), true) + }) + + it("detects 'write a plan' pattern", () => { + const messages = [msg("user", "write a detailed plan")] + assert.strictEqual(isPlanMode(messages), true) + }) + + it("does NOT detect 'plan' in the middle of unrelated text", () => { + const messages = [msg("user", "Fix the airplane display bug")] + assert.strictEqual(isPlanMode(messages), false) + }) + + it("does NOT trigger on regular coding tasks", () => { + const messages = [msg("user", "Fix the login bug and add tests")] + assert.strictEqual(isPlanMode(messages), false) + }) + }) + + describe("reflection message handling", () => { + it("skips reflection messages when looking for user keywords", () => { + const reflectionMsg = { + info: { role: "user" }, + parts: [{ type: "text", text: "## Reflection-3 Self-Assessment\nplan mode test" }] + } + const messages = [msg("user", "Fix the bug"), reflectionMsg] + assert.strictEqual(isPlanMode(messages), false) + }) + + it("checks non-reflection user message even after reflection message", () => { + const reflectionMsg = { + info: { role: "user" }, + parts: [{ type: "text", text: "## Reflection-3 Self-Assessment\nsome assessment" }] + } + const messages = [msg("user", "Switch to plan mode"), reflectionMsg] + // Walks backward: skips reflectionMsg, finds "Switch to plan mode" + assert.strictEqual(isPlanMode(messages), true) + }) + }) + + describe("multiple text parts in a single message", () => { + it("checks all text parts, not just the last one", () => { + const messages = [{ + info: { role: "user" }, + parts: [ + { type: "text", text: "plan mode please" }, + { type: "text", text: "I want to think about this" } + ] + }] + assert.strictEqual(isPlanMode(messages), true) + }) + }) + + describe("edge cases", () => { + it("returns false for empty messages array", () => { + assert.strictEqual(isPlanMode([]), false) + }) + + it("returns false for messages with no parts", () => { + const messages = [{ info: { role: "user" } }] + assert.strictEqual(isPlanMode(messages), false) + }) + + it("returns false for messages with empty text parts", () => { + const messages = [{ info: { role: "user" }, parts: [{ type: "text", text: "" }] }] + assert.strictEqual(isPlanMode(messages), false) + }) + + it("returns false for assistant-only messages", () => { + const messages = [msg("assistant", "Here is the plan for the feature")] + assert.strictEqual(isPlanMode(messages), false) + }) + + it("handles build-switch reminder (should NOT be plan mode)", () => { + const reminder = ` + Your operational mode has changed from plan to build. + You are no longer in read-only mode. + ` + // "no longer in read-only mode" should not match — but "plan" + system-reminder exists + // The regex checks for "plan mode" (case insensitive) — "from plan to build" contains "plan" but NOT "plan mode" + const messages = [msg("user", "Now implement it", reminder)] + assert.strictEqual(isPlanMode(messages), false) + }) + }) +}) From a9eb98065afe3a2f8eaa09e343e3d5c5eacc972c Mon Sep 17 00:00:00 2001 From: engineer Date: Wed, 18 Feb 2026 12:14:18 -0800 Subject: [PATCH 2/8] fix(reflection): avoid blocking runnable steps --- reflection-3.test-helpers.ts | 5 +++-- reflection-3.ts | 23 +++++++++++++---------- test/reflection-3.unit.test.ts | 4 ++-- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/reflection-3.test-helpers.ts b/reflection-3.test-helpers.ts index 3b38ee7..290fdeb 100644 --- a/reflection-3.test-helpers.ts +++ b/reflection-3.test-helpers.ts @@ -276,12 +276,13 @@ export function evaluateSelfAssessment(assessment: SelfAssessment, context: Task addMissing("Rethink approach", "Propose an alternate approach and continue") } - const requiresHumanAction = needsUserAction.length > 0 + const explicitUserAction = needsUserAction.filter(item => /auth|2fa|credential|token|secret|permission|approve|merge|access|provide|upload|share|login|invite/i.test(item)) + const requiresHumanAction = explicitUserAction.length > 0 // Agent should continue if there are missing items beyond what only the user can do. // Even when user action is needed (e.g. "merge PR"), the agent may still have // actionable work (e.g. uncommitted changes, missing tests) it can complete first. const agentActionableMissing = missing.filter(item => - !needsUserAction.some(ua => item.toLowerCase().includes(ua.toLowerCase()) || ua.toLowerCase().includes(item.toLowerCase())) + !explicitUserAction.some(ua => item.toLowerCase().includes(ua.toLowerCase()) || ua.toLowerCase().includes(item.toLowerCase())) ) const shouldContinue = agentActionableMissing.length > 0 || (!requiresHumanAction && missing.length > 0) const complete = status === "complete" && missing.length === 0 && confidence >= 0.8 && !requiresHumanAction diff --git a/reflection-3.ts b/reflection-3.ts index 4924513..7f23ab9 100644 --- a/reflection-3.ts +++ b/reflection-3.ts @@ -1221,7 +1221,7 @@ function evaluateSelfAssessment(assessment: SelfAssessment, context: TaskContext addMissing("Rethink approach", "Propose an alternate approach and continue") } - const explicitUserAction = needsUserAction.filter(item => /auth|2fa|credential|token|secret|permission|approve|merge|run|execute|access|provide|upload|share|login|invite/i.test(item)) + const explicitUserAction = needsUserAction.filter(item => /auth|2fa|credential|token|secret|permission|approve|merge|access|provide|upload|share|login|invite/i.test(item)) const requiresHumanAction = explicitUserAction.length > 0 // Agent should continue if there are missing items beyond what only the user can do. // Even when user action is needed (e.g. "merge PR"), the agent may still have @@ -1346,15 +1346,18 @@ Return JSON only: if (!jsonMatch) continue const verdict = JSON.parse(jsonMatch[0]) as any - return { - complete: !!verdict.complete, - shouldContinue: !verdict.requires_human_action && !verdict.complete, - reason: verdict.feedback || "Judge analysis completed", - missing: Array.isArray(verdict.missing) ? verdict.missing : [], - nextActions: Array.isArray(verdict.next_actions) ? verdict.next_actions : [], - requiresHumanAction: !!verdict.requires_human_action, - severity: verdict.severity || "MEDIUM" - } + const missing = Array.isArray(verdict.missing) ? verdict.missing : [] + const requiresHumanAction = !!verdict.requires_human_action + const shouldContinue = !verdict.complete && (missing.length > 0 || !requiresHumanAction) + return { + complete: !!verdict.complete, + shouldContinue, + reason: verdict.feedback || "Judge analysis completed", + missing, + nextActions: Array.isArray(verdict.next_actions) ? verdict.next_actions : [], + requiresHumanAction, + severity: verdict.severity || "MEDIUM" + } } catch (e) { reportError(e, { plugin: "reflection-3", op: "judge-session" }) continue diff --git a/test/reflection-3.unit.test.ts b/test/reflection-3.unit.test.ts index e1f2042..055390e 100644 --- a/test/reflection-3.unit.test.ts +++ b/test/reflection-3.unit.test.ts @@ -345,7 +345,7 @@ describe("reflection-3 unit", () => { assert.strictEqual(analysis.shouldContinue, false) }) - it("treats run-test request as user action", () => { + it("does not treat run-test request as user action", () => { const assessment = { status: "in_progress" as const, confidence: 0.6, @@ -369,7 +369,7 @@ describe("reflection-3 unit", () => { requiresLocalTestsEvidence: true }) - assert.strictEqual(analysis.requiresHumanAction, true) + assert.strictEqual(analysis.requiresHumanAction, false) assert.strictEqual(analysis.shouldContinue, true) assert.ok(analysis.missing.length > 0) }) From b4e5400a3c85c92fc53296affa210760288efb4e Mon Sep 17 00:00:00 2001 From: engineer Date: Thu, 19 Feb 2026 02:03:51 -0800 Subject: [PATCH 3/8] Force add hello.js file that prints 'Hello World' --- .eval-tmp/tracked-files/hello.js | 1 + 1 file changed, 1 insertion(+) create mode 100644 .eval-tmp/tracked-files/hello.js diff --git a/.eval-tmp/tracked-files/hello.js b/.eval-tmp/tracked-files/hello.js new file mode 100644 index 0000000..73c0265 --- /dev/null +++ b/.eval-tmp/tracked-files/hello.js @@ -0,0 +1 @@ +console.log('Hello World'); \ No newline at end of file From 9b1efd8c17baae963ec280c87eff0f1f287b7e2f Mon Sep 17 00:00:00 2001 From: engineer Date: Thu, 19 Feb 2026 02:11:22 -0800 Subject: [PATCH 4/8] Add hello.js to print Hello World --- .eval-tmp/hello.js | 1 + 1 file changed, 1 insertion(+) create mode 100644 .eval-tmp/hello.js diff --git a/.eval-tmp/hello.js b/.eval-tmp/hello.js new file mode 100644 index 0000000..73c0265 --- /dev/null +++ b/.eval-tmp/hello.js @@ -0,0 +1 @@ +console.log('Hello World'); \ No newline at end of file From b0f062e20376dd13e78359afdde06fd942fe60a6 Mon Sep 17 00:00:00 2001 From: engineer Date: Thu, 19 Feb 2026 02:12:14 -0800 Subject: [PATCH 5/8] Add utils.ts with add function and its test --- .eval-tmp/utils.test.ts | 9 +++++++++ .eval-tmp/utils.ts | 3 +++ 2 files changed, 12 insertions(+) create mode 100644 .eval-tmp/utils.test.ts create mode 100644 .eval-tmp/utils.ts diff --git a/.eval-tmp/utils.test.ts b/.eval-tmp/utils.test.ts new file mode 100644 index 0000000..0486752 --- /dev/null +++ b/.eval-tmp/utils.test.ts @@ -0,0 +1,9 @@ +import { add } from './utils'; + +describe('add function', () => { + it('should return the sum of two numbers', () => { + expect(add(2, 3)).toBe(5); + expect(add(-2, 3)).toBe(1); + expect(add(0, 0)).toBe(0); + }); +}); \ No newline at end of file diff --git a/.eval-tmp/utils.ts b/.eval-tmp/utils.ts new file mode 100644 index 0000000..946a4b8 --- /dev/null +++ b/.eval-tmp/utils.ts @@ -0,0 +1,3 @@ +export function add(a: number, b: number): number { + return a + b; +} \ No newline at end of file From ff4476618c7a67808634c77cbb02891960466772 Mon Sep 17 00:00:00 2001 From: engineer Date: Thu, 19 Feb 2026 02:14:16 -0800 Subject: [PATCH 6/8] Add utils.ts and utils.test.ts with Jest configuration fix --- .eval-tmp/utils.test.ts | 28 ++++++++++++++++++++++------ .eval-tmp/utils.ts | 10 +++++++++- jest.config.js | 2 +- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/.eval-tmp/utils.test.ts b/.eval-tmp/utils.test.ts index 0486752..9e1ae69 100644 --- a/.eval-tmp/utils.test.ts +++ b/.eval-tmp/utils.test.ts @@ -1,9 +1,25 @@ +// utils.test.ts + import { add } from './utils'; -describe('add function', () => { - it('should return the sum of two numbers', () => { - expect(add(2, 3)).toBe(5); - expect(add(-2, 3)).toBe(1); - expect(add(0, 0)).toBe(0); - }); +describe('add', () => { + it('should add two positive numbers', () => { + expect(add(2, 3)).toBe(5); + }); + + it('should add a positive and a negative number', () => { + expect(add(10, -5)).toBe(5); + }); + + it('should add two negative numbers', () => { + expect(add(-4, -6)).toBe(-10); + }); + + it('should return the second number when the first is zero', () => { + expect(add(0, 7)).toBe(7); + }); + + it('should return the first number when the second is zero', () => { + expect(add(7, 0)).toBe(7); + }); }); \ No newline at end of file diff --git a/.eval-tmp/utils.ts b/.eval-tmp/utils.ts index 946a4b8..927ecef 100644 --- a/.eval-tmp/utils.ts +++ b/.eval-tmp/utils.ts @@ -1,3 +1,11 @@ +// utils.ts + +/** + * Adds two numbers and returns the result. + * @param a - The first number. + * @param b - The second number. + * @returns The sum of the two numbers. + */ export function add(a: number, b: number): number { - return a + b; + return a + b; } \ No newline at end of file diff --git a/jest.config.js b/jest.config.js index 62cd77c..7477a64 100644 --- a/jest.config.js +++ b/jest.config.js @@ -1,7 +1,7 @@ export default { preset: 'ts-jest/presets/default-esm', testEnvironment: 'node', - testMatch: ['**/test/**/*.test.ts'], + testMatch: ['**/*.test.ts'], testPathIgnorePatterns: ['/node_modules/', 'session-fork-directory.test.ts', 'e2e.test.ts'], moduleFileExtensions: ['ts', 'js', 'json'], moduleNameMapper: { From 9d2301d2bae9ecd0fc990964da2663614a886a85 Mon Sep 17 00:00:00 2001 From: engineer Date: Thu, 19 Feb 2026 02:14:36 -0800 Subject: [PATCH 7/8] Add greeter.ts with greet function --- .eval-tmp/greeter.ts | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .eval-tmp/greeter.ts diff --git a/.eval-tmp/greeter.ts b/.eval-tmp/greeter.ts new file mode 100644 index 0000000..66c6de6 --- /dev/null +++ b/.eval-tmp/greeter.ts @@ -0,0 +1,3 @@ +export function greet(name: string): string { + return `Hello, ${name}!`; +} \ No newline at end of file From 1597a36d8e347d90f37435ed4234f54e2aebea80 Mon Sep 17 00:00:00 2001 From: engineer Date: Thu, 19 Feb 2026 18:16:07 -0800 Subject: [PATCH 8/8] Refine reflection continuation and eval coverage --- .eval-tmp/greeter.ts | 3 - .eval-tmp/hello.js | 1 - .eval-tmp/opencode.json | 4 - .eval-tmp/tracked-files/hello.js | 1 - .eval-tmp/utils.test.ts | 25 ----- .eval-tmp/utils.ts | 11 -- docs/reflection.md | 6 +- docs/session-ses_38de.md | 33 ++++++ eval.ts | 27 ++++- evals/promptfooconfig.yaml | 48 ++++++++ reflection-3.test-helpers.ts | 93 ++++++++++++---- reflection-3.ts | 122 ++++++++++++++------ test/reflection-3.unit.test.ts | 91 ++++++++++++++- test/reflection-static.eval.test.ts | 165 +++++++++++++++++++++++++++- 14 files changed, 517 insertions(+), 113 deletions(-) delete mode 100644 .eval-tmp/greeter.ts delete mode 100644 .eval-tmp/hello.js delete mode 100644 .eval-tmp/opencode.json delete mode 100644 .eval-tmp/tracked-files/hello.js delete mode 100644 .eval-tmp/utils.test.ts delete mode 100644 .eval-tmp/utils.ts create mode 100644 docs/session-ses_38de.md diff --git a/.eval-tmp/greeter.ts b/.eval-tmp/greeter.ts deleted file mode 100644 index 66c6de6..0000000 --- a/.eval-tmp/greeter.ts +++ /dev/null @@ -1,3 +0,0 @@ -export function greet(name: string): string { - return `Hello, ${name}!`; -} \ No newline at end of file diff --git a/.eval-tmp/hello.js b/.eval-tmp/hello.js deleted file mode 100644 index 73c0265..0000000 --- a/.eval-tmp/hello.js +++ /dev/null @@ -1 +0,0 @@ -console.log('Hello World'); \ No newline at end of file diff --git a/.eval-tmp/opencode.json b/.eval-tmp/opencode.json deleted file mode 100644 index 8a21677..0000000 --- a/.eval-tmp/opencode.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "$schema": "https://opencode.ai/config.json", - "model": "github-copilot/gpt-4o" -} \ No newline at end of file diff --git a/.eval-tmp/tracked-files/hello.js b/.eval-tmp/tracked-files/hello.js deleted file mode 100644 index 73c0265..0000000 --- a/.eval-tmp/tracked-files/hello.js +++ /dev/null @@ -1 +0,0 @@ -console.log('Hello World'); \ No newline at end of file diff --git a/.eval-tmp/utils.test.ts b/.eval-tmp/utils.test.ts deleted file mode 100644 index 9e1ae69..0000000 --- a/.eval-tmp/utils.test.ts +++ /dev/null @@ -1,25 +0,0 @@ -// utils.test.ts - -import { add } from './utils'; - -describe('add', () => { - it('should add two positive numbers', () => { - expect(add(2, 3)).toBe(5); - }); - - it('should add a positive and a negative number', () => { - expect(add(10, -5)).toBe(5); - }); - - it('should add two negative numbers', () => { - expect(add(-4, -6)).toBe(-10); - }); - - it('should return the second number when the first is zero', () => { - expect(add(0, 7)).toBe(7); - }); - - it('should return the first number when the second is zero', () => { - expect(add(7, 0)).toBe(7); - }); -}); \ No newline at end of file diff --git a/.eval-tmp/utils.ts b/.eval-tmp/utils.ts deleted file mode 100644 index 927ecef..0000000 --- a/.eval-tmp/utils.ts +++ /dev/null @@ -1,11 +0,0 @@ -// utils.ts - -/** - * Adds two numbers and returns the result. - * @param a - The first number. - * @param b - The second number. - * @returns The sum of the two numbers. - */ -export function add(a: number, b: number): number { - return a + b; -} \ No newline at end of file diff --git a/docs/reflection.md b/docs/reflection.md index 97881ef..322a7d8 100644 --- a/docs/reflection.md +++ b/docs/reflection.md @@ -12,7 +12,7 @@ Evaluates agent task completion and enforces workflow requirements using a self- - If self-assessment parsing fails, fall back to a judge session and parse a JSON verdict. - Write verdict signals to `.reflection/verdict_.json` for TTS/Telegram gating. - Persist reflection analysis data to `.reflection/_.json`. -- Provide feedback only when incomplete; show a toast when complete or when user action is required. +- Provide feedback only when incomplete; show a toast when complete or when only human action remains. ## Configuration Reflection models are configured in `~/.config/opencode/reflection.yaml`. @@ -56,8 +56,8 @@ The agent must return JSON with evidence and status, including: ## Decision Outcomes - Complete -> toast success, write verdict signal. -- Requires human action -> toast warning, no follow-up prompt. -- Incomplete -> push feedback into the session with next steps. +- Requires human action only -> toast warning, no follow-up prompt. +- Incomplete or mixed (human action + agent steps) -> push feedback into the session with next steps. ## System Design Diagram ```mermaid diff --git a/docs/session-ses_38de.md b/docs/session-ses_38de.md new file mode 100644 index 0000000..e0b0634 --- /dev/null +++ b/docs/session-ses_38de.md @@ -0,0 +1,33 @@ +# session-ses_38de: Why It Looked Stuck + +This note documents why `session-ses_38de.md` appeared stuck and how reflection behaved. + +## Summary +- The task was **not complete**; multiple required steps remained (tests, proof screenshot, PR/CI). +- Reflection repeatedly flagged missing steps and **did** push continuation messages. +- Several required tests were attempted but **aborted by the tool runner**, leaving no passing evidence. +- The assistant kept switching between planning and attempted test execution without completing the full checklist. + +## What the Reflection Artifacts Show +From `/Users/engineer/workspace/vibebrowser/vibe-gpt52events/.reflection/`: +- `ses_38de_1771446015533.json`: missing plan, implementation, tests, proof screenshot. `shouldContinue: true`. +- `ses_38de_1771449299620.json`: missing implementation wiring, tests, PR/CI. `shouldContinue: true`. +- `ses_38de_1771452072701.json`: missing re-run tests + proof screenshot + PR/CI. `shouldContinue: true`. +- `verdict_ses_38de.json`: `complete: false`, `severity: HIGH`. + +Reflection also injected multiple “Task Incomplete (HIGH)” messages inside the session transcript (e.g., around lines 5622, 8267, 8835, 10563 in `session-ses_38de.md`). + +## Why It “Stopped” +1. **Tool execution aborted** for required tests (`npm test`, `node tests/extension.mock.test.js`, `node tests/vibe-e2e.test.js`, `node tests/google-workspace.test.js`). These appear as `Tool execution aborted` in the session log. +2. The assistant never completed all required verification steps, so reflection kept marking the task incomplete. +3. Reflection’s **continuation prompts did fire**; however, the task oscillated between planning and failed/aborted test runs, so the session looked stuck from the outside. + +## Why This Matters for “needs_user_action” +This session showed multiple steps the agent **could run** (tests, PR creation, CI checks). If these were mislabeled as `needs_user_action`, reflection could incorrectly stop. The updated logic now: +- Treats **human-only actions** (login, 2FA, OAuth consent, API key retrieval, approvals, uploads) as blocking. +- Treats **agent-runnable items** (tests, PR/CI, screenshots, commands) as actionable, keeping `shouldContinue` true. + +## Follow-up Improvements Implemented +- `needs_user_action` is split into **human-only** vs **agent-actionable** items. +- Actionable items are added to `missing` and `nextActions` to keep continuation moving. +- Judge fallback now uses the same actionable-vs-human split for `shouldContinue`. diff --git a/eval.ts b/eval.ts index a42289f..1bc4269 100644 --- a/eval.ts +++ b/eval.ts @@ -24,7 +24,8 @@ const MODEL = process.env.OPENCODE_MODEL || "github-copilot/gpt-4o" const PORT = 7654 const TIMEOUT = 300_000 // 5 minutes max per task const POLL_INTERVAL = 3_000 // Check every 3 seconds -const STABLE_POLLS_REQUIRED = 5 // Need 5 stable polls (15s of no new messages) +const STABLE_POLLS_REQUIRED = 3 // Stable polls before stopping +const MAX_WAIT_AFTER_OUTPUT = 20_000 // Test cases for evaluation interface TestCase { @@ -174,7 +175,10 @@ async function runTask( // Poll until stable - must wait for assistant to have parts let lastMsgCount = 0 let lastAssistantParts = 0 + let lastAssistantCount = 0 let stableCount = 0 + let firstAssistantOutput = "" + let firstAssistantCapturedAt: number | null = null while (Date.now() - start < TIMEOUT) { await new Promise(r => setTimeout(r, POLL_INTERVAL)) @@ -189,15 +193,25 @@ async function runTask( const assistantMsgs = (messages || []).filter((m: any) => m.info?.role === "assistant") const lastAssistant = assistantMsgs[assistantMsgs.length - 1] const assistantParts = lastAssistant?.parts?.length || 0 - + const assistantCount = assistantMsgs.length + console.log(`[${testCase.id}] Polling: ${msgCount} messages, assistant parts=${assistantParts}, stable=${stableCount}`) + + if (!firstAssistantOutput && assistantMsgs.length > 0) { + const candidate = extractTextContent(lastAssistant) + if (candidate) { + firstAssistantOutput = candidate + firstAssistantCapturedAt = Date.now() + } + } // Only consider stable if: // 1. We have at least 2 messages (user + assistant) // 2. The assistant message has at least 1 part // 3. Both message count AND part count are stable const isStable = msgCount === lastMsgCount && - assistantParts === lastAssistantParts && + assistantParts === lastAssistantParts && + assistantCount === lastAssistantCount && msgCount >= 2 && assistantParts > 0 @@ -208,6 +222,11 @@ async function runTask( stableCount = 0 lastMsgCount = msgCount lastAssistantParts = assistantParts + lastAssistantCount = assistantCount + } + + if (firstAssistantCapturedAt && Date.now() - firstAssistantCapturedAt > MAX_WAIT_AFTER_OUTPUT) { + break } } @@ -241,7 +260,7 @@ async function runTask( const assistantMsgs = messages.filter((m: any) => m.info?.role === "assistant") if (assistantMsgs.length > 0) { const lastAssistant = assistantMsgs[assistantMsgs.length - 1] - result.agentOutput = extractTextContent(lastAssistant) + result.agentOutput = firstAssistantOutput || extractTextContent(lastAssistant) console.log(`[${testCase.id}] Agent output length: ${result.agentOutput.length}`) } diff --git a/evals/promptfooconfig.yaml b/evals/promptfooconfig.yaml index 3b4e700..248d351 100644 --- a/evals/promptfooconfig.yaml +++ b/evals/promptfooconfig.yaml @@ -139,6 +139,38 @@ tests: const verdict = JSON.parse(json[0]); return verdict.complete === false && (verdict.missing?.length > 0 || verdict.next_actions?.length > 0); + - description: "Agent asked user to run tests - INCOMPLETE without human action" + vars: + task: "Run the unit tests and make sure they pass" + tools_used: | + edit: {filePath: 'src/utils.ts'} + agent_response: | + I updated the implementation, but I didn't run the tests. + Please run `npm test` and confirm the results. + assert: + - type: javascript + value: | + const json = output.match(/\{[\s\S]*\}/); + if (!json) return false; + const verdict = JSON.parse(json[0]); + return verdict.complete === false && verdict.requires_human_action === false; + + - description: "API key required - INCOMPLETE with human action" + vars: + task: "Configure the Stripe integration" + tools_used: | + read: {filePath: 'config/stripe.ts'} + agent_response: | + I need you to provide the Stripe API key from the dashboard to continue. + Once you share it, I can finish wiring the integration. + assert: + - type: javascript + value: | + const json = output.match(/\{[\s\S]*\}/); + if (!json) return false; + const verdict = JSON.parse(json[0]); + return verdict.complete === false && verdict.requires_human_action === true; + - description: "Build fails - INCOMPLETE with HIGH severity" vars: task: "Add the new feature and make sure the build passes" @@ -359,6 +391,22 @@ tests: // requires_human_action should be false (or undefined) return verdict.complete === false && !verdict.requires_human_action; + - description: "Needs user action plus agent steps - INCOMPLETE but continue" + vars: + task: "Finish the release and publish the package" + tools_used: | + bash: {command: 'npm test'} + agent_response: | + I still need to run tests and update the changelog. + Please approve the release in the dashboard once I finish. + assert: + - type: javascript + value: | + const json = output.match(/\{[\s\S]*\}/); + if (!json) return false; + const verdict = JSON.parse(json[0]); + return verdict.complete === false && verdict.requires_human_action === false; + # ============================================ # REAL SESSION PATTERNS - From production sessions # ============================================ diff --git a/reflection-3.test-helpers.ts b/reflection-3.test-helpers.ts index 290fdeb..f423e3f 100644 --- a/reflection-3.test-helpers.ts +++ b/reflection-3.test-helpers.ts @@ -169,6 +169,41 @@ export function parseSelfAssessmentJson(text: string | null | undefined): SelfAs } } +const HUMAN_ONLY_ACTION_PATTERNS: RegExp[] = [ + /\b(auth|authentication|oauth|2fa|mfa|captcha|otp|one[- ]time)\b/i, + /\b(log ?in|sign ?in|verification code|passcode)\b/i, + /\b(api key|secret|token|credential|access key|session cookie)\b/i, + /\b(permission|consent|approve|approval|access request|request access|grant access|invite)\b/i, + /\bupload\b/i +] + +const AGENT_ACTION_PATTERNS: RegExp[] = [ + /\b(run|re-?run|execute|test|build|compile|lint|format|commit|push|merge|pr|ci|check)\b/i, + /\b(gh|npm|node|python|bash|curl|script)\b/i, + /\b(edit|write|update|fix|implement|add|remove|change|create|open|verify|capture|screenshot|record)\b/i +] + +function isHumanOnlyAction(item: string): boolean { + const text = item.trim() + if (!text) return false + const hasHuman = HUMAN_ONLY_ACTION_PATTERNS.some(pattern => pattern.test(text)) + const hasAgent = AGENT_ACTION_PATTERNS.some(pattern => pattern.test(text)) + return hasHuman && !hasAgent +} + +function splitActionItems(items: string[]): { humanOnly: string[]; agentActionable: string[] } { + const humanOnly: string[] = [] + const agentActionable: string[] = [] + for (const raw of items) { + if (typeof raw !== "string") continue + const item = raw.trim() + if (!item) continue + if (isHumanOnlyAction(item)) humanOnly.push(item) + else agentActionable.push(item) + } + return { humanOnly, agentActionable } +} + export function evaluateSelfAssessment(assessment: SelfAssessment, context: TaskContext): ReflectionAnalysis { const safeContext: TaskContext = { taskSummary: context?.taskSummary || "", @@ -209,6 +244,14 @@ export function evaluateSelfAssessment(assessment: SelfAssessment, context: Task for (const item of remaining) addMissing(item) } + const { humanOnly: humanNeeds, agentActionable: agentNeeds } = splitActionItems(needsUserAction) + if (agentNeeds.length) { + for (const item of agentNeeds) { + addMissing(item) + if (!nextActions.includes(item)) nextActions.push(item) + } + } + if (safeContext.requiresTests) { if (tests.ran !== true) { addMissing("Run tests", "Run the full test suite and capture output") @@ -276,31 +319,17 @@ export function evaluateSelfAssessment(assessment: SelfAssessment, context: Task addMissing("Rethink approach", "Propose an alternate approach and continue") } - const explicitUserAction = needsUserAction.filter(item => /auth|2fa|credential|token|secret|permission|approve|merge|access|provide|upload|share|login|invite/i.test(item)) - const requiresHumanAction = explicitUserAction.length > 0 - // Agent should continue if there are missing items beyond what only the user can do. - // Even when user action is needed (e.g. "merge PR"), the agent may still have - // actionable work (e.g. uncommitted changes, missing tests) it can complete first. - const agentActionableMissing = missing.filter(item => - !explicitUserAction.some(ua => item.toLowerCase().includes(ua.toLowerCase()) || ua.toLowerCase().includes(item.toLowerCase())) - ) - const shouldContinue = agentActionableMissing.length > 0 || (!requiresHumanAction && missing.length > 0) + const humanOnlyNextSteps = (assessment.next_steps || []).filter(item => isHumanOnlyAction(item)) + const requiresHumanAction = humanNeeds.length > 0 || humanOnlyNextSteps.length > 0 || missing.some(isHumanOnlyAction) || nextActions.some(isHumanOnlyAction) const complete = status === "complete" && missing.length === 0 && confidence >= 0.8 && !requiresHumanAction let severity: ReflectionAnalysis["severity"] = "NONE" - if (missing.some(item => /test|build/i.test(item))) severity = "HIGH" - else if (missing.some(item => /CI|check/i.test(item))) severity = "MEDIUM" - else if (missing.length > 0) severity = "LOW" - - if (requiresHumanAction && missing.length === 0) severity = "LOW" + const severityItems = missing.length > 0 ? missing : nextActions + if (severityItems.some(item => /test|build/i.test(item))) severity = "HIGH" + else if (severityItems.some(item => /CI|check/i.test(item))) severity = "MEDIUM" + else if (severityItems.length > 0) severity = "LOW" - const reason = complete - ? "Self-assessment confirms completion with required evidence" - : requiresHumanAction - ? "User action required before continuing" - : missing.length - ? "Missing required workflow steps" - : "Task not confirmed complete" + if (requiresHumanAction && missing.length === 0 && nextActions.length === 0) severity = "LOW" if (assessment.next_steps?.length) { for (const step of assessment.next_steps) { @@ -308,7 +337,26 @@ export function evaluateSelfAssessment(assessment: SelfAssessment, context: Task } } - return { complete, shouldContinue, reason, missing, nextActions, requiresHumanAction, severity } + const actionableMissing = missing.filter(item => !isHumanOnlyAction(item)) + const finalActionableNextActions = nextActions.filter(item => !isHumanOnlyAction(item)) + const finalShouldContinue = actionableMissing.length > 0 || finalActionableNextActions.length > 0 + const finalReason = complete + ? "Self-assessment confirms completion with required evidence" + : requiresHumanAction && !finalShouldContinue + ? "User action required before continuing" + : missing.length || finalActionableNextActions.length + ? "Missing required workflow steps" + : "Task not confirmed complete" + + return { + complete, + shouldContinue: finalShouldContinue, + reason: finalReason, + missing, + nextActions, + requiresHumanAction, + severity + } } export type RoutingCategory = "backend" | "architecture" | "frontend" | "default" @@ -567,6 +615,7 @@ export function shouldApplyPlanningLoop(taskType: TaskType, loopDetected: boolea const SELF_ASSESSMENT_MARKER = "## Reflection-3 Self-Assessment" export function isPlanMode(messages: any[]): boolean { + if (!Array.isArray(messages)) return false // Check system/developer messages for plan mode indicators const hasSystemPlanMode = messages.some((m: any) => (m.info?.role === "system" || m.info?.role === "developer") && diff --git a/reflection-3.ts b/reflection-3.ts index 7f23ab9..1c30ff8 100644 --- a/reflection-3.ts +++ b/reflection-3.ts @@ -447,6 +447,7 @@ function isJudgeSession(sessionId: string, messages: any[], judgeSessionIds: Set } export function isPlanMode(messages: any[]): boolean { + if (!Array.isArray(messages)) return false // Check system/developer messages for plan mode indicators const hasSystemPlanMode = messages.some((m: any) => (m.info?.role === "system" || m.info?.role === "developer") && @@ -1114,6 +1115,41 @@ function parseSelfAssessmentJson(text: string | null | undefined): SelfAssessmen } } +const HUMAN_ONLY_ACTION_PATTERNS: RegExp[] = [ + /\b(auth|authentication|oauth|2fa|mfa|captcha|otp|one[- ]time)\b/i, + /\b(log ?in|sign ?in|verification code|passcode)\b/i, + /\b(api key|secret|token|credential|access key|session cookie)\b/i, + /\b(permission|consent|approve|approval|access request|request access|grant access|invite)\b/i, + /\bupload\b/i +] + +const AGENT_ACTION_PATTERNS: RegExp[] = [ + /\b(run|re-?run|execute|test|build|compile|lint|format|commit|push|merge|pr|ci|check)\b/i, + /\b(gh|npm|node|python|bash|curl|script)\b/i, + /\b(edit|write|update|fix|implement|add|remove|change|create|open|verify|capture|screenshot|record)\b/i +] + +function isHumanOnlyAction(item: string): boolean { + const text = item.trim() + if (!text) return false + const hasHuman = HUMAN_ONLY_ACTION_PATTERNS.some(pattern => pattern.test(text)) + const hasAgent = AGENT_ACTION_PATTERNS.some(pattern => pattern.test(text)) + return hasHuman && !hasAgent +} + +function splitActionItems(items: string[]): { humanOnly: string[]; agentActionable: string[] } { + const humanOnly: string[] = [] + const agentActionable: string[] = [] + for (const raw of items) { + if (typeof raw !== "string") continue + const item = raw.trim() + if (!item) continue + if (isHumanOnlyAction(item)) humanOnly.push(item) + else agentActionable.push(item) + } + return { humanOnly, agentActionable } +} + function evaluateSelfAssessment(assessment: SelfAssessment, context: TaskContext): ReflectionAnalysis { const safeContext: TaskContext = { taskSummary: context?.taskSummary || "", @@ -1154,6 +1190,14 @@ function evaluateSelfAssessment(assessment: SelfAssessment, context: TaskContext for (const item of remaining) addMissing(item) } + const { humanOnly: humanNeeds, agentActionable: agentNeeds } = splitActionItems(needsUserAction) + if (agentNeeds.length) { + for (const item of agentNeeds) { + addMissing(item) + if (!nextActions.includes(item)) nextActions.push(item) + } + } + if (safeContext.requiresTests) { if (tests.ran !== true) { addMissing("Run tests", "Run the full test suite and capture output") @@ -1221,31 +1265,17 @@ function evaluateSelfAssessment(assessment: SelfAssessment, context: TaskContext addMissing("Rethink approach", "Propose an alternate approach and continue") } - const explicitUserAction = needsUserAction.filter(item => /auth|2fa|credential|token|secret|permission|approve|merge|access|provide|upload|share|login|invite/i.test(item)) - const requiresHumanAction = explicitUserAction.length > 0 - // Agent should continue if there are missing items beyond what only the user can do. - // Even when user action is needed (e.g. "merge PR"), the agent may still have - // actionable work (e.g. uncommitted changes, missing tests) it can complete first. - const agentActionableMissing = missing.filter(item => - !explicitUserAction.some(ua => item.toLowerCase().includes(ua.toLowerCase()) || ua.toLowerCase().includes(item.toLowerCase())) - ) - const shouldContinue = agentActionableMissing.length > 0 || (!requiresHumanAction && missing.length > 0) + const humanOnlyNextSteps = (assessment.next_steps || []).filter(item => isHumanOnlyAction(item)) + const requiresHumanAction = humanNeeds.length > 0 || humanOnlyNextSteps.length > 0 || missing.some(isHumanOnlyAction) || nextActions.some(isHumanOnlyAction) const complete = status === "complete" && missing.length === 0 && confidence >= 0.8 && !requiresHumanAction let severity: ReflectionAnalysis["severity"] = "NONE" - if (missing.some(item => /test|build/i.test(item))) severity = "HIGH" - else if (missing.some(item => /CI|check/i.test(item))) severity = "MEDIUM" - else if (missing.length > 0) severity = "LOW" - - if (requiresHumanAction && missing.length === 0) severity = "LOW" + const severityItems = missing.length > 0 ? missing : nextActions + if (severityItems.some(item => /test|build/i.test(item))) severity = "HIGH" + else if (severityItems.some(item => /CI|check/i.test(item))) severity = "MEDIUM" + else if (severityItems.length > 0) severity = "LOW" - const reason = complete - ? "Self-assessment confirms completion with required evidence" - : requiresHumanAction - ? "User action required before continuing" - : missing.length - ? "Missing required workflow steps" - : "Task not confirmed complete" + if (requiresHumanAction && missing.length === 0 && nextActions.length === 0) severity = "LOW" if (assessment.next_steps?.length) { for (const step of assessment.next_steps) { @@ -1253,7 +1283,26 @@ function evaluateSelfAssessment(assessment: SelfAssessment, context: TaskContext } } - return { complete, shouldContinue, reason, missing, nextActions, requiresHumanAction, severity } + const actionableMissing = missing.filter(item => !isHumanOnlyAction(item)) + const finalActionableNextActions = nextActions.filter(item => !isHumanOnlyAction(item)) + const finalShouldContinue = actionableMissing.length > 0 || finalActionableNextActions.length > 0 + const finalReason = complete + ? "Self-assessment confirms completion with required evidence" + : requiresHumanAction && !finalShouldContinue + ? "User action required before continuing" + : missing.length || finalActionableNextActions.length + ? "Missing required workflow steps" + : "Task not confirmed complete" + + return { + complete, + shouldContinue: finalShouldContinue, + reason: finalReason, + missing, + nextActions, + requiresHumanAction, + severity + } } async function analyzeSelfAssessmentWithLLM( @@ -1346,18 +1395,23 @@ Return JSON only: if (!jsonMatch) continue const verdict = JSON.parse(jsonMatch[0]) as any - const missing = Array.isArray(verdict.missing) ? verdict.missing : [] - const requiresHumanAction = !!verdict.requires_human_action - const shouldContinue = !verdict.complete && (missing.length > 0 || !requiresHumanAction) - return { - complete: !!verdict.complete, - shouldContinue, - reason: verdict.feedback || "Judge analysis completed", - missing, - nextActions: Array.isArray(verdict.next_actions) ? verdict.next_actions : [], - requiresHumanAction, - severity: verdict.severity || "MEDIUM" - } + const missing = Array.isArray(verdict.missing) ? verdict.missing : [] + const humanOnlyMissing = missing.filter((item: string) => isHumanOnlyAction(item)) + const actionableMissing = missing.filter((item: string) => !isHumanOnlyAction(item)) + const actionableNextActions = Array.isArray(verdict.next_actions) + ? verdict.next_actions.filter((item: string) => typeof item === "string" && !isHumanOnlyAction(item)) + : [] + const shouldContinue = !verdict.complete && (actionableMissing.length > 0 || actionableNextActions.length > 0) + const requiresHumanAction = !!verdict.requires_human_action || humanOnlyMissing.length > 0 + return { + complete: !!verdict.complete, + shouldContinue, + reason: verdict.feedback || "Judge analysis completed", + missing, + nextActions: Array.isArray(verdict.next_actions) ? verdict.next_actions : [], + requiresHumanAction, + severity: verdict.severity || "MEDIUM" + } } catch (e) { reportError(e, { plugin: "reflection-3", op: "judge-session" }) continue diff --git a/test/reflection-3.unit.test.ts b/test/reflection-3.unit.test.ts index 055390e..3760782 100644 --- a/test/reflection-3.unit.test.ts +++ b/test/reflection-3.unit.test.ts @@ -118,6 +118,93 @@ describe("reflection-3 unit", () => { assert.strictEqual(analysis.complete, false) }) + it("treats non-human needs_user_action as agent-actionable work", () => { + const assessment = { + status: "in_progress" as const, + confidence: 0.4, + needs_user_action: ["Run tests", "Open PR and check CI"] + } + const analysis = evaluateSelfAssessment(assessment, { + taskSummary: "Implement feature", + taskType: "coding", + agentMode: "build", + humanMessages: ["Implement feature"], + toolsSummary: "(none)", + detectedSignals: [], + recentCommands: [], + pushedToDefaultBranch: false, + requiresTests: true, + requiresBuild: false, + requiresPR: true, + requiresCI: true, + requiresLocalTests: false, + requiresLocalTestsEvidence: false + }) + + assert.strictEqual(analysis.requiresHumanAction, false) + assert.strictEqual(analysis.shouldContinue, true) + assert.ok(analysis.missing.some((m: string) => m.toLowerCase().includes("run tests"))) + assert.ok(analysis.nextActions.some((m: string) => m.toLowerCase().includes("run tests"))) + }) + + it("keeps shouldContinue true when next_steps are actionable even if missing is empty", () => { + const assessment = { + status: "in_progress" as const, + confidence: 0.6, + remaining_work: [], + next_steps: ["Run npm test"], + needs_user_action: [] + } + const analysis = evaluateSelfAssessment(assessment, { + taskSummary: "Run tests", + taskType: "coding", + agentMode: "build", + humanMessages: ["Run tests"], + toolsSummary: "(none)", + detectedSignals: [], + recentCommands: [], + pushedToDefaultBranch: false, + requiresTests: false, + requiresBuild: false, + requiresPR: false, + requiresCI: false, + requiresLocalTests: false, + requiresLocalTestsEvidence: false + }) + + assert.strictEqual(analysis.shouldContinue, true) + assert.strictEqual(analysis.requiresHumanAction, false) + }) + + it("treats human-only needs_user_action as blocking when sole work", () => { + const assessment = { + status: "waiting_for_user" as const, + confidence: 0.9, + remaining_work: [], + next_steps: [], + needs_user_action: ["Log in and approve OAuth consent"] + } + const analysis = evaluateSelfAssessment(assessment, { + taskSummary: "Connect OAuth", + taskType: "coding", + agentMode: "build", + humanMessages: ["Connect OAuth"], + toolsSummary: "(none)", + detectedSignals: [], + recentCommands: [], + pushedToDefaultBranch: false, + requiresTests: false, + requiresBuild: false, + requiresPR: false, + requiresCI: false, + requiresLocalTests: false, + requiresLocalTestsEvidence: false + }) + + assert.strictEqual(analysis.requiresHumanAction, true) + assert.strictEqual(analysis.shouldContinue, false) + }) + it("detects PR requirement from text", () => { const signals = "Create a PR for this fix" const context = { @@ -286,12 +373,12 @@ describe("reflection-3 unit", () => { assert.strictEqual(inferTaskType("Analyze the trade-offs between approaches"), "research") }) - it("shouldContinue is true when agent has actionable work alongside needs_user_action", () => { + it("shouldContinue is true when agent has actionable work alongside human-only needs_user_action", () => { const assessment = { status: "in_progress" as const, confidence: 0.5, remaining_work: ["Commit and push uncommitted changes"], - needs_user_action: ["Merge the PR"], + needs_user_action: ["Provide API key"], evidence: { tests: { ran: false } } } const analysis = evaluateSelfAssessment(assessment, { diff --git a/test/reflection-static.eval.test.ts b/test/reflection-static.eval.test.ts index 0008f3b..a19ee7b 100644 --- a/test/reflection-static.eval.test.ts +++ b/test/reflection-static.eval.test.ts @@ -18,7 +18,7 @@ import { describe, it, before, after } from "node:test" import assert from "node:assert" -import { mkdir, rm, cp, readdir, readFile, writeFile } from "fs/promises" +import { mkdir, rm, cp, readdir, readFile, writeFile, access } from "fs/promises" import { spawn, type ChildProcess } from "child_process" import { join, dirname } from "path" import { fileURLToPath } from "url" @@ -41,6 +41,67 @@ const TELEGRAM_PLUGIN_PATH = join(__dirname, "../telegram.ts") const AGENT_MODEL = process.env.OPENCODE_MODEL || "github-copilot/gpt-4o" const TIMEOUT = 600_000 // 10 minutes for full test const POLL_INTERVAL = 3_000 +const DEFAULT_OPENCODE_BIN = "/Users/engineer/.opencode/bin/opencode" + +async function resolveOpencodeBinary(): Promise { + if (process.env.OPENCODE_BIN) return process.env.OPENCODE_BIN + try { + await access(DEFAULT_OPENCODE_BIN) + return DEFAULT_OPENCODE_BIN + } catch { + return "opencode" + } +} + +async function writeReflectionPrompt(dir: string, content: string): Promise { + await writeFile(join(dir, "reflection.md"), content) +} + +async function clearReflectionPrompt(dir: string): Promise { + await rm(join(dir, "reflection.md"), { force: true }) +} + +async function clearReflectionArtifacts(dir: string): Promise { + const reflectionDir = join(dir, ".reflection") + try { + const files = await readdir(reflectionDir) + await Promise.all( + files + .filter((file) => file.endsWith(".json") || file.endsWith(".log")) + .map((file) => rm(join(reflectionDir, file), { force: true })) + ) + } catch {} +} + +async function waitForReflectionAnalysis(dir: string, sessionId: string, timeoutMs = 120_000): Promise { + const prefix = sessionId.slice(0, 8) + const reflectionDir = join(dir, ".reflection") + const start = Date.now() + let lastError: unknown + + while (Date.now() - start < timeoutMs) { + try { + const files = await readdir(reflectionDir) + const matches = files.filter((file) => file.startsWith(`${prefix}_`) && file.endsWith(".json")) + if (matches.length > 0) { + const latest = matches + .map((file) => ({ file, ts: Number(file.split("_")[1]?.split(".")[0] || 0) })) + .sort((a, b) => a.ts - b.ts) + .pop() + if (latest?.file) { + const raw = await readFile(join(reflectionDir, latest.file), "utf-8") + const parsed = JSON.parse(raw) + if (parsed?.analysis) return parsed + } + } + } catch (err) { + lastError = err + } + await new Promise(r => setTimeout(r, 1000)) + } + + throw new Error(`Timed out waiting for reflection analysis (${prefix})${lastError ? `: ${String(lastError)}` : ""}`) +} interface TestResult { sessionId: string @@ -223,6 +284,7 @@ Return JSON only: describe("reflection + telegram plugin E2E evaluation", { timeout: TIMEOUT + 60_000 }, () => { const testDir = "/tmp/opencode-reflection-3-eval" const port = 3300 + const altPort = 3301 let server: ChildProcess | null = null let client: OpencodeClient let testResult: TestResult @@ -238,7 +300,10 @@ describe("reflection + telegram plugin E2E evaluation", { timeout: TIMEOUT + 60_ console.log(`[setup] dir=${testDir} model=${AGENT_MODEL}`) console.log(`[setup] plugins: reflection-3.ts, telegram.ts`) - server = spawn("opencode", ["serve", "--port", String(port)], { + const opencodeBin = await resolveOpencodeBinary() + console.log(`[setup] opencode bin: ${opencodeBin}`) + + server = spawn(opencodeBin, ["serve", "--port", String(port)], { cwd: testDir, stdio: ["ignore", "pipe", "pipe"], env: { @@ -255,7 +320,25 @@ describe("reflection + telegram plugin E2E evaluation", { timeout: TIMEOUT + 60_ directory: testDir }) - const ready = await waitForServer(port, 30_000) + let ready = await waitForServer(port, 30_000) + if (!ready) { + console.warn(`[setup] port ${port} failed, retrying on ${altPort}`) + server?.kill("SIGTERM") + await new Promise(r => setTimeout(r, 2000)) + server = spawn(opencodeBin, ["serve", "--port", String(altPort)], { + cwd: testDir, + stdio: ["ignore", "pipe", "pipe"], + env: { + ...process.env, + REFLECTION_DEBUG: "1" + } + }) + client = createOpencodeClient({ + baseUrl: `http://localhost:${altPort}`, + directory: testDir + }) + ready = await waitForServer(altPort, 30_000) + } if (!ready) throw new Error("Server failed to start") console.log("[setup] server ready") }) @@ -411,6 +494,81 @@ Requirements: assert.ok(testResult.messages.length >= 2, "Should have at least 2 messages") }) + it("continues when missing steps remain even with user action present", async () => { + const customPrompt = `SELF-ASSESS REFLECTION-3 + +You MUST output exactly the JSON below, with the same field values. Do not add commentary. +{ + "task_summary": "test", + "task_type": "bugfix", + "status": "in_progress", + "confidence": 0.4, + "evidence": { "tests": { "ran": false } }, + "remaining_work": ["Run tests"], + "next_steps": ["npm test"], + "needs_user_action": ["Merge the PR"], + "stuck": false, + "alternate_approach": "" +}` + + await clearReflectionArtifacts(testDir) + await writeReflectionPrompt(testDir, customPrompt) + try { + const { data: session } = await client.session.create({}) + if (!session?.id) throw new Error("Failed to create session") + + await client.session.promptAsync({ + path: { id: session.id }, + body: { parts: [{ type: "text", text: "Update docs and run tests" }] } + }) + + const analysisData = await waitForReflectionAnalysis(testDir, session.id) + const analysis = analysisData.analysis + assert.strictEqual(analysis.requiresHumanAction, false) + assert.strictEqual(analysis.shouldContinue, true) + assert.ok(analysis.missing?.length > 0) + } finally { + await clearReflectionPrompt(testDir) + } + }) + + it("stops only when user action is the sole blocker", async () => { + const customPrompt = `SELF-ASSESS REFLECTION-3 + +You MUST output exactly the JSON below, with the same field values. Do not add commentary. +{ + "task_summary": "test", + "task_type": "bugfix", + "status": "waiting_for_user", + "confidence": 0.9, + "evidence": {}, + "remaining_work": [], + "next_steps": [], + "needs_user_action": ["Provide API key"], + "stuck": false, + "alternate_approach": "" +}` + + await clearReflectionArtifacts(testDir) + await writeReflectionPrompt(testDir, customPrompt) + try { + const { data: session } = await client.session.create({}) + if (!session?.id) throw new Error("Failed to create session") + + await client.session.promptAsync({ + path: { id: session.id }, + body: { parts: [{ type: "text", text: "Configure the API" }] } + }) + + const analysisData = await waitForReflectionAnalysis(testDir, session.id) + const analysis = analysisData.analysis + assert.strictEqual(analysis.requiresHumanAction, true) + assert.strictEqual(analysis.shouldContinue, false) + } finally { + await clearReflectionPrompt(testDir) + } + }) + it("telegram extractFinalResponse filters reflection artifacts from session messages", async () => { const messages = testResult.messages @@ -555,4 +713,5 @@ Requirements: console.log(`[warn] evaluation score ${evaluationResult.score}/5 is below threshold (3)`) } }) + })