Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .eval-tmp/hello.js
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
// Emit the greeting on standard output.
const greeting = 'Hello World';
console.log(greeting);
1 change: 1 addition & 0 deletions .eval-tmp/tracked-files/hello.js
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
// Tracked copy of the greeting script: prints a single line to stdout.
const message = 'Hello World';
console.log(message);
9 changes: 9 additions & 0 deletions .eval-tmp/utils.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import { add } from './utils';

describe('add function', () => {
  it('should return the sum of two numbers', () => {
    // Each row is [left operand, right operand, expected sum].
    const cases: Array<[number, number, number]> = [
      [2, 3, 5],
      [-2, 3, 1],
      [0, 0, 0]
    ];

    for (const [left, right, expected] of cases) {
      expect(add(left, right)).toBe(expected);
    }
  });
});
3 changes: 3 additions & 0 deletions .eval-tmp/utils.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/**
 * Adds two numbers.
 *
 * @param a - First addend.
 * @param b - Second addend.
 * @returns The result of `a + b`.
 */
export function add(a: number, b: number): number {
  const sum = a + b;
  return sum;
}
5 changes: 3 additions & 2 deletions reflection-3.test-helpers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -276,12 +276,13 @@ export function evaluateSelfAssessment(assessment: SelfAssessment, context: Task
addMissing("Rethink approach", "Propose an alternate approach and continue")
}

const requiresHumanAction = needsUserAction.length > 0
const explicitUserAction = needsUserAction.filter(item => /auth|2fa|credential|token|secret|permission|approve|merge|access|provide|upload|share|login|invite/i.test(item))
const requiresHumanAction = explicitUserAction.length > 0
// Agent should continue if there are missing items beyond what only the user can do.
// Even when user action is needed (e.g. "merge PR"), the agent may still have
// actionable work (e.g. uncommitted changes, missing tests) it can complete first.
const agentActionableMissing = missing.filter(item =>
!needsUserAction.some(ua => item.toLowerCase().includes(ua.toLowerCase()) || ua.toLowerCase().includes(item.toLowerCase()))
!explicitUserAction.some(ua => item.toLowerCase().includes(ua.toLowerCase()) || ua.toLowerCase().includes(item.toLowerCase()))
)
const shouldContinue = agentActionableMissing.length > 0 || (!requiresHumanAction && missing.length > 0)
const complete = status === "complete" && missing.length === 0 && confidence >= 0.8 && !requiresHumanAction
Expand Down
28 changes: 16 additions & 12 deletions reflection-3.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1096,7 +1096,7 @@ Rules:
- Tests cannot be skipped or marked as flaky/not important.
- Direct pushes to main/master are not allowed; require a PR instead.
- If stuck, propose an alternate approach.
- If you need user action (auth, 2FA, credentials), list it in needs_user_action.
- If you need user action (auth, 2FA, credentials, access requests, uploads, approvals), list it in needs_user_action.
- PLANNING LOOP CHECK: If the task requires code changes (fix, implement, add, create, build, refactor, update) but the "Tool Commands Run" section shows ONLY read operations (read, glob, grep, git log, git status, git diff, webfetch, task/explore) and NO write operations (edit, write, bash with build/test/commit, github_create_pull_request, etc.), then the task is NOT complete. Set status to "in_progress", set stuck to true, and list "Implement the actual code changes" in remaining_work. Analyzing and recommending changes is not the same as making them.
- If you are repeating the same actions (deploy, test, build) without making progress, set "stuck": true.
- Do not retry the same failing approach more than twice — try something different or report stuck.`
Expand Down Expand Up @@ -1221,12 +1221,13 @@ function evaluateSelfAssessment(assessment: SelfAssessment, context: TaskContext
addMissing("Rethink approach", "Propose an alternate approach and continue")
}

const requiresHumanAction = needsUserAction.length > 0
const explicitUserAction = needsUserAction.filter(item => /auth|2fa|credential|token|secret|permission|approve|merge|access|provide|upload|share|login|invite/i.test(item))
const requiresHumanAction = explicitUserAction.length > 0
// Agent should continue if there are missing items beyond what only the user can do.
// Even when user action is needed (e.g. "merge PR"), the agent may still have
// actionable work (e.g. uncommitted changes, missing tests) it can complete first.
const agentActionableMissing = missing.filter(item =>
!needsUserAction.some(ua => item.toLowerCase().includes(ua.toLowerCase()) || ua.toLowerCase().includes(item.toLowerCase()))
!explicitUserAction.some(ua => item.toLowerCase().includes(ua.toLowerCase()) || ua.toLowerCase().includes(item.toLowerCase()))
)
const shouldContinue = agentActionableMissing.length > 0 || (!requiresHumanAction && missing.length > 0)
const complete = status === "complete" && missing.length === 0 && confidence >= 0.8 && !requiresHumanAction
Expand Down Expand Up @@ -1345,15 +1346,18 @@ Return JSON only:
if (!jsonMatch) continue

const verdict = JSON.parse(jsonMatch[0]) as any
return {
complete: !!verdict.complete,
shouldContinue: !verdict.requires_human_action && !verdict.complete,
reason: verdict.feedback || "Judge analysis completed",
missing: Array.isArray(verdict.missing) ? verdict.missing : [],
nextActions: Array.isArray(verdict.next_actions) ? verdict.next_actions : [],
requiresHumanAction: !!verdict.requires_human_action,
severity: verdict.severity || "MEDIUM"
}
const missing = Array.isArray(verdict.missing) ? verdict.missing : []
const requiresHumanAction = !!verdict.requires_human_action
const shouldContinue = !verdict.complete && (missing.length > 0 || !requiresHumanAction)
return {
complete: !!verdict.complete,
shouldContinue,
reason: verdict.feedback || "Judge analysis completed",
missing,
nextActions: Array.isArray(verdict.next_actions) ? verdict.next_actions : [],
requiresHumanAction,
severity: verdict.severity || "MEDIUM"
}
} catch (e) {
reportError(e, { plugin: "reflection-3", op: "judge-session" })
continue
Expand Down
220 changes: 220 additions & 0 deletions test/reflection-3.unit.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,35 @@ describe("reflection-3 unit", () => {
assert.strictEqual(analysis.shouldContinue, false)
})

it("does not treat run-test request as user action", () => {
  // "Run tests" is listed both as remaining work and as a claimed user action.
  const selfReport = {
    status: "in_progress" as const,
    confidence: 0.6,
    remaining_work: ["Run tests", "Create PR"],
    needs_user_action: ["Run tests"]
  }

  // The context literal is passed inline so it is checked directly against the
  // parameter type of evaluateSelfAssessment.
  const outcome = evaluateSelfAssessment(selfReport, {
    taskSummary: "Implement feature",
    taskType: "coding",
    agentMode: "build",
    humanMessages: ["Implement feature"],
    toolsSummary: "(none)",
    detectedSignals: [],
    recentCommands: [],
    pushedToDefaultBranch: false,
    requiresTests: true,
    requiresBuild: false,
    requiresPR: true,
    requiresCI: true,
    requiresLocalTests: true,
    requiresLocalTestsEvidence: true
  })

  // Expected: no human action flagged, the agent keeps going, and work is still missing.
  assert.strictEqual(outcome.requiresHumanAction, false)
  assert.strictEqual(outcome.shouldContinue, true)
  assert.ok(outcome.missing.length > 0)
})

it("ops tasks do not require PR or CI", () => {
// For ops tasks, PR and CI should not be enforced
const assessment = {
Expand Down Expand Up @@ -899,3 +928,194 @@ describe("buildSelfAssessmentPrompt attempt awareness", () => {
assert.ok(result.includes("Do not retry the same failing approach"))
})
})

describe("isPlanMode", () => {
// Builds a message fixture: the role is stored under `info`, and each string
// in `texts` becomes its own `{ type: "text", text }` part, in the given order.
function msg(role: string, ...texts: string[]) {
  const parts: { type: string; text: string }[] = []
  for (const text of texts) {
    parts.push({ type: "text", text })
  }
  return { info: { role }, parts }
}

describe("system/developer message detection", () => {
  // Each row is [test title, message role, message text]. Every row expects
  // isPlanMode to return true for a conversation holding just that one message.
  const positiveCases: Array<[string, string, string]> = [
    ["detects 'Plan Mode' in system message", "system", "# Plan Mode - System Reminder"],
    ["detects 'plan mode ACTIVE' in developer message", "developer", "CRITICAL: plan mode ACTIVE - you are in READ-ONLY phase"],
    ["detects 'read-only mode' in system message", "system", "You are in read-only mode"],
    ["detects 'READ-ONLY phase' in system message", "system", "you are in READ-ONLY phase"],
    ["detects 'plan mode is active' in system message", "system", "plan mode is active. Do not edit files."]
  ]

  for (const [title, role, text] of positiveCases) {
    it(title, () => {
      assert.strictEqual(isPlanMode([msg(role, text)]), true)
    })
  }
})

describe("system-reminder detection (OpenCode actual format)", () => {
  // Most cases below attach a <system-reminder> block as an additional text
  // part of a user message and check the isPlanMode result.
  // Template literal contents are kept flush-left so the fixture text is unchanged.

  it("detects default plan.txt system-reminder in user message", () => {
    const reminder = `<system-reminder>
# Plan Mode - System Reminder

CRITICAL: Plan mode ACTIVE - you are in READ-ONLY phase. STRICTLY FORBIDDEN:
ANY file edits, modifications, or system changes.
</system-reminder>`
    const messages = [msg("user", "Help me plan", reminder)]
    assert.strictEqual(isPlanMode(messages), true)
  })

  it("detects experimental plan mode system-reminder", () => {
    const reminder = `<system-reminder>
Plan mode is active. The user indicated that they do not want you to execute yet --
you MUST NOT make any edits.
</system-reminder>`
    const messages = [msg("user", "Design the architecture", reminder)]
    assert.strictEqual(isPlanMode(messages), true)
  })

  it("detects plan mode system-reminder even in older messages", () => {
    const reminder = `<system-reminder>
Plan mode is active. READ-ONLY phase.
</system-reminder>`
    // The reminder sits on the first message; two later messages follow it.
    const messages = [
      msg("user", "First message", reminder),
      msg("assistant", "Here is my plan..."),
      msg("user", "Thanks, looks good")
    ]
    assert.strictEqual(isPlanMode(messages), true)
  })

  it("detects READ-ONLY phase in system-reminder", () => {
    const reminder = `<system-reminder>
CRITICAL: you are in READ-ONLY phase. Do not modify files.
</system-reminder>`
    const messages = [msg("user", "Analyze the code", reminder)]
    assert.strictEqual(isPlanMode(messages), true)
  })

  it("does NOT trigger on system-reminder without plan mode keywords", () => {
    const reminder = `<system-reminder>
You have access to these tools: read, write, edit.
</system-reminder>`
    const messages = [msg("user", "Fix the bug", reminder)]
    assert.strictEqual(isPlanMode(messages), false)
  })

  // Title corrected: the previous wording said "does NOT trigger" while the
  // assertion below expects true.
  it("detects literal 'plan mode' in a user message without any system-reminder", () => {
    // The user says "plan mode" literally -> detected via user message check, not system-reminder
    const messages = [msg("user", "Enable plan mode")]
    assert.strictEqual(isPlanMode(messages), true) // detected via user keyword check
  })
})

describe("user message keyword detection", () => {
  // Each row is [test title, user message text, expected isPlanMode result].
  const keywordCases: Array<[string, string, boolean]> = [
    ["detects 'plan mode' in user message (case insensitive)", "Switch to Plan Mode", true],
    ["detects 'plan' at start of user message", "plan the architecture for the new feature", true],
    ["detects 'create a plan' pattern", "create a plan for the refactoring", true],
    ["detects 'write a plan' pattern", "write a detailed plan", true],
    ["does NOT detect 'plan' in the middle of unrelated text", "Fix the airplane display bug", false],
    ["does NOT trigger on regular coding tasks", "Fix the login bug and add tests", false]
  ]

  for (const [title, text, expected] of keywordCases) {
    it(title, () => {
      assert.strictEqual(isPlanMode([msg("user", text)]), expected)
    })
  }
})

describe("reflection message handling", () => {
  it("skips reflection messages when looking for user keywords", () => {
    // The last user-role message is a reflection prompt that happens to contain
    // "plan mode"; the expected result is still false.
    const reflectionPrompt = msg("user", "## Reflection-3 Self-Assessment\nplan mode test")
    const conversation = [msg("user", "Fix the bug"), reflectionPrompt]
    assert.strictEqual(isPlanMode(conversation), false)
  })

  it("checks non-reflection user message even after reflection message", () => {
    // Expected: the trailing reflection prompt is ignored and the earlier
    // "Switch to plan mode" request decides the result.
    const reflectionPrompt = msg("user", "## Reflection-3 Self-Assessment\nsome assessment")
    const conversation = [msg("user", "Switch to plan mode"), reflectionPrompt]
    assert.strictEqual(isPlanMode(conversation), true)
  })
})

describe("multiple text parts in a single message", () => {
  it("checks all text parts, not just the last one", () => {
    // One user message with two text parts; only the first mentions plan mode.
    const twoPartMessage = msg("user", "plan mode please", "I want to think about this")
    assert.strictEqual(isPlanMode([twoPartMessage]), true)
  })
})

describe("edge cases", () => {
  it("returns false for empty messages array", () => {
    const noMessages: any[] = []
    assert.strictEqual(isPlanMode(noMessages), false)
  })

  it("returns false for messages with no parts", () => {
    // The message object has no `parts` property at all.
    const partless = [{ info: { role: "user" } }]
    assert.strictEqual(isPlanMode(partless), false)
  })

  it("returns false for messages with empty text parts", () => {
    // A single text part whose text is the empty string.
    const blank = [msg("user", "")]
    assert.strictEqual(isPlanMode(blank), false)
  })

  it("returns false for assistant-only messages", () => {
    const assistantOnly = [msg("assistant", "Here is the plan for the feature")]
    assert.strictEqual(isPlanMode(assistantOnly), false)
  })

  it("handles build-switch reminder (should NOT be plan mode)", () => {
    const buildSwitchReminder = `<system-reminder>
Your operational mode has changed from plan to build.
You are no longer in read-only mode.
</system-reminder>`
    // The reminder mentions "plan" and "read-only mode" only while announcing
    // the switch away from them ("from plan to build", "no longer in read-only mode").
    // It never contains the phrase "plan mode"; the expected result is false.
    const conversation = [msg("user", "Now implement it", buildSwitchReminder)]
    assert.strictEqual(isPlanMode(conversation), false)
  })
})
})
Loading