Merged
4 changes: 0 additions & 4 deletions .eval-tmp/opencode.json

This file was deleted.

6 changes: 3 additions & 3 deletions docs/reflection.md
@@ -12,7 +12,7 @@ Evaluates agent task completion and enforces workflow requirements using a self-
- If self-assessment parsing fails, fall back to a judge session and parse a JSON verdict.
- Write verdict signals to `.reflection/verdict_<session>.json` for TTS/Telegram gating.
- Persist reflection analysis data to `.reflection/<session>_<timestamp>.json`.
- Provide feedback only when incomplete; show a toast when complete or when user action is required.
- Provide feedback only when incomplete; show a toast when complete or when only human action remains.

## Configuration
Reflection models are configured in `~/.config/opencode/reflection.yaml`.
@@ -56,8 +56,8 @@ The agent must return JSON with evidence and status, including:

## Decision Outcomes
- Complete -> toast success, write verdict signal.
- Requires human action -> toast warning, no follow-up prompt.
- Incomplete -> push feedback into the session with next steps.
- Requires human action only -> toast warning, no follow-up prompt.
- Incomplete or mixed (human action + agent steps) -> push feedback into the session with next steps.

## System Design Diagram
```mermaid
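The three decision outcomes in the hunk above can be summarized as a small routing function. This is an illustrative sketch, not code from the PR; `Verdict`, `Outcome`, and `routeOutcome` are hypothetical names.

```typescript
// Hypothetical sketch of the decision outcomes described in docs/reflection.md.
interface Verdict {
  complete: boolean
  requiresHumanAction: boolean
  missing: string[] // agent-actionable items still outstanding
}

type Outcome = "toast-success" | "toast-warning" | "push-feedback"

function routeOutcome(v: Verdict): Outcome {
  // Complete -> toast success, write verdict signal.
  if (v.complete) return "toast-success"
  // Requires human action only -> toast warning, no follow-up prompt.
  if (v.requiresHumanAction && v.missing.length === 0) return "toast-warning"
  // Incomplete or mixed (human action + agent steps) -> push feedback.
  return "push-feedback"
}
```

The key asymmetry is the middle branch: a human-only blocker with nothing left for the agent ends the loop quietly, while any remaining agent-runnable work keeps feedback flowing.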
33 changes: 33 additions & 0 deletions docs/session-ses_38de.md
@@ -0,0 +1,33 @@
# session-ses_38de: Why It Looked Stuck

This note documents why `session-ses_38de.md` appeared stuck and how reflection behaved.

## Summary
- The task was **not complete**; multiple required steps remained (tests, proof screenshot, PR/CI).
- Reflection repeatedly flagged missing steps and **did** push continuation messages.
- Several required tests were attempted but **aborted by the tool runner**, leaving no passing evidence.
- The assistant kept switching between planning and attempted test execution without completing the full checklist.

## What the Reflection Artifacts Show
From `/Users/engineer/workspace/vibebrowser/vibe-gpt52events/.reflection/`:
- `ses_38de_1771446015533.json`: missing plan, implementation, tests, proof screenshot. `shouldContinue: true`.
- `ses_38de_1771449299620.json`: missing implementation wiring, tests, PR/CI. `shouldContinue: true`.
- `ses_38de_1771452072701.json`: missing re-run tests + proof screenshot + PR/CI. `shouldContinue: true`.
- `verdict_ses_38de.json`: `complete: false`, `severity: HIGH`.

Reflection also injected multiple “Task Incomplete (HIGH)” messages inside the session transcript (e.g., around lines 5622, 8267, 8835, 10563 in `session-ses_38de.md`).

## Why It “Stopped”
1. **Tool execution aborted** for required tests (`npm test`, `node tests/extension.mock.test.js`, `node tests/vibe-e2e.test.js`, `node tests/google-workspace.test.js`). These appear as `Tool execution aborted` in the session log.
2. The assistant never completed all required verification steps, so reflection kept marking the task incomplete.
3. Reflection’s **continuation prompts did fire**; however, the task oscillated between planning and failed/aborted test runs, so the session looked stuck from the outside.

## Why This Matters for “needs_user_action”
This session showed multiple steps the agent **could run** (tests, PR creation, CI checks). If these were mislabeled as `needs_user_action`, reflection could incorrectly stop. The updated logic now:
- Treats **human-only actions** (login, 2FA, OAuth consent, API key retrieval, approvals, uploads) as blocking.
- Treats **agent-runnable items** (tests, PR/CI, screenshots, commands) as actionable, keeping `shouldContinue` true.

## Follow-up Improvements Implemented
- `needs_user_action` is split into **human-only** vs **agent-actionable** items.
- Actionable items are added to `missing` and `nextActions` to keep continuation moving.
- Judge fallback now uses the same actionable-vs-human split for `shouldContinue`.
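The human-only vs agent-actionable split described above can be illustrated with a trimmed-down classifier. The full pattern lists live in `reflection-3.test-helpers.ts` later in this diff; the sketch below keeps only a few representative regexes, and `isHumanOnly` is a hypothetical name.

```typescript
// Representative subset of the human-only vs agent-runnable patterns (illustrative only).
const HUMAN_ONLY = [/\b(api key|oauth|2fa|log ?in|consent|approval)\b/i]
const AGENT_RUNNABLE = [/\b(run|test|build|pr|ci|screenshot|commit)\b/i]

// An item blocks on the user only if a human-only pattern matches
// and no agent-runnable pattern does.
function isHumanOnly(item: string): boolean {
  return HUMAN_ONLY.some(p => p.test(item)) && !AGENT_RUNNABLE.some(p => p.test(item))
}
```

Under this rule, "Provide the Stripe API key" blocks on the user, while "Re-run npm test" stays agent-actionable even though a user could also run it.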
27 changes: 23 additions & 4 deletions eval.ts
@@ -24,7 +24,8 @@ const MODEL = process.env.OPENCODE_MODEL || "github-copilot/gpt-4o"
const PORT = 7654
const TIMEOUT = 300_000 // 5 minutes max per task
const POLL_INTERVAL = 3_000 // Check every 3 seconds
const STABLE_POLLS_REQUIRED = 5 // Need 5 stable polls (15s of no new messages)
const STABLE_POLLS_REQUIRED = 3 // Stable polls before stopping
const MAX_WAIT_AFTER_OUTPUT = 20_000

// Test cases for evaluation
interface TestCase {
@@ -174,7 +175,10 @@ async function runTask(
// Poll until stable - must wait for assistant to have parts
let lastMsgCount = 0
let lastAssistantParts = 0
let lastAssistantCount = 0
let stableCount = 0
let firstAssistantOutput = ""
let firstAssistantCapturedAt: number | null = null

while (Date.now() - start < TIMEOUT) {
await new Promise(r => setTimeout(r, POLL_INTERVAL))
@@ -189,15 +193,25 @@
const assistantMsgs = (messages || []).filter((m: any) => m.info?.role === "assistant")
const lastAssistant = assistantMsgs[assistantMsgs.length - 1]
const assistantParts = lastAssistant?.parts?.length || 0

const assistantCount = assistantMsgs.length

console.log(`[${testCase.id}] Polling: ${msgCount} messages, assistant parts=${assistantParts}, stable=${stableCount}`)

if (!firstAssistantOutput && assistantMsgs.length > 0) {
const candidate = extractTextContent(lastAssistant)
if (candidate) {
firstAssistantOutput = candidate
firstAssistantCapturedAt = Date.now()
}
}

// Only consider stable if:
// 1. We have at least 2 messages (user + assistant)
// 2. The assistant message has at least 1 part
// 3. Both message count AND part count are stable
const isStable = msgCount === lastMsgCount &&
assistantParts === lastAssistantParts &&
assistantParts === lastAssistantParts &&
assistantCount === lastAssistantCount &&
msgCount >= 2 &&
assistantParts > 0

@@ -208,6 +222,11 @@ async function runTask(
stableCount = 0
lastMsgCount = msgCount
lastAssistantParts = assistantParts
lastAssistantCount = assistantCount
}

if (firstAssistantCapturedAt && Date.now() - firstAssistantCapturedAt > MAX_WAIT_AFTER_OUTPUT) {
break
}
}

@@ -241,7 +260,7 @@
const assistantMsgs = messages.filter((m: any) => m.info?.role === "assistant")
if (assistantMsgs.length > 0) {
const lastAssistant = assistantMsgs[assistantMsgs.length - 1]
result.agentOutput = extractTextContent(lastAssistant)
result.agentOutput = firstAssistantOutput || extractTextContent(lastAssistant)
console.log(`[${testCase.id}] Agent output length: ${result.agentOutput.length}`)
}

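The stability condition that eval.ts now checks on every poll can be reduced to a pure predicate. This is a sketch for clarity, not the PR's code: the real loop additionally tracks `stableCount` against `STABLE_POLLS_REQUIRED` and applies the `MAX_WAIT_AFTER_OUTPUT` early exit.

```typescript
// Sketch of the per-poll stability predicate from the loop above.
interface PollSnapshot {
  msgCount: number       // total messages in the session
  assistantParts: number // parts on the last assistant message
  assistantCount: number // number of assistant messages
}

function isStable(curr: PollSnapshot, prev: PollSnapshot): boolean {
  return (
    curr.msgCount === prev.msgCount &&
    curr.assistantParts === prev.assistantParts &&
    curr.assistantCount === prev.assistantCount &&
    curr.msgCount >= 2 &&    // at least user + assistant present
    curr.assistantParts > 0  // assistant has actually produced content
  )
}
```

Tracking `assistantCount` alongside `assistantParts` closes a gap: a brand-new assistant message can momentarily report the same part count as the previous one, which the old two-field check would have misread as stable.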
48 changes: 48 additions & 0 deletions evals/promptfooconfig.yaml
@@ -139,6 +139,38 @@ tests:
const verdict = JSON.parse(json[0]);
return verdict.complete === false && (verdict.missing?.length > 0 || verdict.next_actions?.length > 0);

- description: "Agent asked user to run tests - INCOMPLETE without human action"
vars:
task: "Run the unit tests and make sure they pass"
tools_used: |
edit: {filePath: 'src/utils.ts'}
agent_response: |
I updated the implementation, but I didn't run the tests.
Please run `npm test` and confirm the results.
assert:
- type: javascript
value: |
const json = output.match(/\{[\s\S]*\}/);
if (!json) return false;
const verdict = JSON.parse(json[0]);
return verdict.complete === false && verdict.requires_human_action === false;

- description: "API key required - INCOMPLETE with human action"
vars:
task: "Configure the Stripe integration"
tools_used: |
read: {filePath: 'config/stripe.ts'}
agent_response: |
I need you to provide the Stripe API key from the dashboard to continue.
Once you share it, I can finish wiring the integration.
assert:
- type: javascript
value: |
const json = output.match(/\{[\s\S]*\}/);
if (!json) return false;
const verdict = JSON.parse(json[0]);
return verdict.complete === false && verdict.requires_human_action === true;

- description: "Build fails - INCOMPLETE with HIGH severity"
vars:
task: "Add the new feature and make sure the build passes"
@@ -359,6 +391,22 @@
// requires_human_action should be false (or undefined)
return verdict.complete === false && !verdict.requires_human_action;

- description: "Needs user action plus agent steps - INCOMPLETE but continue"
vars:
task: "Finish the release and publish the package"
tools_used: |
bash: {command: 'npm test'}
agent_response: |
I still need to run tests and update the changelog.
Please approve the release in the dashboard once I finish.
assert:
- type: javascript
value: |
const json = output.match(/\{[\s\S]*\}/);
if (!json) return false;
const verdict = JSON.parse(json[0]);
return verdict.complete === false && verdict.requires_human_action === false;

# ============================================
# REAL SESSION PATTERNS - From production sessions
# ============================================
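Every `javascript` assertion above follows the same shape: grab the first `{...}` span from the model output, parse it, then check verdict fields. A standalone version of that extraction (the `parseVerdict` name is illustrative, not part of promptfoo):

```typescript
// Mirror of the promptfoo javascript asserts: pull a JSON object out of
// free-form model output and validate verdict fields.
function parseVerdict(output: string): Record<string, unknown> | null {
  // Greedy match: spans from the first "{" through the last "}".
  const json = output.match(/\{[\s\S]*\}/)
  if (!json) return null
  try {
    return JSON.parse(json[0]) as Record<string, unknown>
  } catch {
    return null
  }
}
```

Because the match is greedy, any stray `}` after the verdict JSON widens the span and makes `JSON.parse` fail, so the function returns `null`; the YAML asserts share this caveat.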
2 changes: 1 addition & 1 deletion jest.config.js
@@ -1,7 +1,7 @@
export default {
preset: 'ts-jest/presets/default-esm',
testEnvironment: 'node',
testMatch: ['**/test/**/*.test.ts'],
testMatch: ['**/*.test.ts'],
testPathIgnorePatterns: ['/node_modules/', 'session-fork-directory.test.ts', 'e2e.test.ts'],
moduleFileExtensions: ['ts', 'js', 'json'],
moduleNameMapper: {
92 changes: 71 additions & 21 deletions reflection-3.test-helpers.ts
@@ -169,6 +169,41 @@ export function parseSelfAssessmentJson(text: string | null | undefined): SelfAs
}
}

const HUMAN_ONLY_ACTION_PATTERNS: RegExp[] = [
/\b(auth|authentication|oauth|2fa|mfa|captcha|otp|one[- ]time)\b/i,
/\b(log ?in|sign ?in|verification code|passcode)\b/i,
/\b(api key|secret|token|credential|access key|session cookie)\b/i,
/\b(permission|consent|approve|approval|access request|request access|grant access|invite)\b/i,
/\bupload\b/i
]

const AGENT_ACTION_PATTERNS: RegExp[] = [
/\b(run|re-?run|execute|test|build|compile|lint|format|commit|push|merge|pr|ci|check)\b/i,
/\b(gh|npm|node|python|bash|curl|script)\b/i,
/\b(edit|write|update|fix|implement|add|remove|change|create|open|verify|capture|screenshot|record)\b/i
]

function isHumanOnlyAction(item: string): boolean {
const text = item.trim()
if (!text) return false
const hasHuman = HUMAN_ONLY_ACTION_PATTERNS.some(pattern => pattern.test(text))
const hasAgent = AGENT_ACTION_PATTERNS.some(pattern => pattern.test(text))
return hasHuman && !hasAgent
}

function splitActionItems(items: string[]): { humanOnly: string[]; agentActionable: string[] } {
const humanOnly: string[] = []
const agentActionable: string[] = []
for (const raw of items) {
if (typeof raw !== "string") continue
const item = raw.trim()
if (!item) continue
if (isHumanOnlyAction(item)) humanOnly.push(item)
else agentActionable.push(item)
}
return { humanOnly, agentActionable }
}

export function evaluateSelfAssessment(assessment: SelfAssessment, context: TaskContext): ReflectionAnalysis {
const safeContext: TaskContext = {
taskSummary: context?.taskSummary || "",
@@ -209,6 +244,14 @@
for (const item of remaining) addMissing(item)
}

const { humanOnly: humanNeeds, agentActionable: agentNeeds } = splitActionItems(needsUserAction)
if (agentNeeds.length) {
for (const item of agentNeeds) {
addMissing(item)
if (!nextActions.includes(item)) nextActions.push(item)
}
}

if (safeContext.requiresTests) {
if (tests.ran !== true) {
addMissing("Run tests", "Run the full test suite and capture output")
@@ -276,38 +319,44 @@
addMissing("Rethink approach", "Propose an alternate approach and continue")
}

const requiresHumanAction = needsUserAction.length > 0
// Agent should continue if there are missing items beyond what only the user can do.
// Even when user action is needed (e.g. "merge PR"), the agent may still have
// actionable work (e.g. uncommitted changes, missing tests) it can complete first.
const agentActionableMissing = missing.filter(item =>
!needsUserAction.some(ua => item.toLowerCase().includes(ua.toLowerCase()) || ua.toLowerCase().includes(item.toLowerCase()))
)
const shouldContinue = agentActionableMissing.length > 0 || (!requiresHumanAction && missing.length > 0)
const humanOnlyNextSteps = (assessment.next_steps || []).filter(item => isHumanOnlyAction(item))
const requiresHumanAction = humanNeeds.length > 0 || humanOnlyNextSteps.length > 0 || missing.some(isHumanOnlyAction) || nextActions.some(isHumanOnlyAction)
const complete = status === "complete" && missing.length === 0 && confidence >= 0.8 && !requiresHumanAction

let severity: ReflectionAnalysis["severity"] = "NONE"
if (missing.some(item => /test|build/i.test(item))) severity = "HIGH"
else if (missing.some(item => /CI|check/i.test(item))) severity = "MEDIUM"
else if (missing.length > 0) severity = "LOW"

if (requiresHumanAction && missing.length === 0) severity = "LOW"
const severityItems = missing.length > 0 ? missing : nextActions
if (severityItems.some(item => /test|build/i.test(item))) severity = "HIGH"
else if (severityItems.some(item => /CI|check/i.test(item))) severity = "MEDIUM"
else if (severityItems.length > 0) severity = "LOW"

const reason = complete
? "Self-assessment confirms completion with required evidence"
: requiresHumanAction
? "User action required before continuing"
: missing.length
? "Missing required workflow steps"
: "Task not confirmed complete"
if (requiresHumanAction && missing.length === 0 && nextActions.length === 0) severity = "LOW"

if (assessment.next_steps?.length) {
for (const step of assessment.next_steps) {
if (!nextActions.includes(step)) nextActions.push(step)
}
}

return { complete, shouldContinue, reason, missing, nextActions, requiresHumanAction, severity }
const actionableMissing = missing.filter(item => !isHumanOnlyAction(item))
const finalActionableNextActions = nextActions.filter(item => !isHumanOnlyAction(item))
const finalShouldContinue = actionableMissing.length > 0 || finalActionableNextActions.length > 0
const finalReason = complete
? "Self-assessment confirms completion with required evidence"
: requiresHumanAction && !finalShouldContinue
? "User action required before continuing"
: missing.length || finalActionableNextActions.length
? "Missing required workflow steps"
: "Task not confirmed complete"

return {
complete,
shouldContinue: finalShouldContinue,
reason: finalReason,
missing,
nextActions,
requiresHumanAction,
severity
}
}

export type RoutingCategory = "backend" | "architecture" | "frontend" | "default"
@@ -566,6 +615,7 @@ export function shouldApplyPlanningLoop(taskType: TaskType, loopDetected: boolea
const SELF_ASSESSMENT_MARKER = "## Reflection-3 Self-Assessment"

export function isPlanMode(messages: any[]): boolean {
if (!Array.isArray(messages)) return false
// Check system/developer messages for plan mode indicators
const hasSystemPlanMode = messages.some((m: any) =>
(m.info?.role === "system" || m.info?.role === "developer") &&
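To make the new classification concrete, here is `splitActionItems` exercised on a few sample items. The patterns and helpers are copied verbatim from the diff above so the example runs standalone; the sample item strings are invented for illustration.

```typescript
// Copied from reflection-3.test-helpers.ts in this diff, plus a usage example.
const HUMAN_ONLY_ACTION_PATTERNS: RegExp[] = [
  /\b(auth|authentication|oauth|2fa|mfa|captcha|otp|one[- ]time)\b/i,
  /\b(log ?in|sign ?in|verification code|passcode)\b/i,
  /\b(api key|secret|token|credential|access key|session cookie)\b/i,
  /\b(permission|consent|approve|approval|access request|request access|grant access|invite)\b/i,
  /\bupload\b/i
]

const AGENT_ACTION_PATTERNS: RegExp[] = [
  /\b(run|re-?run|execute|test|build|compile|lint|format|commit|push|merge|pr|ci|check)\b/i,
  /\b(gh|npm|node|python|bash|curl|script)\b/i,
  /\b(edit|write|update|fix|implement|add|remove|change|create|open|verify|capture|screenshot|record)\b/i
]

function isHumanOnlyAction(item: string): boolean {
  const text = item.trim()
  if (!text) return false
  const hasHuman = HUMAN_ONLY_ACTION_PATTERNS.some(pattern => pattern.test(text))
  const hasAgent = AGENT_ACTION_PATTERNS.some(pattern => pattern.test(text))
  return hasHuman && !hasAgent
}

function splitActionItems(items: string[]): { humanOnly: string[]; agentActionable: string[] } {
  const humanOnly: string[] = []
  const agentActionable: string[] = []
  for (const raw of items) {
    if (typeof raw !== "string") continue
    const item = raw.trim()
    if (!item) continue
    if (isHumanOnlyAction(item)) humanOnly.push(item)
    else agentActionable.push(item)
  }
  return { humanOnly, agentActionable }
}

// Hypothetical needs_user_action items, as a session might report them.
const { humanOnly, agentActionable } = splitActionItems([
  "Provide the Stripe API key",
  "Re-run npm test and capture output",
  "Approve the release in the dashboard",
])
```

Note the precedence baked into `isHumanOnlyAction`: an item that matches both lists (e.g. "approve" plus "run") is treated as agent-actionable, which keeps `shouldContinue` biased toward continuing.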