dzianisv · dzianisv · Feb 15, 2026 · Feb 15, 2026
diff --git a/evals/prompts/stuck-detection.txt b/evals/prompts/stuck-detection.txt
@@ -28,6 +28,7 @@ Determine if the agent is stuck and needs a nudge to continue.
 - Agent shows test results with "passed" or "✓"  
 - Agent's response indicates task fulfilled: "I've added", "I've fixed", "The X now works"
 - No pending work mentioned (no "Next I will...", "Still need to...")
+- IMPORTANT: If the user's task requires code changes (fix, implement, add, create, build, refactor), message_completed is true, and the Tool Calls show ONLY read operations (read, glob, grep, git log/status/diff, webfetch, task/explore) with NO write operations (edit, write, bash with build/test/commit, PR creation), the task is NOT complete — classify as genuinely_stuck with stuck=true. Analyzing or recommending changes is not the same as implementing them.
 → reason: "complete"
 
 ### 3. WAITING FOR USER

diff --git a/evals/stuck-detection.yaml b/evals/stuck-detection.yaml
@@ -299,4 +299,49 @@ tests:
           const result = JSON.parse(json[0]);
           return result.stuck === false || result.shouldNudge === false;
 
+  - description: "Planning loop - agent only read/explored, never wrote code (issue #115)"
+    vars:
+      time_since_activity: "120"
+      message_completed: "true"
+      output_tokens: "3500"
+      last_user_message: "https://github.com/VibeTechnologies/VibeWebAgent/issues/513 - Investigate and fix the MCP WebSocket connection failure"
+      last_assistant_response: |
+        ## Root Cause Analysis
+
+        After investigating the codebase extensively, I found that the MCP WebSocket connection
+        failure is caused by a race condition in the connection manager. The `connectToServer`
+        function doesn't properly handle the case where the server closes the connection
+        during the handshake phase.
+
+        ### Recommendations
+        1. Add retry logic with exponential backoff
+        2. Handle the WebSocket close event during handshake
+        3. Add connection state tracking
+
+        I've identified all the relevant files and understand the issue thoroughly.
+        The fix should be straightforward to implement.
+      tool_calls: |
+        github_issue_read (get issue #513)
+        task (explore codebase)
+        read (src/mcp/connection.ts)
+        read (src/mcp/websocket.ts)
+        glob (src/**/*.ts)
+        grep (connectToServer)
+        read (src/mcp/manager.ts)
+        read (src/mcp/types.ts)
+        bash (git log --oneline -10)
+        read (src/mcp/retry.ts)
+        webfetch (docs reference)
+        read (package.json)
+        task (explore more files)
+        read (src/mcp/handler.ts)
+        read (src/utils/logger.ts)
+    assert:
+      - type: javascript
+        value: |
+          const json = output.match(/\{[\s\S]*\}/);
+          if (!json) return false;
+          const result = JSON.parse(json[0]);
+          return result.stuck === true && result.reason === "genuinely_stuck";
+
 outputPath: ./evals/results/stuck-detection-latest.json
diff --git a/reflection-3.test-helpers.ts b/reflection-3.test-helpers.ts
@@ -64,15 +64,26 @@ export interface ReflectionAnalysis {
 }
 
 export function inferTaskType(text: string): TaskType {
-  if (/research|investigate|analyze|compare|evaluate|study/i.test(text)) return "research"
+  const hasResearch = /research|investigate|analyze|compare|evaluate|study/i.test(text)
+  const hasCodingAction = /\bfix(?:es|ed|ing)?\b|implement|\badd(?:s|ed|ing)?\b|create|build|feature|refactor|improve|update/i.test(text)
+  const hasCodingSignal = /\bbugs?\b|\berrors?\b|\bregressions?\b/i.test(text)
+  const hasGitHubIssue = /github\.com\/[^\s/]+\/[^\s/]+\/issues\/\d+/i.test(text)
+
+  // Precedence: research text that also names a coding action ("investigate and fix this bug")
+  // or links a GitHub issue is treated as coding — these are almost always coding tasks.
+  // Signal words (bug, error, regression) alone do NOT override research, because
+  // "investigate performance regressions" is legitimate research. "fix", "add" and the signal words
+  // match whole words plus inflections (fixes, adding, errors), so "prefix"/"address"/"debug" don't count.
+  if (hasResearch && (hasCodingAction || hasGitHubIssue)) return "coding"
+
+  if (hasResearch) return "research"
   if (/docs?|readme|documentation/i.test(text)) return "docs"
   // Ops detection: explicit ops terms and personal-assistant / browser-automation patterns
   // Must be checked BEFORE coding to avoid "create filter" or "build entities" matching as coding
   if (/deploy|release|infra|ops|oncall|incident|runbook/i.test(text)) return "ops"
   if (/\bgmail\b|\bemail\b|\bfilter\b|\binbox\b|\bcalendar\b|\blinkedin\b|\brecruiter\b|\bbrowser\b/i.test(text)) return "ops"
   if (/\bclean\s*up\b|\borganize\b|\bconfigure\b|\bsetup\b|\bset\s*up\b|\binstall\b/i.test(text)) return "ops"
-  if (/fix|bug|issue|error|regression/i.test(text)) return "coding"
-  if (/implement|add|create|build|feature|refactor|improve|update/i.test(text)) return "coding"
+  if (hasCodingAction || hasCodingSignal) return "coding"
   return "other"
 }
 

diff --git a/reflection-3.ts b/reflection-3.ts
@@ -661,15 +661,26 @@ async function waitForResponse(client: any, sessionId: string): Promise<string |
 }
 
 function inferTaskType(text: string): TaskType {
-  if (/research|investigate|analyze|compare|evaluate|study/i.test(text)) return "research"
+  const hasResearch = /research|investigate|analyze|compare|evaluate|study/i.test(text)
+  const hasCodingAction = /\bfix(?:es|ed|ing)?\b|implement|\badd(?:s|ed|ing)?\b|create|build|feature|refactor|improve|update/i.test(text)
+  const hasCodingSignal = /\bbugs?\b|\berrors?\b|\bregressions?\b/i.test(text)
+  const hasGitHubIssue = /github\.com\/[^\s/]+\/[^\s/]+\/issues\/\d+/i.test(text)
+
+  // Precedence: research text that also names a coding action ("investigate and fix this bug")
+  // or links a GitHub issue is treated as coding — these are almost always coding tasks.
+  // Signal words (bug, error, regression) alone do NOT override research, because
+  // "investigate performance regressions" is legitimate research. "fix", "add" and the signal words
+  // match whole words plus inflections (fixes, adding, errors), so "prefix"/"address"/"debug" don't count.
+  if (hasResearch && (hasCodingAction || hasGitHubIssue)) return "coding"
+
+  if (hasResearch) return "research"
   if (/docs?|readme|documentation/i.test(text)) return "docs"
   // Ops detection: explicit ops terms and personal-assistant / browser-automation patterns
   // Must be checked BEFORE coding to avoid "create filter" or "build entities" matching as coding
   if (/deploy|release|infra|ops|oncall|incident|runbook/i.test(text)) return "ops"
   if (/\bgmail\b|\bemail\b|\bfilter\b|\binbox\b|\bcalendar\b|\blinkedin\b|\brecruiter\b|\bbrowser\b/i.test(text)) return "ops"
   if (/\bclean\s*up\b|\borganize\b|\bconfigure\b|\bsetup\b|\bset\s*up\b|\binstall\b/i.test(text)) return "ops"
-  if (/fix|bug|issue|error|regression/i.test(text)) return "coding"
-  if (/implement|add|create|build|feature|refactor|improve|update/i.test(text)) return "coding"
+  if (hasCodingAction || hasCodingSignal) return "coding"
   return "other"
 }
 
@@ -919,7 +930,8 @@ Rules:
 - Tests cannot be skipped or marked as flaky/not important.
 - Direct pushes to main/master are not allowed; require a PR instead.
 - If stuck, propose an alternate approach.
-- If you need user action (auth, 2FA, credentials), list it in needs_user_action.`
+- If you need user action (auth, 2FA, credentials), list it in needs_user_action.
+- PLANNING LOOP CHECK: If the task requires code changes (fix, implement, add, create, build, refactor, update) but the "Tool Commands Run" section shows ONLY read operations (read, glob, grep, git log, git status, git diff, webfetch, task/explore) and NO write operations (edit, write, bash with build/test/commit, github_create_pull_request, etc.), then the task is NOT complete. Set status to "in_progress", set stuck to true, and list "Implement the actual code changes" in remaining_work. Analyzing and recommending changes is not the same as making them.`
 }
 
 function parseSelfAssessmentJson(text: string | null | undefined): SelfAssessment | null {
@@ -1122,6 +1134,7 @@ Rules:
 - If PR exists, CI checks must be verified and passing.
 - If user action is required (auth/2FA/credentials), set requires_human_action true.
 - If agent is stuck, require alternate approach and continued work.
+- PLANNING LOOP: If the task requires code changes (fix, implement, add, create, build, refactor) but the Tool Signals show ONLY read operations (read, glob, grep, git log/status/diff, webfetch) and NO write operations (edit, write, bash with build/test/commit, PR creation), set complete to false and add "Implement actual code changes" to missing. Analysis alone does not fulfill an implementation task.
 
 Return JSON only:
 {

diff --git a/test/reflection-3.unit.test.ts b/test/reflection-3.unit.test.ts
@@ -9,6 +9,7 @@ import {
   buildEscalatingFeedback,
   RoutingConfig
 } from "../reflection-3.test-helpers.ts"
+import { detectPlanningLoop } from "../reflection-3.ts"
 
 describe("reflection-3 unit", () => {
   it("detects task type from text", () => {
@@ -255,6 +256,30 @@ describe("reflection-3 unit", () => {
     assert.strictEqual(inferTaskType("Builds entities and relationships in knowledge graph for email"), "ops")
   })
 
+  it("classifies as coding when text has both research AND coding keywords (issue #115)", () => {
+    // Regression for issue #115: the task text mixed research words ("investigate") with coding
+    // words ("fix", "error"). The research check ran first and won, which (per the issue) turned
+    // off the workflow gates and let a read-only session pass as "complete".
+    assert.strictEqual(inferTaskType("Investigate and fix the login bug"), "coding")
+    assert.strictEqual(inferTaskType("Analyze the error and implement a fix"), "coding")
+    assert.strictEqual(inferTaskType("Study the regression and create a patch"), "coding")
+    assert.strictEqual(inferTaskType("Evaluate the issue and update the handler"), "coding")
+  })
+
+  it("classifies as coding when text contains a GitHub issue URL with research keywords (issue #115)", () => {
+    // A GitHub issue URL combined with a research keyword must resolve to coding, not research
+    assert.strictEqual(inferTaskType("Investigate https://github.com/VibeTechnologies/VibeWebAgent/issues/513"), "coding")
+    assert.strictEqual(inferTaskType("Analyze the problem at https://github.com/org/repo/issues/42"), "coding")
+    // The URL only tips research text over to coding; on its own, with no research or coding keyword, it falls through to "other"
+    assert.strictEqual(inferTaskType("https://github.com/org/repo/issues/513"), "other")
+  })
+
+  it("still classifies pure research text as research", () => {
+    assert.strictEqual(inferTaskType("Investigate performance characteristics"), "research")
+    assert.strictEqual(inferTaskType("Research best practices for caching"), "research")
+    assert.strictEqual(inferTaskType("Analyze the trade-offs between approaches"), "research")
+  })
+
   it("shouldContinue is true when agent has actionable work alongside needs_user_action", () => {
     const assessment = {
       status: "in_progress" as const,
@@ -341,6 +366,70 @@ describe("reflection-3 unit", () => {
     assert.strictEqual(analysis.complete, true)
     assert.strictEqual(analysis.missing.length, 0)
   })
+
+  it("evaluateSelfAssessment marks complete when no requirements and high confidence (issue #115 precondition)", () => {
+    // Guard for the issue #115 fix. In that issue a coding task was misclassified as
+    // "research", every requires* flag was false, and evaluateSelfAssessment therefore
+    // found nothing missing and marked the task complete. The fix lives in inferTaskType
+    // (prefer coding); this test pins that the evaluator itself still completes a
+    // genuine research task that has no requirements.
+    const assessment = {
+      status: "complete" as const,
+      confidence: 0.95,
+      evidence: {}
+    }
+    const analysis = evaluateSelfAssessment(assessment, {
+      taskSummary: "Research caching strategies",
+      taskType: "research",
+      agentMode: "build",
+      humanMessages: ["Research caching strategies"],
+      toolsSummary: "(none)",
+      detectedSignals: [],
+      recentCommands: [],
+      pushedToDefaultBranch: false,
+      requiresTests: false,
+      requiresBuild: false,
+      requiresPR: false,
+      requiresCI: false,
+      requiresLocalTests: false,
+      requiresLocalTestsEvidence: false
+    })
+
+    // With taskType "research" and every requires* flag false, completing is the expected outcome
+    assert.strictEqual(analysis.complete, true)
+    assert.strictEqual(analysis.missing.length, 0)
+  })
+
+  it("detectPlanningLoop catches sessions with only read operations (issue #115)", () => {
+    // Simulate the stuck session from issue #115: 15 tool calls (reads, searches, task/skill calls and one read-only "git log") with no edit/write calls
+    const messages = [
+      {
+        info: { role: "assistant" },
+        parts: [
+          { type: "tool", tool: "github_issue_read", state: { input: {} } },
+          { type: "tool", tool: "task", state: { input: {} } },
+          { type: "tool", tool: "read", state: { input: {} } },
+          { type: "tool", tool: "read", state: { input: {} } },
+          { type: "tool", tool: "glob", state: { input: {} } },
+          { type: "tool", tool: "grep", state: { input: {} } },
+          { type: "tool", tool: "read", state: { input: {} } },
+          { type: "tool", tool: "read", state: { input: {} } },
+          { type: "tool", tool: "task", state: { input: {} } },
+          { type: "tool", tool: "webfetch", state: { input: {} } },
+          { type: "tool", tool: "read", state: { input: {} } },
+          { type: "tool", tool: "bash", state: { input: { command: "git log --oneline -5" } } },
+          { type: "tool", tool: "read", state: { input: {} } },
+          { type: "tool", tool: "read", state: { input: {} } },
+          { type: "tool", tool: "skill", state: { input: {} } }
+        ]
+      }
+    ]
+    const result = detectPlanningLoop(messages)
+    assert.strictEqual(result.detected, true)
+    assert.strictEqual(result.writeCount, 0)
+    assert.ok(result.readCount > 0)
+    assert.ok(result.totalTools >= 10)
+  })
 })
 
 describe("buildEscalatingFeedback", () => {