Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions evals/prompts/stuck-detection.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ Determine if the agent is stuck and needs a nudge to continue.
- Agent shows test results with "passed" or "✓"
- Agent's response indicates task fulfilled: "I've added", "I've fixed", "The X now works"
- No pending work mentioned (no "Next I will...", "Still need to...")
- EXCEPTION (overrides the completion indicators above): If the user's task requires code changes (fix, implement, add, create, build, refactor), message_completed is true, and the Tool Calls show ONLY read operations (read, glob, grep, git log/status/diff, webfetch, task/explore) with NO write operations (edit, write, bash with build/test/commit, PR creation), the task is NOT complete — do not use reason "complete"; instead classify as genuinely_stuck with stuck=true. Analyzing or recommending changes is not the same as implementing them.
→ reason: "complete"

### 3. WAITING FOR USER
Expand Down
45 changes: 45 additions & 0 deletions evals/stuck-detection.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -299,4 +299,49 @@ tests:
const result = JSON.parse(json[0]);
return result.stuck === false || result.shouldNudge === false;

- description: "Planning loop - agent only read/explored, never wrote code (issue #115)"
vars:
time_since_activity: "120"
message_completed: "true"
output_tokens: "3500"
last_user_message: "https://github.com/VibeTechnologies/VibeWebAgent/issues/513 - Investigate and fix the MCP WebSocket connection failure"
last_assistant_response: |
## Root Cause Analysis

After investigating the codebase extensively, I found that the MCP WebSocket connection
failure is caused by a race condition in the connection manager. The `connectToServer`
function doesn't properly handle the case where the server closes the connection
during the handshake phase.

### Recommendations
1. Add retry logic with exponential backoff
2. Handle the WebSocket close event during handshake
3. Add connection state tracking

I've identified all the relevant files and understand the issue thoroughly.
The fix should be straightforward to implement.
tool_calls: |
github_issue_read (get issue #513)
task (explore codebase)
read (src/mcp/connection.ts)
read (src/mcp/websocket.ts)
glob (src/**/*.ts)
grep (connectToServer)
read (src/mcp/manager.ts)
read (src/mcp/types.ts)
bash (git log --oneline -10)
read (src/mcp/retry.ts)
webfetch (docs reference)
read (package.json)
task (explore more files)
read (src/mcp/handler.ts)
read (src/utils/logger.ts)
assert:
- type: javascript
value: |
const json = output.match(/\{[\s\S]*\}/);
if (!json) return false;
const result = JSON.parse(json[0]);
return result.stuck === true && result.reason === "genuinely_stuck";

outputPath: ./evals/results/stuck-detection-latest.json
17 changes: 14 additions & 3 deletions reflection-3.test-helpers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,26 @@ export interface ReflectionAnalysis {
}

/**
 * Classify a free-text task description into a coarse task type using
 * case-insensitive keyword heuristics.
 *
 * Checks run in this order; the first match wins:
 *   1. research keyword AND (coding-action keyword OR GitHub issue URL) -> "coding"
 *   2. research keyword                                                 -> "research"
 *   3. docs keyword                                                     -> "docs"
 *   4. ops / personal-assistant / setup keyword                         -> "ops"
 *   5. coding-action or coding-signal keyword                           -> "coding"
 *   6. otherwise                                                        -> "other"
 *
 * @param text Task description to classify.
 * @returns The inferred task type.
 */
export function inferTaskType(text: string): TaskType {
  const hasResearch = /research|investigate|analyze|compare|evaluate|study/i.test(text)
  const hasCodingAction = /\bfix\b|implement|add|create|build|feature|refactor|improve|update/i.test(text)
  const hasCodingSignal = /\bbug\b|\berror\b|\bregression\b/i.test(text)
  const hasGitHubIssue = /github\.com\/[^\s/]+\/[^\s/]+\/issues\/\d+/i.test(text)

  // When text contains both research AND coding-action keywords (e.g. "investigate and fix this bug"),
  // or references a GitHub issue URL alongside research terms, prefer coding —
  // these are almost always coding tasks even if the description says "investigate".
  // Note: coding-signal words (bug, error, regression) alone don't override research,
  // because "investigate performance regressions" is legitimate research.
  // This check must run before any plain research return, otherwise it is unreachable.
  if (hasResearch && (hasCodingAction || hasGitHubIssue)) return "coding"

  if (hasResearch) return "research"
  if (/docs?|readme|documentation/i.test(text)) return "docs"

  // Ops detection: explicit ops terms and personal-assistant / browser-automation patterns.
  // Must be checked BEFORE coding to avoid "create filter" or "build entities" matching as coding.
  if (/deploy|release|infra|ops|oncall|incident|runbook/i.test(text)) return "ops"
  if (/\bgmail\b|\bemail\b|\bfilter\b|\binbox\b|\bcalendar\b|\blinkedin\b|\brecruiter\b|\bbrowser\b/i.test(text)) return "ops"
  if (/\bclean\s*up\b|\borganize\b|\bconfigure\b|\bsetup\b|\bset\s*up\b|\binstall\b/i.test(text)) return "ops"

  // The bare word "issue" is intentionally not a coding signal here, so a lone
  // GitHub issue URL with no other keywords falls through to "other".
  if (hasCodingAction || hasCodingSignal) return "coding"
  return "other"
}

Expand Down
21 changes: 17 additions & 4 deletions reflection-3.ts
Original file line number Diff line number Diff line change
Expand Up @@ -661,15 +661,26 @@ async function waitForResponse(client: any, sessionId: string): Promise<string |
}

/**
 * Classify a free-text task description into a coarse task type using
 * case-insensitive keyword heuristics.
 *
 * Checks run in this order; the first match wins:
 *   1. research keyword AND (coding-action keyword OR GitHub issue URL) -> "coding"
 *   2. research keyword                                                 -> "research"
 *   3. docs keyword                                                     -> "docs"
 *   4. ops / personal-assistant / setup keyword                         -> "ops"
 *   5. coding-action or coding-signal keyword                           -> "coding"
 *   6. otherwise                                                        -> "other"
 *
 * @param text Task description to classify.
 * @returns The inferred task type.
 */
function inferTaskType(text: string): TaskType {
  const hasResearch = /research|investigate|analyze|compare|evaluate|study/i.test(text)
  const hasCodingAction = /\bfix\b|implement|add|create|build|feature|refactor|improve|update/i.test(text)
  const hasCodingSignal = /\bbug\b|\berror\b|\bregression\b/i.test(text)
  const hasGitHubIssue = /github\.com\/[^\s/]+\/[^\s/]+\/issues\/\d+/i.test(text)

  // When text contains both research AND coding-action keywords (e.g. "investigate and fix this bug"),
  // or references a GitHub issue URL alongside research terms, prefer coding —
  // these are almost always coding tasks even if the description says "investigate".
  // Note: coding-signal words (bug, error, regression) alone don't override research,
  // because "investigate performance regressions" is legitimate research.
  // This check must run before any plain research return, otherwise it is unreachable.
  if (hasResearch && (hasCodingAction || hasGitHubIssue)) return "coding"

  if (hasResearch) return "research"
  if (/docs?|readme|documentation/i.test(text)) return "docs"

  // Ops detection: explicit ops terms and personal-assistant / browser-automation patterns.
  // Must be checked BEFORE coding to avoid "create filter" or "build entities" matching as coding.
  if (/deploy|release|infra|ops|oncall|incident|runbook/i.test(text)) return "ops"
  if (/\bgmail\b|\bemail\b|\bfilter\b|\binbox\b|\bcalendar\b|\blinkedin\b|\brecruiter\b|\bbrowser\b/i.test(text)) return "ops"
  if (/\bclean\s*up\b|\borganize\b|\bconfigure\b|\bsetup\b|\bset\s*up\b|\binstall\b/i.test(text)) return "ops"

  // The bare word "issue" is intentionally not a coding signal here, so a lone
  // GitHub issue URL with no other keywords falls through to "other".
  if (hasCodingAction || hasCodingSignal) return "coding"
  return "other"
}

Expand Down Expand Up @@ -919,7 +930,8 @@ Rules:
- Tests cannot be skipped or marked as flaky/not important.
- Direct pushes to main/master are not allowed; require a PR instead.
- If stuck, propose an alternate approach.
- If you need user action (auth, 2FA, credentials), list it in needs_user_action.`
- If you need user action (auth, 2FA, credentials), list it in needs_user_action.
- PLANNING LOOP CHECK: If the task requires code changes (fix, implement, add, create, build, refactor, update) but the "Tool Commands Run" section shows ONLY read operations (read, glob, grep, git log, git status, git diff, webfetch, task/explore) and NO write operations (edit, write, bash with build/test/commit, github_create_pull_request, etc.), then the task is NOT complete. Set status to "in_progress", set stuck to true, and list "Implement the actual code changes" in remaining_work. Analyzing and recommending changes is not the same as making them.`
}

function parseSelfAssessmentJson(text: string | null | undefined): SelfAssessment | null {
Expand Down Expand Up @@ -1122,6 +1134,7 @@ Rules:
- If PR exists, CI checks must be verified and passing.
- If user action is required (auth/2FA/credentials), set requires_human_action true.
- If agent is stuck, require alternate approach and continued work.
- PLANNING LOOP: If the task requires code changes (fix, implement, add, create, build, refactor) but the Tool Signals show ONLY read operations (read, glob, grep, git log/status/diff, webfetch) and NO write operations (edit, write, bash with build/test/commit, PR creation), set complete to false and add "Implement actual code changes" to missing. Analysis alone does not fulfill an implementation task.

Return JSON only:
{
Expand Down
89 changes: 89 additions & 0 deletions test/reflection-3.unit.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import {
buildEscalatingFeedback,
RoutingConfig
} from "../reflection-3.test-helpers.ts"
import { detectPlanningLoop } from "../reflection-3.ts"

describe("reflection-3 unit", () => {
it("detects task type from text", () => {
Expand Down Expand Up @@ -255,6 +256,30 @@ describe("reflection-3 unit", () => {
assert.strictEqual(inferTaskType("Builds entities and relationships in knowledge graph for email"), "ops")
})

it("classifies as coding when text has both research AND coding keywords (issue #115)", () => {
// The stuck session had text like "investigate ... fix ... issue ... error" — all present.
// research matched first, disabling all workflow gates, letting the task pass as "complete"
// even though the agent only read files and never made any code changes.
assert.strictEqual(inferTaskType("Investigate and fix the login bug"), "coding")
assert.strictEqual(inferTaskType("Analyze the error and implement a fix"), "coding")
assert.strictEqual(inferTaskType("Study the regression and create a patch"), "coding")
assert.strictEqual(inferTaskType("Evaluate the issue and update the handler"), "coding")
})

it("classifies as coding when text contains a GitHub issue URL with research keywords (issue #115)", () => {
// A GitHub issue URL + research keyword should resolve to coding, not research
assert.strictEqual(inferTaskType("Investigate https://github.com/VibeTechnologies/VibeWebAgent/issues/513"), "coding")
assert.strictEqual(inferTaskType("Analyze the problem at https://github.com/org/repo/issues/42"), "coding")
// A bare GitHub URL without research keywords is just "other" (no research to override)
assert.strictEqual(inferTaskType("https://github.com/org/repo/issues/513"), "other")
})

it("still classifies pure research text as research", () => {
assert.strictEqual(inferTaskType("Investigate performance characteristics"), "research")
assert.strictEqual(inferTaskType("Research best practices for caching"), "research")
assert.strictEqual(inferTaskType("Analyze the trade-offs between approaches"), "research")
})

it("shouldContinue is true when agent has actionable work alongside needs_user_action", () => {
const assessment = {
status: "in_progress" as const,
Expand Down Expand Up @@ -341,6 +366,70 @@ describe("reflection-3 unit", () => {
assert.strictEqual(analysis.complete, true)
assert.strictEqual(analysis.missing.length, 0)
})

it("evaluateSelfAssessment marks complete when no requirements and high confidence (issue #115 precondition)", () => {
// This test documents the exact scenario from issue #115:
// When taskType was misclassified as "research", all requires* were false,
// so evaluateSelfAssessment found missing.length===0 and marked it complete.
// The fix is in inferTaskType (prefer coding), but this test verifies the
// evaluator behavior hasn't changed for legitimate research tasks.
const assessment = {
status: "complete" as const,
confidence: 0.95,
evidence: {}
}
const analysis = evaluateSelfAssessment(assessment, {
taskSummary: "Research caching strategies",
taskType: "research",
agentMode: "build",
humanMessages: ["Research caching strategies"],
toolsSummary: "(none)",
detectedSignals: [],
recentCommands: [],
pushedToDefaultBranch: false,
requiresTests: false,
requiresBuild: false,
requiresPR: false,
requiresCI: false,
requiresLocalTests: false,
requiresLocalTestsEvidence: false
})

// For a genuine research task with no requirements, this is correct behavior
assert.strictEqual(analysis.complete, true)
assert.strictEqual(analysis.missing.length, 0)
})

it("detectPlanningLoop catches sessions with only read operations (issue #115)", () => {
// Simulate the stuck session: 15+ tool calls, all reads, zero writes
const messages = [
{
info: { role: "assistant" },
parts: [
{ type: "tool", tool: "github_issue_read", state: { input: {} } },
{ type: "tool", tool: "task", state: { input: {} } },
{ type: "tool", tool: "read", state: { input: {} } },
{ type: "tool", tool: "read", state: { input: {} } },
{ type: "tool", tool: "glob", state: { input: {} } },
{ type: "tool", tool: "grep", state: { input: {} } },
{ type: "tool", tool: "read", state: { input: {} } },
{ type: "tool", tool: "read", state: { input: {} } },
{ type: "tool", tool: "task", state: { input: {} } },
{ type: "tool", tool: "webfetch", state: { input: {} } },
{ type: "tool", tool: "read", state: { input: {} } },
{ type: "tool", tool: "bash", state: { input: { command: "git log --oneline -5" } } },
{ type: "tool", tool: "read", state: { input: {} } },
{ type: "tool", tool: "read", state: { input: {} } },
{ type: "tool", tool: "skill", state: { input: {} } }
]
}
]
const result = detectPlanningLoop(messages)
assert.strictEqual(result.detected, true)
assert.strictEqual(result.writeCount, 0)
assert.ok(result.readCount > 0)
assert.ok(result.totalTools >= 10)
})
})

describe("buildEscalatingFeedback", () => {
Expand Down
Loading