From 25205b7d0ee79884eb0175353da593a626c4ed4e Mon Sep 17 00:00:00 2001 From: Greg Pstrucha <875316+gricha@users.noreply.github.com> Date: Fri, 13 Feb 2026 12:46:02 -0800 Subject: [PATCH 01/12] feat: multi-pass skill pipeline with phase-aware execution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Skills can declare `phase = 2` in warden.toml to run after phase-1 skills complete. Phase-2 skills receive all phase-1 SkillReport[] serialized into their prompt context, enabling meta-analysis skills like a linter rule judge. - Add `phase` field to SkillConfigSchema and ResolvedTrigger - Add groupByPhase() utility in src/pipeline/ - Inject prior findings into buildHunkUserPrompt() between PR context and diff - Thread priorReports through SkillRunnerOptions → analyzeHunk → prompt - Phase-aware execution loops in CLI (runConfigMode, runSkills) and Action - Example linter-rule-judge skill as proof-of-concept phase-2 skill Co-Authored-By: Claude Opus 4.6 --- .agents/skills/linter-rule-judge/SKILL.md | 31 +++++ src/action/triggers/executor.ts | 3 + src/action/workflow/pr-workflow.ts | 52 +++++--- src/cli/main.ts | 133 ++++++++++++------- src/config/loader.ts | 4 + src/config/schema.ts | 2 + src/pipeline/index.ts | 1 + src/pipeline/phase.test.ts | 53 ++++++++ src/pipeline/phase.ts | 29 +++++ src/sdk/analyze.ts | 7 +- src/sdk/prompt.test.ts | 149 ++++++++++++++++++++++ src/sdk/prompt.ts | 57 ++++++++- src/sdk/runner.ts | 2 +- src/sdk/types.ts | 4 +- 14 files changed, 455 insertions(+), 72 deletions(-) create mode 100644 .agents/skills/linter-rule-judge/SKILL.md create mode 100644 src/pipeline/index.ts create mode 100644 src/pipeline/phase.test.ts create mode 100644 src/pipeline/phase.ts create mode 100644 src/sdk/prompt.test.ts diff --git a/.agents/skills/linter-rule-judge/SKILL.md b/.agents/skills/linter-rule-judge/SKILL.md new file mode 100644 index 00000000..f46050de --- /dev/null +++ b/.agents/skills/linter-rule-judge/SKILL.md @@ -0,0 +1,31 @@ +--- +name: linter-rule-judge +description: Evaluate whether findings with fixes could be caught by linter rules +allowed-tools: Read Grep Glob +--- + +# Linter Rule Judge + +You are a second-pass skill that evaluates findings from earlier phases. + +For each prior finding that includes a `suggestedFix`, determine whether the issue could be caught by a static analysis or linter rule. + +## Evaluation + +Categorize each finding as one of: + +- **existing_rule**: An existing linter rule already catches this. Name the tool and rule (e.g., `eslint: no-unused-vars`, `clippy: needless_return`). +- **custom_rule**: A custom lint rule could catch this pattern. Describe what the rule would check and how it would be implemented (AST visitor, regex pattern, etc.). +- **ai_only**: Static analysis cannot reliably catch this. Explain why (requires semantic understanding, cross-file analysis, runtime behavior, etc.). + +## Output + +For each evaluated finding, report a single finding with: +- **title**: `[category] Original finding title` +- **severity**: `info` +- **description**: Your reasoning and, for `existing_rule`/`custom_rule`, the rule details +- **location**: Same location as the original finding + +Skip findings that have no `suggestedFix` -- those are outside this skill's scope. + +If no prior findings have suggested fixes, return an empty findings array. 
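To make the new configuration surface concrete, a minimal warden.toml sketch (assuming a `[[skills]]` table-array layout; skill names and paths are illustrative, and `phase` is the key this patch adds to `SkillConfigSchema`):

```toml
# Sketch only: table layout, skill names, and paths are illustrative assumptions.
# `phase` is the key introduced by this patch; it defaults to 1 when omitted.

[[skills]]
skill = "security-review"     # ordinary first-pass skill, phase 1 by default
paths = ["src/**"]

[[skills]]
skill = "linter-rule-judge"   # the second-pass skill defined above
phase = 2                     # runs after all phase-1 skills complete and receives
                              # their SkillReport[] serialized into its prompt context
```

Only non-empty report lists are forwarded (`priorReports.length > 0`), so a clean phase 1 leaves phase-2 prompts unchanged.
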
diff --git a/src/action/triggers/executor.ts b/src/action/triggers/executor.ts index ed4d8b6d..624b7775 100644 --- a/src/action/triggers/executor.ts +++ b/src/action/triggers/executor.ts @@ -54,6 +54,8 @@ export interface TriggerExecutorDeps { globalRequestChanges?: boolean; /** Global fail-check from action inputs (trigger-specific takes precedence) */ globalFailCheck?: boolean; + /** Prior-phase skill reports, injected into prompts for second-pass skills */ + priorReports?: SkillReport[]; } /** @@ -134,6 +136,7 @@ export async function executeTrigger( maxTurns: trigger.maxTurns, batchDelayMs: config.defaults?.batchDelayMs, pathToClaudeCodeExecutable: claudePath, + priorReports: deps.priorReports, }, }; diff --git a/src/action/workflow/pr-workflow.ts b/src/action/workflow/pr-workflow.ts index 0a5e774a..f5a3cf68 100644 --- a/src/action/workflow/pr-workflow.ts +++ b/src/action/workflow/pr-workflow.ts @@ -18,6 +18,7 @@ import type { ExistingComment } from '../../output/dedup.js'; import { buildAnalyzedScope, findStaleComments, resolveStaleComments } from '../../output/stale.js'; import type { EventContext, SkillReport, Finding } from '../../types/index.js'; import { processInBatches } from '../../utils/index.js'; +import { groupByPhase } from '../../pipeline/index.js'; import { evaluateFixAttempts, postThreadReply } from '../fix-evaluation/index.js'; import type { FixEvaluation } from '../fix-evaluation/index.js'; import { logAction, warnAction } from '../../cli/output/tty.js'; @@ -229,7 +230,9 @@ async function setupGitHubState( } /** - * Run all matched triggers in parallel batches. + * Run all matched triggers, grouped by phase. + * Within each phase triggers run in parallel batches. + * Prior-phase reports are threaded forward to later phases. */ async function executeAllTriggers( matchedTriggers: ResolvedTrigger[], @@ -241,23 +244,36 @@ async function executeAllTriggers( const concurrency = config.runner?.concurrency ?? inputs.parallel; const claudePath = await findClaudeCodeExecutable(); - return processInBatches( - matchedTriggers, - (trigger) => - executeTrigger(trigger, { - octokit, - context, - config, - anthropicApiKey: inputs.anthropicApiKey, - claudePath, - globalFailOn: inputs.failOn, - globalReportOn: inputs.reportOn, - globalMaxFindings: inputs.maxFindings, - globalRequestChanges: inputs.requestChanges, - globalFailCheck: inputs.failCheck, - }), - concurrency - ); + const byPhase = groupByPhase(matchedTriggers); + let allResults: TriggerResult[] = []; + let priorReports: SkillReport[] = []; + + for (const [, phaseTriggers] of byPhase) { + const results = await processInBatches( + phaseTriggers, + (trigger) => + executeTrigger(trigger, { + octokit, + context, + config, + anthropicApiKey: inputs.anthropicApiKey, + claudePath, + globalFailOn: inputs.failOn, + globalReportOn: inputs.reportOn, + globalMaxFindings: inputs.maxFindings, + globalRequestChanges: inputs.requestChanges, + globalFailCheck: inputs.failCheck, + priorReports: priorReports.length > 0 ? priorReports : undefined, + }), + concurrency + ); + + const reports = results.flatMap((r) => r.report ? 
[r.report] : []); + priorReports = [...priorReports, ...reports]; + allResults = [...allResults, ...results]; + } + + return allResults; } /** diff --git a/src/cli/main.ts b/src/cli/main.ts index ad8fd600..e4cdb2e6 100644 --- a/src/cli/main.ts +++ b/src/cli/main.ts @@ -3,7 +3,7 @@ import { dirname, join, resolve } from 'node:path'; import { config as dotenvConfig } from 'dotenv'; import { Sentry, flushSentry } from '../sentry.js'; import { loadWardenConfig, resolveSkillConfigs } from '../config/loader.js'; -import type { SkillRunnerOptions } from '../sdk/runner.js'; +import { groupByPhase } from '../pipeline/index.js'; import { resolveSkillAsync } from '../skills/loader.js'; import { matchTrigger, filterContextByPaths, shouldFail, countFindingsAtOrAbove } from '../triggers/matcher.js'; import type { SkillReport } from '../types/index.js'; @@ -90,6 +90,7 @@ interface SkillToRun { skill: string; remote?: string; filters: { paths?: string[]; ignorePaths?: string[] }; + phase?: number; } interface ProcessedResults { @@ -253,7 +254,7 @@ async function runSkills( seen.add(t.skill); return true; }) - .map((t) => ({ skill: t.skill, remote: t.remote, filters: t.filters })); + .map((t) => ({ skill: t.skill, remote: t.remote, filters: t.filters, phase: t.phase })); } else { skillsToRun = []; } @@ -269,41 +270,63 @@ async function runSkills( return 0; } - // Build skill tasks + // Build skill tasks grouped by phase // Model precedence: defaults.model > CLI flag > WARDEN_MODEL env var > SDK default const model = config?.defaults?.model ?? options.model ?? process.env['WARDEN_MODEL']; - const runnerOptions: SkillRunnerOptions = { - apiKey, - model, - abortController, - maxTurns: config?.defaults?.maxTurns, - batchDelayMs: config?.defaults?.batchDelayMs, - }; - const tasks: SkillTaskOptions[] = skillsToRun.map(({ skill, remote, filters }) => ({ - name: skill, - failOn: options.failOn, - resolveSkill: () => resolveSkillAsync(skill, repoPath, { - remote, - offline: options.offline, - }), - context: filterContextByPaths(context, filters), - runnerOptions, - })); - - // Run skills with Ink UI (TTY) or simple console output (non-TTY) const concurrency = options.parallel ?? DEFAULT_CONCURRENCY; const taskOptions = { mode: reporter.mode, verbosity: reporter.verbosity, concurrency, }; - const results = reporter.mode.isTTY - ? await runSkillTasksWithInk(tasks, taskOptions) - : await runSkillTasks(tasks, taskOptions); + + // Group skills by phase + const byPhase = new Map(); + for (const s of skillsToRun) { + const phase = s.phase ?? 1; + const existing = byPhase.get(phase); + if (existing) { + existing.push(s); + } else { + byPhase.set(phase, [s]); + } + } + const sortedPhases = [...byPhase.entries()].sort(([a], [b]) => a - b); + + let allResults: Awaited> = []; + let priorReports: SkillReport[] = []; + + for (const [, phaseSkills] of sortedPhases) { + const tasks: SkillTaskOptions[] = phaseSkills.map(({ skill, remote, filters }) => ({ + name: skill, + failOn: options.failOn, + resolveSkill: () => resolveSkillAsync(skill, repoPath, { + remote, + offline: options.offline, + }), + context: filterContextByPaths(context, filters), + runnerOptions: { + apiKey, + model, + abortController, + maxTurns: config?.defaults?.maxTurns, + batchDelayMs: config?.defaults?.batchDelayMs, + priorReports: priorReports.length > 0 ? priorReports : undefined, + }, + })); + + const results = reporter.mode.isTTY + ? 
await runSkillTasksWithInk(tasks, taskOptions) + : await runSkillTasks(tasks, taskOptions); + + const reports = results.flatMap((r) => r.report ? [r.report] : []); + priorReports = [...priorReports, ...reports]; + allResults = [...allResults, ...results]; + } // Process results and output const totalDuration = Date.now() - startTime; - const processed = processTaskResults(results, options.reportOn); + const processed = processTaskResults(allResults, options.reportOn); return outputResultsAndHandleFixes(processed, options, reporter, repoPath ?? cwd, totalDuration); } @@ -515,38 +538,50 @@ async function runConfigMode(options: CLIOptions, reporter: Reporter): Promise ({ - name: trigger.name, - displayName: trigger.skill, - failOn: trigger.failOn ?? options.failOn, - resolveSkill: () => resolveSkillAsync(trigger.skill, repoPath, { - remote: trigger.remote, - offline: options.offline, - }), - context: filterContextByPaths(context, trigger.filters), - runnerOptions: { - apiKey, - model: trigger.model, - abortController, - maxTurns: trigger.maxTurns, - }, - })); - - // Run triggers with Ink UI (TTY) or simple console output (non-TTY) + // Group triggers by phase and run each phase sequentially + const byPhase = groupByPhase(triggersToRun); const concurrency = options.parallel ?? config.runner?.concurrency ?? DEFAULT_CONCURRENCY; const taskOptions = { mode: reporter.mode, verbosity: reporter.verbosity, concurrency, }; - const results = reporter.mode.isTTY - ? await runSkillTasksWithInk(tasks, taskOptions) - : await runSkillTasks(tasks, taskOptions); + + let allResults: Awaited> = []; + let priorReports: SkillReport[] = []; + + for (const [, phaseTriggers] of byPhase) { + const tasks: SkillTaskOptions[] = phaseTriggers.map((trigger) => ({ + name: trigger.name, + displayName: trigger.skill, + failOn: trigger.failOn ?? options.failOn, + resolveSkill: () => resolveSkillAsync(trigger.skill, repoPath, { + remote: trigger.remote, + offline: options.offline, + }), + context: filterContextByPaths(context, trigger.filters), + runnerOptions: { + apiKey, + model: trigger.model, + abortController, + maxTurns: trigger.maxTurns, + priorReports: priorReports.length > 0 ? priorReports : undefined, + }, + })); + + const results = reporter.mode.isTTY + ? await runSkillTasksWithInk(tasks, taskOptions) + : await runSkillTasks(tasks, taskOptions); + + // Collect reports for next phase + const reports = results.flatMap((r) => r.report ? [r.report] : []); + priorReports = [...priorReports, ...reports]; + allResults = [...allResults, ...results]; + } // Process results and output const totalDuration = Date.now() - startTime; - const processed = processTaskResults(results, options.reportOn); + const processed = processTaskResults(allResults, options.reportOn); return outputResultsAndHandleFixes(processed, options, reporter, repoPath, totalDuration); } diff --git a/src/config/loader.ts b/src/config/loader.ts index ff0b260c..5713de2d 100644 --- a/src/config/loader.ts +++ b/src/config/loader.ts @@ -98,6 +98,8 @@ export interface ResolvedTrigger { maxTurns?: number; /** Schedule-specific configuration */ schedule?: ScheduleConfig; + /** Execution phase (default: 1). Phase-2 skills receive phase-1 findings in their prompt. */ + phase?: number; } /** @@ -164,6 +166,7 @@ export function resolveSkillConfigs( failCheck: skill.failCheck ?? defaults?.failCheck, model: baseModel, maxTurns: skill.maxTurns ?? 
defaults?.maxTurns, + phase: skill.phase, }); } else { for (const trigger of skill.triggers) { @@ -184,6 +187,7 @@ export function resolveSkillConfigs( model: emptyToUndefined(trigger.model) ?? baseModel, maxTurns: trigger.maxTurns ?? skill.maxTurns ?? defaults?.maxTurns, schedule: trigger.schedule, + phase: skill.phase, }); } } diff --git a/src/config/schema.ts b/src/config/schema.ts index 4b1c9ae3..91e261e3 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -91,6 +91,8 @@ export const SkillConfigSchema = z.object({ ignorePaths: z.array(z.string()).optional(), /** Remote repository reference for the skill (e.g., "owner/repo" or "owner/repo@sha") */ remote: z.string().optional(), + /** Execution phase (default: 1). Phase-2 skills receive phase-1 findings in their prompt. */ + phase: z.number().int().positive().default(1).optional(), // Flattened output fields (skill-level defaults) failOn: SeverityThresholdSchema.optional(), reportOn: SeverityThresholdSchema.optional(), diff --git a/src/pipeline/index.ts b/src/pipeline/index.ts new file mode 100644 index 00000000..592c4e7b --- /dev/null +++ b/src/pipeline/index.ts @@ -0,0 +1 @@ +export { groupByPhase } from './phase.js'; diff --git a/src/pipeline/phase.test.ts b/src/pipeline/phase.test.ts new file mode 100644 index 00000000..dd9c7182 --- /dev/null +++ b/src/pipeline/phase.test.ts @@ -0,0 +1,53 @@ +import { describe, it, expect } from 'vitest'; +import { groupByPhase } from './phase.js'; +import type { ResolvedTrigger } from '../config/loader.js'; + +/** Minimal trigger stub for testing. */ +function makeTrigger(name: string, phase?: number): ResolvedTrigger { + return { + name, + skill: name, + type: '*', + filters: {}, + phase, + }; +} + +describe('groupByPhase', () => { + it('defaults triggers with no phase to 1', () => { + const triggers = [makeTrigger('a'), makeTrigger('b')]; + const result = groupByPhase(triggers); + + expect([...result.keys()]).toEqual([1]); + expect(result.get(1)).toHaveLength(2); + }); + + it('groups mixed phases correctly and sorts ascending', () => { + const triggers = [ + makeTrigger('p2-a', 2), + makeTrigger('p1-a', 1), + makeTrigger('p2-b', 2), + makeTrigger('p3', 3), + makeTrigger('default'), + ]; + const result = groupByPhase(triggers); + + expect([...result.keys()]).toEqual([1, 2, 3]); + expect(result.get(1)!.map((t) => t.name)).toEqual(['p1-a', 'default']); + expect(result.get(2)!.map((t) => t.name)).toEqual(['p2-a', 'p2-b']); + expect(result.get(3)!.map((t) => t.name)).toEqual(['p3']); + }); + + it('returns a single group for single-phase triggers', () => { + const triggers = [makeTrigger('a', 1), makeTrigger('b', 1)]; + const result = groupByPhase(triggers); + + expect([...result.keys()]).toEqual([1]); + expect(result.get(1)).toHaveLength(2); + }); + + it('handles empty input', () => { + const result = groupByPhase([]); + expect(result.size).toBe(0); + }); +}); diff --git a/src/pipeline/phase.ts b/src/pipeline/phase.ts new file mode 100644 index 00000000..df95cc67 --- /dev/null +++ b/src/pipeline/phase.ts @@ -0,0 +1,29 @@ +/** + * Phase grouping utility for multi-pass skill execution. + * + * Groups triggers by their `phase` field (defaulting to 1) + * and returns them sorted by phase ascending. + */ + +import type { ResolvedTrigger } from '../config/loader.js'; + +/** + * Group triggers by execution phase, sorted ascending. + * Triggers without a phase default to phase 1. 
+ */ +export function groupByPhase(triggers: ResolvedTrigger[]): Map { + const grouped = new Map(); + + for (const trigger of triggers) { + const phase = trigger.phase ?? 1; + const existing = grouped.get(phase); + if (existing) { + existing.push(trigger); + } else { + grouped.set(phase, [trigger]); + } + } + + // Return a new Map with keys sorted ascending + return new Map([...grouped.entries()].sort(([a], [b]) => a - b)); +} diff --git a/src/sdk/analyze.ts b/src/sdk/analyze.ts index 8e0ce0b3..b78ba791 100644 --- a/src/sdk/analyze.ts +++ b/src/sdk/analyze.ts @@ -6,7 +6,7 @@ import { Sentry, emitExtractionMetrics, emitRetryMetric, emitDedupMetrics } from import { SkillRunnerError, WardenAuthenticationError, isRetryableError, isAuthenticationError, isAuthenticationErrorMessage } from './errors.js'; import { DEFAULT_RETRY_CONFIG, calculateRetryDelay, sleep } from './retry.js'; import { extractUsage, aggregateUsage, emptyUsage, estimateTokens, aggregateAuxiliaryUsage } from './usage.js'; -import { buildHunkSystemPrompt, buildHunkUserPrompt, type PRPromptContext } from './prompt.js'; +import { buildHunkSystemPrompt, buildHunkUserPrompt, type PRPromptContext, type PriorFindingsContext } from './prompt.js'; import { extractFindingsJson, extractFindingsWithLLM, validateFindings, deduplicateFindings } from './extract.js'; import { LARGE_PROMPT_THRESHOLD_CHARS, @@ -340,7 +340,10 @@ async function analyzeHunk( const { apiKey, abortController, retry } = options; const systemPrompt = buildHunkSystemPrompt(skill); - const userPrompt = buildHunkUserPrompt(skill, hunkCtx, prContext); + const priorFindings: PriorFindingsContext | undefined = options.priorReports?.length + ? { reports: options.priorReports } + : undefined; + const userPrompt = buildHunkUserPrompt(skill, hunkCtx, prContext, priorFindings); // Report prompt size information const systemChars = systemPrompt.length; diff --git a/src/sdk/prompt.test.ts b/src/sdk/prompt.test.ts new file mode 100644 index 00000000..21d185bf --- /dev/null +++ b/src/sdk/prompt.test.ts @@ -0,0 +1,149 @@ +import { describe, it, expect } from 'vitest'; +import { buildHunkUserPrompt, type PriorFindingsContext, type PRPromptContext } from './prompt.js'; +import type { SkillDefinition } from '../config/schema.js'; +import type { HunkWithContext } from '../diff/index.js'; +import type { SkillReport } from '../types/index.js'; + +/** Minimal skill stub. */ +const skill: SkillDefinition = { + name: 'test-skill', + description: 'Test skill', + prompt: 'Check for issues', +}; + +/** Minimal hunk stub. */ +function makeHunk(filename: string): HunkWithContext { + return { + filename, + hunk: { + oldStart: 1, + oldCount: 3, + newStart: 1, + newCount: 5, + content: '@@ -1,3 +1,5 @@\n line1\n+line2\n+line3\n line4\n line5', + lines: [' line1', '+line2', '+line3', ' line4', ' line5'], + }, + contextBefore: [], + contextAfter: [], + contextStartLine: 1, + language: 'typescript', + }; +} + +/** Minimal skill report stub. 
*/ +function makeReport(skillName: string, findings: SkillReport['findings']): SkillReport { + return { + skill: skillName, + summary: `${skillName}: ${findings.length} findings`, + findings, + }; +} + +describe('buildHunkUserPrompt', () => { + it('returns unchanged output when no prior findings provided', () => { + const hunk = makeHunk('src/foo.ts'); + const result = buildHunkUserPrompt(skill, hunk); + expect(result).not.toContain('Prior Findings'); + expect(result).toContain('test-skill'); + }); + + it('includes Prior Findings section for matching file', () => { + const hunk = makeHunk('src/foo.ts'); + const priorFindings: PriorFindingsContext = { + reports: [ + makeReport('security-scan', [ + { + id: 'sec-1', + severity: 'high', + title: 'SQL Injection', + description: 'Unsanitized input in query', + location: { path: 'src/foo.ts', startLine: 10 }, + }, + ]), + ], + }; + + const result = buildHunkUserPrompt(skill, hunk, undefined, priorFindings); + expect(result).toContain('## Prior Findings'); + expect(result).toContain('security-scan'); + expect(result).toContain('[high] SQL Injection'); + expect(result).toContain('Unsanitized input in query'); + }); + + it('omits Prior Findings section when findings are for a different file', () => { + const hunk = makeHunk('src/foo.ts'); + const priorFindings: PriorFindingsContext = { + reports: [ + makeReport('security-scan', [ + { + id: 'sec-1', + severity: 'high', + title: 'SQL Injection', + description: 'Unsanitized input', + location: { path: 'src/bar.ts', startLine: 10 }, + }, + ]), + ], + }; + + const result = buildHunkUserPrompt(skill, hunk, undefined, priorFindings); + expect(result).not.toContain('Prior Findings'); + }); + + it('includes suggestedFix in serialized findings', () => { + const hunk = makeHunk('src/foo.ts'); + const priorFindings: PriorFindingsContext = { + reports: [ + makeReport('lint-check', [ + { + id: 'lint-1', + severity: 'medium', + title: 'Unused variable', + description: 'Variable x is declared but never used', + location: { path: 'src/foo.ts', startLine: 5 }, + suggestedFix: { + description: 'Remove the variable declaration', + diff: '- const x = 1;', + }, + }, + ]), + ], + }; + + const result = buildHunkUserPrompt(skill, hunk, undefined, priorFindings); + expect(result).toContain('Prior Findings'); + expect(result).toContain('Suggested fix: Remove the variable declaration'); + }); + + it('places Prior Findings between PR context and diff', () => { + const hunk = makeHunk('src/foo.ts'); + const prContext: PRPromptContext = { + changedFiles: ['src/foo.ts', 'src/bar.ts'], + title: 'Add feature', + }; + const priorFindings: PriorFindingsContext = { + reports: [ + makeReport('security-scan', [ + { + id: 'sec-1', + severity: 'high', + title: 'Issue found', + description: 'Details', + location: { path: 'src/foo.ts', startLine: 1 }, + }, + ]), + ], + }; + + const result = buildHunkUserPrompt(skill, hunk, prContext, priorFindings); + + const prContextIndex = result.indexOf('Pull Request Context'); + const priorFindingsIndex = result.indexOf('Prior Findings'); + const diffIndex = result.indexOf('## File:'); + + // Prior findings should appear after PR context and before the diff + expect(prContextIndex).toBeGreaterThan(-1); + expect(priorFindingsIndex).toBeGreaterThan(prContextIndex); + expect(diffIndex).toBeGreaterThan(priorFindingsIndex); + }); +}); diff --git a/src/sdk/prompt.ts b/src/sdk/prompt.ts index 261dd897..289ebcda 100644 --- a/src/sdk/prompt.ts +++ b/src/sdk/prompt.ts @@ -1,6 +1,7 @@ import { existsSync } from 
'node:fs'; import { join } from 'node:path'; import type { SkillDefinition } from '../config/schema.js'; +import type { SkillReport } from '../types/index.js'; import { formatHunkForAnalysis, type HunkWithContext } from '../diff/index.js'; /** @@ -19,6 +20,51 @@ export interface PRPromptContext { body?: string | null; } +/** + * Prior findings from earlier-phase skills, injected into prompts + * for second-pass skills. + */ +export interface PriorFindingsContext { + /** Findings from prior-phase skills, grouped by skill */ + reports: SkillReport[]; +} + +/** + * Serialize prior findings for a specific file into a prompt section. + * Only includes findings whose location matches the given filename. + * Returns undefined if no relevant findings exist. + */ +function serializePriorFindings( + priorFindings: PriorFindingsContext, + filename: string +): string | undefined { + const lines: string[] = []; + + for (const report of priorFindings.reports) { + const relevant = report.findings.filter( + (f) => f.location?.path === filename + ); + if (relevant.length === 0) continue; + + lines.push(`### ${report.skill}`); + for (const finding of relevant) { + const loc = finding.location; + const locStr = loc + ? `${loc.path}:${loc.startLine}${loc.endLine ? `-${loc.endLine}` : ''}` + : 'general'; + lines.push(`- **[${finding.severity}] ${finding.title}** (${locStr})`); + lines.push(` ${finding.description}`); + if (finding.suggestedFix) { + lines.push(` Suggested fix: ${finding.suggestedFix.description}`); + } + } + } + + if (lines.length === 0) return undefined; + + return `## Prior Findings\nThe following findings were reported by earlier-phase skills for this file:\n\n${lines.join('\n')}`; +} + /** * Builds the system prompt for hunk-based analysis. * @@ -108,7 +154,8 @@ You can read files from ${dirList} subdirectories using the Read tool with the f export function buildHunkUserPrompt( skill: SkillDefinition, hunkCtx: HunkWithContext, - prContext?: PRPromptContext + prContext?: PRPromptContext, + priorFindings?: PriorFindingsContext ): string { const sections: string[] = []; @@ -128,6 +175,14 @@ export function buildHunkUserPrompt( sections.push(prSection); } + // Include prior findings from earlier-phase skills (between PR context and diff) + if (priorFindings) { + const priorSection = serializePriorFindings(priorFindings, hunkCtx.filename); + if (priorSection) { + sections.push(priorSection); + } + } + // Include list of other files being changed in the PR for context const otherFiles = prContext?.changedFiles.filter((f) => f !== hunkCtx.filename) ?? 
[]; if (otherFiles.length > 0) { diff --git a/src/sdk/runner.ts b/src/sdk/runner.ts index 9b40e1e6..60f9df7b 100644 --- a/src/sdk/runner.ts +++ b/src/sdk/runner.ts @@ -26,7 +26,7 @@ export { apiUsageToStats } from './pricing.js'; // Re-export prompt building (with legacy alias) export { buildHunkSystemPrompt, buildHunkUserPrompt } from './prompt.js'; -export type { PRPromptContext } from './prompt.js'; +export type { PRPromptContext, PriorFindingsContext } from './prompt.js'; // Legacy export for backwards compatibility export { buildHunkSystemPrompt as buildSystemPrompt } from './prompt.js'; diff --git a/src/sdk/types.ts b/src/sdk/types.ts index dd77e162..bf8346c7 100644 --- a/src/sdk/types.ts +++ b/src/sdk/types.ts @@ -1,4 +1,4 @@ -import type { Finding, UsageStats, SkippedFile, RetryConfig } from '../types/index.js'; +import type { Finding, UsageStats, SkippedFile, RetryConfig, SkillReport } from '../types/index.js'; import type { HunkWithContext } from '../diff/index.js'; import type { ChunkingConfig } from '../config/schema.js'; @@ -75,6 +75,8 @@ export interface SkillRunnerOptions { retry?: RetryConfig; /** Enable verbose logging for retry attempts */ verbose?: boolean; + /** Prior-phase skill reports, injected into prompts for second-pass skills */ + priorReports?: SkillReport[]; } /** From c110caac6f59673cb74d8ba40b22dc5d8b394022 Mon Sep 17 00:00:00 2001 From: Greg Pstrucha <875316+gricha@users.noreply.github.com> Date: Fri, 13 Feb 2026 13:27:22 -0800 Subject: [PATCH 02/12] refine linter-rule-judge: produce rules, not commentary Rewrite skill to only report when it can generate an actual lint rule config or implementation as a suggestedFix. Skip silently otherwise. Co-Authored-By: Claude Opus 4.6 --- .agents/skills/linter-rule-judge/SKILL.md | 34 ++++++++++++----------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/.agents/skills/linter-rule-judge/SKILL.md b/.agents/skills/linter-rule-judge/SKILL.md index f46050de..a1981cf5 100644 --- a/.agents/skills/linter-rule-judge/SKILL.md +++ b/.agents/skills/linter-rule-judge/SKILL.md @@ -1,31 +1,33 @@ --- name: linter-rule-judge -description: Evaluate whether findings with fixes could be caught by linter rules +description: Generate lint rules that replace AI findings with deterministic checks allowed-tools: Read Grep Glob --- # Linter Rule Judge -You are a second-pass skill that evaluates findings from earlier phases. +You are a second-pass skill. Your job: turn AI findings into lint rules. -For each prior finding that includes a `suggestedFix`, determine whether the issue could be caught by a static analysis or linter rule. +For each prior finding that has a `suggestedFix`, determine if a linter rule could catch the same issue deterministically. Only report findings where you can produce the actual rule. -## Evaluation +## What to report -Categorize each finding as one of: +**Existing rule you can enable**: Produce a `suggestedFix` with the config snippet to enable it. For example, an `.eslintrc` diff adding `"no-eval": "error"`, or a `clippy.toml` change. -- **existing_rule**: An existing linter rule already catches this. Name the tool and rule (e.g., `eslint: no-unused-vars`, `clippy: needless_return`). -- **custom_rule**: A custom lint rule could catch this pattern. Describe what the rule would check and how it would be implemented (AST visitor, regex pattern, etc.). -- **ai_only**: Static analysis cannot reliably catch this. Explain why (requires semantic understanding, cross-file analysis, runtime behavior, etc.). 
+**Custom rule you can write**: Produce a `suggestedFix` containing the rule implementation. ESLint plugin rule, oxlint plugin, Semgrep pattern, whatever fits the project's toolchain. Use `Read` and `Glob` to check which linters the project actually uses before proposing a rule for a tool they don't have. -## Output +## What to skip silently -For each evaluated finding, report a single finding with: -- **title**: `[category] Original finding title` -- **severity**: `info` -- **description**: Your reasoning and, for `existing_rule`/`custom_rule`, the rule details -- **location**: Same location as the original finding +- Findings without a `suggestedFix` (not your problem) +- Findings that require semantic understanding, cross-file analysis, or runtime context (AI-only, nothing to generate) +- Findings where you'd just be restating "a rule could exist" without producing it -Skip findings that have no `suggestedFix` -- those are outside this skill's scope. +If nothing is actionable, return an empty findings array. Silence is fine. -If no prior findings have suggested fixes, return an empty findings array. +## Output format + +- **title**: The lint rule identifier (e.g., `no-eval`, `custom: no-execSync-interpolation`) +- **severity**: `low` +- **description**: One sentence on what the rule catches +- **suggestedFix**: The actual config change or rule implementation +- **location**: Same as the original finding From 3015852064cc6371460002b6ecb737e8ce0a784f Mon Sep 17 00:00:00 2001 From: Greg Pstrucha <875316+gricha@users.noreply.github.com> Date: Fri, 13 Feb 2026 13:33:22 -0800 Subject: [PATCH 03/12] refine linter-rule-judge: require linter detection and high confidence Skill now must detect the project's actual linter system before proposing anything. Only reports when it can produce a concrete rule with high precision. Silent when nothing qualifies. Co-Authored-By: Claude Opus 4.6 --- .agents/skills/linter-rule-judge/SKILL.md | 52 +++++++++++++++++------ 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/.agents/skills/linter-rule-judge/SKILL.md b/.agents/skills/linter-rule-judge/SKILL.md index a1981cf5..6fc3b5c4 100644 --- a/.agents/skills/linter-rule-judge/SKILL.md +++ b/.agents/skills/linter-rule-judge/SKILL.md @@ -6,28 +6,56 @@ allowed-tools: Read Grep Glob # Linter Rule Judge -You are a second-pass skill. Your job: turn AI findings into lint rules. +You are a second-pass skill. Your job: turn AI findings into lint rules that the project can actually use. -For each prior finding that has a `suggestedFix`, determine if a linter rule could catch the same issue deterministically. Only report findings where you can produce the actual rule. +## Step 1: Detect the linter -## What to report +Before evaluating any findings, determine what linter system the project uses. Use `Glob` and `Read` to check for: -**Existing rule you can enable**: Produce a `suggestedFix` with the config snippet to enable it. For example, an `.eslintrc` diff adding `"no-eval": "error"`, or a `clippy.toml` change. +- `.oxlintrc.json` / `oxlint.json` (oxlint) +- `.eslintrc.*` / `eslint.config.*` / `"eslintConfig"` in package.json (eslint) +- `clippy.toml` / `.clippy.toml` (Rust clippy) +- `.pylintrc` / `pyproject.toml` with `[tool.pylint]` (pylint) +- `.flake8` / `setup.cfg` with `[flake8]` (flake8) +- `biome.json` / `biome.jsonc` (biome) -**Custom rule you can write**: Produce a `suggestedFix` containing the rule implementation. 
ESLint plugin rule, oxlint plugin, Semgrep pattern, whatever fits the project's toolchain. Use `Read` and `Glob` to check which linters the project actually uses before proposing a rule for a tool they don't have. +Also check whether the linter supports custom/plugin rules: +- oxlint: check for `jsPlugins` in config and an existing plugins directory +- eslint: check for local plugins or `eslint-plugin-*` deps +- biome: no custom rule support, existing rules only + +If the project has no linter, return an empty findings array. You cannot propose rules for a tool that doesn't exist. + +## Step 2: Evaluate prior findings + +For each prior finding that has a `suggestedFix`, ask: can this exact pattern be caught deterministically by the linter we found in Step 1? + +**Only report if ALL of these are true:** +1. You can identify a specific existing rule by name, OR you can write a complete working custom rule +2. The rule would catch this pattern with high precision (low false positive rate) +3. The project's linter actually supports this (don't propose eslint rules for an oxlint project unless there's eslint too) ## What to skip silently -- Findings without a `suggestedFix` (not your problem) -- Findings that require semantic understanding, cross-file analysis, or runtime context (AI-only, nothing to generate) -- Findings where you'd just be restating "a rule could exist" without producing it +- Findings without `suggestedFix` +- Patterns that need semantic understanding, type information the linter can't access, cross-file analysis, or runtime context +- Cases where a rule would have a high false positive rate +- Cases where you're not confident the rule implementation is correct -If nothing is actionable, return an empty findings array. Silence is fine. +Return an empty findings array when nothing qualifies. That's the expected common case. ## Output format -- **title**: The lint rule identifier (e.g., `no-eval`, `custom: no-execSync-interpolation`) +For existing rules: +- **title**: The rule name (e.g., `no-eval`) +- **severity**: `low` +- **description**: One sentence on what pattern it catches +- **suggestedFix**: A diff enabling the rule in the project's linter config +- **location**: Same as the original finding + +For custom rules: +- **title**: `custom: ` (e.g., `custom: no-execsync-interpolation`) - **severity**: `low` -- **description**: One sentence on what the rule catches -- **suggestedFix**: The actual config change or rule implementation +- **description**: One sentence on the pattern it detects +- **suggestedFix**: The complete rule implementation file AND the config diff to wire it up. Match the style and conventions of existing custom rules in the project. - **location**: Same as the original finding From 963c94ffcc99e66df944d1d2d3611291a21150e6 Mon Sep 17 00:00:00 2001 From: Greg Pstrucha <875316+gricha@users.noreply.github.com> Date: Fri, 13 Feb 2026 13:33:59 -0800 Subject: [PATCH 04/12] linter-rule-judge: require deterministic AST patterns, not heuristics Sharpen the confidence bar: rules must match specific syntactic patterns in the AST with zero/near-zero false positives. Explicitly distinguish deterministic checks (ban eval(), require ===) from heuristic guesses (looks like user input, probably a bug). Skip anything that isn't the former. 
Co-Authored-By: Claude Opus 4.6 --- .agents/skills/linter-rule-judge/SKILL.md | 34 ++++++++++++++++------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/.agents/skills/linter-rule-judge/SKILL.md b/.agents/skills/linter-rule-judge/SKILL.md index 6fc3b5c4..8b599112 100644 --- a/.agents/skills/linter-rule-judge/SKILL.md +++ b/.agents/skills/linter-rule-judge/SKILL.md @@ -6,7 +6,9 @@ allowed-tools: Read Grep Glob # Linter Rule Judge -You are a second-pass skill. Your job: turn AI findings into lint rules that the project can actually use. +You are a second-pass skill. Your job: turn AI findings into deterministic lint rules. + +The bar is high. Only propose a rule when you can guarantee it catches the exact pattern through AST structure, not heuristics. A rule that fires on `eval(anything)` is deterministic. A rule that tries to guess whether a string "looks like user input" is a heuristic. Only the first kind belongs here. ## Step 1: Detect the linter @@ -28,19 +30,31 @@ If the project has no linter, return an empty findings array. You cannot propose ## Step 2: Evaluate prior findings -For each prior finding that has a `suggestedFix`, ask: can this exact pattern be caught deterministically by the linter we found in Step 1? +For each prior finding that has a `suggestedFix`, ask: can this exact pattern be caught by a deterministic AST check in the linter we found? + +**Deterministic means:** +- The rule matches a specific syntactic pattern in the AST (node type, property name, call signature) +- Zero or near-zero false positives -- if the AST matches, the code is wrong +- No guessing about intent, data flow, variable contents, or runtime behavior +- Examples: banning `eval()`, requiring `===` over `==`, disallowing `execSync` with template literal arguments, flagging `new Function()` calls + +**Not deterministic (skip these):** +- "This variable might contain user input" (data flow analysis) +- "This function name suggests it handles sensitive data" (naming heuristic) +- "This pattern is usually a bug" (probabilistic) +- Anything that requires understanding what a variable contains at runtime **Only report if ALL of these are true:** 1. You can identify a specific existing rule by name, OR you can write a complete working custom rule -2. The rule would catch this pattern with high precision (low false positive rate) -3. The project's linter actually supports this (don't propose eslint rules for an oxlint project unless there's eslint too) +2. The rule is deterministic: it matches AST structure, not heuristics +3. The project's linter actually supports this ## What to skip silently - Findings without `suggestedFix` -- Patterns that need semantic understanding, type information the linter can't access, cross-file analysis, or runtime context -- Cases where a rule would have a high false positive rate -- Cases where you're not confident the rule implementation is correct +- Patterns that need type information the linter can't access, cross-file context, or runtime knowledge +- Patterns where the rule would need to guess or use heuristics +- Cases where you're not confident the rule is correct and complete Return an empty findings array when nothing qualifies. That's the expected common case. @@ -49,13 +63,13 @@ Return an empty findings array when nothing qualifies. 
That's the expected commo For existing rules: - **title**: The rule name (e.g., `no-eval`) - **severity**: `low` -- **description**: One sentence on what pattern it catches +- **description**: One sentence: what AST pattern it matches - **suggestedFix**: A diff enabling the rule in the project's linter config - **location**: Same as the original finding For custom rules: - **title**: `custom: ` (e.g., `custom: no-execsync-interpolation`) - **severity**: `low` -- **description**: One sentence on the pattern it detects -- **suggestedFix**: The complete rule implementation file AND the config diff to wire it up. Match the style and conventions of existing custom rules in the project. +- **description**: One sentence: what AST pattern it matches +- **suggestedFix**: The complete rule implementation file AND the config diff to wire it up. Match the conventions of existing custom rules in the project. - **location**: Same as the original finding From fe552133718942208f4a419c8fa1b097111e5a6a Mon Sep 17 00:00:00 2001 From: Greg Pstrucha <875316+gricha@users.noreply.github.com> Date: Fri, 13 Feb 2026 13:49:31 -0800 Subject: [PATCH 05/12] linter-rule-judge: omit location so findings post as top-level comments Findings target linter config/plugin files, not source code. Setting location to the original finding's source line causes nonsensical inline comments (oxlint config snippets suggested on loader.ts lines). Omitting location makes them top-level review comments instead. Co-Authored-By: Claude Opus 4.6 --- .agents/skills/linter-rule-judge/SKILL.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.agents/skills/linter-rule-judge/SKILL.md b/.agents/skills/linter-rule-judge/SKILL.md index 8b599112..9951c2f1 100644 --- a/.agents/skills/linter-rule-judge/SKILL.md +++ b/.agents/skills/linter-rule-judge/SKILL.md @@ -60,16 +60,16 @@ Return an empty findings array when nothing qualifies. That's the expected commo ## Output format +**Do NOT set a `location` field.** These findings target linter config and plugin files, not the source code where the original issue was found. Omitting location ensures they appear as top-level review comments, not inline on unrelated source lines. + For existing rules: - **title**: The rule name (e.g., `no-eval`) - **severity**: `low` - **description**: One sentence: what AST pattern it matches -- **suggestedFix**: A diff enabling the rule in the project's linter config -- **location**: Same as the original finding +- **suggestedFix**: A diff enabling the rule in the project's linter config file For custom rules: - **title**: `custom: ` (e.g., `custom: no-execsync-interpolation`) - **severity**: `low` - **description**: One sentence: what AST pattern it matches - **suggestedFix**: The complete rule implementation file AND the config diff to wire it up. Match the conventions of existing custom rules in the project. 
-- **location**: Same as the original finding From f621f977c2ae29e7c5cc5b65418004840bcb3890 Mon Sep 17 00:00:00 2001 From: Greg Pstrucha <875316+gricha@users.noreply.github.com> Date: Fri, 13 Feb 2026 15:43:09 -0800 Subject: [PATCH 06/12] linter-rule-judge: description is the primary output, suggestedFix for local --fix only Co-Authored-By: Claude Opus 4.6 --- .agents/skills/linter-rule-judge/SKILL.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.agents/skills/linter-rule-judge/SKILL.md b/.agents/skills/linter-rule-judge/SKILL.md index 9951c2f1..09bf3b1e 100644 --- a/.agents/skills/linter-rule-judge/SKILL.md +++ b/.agents/skills/linter-rule-judge/SKILL.md @@ -62,14 +62,18 @@ Return an empty findings array when nothing qualifies. That's the expected commo **Do NOT set a `location` field.** These findings target linter config and plugin files, not the source code where the original issue was found. Omitting location ensures they appear as top-level review comments, not inline on unrelated source lines. +**The `description` is the primary output.** It must be self-contained and actionable: tell the developer exactly what to do, which config file to change, and what rule to enable or create. Write it so someone reading a PR comment knows the next step without seeing the diff. Example: "Enable the `no-eval` rule in `.oxlintrc.json` under `rules` to ban all `eval()` calls. Run `warden --fix` locally to apply." + +The `suggestedFix` carries the machine-readable diff for local application via `warden --fix`. It is not shown in PR comments. + For existing rules: - **title**: The rule name (e.g., `no-eval`) - **severity**: `low` -- **description**: One sentence: what AST pattern it matches +- **description**: What the rule catches, which config file to change, and how. Actionable on its own. - **suggestedFix**: A diff enabling the rule in the project's linter config file For custom rules: - **title**: `custom: ` (e.g., `custom: no-execsync-interpolation`) - **severity**: `low` -- **description**: One sentence: what AST pattern it matches +- **description**: What AST pattern the rule matches, which files to create/modify, and how to wire it up. Actionable on its own. - **suggestedFix**: The complete rule implementation file AND the config diff to wire it up. Match the conventions of existing custom rules in the project. From 9233626bcd8a54e470c99f4b7af14b91b5a7bf22 Mon Sep 17 00:00:00 2001 From: Greg Pstrucha <875316+gricha@users.noreply.github.com> Date: Fri, 13 Feb 2026 15:52:18 -0800 Subject: [PATCH 07/12] render skill summary as header in review body for locationless findings Co-Authored-By: Claude Opus 4.6 --- .agents/skills/linter-rule-judge/SKILL.md | 4 +++- src/output/renderer.ts | 12 +++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.agents/skills/linter-rule-judge/SKILL.md b/.agents/skills/linter-rule-judge/SKILL.md index 09bf3b1e..2baf4282 100644 --- a/.agents/skills/linter-rule-judge/SKILL.md +++ b/.agents/skills/linter-rule-judge/SKILL.md @@ -60,9 +60,11 @@ Return an empty findings array when nothing qualifies. That's the expected commo ## Output format +**Set the report `summary`** to a short framing paragraph. This appears as a header on the PR comment. It should explain that these are long-term improvements the developer can adopt, and suggest prompting a local agent to apply them. Example: "Warden identified lint rules that could permanently catch patterns flagged in this review. 
To apply, prompt your local agent with the instructions below, or run `warden --fix` locally." + **Do NOT set a `location` field.** These findings target linter config and plugin files, not the source code where the original issue was found. Omitting location ensures they appear as top-level review comments, not inline on unrelated source lines. -**The `description` is the primary output.** It must be self-contained and actionable: tell the developer exactly what to do, which config file to change, and what rule to enable or create. Write it so someone reading a PR comment knows the next step without seeing the diff. Example: "Enable the `no-eval` rule in `.oxlintrc.json` under `rules` to ban all `eval()` calls. Run `warden --fix` locally to apply." +**The `description` is the primary output.** It must be self-contained and actionable: tell the developer exactly what to do, which config file to change, and what rule to enable or create. Write it so someone reading a PR comment knows the next step without seeing the diff. Example: "Enable the `no-eval` rule in `.oxlintrc.json` under `rules` to ban all `eval()` calls." The `suggestedFix` carries the machine-readable diff for local application via `warden --fix`. It is not shown in PR comments. diff --git a/src/output/renderer.ts b/src/output/renderer.ts index 2031b553..1a709c9d 100644 --- a/src/output/renderer.ts +++ b/src/output/renderer.ts @@ -55,7 +55,7 @@ function renderReview( if (findingsWithoutLocation.length > 0) { return { event, - body: renderFindingsBody(findingsWithoutLocation, report.skill), + body: renderFindingsBody(findingsWithoutLocation, report.skill, report.summary), comments: [], }; } @@ -105,7 +105,7 @@ function renderReview( // Include locationless findings in the review body when mixed with inline comments const body = findingsWithoutLocation.length > 0 - ? renderFindingsBody(findingsWithoutLocation, report.skill) + ? renderFindingsBody(findingsWithoutLocation, report.skill, report.summary) : ''; return { @@ -261,8 +261,14 @@ function renderFindingItem(finding: Finding): string { } /** Render findings as markdown for inclusion in a review body. */ -export function renderFindingsBody(findings: Finding[], skill: string): string { +export function renderFindingsBody(findings: Finding[], skill: string, summary?: string): string { const lines: string[] = []; + if (summary) { + lines.push(`### ${skill}`); + lines.push(''); + lines.push(summary); + lines.push(''); + } for (const finding of findings) { const emoji = SEVERITY_EMOJI[finding.severity]; const location = finding.location From 155e8f1ae3264e468fa122ad9b1be20b5470cbe7 Mon Sep 17 00:00:00 2001 From: Greg Pstrucha <875316+gricha@users.noreply.github.com> Date: Fri, 13 Feb 2026 15:57:49 -0800 Subject: [PATCH 08/12] use skill description as summary header in review body The summary now includes the skill's description from SKILL.md frontmatter, giving phase-2 skills like linter-rule-judge a proper framing paragraph in PR comments instead of just stats. 
Co-Authored-By: Claude Opus 4.6 --- .agents/skills/linter-rule-judge/SKILL.md | 4 +--- src/sdk/analyze.ts | 12 ++++++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/.agents/skills/linter-rule-judge/SKILL.md b/.agents/skills/linter-rule-judge/SKILL.md index 2baf4282..aa0ac1fd 100644 --- a/.agents/skills/linter-rule-judge/SKILL.md +++ b/.agents/skills/linter-rule-judge/SKILL.md @@ -1,6 +1,6 @@ --- name: linter-rule-judge -description: Generate lint rules that replace AI findings with deterministic checks +description: Warden identified lint rules that could permanently catch patterns flagged in this review. To apply, prompt your local agent with the instructions below or run `warden --fix` locally. allowed-tools: Read Grep Glob --- @@ -60,8 +60,6 @@ Return an empty findings array when nothing qualifies. That's the expected commo ## Output format -**Set the report `summary`** to a short framing paragraph. This appears as a header on the PR comment. It should explain that these are long-term improvements the developer can adopt, and suggest prompting a local agent to apply them. Example: "Warden identified lint rules that could permanently catch patterns flagged in this review. To apply, prompt your local agent with the instructions below, or run `warden --fix` locally." - **Do NOT set a `location` field.** These findings target linter config and plugin files, not the source code where the original issue was found. Omitting location ensures they appear as top-level review comments, not inline on unrelated source lines. **The `description` is the primary output.** It must be self-contained and actionable: tell the developer exactly what to do, which config file to change, and what rule to enable or create. Write it so someone reading a PR comment knows the next step without seeing the diff. Example: "Enable the `no-eval` rule in `.oxlintrc.json` under `rules` to ban all `eval()` calls." diff --git a/src/sdk/analyze.ts b/src/sdk/analyze.ts index b78ba791..ffaab8ff 100644 --- a/src/sdk/analyze.ts +++ b/src/sdk/analyze.ts @@ -633,7 +633,7 @@ export async function analyzeFile( /** * Generate a summary of findings. */ -export function generateSummary(skillName: string, findings: Finding[]): string { +export function generateSummary(skillName: string, findings: Finding[], skillDescription?: string): string { if (findings.length === 0) { return `${skillName}: No issues found`; } @@ -650,7 +650,11 @@ export function generateSummary(skillName: string, findings: Finding[]): string if (counts['low']) parts.push(`${counts['low']} low`); if (counts['info']) parts.push(`${counts['info']} info`); - return `${skillName}: Found ${findings.length} issue${findings.length === 1 ? '' : 's'} (${parts.join(', ')})`; + const stats = `Found ${findings.length} issue${findings.length === 1 ? 
'' : 's'} (${parts.join(', ')})`; + if (skillDescription) { + return `${skillDescription}\n\n${stats}`; + } + return `${skillName}: ${stats}`; } /** @@ -822,8 +826,8 @@ export async function runSkill( const uniqueFindings = deduplicateFindings(allFindings); emitDedupMetrics(allFindings.length, uniqueFindings.length); - // Generate summary - const summary = generateSummary(skill.name, uniqueFindings); + // Generate summary, including skill description when findings exist + const summary = generateSummary(skill.name, uniqueFindings, skill.description); // Aggregate usage across all hunks const totalUsage = aggregateUsage(allUsage); From a72eefb8ca79bc194806a38f63205a0d692b3745 Mon Sep 17 00:00:00 2001 From: Greg Pstrucha <875316+gricha@users.noreply.github.com> Date: Fri, 13 Feb 2026 16:04:14 -0800 Subject: [PATCH 09/12] linter-rule-judge: frame findings as agent-promptable instructions Description explains GitHub limitations and directs users to copy-paste each finding as a prompt to their local coding agent. Co-Authored-By: Claude Opus 4.6 --- .agents/skills/linter-rule-judge/SKILL.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.agents/skills/linter-rule-judge/SKILL.md b/.agents/skills/linter-rule-judge/SKILL.md index aa0ac1fd..965edb5c 100644 --- a/.agents/skills/linter-rule-judge/SKILL.md +++ b/.agents/skills/linter-rule-judge/SKILL.md @@ -1,6 +1,6 @@ --- name: linter-rule-judge -description: Warden identified lint rules that could permanently catch patterns flagged in this review. To apply, prompt your local agent with the instructions below or run `warden --fix` locally. +description: Warden identified lint rules that could permanently catch patterns flagged in this review. These target linter config and plugin files outside this PR, so GitHub can't propose them as inline suggestions. To apply, copy the prompt for each rule below to your local coding agent, or run `warden --fix` locally. allowed-tools: Read Grep Glob --- @@ -62,18 +62,18 @@ Return an empty findings array when nothing qualifies. That's the expected commo **Do NOT set a `location` field.** These findings target linter config and plugin files, not the source code where the original issue was found. Omitting location ensures they appear as top-level review comments, not inline on unrelated source lines. -**The `description` is the primary output.** It must be self-contained and actionable: tell the developer exactly what to do, which config file to change, and what rule to enable or create. Write it so someone reading a PR comment knows the next step without seeing the diff. Example: "Enable the `no-eval` rule in `.oxlintrc.json` under `rules` to ban all `eval()` calls." +**The `description` is the primary output.** Write each finding's description as a prompt you could copy-paste directly into a local coding agent. It should be a clear, complete instruction that an agent can act on without additional context. Example: "Add `\"no-eval\": \"error\"` to the `rules` object in `.oxlintrc.json` to ban all `eval()` calls." The `suggestedFix` carries the machine-readable diff for local application via `warden --fix`. It is not shown in PR comments. For existing rules: - **title**: The rule name (e.g., `no-eval`) - **severity**: `low` -- **description**: What the rule catches, which config file to change, and how. Actionable on its own. +- **description**: A copy-pasteable prompt: which config file to edit, what to add, and why. 
- **suggestedFix**: A diff enabling the rule in the project's linter config file For custom rules: - **title**: `custom: ` (e.g., `custom: no-execsync-interpolation`) - **severity**: `low` -- **description**: What AST pattern the rule matches, which files to create/modify, and how to wire it up. Actionable on its own. +- **description**: A copy-pasteable prompt: what plugin file to create, what AST pattern it matches, and how to wire it into the linter config. - **suggestedFix**: The complete rule implementation file AND the config diff to wire it up. Match the conventions of existing custom rules in the project. From 0b704b995cc9ec6a9ba4718ed0a0972b5257c9e4 Mon Sep 17 00:00:00 2001 From: Greg Pstrucha <875316+gricha@users.noreply.github.com> Date: Fri, 13 Feb 2026 16:16:25 -0800 Subject: [PATCH 10/12] fix: pass skill description to generateSummary in tasks.ts The runSkillTask path in tasks.ts was the second call site that wasn't passing skill.description, so the summary never included the framing text. Co-Authored-By: Claude Opus 4.6 --- src/cli/output/tasks.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cli/output/tasks.ts b/src/cli/output/tasks.ts index b9dc8516..03069c88 100644 --- a/src/cli/output/tasks.ts +++ b/src/cli/output/tasks.ts @@ -334,7 +334,7 @@ export async function runSkillTask( const report: SkillReport = { skill: skill.name, - summary: generateSummary(skill.name, uniqueFindings), + summary: generateSummary(skill.name, uniqueFindings, skill.description), findings: uniqueFindings, usage: aggregateUsage(allUsage), durationMs: duration, From d76efb281c141551c16a774c84449f08c6f24f2f Mon Sep 17 00:00:00 2001 From: Greg Pstrucha <875316+gricha@users.noreply.github.com> Date: Fri, 13 Feb 2026 17:13:51 -0800 Subject: [PATCH 11/12] feat: report-scoped skill execution Report-scoped skills run once with all prior findings instead of per-hunk. Single SDK call, no diff. Implicitly ordered after all diff-scoped skills via phase = Infinity. 
- Add `scope` field to SkillConfigSchema and ResolvedTrigger - Add `buildReportUserPrompt` and `serializeAllPriorFindings` - Add `analyzeReport` function, branch in `runSkill`/`runSkillTask` - Thread scope through CLI and action callers - Use `scope = "report"` for warden-lint-judge in warden.toml Co-Authored-By: Claude Opus 4.6 --- src/action/triggers/executor.ts | 1 + src/action/workflow/pr-workflow.ts | 8 +- src/cli/main.ts | 37 ++++----- src/cli/output/tasks.ts | 57 ++++++++++--- src/config/loader.test.ts | 105 ++++++++++++++++++++++++ src/config/loader.ts | 12 ++- src/config/schema.ts | 2 + src/pipeline/phase.ts | 21 +++-- src/sdk/analyze.ts | 88 ++++++++++++++++++++- src/sdk/prompt.test.ts | 123 ++++++++++++++++++++++++++++- src/sdk/prompt.ts | 123 +++++++++++++++++++++++------ src/sdk/runner.ts | 4 +- src/sdk/types.ts | 2 + warden.toml | 9 +++ 14 files changed, 515 insertions(+), 77 deletions(-) diff --git a/src/action/triggers/executor.ts b/src/action/triggers/executor.ts index 624b7775..ca7ff482 100644 --- a/src/action/triggers/executor.ts +++ b/src/action/triggers/executor.ts @@ -137,6 +137,7 @@ export async function executeTrigger( batchDelayMs: config.defaults?.batchDelayMs, pathToClaudeCodeExecutable: claudePath, priorReports: deps.priorReports, + scope: trigger.scope, }, }; diff --git a/src/action/workflow/pr-workflow.ts b/src/action/workflow/pr-workflow.ts index f5a3cf68..ef915b86 100644 --- a/src/action/workflow/pr-workflow.ts +++ b/src/action/workflow/pr-workflow.ts @@ -245,8 +245,8 @@ async function executeAllTriggers( const claudePath = await findClaudeCodeExecutable(); const byPhase = groupByPhase(matchedTriggers); - let allResults: TriggerResult[] = []; - let priorReports: SkillReport[] = []; + const allResults: TriggerResult[] = []; + const priorReports: SkillReport[] = []; for (const [, phaseTriggers] of byPhase) { const results = await processInBatches( @@ -269,8 +269,8 @@ async function executeAllTriggers( ); const reports = results.flatMap((r) => r.report ? [r.report] : []); - priorReports = [...priorReports, ...reports]; - allResults = [...allResults, ...results]; + priorReports.push(...reports); + allResults.push(...results); } return allResults; diff --git a/src/cli/main.ts b/src/cli/main.ts index e4cdb2e6..e249e5db 100644 --- a/src/cli/main.ts +++ b/src/cli/main.ts @@ -91,6 +91,7 @@ interface SkillToRun { remote?: string; filters: { paths?: string[]; ignorePaths?: string[] }; phase?: number; + scope?: 'diff' | 'report'; } interface ProcessedResults { @@ -254,7 +255,7 @@ async function runSkills( seen.add(t.skill); return true; }) - .map((t) => ({ skill: t.skill, remote: t.remote, filters: t.filters, phase: t.phase })); + .map((t) => ({ skill: t.skill, remote: t.remote, filters: t.filters, phase: t.phase, scope: t.scope })); } else { skillsToRun = []; } @@ -281,23 +282,13 @@ async function runSkills( }; // Group skills by phase - const byPhase = new Map(); - for (const s of skillsToRun) { - const phase = s.phase ?? 
1; - const existing = byPhase.get(phase); - if (existing) { - existing.push(s); - } else { - byPhase.set(phase, [s]); - } - } - const sortedPhases = [...byPhase.entries()].sort(([a], [b]) => a - b); + const byPhase = groupByPhase(skillsToRun); - let allResults: Awaited> = []; - let priorReports: SkillReport[] = []; + const allResults: Awaited> = []; + const priorReports: SkillReport[] = []; - for (const [, phaseSkills] of sortedPhases) { - const tasks: SkillTaskOptions[] = phaseSkills.map(({ skill, remote, filters }) => ({ + for (const [, phaseSkills] of byPhase) { + const tasks: SkillTaskOptions[] = phaseSkills.map(({ skill, remote, filters, scope }) => ({ name: skill, failOn: options.failOn, resolveSkill: () => resolveSkillAsync(skill, repoPath, { @@ -312,6 +303,7 @@ async function runSkills( maxTurns: config?.defaults?.maxTurns, batchDelayMs: config?.defaults?.batchDelayMs, priorReports: priorReports.length > 0 ? priorReports : undefined, + scope, }, })); @@ -320,8 +312,8 @@ async function runSkills( : await runSkillTasks(tasks, taskOptions); const reports = results.flatMap((r) => r.report ? [r.report] : []); - priorReports = [...priorReports, ...reports]; - allResults = [...allResults, ...results]; + priorReports.push(...reports); + allResults.push(...results); } // Process results and output @@ -547,8 +539,8 @@ async function runConfigMode(options: CLIOptions, reporter: Reporter): Promise> = []; - let priorReports: SkillReport[] = []; + const allResults: Awaited> = []; + const priorReports: SkillReport[] = []; for (const [, phaseTriggers] of byPhase) { const tasks: SkillTaskOptions[] = phaseTriggers.map((trigger) => ({ @@ -566,6 +558,7 @@ async function runConfigMode(options: CLIOptions, reporter: Reporter): Promise 0 ? priorReports : undefined, + scope: trigger.scope, }, })); @@ -575,8 +568,8 @@ async function runConfigMode(options: CLIOptions, reporter: Reporter): Promise r.report ? [r.report] : []); - priorReports = [...priorReports, ...reports]; - allResults = [...allResults, ...results]; + priorReports.push(...reports); + allResults.push(...results); } // Process results and output diff --git a/src/cli/output/tasks.ts b/src/cli/output/tasks.ts index 03069c88..fbc41a28 100644 --- a/src/cli/output/tasks.ts +++ b/src/cli/output/tasks.ts @@ -9,6 +9,7 @@ import { Sentry, emitSkillMetrics, emitDedupMetrics, logger } from '../../sentry import { prepareFiles, analyzeFile, + analyzeReport, aggregateUsage, aggregateAuxiliaryUsage, deduplicateFindings, @@ -176,6 +177,53 @@ export async function runSkillTask( // Resolve the skill const skill = await resolveSkill(); + // Build PR context for inclusion in prompts (if available) + const prContext: PRPromptContext | undefined = context.pullRequest + ? 
{ + changedFiles: context.pullRequest.files.map((f) => f.filename), + title: context.pullRequest.title, + body: context.pullRequest.body, + } + : undefined; + + // Report-scoped: single SDK call on prior findings, skip per-file loop + if (runnerOptions.scope === 'report') { + callbacks.onSkillStart({ + name, + displayName, + status: 'running', + startTime, + files: [], + findings: [], + }); + + const result = await analyzeReport(skill, context.repoPath, runnerOptions, prContext); + const uniqueFindings = deduplicateFindings(result.findings); + const duration = Date.now() - startTime; + + const report: SkillReport = { + skill: skill.name, + summary: generateSummary(skill.name, uniqueFindings, skill.description), + findings: uniqueFindings, + usage: result.usage, + durationMs: duration, + }; + if (result.auxiliaryUsage) { + report.auxiliaryUsage = aggregateAuxiliaryUsage(result.auxiliaryUsage); + } + + emitSkillMetrics(report); + callbacks.onSkillUpdate(name, { + status: 'done', + durationMs: duration, + findings: uniqueFindings, + usage: report.usage, + }); + callbacks.onSkillComplete(name, report); + + return { name, report, failOn }; + } + // Prepare files (parse patches into hunks) const { files: preparedFiles, skippedFiles } = prepareFiles(context, { contextLines: runnerOptions.contextLines, @@ -216,15 +264,6 @@ export async function runSkillTask( findings: [], }); - // Build PR context for inclusion in prompts (if available) - const prContext: PRPromptContext | undefined = context.pullRequest - ? { - changedFiles: context.pullRequest.files.map((f) => f.filename), - title: context.pullRequest.title, - body: context.pullRequest.body, - } - : undefined; - // Process files with concurrency const processFile = async (prepared: PreparedFile, index: number): Promise => { const filename = prepared.filename; diff --git a/src/config/loader.test.ts b/src/config/loader.test.ts index c6bd4305..5fcbfe69 100644 --- a/src/config/loader.test.ts +++ b/src/config/loader.test.ts @@ -739,3 +739,108 @@ describe('defaults.ignorePaths config', () => { expect(result.data?.defaults?.ignorePaths).toEqual(['dist/**', 'node_modules/**']); }); }); + +describe('scope config', () => { + it('accepts scope = "diff" in skill', () => { + const config = { + version: 1, + skills: [{ name: 'test', scope: 'diff' }], + }; + + const result = WardenConfigSchema.safeParse(config); + expect(result.success).toBe(true); + expect(result.data?.skills[0]?.scope).toBe('diff'); + }); + + it('accepts scope = "report" in skill', () => { + const config = { + version: 1, + skills: [{ name: 'test', scope: 'report' }], + }; + + const result = WardenConfigSchema.safeParse(config); + expect(result.success).toBe(true); + expect(result.data?.skills[0]?.scope).toBe('report'); + }); + + it('defaults scope to "diff"', () => { + const config = { + version: 1, + skills: [{ name: 'test' }], + }; + + const result = WardenConfigSchema.safeParse(config); + expect(result.success).toBe(true); + expect(result.data?.skills[0]?.scope).toBe('diff'); + }); + + it('rejects invalid scope values', () => { + const config = { + version: 1, + skills: [{ name: 'test', scope: 'invalid' }], + }; + + const result = WardenConfigSchema.safeParse(config); + expect(result.success).toBe(false); + }); + + it('report-scoped triggers get forced to last phase (Infinity)', () => { + const config: WardenConfig = { + version: 1, + skills: [ + { name: 'diff-skill', phase: 1 }, + { name: 'report-skill', scope: 'report', phase: 1 }, + ], + }; + + const resolved = 
resolveSkillConfigs(config); + const diffSkill = resolved.find((t) => t.name === 'diff-skill'); + const reportSkill = resolved.find((t) => t.name === 'report-skill'); + + expect(diffSkill?.phase).toBe(1); + expect(reportSkill?.phase).toBe(Infinity); + expect(reportSkill?.scope).toBe('report'); + }); + + it('report-scoped triggers override explicit phase', () => { + const config: WardenConfig = { + version: 1, + skills: [ + { name: 'report-skill', scope: 'report', phase: 2 }, + ], + }; + + const [resolved] = resolveSkillConfigs(config); + expect(resolved?.phase).toBe(Infinity); + }); + + it('diff-scoped triggers preserve their phase', () => { + const config: WardenConfig = { + version: 1, + skills: [ + { name: 'diff-skill', scope: 'diff', phase: 3 }, + ], + }; + + const [resolved] = resolveSkillConfigs(config); + expect(resolved?.phase).toBe(3); + expect(resolved?.scope).toBeUndefined(); + }); + + it('scope is threaded through trigger entries', () => { + const config: WardenConfig = { + version: 1, + skills: [{ + name: 'report-skill', + scope: 'report', + triggers: [ + { type: 'pull_request', actions: ['opened'] }, + ], + }], + }; + + const [resolved] = resolveSkillConfigs(config); + expect(resolved?.scope).toBe('report'); + expect(resolved?.phase).toBe(Infinity); + }); +}); diff --git a/src/config/loader.ts b/src/config/loader.ts index 5713de2d..42356b7d 100644 --- a/src/config/loader.ts +++ b/src/config/loader.ts @@ -100,6 +100,8 @@ export interface ResolvedTrigger { schedule?: ScheduleConfig; /** Execution phase (default: 1). Phase-2 skills receive phase-1 findings in their prompt. */ phase?: number; + /** Execution scope: 'diff' (default) analyzes hunks, 'report' analyzes prior findings as a whole. */ + scope?: 'diff' | 'report'; } /** @@ -150,6 +152,10 @@ export function resolveSkillConfigs( ignorePaths: mergedIgnorePaths.length > 0 ? mergedIgnorePaths : undefined, }; + // Report-scoped skills always run after all diff-scoped skills + const scope = skill.scope === 'report' ? 'report' : undefined; + const effectivePhase = scope === 'report' ? Infinity : skill.phase; + if (!skill.triggers || skill.triggers.length === 0) { // Wildcard: no triggers means run everywhere result.push({ @@ -166,7 +172,8 @@ export function resolveSkillConfigs( failCheck: skill.failCheck ?? defaults?.failCheck, model: baseModel, maxTurns: skill.maxTurns ?? defaults?.maxTurns, - phase: skill.phase, + phase: effectivePhase, + scope, }); } else { for (const trigger of skill.triggers) { @@ -187,7 +194,8 @@ export function resolveSkillConfigs( model: emptyToUndefined(trigger.model) ?? baseModel, maxTurns: trigger.maxTurns ?? skill.maxTurns ?? defaults?.maxTurns, schedule: trigger.schedule, - phase: skill.phase, + phase: effectivePhase, + scope, }); } } diff --git a/src/config/schema.ts b/src/config/schema.ts index 91e261e3..39a01b0e 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -93,6 +93,8 @@ export const SkillConfigSchema = z.object({ remote: z.string().optional(), /** Execution phase (default: 1). Phase-2 skills receive phase-1 findings in their prompt. */ phase: z.number().int().positive().default(1).optional(), + /** Execution scope: 'diff' (default) analyzes hunks, 'report' analyzes prior findings as a whole. 
*/
+  scope: z.enum(['diff', 'report']).default('diff').optional(),
   // Flattened output fields (skill-level defaults)
   failOn: SeverityThresholdSchema.optional(),
   reportOn: SeverityThresholdSchema.optional(),
diff --git a/src/pipeline/phase.ts b/src/pipeline/phase.ts
index df95cc67..4424794c 100644
--- a/src/pipeline/phase.ts
+++ b/src/pipeline/phase.ts
@@ -1,29 +1,26 @@
 /**
  * Phase grouping utility for multi-pass skill execution.
  *
- * Groups triggers by their `phase` field (defaulting to 1)
+ * Groups items by their `phase` field (defaulting to 1)
  * and returns them sorted by phase ascending.
  */
 
-import type { ResolvedTrigger } from '../config/loader.js';
-
 /**
- * Group triggers by execution phase, sorted ascending.
- * Triggers without a phase default to phase 1.
+ * Group items by execution phase, sorted ascending.
+ * Items without a phase default to phase 1.
  */
-export function groupByPhase(triggers: ResolvedTrigger[]): Map<number, ResolvedTrigger[]> {
-  const grouped = new Map<number, ResolvedTrigger[]>();
+export function groupByPhase<T extends { phase?: number }>(items: T[]): Map<number, T[]> {
+  const grouped = new Map<number, T[]>();
 
-  for (const trigger of triggers) {
-    const phase = trigger.phase ?? 1;
+  for (const item of items) {
+    const phase = item.phase ?? 1;
     const existing = grouped.get(phase);
     if (existing) {
-      existing.push(trigger);
+      existing.push(item);
     } else {
-      grouped.set(phase, [trigger]);
+      grouped.set(phase, [item]);
     }
   }
 
-  // Return a new Map with keys sorted ascending
   return new Map([...grouped.entries()].sort(([a], [b]) => a - b));
 }
diff --git a/src/sdk/analyze.ts b/src/sdk/analyze.ts
index ffaab8ff..3acefeb8 100644
--- a/src/sdk/analyze.ts
+++ b/src/sdk/analyze.ts
@@ -6,7 +6,7 @@ import { Sentry, emitExtractionMetrics, emitRetryMetric, emitDedupMetrics } from
 import { SkillRunnerError, WardenAuthenticationError, isRetryableError, isAuthenticationError, isAuthenticationErrorMessage } from './errors.js';
 import { DEFAULT_RETRY_CONFIG, calculateRetryDelay, sleep } from './retry.js';
 import { extractUsage, aggregateUsage, emptyUsage, estimateTokens, aggregateAuxiliaryUsage } from './usage.js';
-import { buildHunkSystemPrompt, buildHunkUserPrompt, type PRPromptContext, type PriorFindingsContext } from './prompt.js';
+import { buildHunkSystemPrompt, buildHunkUserPrompt, buildReportUserPrompt, type PRPromptContext, type PriorFindingsContext } from './prompt.js';
 import { extractFindingsJson, extractFindingsWithLLM, validateFindings, deduplicateFindings } from './extract.js';
 import {
   LARGE_PROMPT_THRESHOLD_CHARS,
@@ -630,6 +630,73 @@ export async function analyzeFile(
   );
 }
 
+/**
+ * Analyze prior findings as a whole (report-scoped skill).
+ * Makes a single SDK call instead of per-hunk iteration.
+ */
+export async function analyzeReport(
+  skill: SkillDefinition,
+  repoPath: string,
+  options: SkillRunnerOptions,
+  prContext?: PRPromptContext
+): Promise<{ findings: Finding[]; usage: UsageStats; failed: boolean; auxiliaryUsage?: AuxiliaryUsageEntry[] }> {
+  return Sentry.startSpan(
+    {
+      op: 'skill.analyze_report',
+      name: `analyze report ${skill.name}`,
+      attributes: { 'skill.name': skill.name, 'skill.scope': 'report' },
+    },
+    async (span) => {
+      const { apiKey } = options;
+
+      const systemPrompt = buildHunkSystemPrompt(skill);
+      const priorFindings: PriorFindingsContext | undefined = options.priorReports?.length
+        ? 
{ reports: options.priorReports } + : undefined; + const userPrompt = buildReportUserPrompt(skill, prContext, priorFindings); + + try { + const { result: resultMessage, authError } = await executeQuery(systemPrompt, userPrompt, repoPath, options, skill.name); + + if (authError) { + throw new WardenAuthenticationError(authError); + } + + if (!resultMessage) { + return { findings: [], usage: emptyUsage(), failed: true }; + } + + const usage = extractUsage(resultMessage); + + const isError = resultMessage.is_error || resultMessage.subtype !== 'success'; + if (isError) { + return { findings: [], usage, failed: true }; + } + + // Report-scoped findings don't have a single filename context. + // Pass empty string — parseHunkOutput validates/fills location from the finding itself. + const parseResult = await parseHunkOutput(resultMessage, '', apiKey); + + span.setAttribute('finding.count', parseResult.findings.length); + + return { + findings: parseResult.findings, + usage, + failed: false, + auxiliaryUsage: parseResult.extractionUsage + ? [{ agent: 'extraction', usage: parseResult.extractionUsage }] + : undefined, + }; + } catch (error) { + if (error instanceof WardenAuthenticationError) throw error; + if (isAuthenticationError(error)) throw new WardenAuthenticationError(); + console.error(`Report analysis failed: ${error instanceof Error ? error.message : String(error)}`); + return { findings: [], usage: emptyUsage(), failed: true }; + } + }, + ); +} + /** * Generate a summary of findings. */ @@ -672,6 +739,25 @@ export async function runSkill( throw new SkillRunnerError('Pull request context required for skill execution'); } + // Report-scoped: single SDK call on prior findings, skip per-file loop + if (options.scope === 'report') { + const prContext: PRPromptContext = { + changedFiles: context.pullRequest.files.map((f) => f.filename), + title: context.pullRequest.title, + body: context.pullRequest.body, + }; + const result = await analyzeReport(skill, context.repoPath, options, prContext); + const uniqueFindings = deduplicateFindings(result.findings); + return { + skill: skill.name, + summary: generateSummary(skill.name, uniqueFindings, skill.description), + findings: uniqueFindings, + usage: result.usage, + durationMs: Date.now() - startTime, + ...(result.auxiliaryUsage && { auxiliaryUsage: aggregateAuxiliaryUsage(result.auxiliaryUsage) }), + }; + } + const { files: fileHunks, skippedFiles } = prepareFiles(context, { contextLines: options.contextLines, // Note: chunking config should come from the caller (e.g., from warden.toml defaults) diff --git a/src/sdk/prompt.test.ts b/src/sdk/prompt.test.ts index 21d185bf..af6452d5 100644 --- a/src/sdk/prompt.test.ts +++ b/src/sdk/prompt.test.ts @@ -1,5 +1,5 @@ import { describe, it, expect } from 'vitest'; -import { buildHunkUserPrompt, type PriorFindingsContext, type PRPromptContext } from './prompt.js'; +import { buildHunkUserPrompt, buildReportUserPrompt, serializeAllPriorFindings, type PriorFindingsContext, type PRPromptContext } from './prompt.js'; import type { SkillDefinition } from '../config/schema.js'; import type { HunkWithContext } from '../diff/index.js'; import type { SkillReport } from '../types/index.js'; @@ -147,3 +147,124 @@ describe('buildHunkUserPrompt', () => { expect(diffIndex).toBeGreaterThan(priorFindingsIndex); }); }); + +describe('serializeAllPriorFindings', () => { + it('includes findings from all files', () => { + const priorFindings: PriorFindingsContext = { + reports: [ + makeReport('security-scan', [ + { + id: 'sec-1', 
+ severity: 'high', + title: 'SQL Injection', + description: 'Unsanitized input', + location: { path: 'src/foo.ts', startLine: 10 }, + }, + { + id: 'sec-2', + severity: 'medium', + title: 'XSS Risk', + description: 'Unescaped output', + location: { path: 'src/bar.ts', startLine: 5 }, + }, + ]), + ], + }; + + const result = serializeAllPriorFindings(priorFindings); + expect(result).toContain('security-scan'); + expect(result).toContain('SQL Injection'); + expect(result).toContain('src/foo.ts'); + expect(result).toContain('XSS Risk'); + expect(result).toContain('src/bar.ts'); + }); + + it('returns undefined when no findings exist', () => { + const priorFindings: PriorFindingsContext = { + reports: [makeReport('empty-skill', [])], + }; + + const result = serializeAllPriorFindings(priorFindings); + expect(result).toBeUndefined(); + }); + + it('includes findings from multiple skills', () => { + const priorFindings: PriorFindingsContext = { + reports: [ + makeReport('security-scan', [ + { id: 'sec-1', severity: 'high', title: 'Issue A', description: 'Desc A', location: { path: 'a.ts', startLine: 1 } }, + ]), + makeReport('lint-check', [ + { id: 'lint-1', severity: 'low', title: 'Issue B', description: 'Desc B', location: { path: 'b.ts', startLine: 2 } }, + ]), + ], + }; + + const result = serializeAllPriorFindings(priorFindings); + expect(result).toContain('### security-scan'); + expect(result).toContain('### lint-check'); + expect(result).toContain('Issue A'); + expect(result).toContain('Issue B'); + }); +}); + +describe('buildReportUserPrompt', () => { + it('includes all prior findings (not filtered by file)', () => { + const priorFindings: PriorFindingsContext = { + reports: [ + makeReport('security-scan', [ + { id: 'sec-1', severity: 'high', title: 'SQL Injection', description: 'In foo', location: { path: 'src/foo.ts', startLine: 10 } }, + { id: 'sec-2', severity: 'medium', title: 'XSS', description: 'In bar', location: { path: 'src/bar.ts', startLine: 5 } }, + ]), + ], + }; + + const result = buildReportUserPrompt(skill, undefined, priorFindings); + expect(result).toContain('Prior Findings'); + expect(result).toContain('SQL Injection'); + expect(result).toContain('XSS'); + expect(result).toContain('src/foo.ts'); + expect(result).toContain('src/bar.ts'); + }); + + it('includes changed files list', () => { + const prContext: PRPromptContext = { + changedFiles: ['src/foo.ts', 'src/bar.ts', 'src/baz.ts'], + title: 'Add feature', + }; + + const result = buildReportUserPrompt(skill, prContext); + expect(result).toContain('## Changed Files'); + expect(result).toContain('src/foo.ts'); + expect(result).toContain('src/bar.ts'); + expect(result).toContain('src/baz.ts'); + }); + + it('produces valid prompt with empty prior findings', () => { + const priorFindings: PriorFindingsContext = { + reports: [makeReport('empty-skill', [])], + }; + + const result = buildReportUserPrompt(skill, undefined, priorFindings); + expect(result).toContain('test-skill'); + expect(result).not.toContain('Prior Findings'); + expect(result).toContain('IMPORTANT'); + }); + + it('includes PR context when provided', () => { + const prContext: PRPromptContext = { + changedFiles: ['src/foo.ts'], + title: 'Fix authentication bug', + body: 'This PR fixes a critical auth bypass', + }; + + const result = buildReportUserPrompt(skill, prContext); + expect(result).toContain('Fix authentication bug'); + expect(result).toContain('critical auth bypass'); + }); + + it('references skill name in opening instruction', () => { + const result = 
buildReportUserPrompt(skill); + expect(result).toContain(`"${skill.name}"`); + }); +}); diff --git a/src/sdk/prompt.ts b/src/sdk/prompt.ts index 289ebcda..eaad8940 100644 --- a/src/sdk/prompt.ts +++ b/src/sdk/prompt.ts @@ -29,6 +29,26 @@ export interface PriorFindingsContext { reports: SkillReport[]; } +/** + * Serialize a list of findings into markdown lines. + * Shared by both file-scoped and report-scoped serializers. + */ +function serializeFindingsList(findings: SkillReport['findings']): string[] { + const lines: string[] = []; + for (const finding of findings) { + const loc = finding.location; + const locStr = loc + ? `${loc.path}:${loc.startLine}${loc.endLine ? `-${loc.endLine}` : ''}` + : 'general'; + lines.push(`- **[${finding.severity}] ${finding.title}** (${locStr})`); + lines.push(` ${finding.description}`); + if (finding.suggestedFix) { + lines.push(` Suggested fix: ${finding.suggestedFix.description}`); + } + } + return lines; +} + /** * Serialize prior findings for a specific file into a prompt section. * Only includes findings whose location matches the given filename. @@ -47,17 +67,7 @@ function serializePriorFindings( if (relevant.length === 0) continue; lines.push(`### ${report.skill}`); - for (const finding of relevant) { - const loc = finding.location; - const locStr = loc - ? `${loc.path}:${loc.startLine}${loc.endLine ? `-${loc.endLine}` : ''}` - : 'general'; - lines.push(`- **[${finding.severity}] ${finding.title}** (${locStr})`); - lines.push(` ${finding.description}`); - if (finding.suggestedFix) { - lines.push(` Suggested fix: ${finding.suggestedFix.description}`); - } - } + lines.push(...serializeFindingsList(relevant)); } if (lines.length === 0) return undefined; @@ -65,6 +75,46 @@ function serializePriorFindings( return `## Prior Findings\nThe following findings were reported by earlier-phase skills for this file:\n\n${lines.join('\n')}`; } +/** + * Serialize all prior findings (across all files) into a prompt section. + * Used by report-scoped skills that analyze findings as a whole. + * Returns undefined if no findings exist. + */ +export function serializeAllPriorFindings( + priorFindings: PriorFindingsContext +): string | undefined { + const lines: string[] = []; + + for (const report of priorFindings.reports) { + if (report.findings.length === 0) continue; + + lines.push(`### ${report.skill}`); + lines.push(...serializeFindingsList(report.findings)); + } + + if (lines.length === 0) return undefined; + + return `## Prior Findings\nThe following findings were reported by earlier-phase skills:\n\n${lines.join('\n')}`; +} + +/** + * Format PR context (title + truncated body) as a prompt section. + * Returns undefined if no title is available. + */ +function formatPRContext(prContext?: PRPromptContext): string | undefined { + if (!prContext?.title) return undefined; + + let section = `## Pull Request Context\n**Title:** ${prContext.title}`; + if (prContext.body) { + const maxBodyLength = 1000; + const body = prContext.body.length > maxBodyLength + ? prContext.body.slice(0, maxBodyLength) + '...' + : prContext.body; + section += `\n\n**Description:**\n${body}`; + } + return section; +} + /** * Builds the system prompt for hunk-based analysis. 
* @@ -161,19 +211,8 @@ export function buildHunkUserPrompt( sections.push(`Analyze this code change according to the "${skill.name}" skill criteria.`); - // Include PR title and description for context on intent - if (prContext?.title) { - let prSection = `## Pull Request Context\n**Title:** ${prContext.title}`; - if (prContext.body) { - // Truncate very long PR descriptions to avoid bloating prompts - const maxBodyLength = 1000; - const body = prContext.body.length > maxBodyLength - ? prContext.body.slice(0, maxBodyLength) + '...' - : prContext.body; - prSection += `\n\n**Description:**\n${body}`; - } - sections.push(prSection); - } + const prSection = formatPRContext(prContext); + if (prSection) sections.push(prSection); // Include prior findings from earlier-phase skills (between PR context and diff) if (priorFindings) { @@ -199,3 +238,39 @@ ${otherFiles.map((f) => `- ${f}`).join('\n')}`); return sections.join('\n\n'); } + +/** + * Builds the user prompt for a report-scoped skill. + * Report-scoped skills analyze all prior findings as a whole (single SDK call, no diff). + */ +export function buildReportUserPrompt( + skill: SkillDefinition, + prContext?: PRPromptContext, + priorFindings?: PriorFindingsContext +): string { + const sections: string[] = []; + + sections.push(`Analyze the findings from prior skills according to the "${skill.name}" skill criteria.`); + + const prSection = formatPRContext(prContext); + if (prSection) sections.push(prSection); + + // Include changed files list for context + if (prContext?.changedFiles && prContext.changedFiles.length > 0) { + sections.push(`## Changed Files\nThe following files were changed in this PR:\n${prContext.changedFiles.map((f) => `- ${f}`).join('\n')}`); + } + + // Include all prior findings + if (priorFindings) { + const priorSection = serializeAllPriorFindings(priorFindings); + if (priorSection) { + sections.push(priorSection); + } + } + + sections.push( + `IMPORTANT: Only report findings that are explicitly covered by the skill instructions. Do not report general code quality issues, bugs, or improvements unless the skill specifically asks for them. 
Return an empty findings array if no issues match the skill's criteria.` + ); + + return sections.join('\n\n'); +} diff --git a/src/sdk/runner.ts b/src/sdk/runner.ts index 60f9df7b..7875a0e6 100644 --- a/src/sdk/runner.ts +++ b/src/sdk/runner.ts @@ -25,7 +25,7 @@ export { aggregateUsage, aggregateAuxiliaryUsage, mergeAuxiliaryUsage, estimateT export { apiUsageToStats } from './pricing.js'; // Re-export prompt building (with legacy alias) -export { buildHunkSystemPrompt, buildHunkUserPrompt } from './prompt.js'; +export { buildHunkSystemPrompt, buildHunkUserPrompt, buildReportUserPrompt, serializeAllPriorFindings } from './prompt.js'; export type { PRPromptContext, PriorFindingsContext } from './prompt.js'; // Legacy export for backwards compatibility export { buildHunkSystemPrompt as buildSystemPrompt } from './prompt.js'; @@ -46,7 +46,7 @@ export type { ExtractFindingsResult } from './extract.js'; export { prepareFiles } from './prepare.js'; // Re-export analysis functions -export { analyzeFile, runSkill, generateSummary } from './analyze.js'; +export { analyzeFile, analyzeReport, runSkill, generateSummary } from './analyze.js'; // Re-export types export type { diff --git a/src/sdk/types.ts b/src/sdk/types.ts index bf8346c7..c2b24180 100644 --- a/src/sdk/types.ts +++ b/src/sdk/types.ts @@ -77,6 +77,8 @@ export interface SkillRunnerOptions { verbose?: boolean; /** Prior-phase skill reports, injected into prompts for second-pass skills */ priorReports?: SkillReport[]; + /** Execution scope: 'report' runs a single analysis on prior findings instead of per-hunk */ + scope?: 'diff' | 'report'; } /** diff --git a/warden.toml b/warden.toml index de66b1e3..aa44db98 100644 --- a/warden.toml +++ b/warden.toml @@ -17,6 +17,15 @@ ignorePaths = ["src/**/*.test.ts"] type = "pull_request" actions = ["opened", "synchronize", "reopened"] +[[skills]] +name = "warden-lint-judge" +remote = "getsentry/skills" +scope = "report" + +[[skills.triggers]] +type = "pull_request" +actions = ["opened", "synchronize", "reopened"] + [[skills]] name = "code-simplifier" remote = "getsentry/sentry-skills" From 57dd6d37114c9c860016eaab12affd935dc06f99 Mon Sep 17 00:00:00 2001 From: Greg Pstrucha <875316+gricha@users.noreply.github.com> Date: Fri, 13 Feb 2026 17:14:00 -0800 Subject: [PATCH 12/12] remove local linter-rule-judge skill Now using remote warden-lint-judge from getsentry/skills. Co-Authored-By: Claude Opus 4.6 --- .agents/skills/linter-rule-judge/SKILL.md | 79 ----------------------- 1 file changed, 79 deletions(-) delete mode 100644 .agents/skills/linter-rule-judge/SKILL.md diff --git a/.agents/skills/linter-rule-judge/SKILL.md b/.agents/skills/linter-rule-judge/SKILL.md deleted file mode 100644 index 965edb5c..00000000 --- a/.agents/skills/linter-rule-judge/SKILL.md +++ /dev/null @@ -1,79 +0,0 @@ ---- -name: linter-rule-judge -description: Warden identified lint rules that could permanently catch patterns flagged in this review. These target linter config and plugin files outside this PR, so GitHub can't propose them as inline suggestions. To apply, copy the prompt for each rule below to your local coding agent, or run `warden --fix` locally. -allowed-tools: Read Grep Glob ---- - -# Linter Rule Judge - -You are a second-pass skill. Your job: turn AI findings into deterministic lint rules. - -The bar is high. Only propose a rule when you can guarantee it catches the exact pattern through AST structure, not heuristics. A rule that fires on `eval(anything)` is deterministic. 
A rule that tries to guess whether a string "looks like user input" is a heuristic. Only the first kind belongs here. - -## Step 1: Detect the linter - -Before evaluating any findings, determine what linter system the project uses. Use `Glob` and `Read` to check for: - -- `.oxlintrc.json` / `oxlint.json` (oxlint) -- `.eslintrc.*` / `eslint.config.*` / `"eslintConfig"` in package.json (eslint) -- `clippy.toml` / `.clippy.toml` (Rust clippy) -- `.pylintrc` / `pyproject.toml` with `[tool.pylint]` (pylint) -- `.flake8` / `setup.cfg` with `[flake8]` (flake8) -- `biome.json` / `biome.jsonc` (biome) - -Also check whether the linter supports custom/plugin rules: -- oxlint: check for `jsPlugins` in config and an existing plugins directory -- eslint: check for local plugins or `eslint-plugin-*` deps -- biome: no custom rule support, existing rules only - -If the project has no linter, return an empty findings array. You cannot propose rules for a tool that doesn't exist. - -## Step 2: Evaluate prior findings - -For each prior finding that has a `suggestedFix`, ask: can this exact pattern be caught by a deterministic AST check in the linter we found? - -**Deterministic means:** -- The rule matches a specific syntactic pattern in the AST (node type, property name, call signature) -- Zero or near-zero false positives -- if the AST matches, the code is wrong -- No guessing about intent, data flow, variable contents, or runtime behavior -- Examples: banning `eval()`, requiring `===` over `==`, disallowing `execSync` with template literal arguments, flagging `new Function()` calls - -**Not deterministic (skip these):** -- "This variable might contain user input" (data flow analysis) -- "This function name suggests it handles sensitive data" (naming heuristic) -- "This pattern is usually a bug" (probabilistic) -- Anything that requires understanding what a variable contains at runtime - -**Only report if ALL of these are true:** -1. You can identify a specific existing rule by name, OR you can write a complete working custom rule -2. The rule is deterministic: it matches AST structure, not heuristics -3. The project's linter actually supports this - -## What to skip silently - -- Findings without `suggestedFix` -- Patterns that need type information the linter can't access, cross-file context, or runtime knowledge -- Patterns where the rule would need to guess or use heuristics -- Cases where you're not confident the rule is correct and complete - -Return an empty findings array when nothing qualifies. That's the expected common case. - -## Output format - -**Do NOT set a `location` field.** These findings target linter config and plugin files, not the source code where the original issue was found. Omitting location ensures they appear as top-level review comments, not inline on unrelated source lines. - -**The `description` is the primary output.** Write each finding's description as a prompt you could copy-paste directly into a local coding agent. It should be a clear, complete instruction that an agent can act on without additional context. Example: "Add `\"no-eval\": \"error\"` to the `rules` object in `.oxlintrc.json` to ban all `eval()` calls." - -The `suggestedFix` carries the machine-readable diff for local application via `warden --fix`. It is not shown in PR comments. - -For existing rules: -- **title**: The rule name (e.g., `no-eval`) -- **severity**: `low` -- **description**: A copy-pasteable prompt: which config file to edit, what to add, and why. 
-- **suggestedFix**: A diff enabling the rule in the project's linter config file - -For custom rules: -- **title**: `custom: ` (e.g., `custom: no-execsync-interpolation`) -- **severity**: `low` -- **description**: A copy-pasteable prompt: what plugin file to create, what AST pattern it matches, and how to wire it into the linter config. -- **suggestedFix**: The complete rule implementation file AND the config diff to wire it up. Match the conventions of existing custom rules in the project.
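
---

To make the phase ordering concrete, here is a minimal sketch (not part of any patch above) of how `groupByPhase` sequences the pipeline end to end. The trigger objects and the `security-review`/`style-check` skill names are illustrative placeholders, not taken from a real config, and the import path assumes the `src/pipeline/` layout introduced in this series. Only the `phase` field drives the ordering; report-scoped skills are resolved to `phase = Infinity` so they always run last.

// phase-ordering-sketch.ts -- illustrative only, assumes groupByPhase from this series
import { groupByPhase } from './src/pipeline/index.js';

// Hypothetical triggers; only `phase` matters here.
const triggers = [
  { skill: 'security-review', phase: 1 },          // phase-1, diff-scoped
  { skill: 'style-check', phase: 2 },              // phase-2, sees phase-1 findings in its prompt
  { skill: 'warden-lint-judge', phase: Infinity }, // scope = "report", forced after all diff-scoped skills
];

// groupByPhase returns a Map keyed by phase, sorted ascending: 1, 2, Infinity.
for (const [phase, group] of groupByPhase(triggers)) {
  console.log(phase, group.map((t) => t.skill));
}
// 1        [ 'security-review' ]
// 2        [ 'style-check' ]
// Infinity [ 'warden-lint-judge' ]

Because `Infinity` sorts after every finite phase number, the report-scoped skill is guaranteed to receive the complete set of prior SkillReports without any explicit dependency declaration.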