diff --git a/js/index.ts b/js/index.ts
index 49e7863..bf06db3 100644
--- a/js/index.ts
+++ b/js/index.ts
@@ -40,3 +40,11 @@ export * from "./ragas";
 export * from "./value";
 export { Evaluators } from "./manifest";
 export { makePartial, ScorerWithPartial } from "./partial";
+export {
+  computeThreadTemplateVars,
+  formatMessageArrayAsText,
+  isLLMMessageArray,
+  isRoleContentMessage,
+  type LLMMessage,
+  type ThreadTemplateVars,
+} from "./thread-utils";
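The new helpers are re-exported from the package root. A minimal sketch of consuming them, assuming the package is installed as `autoevals` (the import path is the only assumption; the names match the exports above):

```ts
import {
  formatMessageArrayAsText,
  isLLMMessageArray,
  type LLMMessage,
} from "autoevals";

const maybeThread: unknown = [
  { role: "user", content: "What is 2 + 2?" },
  { role: "assistant", content: "4" },
];

// isLLMMessageArray narrows unknown data before it is formatted.
if (isLLMMessageArray(maybeThread)) {
  const messages: LLMMessage[] = maybeThread;
  console.log(formatMessageArrayAsText(messages));
  // User:
  //   What is 2 + 2?
  //
  // Assistant:
  //   4
}
```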
diff --git a/js/llm.ts b/js/llm.ts
index 28ec3d3..bc20874 100644
--- a/js/llm.ts
+++ b/js/llm.ts
@@ -8,6 +8,42 @@ import {
 } from "openai/resources";
 import { makePartial, ScorerWithPartial } from "./partial";
 import { renderMessages } from "./render-messages";
+import {
+  computeThreadTemplateVars,
+  type ThreadTemplateVars,
+} from "./thread-utils";
+
+/**
+ * Minimal interface for a Trace object that can provide thread data.
+ * This is compatible with the Trace interface from the braintrust SDK.
+ */
+export interface TraceForScorer {
+  getThread(options?: { preprocessor?: string }): Promise<unknown[]>;
+}
+
+// Thread-related template variable names that require preprocessor invocation
+export const THREAD_VARIABLE_NAMES = [
+  "thread",
+  "thread_count",
+  "first_message",
+  "last_message",
+  "user_messages",
+  "assistant_messages",
+  "human_ai_pairs",
+];
+
+// Pattern to match thread variables in template syntax: {{thread, {{ thread, {%...thread, etc.
+export const THREAD_VARIABLE_PATTERN = new RegExp(
+  `\\{[\\{%]\\s*(${THREAD_VARIABLE_NAMES.join("|")})`,
+);
+
+/**
+ * Check if a template string might use thread-related template variables.
+ * This is a heuristic - looks for variable names after {{ or {% syntax.
+ */
+export function templateUsesThreadVariables(template: string): boolean {
+  return THREAD_VARIABLE_PATTERN.test(template);
+}
 
 const NO_COT_SUFFIX =
   "Answer the question by calling `select_choice` with a single choice from {{__choices}}.";
@@ -193,6 +229,12 @@ function parseResponse(
 export type LLMClassifierArgs<RenderArgs> = {
   model?: string;
   useCoT?: boolean;
+  /**
+   * Optional trace object for multi-turn scoring.
+   * When provided, thread template variables ({{thread}}, {{thread_count}}, etc.)
+   * are automatically computed and made available in the template.
+   */
+  trace?: TraceForScorer;
 } & LLMArgs &
   RenderArgs;
@@ -217,6 +259,21 @@ export function LLMClassifierFromTemplate<RenderArgs>({
   ) => {
     const useCoT = runtimeArgs.useCoT ?? useCoTArg ?? true;
 
+    // Compute thread template variables if trace is available AND the template uses them.
+    // These become available in templates as {{thread}}, {{thread_count}}, etc.
+    // Note: {{thread}} automatically renders as human-readable text via smart escape.
+    // Only call getThread() if the template actually uses thread variables to avoid
+    // creating unnecessary preprocessor spans.
+    let threadVars: Record<string, unknown> = {};
+    if (runtimeArgs.trace && templateUsesThreadVariables(promptTemplate)) {
+      const thread = await runtimeArgs.trace.getThread();
+      const computed = computeThreadTemplateVars(thread);
+      // Build threadVars from THREAD_VARIABLE_NAMES to keep in sync with the pattern
+      for (const name of THREAD_VARIABLE_NAMES) {
+        threadVars[name] = computed[name as keyof ThreadTemplateVars];
+      }
+    }
+
     const prompt =
       promptTemplate + "\n" + (useCoT ? COT_SUFFIX : NO_COT_SUFFIX);
 
@@ -228,7 +285,8 @@ export function LLMClassifierFromTemplate<RenderArgs>({
       },
     ];
 
-    return await OpenAIClassifier({
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    const classifierArgs: any = {
       name,
       messages,
       choiceScores,
@@ -237,12 +295,15 @@ export function LLMClassifierFromTemplate<RenderArgs>({
       maxTokens,
       temperature,
       __choices: choiceStrings,
+      // Thread template vars come first so explicit args can override
+      ...threadVars,
       ...runtimeArgs,
-
       // Since the logic is a bit funky for computing this, include
       // it at the end to prevent overrides
       useCoT,
-    });
+    };
+
+    return await OpenAIClassifier(classifierArgs);
   };
 
   Object.defineProperty(ret, "name", {
     value: name,
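A sketch of how a scorer built from this template would opt in to multi-turn scoring. The judge prompt and the stub trace are hypothetical, and a real Braintrust SDK Trace would normally be passed instead of the stub; the invocation shape follows existing autoevals classifier usage:

```ts
import { LLMClassifierFromTemplate, type TraceForScorer } from "autoevals";

// A hypothetical multi-turn judge. Because its template references
// {{thread_count}} and {{thread}}, templateUsesThreadVariables() matches and
// the scorer calls trace.getThread() before rendering.
const coherence = LLMClassifierFromTemplate({
  name: "ConversationCoherence",
  promptTemplate:
    "Here is a conversation with {{thread_count}} messages:\n\n{{thread}}\n\n" +
    "Does the final response stay consistent with the earlier turns?\n" +
    "a) yes\nb) no",
  choiceScores: { a: 1, b: 0 },
});

// Any object with a getThread() method satisfies TraceForScorer; a real
// Braintrust SDK Trace would normally be passed here instead of a stub.
const trace: TraceForScorer = {
  getThread: async () => [
    { role: "user", content: "Hi!" },
    { role: "assistant", content: "Hello! How can I help?" },
  ],
};

const result = await coherence({ output: "Hello! How can I help?", trace });
console.log(result.score); // 1 if the judge picks "a", else 0
```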
diff --git a/js/render-messages.test.ts b/js/render-messages.test.ts
index 39e7000..3d41939 100644
--- a/js/render-messages.test.ts
+++ b/js/render-messages.test.ts
@@ -37,3 +37,149 @@ describe("renderMessages", () => {
     expect(rendered[0].content).toBe("");
   });
 });
+
+describe("renderMessages with thread variables", () => {
+  const sampleThread = [
+    { role: "user", content: "Hello, how are you?" },
+    { role: "assistant", content: "I am doing well, thank you!" },
+    { role: "user", content: "What is the weather like?" },
+    { role: "assistant", content: "It is sunny and warm today." },
+  ];
+
+  it("{{thread}} renders full conversation as human-readable text", () => {
+    const messages: ChatCompletionMessageParam[] = [
+      { role: "user", content: "{{thread}}" },
+    ];
+    const rendered = renderMessages(messages, { thread: sampleThread });
+
+    expect(rendered[0].content).toContain("User:");
+    expect(rendered[0].content).toContain("Hello, how are you?");
+    expect(rendered[0].content).toContain("Assistant:");
+    expect(rendered[0].content).toContain("I am doing well, thank you!");
+    expect(rendered[0].content).toContain("What is the weather like?");
+    expect(rendered[0].content).toContain("It is sunny and warm today.");
+  });
+
+  it("{{thread.0}} renders first message as formatted text", () => {
+    const messages: ChatCompletionMessageParam[] = [
+      { role: "user", content: "First message: {{thread.0}}" },
+    ];
+    const rendered = renderMessages(messages, { thread: sampleThread });
+
+    expect(rendered[0].content).toBe(
+      "First message: user: Hello, how are you?",
+    );
+  });
+
+  it("{{thread.1}} renders second message as formatted text", () => {
+    const messages: ChatCompletionMessageParam[] = [
+      { role: "user", content: "Second message: {{thread.1}}" },
+    ];
+    const rendered = renderMessages(messages, { thread: sampleThread });
+
+    expect(rendered[0].content).toBe(
+      "Second message: assistant: I am doing well, thank you!",
+    );
+  });
+
+  it("{{first_message}} renders single message formatted", () => {
+    const messages: ChatCompletionMessageParam[] = [
+      { role: "user", content: "First: {{first_message}}" },
+    ];
+    const rendered = renderMessages(messages, {
+      first_message: sampleThread[0],
+    });
+
+    expect(rendered[0].content).toBe("First: user: Hello, how are you?");
+  });
+
+  it("{{thread_count}} renders as a number", () => {
+    const messages: ChatCompletionMessageParam[] = [
+      { role: "user", content: "Count: {{thread_count}}" },
+    ];
+    const rendered = renderMessages(messages, { thread_count: 4 });
+
+    expect(rendered[0].content).toBe("Count: 4");
+  });
+
+  it("{{user_messages}} renders array of user messages", () => {
+    const messages: ChatCompletionMessageParam[] = [
+      { role: "user", content: "Users said: {{user_messages}}" },
+    ];
+    const userMessages = sampleThread.filter((m) => m.role === "user");
+    const rendered = renderMessages(messages, { user_messages: userMessages });
+
+    expect(rendered[0].content).toContain("User:");
+    expect(rendered[0].content).toContain("Hello, how are you?");
+    expect(rendered[0].content).toContain("What is the weather like?");
+    expect(rendered[0].content).not.toContain("Assistant:");
+  });
+
+  it("{{user_messages.0}} renders first user message", () => {
+    const messages: ChatCompletionMessageParam[] = [
+      { role: "user", content: "First user: {{user_messages.0}}" },
+    ];
+    const userMessages = sampleThread.filter((m) => m.role === "user");
+    const rendered = renderMessages(messages, { user_messages: userMessages });
+
+    expect(rendered[0].content).toBe("First user: user: Hello, how are you?");
+  });
+
+  it("{{human_ai_pairs}} renders array of paired turns", () => {
+    const messages: ChatCompletionMessageParam[] = [
+      { role: "user", content: "Pairs: {{human_ai_pairs}}" },
+    ];
+    const pairs = [
+      { human: sampleThread[0], assistant: sampleThread[1] },
+      { human: sampleThread[2], assistant: sampleThread[3] },
+    ];
+    const rendered = renderMessages(messages, { human_ai_pairs: pairs });
+
+    // Pairs are objects, so they get JSON stringified
+    expect(rendered[0].content).toContain("Pairs:");
+    expect(rendered[0].content).toContain("human");
+    expect(rendered[0].content).toContain("assistant");
+  });
+
+  it("{{#thread}}...{{/thread}} iterates over messages", () => {
+    const messages: ChatCompletionMessageParam[] = [
+      {
+        role: "user",
+        content: "Messages:{{#thread}}\n- {{role}}: {{content}}{{/thread}}",
+      },
+    ];
+    const rendered = renderMessages(messages, { thread: sampleThread });
+
+    expect(rendered[0].content).toBe(
+      "Messages:\n- user: Hello, how are you?\n- assistant: I am doing well, thank you!\n- user: What is the weather like?\n- assistant: It is sunny and warm today.",
+    );
+  });
+
+  it("handles empty thread gracefully", () => {
+    const messages: ChatCompletionMessageParam[] = [
+      { role: "user", content: "Thread: {{thread}}" },
+    ];
+    const rendered = renderMessages(messages, { thread: [] });
+
+    expect(rendered[0].content).toBe("Thread: ");
+  });
+
+  it("handles thread with complex content (arrays)", () => {
+    const complexThread = [
+      {
+        role: "user",
+        content: [{ type: "text", text: "Hello with structured content" }],
+      },
+      { role: "assistant", content: "Simple response" },
+    ];
+    const messages: ChatCompletionMessageParam[] = [
+      { role: "user", content: "{{thread}}" },
+    ];
+    const rendered = renderMessages(messages, { thread: complexThread });
+
+    expect(rendered[0].content).toContain("User:");
+    expect(rendered[0].content).toContain("Hello with structured content");
+    expect(rendered[0].content).toContain("Assistant:");
+    expect(rendered[0].content).toContain("Simple response");
+  });
+});
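The tests above bind each variable by hand; in the scorer path, `computeThreadTemplateVars` supplies the whole set at once. A small sketch of that pairing, with imports relative to `js/` as in this repo. Passing the returned object through unspread leaves its lazy getters unevaluated unless the template actually references them:

```ts
import { renderMessages } from "./render-messages";
import { computeThreadTemplateVars } from "./thread-utils";

const thread = [
  { role: "user", content: "Hello, how are you?" },
  { role: "assistant", content: "I am doing well, thank you!" },
];

// The template can mix scalar ({{thread_count}}) and message ({{thread}})
// variables; the smart escape picks a rendering per value shape.
const rendered = renderMessages(
  [{ role: "user", content: "{{thread_count}} messages:\n{{thread}}" }],
  computeThreadTemplateVars(thread),
);

console.log(rendered[0].content);
// 2 messages:
// User:
//   Hello, how are you?
//
// Assistant:
//   I am doing well, thank you!
```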
diff --git a/js/render-messages.ts b/js/render-messages.ts
index ee29586..c111890 100644
--- a/js/render-messages.ts
+++ b/js/render-messages.ts
@@ -1,5 +1,32 @@
 import mustache from "mustache";
 import { ChatCompletionMessageParam } from "openai/resources";
+import {
+  isLLMMessageArray,
+  isRoleContentMessage,
+  formatMessageArrayAsText,
+} from "./thread-utils";
+
+/**
+ * Smart escape function for Mustache templates.
+ * - Strings are passed through unchanged
+ * - LLM message arrays are formatted as human-readable text
+ * - Single messages are formatted with role and content
+ * - Other values are JSON-stringified
+ */
+function escapeValue(v: unknown): string {
+  if (typeof v === "string") {
+    return v;
+  }
+  if (isLLMMessageArray(v)) {
+    return formatMessageArrayAsText(v);
+  }
+  if (isRoleContentMessage(v)) {
+    const content =
+      typeof v.content === "string" ? v.content : JSON.stringify(v.content);
+    return `${v.role}: ${content}`;
+  }
+  return JSON.stringify(v);
+}
 
 export function renderMessages(
   messages: ChatCompletionMessageParam[],
@@ -9,8 +36,7 @@
     ...m,
     content: m.content
       ? mustache.render(m.content as string, renderArgs, undefined, {
-          escape: (v: unknown) =>
-            typeof v === "string" ? v : JSON.stringify(v),
+          escape: escapeValue,
         })
       : "",
   }));
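Background on the mechanism being swapped in here: mustache.js routes every `{{var}}` interpolation through the `escape` option of the fourth `render` argument, which is what lets `escapeValue` format by value shape. Unescaped `{{{var}}}` interpolations bypass the hook and fall back to JavaScript string coercion, so thread variables should be referenced with double braces. A standalone sketch with plain mustache, independent of the code above:

```ts
import mustache from "mustache";

const out = mustache.render(
  "{{msg}} vs {{{msg}}}",
  { msg: { role: "user", content: "hi" } },
  undefined,
  // Same kind of hook as escapeValue above, reduced to JSON for brevity.
  { escape: (v: unknown) => (typeof v === "string" ? v : JSON.stringify(v)) },
);

console.log(out);
// {"role":"user","content":"hi"} vs [object Object]
```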
diff --git a/js/thread-utils.ts b/js/thread-utils.ts
new file mode 100644
index 0000000..b999b62
--- /dev/null
+++ b/js/thread-utils.ts
@@ -0,0 +1,327 @@
+/**
+ * Thread utilities for LLM-as-a-judge scorers.
+ *
+ * This module provides utilities for working with preprocessed conversation
+ * messages (threads) in LLM scorer templates.
+ */
+
+/**
+ * A message with role and content fields (LLM chat message format).
+ */
+export interface LLMMessage {
+  role: string;
+  content: unknown;
+}
+
+function isObject(value: unknown): value is { [key: string]: unknown } {
+  return value instanceof Object && !(value instanceof Array);
+}
+
+/**
+ * Check if an item looks like an LLM message (has role and content).
+ */
+export function isRoleContentMessage(item: unknown): item is LLMMessage {
+  return isObject(item) && "role" in item && "content" in item;
+}
+
+/**
+ * Check if a value is an array of LLM messages.
+ */
+export function isLLMMessageArray(value: unknown): value is LLMMessage[] {
+  return Array.isArray(value) && value.every(isRoleContentMessage);
+}
+
+function indent(text: string, prefix = "  "): string {
+  return text
+    .split("\n")
+    .map((line) => (line ? prefix + line : prefix))
+    .join("\n");
+}
+
+function truncateMiddle(text: string, maxLen: number): string {
+  if (text.length <= maxLen) return text;
+  const charsRemoved = text.length - maxLen + 30;
+  const ellipsis = ` [...${charsRemoved} chars truncated...] `;
+  const avail = maxLen - ellipsis.length;
+  if (avail <= 0) return text.slice(0, maxLen);
+  const left = Math.floor(avail / 2);
+  const right = avail - left;
+  return text.slice(0, left) + ellipsis + text.slice(-right);
+}
+
+interface PendingToolCall {
+  name: string;
+  args: string;
+}
+
+function isTypedPart(
+  part: unknown,
+): part is { type: string; [key: string]: unknown } {
+  return isObject(part) && typeof part.type === "string";
+}
+
+function extractToolCalls(content: unknown[]): Map<string, PendingToolCall> {
+  const toolCalls = new Map<string, PendingToolCall>();
+
+  for (const part of content) {
+    if (!isTypedPart(part) || part.type !== "tool_call") continue;
+
+    const id = typeof part.tool_call_id === "string" ? part.tool_call_id : "";
+    if (!id) continue;
+
+    const name =
+      typeof part.tool_name === "string" ? part.tool_name : "unknown";
+
+    let args = "";
+    if (isObject(part.arguments)) {
+      const argsObj = part.arguments;
+      if (argsObj.type === "valid") {
+        args = JSON.stringify(argsObj.value);
+      } else if (typeof argsObj.value === "string") {
+        args = argsObj.value;
+      } else {
+        args = JSON.stringify(argsObj.value);
+      }
+    }
+
+    toolCalls.set(id, { name, args });
+  }
+
+  return toolCalls;
+}
+
+function unwrapContent(content: unknown): string {
+  if (typeof content === "string") {
+    try {
+      const parsed = JSON.parse(content);
+      return unwrapContent(parsed);
+    } catch {
+      const errorMatch = content.match(/^error:\s*'(.+)'$/s);
+      if (errorMatch) {
+        return errorMatch[1];
+      }
+      return content;
+    }
+  }
+
+  if (Array.isArray(content)) {
+    const textParts: string[] = [];
+    for (const item of content) {
+      if (isObject(item) && typeof item.text === "string") {
+        textParts.push(unwrapContent(item.text));
+      } else if (typeof item === "string") {
+        textParts.push(unwrapContent(item));
+      }
+    }
+    if (textParts.length > 0) {
+      return textParts.join("\n");
+    }
+  }
+
+  if (isObject(content) && typeof content.text === "string") {
+    return unwrapContent(content.text);
+  }
+
+  return typeof content === "string" ? content : JSON.stringify(content);
+}
+
+function formatToolResult(
+  toolCallId: string,
+  toolName: string,
+  output: unknown,
+  pendingToolCalls: Map<string, PendingToolCall>,
+): string {
+  const pendingCall = pendingToolCalls.get(toolCallId);
+  const name = toolName || pendingCall?.name || "tool";
+  const args = pendingCall?.args || "";
+
+  const resultContent = unwrapContent(output);
+  const lines = [`Tool (${name}):`];
+
+  if (args) {
+    lines.push(`  Args:`);
+    lines.push(`    ${truncateMiddle(args, 500)}`);
+  }
+
+  const isError =
+    resultContent.toLowerCase().includes("error:") ||
+    resultContent.toLowerCase().includes('"error"') ||
+    resultContent.toLowerCase().startsWith("error");
+
+  if (isError) {
+    lines.push(`  Error:`);
+    lines.push(`    ${truncateMiddle(resultContent, 500)}`);
+  } else {
+    lines.push(`  Result:`);
+    lines.push(`    ${truncateMiddle(resultContent, 500)}`);
+  }
+
+  if (pendingCall) {
+    pendingToolCalls.delete(toolCallId);
+  }
+
+  return lines.join("\n");
+}
+
+function formatToolResults(
+  content: unknown[],
+  pendingToolCalls: Map<string, PendingToolCall>,
+): string[] {
+  const results: string[] = [];
+
+  for (const part of content) {
+    if (!isTypedPart(part) || part.type !== "tool_result") continue;
+
+    const toolCallId =
+      typeof part.tool_call_id === "string" ? part.tool_call_id : "";
+    const toolName = typeof part.tool_name === "string" ? part.tool_name : "";
+
+    results.push(
+      formatToolResult(toolCallId, toolName, part.output, pendingToolCalls),
+    );
+  }
+
+  return results;
+}
+
+function extractTextContent(content: unknown): string {
+  if (typeof content === "string") {
+    return content.trim() ? content : "";
+  }
+
+  if (!Array.isArray(content)) {
+    return "";
+  }
+
+  const parts: string[] = [];
+  for (const part of content) {
+    if (typeof part === "string" && part.trim()) {
+      parts.push(part);
+    } else if (isTypedPart(part)) {
+      if (part.type === "text" && typeof part.text === "string") {
+        parts.push(part.text);
+      } else if (part.type === "reasoning" && typeof part.text === "string") {
+        parts.push(`[thinking: ${part.text.slice(0, 100)}...]`);
+      }
+    } else if (isObject(part) && typeof part.text === "string") {
+      parts.push(part.text);
+    }
+  }
+
+  return parts.join("\n");
+}
+
+/**
+ * Format an array of LLM messages as human-readable text.
+ */
+export function formatMessageArrayAsText(messages: LLMMessage[]): string {
+  const pendingToolCalls = new Map<string, PendingToolCall>();
+  for (const msg of messages) {
+    if (msg.role === "assistant" && Array.isArray(msg.content)) {
+      const calls = extractToolCalls(msg.content);
+      for (const [id, call] of calls) {
+        pendingToolCalls.set(id, call);
+      }
+    }
+  }
+
+  const parts: string[] = [];
+  for (const msg of messages) {
+    const role = msg.role;
+    const capitalizedRole = role.charAt(0).toUpperCase() + role.slice(1);
+
+    if (role === "tool" && Array.isArray(msg.content)) {
+      const toolResults = formatToolResults(msg.content, pendingToolCalls);
+      parts.push(...toolResults);
+    } else {
+      const text = extractTextContent(msg.content);
+      if (text) {
+        parts.push(`${capitalizedRole}:\n${indent(text)}`);
+      }
+    }
+  }
+
+  return parts.join("\n\n");
+}
+
+/**
+ * Template variables computed from a thread for use in LLM-as-a-judge scorers.
+ *
+ * Note: `thread` automatically renders as human-readable text in Mustache
+ * templates via the smart escape function. No need for a separate `thread_text`.
+ */
+export interface ThreadTemplateVars {
+  thread: unknown[];
+  thread_count: number;
+  first_message: unknown | null;
+  last_message: unknown | null;
+  user_messages: unknown[];
+  assistant_messages: unknown[];
+  human_ai_pairs: Array<{ human: unknown; assistant: unknown }>;
+}
+
+/**
+ * Compute template variables from a thread for use in mustache templates.
+ * Uses lazy getters so expensive computations only run when accessed.
+ *
+ * Note: `thread` (and other message variables) will automatically render as
+ * human-readable text when used in templates like `{{thread}}` due to the
+ * smart escape function in renderMessages.
+ */
+export function computeThreadTemplateVars(
+  thread: unknown[],
+): ThreadTemplateVars {
+  let _user_messages: unknown[] | undefined;
+  let _assistant_messages: unknown[] | undefined;
+  let _human_ai_pairs:
+    | Array<{ human: unknown; assistant: unknown }>
+    | undefined;
+
+  return {
+    thread,
+    thread_count: thread.length,
+
+    get first_message(): unknown | null {
+      return thread[0] ?? null;
+    },
+
+    get last_message(): unknown | null {
+      return thread[thread.length - 1] ?? null;
+    },
+
+    get user_messages(): unknown[] {
+      if (_user_messages === undefined) {
+        _user_messages = thread.filter(
+          (m) => isRoleContentMessage(m) && m.role === "user",
+        );
+      }
+      return _user_messages;
+    },
+
+    get assistant_messages(): unknown[] {
+      if (_assistant_messages === undefined) {
+        _assistant_messages = thread.filter(
+          (m) => isRoleContentMessage(m) && m.role === "assistant",
+        );
+      }
+      return _assistant_messages;
+    },
+
+    get human_ai_pairs(): Array<{ human: unknown; assistant: unknown }> {
+      if (_human_ai_pairs === undefined) {
+        _human_ai_pairs = [];
+        const users = thread.filter(
+          (m) => isRoleContentMessage(m) && m.role === "user",
+        );
+        const assistants = thread.filter(
+          (m) => isRoleContentMessage(m) && m.role === "assistant",
+        );
+        const pairCount = Math.min(users.length, assistants.length);
+        for (let i = 0; i < pairCount; i++) {
+          _human_ai_pairs.push({ human: users[i], assistant: assistants[i] });
+        }
+      }
+      return _human_ai_pairs;
+    },
+  };
+}
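Putting the pieces together: `formatMessageArrayAsText` pairs `tool_call` parts collected from assistant turns with the `tool_result` parts in tool turns, and `computeThreadTemplateVars` derives the scalar and array variables. A sketch under the content-part shapes the code above expects (`type`, `tool_call_id`, `tool_name`, `arguments`, `output`); real thread payloads come from the trace preprocessor:

```ts
import {
  computeThreadTemplateVars,
  formatMessageArrayAsText,
} from "autoevals";

const thread = [
  { role: "user", content: "What is the weather in Paris?" },
  {
    role: "assistant",
    content: [
      {
        type: "tool_call",
        tool_call_id: "call_1",
        tool_name: "get_weather",
        arguments: { type: "valid", value: { city: "Paris" } },
      },
    ],
  },
  {
    role: "tool",
    content: [
      {
        type: "tool_result",
        tool_call_id: "call_1",
        tool_name: "get_weather",
        output: '{"text": "Sunny, 22C"}',
      },
    ],
  },
  { role: "assistant", content: "It is sunny and 22C in Paris." },
];

console.log(formatMessageArrayAsText(thread));
// User:
//   What is the weather in Paris?
//
// Tool (get_weather):
//   Args:
//     {"city":"Paris"}
//   Result:
//     Sunny, 22C
//
// Assistant:
//   It is sunny and 22C in Paris.
// (The assistant turn that only carries the tool_call contributes no text.)

const vars = computeThreadTemplateVars(thread);
console.log(vars.thread_count); // 4
console.log(vars.human_ai_pairs.length); // 1: the lone user turn pairs with assistants[0]
```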
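One footnote on the truncation applied to tool args and results: `truncateMiddle` spends its budget symmetrically around an ellipsis marker that reports an estimate of what was cut. Since the helper is module-private, here is the arithmetic traced by hand for a 600-character string and `maxLen = 500`:

```ts
// truncateMiddle("a".repeat(600), 500):
//
//   charsRemoved = 600 - 500 + 30 = 130   (estimate; the +30 pre-pays for
//                                          the marker's own footprint)
//   ellipsis     = " [...130 chars truncated...] "   (29 chars)
//   avail        = 500 - 29 = 471
//   left         = floor(471 / 2) = 235
//   right        = 471 - 235 = 236
//
// Result: 235 leading + 29 marker + 236 trailing = exactly 500 characters,
// with 129 characters actually removed (the reported count is approximate).
```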