From 6e7df8e339b266df0334552969828e60c82c64f2 Mon Sep 17 00:00:00 2001 From: Theo Browne Date: Sat, 29 Nov 2025 02:08:28 -0800 Subject: [PATCH 01/10] new plan --- .cursor/plan.md | 171 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 116 insertions(+), 55 deletions(-) diff --git a/.cursor/plan.md b/.cursor/plan.md index f1b7fed..01b2060 100644 --- a/.cursor/plan.md +++ b/.cursor/plan.md @@ -1,55 +1,116 @@ -## AI Essay → Review → Revision Pipeline - -### Goal - -Implement a **Bun-friendly TypeScript CLI** (`bun run index.ts`) that: - -- Prompts the user for an essay topic. -- Uses **model A (OpenRouter via Vercel AI SDK)** to generate an essay. -- Uses **model B** to review that essay and produce feedback. -- Calls **model A again** with the feedback to produce a revised essay. -- Saves all three artifacts as **markdown files** on disk in a consistent location, with the `runs/` directory ignored by git. - -### High-level Design - -- **Runtime & entrypoint**: Keep using Bun with `index.ts` as the main CLI entrypoint. -- **AI client setup**: -- Add `ai` and the **OpenRouter provider for the Vercel AI SDK** as dependencies (no separate `openai` package needed since we are using OpenRouter directly). -- Configure a small `aiClient.ts` module (or keep logic inline in `index.ts` if very small) that wires the AI SDK to OpenRouter using an `OPENROUTER_API_KEY` env var. -- Hard-code two model IDs (e.g. one for essay generation, one for review) with clear `const` names so you can easily change them later. -- **Pipeline orchestration**: -- Implement a `runEssayPipeline()` function that: -- Reads the prompt from stdin (simple interactive question). -- Calls the **essay model** with a system prompt + user prompt to generate the initial essay. -- Calls the **review model** with system instructions plus the essay content to generate feedback. -- Calls the **essay model** again with the original prompt and the feedback to produce a revised essay. -- Keep everything **strongly typed** with small TypeScript interfaces for the pipeline results. -- **Markdown file output**: -- Decide on a simple folder and naming scheme (e.g. `runs/-essay.md`, `runs/-review.md`, `runs/-revision.md`). -- Use Bun / Node fs APIs in a small utility to write each step as a separate markdown file. -- Include basic front-matter or headings (e.g. `# Original Essay`, `# Review Feedback`, `# Revised Essay`) for easy inspection in an editor. -- Ensure `runs/` is added to `.gitignore` so generated artifacts don’t clutter git history. - -### Implementation Steps - -- **setup-deps**: Add `ai` and the OpenRouter provider for the AI SDK to `package.json` and document the required `OPENROUTER_API_KEY` env var in `README.md`. -- **ai-client**: Create a small AI client configuration that: -- Instantiates the AI SDK with the OpenRouter provider. -- Exposes typed helpers like `generateEssay(prompt)`, `reviewEssay(essay)`, and `reviseEssay(prompt, essay, feedback)`. -- **pipeline-logic**: Implement `runEssayPipeline()` in `index.ts` that: -- Interactively asks for a prompt via stdin. -- Runs the three AI steps in sequence (no streaming needed) with clear logging to the console. -- Returns a typed result object containing the three text outputs. -- **file-output**: Add a small utility function to: -- Create a `runs/` directory if it doesn’t exist. -- Write three markdown files with timestamped names and simple headings. -- Confirm that `runs/` is listed in `.gitignore`. 
-- **polish-types**: Ensure all public functions are type-safe (typed params and return types where helpful) and that the code compiles under the existing `tsconfig`.
-
-### Todos
-
-- **setup-deps**: Add and configure Vercel AI SDK (`ai`) and the OpenRouter provider, and document `OPENROUTER_API_KEY`.
-- **ai-client**: Implement the AI client helper(s) for essay generation, review, and revision using hard-coded OpenRouter model IDs.
-- **pipeline-logic**: Implement the CLI flow in `index.ts` to run the generation → review → revision pipeline.
-- **file-output**: Implement markdown file-writing utilities (create `runs/` directory, timestamped filenames, headings) and ensure `runs/` is in `.gitignore`.
-- **polish-types**: Run TypeScript checks and tighten any loose types if needed.
+# Writing Quality Arena
+
+## Models Configuration
+
+Use the provided `modelsToRun` array in `constants.ts`:
+
+```ts
+export type RunnableModel = {
+  name: string;
+  llm: LanguageModelV1;
+  reasoning: boolean;
+};
+
+export const modelsToRun: RunnableModel[] = [
+  {
+    name: "claude-4.5-opus-reasoning",
+    llm: openrouter("anthropic/claude-opus-4.5"),
+    reasoning: true,
+  },
+  // ... 11 models total
+];
+
+export const PARALLEL_LIMIT = 5; // Configurable concurrency
+```
+
+## Execution Flow (4 Phases)
+
+### Phase 1: Essay Generation
+
+Each model writes an essay on the topic. **N calls**.
+
+### Phase 2: All-to-All Review
+
+Every model reviews every OTHER model's essay (self-review is skipped). **N × (N-1) calls**.
+
+### Phase 3: Per-Reviewer Revisions
+
+Each model produces a separate revised essay for EACH piece of feedback it received. **N × (N-1) revisions**.
+
+### Phase 4: Scoring
+
+Every model scores EVERY essay (N originals + N×(N-1) revisions). Use `generateObject` with Zod schema:
+
+```ts
+const ScoreSchema = z.object({
+  score: z.number().min(1).max(10),
+  justification: z.string(),
+});
+```
+
+**N × (N + N×(N-1)) = N × N² = N³ calls**.
+
+## API Call Summary (N=11 models)
+
+| Phase | Formula | Calls |
+|-------|---------|-------|
+| Essays | N | 11 |
+| Feedback | N×(N-1) | 110 |
+| Revisions | N×(N-1) | 110 |
+| Scores | N³ | 1331 |
+| **Total** | | **1562** |
+
+## Rankings
+
+**Essay Ranking**: All essays (original + revised) are ranked by their average score across all judges.
+
+**Reviewer Ranking**: For each reviewer, avg improvement = mean(revision_score - original_score) over all revisions that used their feedback.
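+
+As a rough sketch of the reviewer-ranking math (the helper and parameter names here are illustrative, not part of the implementation):
+
+```ts
+// Mean improvement a reviewer's feedback produced across all authors.
+// originalScores[author] = mean judge score of the original essay;
+// revisionScores[author][reviewer] = mean judge score of the revision.
+function reviewerImprovement(
+  reviewer: string,
+  originalScores: Record<string, number>,
+  revisionScores: Record<string, Record<string, number>>
+): number {
+  const deltas: number[] = [];
+  for (const [author, byReviewer] of Object.entries(revisionScores)) {
+    if (author === reviewer) continue; // no self-review, so no self-revision
+    const original = originalScores[author];
+    const revised = byReviewer[reviewer];
+    if (original !== undefined && revised !== undefined) {
+      deltas.push(revised - original);
+    }
+  }
+  return deltas.length > 0 ? deltas.reduce((a, b) => a + b, 0) / deltas.length : 0;
+}
+```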
+
+## File Structure
+
+```
+results/{timestamp}/
+├── essays/{model-name}.md
+├── feedback/{reviewer}-on-{author}.md
+├── revisions/{author}-revised-by-{reviewer}.md
+├── results.json
+└── summary.md
+```
+
+## File Changes
+
+| File | Change |
+|------|--------|
+| `constants.ts` | Add `RunnableModel` type, `modelsToRun` array, `PARALLEL_LIMIT` |
+| `types.ts` | Already has appropriate types; verify alignment |
+| `aiClient.ts` | Update functions to accept `RunnableModel`, add `scoreEssay()` using `generateObject` |
+| `index.ts` | Rewrite with 4-phase arena orchestration, parallel execution via `p-limit`, `confirmRun()` |
+| `fileUtils.ts` | Rewrite for arena folder structure (`results/` dir, essays/, feedback/, revisions/, results.json, summary.md) |
+
+## CLI Confirmation
+
+Display call counts and prompt before running:
+
+```ts
+async function confirmRun(): Promise<boolean> {
+  const n = modelsToRun.length;
+  const essays = n;
+  const feedback = n * (n - 1);
+  const revisions = n * (n - 1);
+  const scores = n * n * n;
+  const total = essays + feedback + revisions + scores;
+  // ... display and prompt Y/n
+}
+```

From e853546d7d94dfbf0b4e536179f0a61e5e7575d5 Mon Sep 17 00:00:00 2001
From: Theo Browne
Date: Sat, 29 Nov 2025 02:25:11 -0800
Subject: [PATCH 02/10] arena first pass

---
 aiClient.ts  |  80 ++++---
 bun.lock     |  34 +++
 constants.ts | 104 +++++++++
 fileUtils.ts | 240 ++++++++++++++++----
 index.ts     | 618 +++++++++++++++++++++++++++++++++++++++++++++++----
 package.json |   4 +-
 6 files changed, 966 insertions(+), 114 deletions(-)
 create mode 100644 constants.ts

diff --git a/aiClient.ts b/aiClient.ts
index 96ef8fc..795db1b 100644
--- a/aiClient.ts
+++ b/aiClient.ts
@@ -1,20 +1,6 @@
-import { createOpenRouter } from "@openrouter/ai-sdk-provider";
-import { generateText } from "ai";
-
-// Model IDs - easily changeable constants
-const ESSAY_MODEL = "anthropic/claude-opus-4.5";
-const REVIEW_MODEL = "moonshotai/kimi-k2-thinking";
-
-// Initialize the OpenRouter provider
-if (!process.env.OPENROUTER_API_KEY) {
-  throw new Error(
-    "OPENROUTER_API_KEY environment variable is required. Please set it before running the script."
-  );
-}
-
-const openrouter = createOpenRouter({
-  apiKey: process.env.OPENROUTER_API_KEY,
-});
+import { generateObject, generateText } from "ai";
+import { z } from "zod";
+import type { RunnableModel } from "./constants";
 
 export interface EssayResult {
   text: string;
@@ -28,14 +14,25 @@ export interface RevisionResult {
   text: string;
 }
 
+export const ScoreSchema = z.object({
+  score: z.number().min(1).max(10),
+  justification: z.string(),
+});
+
+export type ScoreResult = z.infer<typeof ScoreSchema>;
+
 /**
  * Generates an essay based on the given topic prompt.
  */
-export async function generateEssay(topic: string): Promise<EssayResult> {
+export async function generateEssay(
+  model: RunnableModel,
+  topic: string
+): Promise<EssayResult> {
   const result = await generateText({
-    model: openrouter(ESSAY_MODEL),
+    model: model.llm,
     system: `You are an expert essay writer. Write a well-structured, thoughtful essay on the given topic.
-The essay should be clear, engaging, and demonstrate strong writing skills.`,
+The essay should be clear, engaging, and demonstrate strong writing skills.
+Write approximately 800-1200 words.`,
     prompt: `Write an essay on the following topic:\n\n${topic}`,
   });
 
   return {
     text: result.text,
   };
 }
 
 /**
  * Reviews an essay and provides constructive feedback.
  */
-export async function reviewEssay(essay: string): Promise<ReviewResult> {
+export async function reviewEssay(
+  model: RunnableModel,
+  essay: string,
+  topic: string
+): Promise<ReviewResult> {
   const result = await generateText({
-    model: openrouter(REVIEW_MODEL),
+    model: model.llm,
     system: `You are an expert writing tutor and editor. Review the essay provided and give constructive,
 specific feedback on areas such as structure, clarity, argumentation, style, and areas for improvement.
-Be thorough but encouraging.`,
-    prompt: `Please review the following essay and provide detailed feedback:\n\n${essay}`,
+Be thorough but encouraging. Focus on actionable improvements.`,
+    prompt: `Topic: ${topic}\n\nPlease review the following essay and provide detailed feedback:\n\n${essay}`,
   });
 
   return {
     text: result.text,
   };
 }
 
 /**
  * Revises an essay based on the original topic, original essay, and review feedback.
  */
 export async function reviseEssay(
+  model: RunnableModel,
   topic: string,
   originalEssay: string,
   feedback: string
 ): Promise<RevisionResult> {
   const result = await generateText({
-    model: openrouter(ESSAY_MODEL),
+    model: model.llm,
     system: `You are an expert essay writer. Revise the provided essay based on the feedback given,
-while maintaining the core message and improving the areas identified.`,
+while maintaining the core message and improving the areas identified.
+Produce a complete revised essay, not just suggestions.`,
     prompt: `Original topic: ${topic}\n\nOriginal essay:\n${originalEssay}\n\nReview feedback:\n${feedback}\n\nPlease revise the essay based on the feedback above.`,
   });
 
   return {
     text: result.text,
   };
 }
+
+/**
+ * Scores an essay on a scale of 1-10 with justification.
+ */
+export async function scoreEssay(
+  model: RunnableModel,
+  essay: string,
+  topic: string
+): Promise<ScoreResult> {
+  const result = await generateObject({
+    model: model.llm,
+    schema: ScoreSchema,
+    system: `You are an expert essay judge. Score the essay on a scale of 1-10 based on:
+- Clarity and coherence of argument
+- Quality of writing (style, grammar, flow)
+- Depth of insight and originality
+- Relevance to the topic
+- Overall effectiveness
+
+Be fair and consistent in your scoring. 
A score of 5 is average, 7-8 is good, 9-10 is exceptional.`, + prompt: `Topic: ${topic}\n\nPlease score the following essay:\n\n${essay}`, + }); + + return result.object; +} diff --git a/bun.lock b/bun.lock index adc4602..325db0a 100644 --- a/bun.lock +++ b/bun.lock @@ -4,6 +4,12 @@ "workspaces": { "": { "name": "auto-draftify", + "dependencies": { + "@openrouter/ai-sdk-provider": "^1.0.0", + "ai": "^5.0.0", + "p-limit": "^6.1.0", + "zod": "^3.24.0", + }, "devDependencies": { "@types/bun": "latest", }, @@ -13,14 +19,42 @@ }, }, "packages": { + "@ai-sdk/gateway": ["@ai-sdk/gateway@2.0.17", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.18", "@vercel/oidc": "3.0.5" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-oVAG6q72KsjKlrYdLhWjRO7rcqAR8CjokAbYuyVZoCO4Uh2PH/VzZoxZav71w2ipwlXhHCNaInGYWNs889MMDA=="], + + "@ai-sdk/provider": ["@ai-sdk/provider@2.0.0", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA=="], + + "@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@3.0.18", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@standard-schema/spec": "^1.0.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-ypv1xXMsgGcNKUP+hglKqtdDuMg68nWHucPPAhIENrbFAI+xCHiqPVN8Zllxyv1TNZwGWUghPxJXU+Mqps0YRQ=="], + + "@openrouter/ai-sdk-provider": ["@openrouter/ai-sdk-provider@1.2.8", "", { "dependencies": { "@openrouter/sdk": "^0.1.8" }, "peerDependencies": { "ai": "^5.0.0", "zod": "^3.24.1 || ^v4" } }, "sha512-pQT8AzZBKg9f4bkt4doF486ZlhK0XjKkevrLkiqYgfh1Jplovieu28nK4Y+xy3sF18/mxjqh9/2y6jh01qzLrA=="], + + "@openrouter/sdk": ["@openrouter/sdk@0.1.27", "", { "dependencies": { "zod": "^3.25.0 || ^4.0.0" } }, "sha512-RH//L10bSmc81q25zAZudiI4kNkLgxF2E+WU42vghp3N6TEvZ6F0jK7uT3tOxkEn91gzmMw9YVmDENy7SJsajQ=="], + + "@opentelemetry/api": ["@opentelemetry/api@1.9.0", "", {}, "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg=="], + + "@standard-schema/spec": ["@standard-schema/spec@1.0.0", "", {}, "sha512-m2bOd0f2RT9k8QJx1JN85cZYyH1RqFBdlwtkSlf4tBDYLCiiZnv1fIIwacK6cqwXavOydf0NPToMQgpKq+dVlA=="], + "@types/bun": ["@types/bun@1.3.3", "", { "dependencies": { "bun-types": "1.3.3" } }, "sha512-ogrKbJ2X5N0kWLLFKeytG0eHDleBYtngtlbu9cyBKFtNL3cnpDZkNdQj8flVf6WTZUX5ulI9AY1oa7ljhSrp+g=="], "@types/node": ["@types/node@24.10.1", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-GNWcUTRBgIRJD5zj+Tq0fKOJ5XZajIiBroOF0yvj2bSU1WvNdYS/dn9UxwsujGW4JX06dnHyjV2y9rRaybH0iQ=="], + "@vercel/oidc": ["@vercel/oidc@3.0.5", "", {}, "sha512-fnYhv671l+eTTp48gB4zEsTW/YtRgRPnkI2nT7x6qw5rkI1Lq2hTmQIpHPgyThI0znLK+vX2n9XxKdXZ7BUbbw=="], + + "ai": ["ai@5.0.104", "", { "dependencies": { "@ai-sdk/gateway": "2.0.17", "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.18", "@opentelemetry/api": "1.9.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-MZOkL9++nY5PfkpWKBR3Rv+Oygxpb9S16ctv8h91GvrSif7UnNEdPMVZe3bUyMd2djxf0AtBk/csBixP0WwWZQ=="], + "bun-types": ["bun-types@1.3.3", "", { "dependencies": { "@types/node": "*" } }, "sha512-z3Xwlg7j2l9JY27x5Qn3Wlyos8YAp0kKRlrePAOjgjMGS5IG6E7Jnlx736vH9UVI4wUICwwhC9anYL++XeOgTQ=="], + "eventsource-parser": ["eventsource-parser@3.0.6", "", {}, "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg=="], + + "json-schema": ["json-schema@0.4.0", "", {}, 
"sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA=="], + + "p-limit": ["p-limit@6.2.0", "", { "dependencies": { "yocto-queue": "^1.1.1" } }, "sha512-kuUqqHNUqoIWp/c467RI4X6mmyuojY5jGutNU0wVTmEOOfcuwLqyMVoAi9MKi2Ak+5i9+nhmrK4ufZE8069kHA=="], + "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="], "undici-types": ["undici-types@7.16.0", "", {}, "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw=="], + + "yocto-queue": ["yocto-queue@1.2.2", "", {}, "sha512-4LCcse/U2MHZ63HAJVE+v71o7yOdIe4cZ70Wpf8D/IyjDKYQLV5GD46B+hSTjJsvV5PztjvHoU580EftxjDZFQ=="], + + "zod": ["zod@3.25.76", "", {}, "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ=="], } } diff --git a/constants.ts b/constants.ts new file mode 100644 index 0000000..2503a2f --- /dev/null +++ b/constants.ts @@ -0,0 +1,104 @@ +import { createOpenRouter } from "@openrouter/ai-sdk-provider"; +import type { LanguageModel } from "ai"; + +// Initialize the OpenRouter provider +if (!process.env.OPENROUTER_API_KEY) { + throw new Error( + "OPENROUTER_API_KEY environment variable is required. Please set it before running the script." + ); +} + +const openrouter = createOpenRouter({ + apiKey: process.env.OPENROUTER_API_KEY, +}); + +// Parallelism configuration +export const PARALLEL_LIMIT = 20; + +// Essay topics +export const TOPICS = [ + "The role of failure in personal growth", + // "Why boredom is underrated", + // "The ethics of artificial intelligence", + // "How social media reshapes human connection", + // "The value of slow living in a fast world", + // "Why we should embrace uncertainty", + // "The hidden costs of convenience", + // "What makes a good explanation", + // "The relationship between creativity and constraint", + // "Why some ideas spread and others don't", +] as const; + +// Model definition +export interface RunnableModel { + name: string; + llm: LanguageModel; + reasoning: boolean; +} + +export const modelsToRun: RunnableModel[] = [ + // Anthropic + { + name: "claude-4.5-opus-reasoning", + llm: openrouter("anthropic/claude-opus-4.5"), + reasoning: true, + }, + { + name: "claude-4.5-opus-non-reasoning", + llm: openrouter("anthropic/claude-opus-4.5"), + reasoning: false, + }, + + // OpenAI + { + name: "gpt-4o", + llm: openrouter("openai/gpt-4o"), + reasoning: false, + }, + { + name: "gpt-5", + llm: openrouter("openai/gpt-5"), + reasoning: true, + }, + { + name: "gpt-5-chat", + llm: openrouter("openai/gpt-5-chat"), + reasoning: false, + }, + { + name: "gpt-5-mini", + llm: openrouter("openai/gpt-5-mini"), + reasoning: true, + }, + + // Google + { + name: "gemini-3-pro-preview", + llm: openrouter("google/gemini-3-pro-preview"), + reasoning: true, + }, + { + name: "gemini-2.5-flash", + llm: openrouter("google/gemini-2.5-pro-preview"), + reasoning: true, + }, + + // Grok + { + name: "grok-4.1-fast", + llm: openrouter("x-ai/grok-4.1-fast"), + reasoning: true, + }, + + // Open Weight + { + name: "kimi-k2", + llm: openrouter("moonshotai/kimi-k2"), + reasoning: false, + }, + { + name: "kimi-k2-thinking", + llm: openrouter("moonshotai/kimi-k2-thinking"), + reasoning: true, + }, +]; diff --git a/fileUtils.ts b/fileUtils.ts index b11c2f9..cc42eae 100644 --- a/fileUtils.ts +++ b/fileUtils.ts @@ -1,26 +1,52 @@ import { mkdir, writeFile } from "fs/promises"; import { join } 
from "path"; -const RUNS_DIR = "runs"; +const RESULTS_DIR = "results"; -export interface PipelineOutput { - essay: string; - review: string; - revision: string; +export interface TopicResults { + topic: string; + essays: Record; + feedback: Record>; + revisions: Record>; + scores: { + original: Record< + string, + Record + >; + revised: Record< + string, + Record> + >; + }; + rankings: { + essays: Array<{ + type: "original" | "revised"; + author: string; + reviewer?: string; + avgScore: number; + }>; + reviewers: Array<{ + reviewer: string; + avgImprovement: number; + }>; + }; } -/** - * Ensures the runs directory exists, creating it if necessary. - */ -async function ensureRunsDirectory(): Promise { - try { - await mkdir(RUNS_DIR, { recursive: true }); - } catch (error) { - // Directory might already exist, which is fine - if ((error as NodeJS.ErrnoException).code !== "EEXIST") { - throw error; - } - } +export interface ArenaResults { + timestamp: string; + models: string[]; + topics: TopicResults[]; + aggregateRankings: { + essays: Array<{ + author: string; + avgScore: number; + avgImprovement: number; + }>; + reviewers: Array<{ + reviewer: string; + avgImprovement: number; + }>; + }; } /** @@ -32,35 +58,167 @@ function getTimestamp(): string { } /** - * Writes the pipeline outputs to markdown files in the runs directory. + * Sanitizes a name for use in filenames. */ -export async function writePipelineOutputs( - outputs: PipelineOutput -): Promise { - await ensureRunsDirectory(); - const timestamp = getTimestamp(); +function sanitizeName(name: string): string { + return name.replace(/[^a-zA-Z0-9-_]/g, "-").toLowerCase(); +} - const files = [ - { - path: join(RUNS_DIR, `${timestamp}-essay.md`), - content: `# Original Essay\n\n${outputs.essay}`, - }, - { - path: join(RUNS_DIR, `${timestamp}-review.md`), - content: `# Review Feedback\n\n${outputs.review}`, - }, - { - path: join(RUNS_DIR, `${timestamp}-revision.md`), - content: `# Revised Essay\n\n${outputs.revision}`, - }, +/** + * Creates the arena results directory structure for a topic. + */ +export async function createTopicDirectories(baseDir: string, topic: string) { + const topicSlug = sanitizeName(topic).slice(0, 50); + const topicDir = join(baseDir, topicSlug); + const dirs = [ + topicDir, + join(topicDir, "essays"), + join(topicDir, "feedback"), + join(topicDir, "revisions"), ]; - for (const file of files) { - await writeFile(file.path, file.content, "utf-8"); + for (const dir of dirs) { + await mkdir(dir, { recursive: true }); } - console.log(`\n✓ Files written:`); - files.forEach((file) => { - console.log(` - ${file.path}`); + return topicDir; +} + +/** + * Writes an essay to the essays directory. + */ +export async function writeEssay( + topicDir: string, + modelName: string, + essay: string +) { + const filename = `${sanitizeName(modelName)}.md`; + const path = join(topicDir, "essays", filename); + await writeFile(path, `# Essay by ${modelName}\n\n${essay}`, "utf-8"); + return path; +} + +/** + * Writes feedback to the feedback directory. + */ +export async function writeFeedback( + topicDir: string, + reviewer: string, + author: string, + feedback: string +) { + const filename = `${sanitizeName(reviewer)}-on-${sanitizeName(author)}.md`; + const path = join(topicDir, "feedback", filename); + await writeFile( + path, + `# Feedback by ${reviewer} on ${author}'s Essay\n\n${feedback}`, + "utf-8" + ); + return path; +} + +/** + * Writes a revision to the revisions directory. 
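+ * Files are written to revisions/ as {author}-revised-by-{reviewer}.md.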
+ */ +export async function writeRevision( + topicDir: string, + author: string, + reviewer: string, + revision: string +) { + const filename = `${sanitizeName(author)}-revised-by-${sanitizeName( + reviewer + )}.md`; + const path = join(topicDir, "revisions", filename); + await writeFile( + path, + `# ${author}'s Essay Revised Based on ${reviewer}'s Feedback\n\n${revision}`, + "utf-8" + ); + return path; +} + +/** + * Writes the complete results JSON file. + */ +export async function writeResultsJson(baseDir: string, results: ArenaResults) { + const path = join(baseDir, "results.json"); + await writeFile(path, JSON.stringify(results, null, 2), "utf-8"); + return path; +} + +/** + * Generates and writes the summary markdown file. + */ +export async function writeSummary(baseDir: string, results: ArenaResults) { + const path = join(baseDir, "summary.md"); + + let content = `# Writing Quality Arena Results\n\n`; + content += `**Date:** ${results.timestamp}\n\n`; + content += `**Models:** ${results.models.length}\n\n`; + content += `**Topics:** ${results.topics.length}\n\n`; + + // Aggregate Model Rankings (as writers) + content += `## Aggregate Model Rankings (as Writers)\n\n`; + content += `| Rank | Model | Avg Score | Avg Improvement |\n`; + content += `|------|-------|-----------|----------------|\n`; + + results.aggregateRankings.essays.forEach((entry, index) => { + const sign = entry.avgImprovement >= 0 ? "+" : ""; + content += `| ${index + 1} | ${entry.author} | ${entry.avgScore.toFixed( + 2 + )} | ${sign}${entry.avgImprovement.toFixed(2)} |\n`; }); + + // Aggregate Reviewer Rankings + content += `\n## Aggregate Reviewer Rankings (by Improvement Impact)\n\n`; + content += `| Rank | Reviewer | Avg Improvement |\n`; + content += `|------|----------|----------------|\n`; + + results.aggregateRankings.reviewers.forEach((entry, index) => { + const sign = entry.avgImprovement >= 0 ? "+" : ""; + content += `| ${index + 1} | ${ + entry.reviewer + } | ${sign}${entry.avgImprovement.toFixed(2)} |\n`; + }); + + // Per-topic summaries + content += `\n## Per-Topic Results\n\n`; + + for (const topic of results.topics) { + content += `### ${topic.topic}\n\n`; + + // Top 3 essays for this topic + content += `**Top 3 Essays:**\n`; + topic.rankings.essays.slice(0, 3).forEach((entry, index) => { + const reviewer = entry.reviewer ? ` (← ${entry.reviewer})` : ""; + content += `${index + 1}. ${entry.author}${reviewer} [${ + entry.type + }] - ${entry.avgScore.toFixed(2)}\n`; + }); + + // Top 3 reviewers for this topic + content += `\n**Top 3 Reviewers:**\n`; + topic.rankings.reviewers.slice(0, 3).forEach((entry, index) => { + const sign = entry.avgImprovement >= 0 ? "+" : ""; + content += `${index + 1}. ${ + entry.reviewer + } - ${sign}${entry.avgImprovement.toFixed(2)}\n`; + }); + + content += `\n`; + } + + await writeFile(path, content, "utf-8"); + return path; +} + +/** + * Creates a new arena run and returns the base directory and timestamp. 
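+ * The directory is results/{timestamp}/, created recursively before any topic runs.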
+ */ +export async function initArenaRun() { + const timestamp = getTimestamp(); + const baseDir = join(RESULTS_DIR, timestamp); + await mkdir(baseDir, { recursive: true }); + return { baseDir, timestamp }; } diff --git a/index.ts b/index.ts index c68e297..bc05dcb 100644 --- a/index.ts +++ b/index.ts @@ -1,66 +1,592 @@ -import { generateEssay, reviewEssay, reviseEssay } from "./aiClient"; -import { writePipelineOutputs } from "./fileUtils"; +import pLimit from "p-limit"; +import { + generateEssay, + reviewEssay, + reviseEssay, + scoreEssay, +} from "./aiClient"; +import { modelsToRun, PARALLEL_LIMIT, TOPICS } from "./constants"; +import { + createTopicDirectories, + initArenaRun, + writeEssay, + writeFeedback, + writeResultsJson, + writeRevision, + writeSummary, + type ArenaResults, + type TopicResults, +} from "./fileUtils"; + +const limit = pLimit(PARALLEL_LIMIT); + +/** + * Counts the actual API calls for each phase based on model configuration. + */ +function countApiCalls() { + let essays = 0; + let feedback = 0; + let revisions = 0; + let scores = 0; + + // Per topic counts + for (const _topic of TOPICS) { + // Phase 1: Essays + for (const _model of modelsToRun) { + essays++; + } + + // Phase 2: Feedback (each model reviews every OTHER model's essay) + for (const reviewer of modelsToRun) { + for (const author of modelsToRun) { + if (reviewer.name === author.name) continue; + feedback++; + } + } + + // Phase 3: Revisions (each author revises for each reviewer's feedback) + for (const author of modelsToRun) { + for (const reviewer of modelsToRun) { + if (author.name === reviewer.name) continue; + revisions++; + } + } + + // Phase 4: Scoring (every model scores every essay) + // Original essays + for (const _judge of modelsToRun) { + for (const _author of modelsToRun) { + scores++; + } + } + // Revised essays + for (const _judge of modelsToRun) { + for (const author of modelsToRun) { + for (const reviewer of modelsToRun) { + if (author.name === reviewer.name) continue; + scores++; + } + } + } + } + + return { + essays, + feedback, + revisions, + scores, + total: essays + feedback + revisions + scores, + }; +} /** - * Prompts the user for an essay topic via stdin. + * Prompts the user for confirmation before running the arena. */ -async function promptForTopic(): Promise { - const prompt = "Enter your essay topic: "; - process.stdout.write(prompt); +async function confirmRun(): Promise { + const { essays, feedback, revisions, scores, total } = countApiCalls(); + + console.log("\n🏟️ Writing Quality Arena\n"); + console.log(`Models: ${modelsToRun.length}`); + console.log(`Topics: ${TOPICS.length}`); + console.log(`\nAPI Call Breakdown (across all ${TOPICS.length} topics):`); + console.log(` Phase 1 - Essays: ${essays.toString().padStart(6)} calls`); + console.log( + ` Phase 2 - Feedback: ${feedback.toString().padStart(6)} calls` + ); + console.log( + ` Phase 3 - Revisions: ${revisions.toString().padStart(6)} calls` + ); + console.log(` Phase 4 - Scores: ${scores.toString().padStart(6)} calls`); + console.log(` ────────────────────────────`); + console.log(` Total: ${total.toString().padStart(6)} calls\n`); + console.log(`Parallelism: ${PARALLEL_LIMIT} concurrent requests\n`); + + process.stdout.write("Proceed? (Y/n): "); return new Promise((resolve) => { process.stdin.once("data", (data) => { - const topic = data.toString().trim(); - if (!topic) { - console.error("Topic cannot be empty. 
Please try again."); - process.exit(1); - } - resolve(topic); + const input = data.toString().trim().toLowerCase(); + resolve(input === "" || input === "y" || input === "yes"); }); }); } /** - * Runs the complete essay pipeline: generation → review → revision. + * Phase 1: Each model generates an essay on the topic. */ -async function runEssayPipeline(): Promise { - console.log("🎓 Auto-Draftify: Essay Generation Pipeline\n"); - - // Step 1: Get topic from user - const topic = await promptForTopic(); - console.log(`\n📝 Topic: ${topic}\n`); - - // Step 2: Generate initial essay - console.log("Step 1/3: Generating initial essay..."); - const essayResult = await generateEssay(topic); - console.log("✓ Essay generated\n"); - - // Step 3: Review the essay - console.log("Step 2/3: Reviewing essay..."); - const reviewResult = await reviewEssay(essayResult.text); - console.log("✓ Review completed\n"); - - // Step 4: Revise the essay - console.log("Step 3/3: Revising essay based on feedback..."); - const revisionResult = await reviseEssay( - topic, - essayResult.text, - reviewResult.text +async function runPhase1Essays( + topic: string, + topicDir: string +): Promise> { + const essays: Record = {}; + + const tasks = modelsToRun.map((model) => + limit(async () => { + console.log(` Generating essay: ${model.name}...`); + const result = await generateEssay(model, topic); + essays[model.name] = result.text; + await writeEssay(topicDir, model.name, result.text); + console.log(` ✓ ${model.name}`); + return result; + }) ); - console.log("✓ Revision completed\n"); - // Step 5: Write outputs to files - await writePipelineOutputs({ - essay: essayResult.text, - review: reviewResult.text, - revision: revisionResult.text, + await Promise.all(tasks); + return essays; +} + +/** + * Phase 2: Every model reviews every OTHER model's essay. + */ +async function runPhase2Feedback( + topic: string, + essays: Record, + topicDir: string +): Promise>> { + const feedback: Record> = {}; + + // Initialize nested objects + for (const reviewer of modelsToRun) { + feedback[reviewer.name] = {}; + } + + const tasks: Array> = []; + + for (const reviewer of modelsToRun) { + for (const author of modelsToRun) { + if (reviewer.name === author.name) continue; + + tasks.push( + limit(async () => { + console.log(` ${reviewer.name} reviewing ${author.name}...`); + const essayText = essays[author.name]!; + const result = await reviewEssay(reviewer, essayText, topic); + feedback[reviewer.name]![author.name] = result.text; + await writeFeedback( + topicDir, + reviewer.name, + author.name, + result.text + ); + console.log(` ✓ ${reviewer.name} → ${author.name}`); + }) + ); + } + } + + await Promise.all(tasks); + return feedback; +} + +/** + * Phase 3: Each author revises their essay for EACH piece of feedback received. 
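+ * With N models this is N × (N - 1) revision calls per topic, since self-feedback is skipped.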
+ */ +async function runPhase3Revisions( + topic: string, + essays: Record, + feedback: Record>, + topicDir: string +): Promise>> { + const revisions: Record> = {}; + + // Initialize nested objects + for (const author of modelsToRun) { + revisions[author.name] = {}; + } + + const tasks: Array> = []; + + for (const author of modelsToRun) { + for (const reviewer of modelsToRun) { + if (author.name === reviewer.name) continue; + + tasks.push( + limit(async () => { + const reviewerFeedback = feedback[reviewer.name]![author.name]!; + const essayText = essays[author.name]!; + console.log( + ` ${author.name} revising based on ${reviewer.name}...` + ); + const result = await reviseEssay( + author, + topic, + essayText, + reviewerFeedback + ); + revisions[author.name]![reviewer.name] = result.text; + await writeRevision( + topicDir, + author.name, + reviewer.name, + result.text + ); + console.log(` ✓ ${author.name} ← ${reviewer.name}`); + }) + ); + } + } + + await Promise.all(tasks); + return revisions; +} + +/** + * Phase 4: Every model scores every essay (original and revised). + */ +async function runPhase4Scoring( + topic: string, + essays: Record, + revisions: Record> +): Promise<{ + original: Record< + string, + Record + >; + revised: Record< + string, + Record> + >; +}> { + const originalScores: Record< + string, + Record + > = {}; + const revisedScores: Record< + string, + Record> + > = {}; + + // Initialize nested objects + for (const judge of modelsToRun) { + originalScores[judge.name] = {}; + revisedScores[judge.name] = {}; + for (const author of modelsToRun) { + revisedScores[judge.name]![author.name] = {}; + } + } + + const tasks: Array> = []; + + // Score original essays + for (const judge of modelsToRun) { + for (const author of modelsToRun) { + tasks.push( + limit(async () => { + const essayText = essays[author.name]!; + console.log(` ${judge.name} scoring ${author.name} (original)...`); + const result = await scoreEssay(judge, essayText, topic); + originalScores[judge.name]![author.name] = result; + console.log( + ` ✓ ${judge.name} → ${author.name} (original): ${result.score}` + ); + }) + ); + } + } + + // Score revised essays + for (const judge of modelsToRun) { + for (const author of modelsToRun) { + for (const reviewer of modelsToRun) { + if (author.name === reviewer.name) continue; + + tasks.push( + limit(async () => { + const revision = revisions[author.name]![reviewer.name]!; + console.log( + ` ${judge.name} scoring ${author.name}←${reviewer.name} (revised)...` + ); + const result = await scoreEssay(judge, revision, topic); + revisedScores[judge.name]![author.name]![reviewer.name] = result; + console.log( + ` ✓ ${judge.name} → ${author.name}←${reviewer.name}: ${result.score}` + ); + }) + ); + } + } + } + + await Promise.all(tasks); + return { original: originalScores, revised: revisedScores }; +} + +/** + * Calculate rankings from scores for a single topic. 
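+ * Essays are ranked by mean judge score; reviewers by the mean (revised - original) delta their feedback produced.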
+ */ +function calculateRankings(scores: { + original: Record< + string, + Record + >; + revised: Record< + string, + Record> + >; +}): TopicResults["rankings"] { + const essayScores: Array<{ + type: "original" | "revised"; + author: string; + reviewer?: string; + avgScore: number; + }> = []; + + const judges = Object.keys(scores.original); + const firstJudge = judges[0]!; + const authors = Object.keys(scores.original[firstJudge]!); + + // Calculate average scores for original essays + for (const author of authors) { + const judgeScores = judges.map((j) => scores.original[j]![author]!.score); + const avgScore = + judgeScores.reduce((a, b) => a + b, 0) / judgeScores.length; + essayScores.push({ type: "original", author, avgScore }); + } + + // Calculate average scores for revised essays + for (const author of authors) { + for (const reviewer of authors) { + if (author === reviewer) continue; + const judgeScores = judges.map( + (j) => scores.revised[j]![author]![reviewer]!.score + ); + const avgScore = + judgeScores.reduce((a, b) => a + b, 0) / judgeScores.length; + essayScores.push({ type: "revised", author, reviewer, avgScore }); + } + } + + // Sort by average score descending + essayScores.sort((a, b) => b.avgScore - a.avgScore); + + // Calculate reviewer impact (average improvement from their feedback) + const reviewerImpact: Record = {}; + for (const reviewer of authors) { + reviewerImpact[reviewer] = []; + } + + for (const author of authors) { + const originalAvg = + judges.reduce((sum, j) => sum + scores.original[j]![author]!.score, 0) / + judges.length; + + for (const reviewer of authors) { + if (author === reviewer) continue; + const revisedAvg = + judges.reduce( + (sum, j) => sum + scores.revised[j]![author]![reviewer]!.score, + 0 + ) / judges.length; + const improvement = revisedAvg - originalAvg; + reviewerImpact[reviewer]!.push(improvement); + } + } + + const reviewerScores = Object.entries(reviewerImpact).map( + ([reviewer, improvements]) => ({ + reviewer, + avgImprovement: + improvements.reduce((a, b) => a + b, 0) / improvements.length, + }) + ); + + // Sort by average improvement descending + reviewerScores.sort((a, b) => b.avgImprovement - a.avgImprovement); + + return { + essays: essayScores, + reviewers: reviewerScores, + }; +} + +/** + * Calculate aggregate rankings across all topics. 
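+ * Averages each author's per-topic scores and each reviewer's per-topic improvements into a single leaderboard.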
+ */ +function calculateAggregateRankings( + topics: TopicResults[] +): ArenaResults["aggregateRankings"] { + // Aggregate scores per model (as writer) + const modelScores: Record< + string, + { scores: number[]; improvements: number[] } + > = {}; + // Aggregate improvements per reviewer + const reviewerImprovements: Record = {}; + + for (const topic of topics) { + // Get original essay scores per author + const originalByAuthor: Record = {}; + for (const entry of topic.rankings.essays) { + if (entry.type === "original") { + originalByAuthor[entry.author] = entry.avgScore; + if (!modelScores[entry.author]) { + modelScores[entry.author] = { scores: [], improvements: [] }; + } + modelScores[entry.author]!.scores.push(entry.avgScore); + } + } + + // Calculate improvement for revised essays + for (const entry of topic.rankings.essays) { + if (entry.type === "revised" && entry.reviewer) { + const original = originalByAuthor[entry.author]!; + const improvement = entry.avgScore - original; + modelScores[entry.author]!.improvements.push(improvement); + + if (!reviewerImprovements[entry.reviewer]) { + reviewerImprovements[entry.reviewer] = []; + } + reviewerImprovements[entry.reviewer]!.push(improvement); + } + } + } + + // Calculate averages for essays + const essayRankings = Object.entries(modelScores).map(([author, data]) => ({ + author, + avgScore: data.scores.reduce((a, b) => a + b, 0) / data.scores.length, + avgImprovement: + data.improvements.length > 0 + ? data.improvements.reduce((a, b) => a + b, 0) / + data.improvements.length + : 0, + })); + essayRankings.sort((a, b) => b.avgScore - a.avgScore); + + // Calculate averages for reviewers + const reviewerRankings = Object.entries(reviewerImprovements).map( + ([reviewer, improvements]) => ({ + reviewer, + avgImprovement: + improvements.reduce((a, b) => a + b, 0) / improvements.length, + }) + ); + reviewerRankings.sort((a, b) => b.avgImprovement - a.avgImprovement); + + return { + essays: essayRankings, + reviewers: reviewerRankings, + }; +} + +/** + * Run all phases for a single topic. + */ +async function runTopicArena( + topic: string, + topicIndex: number, + totalTopics: number, + baseDir: string +): Promise { + console.log( + `\n${"═".repeat(60)}\n📚 Topic ${ + topicIndex + 1 + }/${totalTopics}: "${topic}"\n${"═".repeat(60)}` + ); + + const topicDir = await createTopicDirectories(baseDir, topic); + + // Phase 1: Generate essays + console.log("\n 📝 Phase 1: Essay Generation"); + const essays = await runPhase1Essays(topic, topicDir); + console.log(` ✓ Phase 1 complete: ${modelsToRun.length} essays`); + + // Phase 2: Generate feedback + console.log("\n 📋 Phase 2: Feedback Generation"); + const feedback = await runPhase2Feedback(topic, essays, topicDir); + const feedbackCount = modelsToRun.length * (modelsToRun.length - 1); + console.log(` ✓ Phase 2 complete: ${feedbackCount} feedback pieces`); + + // Phase 3: Generate revisions + console.log("\n ✏️ Phase 3: Revisions"); + const revisions = await runPhase3Revisions(topic, essays, feedback, topicDir); + console.log(` ✓ Phase 3 complete: ${feedbackCount} revisions`); + + // Phase 4: Score all essays + console.log("\n ⭐ Phase 4: Scoring"); + const scores = await runPhase4Scoring(topic, essays, revisions); + console.log(` ✓ Phase 4 complete`); + + // Calculate rankings for this topic + const rankings = calculateRankings(scores); + + return { + topic, + essays, + feedback, + revisions, + scores, + rankings, + }; +} + +/** + * Main arena orchestration. 
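+ * Confirms the call budget, runs every topic through all four phases, then writes results.json and summary.md.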
+ */ +async function runArena(): Promise { + const confirmed = await confirmRun(); + if (!confirmed) { + console.log("\nAborted."); + process.exit(0); + } + + const { baseDir, timestamp } = await initArenaRun(); + console.log(`\nResults will be saved to: ${baseDir}`); + + // Run arena for each topic + const topicResults: TopicResults[] = []; + + for (let i = 0; i < TOPICS.length; i++) { + const topic = TOPICS[i]!; + const result = await runTopicArena(topic, i, TOPICS.length, baseDir); + topicResults.push(result); + } + + // Calculate aggregate rankings + console.log("\n\n📊 Calculating aggregate rankings...\n"); + const aggregateRankings = calculateAggregateRankings(topicResults); + + // Compile results + const results: ArenaResults = { + timestamp, + models: modelsToRun.map((m) => m.name), + topics: topicResults, + aggregateRankings, + }; + + // Write final results + await writeResultsJson(baseDir, results); + await writeSummary(baseDir, results); + + // Print summary + console.log("═".repeat(60)); + console.log("\n🏆 AGGREGATE RESULTS\n"); + + console.log("Top 5 Models (as Writers):\n"); + aggregateRankings.essays.slice(0, 5).forEach((entry, index) => { + const sign = entry.avgImprovement >= 0 ? "+" : ""; + console.log( + ` ${index + 1}. ${entry.author} - ${entry.avgScore.toFixed( + 2 + )} avg (${sign}${entry.avgImprovement.toFixed(2)} after feedback)` + ); + }); + + console.log("\n🎯 Top 5 Reviewers (by improvement impact):\n"); + aggregateRankings.reviewers.slice(0, 5).forEach((entry, index) => { + const sign = entry.avgImprovement >= 0 ? "+" : ""; + console.log( + ` ${index + 1}. ${ + entry.reviewer + } - ${sign}${entry.avgImprovement.toFixed(2)}` + ); }); - console.log("\n✨ Pipeline complete!"); + console.log(`\n✨ Arena complete! Results saved to: ${baseDir}`); } -// Run the pipeline -runEssayPipeline().catch((error) => { - console.error("Error running pipeline:", error); +// Run the arena +runArena().catch((error) => { + console.error("Error running arena:", error); process.exit(1); }); diff --git a/package.json b/package.json index f21c41c..e0f5ced 100644 --- a/package.json +++ b/package.json @@ -5,7 +5,9 @@ "private": true, "dependencies": { "ai": "^5.0.0", - "@openrouter/ai-sdk-provider": "^1.0.0" + "@openrouter/ai-sdk-provider": "^1.0.0", + "p-limit": "^6.1.0", + "zod": "^3.24.0" }, "devDependencies": { "@types/bun": "latest" From f5b8f92a14140684e07ed6c139f4fe4d2f888b31 Mon Sep 17 00:00:00 2001 From: Theo Browne Date: Sat, 29 Nov 2025 03:55:46 -0800 Subject: [PATCH 03/10] working --- .gitignore | 1 + aiClient.ts | 70 ++++++++++++++---- constants.ts | 27 +++++-- index.ts | 195 ++++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 269 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index 1c7683b..0cb6022 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,4 @@ report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json # Generated essay runs runs/ +results/ diff --git a/aiClient.ts b/aiClient.ts index 795db1b..ea38cb6 100644 --- a/aiClient.ts +++ b/aiClient.ts @@ -1,25 +1,34 @@ -import { generateObject, generateText } from "ai"; -import { z } from "zod"; +import { generateText } from "ai"; import type { RunnableModel } from "./constants"; +export interface TokenUsage { + inputTokens: number; + outputTokens: number; + totalTokens: number; + /** Cost in USD from OpenRouter */ + cost: number; +} + export interface EssayResult { text: string; + usage: TokenUsage; } export interface ReviewResult { text: string; + usage: TokenUsage; } export interface RevisionResult { 
text: string; + usage: TokenUsage; } -export const ScoreSchema = z.object({ - score: z.number().min(1).max(10), - justification: z.string(), -}); - -export type ScoreResult = z.infer; +export interface ScoreResult { + score: number; + justification: string; + usage: TokenUsage; +} /** * Generates an essay based on the given topic prompt. @@ -38,6 +47,12 @@ Write approximately 800-1200 words.`, return { text: result.text, + usage: { + inputTokens: result.usage?.inputTokens ?? 0, + outputTokens: result.usage?.outputTokens ?? 0, + totalTokens: result.usage?.totalTokens ?? 0, + cost: (result.providerMetadata?.openrouter?.cost as number) ?? 0, + }, }; } @@ -59,6 +74,12 @@ Be thorough but encouraging. Focus on actionable improvements.`, return { text: result.text, + usage: { + inputTokens: result.usage?.inputTokens ?? 0, + outputTokens: result.usage?.outputTokens ?? 0, + totalTokens: result.usage?.totalTokens ?? 0, + cost: (result.providerMetadata?.openrouter?.cost as number) ?? 0, + }, }; } @@ -81,6 +102,12 @@ Produce a complete revised essay, not just suggestions.`, return { text: result.text, + usage: { + inputTokens: result.usage?.inputTokens ?? 0, + outputTokens: result.usage?.outputTokens ?? 0, + totalTokens: result.usage?.totalTokens ?? 0, + cost: (result.providerMetadata?.openrouter?.cost as number) ?? 0, + }, }; } @@ -92,9 +119,8 @@ export async function scoreEssay( essay: string, topic: string ): Promise { - const result = await generateObject({ + const result = await generateText({ model: model.llm, - schema: ScoreSchema, system: `You are an expert essay judge. Score the essay on a scale of 1-10 based on: - Clarity and coherence of argument - Quality of writing (style, grammar, flow) @@ -102,9 +128,29 @@ export async function scoreEssay( - Relevance to the topic - Overall effectiveness -Be fair and consistent in your scoring. A score of 5 is average, 7-8 is good, 9-10 is exceptional.`, +Be fair and consistent in your scoring. A score of 5 is average, 7-8 is good, 9-10 is exceptional. + +IMPORTANT: Start your response with EXACTLY "Score: X/10" on the first line (where X is your score), then provide your detailed justification below.`, prompt: `Topic: ${topic}\n\nPlease score the following essay:\n\n${essay}`, }); - return result.object; + // Parse score from the text - look for "Score: X/10" or similar patterns + const scoreMatch = result.text.match(/Score:\s*(\d+(?:\.\d+)?)\s*\/\s*10/i); + const score = scoreMatch?.[1] ? parseFloat(scoreMatch[1]) : 5; // Default to 5 if parsing fails + + // Everything after the score line is the justification + const justification = result.text + .replace(/^Score:\s*\d+(?:\.\d+)?\s*\/\s*10\s*/i, "") + .trim(); + + return { + score: Math.min(10, Math.max(1, score)), // Clamp between 1-10 + justification, + usage: { + inputTokens: result.usage?.inputTokens ?? 0, + outputTokens: result.usage?.outputTokens ?? 0, + totalTokens: result.usage?.totalTokens ?? 0, + cost: (result.providerMetadata?.openrouter?.cost as number) ?? 
0, + }, + }; } diff --git a/constants.ts b/constants.ts index 2503a2f..c1a2d2b 100644 --- a/constants.ts +++ b/constants.ts @@ -40,12 +40,12 @@ export const modelsToRun: RunnableModel[] = [ // Anthropic { name: "claude-4.5-opus-reasoning", - llm: openrouter("anthropic/claude-opus-4.5"), + llm: openrouter("anthropic/claude-opus-4-5"), reasoning: true, }, { name: "claude-4.5-opus-non-reasoning", - llm: openrouter("anthropic/claude-opus-4.5"), + llm: openrouter("anthropic/claude-opus-4-5"), reasoning: false, }, @@ -78,8 +78,8 @@ export const modelsToRun: RunnableModel[] = [ reasoning: true, }, { - name: "gemini-2.5-flash", - llm: openrouter("google/gemini-2.5-pro-preview"), + name: "gemini-2.5-pro", + llm: openrouter("google/gemini-2.5-pro"), reasoning: true, }, @@ -102,3 +102,22 @@ export const modelsToRun: RunnableModel[] = [ reasoning: true, }, ]; + +// Cheap models for dry-run testing +export const dryRunModels: RunnableModel[] = [ + { + name: "claude-4.5-haiku", + llm: openrouter("anthropic/claude-haiku-4.5"), + reasoning: false, + }, + { + name: "gemini-2.5-flash", + llm: openrouter("google/gemini-2.5-flash"), + reasoning: true, + }, + { + name: "gpt-5-mini", + llm: openrouter("openai/gpt-5-mini"), + reasoning: true, + }, +]; diff --git a/index.ts b/index.ts index bc05dcb..98876f3 100644 --- a/index.ts +++ b/index.ts @@ -4,8 +4,18 @@ import { reviewEssay, reviseEssay, scoreEssay, + type TokenUsage, } from "./aiClient"; -import { modelsToRun, PARALLEL_LIMIT, TOPICS } from "./constants"; +import { + modelsToRun as allModels, + dryRunModels, + PARALLEL_LIMIT, + TOPICS, +} from "./constants"; + +// Parse CLI flags +const isDryRun = process.argv.includes("--dry-run"); +const modelsToRun = isDryRun ? dryRunModels : allModels; import { createTopicDirectories, initArenaRun, @@ -20,6 +30,34 @@ import { const limit = pLimit(PARALLEL_LIMIT); +/** + * Tracks token usage and costs per model per phase. + */ +interface UsageTracker { + essays: Record; + reviews: Record; + revisions: Record; + scores: Record; +} + +function createUsageTracker(): UsageTracker { + const tracker: UsageTracker = { + essays: {}, + reviews: {}, + revisions: {}, + scores: {}, + }; + for (const model of modelsToRun) { + tracker.essays[model.name] = []; + tracker.reviews[model.name] = []; + tracker.revisions[model.name] = []; + tracker.scores[model.name] = []; + } + return tracker; +} + +const usageTracker = createUsageTracker(); + /** * Counts the actual API calls for each phase based on model configuration. 
*/ @@ -86,6 +124,9 @@ async function confirmRun(): Promise { const { essays, feedback, revisions, scores, total } = countApiCalls(); console.log("\n🏟️ Writing Quality Arena\n"); + if (isDryRun) { + console.log("⚡ DRY RUN MODE (using cheap models)\n"); + } console.log(`Models: ${modelsToRun.length}`); console.log(`Topics: ${TOPICS.length}`); console.log(`\nAPI Call Breakdown (across all ${TOPICS.length} topics):`); @@ -125,8 +166,13 @@ async function runPhase1Essays( console.log(` Generating essay: ${model.name}...`); const result = await generateEssay(model, topic); essays[model.name] = result.text; + usageTracker.essays[model.name]!.push(result.usage); await writeEssay(topicDir, model.name, result.text); - console.log(` ✓ ${model.name}`); + console.log( + ` ✓ ${model.name} (${ + result.usage.totalTokens + } tokens, $${result.usage.cost.toFixed(4)})` + ); return result; }) ); @@ -162,13 +208,18 @@ async function runPhase2Feedback( const essayText = essays[author.name]!; const result = await reviewEssay(reviewer, essayText, topic); feedback[reviewer.name]![author.name] = result.text; + usageTracker.reviews[reviewer.name]!.push(result.usage); await writeFeedback( topicDir, reviewer.name, author.name, result.text ); - console.log(` ✓ ${reviewer.name} → ${author.name}`); + console.log( + ` ✓ ${reviewer.name} → ${author.name} (${ + result.usage.totalTokens + } tokens, $${result.usage.cost.toFixed(4)})` + ); }) ); } @@ -214,13 +265,18 @@ async function runPhase3Revisions( reviewerFeedback ); revisions[author.name]![reviewer.name] = result.text; + usageTracker.revisions[author.name]!.push(result.usage); await writeRevision( topicDir, author.name, reviewer.name, result.text ); - console.log(` ✓ ${author.name} ← ${reviewer.name}`); + console.log( + ` ✓ ${author.name} ← ${reviewer.name} (${ + result.usage.totalTokens + } tokens, $${result.usage.cost.toFixed(4)})` + ); }) ); } @@ -275,9 +331,17 @@ async function runPhase4Scoring( const essayText = essays[author.name]!; console.log(` ${judge.name} scoring ${author.name} (original)...`); const result = await scoreEssay(judge, essayText, topic); - originalScores[judge.name]![author.name] = result; + originalScores[judge.name]![author.name] = { + score: result.score, + justification: result.justification, + }; + usageTracker.scores[judge.name]!.push(result.usage); console.log( - ` ✓ ${judge.name} → ${author.name} (original): ${result.score}` + ` ✓ ${judge.name} → ${author.name} (original): ${ + result.score + } (${result.usage.totalTokens} tokens, $${result.usage.cost.toFixed( + 4 + )})` ); }) ); @@ -297,9 +361,17 @@ async function runPhase4Scoring( ` ${judge.name} scoring ${author.name}←${reviewer.name} (revised)...` ); const result = await scoreEssay(judge, revision, topic); - revisedScores[judge.name]![author.name]![reviewer.name] = result; + revisedScores[judge.name]![author.name]![reviewer.name] = { + score: result.score, + justification: result.justification, + }; + usageTracker.scores[judge.name]!.push(result.usage); console.log( - ` ✓ ${judge.name} → ${author.name}←${reviewer.name}: ${result.score}` + ` ✓ ${judge.name} → ${author.name}←${reviewer.name}: ${ + result.score + } (${ + result.usage.totalTokens + } tokens, $${result.usage.cost.toFixed(4)})` ); }) ); @@ -582,9 +654,116 @@ async function runArena(): Promise { ); }); + // Print usage and cost summary + printUsageSummary(); + console.log(`\n✨ Arena complete! Results saved to: ${baseDir}`); } +/** + * Calculates average tokens from an array of usage records. 
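+ * Note: token counts are averaged across the records, while cost is the summed total.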
+ */ +function calcAverage(usages: TokenUsage[]) { + if (usages.length === 0) return { tokens: 0, cost: 0 }; + const totalTokens = usages.reduce((sum, u) => sum + u.totalTokens, 0); + const totalCost = usages.reduce((sum, u) => sum + u.cost, 0); + return { + tokens: Math.round(totalTokens / usages.length), + cost: totalCost, + }; +} + +/** + * Prints a summary of token usage and costs. + */ +function printUsageSummary() { + console.log("\n" + "═".repeat(60)); + console.log("\n💰 TOKEN USAGE & COST SUMMARY\n"); + + // Calculate phase totals + let totalEssayCost = 0; + let totalReviewCost = 0; + let totalRevisionCost = 0; + let totalScoreCost = 0; + + // Per-model stats + const modelStats: Array<{ + name: string; + essayAvgTokens: number; + essayCost: number; + reviewAvgTokens: number; + reviewCost: number; + revisionAvgTokens: number; + revisionCost: number; + scoreCost: number; + totalCost: number; + }> = []; + + for (const model of modelsToRun) { + const essayStats = calcAverage(usageTracker.essays[model.name]!); + const reviewStats = calcAverage(usageTracker.reviews[model.name]!); + const revisionStats = calcAverage(usageTracker.revisions[model.name]!); + const scoreStats = calcAverage(usageTracker.scores[model.name]!); + + totalEssayCost += essayStats.cost; + totalReviewCost += reviewStats.cost; + totalRevisionCost += revisionStats.cost; + totalScoreCost += scoreStats.cost; + + modelStats.push({ + name: model.name, + essayAvgTokens: essayStats.tokens, + essayCost: essayStats.cost, + reviewAvgTokens: reviewStats.tokens, + reviewCost: reviewStats.cost, + revisionAvgTokens: revisionStats.tokens, + revisionCost: revisionStats.cost, + scoreCost: scoreStats.cost, + totalCost: + essayStats.cost + + reviewStats.cost + + revisionStats.cost + + scoreStats.cost, + }); + } + + const grandTotal = + totalEssayCost + totalReviewCost + totalRevisionCost + totalScoreCost; + + // Print phase cost breakdown + console.log("Phase Costs:"); + console.log(` Essays (First): $${totalEssayCost.toFixed(4)}`); + console.log(` Reviews: $${totalReviewCost.toFixed(4)}`); + console.log(` Revisions (Follow): $${totalRevisionCost.toFixed(4)}`); + console.log(` Scoring: $${totalScoreCost.toFixed(4)}`); + console.log(` ────────────────────────────`); + console.log(` Total: $${grandTotal.toFixed(4)}`); + + // Print per-model breakdown + console.log("\n\nPer-Model Token Averages & Costs:\n"); + console.log( + " Model".padEnd(32) + + "First Essay".padStart(14) + + "Reviews".padStart(14) + + "Follow-up".padStart(14) + + "Total Cost".padStart(12) + ); + console.log(" " + "─".repeat(84)); + + for (const stat of modelStats.sort((a, b) => b.totalCost - a.totalCost)) { + const essayCol = `${stat.essayAvgTokens} tok`.padStart(14); + const reviewCol = `${stat.reviewAvgTokens} tok`.padStart(14); + const revisionCol = `${stat.revisionAvgTokens} tok`.padStart(14); + const costCol = `$${stat.totalCost.toFixed(4)}`.padStart(12); + console.log( + ` ${stat.name.padEnd(30)}${essayCol}${reviewCol}${revisionCol}${costCol}` + ); + } + + console.log("\n " + "─".repeat(84)); + console.log(` ${"GRAND TOTAL".padEnd(72)}$${grandTotal.toFixed(4)}`); +} + // Run the arena runArena().catch((error) => { console.error("Error running arena:", error); From fe8547c230915c9884fa008ad64f99782ae99b36 Mon Sep 17 00:00:00 2001 From: Theo Browne Date: Sat, 29 Nov 2025 04:28:11 -0800 Subject: [PATCH 04/10] fully working --- constants.ts | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/constants.ts b/constants.ts index 
c1a2d2b..3e5f714 100644 --- a/constants.ts +++ b/constants.ts @@ -17,7 +17,7 @@ export const PARALLEL_LIMIT = 20; // Essay topics export const TOPICS = [ - "The role of failure in personal growth", + // "The role of failure in personal growth", // "Why boredom is underrated", // "The ethics of artificial intelligence", // "How social media reshapes human connection", @@ -25,7 +25,7 @@ export const TOPICS = [ // "Why we should embrace uncertainty", // "The hidden costs of convenience", // "What makes a good explanation", - // "The relationship between creativity and constraint", + "The relationship between creativity and constraint", // "Why some ideas spread and others don't", ] as const; @@ -40,12 +40,12 @@ export const modelsToRun: RunnableModel[] = [ // Anthropic { name: "claude-4.5-opus-reasoning", - llm: openrouter("anthropic/claude-opus-4-5"), + llm: openrouter("anthropic/claude-opus-4.5"), reasoning: true, }, { name: "claude-4.5-opus-non-reasoning", - llm: openrouter("anthropic/claude-opus-4-5"), + llm: openrouter("anthropic/claude-opus-4.5"), reasoning: false, }, @@ -56,13 +56,13 @@ export const modelsToRun: RunnableModel[] = [ reasoning: false, }, { - name: "gpt-5", + name: "gpt-5.1", llm: openrouter("openai/gpt-5"), reasoning: true, }, { - name: "gpt-5-chat", - llm: openrouter("openai/gpt-5-chat"), + name: "gpt-5.1-chat", + llm: openrouter("openai/gpt-5.1-chat"), reasoning: false, }, { From bfde9d7ec24808d34b2dab20e7ea14d51973599e Mon Sep 17 00:00:00 2001 From: Theo Browne Date: Sat, 29 Nov 2025 13:03:35 -0800 Subject: [PATCH 05/10] all working --- aiClient.ts | 92 +++++- constants.ts | 71 ++-- fileUtils.ts | 195 ++++++++++- index.ts | 891 +++++++++++++++++++++++++++++++++++++++++++-------- 4 files changed, 1066 insertions(+), 183 deletions(-) diff --git a/aiClient.ts b/aiClient.ts index ea38cb6..4007beb 100644 --- a/aiClient.ts +++ b/aiClient.ts @@ -1,6 +1,20 @@ import { generateText } from "ai"; import type { RunnableModel } from "./constants"; +/** + * Extracts cost from OpenRouter provider metadata. + */ +function extractCost( + providerMetadata: Record | undefined +): number { + if (!providerMetadata) return 0; + const openrouterMeta = providerMetadata.openrouter as any; + if (openrouterMeta?.usage?.cost) { + return openrouterMeta.usage.cost; + } + return 0; +} + export interface TokenUsage { inputTokens: number; outputTokens: number; @@ -30,6 +44,12 @@ export interface ScoreResult { usage: TokenUsage; } +export interface CompareResult { + winner: "A" | "B" | "tie"; + reasoning: string; + usage: TokenUsage; +} + /** * Generates an essay based on the given topic prompt. */ @@ -51,7 +71,7 @@ Write approximately 800-1200 words.`, inputTokens: result.usage?.inputTokens ?? 0, outputTokens: result.usage?.outputTokens ?? 0, totalTokens: result.usage?.totalTokens ?? 0, - cost: (result.providerMetadata?.openrouter?.cost as number) ?? 0, + cost: extractCost(result.providerMetadata), }, }; } @@ -78,7 +98,7 @@ Be thorough but encouraging. Focus on actionable improvements.`, inputTokens: result.usage?.inputTokens ?? 0, outputTokens: result.usage?.outputTokens ?? 0, totalTokens: result.usage?.totalTokens ?? 0, - cost: (result.providerMetadata?.openrouter?.cost as number) ?? 0, + cost: extractCost(result.providerMetadata), }, }; } @@ -106,7 +126,7 @@ Produce a complete revised essay, not just suggestions.`, inputTokens: result.usage?.inputTokens ?? 0, outputTokens: result.usage?.outputTokens ?? 0, totalTokens: result.usage?.totalTokens ?? 
0, - cost: (result.providerMetadata?.openrouter?.cost as number) ?? 0, + cost: extractCost(result.providerMetadata), }, }; } @@ -150,7 +170,71 @@ IMPORTANT: Start your response with EXACTLY "Score: X/10" on the first line (whe inputTokens: result.usage?.inputTokens ?? 0, outputTokens: result.usage?.outputTokens ?? 0, totalTokens: result.usage?.totalTokens ?? 0, - cost: (result.providerMetadata?.openrouter?.cost as number) ?? 0, + cost: extractCost(result.providerMetadata), + }, + }; +} + +/** + * Compares two essays head-to-head and picks a winner. + */ +export async function compareEssays( + judge: RunnableModel, + essayA: { author: string; text: string }, + essayB: { author: string; text: string }, + topic: string +): Promise { + const result = await generateText({ + model: judge.llm, + system: `You are an expert essay judge conducting a head-to-head comparison. You will be shown two essays on the same topic, labeled Essay A and Essay B. + +Compare them based on: +- Clarity and coherence of argument +- Quality of writing (style, grammar, flow) +- Depth of insight and originality +- Relevance to the topic +- Overall effectiveness + +You MUST pick a winner. Only declare a tie if the essays are genuinely indistinguishable in quality. + +IMPORTANT: Start your response with EXACTLY one of these on the first line: +- "Winner: A" (if Essay A is better) +- "Winner: B" (if Essay B is better) +- "Winner: Tie" (only if truly equal) + +Then provide your detailed reasoning below, explaining why you chose that winner.`, + prompt: `Topic: ${topic} + +Essay A: +${essayA.text} + +Essay B: +${essayB.text} + +Compare these essays and pick a winner.`, + }); + + // Parse winner from the text + const winnerMatch = result.text.match(/Winner:\s*(A|B|Tie)/i); + let winner: "A" | "B" | "tie" = "tie"; + if (winnerMatch) { + const parsed = winnerMatch[1]!.toUpperCase(); + if (parsed === "A") winner = "A"; + else if (parsed === "B") winner = "B"; + else winner = "tie"; + } + + // Everything after the winner line is the reasoning + const reasoning = result.text.replace(/^Winner:\s*(A|B|Tie)\s*/i, "").trim(); + + return { + winner, + reasoning, + usage: { + inputTokens: result.usage?.inputTokens ?? 0, + outputTokens: result.usage?.outputTokens ?? 0, + totalTokens: result.usage?.totalTokens ?? 
0, + cost: extractCost(result.providerMetadata), }, }; } diff --git a/constants.ts b/constants.ts index 3e5f714..8f8482a 100644 --- a/constants.ts +++ b/constants.ts @@ -27,6 +27,7 @@ export const TOPICS = [ // "What makes a good explanation", "The relationship between creativity and constraint", // "Why some ideas spread and others don't", + "the negative impacts on society from artificial intelligence", ] as const; // Model definition @@ -43,33 +44,33 @@ export const modelsToRun: RunnableModel[] = [ llm: openrouter("anthropic/claude-opus-4.5"), reasoning: true, }, - { - name: "claude-4.5-opus-non-reasoning", - llm: openrouter("anthropic/claude-opus-4.5"), - reasoning: false, - }, + // { + // name: "claude-4.5-opus-non-reasoning", + // llm: openrouter("anthropic/claude-opus-4.5"), + // reasoning: false, + // }, // OpenAI - { - name: "gpt-4o", - llm: openrouter("openai/gpt-4o"), - reasoning: false, - }, + // { + // name: "gpt-4o", + // llm: openrouter("openai/gpt-4o"), + // reasoning: false, + // }, { name: "gpt-5.1", llm: openrouter("openai/gpt-5"), reasoning: true, }, - { - name: "gpt-5.1-chat", - llm: openrouter("openai/gpt-5.1-chat"), - reasoning: false, - }, - { - name: "gpt-5-mini", - llm: openrouter("openai/gpt-5-mini"), - reasoning: true, - }, + // { + // name: "gpt-5.1-chat", + // llm: openrouter("openai/gpt-5.1-chat"), + // reasoning: false, + // }, + // { + // name: "gpt-5-mini", + // llm: openrouter("openai/gpt-5-mini"), + // reasoning: true, + // }, // Google { @@ -77,25 +78,25 @@ export const modelsToRun: RunnableModel[] = [ llm: openrouter("google/gemini-3-pro-preview"), reasoning: true, }, - { - name: "gemini-2.5-pro", - llm: openrouter("google/gemini-2.5-pro"), - reasoning: true, - }, + // { + // name: "gemini-2.5-pro", + // llm: openrouter("google/gemini-2.5-pro"), + // reasoning: true, + // }, // Grok - { - name: "grok-4.1-fast", - llm: openrouter("x-ai/grok-4.1-fast"), - reasoning: true, - }, + // { + // name: "grok-4.1-fast", + // llm: openrouter("x-ai/grok-4.1-fast"), + // reasoning: true, + // }, // Open Weight - { - name: "kimi-k2", - llm: openrouter("moonshotai/kimi-k2"), - reasoning: false, - }, + // { + // name: "kimi-k2", + // llm: openrouter("moonshotai/kimi-k2"), + // reasoning: false, + // }, { name: "kimi-k2-thinking", llm: openrouter("moonshotai/kimi-k2-thinking"), diff --git a/fileUtils.ts b/fileUtils.ts index cc42eae..ed96fed 100644 --- a/fileUtils.ts +++ b/fileUtils.ts @@ -3,6 +3,8 @@ import { join } from "path"; const RESULTS_DIR = "results"; +export type TestType = "scoring-test" | "1v1"; + export interface TopicResults { topic: string; essays: Record; @@ -49,6 +51,63 @@ export interface ArenaResults { }; } +// 1v1 specific types +export interface ComparisonResult { + judge: string; + essayA: { author: string; reviewer?: string }; + essayB: { author: string; reviewer?: string }; + winner: "A" | "B" | "tie"; + reasoning: string; +} + +export interface OneVsOneTopicResults { + topic: string; + essays: Record; + feedback: Record>; + revisions: Record>; + comparisons: ComparisonResult[]; + rankings: { + essays: Array<{ + author: string; + reviewer?: string; + wins: number; + losses: number; + ties: number; + winRate: number; + }>; + }; +} + +export interface OneVsOneResults { + timestamp: string; + models: string[]; + topics: OneVsOneTopicResults[]; + aggregateRankings: { + essays: Array<{ + author: string; + wins: number; + losses: number; + ties: number; + winRate: number; + }>; + reviewers: Array<{ + reviewer: string; + wins: number; + losses: number; + 
ties: number; + winRate: number; + }>; + pairings: Array<{ + author: string; + reviewer: string; + wins: number; + losses: number; + ties: number; + winRate: number; + }>; + }; +} + /** * Generates a timestamp string for filenames. */ @@ -216,9 +275,141 @@ export async function writeSummary(baseDir: string, results: ArenaResults) { /** * Creates a new arena run and returns the base directory and timestamp. */ -export async function initArenaRun() { +export async function initArenaRun(testType: TestType) { const timestamp = getTimestamp(); - const baseDir = join(RESULTS_DIR, timestamp); + const baseDir = join(RESULTS_DIR, testType, timestamp); await mkdir(baseDir, { recursive: true }); return { baseDir, timestamp }; } + +/** + * Writes a comparison result to the comparisons directory. + */ +export async function writeComparison( + topicDir: string, + judge: string, + essayA: { author: string; reviewer?: string }, + essayB: { author: string; reviewer?: string }, + winner: "A" | "B" | "tie", + reasoning: string +) { + const comparisonsDir = join(topicDir, "comparisons"); + await mkdir(comparisonsDir, { recursive: true }); + + const essayALabel = essayA.reviewer + ? `${sanitizeName(essayA.author)}-revised-by-${sanitizeName( + essayA.reviewer + )}` + : sanitizeName(essayA.author); + const essayBLabel = essayB.reviewer + ? `${sanitizeName(essayB.author)}-revised-by-${sanitizeName( + essayB.reviewer + )}` + : sanitizeName(essayB.author); + + const filename = `${sanitizeName(judge)}-${essayALabel}-vs-${essayBLabel}.md`; + const path = join(comparisonsDir, filename); + + const essayADisplay = essayA.reviewer + ? `${essayA.author} (revised by ${essayA.reviewer})` + : essayA.author; + const essayBDisplay = essayB.reviewer + ? `${essayB.author} (revised by ${essayB.reviewer})` + : essayB.author; + + const winnerDisplay = + winner === "A" ? essayADisplay : winner === "B" ? essayBDisplay : "Tie"; + + await writeFile( + path, + `# Comparison by ${judge}\n\n**Essay A:** ${essayADisplay}\n**Essay B:** ${essayBDisplay}\n\n**Winner:** ${winnerDisplay}\n\n## Reasoning\n\n${reasoning}`, + "utf-8" + ); + return path; +} + +/** + * Writes the 1v1 results JSON file. + */ +export async function writeOneVsOneResultsJson( + baseDir: string, + results: OneVsOneResults +) { + const path = join(baseDir, "results.json"); + await writeFile(path, JSON.stringify(results, null, 2), "utf-8"); + return path; +} + +/** + * Generates and writes the 1v1 summary markdown file. 
+ */ +export async function writeOneVsOneSummary( + baseDir: string, + results: OneVsOneResults +) { + const path = join(baseDir, "summary.md"); + + let content = `# 1v1 Arena Results\n\n`; + content += `**Date:** ${results.timestamp}\n\n`; + content += `**Models:** ${results.models.length}\n\n`; + content += `**Topics:** ${results.topics.length}\n\n`; + + // Aggregate Model Rankings (as Writers) + content += `## Aggregate Model Rankings (as Writers)\n\n`; + content += `| Rank | Model | Wins | Losses | Ties | Win Rate |\n`; + content += `|------|-------|------|--------|------|----------|\n`; + + results.aggregateRankings.essays.forEach((entry, index) => { + content += `| ${index + 1} | ${entry.author} | ${entry.wins} | ${ + entry.losses + } | ${entry.ties} | ${(entry.winRate * 100).toFixed(1)}% |\n`; + }); + + // Aggregate Reviewer Rankings + content += `\n## Aggregate Reviewer Rankings\n\n`; + content += `| Rank | Reviewer | Wins | Losses | Ties | Win Rate |\n`; + content += `|------|----------|------|--------|------|----------|\n`; + + results.aggregateRankings.reviewers.forEach((entry, index) => { + content += `| ${index + 1} | ${entry.reviewer} | ${entry.wins} | ${ + entry.losses + } | ${entry.ties} | ${(entry.winRate * 100).toFixed(1)}% |\n`; + }); + + // Aggregate Pairing Rankings + content += `\n## Aggregate Pairing Rankings (Author + Reviewer)\n\n`; + content += `| Rank | Author | Reviewer | Wins | Losses | Ties | Win Rate |\n`; + content += `|------|--------|----------|------|--------|------|----------|\n`; + + results.aggregateRankings.pairings.forEach((entry, index) => { + content += `| ${index + 1} | ${entry.author} | ${entry.reviewer} | ${ + entry.wins + } | ${entry.losses} | ${entry.ties} | ${(entry.winRate * 100).toFixed( + 1 + )}% |\n`; + }); + + // Per-topic summaries + content += `\n## Per-Topic Results\n\n`; + + for (const topic of results.topics) { + content += `### ${topic.topic}\n\n`; + + content += `| Rank | Essay | Wins | Losses | Ties | Win Rate |\n`; + content += `|------|-------|------|--------|------|----------|\n`; + + topic.rankings.essays.forEach((entry, index) => { + const label = entry.reviewer + ? `${entry.author} (← ${entry.reviewer})` + : entry.author; + content += `| ${index + 1} | ${label} | ${entry.wins} | ${ + entry.losses + } | ${entry.ties} | ${(entry.winRate * 100).toFixed(1)}% |\n`; + }); + + content += `\n`; + } + + await writeFile(path, content, "utf-8"); + return path; +} diff --git a/index.ts b/index.ts index 98876f3..5dc0471 100644 --- a/index.ts +++ b/index.ts @@ -4,6 +4,7 @@ import { reviewEssay, reviseEssay, scoreEssay, + compareEssays, type TokenUsage, } from "./aiClient"; import { @@ -11,11 +12,8 @@ import { dryRunModels, PARALLEL_LIMIT, TOPICS, + type RunnableModel, } from "./constants"; - -// Parse CLI flags -const isDryRun = process.argv.includes("--dry-run"); -const modelsToRun = isDryRun ? dryRunModels : allModels; import { createTopicDirectories, initArenaRun, @@ -24,10 +22,33 @@ import { writeResultsJson, writeRevision, writeSummary, + writeComparison, + writeOneVsOneResultsJson, + writeOneVsOneSummary, type ArenaResults, type TopicResults, + type TestType, + type OneVsOneResults, + type OneVsOneTopicResults, + type ComparisonResult, } from "./fileUtils"; +// Parse CLI flags +const isDryRun = process.argv.includes("--dry-run"); +const modelsToRun = isDryRun ? 
dryRunModels : allModels; + +// Parse --test argument +function getTestTypeFromArgs(): TestType | null { + const testArg = process.argv.find((arg) => arg.startsWith("--test=")); + if (!testArg) return null; + const value = testArg.split("=")[1]; + if (value === "scoring-test" || value === "1v1") { + return value; + } + console.error(`Invalid test type: ${value}. Use "scoring-test" or "1v1".`); + process.exit(1); +} + const limit = pLimit(PARALLEL_LIMIT); /** @@ -38,6 +59,7 @@ interface UsageTracker { reviews: Record; revisions: Record; scores: Record; + comparisons: Record; } function createUsageTracker(): UsageTracker { @@ -46,112 +68,50 @@ function createUsageTracker(): UsageTracker { reviews: {}, revisions: {}, scores: {}, + comparisons: {}, }; for (const model of modelsToRun) { tracker.essays[model.name] = []; tracker.reviews[model.name] = []; tracker.revisions[model.name] = []; tracker.scores[model.name] = []; + tracker.comparisons[model.name] = []; } return tracker; } -const usageTracker = createUsageTracker(); +let usageTracker = createUsageTracker(); /** - * Counts the actual API calls for each phase based on model configuration. + * Interactive test type selection UI. */ -function countApiCalls() { - let essays = 0; - let feedback = 0; - let revisions = 0; - let scores = 0; - - // Per topic counts - for (const _topic of TOPICS) { - // Phase 1: Essays - for (const _model of modelsToRun) { - essays++; - } - - // Phase 2: Feedback (each model reviews every OTHER model's essay) - for (const reviewer of modelsToRun) { - for (const author of modelsToRun) { - if (reviewer.name === author.name) continue; - feedback++; - } - } - - // Phase 3: Revisions (each author revises for each reviewer's feedback) - for (const author of modelsToRun) { - for (const reviewer of modelsToRun) { - if (author.name === reviewer.name) continue; - revisions++; - } - } - - // Phase 4: Scoring (every model scores every essay) - // Original essays - for (const _judge of modelsToRun) { - for (const _author of modelsToRun) { - scores++; - } - } - // Revised essays - for (const _judge of modelsToRun) { - for (const author of modelsToRun) { - for (const reviewer of modelsToRun) { - if (author.name === reviewer.name) continue; - scores++; - } - } - } - } - - return { - essays, - feedback, - revisions, - scores, - total: essays + feedback + revisions + scores, - }; -} - -/** - * Prompts the user for confirmation before running the arena. - */ -async function confirmRun(): Promise { - const { essays, feedback, revisions, scores, total } = countApiCalls(); - +async function selectTestType(): Promise { console.log("\n🏟️ Writing Quality Arena\n"); - if (isDryRun) { - console.log("⚡ DRY RUN MODE (using cheap models)\n"); - } - console.log(`Models: ${modelsToRun.length}`); - console.log(`Topics: ${TOPICS.length}`); - console.log(`\nAPI Call Breakdown (across all ${TOPICS.length} topics):`); - console.log(` Phase 1 - Essays: ${essays.toString().padStart(6)} calls`); - console.log( - ` Phase 2 - Feedback: ${feedback.toString().padStart(6)} calls` - ); - console.log( - ` Phase 3 - Revisions: ${revisions.toString().padStart(6)} calls` - ); - console.log(` Phase 4 - Scores: ${scores.toString().padStart(6)} calls`); - console.log(` ────────────────────────────`); - console.log(` Total: ${total.toString().padStart(6)} calls\n`); - console.log(`Parallelism: ${PARALLEL_LIMIT} concurrent requests\n`); + console.log("Select test type:\n"); + console.log(" 1. scoring-test - Models score essays on a 1-10 scale"); + console.log(" 2. 
1v1 - Head-to-head essay comparisons\n"); - process.stdout.write("Proceed? (Y/n): "); + process.stdout.write("Enter choice (1 or 2): "); return new Promise((resolve) => { process.stdin.once("data", (data) => { - const input = data.toString().trim().toLowerCase(); - resolve(input === "" || input === "y" || input === "yes"); + const input = data.toString().trim(); + if (input === "1" || input === "scoring-test") { + resolve("scoring-test"); + } else if (input === "2" || input === "1v1") { + resolve("1v1"); + } else { + console.log("Invalid choice, defaulting to scoring-test"); + resolve("scoring-test"); + } }); }); } +// ============================================================================ +// SHARED PHASES (used by both test types) +// ============================================================================ + /** * Phase 1: Each model generates an essay on the topic. */ @@ -286,8 +246,99 @@ async function runPhase3Revisions( return revisions; } +// ============================================================================ +// SCORING TEST SPECIFIC +// ============================================================================ + /** - * Phase 4: Every model scores every essay (original and revised). + * Counts API calls for scoring test. + */ +function countScoringApiCalls() { + let essays = 0; + let feedback = 0; + let revisions = 0; + let scores = 0; + + for (const _topic of TOPICS) { + for (const _model of modelsToRun) { + essays++; + } + + for (const reviewer of modelsToRun) { + for (const author of modelsToRun) { + if (reviewer.name === author.name) continue; + feedback++; + } + } + + for (const author of modelsToRun) { + for (const reviewer of modelsToRun) { + if (author.name === reviewer.name) continue; + revisions++; + } + } + + for (const _judge of modelsToRun) { + for (const _author of modelsToRun) { + scores++; + } + } + for (const _judge of modelsToRun) { + for (const author of modelsToRun) { + for (const reviewer of modelsToRun) { + if (author.name === reviewer.name) continue; + scores++; + } + } + } + } + + return { + essays, + feedback, + revisions, + scores, + total: essays + feedback + revisions + scores, + }; +} + +/** + * Prompts for scoring test confirmation. + */ +async function confirmScoringRun(): Promise { + const { essays, feedback, revisions, scores, total } = countScoringApiCalls(); + + console.log("\n🏟️ Writing Quality Arena - Scoring Test\n"); + if (isDryRun) { + console.log("⚡ DRY RUN MODE (using cheap models)\n"); + } + console.log(`Models: ${modelsToRun.length}`); + console.log(`Topics: ${TOPICS.length}`); + console.log(`\nAPI Call Breakdown (across all ${TOPICS.length} topics):`); + console.log(` Phase 1 - Essays: ${essays.toString().padStart(6)} calls`); + console.log( + ` Phase 2 - Feedback: ${feedback.toString().padStart(6)} calls` + ); + console.log( + ` Phase 3 - Revisions: ${revisions.toString().padStart(6)} calls` + ); + console.log(` Phase 4 - Scores: ${scores.toString().padStart(6)} calls`); + console.log(` ────────────────────────────`); + console.log(` Total: ${total.toString().padStart(6)} calls\n`); + console.log(`Parallelism: ${PARALLEL_LIMIT} concurrent requests\n`); + + process.stdout.write("Proceed? (Y/n): "); + + return new Promise((resolve) => { + process.stdin.once("data", (data) => { + const input = data.toString().trim().toLowerCase(); + resolve(input === "" || input === "y" || input === "yes"); + }); + }); +} + +/** + * Phase 4 (Scoring): Every model scores every essay. 
*/ async function runPhase4Scoring( topic: string, @@ -312,7 +363,6 @@ async function runPhase4Scoring( Record> > = {}; - // Initialize nested objects for (const judge of modelsToRun) { originalScores[judge.name] = {}; revisedScores[judge.name] = {}; @@ -323,7 +373,6 @@ async function runPhase4Scoring( const tasks: Array> = []; - // Score original essays for (const judge of modelsToRun) { for (const author of modelsToRun) { tasks.push( @@ -348,7 +397,6 @@ async function runPhase4Scoring( } } - // Score revised essays for (const judge of modelsToRun) { for (const author of modelsToRun) { for (const reviewer of modelsToRun) { @@ -386,7 +434,7 @@ async function runPhase4Scoring( /** * Calculate rankings from scores for a single topic. */ -function calculateRankings(scores: { +function calculateScoringRankings(scores: { original: Record< string, Record @@ -407,7 +455,6 @@ function calculateRankings(scores: { const firstJudge = judges[0]!; const authors = Object.keys(scores.original[firstJudge]!); - // Calculate average scores for original essays for (const author of authors) { const judgeScores = judges.map((j) => scores.original[j]![author]!.score); const avgScore = @@ -415,7 +462,6 @@ function calculateRankings(scores: { essayScores.push({ type: "original", author, avgScore }); } - // Calculate average scores for revised essays for (const author of authors) { for (const reviewer of authors) { if (author === reviewer) continue; @@ -428,10 +474,8 @@ function calculateRankings(scores: { } } - // Sort by average score descending essayScores.sort((a, b) => b.avgScore - a.avgScore); - // Calculate reviewer impact (average improvement from their feedback) const reviewerImpact: Record = {}; for (const reviewer of authors) { reviewerImpact[reviewer] = []; @@ -462,7 +506,6 @@ function calculateRankings(scores: { }) ); - // Sort by average improvement descending reviewerScores.sort((a, b) => b.avgImprovement - a.avgImprovement); return { @@ -472,21 +515,18 @@ function calculateRankings(scores: { } /** - * Calculate aggregate rankings across all topics. + * Calculate aggregate rankings across all topics for scoring test. 
*/ -function calculateAggregateRankings( +function calculateScoringAggregateRankings( topics: TopicResults[] ): ArenaResults["aggregateRankings"] { - // Aggregate scores per model (as writer) const modelScores: Record< string, { scores: number[]; improvements: number[] } > = {}; - // Aggregate improvements per reviewer const reviewerImprovements: Record = {}; for (const topic of topics) { - // Get original essay scores per author const originalByAuthor: Record = {}; for (const entry of topic.rankings.essays) { if (entry.type === "original") { @@ -498,7 +538,6 @@ function calculateAggregateRankings( } } - // Calculate improvement for revised essays for (const entry of topic.rankings.essays) { if (entry.type === "revised" && entry.reviewer) { const original = originalByAuthor[entry.author]!; @@ -513,7 +552,6 @@ function calculateAggregateRankings( } } - // Calculate averages for essays const essayRankings = Object.entries(modelScores).map(([author, data]) => ({ author, avgScore: data.scores.reduce((a, b) => a + b, 0) / data.scores.length, @@ -525,7 +563,6 @@ function calculateAggregateRankings( })); essayRankings.sort((a, b) => b.avgScore - a.avgScore); - // Calculate averages for reviewers const reviewerRankings = Object.entries(reviewerImprovements).map( ([reviewer, improvements]) => ({ reviewer, @@ -542,9 +579,9 @@ function calculateAggregateRankings( } /** - * Run all phases for a single topic. + * Run all phases for a single topic (scoring test). */ -async function runTopicArena( +async function runScoringTopicArena( topic: string, topicIndex: number, totalTopics: number, @@ -558,29 +595,24 @@ async function runTopicArena( const topicDir = await createTopicDirectories(baseDir, topic); - // Phase 1: Generate essays console.log("\n 📝 Phase 1: Essay Generation"); const essays = await runPhase1Essays(topic, topicDir); console.log(` ✓ Phase 1 complete: ${modelsToRun.length} essays`); - // Phase 2: Generate feedback console.log("\n 📋 Phase 2: Feedback Generation"); const feedback = await runPhase2Feedback(topic, essays, topicDir); const feedbackCount = modelsToRun.length * (modelsToRun.length - 1); console.log(` ✓ Phase 2 complete: ${feedbackCount} feedback pieces`); - // Phase 3: Generate revisions console.log("\n ✏️ Phase 3: Revisions"); const revisions = await runPhase3Revisions(topic, essays, feedback, topicDir); console.log(` ✓ Phase 3 complete: ${feedbackCount} revisions`); - // Phase 4: Score all essays console.log("\n ⭐ Phase 4: Scoring"); const scores = await runPhase4Scoring(topic, essays, revisions); console.log(` ✓ Phase 4 complete`); - // Calculate rankings for this topic - const rankings = calculateRankings(scores); + const rankings = calculateScoringRankings(scores); return { topic, @@ -593,32 +625,53 @@ async function runTopicArena( } /** - * Main arena orchestration. + * Formats duration in milliseconds to human-readable string. */ -async function runArena(): Promise { - const confirmed = await confirmRun(); +function formatDuration(ms: number): string { + if (ms < 1000) return `${ms}ms`; + const seconds = Math.floor(ms / 1000); + if (seconds < 60) return `${seconds}s`; + const minutes = Math.floor(seconds / 60); + const remainingSeconds = seconds % 60; + if (minutes < 60) return `${minutes}m ${remainingSeconds}s`; + const hours = Math.floor(minutes / 60); + const remainingMinutes = minutes % 60; + return `${hours}h ${remainingMinutes}m ${remainingSeconds}s`; +} + +/** + * Main scoring test orchestration. 
+ */ +async function runScoringTest(): Promise { + usageTracker = createUsageTracker(); + + const confirmed = await confirmScoringRun(); if (!confirmed) { console.log("\nAborted."); process.exit(0); } - const { baseDir, timestamp } = await initArenaRun(); + const overallStart = Date.now(); + + const { baseDir, timestamp } = await initArenaRun("scoring-test"); console.log(`\nResults will be saved to: ${baseDir}`); - // Run arena for each topic const topicResults: TopicResults[] = []; + const topicTimes: Array<{ topic: string; duration: number }> = []; for (let i = 0; i < TOPICS.length; i++) { const topic = TOPICS[i]!; - const result = await runTopicArena(topic, i, TOPICS.length, baseDir); + const topicStart = Date.now(); + const result = await runScoringTopicArena(topic, i, TOPICS.length, baseDir); + const topicDuration = Date.now() - topicStart; topicResults.push(result); + topicTimes.push({ topic, duration: topicDuration }); + console.log(` ⏱️ Topic completed in ${formatDuration(topicDuration)}`); } - // Calculate aggregate rankings console.log("\n\n📊 Calculating aggregate rankings...\n"); - const aggregateRankings = calculateAggregateRankings(topicResults); + const aggregateRankings = calculateScoringAggregateRankings(topicResults); - // Compile results const results: ArenaResults = { timestamp, models: modelsToRun.map((m) => m.name), @@ -626,16 +679,14 @@ async function runArena(): Promise { aggregateRankings, }; - // Write final results await writeResultsJson(baseDir, results); await writeSummary(baseDir, results); - // Print summary console.log("═".repeat(60)); console.log("\n🏆 AGGREGATE RESULTS\n"); - console.log("Top 5 Models (as Writers):\n"); - aggregateRankings.essays.slice(0, 5).forEach((entry, index) => { + console.log("📝 Models (as Writers):\n"); + aggregateRankings.essays.forEach((entry, index) => { const sign = entry.avgImprovement >= 0 ? "+" : ""; console.log( ` ${index + 1}. ${entry.author} - ${entry.avgScore.toFixed( @@ -644,8 +695,8 @@ async function runArena(): Promise { ); }); - console.log("\n🎯 Top 5 Reviewers (by improvement impact):\n"); - aggregateRankings.reviewers.slice(0, 5).forEach((entry, index) => { + console.log("\n🎯 Reviewers (by improvement impact):\n"); + aggregateRankings.reviewers.forEach((entry, index) => { const sign = entry.avgImprovement >= 0 ? "+" : ""; console.log( ` ${index + 1}. ${ @@ -654,12 +705,541 @@ async function runArena(): Promise { ); }); - // Print usage and cost summary - printUsageSummary(); + printUsageSummary("scoring-test"); + + const overallDuration = Date.now() - overallStart; + console.log("\n" + "═".repeat(60)); + console.log("\n⏱️ RUNTIME SUMMARY\n"); + topicTimes.forEach((t) => { + console.log( + ` ${t.topic.slice(0, 40).padEnd(42)} ${formatDuration(t.duration)}` + ); + }); + console.log(` ${"─".repeat(50)}`); + console.log(` ${"Total".padEnd(42)} ${formatDuration(overallDuration)}`); + + console.log(`\n✨ Scoring test complete! Results saved to: ${baseDir}`); +} + +// ============================================================================ +// 1V1 TEST SPECIFIC +// ============================================================================ + +/** + * Counts API calls for 1v1 test. 
+ */ +function countOneVsOneApiCalls() { + let essays = 0; + let feedback = 0; + let revisions = 0; + let comparisons = 0; + + const n = modelsToRun.length; + + for (const _topic of TOPICS) { + // Phase 1: Essays + essays += n; + + // Phase 2: Feedback (each model reviews every other) + feedback += n * (n - 1); + + // Phase 3: Revisions (each author revises per reviewer) + revisions += n * (n - 1); + + // Phase 4: Comparisons + // Original essays: C(n, 2) pairs = n*(n-1)/2, each judged by n models + const originalPairs = (n * (n - 1)) / 2; + comparisons += originalPairs * n; + + // Revised essays: each author gets (n - 1) revisions, so n * (n - 1) revised essays per topic. + // runPhase4Comparisons pits every essay (original and revised) against every other, so count + // revised-vs-revised pairs plus original-vs-revised cross pairs. + const revisedCount = n * (n - 1); + const revisedPairs = (revisedCount * (revisedCount - 1)) / 2; + comparisons += revisedPairs * n; + comparisons += n * revisedCount * n; // original-vs-revised cross pairs + } + + return { + essays, + feedback, + revisions, + comparisons, + total: essays + feedback + revisions + comparisons, + }; +} + +/** + * Prompts for 1v1 test confirmation. + */ +async function confirmOneVsOneRun(): Promise<boolean> { + const { essays, feedback, revisions, comparisons, total } = + countOneVsOneApiCalls(); + + console.log("\n🏟️ Writing Quality Arena - 1v1 Test\n"); + if (isDryRun) { + console.log("⚡ DRY RUN MODE (using cheap models)\n"); + } + console.log(`Models: ${modelsToRun.length}`); + console.log(`Topics: ${TOPICS.length}`); + console.log(`\nAPI Call Breakdown (across all ${TOPICS.length} topics):`); + console.log( + ` Phase 1 - Essays: ${essays.toString().padStart(6)} calls` + ); + console.log( + ` Phase 2 - Feedback: ${feedback.toString().padStart(6)} calls` + ); + console.log( + ` Phase 3 - Revisions: ${revisions.toString().padStart(6)} calls` + ); + console.log( + ` Phase 4 - Comparisons: ${comparisons.toString().padStart(6)} calls` + ); + console.log(` ────────────────────────────`); + console.log( + ` Total: ${total.toString().padStart(6)} calls\n` + ); + console.log(`Parallelism: ${PARALLEL_LIMIT} concurrent requests\n`); + + process.stdout.write("Proceed? (Y/n): "); + + return new Promise((resolve) => { + process.stdin.once("data", (data) => { + const input = data.toString().trim().toLowerCase(); + resolve(input === "" || input === "y" || input === "yes"); + }); + }); +} + +/** + * Phase 4 (1v1): Head-to-head comparisons of all essays. + */ +async function runPhase4Comparisons( + topic: string, + essays: Record<string, string>, + revisions: Record<string, Record<string, string>>, + topicDir: string +): Promise<ComparisonResult[]> { + const comparisons: ComparisonResult[] = []; + + // Build list of all essays (original + revised) + interface EssayEntry { + author: string; + reviewer?: string; + text: string; + } + + const allEssays: EssayEntry[] = []; + + // Add original essays + for (const author of Object.keys(essays)) { + allEssays.push({ author, text: essays[author]! 
}); + } + + // Add revised essays + for (const author of Object.keys(revisions)) { + for (const reviewer of Object.keys(revisions[author]!)) { + allEssays.push({ + author, + reviewer, + text: revisions[author]![reviewer]!, + }); + } + } + + // Generate all unique pairs + const pairs: Array<[EssayEntry, EssayEntry]> = []; + for (let i = 0; i < allEssays.length; i++) { + for (let j = i + 1; j < allEssays.length; j++) { + pairs.push([allEssays[i]!, allEssays[j]!]); + } + } + + const tasks: Array> = []; + + for (const judge of modelsToRun) { + for (const [essayA, essayB] of pairs) { + tasks.push( + limit(async () => { + const labelA = essayA.reviewer + ? `${essayA.author}←${essayA.reviewer}` + : essayA.author; + const labelB = essayB.reviewer + ? `${essayB.author}←${essayB.reviewer}` + : essayB.author; + + console.log(` ${judge.name} comparing ${labelA} vs ${labelB}...`); + + const result = await compareEssays( + judge, + { author: essayA.author, text: essayA.text }, + { author: essayB.author, text: essayB.text }, + topic + ); + + const comparison: ComparisonResult = { + judge: judge.name, + essayA: { author: essayA.author, reviewer: essayA.reviewer }, + essayB: { author: essayB.author, reviewer: essayB.reviewer }, + winner: result.winner, + reasoning: result.reasoning, + }; + + comparisons.push(comparison); + usageTracker.comparisons[judge.name]!.push(result.usage); + + await writeComparison( + topicDir, + judge.name, + { author: essayA.author, reviewer: essayA.reviewer }, + { author: essayB.author, reviewer: essayB.reviewer }, + result.winner, + result.reasoning + ); + + const winnerLabel = + result.winner === "A" + ? labelA + : result.winner === "B" + ? labelB + : "Tie"; + console.log( + ` ✓ ${judge.name}: ${labelA} vs ${labelB} → ${winnerLabel} (${ + result.usage.totalTokens + } tokens, $${result.usage.cost.toFixed(4)})` + ); + }) + ); + } + } + + await Promise.all(tasks); + return comparisons; +} + +/** + * Calculate rankings from comparisons for a single topic. + */ +function calculateOneVsOneRankings( + comparisons: ComparisonResult[] +): OneVsOneTopicResults["rankings"] { + // Track wins/losses/ties per essay + const stats: Record< + string, + { + wins: number; + losses: number; + ties: number; + author: string; + reviewer?: string; + } + > = {}; + + function getKey(author: string, reviewer?: string) { + return reviewer ? `${author}:${reviewer}` : author; + } + + for (const comp of comparisons) { + const keyA = getKey(comp.essayA.author, comp.essayA.reviewer); + const keyB = getKey(comp.essayB.author, comp.essayB.reviewer); + + if (!stats[keyA]) { + stats[keyA] = { + wins: 0, + losses: 0, + ties: 0, + author: comp.essayA.author, + reviewer: comp.essayA.reviewer, + }; + } + if (!stats[keyB]) { + stats[keyB] = { + wins: 0, + losses: 0, + ties: 0, + author: comp.essayB.author, + reviewer: comp.essayB.reviewer, + }; + } + + if (comp.winner === "A") { + stats[keyA]!.wins++; + stats[keyB]!.losses++; + } else if (comp.winner === "B") { + stats[keyB]!.wins++; + stats[keyA]!.losses++; + } else { + stats[keyA]!.ties++; + stats[keyB]!.ties++; + } + } + + const essays = Object.values(stats).map((s) => ({ + author: s.author, + reviewer: s.reviewer, + wins: s.wins, + losses: s.losses, + ties: s.ties, + winRate: + s.wins + s.losses + s.ties > 0 + ? s.wins / (s.wins + s.losses + s.ties) + : 0, + })); + + essays.sort((a, b) => b.winRate - a.winRate || b.wins - a.wins); + + return { essays }; +} + +/** + * Calculate aggregate rankings across all topics for 1v1 test. 
+ */ +function calculateOneVsOneAggregateRankings( + topics: OneVsOneTopicResults[] +): OneVsOneResults["aggregateRankings"] { + // Aggregate by original author only (not per-revision) + const authorStats: Record< + string, + { wins: number; losses: number; ties: number } + > = {}; + + // Aggregate by reviewer (how well essays do after being revised by this reviewer) + const reviewerStats: Record< + string, + { wins: number; losses: number; ties: number } + > = {}; + + // Aggregate by author+reviewer pairing + const pairingStats: Record< + string, + { + author: string; + reviewer: string; + wins: number; + losses: number; + ties: number; + } + > = {}; + + for (const topic of topics) { + for (const entry of topic.rankings.essays) { + if (!entry.reviewer) { + // Original essay - count for author + if (!authorStats[entry.author]) { + authorStats[entry.author] = { wins: 0, losses: 0, ties: 0 }; + } + authorStats[entry.author]!.wins += entry.wins; + authorStats[entry.author]!.losses += entry.losses; + authorStats[entry.author]!.ties += entry.ties; + } else { + // Revised essay - count for reviewer and pairing + if (!reviewerStats[entry.reviewer]) { + reviewerStats[entry.reviewer] = { wins: 0, losses: 0, ties: 0 }; + } + reviewerStats[entry.reviewer]!.wins += entry.wins; + reviewerStats[entry.reviewer]!.losses += entry.losses; + reviewerStats[entry.reviewer]!.ties += entry.ties; + + const pairingKey = `${entry.author}:${entry.reviewer}`; + if (!pairingStats[pairingKey]) { + pairingStats[pairingKey] = { + author: entry.author, + reviewer: entry.reviewer, + wins: 0, + losses: 0, + ties: 0, + }; + } + pairingStats[pairingKey]!.wins += entry.wins; + pairingStats[pairingKey]!.losses += entry.losses; + pairingStats[pairingKey]!.ties += entry.ties; + } + } + } + + const calcWinRate = (s: { wins: number; losses: number; ties: number }) => + s.wins + s.losses + s.ties > 0 ? s.wins / (s.wins + s.losses + s.ties) : 0; + + const essays = Object.entries(authorStats).map(([author, s]) => ({ + author, + wins: s.wins, + losses: s.losses, + ties: s.ties, + winRate: calcWinRate(s), + })); + essays.sort((a, b) => b.winRate - a.winRate || b.wins - a.wins); + + const reviewers = Object.entries(reviewerStats).map(([reviewer, s]) => ({ + reviewer, + wins: s.wins, + losses: s.losses, + ties: s.ties, + winRate: calcWinRate(s), + })); + reviewers.sort((a, b) => b.winRate - a.winRate || b.wins - a.wins); + + const pairings = Object.values(pairingStats).map((s) => ({ + author: s.author, + reviewer: s.reviewer, + wins: s.wins, + losses: s.losses, + ties: s.ties, + winRate: calcWinRate(s), + })); + pairings.sort((a, b) => b.winRate - a.winRate || b.wins - a.wins); + + return { essays, reviewers, pairings }; +} + +/** + * Run all phases for a single topic (1v1 test). 
+ */ +async function runOneVsOneTopicArena( + topic: string, + topicIndex: number, + totalTopics: number, + baseDir: string +): Promise { + console.log( + `\n${"═".repeat(60)}\n📚 Topic ${ + topicIndex + 1 + }/${totalTopics}: "${topic}"\n${"═".repeat(60)}` + ); + + const topicDir = await createTopicDirectories(baseDir, topic); + + console.log("\n 📝 Phase 1: Essay Generation"); + const essays = await runPhase1Essays(topic, topicDir); + console.log(` ✓ Phase 1 complete: ${modelsToRun.length} essays`); + + console.log("\n 📋 Phase 2: Feedback Generation"); + const feedback = await runPhase2Feedback(topic, essays, topicDir); + const feedbackCount = modelsToRun.length * (modelsToRun.length - 1); + console.log(` ✓ Phase 2 complete: ${feedbackCount} feedback pieces`); + + console.log("\n ✏️ Phase 3: Revisions"); + const revisions = await runPhase3Revisions(topic, essays, feedback, topicDir); + console.log(` ✓ Phase 3 complete: ${feedbackCount} revisions`); + + console.log("\n 🥊 Phase 4: Head-to-Head Comparisons"); + const comparisons = await runPhase4Comparisons( + topic, + essays, + revisions, + topicDir + ); + console.log(` ✓ Phase 4 complete: ${comparisons.length} comparisons`); + + const rankings = calculateOneVsOneRankings(comparisons); + + return { + topic, + essays, + feedback, + revisions, + comparisons, + rankings, + }; +} + +/** + * Main 1v1 test orchestration. + */ +async function runOneVsOneTest(): Promise { + usageTracker = createUsageTracker(); + + const confirmed = await confirmOneVsOneRun(); + if (!confirmed) { + console.log("\nAborted."); + process.exit(0); + } + + const overallStart = Date.now(); + + const { baseDir, timestamp } = await initArenaRun("1v1"); + console.log(`\nResults will be saved to: ${baseDir}`); + + const topicResults: OneVsOneTopicResults[] = []; + const topicTimes: Array<{ topic: string; duration: number }> = []; + + for (let i = 0; i < TOPICS.length; i++) { + const topic = TOPICS[i]!; + const topicStart = Date.now(); + const result = await runOneVsOneTopicArena( + topic, + i, + TOPICS.length, + baseDir + ); + const topicDuration = Date.now() - topicStart; + topicResults.push(result); + topicTimes.push({ topic, duration: topicDuration }); + console.log(` ⏱️ Topic completed in ${formatDuration(topicDuration)}`); + } + + console.log("\n\n📊 Calculating aggregate rankings...\n"); + const aggregateRankings = calculateOneVsOneAggregateRankings(topicResults); + + const results: OneVsOneResults = { + timestamp, + models: modelsToRun.map((m) => m.name), + topics: topicResults, + aggregateRankings, + }; + + await writeOneVsOneResultsJson(baseDir, results); + await writeOneVsOneSummary(baseDir, results); + + console.log("═".repeat(60)); + console.log("\n🏆 AGGREGATE RESULTS\n"); + + console.log("📝 Models (as Writers - Original Essays):\n"); + aggregateRankings.essays.forEach((entry, index) => { + console.log( + ` ${index + 1}. ${entry.author} - ${entry.wins}W/${entry.losses}L/${ + entry.ties + }T (${(entry.winRate * 100).toFixed(1)}% win rate)` + ); + }); + + console.log("\n🎯 Reviewers (by revised essay performance):\n"); + aggregateRankings.reviewers.forEach((entry, index) => { + console.log( + ` ${index + 1}. ${entry.reviewer} - ${entry.wins}W/${entry.losses}L/${ + entry.ties + }T (${(entry.winRate * 100).toFixed(1)}% win rate)` + ); + }); + + console.log("\n🤝 Pairings (Author + Reviewer):\n"); + aggregateRankings.pairings.forEach((entry, index) => { + console.log( + ` ${index + 1}. 
${entry.author} ← ${entry.reviewer} - ${entry.wins}W/${ + entry.losses + }L/${entry.ties}T (${(entry.winRate * 100).toFixed(1)}% win rate)` + ); + }); + + printUsageSummary("1v1"); + + const overallDuration = Date.now() - overallStart; + console.log("\n" + "═".repeat(60)); + console.log("\n⏱️ RUNTIME SUMMARY\n"); + topicTimes.forEach((t) => { + console.log( + ` ${t.topic.slice(0, 40).padEnd(42)} ${formatDuration(t.duration)}` + ); + }); + console.log(` ${"─".repeat(50)}`); + console.log(` ${"Total".padEnd(42)} ${formatDuration(overallDuration)}`); - console.log(`\n✨ Arena complete! Results saved to: ${baseDir}`); + console.log(`\n✨ 1v1 test complete! Results saved to: ${baseDir}`); } +// ============================================================================ +// SHARED UTILITIES +// ============================================================================ + /** * Calculates average tokens from an array of usage records. */ @@ -676,17 +1256,16 @@ function calcAverage(usages: TokenUsage[]) { /** * Prints a summary of token usage and costs. */ -function printUsageSummary() { +function printUsageSummary(testType: TestType) { console.log("\n" + "═".repeat(60)); console.log("\n💰 TOKEN USAGE & COST SUMMARY\n"); - // Calculate phase totals let totalEssayCost = 0; let totalReviewCost = 0; let totalRevisionCost = 0; let totalScoreCost = 0; + let totalComparisonCost = 0; - // Per-model stats const modelStats: Array<{ name: string; essayAvgTokens: number; @@ -696,6 +1275,7 @@ function printUsageSummary() { revisionAvgTokens: number; revisionCost: number; scoreCost: number; + comparisonCost: number; totalCost: number; }> = []; @@ -704,11 +1284,13 @@ function printUsageSummary() { const reviewStats = calcAverage(usageTracker.reviews[model.name]!); const revisionStats = calcAverage(usageTracker.revisions[model.name]!); const scoreStats = calcAverage(usageTracker.scores[model.name]!); + const comparisonStats = calcAverage(usageTracker.comparisons[model.name]!); totalEssayCost += essayStats.cost; totalReviewCost += reviewStats.cost; totalRevisionCost += revisionStats.cost; totalScoreCost += scoreStats.cost; + totalComparisonCost += comparisonStats.cost; modelStats.push({ name: model.name, @@ -719,27 +1301,35 @@ function printUsageSummary() { revisionAvgTokens: revisionStats.tokens, revisionCost: revisionStats.cost, scoreCost: scoreStats.cost, + comparisonCost: comparisonStats.cost, totalCost: essayStats.cost + reviewStats.cost + revisionStats.cost + - scoreStats.cost, + scoreStats.cost + + comparisonStats.cost, }); } const grandTotal = - totalEssayCost + totalReviewCost + totalRevisionCost + totalScoreCost; + totalEssayCost + + totalReviewCost + + totalRevisionCost + + totalScoreCost + + totalComparisonCost; - // Print phase cost breakdown console.log("Phase Costs:"); console.log(` Essays (First): $${totalEssayCost.toFixed(4)}`); console.log(` Reviews: $${totalReviewCost.toFixed(4)}`); console.log(` Revisions (Follow): $${totalRevisionCost.toFixed(4)}`); - console.log(` Scoring: $${totalScoreCost.toFixed(4)}`); + if (testType === "scoring-test") { + console.log(` Scoring: $${totalScoreCost.toFixed(4)}`); + } else { + console.log(` Comparisons: $${totalComparisonCost.toFixed(4)}`); + } console.log(` ────────────────────────────`); console.log(` Total: $${grandTotal.toFixed(4)}`); - // Print per-model breakdown console.log("\n\nPer-Model Token Averages & Costs:\n"); console.log( " Model".padEnd(32) + @@ -764,8 +1354,25 @@ function printUsageSummary() { console.log(` ${"GRAND 
TOTAL".padEnd(72)}$${grandTotal.toFixed(4)}`); } -// Run the arena -runArena().catch((error) => { +// ============================================================================ +// MAIN ENTRY POINT +// ============================================================================ + +async function main() { + let testType = getTestTypeFromArgs(); + + if (!testType) { + testType = await selectTestType(); + } + + if (testType === "scoring-test") { + await runScoringTest(); + } else { + await runOneVsOneTest(); + } +} + +main().catch((error) => { console.error("Error running arena:", error); process.exit(1); }); From 76f5cd5c95be4fadbe1e35bf2f2126e2e0bd38fa Mon Sep 17 00:00:00 2001 From: Theo Browne Date: Sat, 29 Nov 2025 13:21:36 -0800 Subject: [PATCH 06/10] fix costs --- constants.ts | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/constants.ts b/constants.ts index 8f8482a..b9d9ddc 100644 --- a/constants.ts +++ b/constants.ts @@ -37,69 +37,76 @@ export interface RunnableModel { reasoning: boolean; } +// Include "usage" so we can log cost +const defaultProviderOptions = { + usage: { + include: true, + }, +}; + export const modelsToRun: RunnableModel[] = [ // Anthropic { name: "claude-4.5-opus-reasoning", - llm: openrouter("anthropic/claude-opus-4.5"), + llm: openrouter("anthropic/claude-opus-4.5", defaultProviderOptions), reasoning: true, }, // { // name: "claude-4.5-opus-non-reasoning", - // llm: openrouter("anthropic/claude-opus-4.5"), + // llm: openrouter("anthropic/claude-opus-4.5", defaultProviderOptions), // reasoning: false, // }, // OpenAI // { // name: "gpt-4o", - // llm: openrouter("openai/gpt-4o"), + // llm: openrouter("openai/gpt-4o", defaultProviderOptions), // reasoning: false, // }, { name: "gpt-5.1", - llm: openrouter("openai/gpt-5"), + llm: openrouter("openai/gpt-5.1", defaultProviderOptions), reasoning: true, }, // { // name: "gpt-5.1-chat", - // llm: openrouter("openai/gpt-5.1-chat"), + // llm: openrouter("openai/gpt-5.1-chat", defaultProviderOptions), // reasoning: false, // }, // { // name: "gpt-5-mini", - // llm: openrouter("openai/gpt-5-mini"), + // llm: openrouter("openai/gpt-5-mini", defaultProviderOptions), // reasoning: true, // }, // Google { name: "gemini-3-pro-preview", - llm: openrouter("google/gemini-3-pro-preview"), + llm: openrouter("google/gemini-3-pro-preview", defaultProviderOptions), reasoning: true, }, // { // name: "gemini-2.5-pro", - // llm: openrouter("google/gemini-2.5-pro"), + // llm: openrouter("google/gemini-2.5-pro", defaultProviderOptions), // reasoning: true, // }, // Grok // { // name: "grok-4.1-fast", - // llm: openrouter("x-ai/grok-4.1-fast"), + // llm: openrouter("x-ai/grok-4.1-fast", defaultProviderOptions), // reasoning: true, // }, // Open Weight // { // name: "kimi-k2", - // llm: openrouter("moonshotai/kimi-k2"), + // llm: openrouter("moonshotai/kimi-k2", defaultProviderOptions), // reasoning: false, // }, { name: "kimi-k2-thinking", - llm: openrouter("moonshotai/kimi-k2-thinking"), + llm: openrouter("moonshotai/kimi-k2-thinking", defaultProviderOptions), reasoning: true, }, ]; @@ -108,17 +115,17 @@ export const modelsToRun: RunnableModel[] = [ export const dryRunModels: RunnableModel[] = [ { name: "claude-4.5-haiku", - llm: openrouter("anthropic/claude-haiku-4.5"), + llm: openrouter("anthropic/claude-haiku-4.5", defaultProviderOptions), reasoning: false, }, { name: "gemini-2.5-flash", - llm: openrouter("google/gemini-2.5-flash"), + llm: openrouter("google/gemini-2.5-flash", 
defaultProviderOptions), reasoning: true, }, { name: "gpt-5-mini", - llm: openrouter("openai/gpt-5-mini"), + llm: openrouter("openai/gpt-5-mini", defaultProviderOptions), reasoning: true, }, ]; From e8abd3eb115b055c897738479bc2c71d1c565c40 Mon Sep 17 00:00:00 2001 From: Theo Browne Date: Sat, 29 Nov 2025 14:16:21 -0800 Subject: [PATCH 07/10] include upstream --- aiClient.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/aiClient.ts b/aiClient.ts index 4007beb..bd4d0ec 100644 --- a/aiClient.ts +++ b/aiClient.ts @@ -9,9 +9,15 @@ function extractCost( ): number { if (!providerMetadata) return 0; const openrouterMeta = providerMetadata.openrouter as any; + if (openrouterMeta?.usage?.cost) { return openrouterMeta.usage.cost; } + + if (openrouterMeta?.usage?.costDetails?.upstreamInferenceCost) { + return openrouterMeta.usage.costDetails.upstreamInferenceCost; + } + return 0; } From 83de7cfe18669d03547de83cebc6d684d76099bc Mon Sep 17 00:00:00 2001 From: Theo Browne Date: Sun, 30 Nov 2025 14:07:03 -0800 Subject: [PATCH 08/10] better results view --- constants.ts | 8 +++--- index.ts | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 75 insertions(+), 6 deletions(-) diff --git a/constants.ts b/constants.ts index b9d9ddc..e5768d0 100644 --- a/constants.ts +++ b/constants.ts @@ -13,19 +13,19 @@ const openrouter = createOpenRouter({ }); // Parallelism configuration -export const PARALLEL_LIMIT = 20; +export const PARALLEL_LIMIT = 30; // Essay topics export const TOPICS = [ // "The role of failure in personal growth", // "Why boredom is underrated", - // "The ethics of artificial intelligence", - // "How social media reshapes human connection", + "The ethics of artificial intelligence", + "How social media reshapes human connection", // "The value of slow living in a fast world", // "Why we should embrace uncertainty", // "The hidden costs of convenience", // "What makes a good explanation", - "The relationship between creativity and constraint", + // "The relationship between creativity and constraint", // "Why some ideas spread and others don't", "the negative impacts on society from artificial intelligence", ] as const; diff --git a/index.ts b/index.ts index 5dc0471..9a593af 100644 --- a/index.ts +++ b/index.ts @@ -578,6 +578,34 @@ function calculateScoringAggregateRankings( }; } +/** + * Prints topic results for scoring test. + */ +function printScoringTopicResults(result: TopicResults) { + console.log(`\n 📊 Results for "${result.topic}":\n`); + + console.log(" 📝 Essay Rankings (by avg score):"); + result.rankings.essays.slice(0, 5).forEach((entry, index) => { + const label = entry.reviewer + ? `${entry.author} ← ${entry.reviewer} (revised)` + : `${entry.author} (original)`; + console.log(` ${index + 1}. ${label} - ${entry.avgScore.toFixed(2)}`); + }); + if (result.rankings.essays.length > 5) { + console.log(` ... and ${result.rankings.essays.length - 5} more`); + } + + console.log("\n 🎯 Reviewer Rankings (by improvement impact):"); + result.rankings.reviewers.forEach((entry, index) => { + const sign = entry.avgImprovement >= 0 ? "+" : ""; + console.log( + ` ${index + 1}. ${ + entry.reviewer + } - ${sign}${entry.avgImprovement.toFixed(2)}` + ); + }); +} + /** * Run all phases for a single topic (scoring test). 
*/ @@ -666,10 +694,19 @@ async function runScoringTest(): Promise { const topicDuration = Date.now() - topicStart; topicResults.push(result); topicTimes.push({ topic, duration: topicDuration }); - console.log(` ⏱️ Topic completed in ${formatDuration(topicDuration)}`); + printScoringTopicResults(result); + console.log(`\n ⏱️ Topic completed in ${formatDuration(topicDuration)}`); } console.log("\n\n📊 Calculating aggregate rankings...\n"); + + // Log all topic results before aggregate + console.log("═".repeat(60)); + console.log("\n📋 INDIVIDUAL TOPIC RESULTS\n"); + for (const result of topicResults) { + printScoringTopicResults(result); + console.log(""); + } const aggregateRankings = calculateScoringAggregateRankings(topicResults); const results: ArenaResults = { @@ -1091,6 +1128,28 @@ function calculateOneVsOneAggregateRankings( return { essays, reviewers, pairings }; } +/** + * Prints topic results for 1v1 test. + */ +function printOneVsOneTopicResults(result: OneVsOneTopicResults) { + console.log(`\n 📊 Results for "${result.topic}":\n`); + + console.log(" 📝 Essay Rankings (by win rate):"); + result.rankings.essays.slice(0, 5).forEach((entry, index) => { + const label = entry.reviewer + ? `${entry.author} ← ${entry.reviewer} (revised)` + : `${entry.author} (original)`; + console.log( + ` ${index + 1}. ${label} - ${entry.wins}W/${entry.losses}L/${ + entry.ties + }T (${(entry.winRate * 100).toFixed(1)}%)` + ); + }); + if (result.rankings.essays.length > 5) { + console.log(` ... and ${result.rankings.essays.length - 5} more`); + } +} + /** * Run all phases for a single topic (1v1 test). */ @@ -1174,10 +1233,20 @@ async function runOneVsOneTest(): Promise { const topicDuration = Date.now() - topicStart; topicResults.push(result); topicTimes.push({ topic, duration: topicDuration }); - console.log(` ⏱️ Topic completed in ${formatDuration(topicDuration)}`); + printOneVsOneTopicResults(result); + console.log(`\n ⏱️ Topic completed in ${formatDuration(topicDuration)}`); } console.log("\n\n📊 Calculating aggregate rankings...\n"); + + // Log all topic results before aggregate + console.log("═".repeat(60)); + console.log("\n📋 INDIVIDUAL TOPIC RESULTS\n"); + for (const result of topicResults) { + printOneVsOneTopicResults(result); + console.log(""); + } + const aggregateRankings = calculateOneVsOneAggregateRankings(topicResults); const results: OneVsOneResults = { From 479ff83f49ddfb0d28d3cef05f4b3dbc21528d97 Mon Sep 17 00:00:00 2001 From: Theo Browne Date: Sat, 13 Dec 2025 16:22:35 -0800 Subject: [PATCH 09/10] Reviewers --- constants.ts | 16 ++++++++++++++++ index.ts | 11 +++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/constants.ts b/constants.ts index e5768d0..6ac5d61 100644 --- a/constants.ts +++ b/constants.ts @@ -35,6 +35,8 @@ export interface RunnableModel { name: string; llm: LanguageModel; reasoning: boolean; + /** If true, this model will be used as a "reviewer/judge" for comparisons. 
*/ + reviewer: boolean; } // Include "usage" so we can log cost @@ -50,11 +52,13 @@ export const modelsToRun: RunnableModel[] = [ name: "claude-4.5-opus-reasoning", llm: openrouter("anthropic/claude-opus-4.5", defaultProviderOptions), reasoning: true, + reviewer: true, }, // { // name: "claude-4.5-opus-non-reasoning", // llm: openrouter("anthropic/claude-opus-4.5", defaultProviderOptions), // reasoning: false, + // reviewer: true, // }, // OpenAI @@ -62,21 +66,25 @@ export const modelsToRun: RunnableModel[] = [ // name: "gpt-4o", // llm: openrouter("openai/gpt-4o", defaultProviderOptions), // reasoning: false, + // reviewer: true, // }, { name: "gpt-5.1", llm: openrouter("openai/gpt-5.1", defaultProviderOptions), reasoning: true, + reviewer: true, }, // { // name: "gpt-5.1-chat", // llm: openrouter("openai/gpt-5.1-chat", defaultProviderOptions), // reasoning: false, + // reviewer: true, // }, // { // name: "gpt-5-mini", // llm: openrouter("openai/gpt-5-mini", defaultProviderOptions), // reasoning: true, + // reviewer: true, // }, // Google @@ -84,11 +92,13 @@ export const modelsToRun: RunnableModel[] = [ name: "gemini-3-pro-preview", llm: openrouter("google/gemini-3-pro-preview", defaultProviderOptions), reasoning: true, + reviewer: true, }, // { // name: "gemini-2.5-pro", // llm: openrouter("google/gemini-2.5-pro", defaultProviderOptions), // reasoning: true, + // reviewer: true, // }, // Grok @@ -96,6 +106,7 @@ export const modelsToRun: RunnableModel[] = [ // name: "grok-4.1-fast", // llm: openrouter("x-ai/grok-4.1-fast", defaultProviderOptions), // reasoning: true, + // reviewer: true, // }, // Open Weight @@ -103,11 +114,13 @@ export const modelsToRun: RunnableModel[] = [ // name: "kimi-k2", // llm: openrouter("moonshotai/kimi-k2", defaultProviderOptions), // reasoning: false, + // reviewer: true, // }, { name: "kimi-k2-thinking", llm: openrouter("moonshotai/kimi-k2-thinking", defaultProviderOptions), reasoning: true, + reviewer: true, }, ]; @@ -117,15 +130,18 @@ export const dryRunModels: RunnableModel[] = [ name: "claude-4.5-haiku", llm: openrouter("anthropic/claude-haiku-4.5", defaultProviderOptions), reasoning: false, + reviewer: true, }, { name: "gemini-2.5-flash", llm: openrouter("google/gemini-2.5-flash", defaultProviderOptions), reasoning: true, + reviewer: true, }, { name: "gpt-5-mini", llm: openrouter("openai/gpt-5-mini", defaultProviderOptions), reasoning: true, + reviewer: true, }, ]; diff --git a/index.ts b/index.ts index 9a593af..081a66c 100644 --- a/index.ts +++ b/index.ts @@ -12,7 +12,6 @@ import { dryRunModels, PARALLEL_LIMIT, TOPICS, - type RunnableModel, } from "./constants"; import { createTopicDirectories, @@ -36,6 +35,8 @@ import { // Parse CLI flags const isDryRun = process.argv.includes("--dry-run"); const modelsToRun = isDryRun ? dryRunModels : allModels; +const reviewerModels = modelsToRun.filter((m) => m.reviewer); +const comparisonJudges = reviewerModels.length > 0 ? 
reviewerModels : modelsToRun; // Parse --test argument function getTestTypeFromArgs(): TestType | null { @@ -772,6 +773,7 @@ function countOneVsOneApiCalls() { let comparisons = 0; const n = modelsToRun.length; + const judgeCount = comparisonJudges.length; for (const _topic of TOPICS) { // Phase 1: Essays @@ -786,7 +788,7 @@ function countOneVsOneApiCalls() { // Phase 4: Comparisons // Original essays: C(n, 2) pairs = n*(n-1)/2, each judged by n models const originalPairs = (n * (n - 1)) / 2; - comparisons += originalPairs * n; + comparisons += originalPairs * judgeCount; // Revised essays: each author gets (n - 1) revisions, so n * (n - 1) revised essays per topic. // runPhase4Comparisons pits every essay (original and revised) against every other, so count @@ -794,8 +796,8 @@ function countOneVsOneApiCalls() { // revised-vs-revised pairs plus original-vs-revised cross pairs. const revisedCount = n * (n - 1); const revisedPairs = (revisedCount * (revisedCount - 1)) / 2; - comparisons += revisedPairs * n; + comparisons += revisedPairs * judgeCount; - comparisons += n * revisedCount * n; // original-vs-revised cross pairs + comparisons += n * revisedCount * judgeCount; // original-vs-revised cross pairs } return { @@ -818,6 +820,7 @@ async function confirmOneVsOneRun(): Promise<boolean> { console.log("⚡ DRY RUN MODE (using cheap models)\n"); } console.log(`Models: ${modelsToRun.length}`); + console.log(`Comparison judges: ${comparisonJudges.length}`); console.log(`Topics: ${TOPICS.length}`); console.log(`\nAPI Call Breakdown (across all ${TOPICS.length} topics):`); console.log( @@ -894,7 +897,7 @@ async function runPhase4Comparisons( const tasks: Array<Promise<void>> = []; - for (const judge of modelsToRun) { + for (const judge of comparisonJudges) { for (const [essayA, essayB] of pairs) { tasks.push( limit(async () => { From 3d6ca5f3f6d1214daab66b34c2f89640427fa509 Mon Sep 17 00:00:00 2001 From: Theo Browne Date: Sat, 13 Dec 2025 18:00:37 -0800 Subject: [PATCH 10/10] Handle errors --- aiClient.ts | 257 ++++++++++++++++++++++++++++++--------------------- constants.ts | 48 ++++++++-- index.ts | 125 +++++++++++++++++++++---- 3 files changed, 300 insertions(+), 130 deletions(-) diff --git a/aiClient.ts b/aiClient.ts index bd4d0ec..e529076 100644 --- a/aiClient.ts +++ b/aiClient.ts @@ -1,6 +1,38 @@ import { generateText } from "ai"; import type { RunnableModel } from "./constants"; +const MAX_RETRIES = 3; +const RETRY_DELAY_MS = 1000; + +/** + * Retries an async function up to MAX_RETRIES times. + * Returns null if all retries fail. + */ +async function withRetry<T>( + fn: () => Promise<T>, + label: string +): Promise<T | null> { + let lastError: unknown; + for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) { + try { + return await fn(); + } catch (error) { + lastError = error; + const errorMsg = error instanceof Error ? error.message : String(error); + console.error( + ` ⚠️ ${label} failed (attempt ${attempt}/${MAX_RETRIES}): ${errorMsg}` + ); + if (attempt < MAX_RETRIES) { + await new Promise((resolve) => setTimeout(resolve, RETRY_DELAY_MS)); + } + } + } + console.error( + ` ❌ ${label} failed after ${MAX_RETRIES} attempts, dropping: ${String(lastError)}` + ); + return null; +} + /** * Extracts cost from OpenRouter provider metadata. */ @@ -58,96 +90,107 @@ export interface CompareResult { /** * Generates an essay based on the given topic prompt. + * Returns null if all retries fail. */ export async function generateEssay( model: RunnableModel, topic: string -): Promise<EssayResult> { - const result = await generateText({ - model: model.llm, - system: `You are an expert essay writer. 
@@ -58,96 +90,107 @@ export interface CompareResult {
 
 /**
  * Generates an essay based on the given topic prompt.
+ * Returns null if all retries fail.
  */
 export async function generateEssay(
   model: RunnableModel,
   topic: string
-): Promise<EssayResult> {
-  const result = await generateText({
-    model: model.llm,
-    system: `You are an expert essay writer. Write a well-structured, thoughtful essay on the given topic.
+): Promise<EssayResult | null> {
+  return withRetry(async () => {
+    const result = await generateText({
+      model: model.llm,
+      system: `You are an expert essay writer. Write a well-structured, thoughtful essay on the given topic.
 The essay should be clear, engaging, and demonstrate strong writing skills. Write approximately 800-1200 words.`,
-    prompt: `Write an essay on the following topic:\n\n${topic}`,
-  });
-
-  return {
-    text: result.text,
-    usage: {
-      inputTokens: result.usage?.inputTokens ?? 0,
-      outputTokens: result.usage?.outputTokens ?? 0,
-      totalTokens: result.usage?.totalTokens ?? 0,
-      cost: extractCost(result.providerMetadata),
-    },
-  };
+      prompt: `Write an essay on the following topic:\n\n${topic}`,
+    });
+
+    return {
+      text: result.text,
+      usage: {
+        inputTokens: result.usage?.inputTokens ?? 0,
+        outputTokens: result.usage?.outputTokens ?? 0,
+        totalTokens: result.usage?.totalTokens ?? 0,
+        cost: extractCost(result.providerMetadata),
+      },
+    };
+  }, `generateEssay(${model.name})`);
 }
 
 /**
  * Reviews an essay and provides constructive feedback.
+ * Returns null if all retries fail.
  */
 export async function reviewEssay(
   model: RunnableModel,
   essay: string,
   topic: string
-): Promise<EssayResult> {
-  const result = await generateText({
-    model: model.llm,
-    system: `You are an expert writing tutor and editor. Review the essay provided and give constructive,
+): Promise<EssayResult | null> {
+  return withRetry(async () => {
+    const result = await generateText({
+      model: model.llm,
+      system: `You are an expert writing tutor and editor. Review the essay provided and give constructive,
 specific feedback on structure, clarity, argumentation, style, and areas for improvement.
 Be thorough but encouraging. Focus on actionable improvements.`,
-    prompt: `Topic: ${topic}\n\nPlease review the following essay and provide detailed feedback:\n\n${essay}`,
-  });
-
-  return {
-    text: result.text,
-    usage: {
-      inputTokens: result.usage?.inputTokens ?? 0,
-      outputTokens: result.usage?.outputTokens ?? 0,
-      totalTokens: result.usage?.totalTokens ?? 0,
-      cost: extractCost(result.providerMetadata),
-    },
-  };
+      prompt: `Topic: ${topic}\n\nPlease review the following essay and provide detailed feedback:\n\n${essay}`,
+    });
+
+    return {
+      text: result.text,
+      usage: {
+        inputTokens: result.usage?.inputTokens ?? 0,
+        outputTokens: result.usage?.outputTokens ?? 0,
+        totalTokens: result.usage?.totalTokens ?? 0,
+        cost: extractCost(result.providerMetadata),
+      },
+    };
+  }, `reviewEssay(${model.name})`);
 }
 
 /**
  * Revises an essay based on the original topic, original essay, and review feedback.
+ * Returns null if all retries fail.
  */
 export async function reviseEssay(
   model: RunnableModel,
   topic: string,
   originalEssay: string,
   feedback: string
-): Promise<EssayResult> {
-  const result = await generateText({
-    model: model.llm,
-    system: `You are an expert essay writer. Revise the provided essay based on the feedback given,
+): Promise<EssayResult | null> {
+  return withRetry(async () => {
+    const result = await generateText({
+      model: model.llm,
+      system: `You are an expert essay writer. Revise the provided essay based on the feedback given,
 while maintaining the core message and improving the areas identified.
 Produce a complete revised essay, not just suggestions.`,
-    prompt: `Original topic: ${topic}\n\nOriginal essay:\n${originalEssay}\n\nReview feedback:\n${feedback}\n\nPlease revise the essay based on the feedback above.`,
-  });
-
-  return {
-    text: result.text,
-    usage: {
-      inputTokens: result.usage?.inputTokens ?? 0,
-      outputTokens: result.usage?.outputTokens ?? 0,
-      totalTokens: result.usage?.totalTokens ?? 0,
-      cost: extractCost(result.providerMetadata),
-    },
-  };
+      prompt: `Original topic: ${topic}\n\nOriginal essay:\n${originalEssay}\n\nReview feedback:\n${feedback}\n\nPlease revise the essay based on the feedback above.`,
+    });
+
+    return {
+      text: result.text,
+      usage: {
+        inputTokens: result.usage?.inputTokens ?? 0,
+        outputTokens: result.usage?.outputTokens ?? 0,
+        totalTokens: result.usage?.totalTokens ?? 0,
+        cost: extractCost(result.providerMetadata),
+      },
+    };
+  }, `reviseEssay(${model.name})`);
 }
 
 /**
  * Scores an essay on a scale of 1-10 with justification.
+ * Returns null if all retries fail.
  */
 export async function scoreEssay(
   model: RunnableModel,
   essay: string,
   topic: string
-): Promise<ScoreResult> {
-  const result = await generateText({
-    model: model.llm,
-    system: `You are an expert essay judge. Score the essay on a scale of 1-10 based on:
+): Promise<ScoreResult | null> {
+  return withRetry(async () => {
+    const result = await generateText({
+      model: model.llm,
+      system: `You are an expert essay judge. Score the essay on a scale of 1-10 based on:
 - Clarity and coherence of argument
 - Quality of writing (style, grammar, flow)
 - Depth of insight and originality
@@ -157,42 +200,45 @@
 Be fair and consistent in your scoring. A score of 5 is average, 7-8 is good, 9-10 is exceptional.
 
 IMPORTANT: Start your response with EXACTLY "Score: X/10" on the first line (where X is your score), then provide your detailed justification below.`,
-    prompt: `Topic: ${topic}\n\nPlease score the following essay:\n\n${essay}`,
-  });
-
-  // Parse score from the text - look for "Score: X/10" or similar patterns
-  const scoreMatch = result.text.match(/Score:\s*(\d+(?:\.\d+)?)\s*\/\s*10/i);
-  const score = scoreMatch?.[1] ? parseFloat(scoreMatch[1]) : 5; // Default to 5 if parsing fails
-
-  // Everything after the score line is the justification
-  const justification = result.text
-    .replace(/^Score:\s*\d+(?:\.\d+)?\s*\/\s*10\s*/i, "")
-    .trim();
-
-  return {
-    score: Math.min(10, Math.max(1, score)), // Clamp between 1-10
-    justification,
-    usage: {
-      inputTokens: result.usage?.inputTokens ?? 0,
-      outputTokens: result.usage?.outputTokens ?? 0,
-      totalTokens: result.usage?.totalTokens ?? 0,
-      cost: extractCost(result.providerMetadata),
-    },
-  };
+      prompt: `Topic: ${topic}\n\nPlease score the following essay:\n\n${essay}`,
+    });
+
+    // Parse score from the text - look for "Score: X/10" or similar patterns
+    const scoreMatch = result.text.match(/Score:\s*(\d+(?:\.\d+)?)\s*\/\s*10/i);
+    const score = scoreMatch?.[1] ? parseFloat(scoreMatch[1]) : 5; // Default to 5 if parsing fails
+
+    // Everything after the score line is the justification
+    const justification = result.text
+      .replace(/^Score:\s*\d+(?:\.\d+)?\s*\/\s*10\s*/i, "")
+      .trim();
+
+    return {
+      score: Math.min(10, Math.max(1, score)), // Clamp between 1-10
+      justification,
+      usage: {
+        inputTokens: result.usage?.inputTokens ?? 0,
+        outputTokens: result.usage?.outputTokens ?? 0,
+        totalTokens: result.usage?.totalTokens ?? 0,
+        cost: extractCost(result.providerMetadata),
+      },
+    };
+  }, `scoreEssay(${model.name})`);
 }
 
 /**
  * Compares two essays head-to-head and picks a winner.
+ * Returns null if all retries fail.
  */
 export async function compareEssays(
   judge: RunnableModel,
   essayA: { author: string; text: string },
   essayB: { author: string; text: string },
   topic: string
-): Promise<CompareResult> {
-  const result = await generateText({
-    model: judge.llm,
-    system: `You are an expert essay judge conducting a head-to-head comparison. You will be shown two essays on the same topic, labeled Essay A and Essay B.
+): Promise<CompareResult | null> {
+  return withRetry(async () => {
+    const result = await generateText({
+      model: judge.llm,
+      system: `You are an expert essay judge conducting a head-to-head comparison. You will be shown two essays on the same topic, labeled Essay A and Essay B.
 
 Compare them based on:
 - Clarity and coherence of argument
@@ -209,7 +255,7 @@ IMPORTANT: Start your response with EXACTLY one of these on the first line:
 - "Winner: Tie" (only if truly equal)
 
 Then provide your detailed reasoning below, explaining why you chose that winner.`,
-    prompt: `Topic: ${topic}
+      prompt: `Topic: ${topic}
 
 Essay A:
 ${essayA.text}
 
 Essay B:
 ${essayB.text}
 
 Compare these essays and pick a winner.`,
-  });
-
-  // Parse winner from the text
-  const winnerMatch = result.text.match(/Winner:\s*(A|B|Tie)/i);
-  let winner: "A" | "B" | "tie" = "tie";
-  if (winnerMatch) {
-    const parsed = winnerMatch[1]!.toUpperCase();
-    if (parsed === "A") winner = "A";
-    else if (parsed === "B") winner = "B";
-    else winner = "tie";
-  }
-
-  // Everything after the winner line is the reasoning
-  const reasoning = result.text.replace(/^Winner:\s*(A|B|Tie)\s*/i, "").trim();
-
-  return {
-    winner,
-    reasoning,
-    usage: {
-      inputTokens: result.usage?.inputTokens ?? 0,
-      outputTokens: result.usage?.outputTokens ?? 0,
-      totalTokens: result.usage?.totalTokens ?? 0,
-      cost: extractCost(result.providerMetadata),
-    },
-  };
+    });
+
+    // Parse winner from the text
+    const winnerMatch = result.text.match(/Winner:\s*(A|B|Tie)/i);
+    let winner: "A" | "B" | "tie" = "tie";
+    if (winnerMatch) {
+      const parsed = winnerMatch[1]!.toUpperCase();
+      if (parsed === "A") winner = "A";
+      else if (parsed === "B") winner = "B";
+      else winner = "tie";
+    }
+
+    // Everything after the winner line is the reasoning
+    const reasoning = result.text
+      .replace(/^Winner:\s*(A|B|Tie)\s*/i, "")
+      .trim();
+
+    return {
+      winner,
+      reasoning,
+      usage: {
+        inputTokens: result.usage?.inputTokens ?? 0,
+        outputTokens: result.usage?.outputTokens ?? 0,
+        totalTokens: result.usage?.totalTokens ?? 0,
+        cost: extractCost(result.providerMetadata),
+      },
+    };
+  }, `compareEssays(${judge.name})`);
 }
diff --git a/constants.ts b/constants.ts
index 6ac5d61..e4add15 100644
--- a/constants.ts
+++ b/constants.ts
@@ -13,7 +13,22 @@ const openrouter = createOpenRouter({
 });
 
 // Parallelism configuration
-export const PARALLEL_LIMIT = 30;
+export const PARALLEL_LIMIT = 100;
+
+/**
+ * Stagger request start times to avoid huge bursts when running at high concurrency.
+ *
+ * This is applied per phase, and bounded by roughly:
+ *   (PARALLEL_LIMIT - 1) * API_STAGGER_MS
+ *
+ * You can override via env vars:
+ *   - ARENA_API_STAGGER_MS
+ *   - ARENA_API_STAGGER_JITTER_MS
+ */
+export const API_STAGGER_MS =
+  Number.parseInt(process.env.ARENA_API_STAGGER_MS ?? "", 10) || 25;
+export const API_STAGGER_JITTER_MS =
+  Number.parseInt(process.env.ARENA_API_STAGGER_JITTER_MS ?? "", 10) || 25;
 
 // Essay topics
 export const TOPICS = [
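One behavior of the parsing above worth knowing: because the fallback uses `||`, an explicit `0` is treated the same as "unset", so the stagger can be raised or lowered via these variables but not fully disabled. A sketch of the parsing in isolation (the helper name is illustrative, not part of the patch):

```ts
// Sketch of the env fallback above, with illustrative values.
const parseStagger = (raw: string | undefined) =>
  Number.parseInt(raw ?? "", 10) || 25;

parseStagger("50");      // 50
parseStagger(undefined); // 25 (parseInt("") is NaN, which is falsy)
parseStagger("0");       // 25 (0 is falsy, so it falls back to the default)
```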
"", 10) || 25; // Essay topics export const TOPICS = [ @@ -50,7 +65,10 @@ export const modelsToRun: RunnableModel[] = [ // Anthropic { name: "claude-4.5-opus-reasoning", - llm: openrouter("anthropic/claude-opus-4.5", defaultProviderOptions), + llm: openrouter("anthropic/claude-opus-4.5", { + ...defaultProviderOptions, + reasoning: { effort: "high" }, + }), reasoning: true, reviewer: true, }, @@ -68,9 +86,21 @@ export const modelsToRun: RunnableModel[] = [ // reasoning: false, // reviewer: true, // }, + // { + // name: "gpt-5.1", + // llm: openrouter("openai/gpt-5.1", { + // ...defaultProviderOptions, + // reasoning: { effort: "high" }, + // }), + // reasoning: true, + // reviewer: false, + // }, { - name: "gpt-5.1", - llm: openrouter("openai/gpt-5.1", defaultProviderOptions), + name: "gpt-5.2", + llm: openrouter("openai/gpt-5.2", { + ...defaultProviderOptions, + reasoning: { effort: "high" }, + }), reasoning: true, reviewer: true, }, @@ -90,7 +120,10 @@ export const modelsToRun: RunnableModel[] = [ // Google { name: "gemini-3-pro-preview", - llm: openrouter("google/gemini-3-pro-preview", defaultProviderOptions), + llm: openrouter("google/gemini-3-pro-preview", { + ...defaultProviderOptions, + reasoning: { effort: "high" }, + }), reasoning: true, reviewer: true, }, @@ -118,7 +151,10 @@ export const modelsToRun: RunnableModel[] = [ // }, { name: "kimi-k2-thinking", - llm: openrouter("moonshotai/kimi-k2-thinking", defaultProviderOptions), + llm: openrouter("moonshotai/kimi-k2-thinking", { + ...defaultProviderOptions, + reasoning: { effort: "high" }, + }), reasoning: true, reviewer: true, }, diff --git a/index.ts b/index.ts index 081a66c..ea07e72 100644 --- a/index.ts +++ b/index.ts @@ -10,6 +10,8 @@ import { import { modelsToRun as allModels, dryRunModels, + API_STAGGER_JITTER_MS, + API_STAGGER_MS, PARALLEL_LIMIT, TOPICS, } from "./constants"; @@ -36,7 +38,8 @@ import { const isDryRun = process.argv.includes("--dry-run"); const modelsToRun = isDryRun ? dryRunModels : allModels; const reviewerModels = modelsToRun.filter((m) => m.reviewer); -const comparisonJudges = reviewerModels.length > 0 ? reviewerModels : modelsToRun; +const comparisonJudges = + reviewerModels.length > 0 ? reviewerModels : modelsToRun; // Parse --test argument function getTestTypeFromArgs(): TestType | null { @@ -50,7 +53,37 @@ function getTestTypeFromArgs(): TestType | null { process.exit(1); } -const limit = pLimit(PARALLEL_LIMIT); +function sleep(ms: number) { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +/** + * Creates a per-phase limiter that keeps concurrency high, but staggers + * request start times to avoid huge bursts at time 0. + */ +function createApiLimit() { + const limit = pLimit(PARALLEL_LIMIT); + let started = 0; + + return async function runLimited(fn: () => Promise) { + return limit(async () => { + const slot = started % PARALLEL_LIMIT; + started++; + + const jitter = + API_STAGGER_JITTER_MS > 0 + ? Math.floor(Math.random() * (API_STAGGER_JITTER_MS + 1)) + : 0; + const delay = slot * API_STAGGER_MS + jitter; + + if (delay > 0) { + await sleep(delay); + } + + return fn(); + }); + }; +} /** * Tracks token usage and costs per model per phase. 
 /**
  * Tracks token usage and costs per model per phase.
@@ -120,12 +153,16 @@ async function runPhase1Essays(
   topic: string,
   topicDir: string
 ): Promise<Record<string, string>> {
+  const limit = createApiLimit();
   const essays: Record<string, string> = {};
 
   const tasks = modelsToRun.map((model) =>
     limit(async () => {
       console.log(`  Generating essay: ${model.name}...`);
       const result = await generateEssay(model, topic);
+      if (!result) {
+        return null;
+      }
       essays[model.name] = result.text;
       usageTracker.essays[model.name]!.push(result.usage);
       await writeEssay(topicDir, model.name, result.text);
@@ -150,6 +187,7 @@ async function runPhase2Feedback(
   essays: Record<string, string>,
   topicDir: string
 ): Promise<Record<string, Record<string, string>>> {
+  const limit = createApiLimit();
   const feedback: Record<string, Record<string, string>> = {};
 
   // Initialize nested objects
@@ -162,12 +200,17 @@
   for (const reviewer of modelsToRun) {
     for (const author of modelsToRun) {
       if (reviewer.name === author.name) continue;
+      // Skip if no essay exists for this author
+      if (!essays[author.name]) continue;
 
       tasks.push(
         limit(async () => {
           console.log(`  ${reviewer.name} reviewing ${author.name}...`);
           const essayText = essays[author.name]!;
           const result = await reviewEssay(reviewer, essayText, topic);
+          if (!result) {
+            return;
+          }
           feedback[reviewer.name]![author.name] = result.text;
           usageTracker.reviews[reviewer.name]!.push(result.usage);
           await writeFeedback(
@@ -199,6 +242,7 @@ async function runPhase3Revisions(
   feedback: Record<string, Record<string, string>>,
   topicDir: string
 ): Promise<Record<string, Record<string, string>>> {
+  const limit = createApiLimit();
   const revisions: Record<string, Record<string, string>> = {};
 
   // Initialize nested objects
@@ -211,6 +255,9 @@
   for (const author of modelsToRun) {
     for (const reviewer of modelsToRun) {
       if (author.name === reviewer.name) continue;
+      // Skip if no essay or no feedback exists
+      if (!essays[author.name]) continue;
+      if (!feedback[reviewer.name]?.[author.name]) continue;
 
       tasks.push(
         limit(async () => {
@@ -225,6 +272,9 @@
             essayText,
             reviewerFeedback
           );
+          if (!result) {
+            return;
+          }
           revisions[author.name]![reviewer.name] = result.text;
           usageTracker.revisions[author.name]!.push(result.usage);
           await writeRevision(
@@ -355,6 +405,7 @@ async function runPhase4Scoring(
     Record<string, Record<string, { score: number; justification: string }>>
   >;
 }> {
+  const limit = createApiLimit();
   const originalScores: Record<
     string,
     Record<string, { score: number; justification: string }>
   > = {};
@@ -376,11 +427,17 @@
 
   for (const judge of modelsToRun) {
     for (const author of modelsToRun) {
+      // Skip if no essay exists for this author
+      if (!essays[author.name]) continue;
+
       tasks.push(
         limit(async () => {
           const essayText = essays[author.name]!;
           console.log(`  ${judge.name} scoring ${author.name} (original)...`);
           const result = await scoreEssay(judge, essayText, topic);
+          if (!result) {
+            return;
+          }
           originalScores[judge.name]![author.name] = {
             score: result.score,
             justification: result.justification,
@@ -402,6 +459,8 @@
   for (const author of modelsToRun) {
     for (const reviewer of modelsToRun) {
       if (author.name === reviewer.name) continue;
+      // Skip if no revision exists
+      if (!revisions[author.name]?.[reviewer.name]) continue;
 
       tasks.push(
         limit(async () => {
           const revision = revisions[author.name]![reviewer.name]!;
           console.log(
             `  ${judge.name} scoring ${author.name}←${reviewer.name} (revised)...`
           );
           const result = await scoreEssay(judge, revision, topic);
+          if (!result) {
+            return;
+          }
           revisedScores[judge.name]![author.name]![reviewer.name] = {
             score: result.score,
             justification: result.justification,
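The reworked rankings in the next hunk average only over judges that actually returned a score, so one dropped call no longer crashes or skews the report. The guard pattern in isolation (data values made up):

```ts
// Sketch of the defensive averaging used below, with made-up scores.
const byJudge: Record<string, number | undefined> = { a: 7, b: undefined, c: 8 };
const present = Object.values(byJudge).filter((s): s is number => s !== undefined);
const avg =
  present.length > 0
    ? present.reduce((x, y) => x + y, 0) / present.length // 7.5; the missing judge is ignored
    : undefined; // no scores at all, so skip this entry entirely
```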
@@ -453,24 +515,38 @@ function calculateScoringRankings(scores: {
   }> = [];
 
   const judges = Object.keys(scores.original);
-  const firstJudge = judges[0]!;
-  const authors = Object.keys(scores.original[firstJudge]!);
+  if (judges.length === 0) {
+    return { essays: [], reviewers: [] };
+  }
+
+  // Collect all authors that have at least one score
+  const allAuthors = new Set<string>();
+  for (const judge of judges) {
+    for (const author of Object.keys(scores.original[judge] ?? {})) {
+      allAuthors.add(author);
+    }
+  }
+  const authors = Array.from(allAuthors);
 
   for (const author of authors) {
-    const judgeScores = judges.map((j) => scores.original[j]![author]!.score);
+    const judgeScoresRaw = judges
+      .map((j) => scores.original[j]?.[author]?.score)
+      .filter((s): s is number => s !== undefined);
+    if (judgeScoresRaw.length === 0) continue;
     const avgScore =
-      judgeScores.reduce((a, b) => a + b, 0) / judgeScores.length;
+      judgeScoresRaw.reduce((a, b) => a + b, 0) / judgeScoresRaw.length;
     essayScores.push({ type: "original", author, avgScore });
   }
 
   for (const author of authors) {
     for (const reviewer of authors) {
       if (author === reviewer) continue;
-      const judgeScores = judges.map(
-        (j) => scores.revised[j]![author]![reviewer]!.score
-      );
+      const judgeScoresRaw = judges
+        .map((j) => scores.revised[j]?.[author]?.[reviewer]?.score)
+        .filter((s): s is number => s !== undefined);
+      if (judgeScoresRaw.length === 0) continue;
       const avgScore =
-        judgeScores.reduce((a, b) => a + b, 0) / judgeScores.length;
+        judgeScoresRaw.reduce((a, b) => a + b, 0) / judgeScoresRaw.length;
       essayScores.push({ type: "revised", author, reviewer, avgScore });
     }
   }
@@ -483,29 +559,33 @@
   }
 
   for (const author of authors) {
+    const originalScoresRaw = judges
+      .map((j) => scores.original[j]?.[author]?.score)
+      .filter((s): s is number => s !== undefined);
+    if (originalScoresRaw.length === 0) continue;
     const originalAvg =
-      judges.reduce((sum, j) => sum + scores.original[j]![author]!.score, 0) /
-      judges.length;
+      originalScoresRaw.reduce((a, b) => a + b, 0) / originalScoresRaw.length;
 
     for (const reviewer of authors) {
       if (author === reviewer) continue;
+      const revisedScoresRaw = judges
+        .map((j) => scores.revised[j]?.[author]?.[reviewer]?.score)
+        .filter((s): s is number => s !== undefined);
+      if (revisedScoresRaw.length === 0) continue;
       const revisedAvg =
-        judges.reduce(
-          (sum, j) => sum + scores.revised[j]![author]![reviewer]!.score,
-          0
-        ) / judges.length;
+        revisedScoresRaw.reduce((a, b) => a + b, 0) / revisedScoresRaw.length;
       const improvement = revisedAvg - originalAvg;
       reviewerImpact[reviewer]!.push(improvement);
     }
   }
 
-  const reviewerScores = Object.entries(reviewerImpact).map(
-    ([reviewer, improvements]) => ({
+  const reviewerScores = Object.entries(reviewerImpact)
+    .filter(([, improvements]) => improvements.length > 0)
+    .map(([reviewer, improvements]) => ({
       reviewer,
       avgImprovement:
         improvements.reduce((a, b) => a + b, 0) / improvements.length,
-    })
-  );
+    }));
 
   reviewerScores.sort((a, b) => b.avgImprovement - a.avgImprovement);
@@ -860,6 +940,7 @@ async function runPhase4Comparisons(
   revisions: Record<string, Record<string, string>>,
   topicDir: string
 ): Promise<ComparisonResult[]> {
+  const limit = createApiLimit();
   const comparisons: ComparisonResult[] = [];
 
   // Build list of all essays (original + revised)
@@ -917,6 +998,10 @@
             topic
           );
 
+          if (!result) {
+            return;
+          }
+
           const comparison: ComparisonResult = {
             judge: judge.name,
             essayA: { author: essayA.author, reviewer: essayA.reviewer },
             essayB: { author: essayB.author, reviewer: essayB.reviewer },