From ebaf37b84d9bb9e8df4a7752ec65a845f3b85961 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 26 Mar 2026 11:22:26 -0700 Subject: [PATCH 01/12] Add evalbuff: iterative agent improvement via docs optimization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Evalbuff is an automated overnight loop that improves coding agent performance by optimizing project documentation. It runs eval tasks, judges outputs with living quality criteria (L1-L5), analyzes failures, proposes targeted doc edits, and keeps only changes that measurably improve scores. Agent-agnostic — works with any CLI coding agent. Key components: - cli-runner: agent-agnostic CLI runner (shells out to any command) - criteria: living quality criteria with L1-L5 promotion logic - judge: modified from BuffBench with criteria injection - docs-optimizer: failure analysis + doc writing + score comparison - morning-report: markdown summary from overnight JSONL log - run-evalbuff: main orchestrator with budget caps and resumable state Co-Authored-By: Claude Opus 4.6 --- evals/evalbuff/README.md | 214 +++++++++ evals/evalbuff/agent-runner.ts | 196 ++++++++ evals/evalbuff/cli-runner.ts | 94 ++++ evals/evalbuff/criteria.ts | 145 ++++++ evals/evalbuff/docs-optimizer.ts | 233 ++++++++++ evals/evalbuff/evalbuff-criteria.json | 22 + evals/evalbuff/judge.ts | 308 +++++++++++++ evals/evalbuff/morning-report.ts | 197 ++++++++ evals/evalbuff/old/agents/context-agent.ts | 56 +++ evals/evalbuff/old/agents/review-agent.ts | 97 ++++ evals/evalbuff/old/agents/scan-agent.ts | 46 ++ evals/evalbuff/old/cli/package.json | 24 + .../evalbuff/old/cli/src/commands/context.ts | 87 ++++ evals/evalbuff/old/cli/src/commands/init.ts | 127 ++++++ evals/evalbuff/old/cli/src/commands/login.ts | 22 + evals/evalbuff/old/cli/src/commands/logout.ts | 12 + evals/evalbuff/old/cli/src/commands/review.ts | 139 ++++++ evals/evalbuff/old/cli/src/index.ts | 82 ++++ 
evals/evalbuff/old/cli/src/templates/skill.ts | 45 ++ evals/evalbuff/old/cli/src/utils/auth.ts | 188 ++++++++ evals/evalbuff/old/cli/src/utils/config.ts | 119 +++++ evals/evalbuff/old/cli/src/utils/git.ts | 110 +++++ evals/evalbuff/old/cli/src/utils/knowledge.ts | 50 ++ evals/evalbuff/old/cli/src/utils/output.ts | 62 +++ evals/evalbuff/old/cli/src/utils/project.ts | 9 + evals/evalbuff/old/cli/tsconfig.json | 12 + evals/evalbuff/run-evalbuff.ts | 428 ++++++++++++++++++ evals/evalbuff/runners/claude.ts | 176 +++++++ evals/evalbuff/runners/codebuff.ts | 139 ++++++ evals/evalbuff/runners/codex.ts | 143 ++++++ evals/evalbuff/runners/index.ts | 3 + evals/evalbuff/runners/runner.ts | 13 + evals/evalbuff/test-repo-utils.ts | 131 ++++++ evals/evalbuff/types.ts | 83 ++++ evals/package.json | 1 + 35 files changed, 3813 insertions(+) create mode 100644 evals/evalbuff/README.md create mode 100644 evals/evalbuff/agent-runner.ts create mode 100644 evals/evalbuff/cli-runner.ts create mode 100644 evals/evalbuff/criteria.ts create mode 100644 evals/evalbuff/docs-optimizer.ts create mode 100644 evals/evalbuff/evalbuff-criteria.json create mode 100644 evals/evalbuff/judge.ts create mode 100644 evals/evalbuff/morning-report.ts create mode 100644 evals/evalbuff/old/agents/context-agent.ts create mode 100644 evals/evalbuff/old/agents/review-agent.ts create mode 100644 evals/evalbuff/old/agents/scan-agent.ts create mode 100644 evals/evalbuff/old/cli/package.json create mode 100644 evals/evalbuff/old/cli/src/commands/context.ts create mode 100644 evals/evalbuff/old/cli/src/commands/init.ts create mode 100644 evals/evalbuff/old/cli/src/commands/login.ts create mode 100644 evals/evalbuff/old/cli/src/commands/logout.ts create mode 100644 evals/evalbuff/old/cli/src/commands/review.ts create mode 100644 evals/evalbuff/old/cli/src/index.ts create mode 100644 evals/evalbuff/old/cli/src/templates/skill.ts create mode 100644 evals/evalbuff/old/cli/src/utils/auth.ts create mode 100644 
evals/evalbuff/old/cli/src/utils/config.ts create mode 100644 evals/evalbuff/old/cli/src/utils/git.ts create mode 100644 evals/evalbuff/old/cli/src/utils/knowledge.ts create mode 100644 evals/evalbuff/old/cli/src/utils/output.ts create mode 100644 evals/evalbuff/old/cli/src/utils/project.ts create mode 100644 evals/evalbuff/old/cli/tsconfig.json create mode 100644 evals/evalbuff/run-evalbuff.ts create mode 100644 evals/evalbuff/runners/claude.ts create mode 100644 evals/evalbuff/runners/codebuff.ts create mode 100644 evals/evalbuff/runners/codex.ts create mode 100644 evals/evalbuff/runners/index.ts create mode 100644 evals/evalbuff/runners/runner.ts create mode 100644 evals/evalbuff/test-repo-utils.ts create mode 100644 evals/evalbuff/types.ts diff --git a/evals/evalbuff/README.md b/evals/evalbuff/README.md new file mode 100644 index 0000000000..df88d41065 --- /dev/null +++ b/evals/evalbuff/README.md @@ -0,0 +1,214 @@ +# Evalbuff + +Evalbuff is an automated system that iteratively improves a coding agent's performance by optimizing project documentation. It runs overnight, discovers what an agent gets wrong, writes docs to fix those gaps, and keeps only the changes that measurably improve scores. + +## The Idea + +Most coding agents read project documentation before making changes. Better docs lead to better code. But writing good docs is hard — you don't know what an agent needs to know until you watch it fail. + +Evalbuff closes this loop automatically: + +1. **Run** a coding agent on real eval tasks (reconstructing git commits) +2. **Judge** the output with AI judges that apply living quality criteria +3. **Analyze** failures — feed the judge's weaknesses to a doc-writer agent +4. **Test** whether a proposed doc edit actually improves the agent's score +5. **Keep** doc changes that help, revert ones that don't +6. 
**Repeat** until the budget runs out or scores plateau + +The result: a `docs/` directory and `AGENTS.md` table of contents that encode exactly what the agent needs to know to perform well on your codebase. Any agent that reads project docs benefits — Claude Code, Codex, Codebuff, or anything else with a CLI. + +## Why Documentation? + +We chose documentation as the improvement lever because: + +- **Agent-agnostic.** Every modern coding agent reads project docs. Improving docs improves all agents, not just one. +- **Interpretable.** Unlike fine-tuning weights or tweaking system prompts, docs are human-readable. You can review what evalbuff learned and decide if it makes sense. +- **Composable.** Doc improvements stack. A doc about error handling patterns doesn't conflict with a doc about naming conventions. +- **Persistent.** Docs live in the repo and benefit every future session, not just the current one. + +## Living Quality Criteria + +Evalbuff uses a leveling system so it doesn't try to optimize everything at once: + +| Level | Criteria Added | When | +|-------|---------------|------| +| L1 | Correctness, Completeness, Basic Style | Start | +| L2 | + Pattern Consistency | After L1 avg >= 8.0 over 10 tasks | +| L3 | + Test Quality | After L2 avg >= 8.0 over 10 tasks | +| L4 | + Optimal Design | After L3 avg >= 8.0 over 10 tasks | +| L5 | + Fluency | After L4 avg >= 8.0 over 10 tasks | + +This prevents the system from penalizing an agent for style issues when it can't even get the code to compile. Criteria are injected directly into the AI judge prompts. + +## Architecture + +``` +┌─────────────────────────────────────────────────────┐ +│ Orchestrator │ +│ (run-evalbuff.ts) │ +│ │ +│ for each eval task: │ +│ 1. Clone repo into isolated temp dir │ +│ 2. Copy current docs/ into the clone │ +│ 3. Run agent CLI on the task prompt │ +│ 4. Judge the diff against ground truth │ +│ 5. If score < threshold: │ +│ a. Analyze failure → propose doc edit │ +│ b. 
Re-run agent with new doc │ +│ c. Re-judge → keep doc if score improved │ +│ 6. Update criteria level if scores are high │ +│ 7. Log entry to JSONL, save state │ +│ │ +│ Generate morning report │ +└─────────────────────────────────────────────────────┘ +``` + +### Components + +| File | Role | +|------|------| +| `run-evalbuff.ts` | Main orchestrator loop with budget caps and resumable state | +| `cli-runner.ts` | Agent-agnostic CLI runner — spawns any agent command, captures git diff | +| `judge.ts` | AI judging system (GPT-5.1 + Gemini) with criteria injection | +| `docs-optimizer.ts` | Failure analysis, doc writing, doc application, score comparison | +| `criteria.ts` | Living quality criteria with L1-L5 promotion logic | +| `morning-report.ts` | Generates markdown summary from overnight JSONL log | +| `test-repo-utils.ts` | Creates isolated git repos per eval task | +| `agent-runner.ts` | BuffBench-style agent runner (for Codebuff SDK agents) | +| `types.ts` | Shared types (EvalCommitV2, EvalDataV2, etc.) 
| + +## Usage + +### Command Line + +```bash +bun run evals/evalbuff/run-evalbuff.ts \ + --repo /path/to/target-repo \ + --agent "claude -p" \ + --evals evals/buffbench/eval-codebuff.json,evals/buffbench/eval-manifold.json \ + --max-iterations 50 \ + --max-cost 50 \ + --score-threshold 7.0 \ + --agent-timeout 300000 +``` + +Or via the workspace script: + +```bash +bun run --filter @codebuff/evals run-evalbuff -- \ + --repo /path/to/target-repo \ + --agent "codex exec --full-auto" \ + --evals evals/buffbench/eval-codebuff.json +``` + +### Arguments + +| Argument | Default | Description | +|----------|---------|-------------| +| `--repo` | required | Path to the target repo where docs/ will be written | +| `--agent` | required | Agent CLI command (prompt is appended as last arg) | +| `--evals` | required | Comma-separated paths to eval JSON files | +| `--max-iterations` | 50 | Stop after this many tasks | +| `--max-cost` | 50 | Stop after spending this many USD (estimated) | +| `--score-threshold` | 7.0 | Only attempt doc edits for scores below this | +| `--agent-timeout` | 300000 | Per-task agent timeout in ms (5 min default) | +| `--criteria` | auto | Path to criteria JSON (auto-created if omitted) | + +### Overnight Run + +For an overnight run, set generous limits and let it go: + +```bash +nohup bun run evals/evalbuff/run-evalbuff.ts \ + --repo /path/to/repo \ + --agent "claude -p" \ + --evals evals/buffbench/eval-codebuff.json \ + --max-iterations 200 \ + --max-cost 100 \ + > evalbuff-overnight.log 2>&1 & +``` + +Check results in the morning: +- `/evalbuff-report-YYYY-MM-DD.md` — morning report +- `/evalbuff-log.jsonl` — detailed per-task log +- `/docs/` — the docs that were kept +- `/AGENTS.md` — table of contents + +### Resumable + +Evalbuff saves state to `evalbuff-state.json` in the target repo. If interrupted, re-running with the same arguments will skip completed tasks and continue where it left off. 
+ +## How It Decides What Docs to Write + +When an agent scores below the threshold on a task, evalbuff: + +1. **Feeds the judge's weaknesses** to a doc-writer LLM agent +2. The doc writer sees: the task prompt, ground truth diff, agent's diff, judge analysis, and all current docs +3. It produces a **targeted doc file** — specific to the gap between what the agent did and what it should have done +4. The doc is written to `docs/.md` and `AGENTS.md` is updated + +The doc writer is instructed to be specific and actionable — referencing concrete file paths, function names, and patterns. Generic advice like "follow best practices" is explicitly rejected. + +## What Gets Produced + +After a run, the target repo will contain: + +``` +target-repo/ +├── docs/ +│ ├── patterns/ +│ │ └── error-handling.md # Evalbuff-generated +│ ├── conventions/ +│ │ └── naming.md # Evalbuff-generated +│ └── architecture/ +│ └── data-flow.md # Evalbuff-generated +├── AGENTS.md # Table of contents +├── evalbuff-state.json # Resumable state +├── evalbuff-log.jsonl # Per-task log +├── evalbuff-criteria.json # Current criteria level +└── evalbuff-report-2026-03-25.md # Morning report +``` + +### Morning Report + +The morning report includes: +- Summary table (iterations, cost, duration, score deltas) +- Doc changes table (which docs were tried, score impact, kept/reverted) +- Error log +- Score trajectory visualization + +## Eval Data Format + +Evalbuff reuses BuffBench's `EvalDataV2` format. Eval tasks are real git commits from open source repos, turned into prompts: + +```json +{ + "repoUrl": "https://github.com/org/repo", + "evalCommits": [ + { + "id": "task-abc123", + "sha": "abc123", + "parentSha": "def456", + "prompt": "Add error handling to the API endpoint...", + "fileDiffs": [{ "path": "src/api.ts", "diff": "..." }], + "supplementalFiles": ["src/types.ts"] + } + ] +} +``` + +Generate new evals with BuffBench's eval generation tools, then point evalbuff at the JSON files. 
+ +## Relationship to BuffBench + +BuffBench benchmarks agents against each other. Evalbuff improves a single agent's performance over time. + +| | BuffBench | Evalbuff | +|---|-----------|----------| +| **Goal** | Compare agents | Improve an agent | +| **Output** | Scores + rankings | Documentation | +| **Loop** | Single pass | Iterative | +| **Judges** | 3 (GPT, Gemini, Claude) | 2 (GPT, Gemini) | +| **Agent coupling** | Codebuff SDK | Any CLI agent | + +Evalbuff was deep-copied from BuffBench and modified — they share types and eval data format but are independent codebases. diff --git a/evals/evalbuff/agent-runner.ts b/evals/evalbuff/agent-runner.ts new file mode 100644 index 0000000000..174dcb22b9 --- /dev/null +++ b/evals/evalbuff/agent-runner.ts @@ -0,0 +1,196 @@ +import { execSync , exec } from 'child_process' +import { promisify } from 'util' + +const execAsync = promisify(exec) + +import { withTimeout } from '@codebuff/common/util/promise' + + +import { withTestRepo } from './test-repo-utils' +import { ClaudeRunner } from './runners/claude' +import { CodebuffRunner } from './runners/codebuff' +import { CodexRunner } from './runners/codex' + +import type { Runner, AgentStep } from './runners/runner' +import type { EvalCommitV2, FinalCheckOutput } from './types' +import type { CodebuffClient } from '@codebuff/sdk' + +export type { AgentStep } + +export type ExternalAgentType = 'claude' | 'codex' + +export async function runAgentOnCommit({ + client, + agentId, + commit, + repoUrl, + initCommand, + env, + localAgentDefinitions, + printEvents, + finalCheckCommands, + externalAgentType, +}: { + client: CodebuffClient + agentId: string + commit: EvalCommitV2 + repoUrl: string + initCommand?: string + env?: Record + localAgentDefinitions: any[] + printEvents: boolean + finalCheckCommands?: string[] + externalAgentType?: ExternalAgentType +}): Promise<{ + diff: string + contextFiles: Record + durationMs: number + cost: number + error?: string + trace: AgentStep[] 
+ finalCheckOutputs?: FinalCheckOutput[] +}> { + console.log(`[${commit.id}] Running agent ${agentId}...`) + const startTime = Date.now() + let diff = '' + let contextFiles: Record = {} + let error: string | undefined + let cost = 0 + const trace: AgentStep[] = [] + let finalCheckOutputs: FinalCheckOutput[] | undefined + + try { + const timeoutMs = 60 * 60 * 1000 // 60 minutes + await withTimeout( + withTestRepo( + { + repoUrl, + parentSha: commit.parentSha, + initCommand, + env, + }, + async (repoDir) => { + // Select the appropriate runner + let runner: Runner + if (externalAgentType === 'claude') { + runner = new ClaudeRunner(repoDir, env) + } else if (externalAgentType === 'codex') { + runner = new CodexRunner(repoDir, env) + } else { + runner = new CodebuffRunner({ + cwd: repoDir, + env, + client, + agentId, + localAgentDefinitions, + printEvents, + commitId: commit.id, + parentSha: commit.parentSha, + }) + } + + console.log( + `[${commit.id}] Running agent: ${externalAgentType || 'codebuff'}`, + ) + + const result = await runner.run(commit.prompt) + trace.push(...result.steps) + cost = result.totalCostUsd + diff = result.diff + + const contextFilePaths = new Set([ + ...commit.supplementalFiles, + ...commit.fileDiffs.map((fd) => fd.path), + ]) + for (const { status, path } of commit.fileDiffs) { + if (status === 'added') { + contextFilePaths.delete(path) + } + } + + for (const filePath of contextFilePaths) { + try { + const content = execSync( + `git show ${commit.parentSha}:${JSON.stringify(filePath)}`, + { + cwd: repoDir, + encoding: 'utf-8', + maxBuffer: 10 * 1024 * 1024, + }, + ) + contextFiles[filePath] = content + } catch (error) { + contextFiles[filePath] = '' + } + } + + // Run final check commands if specified + if (finalCheckCommands && finalCheckCommands.length > 0) { + console.log( + `[${commit.id}] Running ${finalCheckCommands.length} final check commands...`, + ) + finalCheckOutputs = await runFinalCheckCommands( + finalCheckCommands, + repoDir, 
+ env, + ) + } + }, + ), + timeoutMs, + `Agent ${agentId} timed out after ${timeoutMs / 1000} seconds`, + ) + } catch (e) { + error = e instanceof Error ? `${e.message}\n${e.stack}` : String(e) + } + + const durationMs = Date.now() - startTime + + return { + diff, + contextFiles, + durationMs, + cost, + error, + trace, + finalCheckOutputs, + } +} + +async function runFinalCheckCommands( + commands: string[], + cwd: string, + env?: Record, +): Promise { + const results: FinalCheckOutput[] = [] + + for (const command of commands) { + console.log(` Running: ${command}`) + try { + const { stdout, stderr } = await execAsync(command, { + cwd, + encoding: 'utf-8', + maxBuffer: 10 * 1024 * 1024, // 10MB buffer + env: { ...process.env, ...env }, + }) + results.push({ + command, + exitCode: 0, + stdout, + stderr, + }) + console.log(` ✓ Command succeeded: ${command}`) + } catch (error: any) { + // Command failed, but we still capture the output + results.push({ + command, + exitCode: error.code || 1, + stdout: error.stdout || '', + stderr: error.stderr || error.message || '', + }) + console.log(` ✗ Command failed (exit ${error.code}): ${command}`) + } + } + + return results +} diff --git a/evals/evalbuff/cli-runner.ts b/evals/evalbuff/cli-runner.ts new file mode 100644 index 0000000000..07529c0ea8 --- /dev/null +++ b/evals/evalbuff/cli-runner.ts @@ -0,0 +1,94 @@ +import { execSync, spawn } from 'child_process' + +export interface CliRunnerOptions { + command: string // e.g., "claude -p" or "codex exec --full-auto" + prompt: string + cwd: string + timeoutMs: number // Default 300_000 (5 min) + env?: Record +} + +export interface CliRunnerResult { + diff: string + durationMs: number + exitCode: number + stdout: string + stderr: string +} + +export async function runCliAgent( + options: CliRunnerOptions, +): Promise { + const { command, prompt, cwd, timeoutMs, env } = options + const startTime = Date.now() + + return new Promise((resolve, reject) => { + const [cmd, ...baseArgs] 
= command.split(' ') + const args = [...baseArgs, prompt] + + console.log(`[CliRunner] Running: ${cmd} ${baseArgs.join(' ')} `) + + const child = spawn(cmd, args, { + cwd, + env: { ...process.env, ...env }, + stdio: ['ignore', 'pipe', 'pipe'], + }) + + let stdout = '' + let stderr = '' + + const timer = setTimeout(() => { + child.kill('SIGTERM') + // Give it 5 seconds to clean up, then force kill + setTimeout(() => { + if (!child.killed) { + child.kill('SIGKILL') + } + }, 5000) + }, timeoutMs) + + child.stdout.on('data', (data: Buffer) => { + stdout += data.toString() + }) + + child.stderr.on('data', (data: Buffer) => { + stderr += data.toString() + process.stderr.write(data) + }) + + child.on('error', (error) => { + clearTimeout(timer) + reject( + new Error( + `CLI agent failed to start: ${error.message}. Make sure '${cmd}' is installed and in PATH.`, + ), + ) + }) + + child.on('close', (code) => { + clearTimeout(timer) + const durationMs = Date.now() - startTime + + // Capture git diff of agent's changes + let diff = '' + try { + execSync('git add .', { cwd, stdio: 'ignore' }) + diff = execSync('git diff HEAD', { + cwd, + encoding: 'utf-8', + maxBuffer: 10 * 1024 * 1024, + }) + } catch { + // Ignore git errors + } + + resolve({ + diff, + durationMs, + exitCode: code ?? 
1, + stdout, + stderr, + }) + }) + }) +} diff --git a/evals/evalbuff/criteria.ts b/evals/evalbuff/criteria.ts new file mode 100644 index 0000000000..aa768baf43 --- /dev/null +++ b/evals/evalbuff/criteria.ts @@ -0,0 +1,145 @@ +import fs from 'fs' + +export interface QualityCriterion { + name: string + weight: number + description: string +} + +export interface QualityCriteria { + level: number // 1-5 + criteria: QualityCriterion[] + promotionThreshold: number // default 8.0 + promotionWindow: number // default 10 +} + +export const DEFAULT_CRITERIA: Record = { + 1: [ + { + name: 'Correctness', + weight: 3, + description: + 'The code compiles, runs without errors, and produces the expected behavior.', + }, + { + name: 'Completeness', + weight: 3, + description: + 'All aspects of the prompt are addressed. No partial implementations or TODO comments.', + }, + { + name: 'Basic Style', + weight: 1, + description: + 'Code follows basic formatting conventions and is readable.', + }, + ], + 2: [ + { + name: 'Pattern Consistency', + weight: 2, + description: + 'New code follows the same patterns, naming conventions, and architectural style as existing code in the codebase.', + }, + ], + 3: [ + { + name: 'Test Quality', + weight: 2, + description: + 'Tests are meaningful, cover edge cases, and test behavior rather than implementation details.', + }, + ], + 4: [ + { + name: 'Optimal Design', + weight: 2, + description: + 'Code is DRY, uses the right abstractions, and the diff is minimal — no unnecessary changes.', + }, + ], + 5: [ + { + name: 'Fluency', + weight: 1, + description: + 'Code reads like a senior engineer wrote it. Idiomatic usage of the language and framework. 
No over-engineering.', + }, + ], +} + +export function getCriteriaForLevel(level: number): QualityCriterion[] { + const criteria: QualityCriterion[] = [] + for (let l = 1; l <= Math.min(level, 5); l++) { + criteria.push(...(DEFAULT_CRITERIA[l] || [])) + } + return criteria +} + +export function loadCriteria(criteriaPath?: string): QualityCriteria { + if (criteriaPath && fs.existsSync(criteriaPath)) { + const raw = JSON.parse(fs.readFileSync(criteriaPath, 'utf-8')) + return raw as QualityCriteria + } + return { + level: 1, + criteria: getCriteriaForLevel(1), + promotionThreshold: 8.0, + promotionWindow: 10, + } +} + +export function saveCriteria( + criteriaPath: string, + criteria: QualityCriteria, +): void { + fs.writeFileSync(criteriaPath, JSON.stringify(criteria, null, 2)) +} + +/** + * Checks if criteria should be promoted to the next level. + * Returns the new level if promoted, or the current level if not. + */ +export function maybePromoteCriteria( + criteria: QualityCriteria, + recentScores: number[], +): number { + if (criteria.level >= 5) return criteria.level + if (recentScores.length < criteria.promotionWindow) return criteria.level + + const windowScores = recentScores.slice(-criteria.promotionWindow) + const avg = windowScores.reduce((sum, s) => sum + s, 0) / windowScores.length + + if (avg >= criteria.promotionThreshold) { + const newLevel = criteria.level + 1 + console.log( + `Criteria promoted from level ${criteria.level} to ${newLevel} (avg ${avg.toFixed(1)} >= ${criteria.promotionThreshold})`, + ) + return newLevel + } + + return criteria.level +} + +/** + * Format criteria as text for injection into judge prompts. + */ +export function formatCriteriaForPrompt(criteria: QualityCriteria): string { + const lines = [ + `## Quality Criteria (Level ${criteria.level}/5)`, + '', + 'Apply these additional quality criteria when scoring. 
Higher levels add stricter standards:', + '', + ] + + for (const c of criteria.criteria) { + lines.push(`- **${c.name}** (weight: ${c.weight}): ${c.description}`) + } + + lines.push( + '', + 'Weight these criteria proportionally when computing scores. A violation of a high-weight criterion should have a bigger impact on the score than a low-weight one.', + ) + + return lines.join('\n') +} diff --git a/evals/evalbuff/docs-optimizer.ts b/evals/evalbuff/docs-optimizer.ts new file mode 100644 index 0000000000..cd9dfde782 --- /dev/null +++ b/evals/evalbuff/docs-optimizer.ts @@ -0,0 +1,233 @@ +import fs from 'fs' +import path from 'path' + +import { withTimeout } from '@codebuff/common/util/promise' + +import type { JudgingResult } from './judge' +import type { AgentDefinition, CodebuffClient } from '@codebuff/sdk' + +export interface DocSuggestion { + reasoning: string + suggestedDocPath: string // relative to docs/, e.g. "coding-patterns/error-handling.md" + suggestedContent: string +} + +const docWriterAgent: AgentDefinition = { + id: 'doc-writer', + model: 'anthropic/claude-sonnet-4.5', + displayName: 'Doc Writer', + toolNames: ['set_output'], + inputSchema: { + prompt: { type: 'string', description: 'The analysis prompt' }, + }, + outputMode: 'structured_output', + outputSchema: { + type: 'object', + properties: { + reasoning: { + type: 'string', + description: + 'Why this doc would help the agent avoid the identified failure', + }, + suggestedDocPath: { + type: 'string', + description: + 'File path relative to docs/ directory, e.g. "patterns/error-handling.md"', + }, + suggestedContent: { + type: 'string', + description: 'The markdown content to write to the doc file', + }, + }, + required: ['reasoning', 'suggestedDocPath', 'suggestedContent'], + }, + systemPrompt: `You are an expert at writing developer documentation that helps AI coding agents perform better. 
+ +Your job: Given a coding agent's failure on a task, write a targeted documentation file that would prevent this class of error in the future. + +## Rules + +1. Be SPECIFIC and ACTIONABLE. Reference concrete file paths, function names, and patterns from the codebase. +2. Do NOT write generic advice like "follow best practices" or "write clean code." +3. Focus on the GAP between what the agent did and what it should have done. +4. Write docs that a coding agent will read and immediately know what to do differently. +5. Keep docs concise — under 200 lines. Dense information beats verbose explanations. +6. Use a logical file path that groups related docs together (e.g., "patterns/", "conventions/", "architecture/"). +7. Include examples of correct patterns from the codebase when possible.`, +} + +/** + * Analyze a failure and suggest a doc edit to prevent it. + * Returns null if score is above threshold (no improvement needed). + */ +export async function analyzeFailure({ + client, + judgeResult, + taskPrompt, + agentDiff, + groundTruthDiff, + currentDocs, + scoreThreshold, +}: { + client: CodebuffClient + judgeResult: JudgingResult + taskPrompt: string + agentDiff: string + groundTruthDiff: string + currentDocs: Record + scoreThreshold: number +}): Promise { + if (judgeResult.overallScore >= scoreThreshold) { + return null + } + + const docsContent = Object.entries(currentDocs) + .map(([docPath, content]) => `### ${docPath}\n\`\`\`\n${content}\n\`\`\``) + .join('\n\n') + + const prompt = `## Task Prompt +${taskPrompt} + +## Judge Analysis +${judgeResult.analysis} + +## Judge Weaknesses Found +${judgeResult.weaknesses.map((w) => `- ${w}`).join('\n')} + +## Ground Truth (what should have been done) +\`\`\`diff +${groundTruthDiff} +\`\`\` + +## Agent's Changes (what was actually done) +\`\`\`diff +${agentDiff || '(No changes made)'} +\`\`\` + +## Current Docs (already available to the agent) +${docsContent || '(No docs yet)'} + +Based on the gap between what the agent 
did and what it should have done, write a doc file that would help the agent get it right next time. Focus on the specific weakness identified by the judge.` + + try { + const result = await withTimeout( + client.run({ + agent: docWriterAgent.id, + prompt, + agentDefinitions: [docWriterAgent], + handleEvent: () => {}, + }), + 10 * 60 * 1000, + 'Doc writer agent timed out after 10 minutes', + ) + + if (result.output.type !== 'structuredOutput') { + console.error('Doc writer did not return structured output') + return null + } + + const value = result.output.value as DocSuggestion + // Validate the path is under docs/ + if ( + value.suggestedDocPath.startsWith('/') || + value.suggestedDocPath.includes('..') + ) { + console.error( + `Doc writer suggested invalid path: ${value.suggestedDocPath}`, + ) + return null + } + + return value + } catch (error) { + console.error('Doc writer failed:', error) + return null + } +} + +/** + * Apply a doc edit to a repo — writes the file and updates AGENTS.md TOC. 
+ */ +export function applyDocEdit( + repoPath: string, + docPath: string, + content: string, + agentsMdPath?: string, +): boolean { + // Validate path is under docs/ + if (docPath.startsWith('/') || docPath.includes('..')) { + console.error(`Rejected doc path outside docs/: ${docPath}`) + return false + } + + const fullDocPath = path.join(repoPath, 'docs', docPath) + const fullAgentsMdPath = agentsMdPath || path.join(repoPath, 'AGENTS.md') + + try { + // Create directory structure + fs.mkdirSync(path.dirname(fullDocPath), { recursive: true }) + + // Check if this is a new file (for AGENTS.md update) + const isNew = !fs.existsSync(fullDocPath) + + // Write the doc file + fs.writeFileSync(fullDocPath, content) + + // Update AGENTS.md if new file + if (isNew) { + let agentsMd = '' + if (fs.existsSync(fullAgentsMdPath)) { + agentsMd = fs.readFileSync(fullAgentsMdPath, 'utf-8') + } else { + agentsMd = '# Documentation\n\nTable of contents for project documentation.\n\n' + } + + const entry = `- [docs/${docPath}](docs/${docPath})\n` + if (!agentsMd.includes(`docs/${docPath}`)) { + agentsMd += entry + fs.writeFileSync(fullAgentsMdPath, agentsMd) + } + } + + return true + } catch (error) { + console.error(`Failed to apply doc edit: ${error}`) + return false + } +} + +/** + * Compare scores to determine if a doc edit improved things. + */ +export function compareScores( + oldScore: number, + newScore: number, +): 'improved' | 'same' | 'worse' { + if (newScore > oldScore) return 'improved' + if (newScore < oldScore) return 'worse' + return 'same' +} + +/** + * Read all docs from a repo's docs/ directory. 
+ */ +export function readCurrentDocs(repoPath: string): Record { + const docsDir = path.join(repoPath, 'docs') + const docs: Record = {} + + if (!fs.existsSync(docsDir)) return docs + + function readDir(dir: string, prefix: string) { + for (const entry of fs.readdirSync(dir, { withFileTypes: true })) { + if (entry.isDirectory()) { + readDir(path.join(dir, entry.name), `${prefix}${entry.name}/`) + } else if (entry.name.endsWith('.md')) { + const relPath = `${prefix}${entry.name}` + docs[relPath] = fs.readFileSync(path.join(dir, entry.name), 'utf-8') + } + } + } + + readDir(docsDir, '') + return docs +} diff --git a/evals/evalbuff/evalbuff-criteria.json b/evals/evalbuff/evalbuff-criteria.json new file mode 100644 index 0000000000..3d0790abb7 --- /dev/null +++ b/evals/evalbuff/evalbuff-criteria.json @@ -0,0 +1,22 @@ +{ + "level": 1, + "criteria": [ + { + "name": "Correctness", + "weight": 3, + "description": "The code compiles, runs without errors, and produces the expected behavior." + }, + { + "name": "Completeness", + "weight": 3, + "description": "All aspects of the prompt are addressed. No partial implementations or TODO comments." + }, + { + "name": "Basic Style", + "weight": 1, + "description": "Code follows basic formatting conventions and is readable." 
+ } + ], + "promotionThreshold": 8.0, + "promotionWindow": 10 +} diff --git a/evals/evalbuff/judge.ts b/evals/evalbuff/judge.ts new file mode 100644 index 0000000000..d399e600f5 --- /dev/null +++ b/evals/evalbuff/judge.ts @@ -0,0 +1,308 @@ +import fs from 'fs' +import path from 'path' + +import { withTimeout } from '@codebuff/common/util/promise' +import { z } from 'zod/v4' + +import type { QualityCriteria } from './criteria' +import { formatCriteriaForPrompt } from './criteria' +import type { EvalCommitV2 } from './types' +import type { AgentDefinition, CodebuffClient } from '@codebuff/sdk' + +const DEBUG_ERROR = true + +export const JudgingResultSchema = z.object({ + analysis: z + .string() + .describe('Detailed analysis comparing agent changes to ground truth'), + strengths: z + .array(z.string()) + .describe('Key strengths of the implementation'), + weaknesses: z.array(z.string()).describe('Key weaknesses or issues found'), + completionScore: z + .number() + .min(0) + .max(10) + .describe('How completely the prompt was addressed'), + codeQualityScore: z + .number() + .min(0) + .max(10) + .describe('Code structure and maintainability'), + overallScore: z.number().min(0).max(10).describe('Combined assessment'), +}) + +export type JudgingResult = z.infer + +const judgeAgentBase: Omit = { + displayName: 'Judge', + toolNames: ['set_output'], + inputSchema: { + prompt: { type: 'string', description: 'The evaluation prompt' }, + }, + outputMode: 'structured_output', + outputSchema: { + type: 'object', + properties: { + analysis: { + type: 'string', + description: + 'Detailed analysis comparing agent changes to ground truth', + }, + strengths: { + type: 'array', + items: { type: 'string' }, + description: 'Key strengths of the implementation', + }, + weaknesses: { + type: 'array', + items: { type: 'string' }, + description: 'Key weaknesses or issues found', + }, + completionScore: { + type: 'number', + minimum: 0, + maximum: 10, + description: 'How completely the 
prompt was addressed', + }, + codeQualityScore: { + type: 'number', + minimum: 0, + maximum: 10, + description: 'Code structure and maintainability', + }, + overallScore: { + type: 'number', + minimum: 0, + maximum: 10, + description: 'Combined assessment', + }, + }, + required: [ + 'analysis', + 'strengths', + 'weaknesses', + 'completionScore', + 'codeQualityScore', + 'overallScore', + ], + }, + systemPrompt: `You are an expert software engineer evaluating AI-generated code changes with empathy for the task given. + +## Your Role + +You will receive: +1. The user prompt that the coding agent was given +2. Context files from the codebase +3. The ground truth changes (expected outcome) +4. The agent's actual changes + +## Evaluation Philosophy + +**Judge based on what the agent was asked to do, not on perfection.** + +- If the prompt is vague or high-level (e.g., "add authentication"), be lenient and accept any reasonable implementation that achieves the goal +- If the prompt is specific and detailed, expect the implementation to match those details more closely +- Focus on whether the agent understood and addressed the user's intent +- Consider that there are often multiple valid ways to implement the same feature + +## Evaluation Criteria + +- **Completion** (0-10): How well did the agent address what was asked in the prompt? Consider the specificity of the prompt. +- **Code Quality** (0-10): How well-structured and maintainable is the code? +- **Overall** (0-10): Combined assessment of whether the agent successfully completed the task as requested + +## Ground Truth + +The ground truth shows ONE valid implementation, but it's not the only correct answer. The agent's implementation should be judged on: +- Does it achieve the same functional outcome? +- Is it a reasonable approach given the prompt? +- Does it maintain code quality? 
+ +Provide detailed analysis, strengths, weaknesses, and numerical scores.`, +} + +const judgeAgents: Record<string, AgentDefinition> = { + 'judge-gpt': { + id: 'judge-gpt', + model: 'openai/gpt-5.1', + ...judgeAgentBase, + }, + 'judge-gemini': { + id: 'judge-gemini', + model: 'google/gemini-3-pro-preview', + ...judgeAgentBase, + }, + 'judge-sonnet': { + id: 'judge-sonnet', + model: 'anthropic/claude-sonnet-4.5', + ...judgeAgentBase, + }, +} + +interface JudgeCommitResultInput { + client: CodebuffClient + commit: EvalCommitV2 + contextFiles: Record<string, string> + agentDiff: string + error?: string + finalCheckOutputs?: string + criteria?: QualityCriteria +} + +async function runSingleJudge( + input: JudgeCommitResultInput, + judgePrompt: string, + judgeAgentId: string, +): Promise<JudgingResult | null> { + const { client } = input + + const judgeAgent = judgeAgents[judgeAgentId] + const agentOutput: string[] = [] + try { + const judgeResult = await withTimeout( + client.run({ + agent: judgeAgent.id, + prompt: judgePrompt, + agentDefinitions: Object.values(judgeAgents), + handleEvent: (event) => { + if (event.type === 'text') { + agentOutput.push(event.text) + } else if (event.type === 'tool_call') { + agentOutput.push(JSON.stringify(event, null, 2)) + } else if (event.type === 'error') { + console.warn(`[Judge ${judgeAgentId}] Error event:`, event.message) + } + }, + }), + 20 * 60 * 1000, + 'Judge agent timed out after 20 minutes', + ) + + if (judgeResult.output.type !== 'structuredOutput') { + console.error( + `Judge ${judgeAgentId} - not structured output`, + JSON.stringify(judgeResult.output, null, 2), + ) + console.error( + 'Judge agent output:', + JSON.stringify(judgeResult.output, null, 2), + 'Judge agent output trace:', + agentOutput.join(''), + ) + if (DEBUG_ERROR) { + fs.writeFileSync( + path.join( + __dirname, + '..', + `${input.commit.id}-${judgeAgentId}-agent-output-error.json`, + ), + JSON.stringify( + { output: judgeResult.output, trace: agentOutput }, + null, + 2, + ), + ) + } + return null + } + + return 
judgeResult.output.value as JudgingResult + } catch (error) { + console.warn(`Judge ${judgeAgentId} failed:`, error) + return null + } +} + +export async function judgeCommitResult( + input: JudgeCommitResultInput, +): Promise { + const { commit, contextFiles, agentDiff, error, finalCheckOutputs, criteria } = + input + + const { prompt, fileDiffs } = commit + + const groundTruthDiffs = fileDiffs + .map(({ path, diff }) => { + return `### ${path}\n\`\`\`diff\n${diff}\n\`\`\`` + }) + .join('\n\n') + + const contextFilesContent = Object.entries(contextFiles) + .map(([filePath, content]) => { + return `### ${filePath}\n\`\`\`\n${content}\n\`\`\`` + }) + .join('\n\n') + + const judgePrompt = `## User Prompt (What the agent was asked to do) +${prompt} + +## Context Files (from parent commit) +${contextFilesContent || '(No context files)'} + +## Ground Truth Changes (One valid implementation) +${groundTruthDiffs} + +## Agent's Changes (What the agent actually did) +\`\`\`diff +${agentDiff || '(No changes made)'} +\`\`\` +${error ? `\n## Error Encountered\n${error}` : ''} +${finalCheckOutputs ? `\n## Final Check Command Outputs\n${finalCheckOutputs}` : ''} +${criteria ? 
`\n${formatCriteriaForPrompt(criteria)}` : ''}` + + // Run 2 judges in parallel + const judgePromises = [ + runSingleJudge(input, judgePrompt, 'judge-gpt'), + runSingleJudge(input, judgePrompt, 'judge-gemini'), + ] + + const judgeResults = await Promise.all(judgePromises) + const validResults = judgeResults.filter( + (result): result is JudgingResult => result !== null, + ) + + if (validResults.length === 0) { + console.error('All judges failed to provide results') + return { + analysis: 'Error running judge agent - all judges failed', + strengths: [], + weaknesses: ['All judges failed to provide structured output'], + completionScore: 0, + codeQualityScore: 0, + overallScore: 0, + } + } + + // Sort judges by overall score and select the median for analysis + const sortedResults = validResults.sort( + (a, b) => a.overallScore - b.overallScore, + ) + const medianIndex = Math.floor(sortedResults.length / 2) + const medianResult = sortedResults[medianIndex] + + // Calculate average scores across all valid judges + const averageCompletionScore = + validResults.reduce((sum, r) => sum + r.completionScore, 0) / + validResults.length + const averageCodeQualityScore = + validResults.reduce((sum, r) => sum + r.codeQualityScore, 0) / + validResults.length + const averageOverallScore = + validResults.reduce((sum, r) => sum + r.overallScore, 0) / + validResults.length + + console.log( + `Judging results overall score: ${averageOverallScore.toFixed(1)} (individual scores: ${validResults.map((r) => r.overallScore.toFixed(1)).join(', ')})`, + ) + + // Return median judge's analysis with averaged scores + return { + analysis: medianResult.analysis, + strengths: medianResult.strengths, + weaknesses: medianResult.weaknesses, + completionScore: averageCompletionScore, + codeQualityScore: averageCodeQualityScore, + overallScore: averageOverallScore, + } +} diff --git a/evals/evalbuff/morning-report.ts b/evals/evalbuff/morning-report.ts new file mode 100644 index 0000000000..9682bed16e 
--- /dev/null +++ b/evals/evalbuff/morning-report.ts @@ -0,0 +1,197 @@ +import fs from 'fs' + +export interface EvalbuffLogEntry { + taskId: string + timestamp: string + oldScore: number + newScore: number | null + docEdit: { + path: string + reasoning: string + } | null + scoreComparison: 'improved' | 'same' | 'worse' | null + costUsd: number + durationMs: number + error?: string + criteriaLevel: number +} + +export interface MorningReportData { + startTime: string + endTime: string + totalIterations: number + totalCostUsd: number + totalDurationMs: number + avgOldScore: number + avgNewScore: number + docsAdded: number + docsKept: number + docsReverted: number + criteriaLevel: number + entries: EvalbuffLogEntry[] +} + +export function generateMorningReport(logPath: string): string { + if (!fs.existsSync(logPath)) { + return generateEmptyReport() + } + + const content = fs.readFileSync(logPath, 'utf-8').trim() + if (!content) { + return generateEmptyReport() + } + + const entries: EvalbuffLogEntry[] = content + .split('\n') + .filter((line) => line.trim()) + .map((line) => JSON.parse(line)) + + const data = computeReportData(entries) + return formatReport(data) +} + +function generateEmptyReport(): string { + return `# Evalbuff Morning Report + +**No iterations were run.** The log file is empty or missing. + +| Metric | Value | +|--------|-------| +| Iterations | 0 | +| Total Cost | $0.00 | +| Total Duration | 0s | +| Docs Added | 0 | +| Docs Kept | 0 | +| Criteria Level | - | +` +} + +function computeReportData(entries: EvalbuffLogEntry[]): MorningReportData { + const oldScores = entries.map((e) => e.oldScore) + const newScores = entries + .filter((e) => e.newScore !== null) + .map((e) => e.newScore!) 
+ + const docsAdded = entries.filter((e) => e.docEdit !== null).length + const docsKept = entries.filter((e) => e.scoreComparison === 'improved').length + const docsReverted = docsAdded - docsKept + + return { + startTime: entries[0]?.timestamp || '', + endTime: entries[entries.length - 1]?.timestamp || '', + totalIterations: entries.length, + totalCostUsd: entries.reduce((sum, e) => sum + e.costUsd, 0), + totalDurationMs: entries.reduce((sum, e) => sum + e.durationMs, 0), + avgOldScore: + oldScores.length > 0 + ? oldScores.reduce((a, b) => a + b, 0) / oldScores.length + : 0, + avgNewScore: + newScores.length > 0 + ? newScores.reduce((a, b) => a + b, 0) / newScores.length + : 0, + docsAdded, + docsKept, + docsReverted, + criteriaLevel: entries[entries.length - 1]?.criteriaLevel || 1, + entries, + } +} + +function formatDuration(ms: number): string { + const seconds = Math.floor(ms / 1000) + const minutes = Math.floor(seconds / 60) + const hours = Math.floor(minutes / 60) + if (hours > 0) return `${hours}h ${minutes % 60}m` + if (minutes > 0) return `${minutes}m ${seconds % 60}s` + return `${seconds}s` +} + +function formatReport(data: MorningReportData): string { + const lines: string[] = [ + '# Evalbuff Morning Report', + '', + `**Run:** ${data.startTime || 'N/A'} to ${data.endTime || 'N/A'}`, + '', + '## Summary', + '', + '| Metric | Value |', + '|--------|-------|', + `| Iterations | ${data.totalIterations} |`, + `| Total Cost | $${data.totalCostUsd.toFixed(2)} |`, + `| Total Duration | ${formatDuration(data.totalDurationMs)} |`, + `| Avg Score (before docs) | ${data.avgOldScore.toFixed(1)} |`, + `| Avg Score (after docs) | ${data.avgNewScore > 0 ? 
data.avgNewScore.toFixed(1) : 'N/A'} |`, + `| Docs Attempted | ${data.docsAdded} |`, + `| Docs Kept (improved score) | ${data.docsKept} |`, + `| Docs Reverted | ${data.docsReverted} |`, + `| Criteria Level | ${data.criteriaLevel}/5 |`, + '', + ] + + // Doc changes table + const docEntries = data.entries.filter((e) => e.docEdit !== null) + if (docEntries.length > 0) { + lines.push('## Doc Changes') + lines.push('') + lines.push('| Task | Doc Path | Score Impact | Kept? | Reasoning |') + lines.push('|------|----------|-------------|-------|-----------|') + for (const entry of docEntries) { + const impact = + entry.newScore !== null + ? `${entry.oldScore.toFixed(1)} -> ${entry.newScore.toFixed(1)}` + : 'N/A' + const kept = entry.scoreComparison === 'improved' ? 'Yes' : 'No' + const reasoning = + entry.docEdit!.reasoning.length > 60 + ? entry.docEdit!.reasoning.slice(0, 57) + '...' + : entry.docEdit!.reasoning + lines.push( + `| ${entry.taskId} | ${entry.docEdit!.path} | ${impact} | ${kept} | ${reasoning} |`, + ) + } + lines.push('') + } + + // Failed iterations + const failedEntries = data.entries.filter((e) => e.error) + if (failedEntries.length > 0) { + lines.push('## Errors') + lines.push('') + lines.push('| Task | Error |') + lines.push('|------|-------|') + for (const entry of failedEntries) { + const errorMsg = + entry.error!.length > 80 + ? entry.error!.slice(0, 77) + '...' + : entry.error! + lines.push(`| ${entry.taskId} | ${errorMsg} |`) + } + lines.push('') + } + + // Score trajectory + lines.push('## Score Trajectory') + lines.push('') + lines.push('```') + for (const entry of data.entries) { + const bar = '#'.repeat(Math.round(entry.oldScore)) + const newBar = + entry.newScore !== null + ? 
` -> ${'#'.repeat(Math.round(entry.newScore))}` + : '' + lines.push( + `${entry.taskId.padEnd(20)} ${entry.oldScore.toFixed(1).padStart(4)} ${bar}${newBar}`, + ) + } + lines.push('```') + + return lines.join('\n') +} + +export function appendLogEntry( + logPath: string, + entry: EvalbuffLogEntry, +): void { + fs.appendFileSync(logPath, JSON.stringify(entry) + '\n') +} diff --git a/evals/evalbuff/old/agents/context-agent.ts b/evals/evalbuff/old/agents/context-agent.ts new file mode 100644 index 0000000000..7fc7b8ff2c --- /dev/null +++ b/evals/evalbuff/old/agents/context-agent.ts @@ -0,0 +1,56 @@ +import type { AgentDefinition } from '@codebuff/sdk' + +export const contextAgent: AgentDefinition = { + id: 'evalbuff-context', + displayName: 'Evalbuff Context Agent', + model: 'anthropic/claude-sonnet-4.5', + toolNames: ['read_files', 'list_directory', 'code_search', 'glob', 'end_turn'], + spawnableAgents: [], + outputMode: 'last_message', + inputSchema: { + prompt: { + type: 'string', + description: 'What the user is about to work on', + }, + }, + + systemPrompt: `You are the evalbuff Context Agent. Given a description of what a developer (or AI coding agent) is about to work on, you find the most relevant files, provide background knowledge, and surface potential gotchas. + +Your output MUST be well-formatted markdown with exactly three sections: + +## Relevant Files + +A bullet list of the most relevant files, each with a bold file path and a brief summary: +- **\`path/to/file.ts\`** — What this file does and why it's relevant + +Order files by relevance (most relevant first). Include test files if relevant. + +## Background + +Provide context about the systems, patterns, and architecture involved. Reference specific files and patterns. This should help someone unfamiliar with this area of the codebase get oriented quickly. + +## Gotchas + +List potential pitfalls, non-obvious behaviors, edge cases, or things that have caused problems before. 
Be specific: +- Reference specific files, functions, or configuration +- Explain WHY something is a gotcha, not just WHAT it is +- Include environment setup requirements if relevant + +Rules: +- Use the tools available to explore the codebase. Read files, search for patterns, list directories. +- Be thorough but concise. Quality over quantity. +- If project knowledge files exist, they were provided in the context — use them. +- Output ONLY the markdown. No preamble or explanation outside the three sections.`, + + instructionsPrompt: `Find the most relevant files and context for the user's task. Use your tools: + +1. Think about what areas of the codebase are likely relevant based on the prompt. +2. List directories to understand the project structure. +3. Use code_search to find relevant patterns, imports, and definitions. +4. Read the most important files to understand them. +5. Use glob to find files matching relevant patterns. + +Then output your findings as markdown with the three required sections: Relevant Files, Background, Gotchas. + +Do NOT output anything besides the markdown. No tool calls after you start writing the markdown output.`, +} diff --git a/evals/evalbuff/old/agents/review-agent.ts b/evals/evalbuff/old/agents/review-agent.ts new file mode 100644 index 0000000000..0f149e6f38 --- /dev/null +++ b/evals/evalbuff/old/agents/review-agent.ts @@ -0,0 +1,97 @@ +import type { AgentDefinition } from '@codebuff/sdk' + +export const reviewAgent: AgentDefinition = { + id: 'evalbuff-review', + displayName: 'Evalbuff Review Agent', + model: 'anthropic/claude-sonnet-4.5', + toolNames: ['read_files', 'code_search', 'end_turn'], + spawnableAgents: [], + outputMode: 'last_message', + inputSchema: { + prompt: { + type: 'string', + description: 'The diff to review, along with optional context about the original request', + }, + }, + + systemPrompt: `You are the evalbuff Review Agent. You review code changes and provide structured, actionable feedback. 
+ +You receive a git diff and optionally the original user request that motivated the changes. Your job is to find real issues, not nitpick. + +Your output MUST be well-formatted markdown following this structure: + +## Review Summary + +Start with a one-line summary: "Reviewed N files with M lines changed. Found X critical issues, Y warnings, and Z suggestions." + +If a prompt describing the original request was provided, include a **Goal Assessment** subsection: + +### Goal Assessment + +**Prompt:** "" + +Use ✅ for things that are done correctly, ⚠️ for partial/concerning, and ❌ for missing or wrong: +- ✅ Description of what was accomplished correctly +- ⚠️ Description of concern +- ❌ Description of what's missing or wrong + +## Issues + +List issues grouped by severity. Use this format for each: + +### 🔴 Critical: + +**\`file/path.ts:line\`** + +Explanation of the issue and why it's critical. + +\`\`\`ts +// Current (problematic) +code here + +// Suggested fix +fixed code here +\`\`\` + +--- + +### 🟡 Warning: + +**\`file/path.ts:line\`** + +Explanation. + +## Suggestions + +- 💡 Suggestion with file reference and explanation. +- 💡 Another suggestion. + +## Stats + +| Metric | Value | +|--------|-------| +| Files reviewed | N | +| Lines changed | +X / -Y | +| Critical issues | N | +| Warnings | N | +| Suggestions | N | + +Rules: +- 🔴 Critical: Security vulnerabilities, data loss risks, crashes, logic errors that break functionality. +- 🟡 Warning: Missing error handling, test gaps, potential performance issues, convention violations. +- 💡 Suggestion: Style improvements, better approaches, refactoring opportunities. +- Be specific: reference exact file paths and line numbers. +- Provide code fixes for critical issues when possible. +- Use the available tools to read full files for context around the diff. +- If there are no issues, say so clearly. Don't invent problems. +- Output ONLY the markdown. 
No preamble.`, + + instructionsPrompt: `Review the provided code changes. You may use tools to read the full contents of modified files for better context. + +1. Analyze the diff carefully. +2. If file paths are mentioned in the diff, read those files to understand the full context. +3. Use code_search if you need to understand how changed functions are used elsewhere. +4. Write your review following the exact markdown format specified in your system prompt. + +Do NOT output anything besides the review markdown. No tool calls after you start writing the review.`, +} diff --git a/evals/evalbuff/old/agents/scan-agent.ts b/evals/evalbuff/old/agents/scan-agent.ts new file mode 100644 index 0000000000..bdc8cc2538 --- /dev/null +++ b/evals/evalbuff/old/agents/scan-agent.ts @@ -0,0 +1,46 @@ +import type { AgentDefinition } from '@codebuff/sdk' + +export const scanAgent: AgentDefinition = { + id: 'evalbuff-scan', + displayName: 'Evalbuff Scan Agent', + model: 'anthropic/claude-sonnet-4.5', + toolNames: ['read_files', 'list_directory', 'code_search', 'write_file', 'end_turn'], + spawnableAgents: [], + outputMode: 'last_message', + inputSchema: { + prompt: { + type: 'string', + description: 'Instructions for the scan agent', + }, + }, + + systemPrompt: `You are a project analysis agent for evalbuff. Your job is to analyze a software project and generate knowledge files that help AI coding agents understand the project. + +You will analyze the project structure, tech stack, coding conventions, and testing infrastructure, then write your findings as markdown files. + +You MUST write exactly these four files using the write_file tool: +1. \`.agents/knowledge/architecture.md\` — High-level overview: project type, directory structure, how components relate +2. \`.agents/knowledge/tech-stack.md\` — Languages, frameworks, key dependencies, build system, runtime +3. 
\`.agents/knowledge/conventions.md\` — Coding patterns observed: naming, file organization, error handling patterns +4. \`.agents/knowledge/testing.md\` — Test frameworks, test directory layout, how to run tests, CI setup + +Rules: +- ONLY write files under \`.agents/knowledge/\`. Do not write anywhere else. +- Each file should be concise but informative (aim for 50-200 lines each). +- Use markdown formatting with clear headers. +- Base your analysis on actual evidence from the codebase (config files, imports, directory structure). +- If knowledge files already exist, read them first and merge new observations rather than replacing user-curated content.`, + + instructionsPrompt: `Analyze this project thoroughly: + +1. Start by reading key configuration files (package.json, Cargo.toml, requirements.txt, pyproject.toml, build.gradle, Makefile, Dockerfile, etc. — whatever exists). +2. List the top-level directory to understand the project structure. +3. Use code_search to find patterns like import styles, error handling, test frameworks. +4. Read a few representative source files to understand coding conventions. +5. Look for CI configuration (.github/workflows/, .gitlab-ci.yml, etc.). +6. Check for existing knowledge files in \`.agents/knowledge/\` — if they exist, read them first. + +Then write all four knowledge files. Be specific and cite actual file paths and patterns you observed. 
+ +After writing all files, end your turn with a brief summary of what you found.`, +} diff --git a/evals/evalbuff/old/cli/package.json b/evals/evalbuff/old/cli/package.json new file mode 100644 index 0000000000..987856f22d --- /dev/null +++ b/evals/evalbuff/old/cli/package.json @@ -0,0 +1,24 @@ +{ + "name": "@codebuff/evalbuff", + "version": "0.1.0", + "description": "Codebase-specific evals, context, and review for AI coding agents", + "private": true, + "type": "module", + "bin": { + "evalbuff": "./src/index.ts" + }, + "scripts": { + "dev": "bun src/index.ts", + "typecheck": "tsc --noEmit -p .", + "test": "bun test" + }, + "dependencies": { + "@codebuff/sdk": "workspace:*", + "@codebuff/common": "workspace:*", + "commander": "^13.1.0", + "zod": "^4.2.1" + }, + "devDependencies": { + "@types/node": "^22.9.0" + } +} diff --git a/evals/evalbuff/old/cli/src/commands/context.ts b/evals/evalbuff/old/cli/src/commands/context.ts new file mode 100644 index 0000000000..4d96059c70 --- /dev/null +++ b/evals/evalbuff/old/cli/src/commands/context.ts @@ -0,0 +1,87 @@ +import { CodebuffClient } from '@codebuff/sdk' + +import { contextAgent } from '../../../agents/context-agent' +import { ensureAuth } from '../utils/auth' +import { readConfig } from '../utils/config' +import { readKnowledgeFiles } from '../utils/knowledge' +import { printError, printWarning, Spinner } from '../utils/output' +import { findProjectRoot } from '../utils/project' + +interface ContextOptions { + cwd?: string + maxFiles?: string + filesOnly?: boolean +} + +export async function contextCommand( + prompt: string, + options: ContextOptions, +): Promise { + try { + const apiKey = await ensureAuth() + const projectRoot = findProjectRoot(options.cwd) + + const config = readConfig(projectRoot) + if (!config) { + printWarning( + 'evalbuff not initialized. Run "evalbuff init" for better results.', + ) + } + + const maxFiles = options.maxFiles + ? parseInt(options.maxFiles, 10) + : config?.context?.maxFiles ?? 
15 + + const knowledgeFiles = readKnowledgeFiles(projectRoot) + + const spinner = new Spinner() + spinner.start('Scanning project structure...') + + const client = new CodebuffClient({ apiKey }) + + let agentPrompt = `Task: ${prompt}\n\nReturn up to ${maxFiles} relevant files.` + + if (options.filesOnly) { + agentPrompt += + '\n\nIMPORTANT: Output ONLY file paths, one per line. No markdown, no summaries, no sections. Just file paths.' + } + + let output = '' + + spinner.update('Finding relevant files...') + + const result = await client.run({ + agent: contextAgent, + prompt: agentPrompt, + cwd: projectRoot, + knowledgeFiles, + maxAgentSteps: 15, + handleStreamChunk: (chunk) => { + if (typeof chunk === 'string') { + output += chunk + } + }, + }) + + spinner.stop() + + if (result.output.type === 'error') { + printError(result.output.message) + process.exit(2) + } + + process.stdout.write(output) + if (output.length > 0 && !output.endsWith('\n')) { + process.stdout.write('\n') + } + + process.stderr.write('✓ Done\n') + } catch (error) { + printError( + error instanceof Error + ? 
error.message + : 'Failed to gather context.', + ) + process.exit(2) + } +} diff --git a/evals/evalbuff/old/cli/src/commands/init.ts b/evals/evalbuff/old/cli/src/commands/init.ts new file mode 100644 index 0000000000..dd2e045344 --- /dev/null +++ b/evals/evalbuff/old/cli/src/commands/init.ts @@ -0,0 +1,127 @@ +import fs from 'fs' +import path from 'path' +import readline from 'readline' + +import { CodebuffClient } from '@codebuff/sdk' + +import { scanAgent } from '../../../agents/scan-agent' +import { SKILL_TEMPLATE } from '../templates/skill' +import { ensureAuth } from '../utils/auth' +import { + configPath, + getDefaultConfig, + readConfig, + writeConfig, +} from '../utils/config' +import { ensureKnowledgeDir, readKnowledgeFiles } from '../utils/knowledge' +import { printError, Spinner } from '../utils/output' +import { findProjectRoot } from '../utils/project' + +interface InitOptions { + cwd?: string + skipScan?: boolean + force?: boolean +} + +function promptConfirm(question: string): Promise { + const rl = readline.createInterface({ + input: process.stdin, + output: process.stderr, + }) + return new Promise((resolve) => { + rl.question(`${question} (y/N) `, (answer) => { + rl.close() + resolve(answer.toLowerCase() === 'y') + }) + }) +} + +function installSkillFile(projectRoot: string, targetDir: string): string { + const skillPath = path.join(projectRoot, targetDir, 'evalbuff', 'SKILL.md') + const dir = path.dirname(skillPath) + if (!fs.existsSync(dir)) { + fs.mkdirSync(dir, { recursive: true }) + } + fs.writeFileSync(skillPath, SKILL_TEMPLATE) + return path.relative(projectRoot, skillPath) +} + +export async function initCommand(options: InitOptions): Promise { + try { + const apiKey = await ensureAuth() + const projectRoot = findProjectRoot(options.cwd) + + const existingConfig = readConfig(projectRoot) + if (existingConfig && !options.force) { + const shouldOverwrite = await promptConfirm( + 'evalbuff is already initialized. 
Overwrite config and skill files?', + ) + if (!shouldOverwrite) { + process.stderr.write('Aborted.\n') + return + } + } + + const config = getDefaultConfig(projectRoot) + writeConfig(projectRoot, config) + const configRelPath = path.relative(projectRoot, configPath(projectRoot)) + process.stderr.write(`✓ Created ${configRelPath}\n`) + + const agentsSkillPath = installSkillFile( + projectRoot, + '.agents/skills', + ) + process.stderr.write(`✓ Installed skill to ${agentsSkillPath}\n`) + + const claudeSkillPath = installSkillFile( + projectRoot, + '.claude/skills', + ) + process.stderr.write(`✓ Installed skill to ${claudeSkillPath}\n`) + + ensureKnowledgeDir(projectRoot) + + if (!options.skipScan) { + const spinner = new Spinner() + spinner.start('Scanning project...') + + try { + const existingKnowledge = readKnowledgeFiles(projectRoot) + + const client = new CodebuffClient({ apiKey }) + let scanPrompt = 'Analyze this project and generate knowledge files.' + if (Object.keys(existingKnowledge).length > 0) { + scanPrompt += + ' Knowledge files already exist — read them first and merge new observations rather than overwriting.' + } + + const result = await client.run({ + agent: scanAgent, + prompt: scanPrompt, + cwd: projectRoot, + knowledgeFiles: existingKnowledge, + maxAgentSteps: 20, + }) + + if (result.output.type === 'error') { + spinner.fail(`Scan failed: ${result.output.message}`) + } else { + spinner.succeed('Generated project knowledge') + } + } catch (error) { + spinner.fail( + `Scan failed: ${error instanceof Error ? error.message : String(error)}`, + ) + } + } + + process.stderr.write( + `\nEvalbuff is ready! Your coding agents will now automatically use evalbuff for context and review.\n\nTry it:\n evalbuff context "add user authentication"\n evalbuff review\n`, + ) + } catch (error) { + printError( + error instanceof Error ? 
error.message : 'Init failed.', + ) + process.exit(2) + } +} diff --git a/evals/evalbuff/old/cli/src/commands/login.ts b/evals/evalbuff/old/cli/src/commands/login.ts new file mode 100644 index 0000000000..3d4a6a0052 --- /dev/null +++ b/evals/evalbuff/old/cli/src/commands/login.ts @@ -0,0 +1,22 @@ +import { loginFlow, getUserCredentials } from '../utils/auth' +import { printError } from '../utils/output' + +export async function loginCommand(): Promise { + try { + const existing = getUserCredentials() + if (existing) { + process.stderr.write( + `Already logged in as ${existing.email}. Run "evalbuff logout" first to switch accounts.\n`, + ) + return + } + + const user = await loginFlow() + process.stderr.write(`\n✓ Logged in as ${user.email}\n`) + } catch (error) { + printError( + error instanceof Error ? error.message : 'Login failed.', + ) + process.exit(2) + } +} diff --git a/evals/evalbuff/old/cli/src/commands/logout.ts b/evals/evalbuff/old/cli/src/commands/logout.ts new file mode 100644 index 0000000000..696ac0b1ff --- /dev/null +++ b/evals/evalbuff/old/cli/src/commands/logout.ts @@ -0,0 +1,12 @@ +import { clearUserCredentials, getUserCredentials } from '../utils/auth' + +export function logoutCommand(): void { + const user = getUserCredentials() + clearUserCredentials() + + if (user) { + process.stderr.write(`✓ Logged out (was ${user.email})\n`) + } else { + process.stderr.write('Already logged out.\n') + } +} diff --git a/evals/evalbuff/old/cli/src/commands/review.ts b/evals/evalbuff/old/cli/src/commands/review.ts new file mode 100644 index 0000000000..e2653919fa --- /dev/null +++ b/evals/evalbuff/old/cli/src/commands/review.ts @@ -0,0 +1,139 @@ +import fs from 'fs' +import path from 'path' + +import { CodebuffClient } from '@codebuff/sdk' + +import { reviewAgent } from '../../../agents/review-agent' +import { ensureAuth } from '../utils/auth' +import { readConfig } from '../utils/config' +import { + getDiff, + getChangedFiles, + isGitRepo, +} from 
'../utils/git' +import { readKnowledgeFiles } from '../utils/knowledge' +import { printError, printWarning, Spinner } from '../utils/output' +import { findProjectRoot } from '../utils/project' + +interface ReviewOptions { + cwd?: string + files?: string[] + branch?: string | true + staged?: boolean + commit?: string +} + +export async function reviewCommand( + prompt: string | undefined, + options: ReviewOptions, +): Promise { + try { + const apiKey = await ensureAuth() + const projectRoot = findProjectRoot(options.cwd) + + if (!isGitRepo(projectRoot)) { + printError('Not a git repository. Run from within a git repo.') + process.exit(2) + } + + const config = readConfig(projectRoot) + if (!config) { + printWarning( + 'evalbuff not initialized. Run "evalbuff init" for better results.', + ) + } + + const defaultBranch = config?.review?.defaultBranch ?? 'main' + + const diffOptions = { + cwd: projectRoot, + files: options.files, + branch: options.branch, + staged: options.staged, + commit: options.commit, + defaultBranch, + } + + const diff = getDiff(diffOptions) + + if (!diff.trim()) { + process.stderr.write('No changes to review.\n') + process.exit(0) + } + + const changedFiles = options.files ?? 
getChangedFiles(diffOptions) + + const spinner = new Spinner() + spinner.start('Collecting diff...') + + const fileContents: Record = {} + for (const filePath of changedFiles) { + const absPath = path.join(projectRoot, filePath) + if (fs.existsSync(absPath)) { + try { + fileContents[filePath] = fs.readFileSync(absPath, 'utf8') + } catch { + // skip unreadable files + } + } + } + + const knowledgeFiles = readKnowledgeFiles(projectRoot) + + spinner.update(`Analyzing ${changedFiles.length} changed files...`) + + let agentPrompt = `## Git Diff\n\n\`\`\`diff\n${diff}\n\`\`\`\n\n` + agentPrompt += `## Changed Files (full contents)\n\n` + for (const [filePath, content] of Object.entries(fileContents)) { + agentPrompt += `### ${filePath}\n\n\`\`\`\n${content}\n\`\`\`\n\n` + } + + if (prompt) { + agentPrompt += `## Original Request\n\nThe user's original request was: "${prompt}"\n\nInclude a Goal Assessment in your review that evaluates whether the changes fulfill this intent.\n` + } + + const client = new CodebuffClient({ apiKey }) + + let output = '' + + spinner.update('Generating review...') + + const result = await client.run({ + agent: reviewAgent, + prompt: agentPrompt, + cwd: projectRoot, + knowledgeFiles, + maxAgentSteps: 10, + handleStreamChunk: (chunk) => { + if (typeof chunk === 'string') { + output += chunk + } + }, + }) + + spinner.stop() + + if (result.output.type === 'error') { + printError(result.output.message) + process.exit(2) + } + + process.stdout.write(output) + if (output.length > 0 && !output.endsWith('\n')) { + process.stdout.write('\n') + } + + process.stderr.write('✓ Done\n') + + if (output.includes('🔴')) { + process.exit(1) + } + } catch (error) { + printError( + error instanceof Error + ? 
error.message + : 'Review failed.', + ) + process.exit(2) + } +} diff --git a/evals/evalbuff/old/cli/src/index.ts b/evals/evalbuff/old/cli/src/index.ts new file mode 100644 index 0000000000..a6830a1f34 --- /dev/null +++ b/evals/evalbuff/old/cli/src/index.ts @@ -0,0 +1,82 @@ +#!/usr/bin/env bun +import { Command } from 'commander' + +import { contextCommand } from './commands/context' +import { initCommand } from './commands/init' +import { loginCommand } from './commands/login' +import { logoutCommand } from './commands/logout' +import { reviewCommand } from './commands/review' + +const program = new Command() + .name('evalbuff') + .description( + 'Codebase-specific evals, context, and review for AI coding agents', + ) + .version('0.1.0') + +program + .command('init') + .description('Initialize evalbuff in a project') + .option('--cwd ', 'Project root directory') + .option('--skip-scan', 'Skip the initial project scan') + .option('--force', 'Overwrite existing configuration without prompting') + .action(async (options) => { + await initCommand({ + cwd: options.cwd, + skipScan: options.skipScan, + force: options.force, + }) + }) + +program + .command('context') + .description('Get relevant files, knowledge, and gotchas for a task') + .argument('', 'Description of what you are about to work on') + .option('--cwd ', 'Project root directory') + .option('--max-files ', 'Maximum number of files to return') + .option('--files-only', 'Output only file paths, one per line') + .action(async (prompt: string, options) => { + await contextCommand(prompt, { + cwd: options.cwd, + maxFiles: options.maxFiles, + filesOnly: options.filesOnly, + }) + }) + +program + .command('review') + .description('Review code changes with structured feedback') + .argument('[prompt]', 'Description of the original request for goal assessment') + .option('--cwd ', 'Project root directory') + .option('--files ', 'Scope the review to specific files') + .option( + '--branch [base]', + 'Compare current 
branch against a base branch', + ) + .option('--staged', 'Review only staged changes') + .option('--commit ', 'Review a specific commit') + .action(async (prompt: string | undefined, options) => { + await reviewCommand(prompt, { + cwd: options.cwd, + files: options.files, + branch: options.branch, + staged: options.staged, + commit: options.commit, + }) + }) + +program + .command('login') + .description('Authenticate with evalbuff') + .action(async () => { + await loginCommand() + }) + +program + .command('logout') + .description('Clear stored credentials') + .action(() => { + logoutCommand() + }) + +program.parse() diff --git a/evals/evalbuff/old/cli/src/templates/skill.ts b/evals/evalbuff/old/cli/src/templates/skill.ts new file mode 100644 index 0000000000..f666241a84 --- /dev/null +++ b/evals/evalbuff/old/cli/src/templates/skill.ts @@ -0,0 +1,45 @@ +export const SKILL_TEMPLATE = `--- +name: evalbuff +description: Use evalbuff to get project context before coding and review changes before committing +--- + +# Evalbuff + +This project uses evalbuff for AI-assisted context gathering and change review. + +## Before Starting a Task + +Run evalbuff to get oriented before making changes: + + evalbuff context "" + +This returns: +- **Relevant files** with summaries — so you know what to read +- **Background knowledge** about the systems involved +- **Gotchas and lessons** from past work — so you avoid known pitfalls + +Use this output to inform which files to read and what to watch out for. + +## After Making Changes + +Run evalbuff to review your changes before considering the task complete. 
Include a description of what the user originally asked for so the reviewer can verify the changes match the intent: + + evalbuff review "" + +This returns structured feedback including: +- 🔴 **Critical issues** that must be fixed +- 🟡 **Warnings** that should be addressed +- 💡 **Suggestions** for improvement +- Whether the changes actually accomplish the stated goal + +If there are critical issues (🔴), fix them and re-run the review. +If there are only warnings and suggestions, use your judgment. + +## Tips + +- Always run \`evalbuff context\` first — it often surfaces non-obvious files and gotchas. +- Always pass the user's original request to \`evalbuff review\` — this helps catch missing requirements and verify the changes match intent. +- Run \`evalbuff review\` even for small changes — it catches things like missing error handling, test gaps, and convention violations. +- You can review specific files: \`evalbuff review "add auth" --files src/auth.ts src/db.ts\` +- You can review staged changes only: \`evalbuff review "fix login bug" --staged\` +` diff --git a/evals/evalbuff/old/cli/src/utils/auth.ts b/evals/evalbuff/old/cli/src/utils/auth.ts new file mode 100644 index 0000000000..e20d299c22 --- /dev/null +++ b/evals/evalbuff/old/cli/src/utils/auth.ts @@ -0,0 +1,188 @@ +import fs from 'fs' +import os from 'os' +import path from 'path' +import { execSync } from 'child_process' + +import { WEBSITE_URL } from '@codebuff/sdk' +import { z } from 'zod' + +const EVALBUFF_API_KEY_ENV_VAR = 'EVALBUFF_API_KEY' + +const userSchema = z.object({ + name: z.string(), + email: z.string(), + authToken: z.string(), + fingerprintId: z.string().optional(), + fingerprintHash: z.string().optional(), +}) + +type User = z.infer + +const credentialsSchema = z.object({ + default: userSchema.optional(), +}) + +export function getConfigDir(): string { + return path.join(os.homedir(), '.config', 'evalbuff') +} + +export function getCredentialsPath(): string { + return 
path.join(getConfigDir(), 'credentials.json') +} + +export function getUserCredentials(): User | null { + const credentialsPath = getCredentialsPath() + if (!fs.existsSync(credentialsPath)) return null + + try { + const raw = fs.readFileSync(credentialsPath, 'utf8') + const parsed = credentialsSchema.parse(JSON.parse(raw)) + return parsed.default ?? null + } catch { + return null + } +} + +export function getAuthToken(): string | undefined { + const envToken = process.env[EVALBUFF_API_KEY_ENV_VAR] + if (envToken) return envToken + + const user = getUserCredentials() + return user?.authToken +} + +export function saveUserCredentials(user: User): void { + const configDir = getConfigDir() + const credentialsPath = getCredentialsPath() + + if (!fs.existsSync(configDir)) { + fs.mkdirSync(configDir, { recursive: true }) + } + + let existing: Record = {} + if (fs.existsSync(credentialsPath)) { + try { + existing = JSON.parse(fs.readFileSync(credentialsPath, 'utf8')) + } catch { + // ignore + } + } + + fs.writeFileSync( + credentialsPath, + JSON.stringify({ ...existing, default: user }, null, 2), + ) +} + +export function clearUserCredentials(): void { + const credentialsPath = getCredentialsPath() + if (!fs.existsSync(credentialsPath)) return + + try { + const { default: _, ...rest } = JSON.parse( + fs.readFileSync(credentialsPath, 'utf8'), + ) + if (Object.keys(rest).length === 0) { + fs.unlinkSync(credentialsPath) + } else { + fs.writeFileSync(credentialsPath, JSON.stringify(rest, null, 2)) + } + } catch { + // ignore + } +} + +function generateFingerprintId(): string { + return `evalbuff-${Math.random().toString(36).substring(2, 15)}` +} + +function openBrowser(url: string): void { + try { + const platform = process.platform + if (platform === 'darwin') { + execSync(`open ${JSON.stringify(url)}`, { stdio: 'ignore' }) + } else if (platform === 'linux') { + execSync(`xdg-open ${JSON.stringify(url)}`, { stdio: 'ignore' }) + } else if (platform === 'win32') { + 
execSync(`start ${JSON.stringify(url)}`, { stdio: 'ignore' }) + } + } catch { + // Browser open failed, user will need to copy the URL + } +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)) +} + +export async function loginFlow(): Promise { + const fingerprintId = generateFingerprintId() + + const codeResponse = await fetch(`${WEBSITE_URL}/api/auth/cli/code`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ fingerprintId }), + }) + + if (!codeResponse.ok) { + throw new Error('Failed to initiate login. Check your internet connection.') + } + + const { loginUrl, fingerprintHash, expiresAt } = (await codeResponse.json()) as { + loginUrl: string + fingerprintHash: string + expiresAt: string + } + + process.stderr.write(`\nOpen this URL to log in:\n\n ${loginUrl}\n\n`) + process.stderr.write('Waiting for authentication...\n') + openBrowser(loginUrl) + + const startTime = Date.now() + const timeoutMs = 5 * 60 * 1000 + const pollIntervalMs = 5000 + + while (Date.now() - startTime < timeoutMs) { + await sleep(pollIntervalMs) + + try { + const params = new URLSearchParams({ + fingerprintId, + fingerprintHash, + expiresAt, + }) + const statusResponse = await fetch( + `${WEBSITE_URL}/api/auth/cli/status?${params}`, + ) + + if (statusResponse.ok) { + const data = (await statusResponse.json()) as { + user?: Record + } + if (data.user) { + const user: User = { + name: String(data.user.name ?? ''), + email: String(data.user.email ?? ''), + authToken: String(data.user.authToken ?? ''), + fingerprintId, + fingerprintHash, + } + saveUserCredentials(user) + return user + } + } + } catch { + // Network error during polling, continue + } + } + + throw new Error('Login timed out. 
Please try again.') +} + +export async function ensureAuth(): Promise { + const token = getAuthToken() + if (token) return token + + const user = await loginFlow() + return user.authToken +} diff --git a/evals/evalbuff/old/cli/src/utils/config.ts b/evals/evalbuff/old/cli/src/utils/config.ts new file mode 100644 index 0000000000..f07e997321 --- /dev/null +++ b/evals/evalbuff/old/cli/src/utils/config.ts @@ -0,0 +1,119 @@ +import fs from 'fs' +import path from 'path' + +import { z } from 'zod' + +const CONFIG_PATH = '.agents/evals/evalbuff.json' + +const evalbuffConfigSchema = z.object({ + version: z.number(), + project: z + .object({ + name: z.string().optional(), + description: z.string().optional(), + }) + .optional(), + context: z + .object({ + maxFiles: z.number().optional(), + excludePatterns: z.array(z.string()).optional(), + }) + .optional(), + review: z + .object({ + defaultBranch: z.string().optional(), + }) + .optional(), +}) + +export type EvalbuffConfig = z.infer + +export function configPath(projectRoot: string): string { + return path.join(projectRoot, CONFIG_PATH) +} + +export function readConfig(projectRoot: string): EvalbuffConfig | null { + const filePath = configPath(projectRoot) + if (!fs.existsSync(filePath)) return null + + try { + const raw = JSON.parse(fs.readFileSync(filePath, 'utf8')) + return evalbuffConfigSchema.parse(raw) + } catch (error) { + process.stderr.write( + `Warning: Failed to parse evalbuff.json: ${error instanceof Error ? error.message : String(error)}. 
Using defaults.\n`, + ) + return null + } +} + +export function writeConfig( + projectRoot: string, + config: EvalbuffConfig, +): void { + const filePath = configPath(projectRoot) + const dir = path.dirname(filePath) + if (!fs.existsSync(dir)) { + fs.mkdirSync(dir, { recursive: true }) + } + fs.writeFileSync(filePath, JSON.stringify(config, null, 2) + '\n') +} + +export function detectProjectName(projectRoot: string): string { + const pkgPath = path.join(projectRoot, 'package.json') + if (fs.existsSync(pkgPath)) { + try { + const pkg = JSON.parse(fs.readFileSync(pkgPath, 'utf8')) + if (typeof pkg.name === 'string' && pkg.name) return pkg.name + } catch { + // ignore + } + } + + const pyprojectPath = path.join(projectRoot, 'pyproject.toml') + if (fs.existsSync(pyprojectPath)) { + try { + const content = fs.readFileSync(pyprojectPath, 'utf8') + const nameMatch = content.match(/^name\s*=\s*"([^"]+)"/m) + if (nameMatch) return nameMatch[1] + } catch { + // ignore + } + } + + return path.basename(projectRoot) +} + +export function detectProjectDescription(projectRoot: string): string { + const pkgPath = path.join(projectRoot, 'package.json') + if (fs.existsSync(pkgPath)) { + try { + const pkg = JSON.parse(fs.readFileSync(pkgPath, 'utf8')) + if (typeof pkg.description === 'string' && pkg.description) + return pkg.description + } catch { + // ignore + } + } + return '' +} + +export function getDefaultConfig(projectRoot: string): EvalbuffConfig { + const name = detectProjectName(projectRoot) + const description = detectProjectDescription(projectRoot) + + return { + version: 1, + project: { + name, + ...(description && { description }), + }, + context: { + maxFiles: 15, + excludePatterns: ['dist/**', 'node_modules/**', '*.generated.ts'], + }, + review: { + defaultBranch: 'main', + }, + } +} diff --git a/evals/evalbuff/old/cli/src/utils/git.ts b/evals/evalbuff/old/cli/src/utils/git.ts new file mode 100644 index 0000000000..7eab0a44f4 --- /dev/null +++ 
b/evals/evalbuff/old/cli/src/utils/git.ts @@ -0,0 +1,110 @@ +import { execSync } from 'child_process' + +export function isGitRepo(cwd: string): boolean { + try { + execSync('git rev-parse --is-inside-work-tree', { + cwd, + stdio: 'pipe', + }) + return true + } catch { + return false + } +} + +export function getGitRoot(cwd: string): string | null { + try { + return execSync('git rev-parse --show-toplevel', { + cwd, + stdio: 'pipe', + encoding: 'utf8', + }).trim() + } catch { + return null + } +} + +export function getDefaultBranch(cwd: string): string { + try { + const result = execSync( + 'git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null || echo refs/remotes/origin/main', + { cwd, stdio: 'pipe', encoding: 'utf8' }, + ).trim() + return result.replace('refs/remotes/origin/', '') + } catch { + return 'main' + } +} + +export interface DiffOptions { + cwd: string + files?: string[] + branch?: string | true + staged?: boolean + commit?: string + defaultBranch?: string +} + +export function getDiff(options: DiffOptions): string { + const { cwd, files, branch, staged, commit, defaultBranch = 'main' } = options + + let cmd: string + + if (commit) { + cmd = `git diff ${commit}~1 ${commit}` + } else if (branch !== undefined) { + const baseBranch = typeof branch === 'string' ? 
branch : defaultBranch + const mergeBase = execSync(`git merge-base ${baseBranch} HEAD`, { + cwd, + stdio: 'pipe', + encoding: 'utf8', + }).trim() + cmd = `git diff ${mergeBase} HEAD` + } else if (staged) { + cmd = 'git diff --cached' + } else { + cmd = 'git diff HEAD' + } + + if (files && files.length > 0) { + cmd += ' -- ' + files.map((f) => JSON.stringify(f)).join(' ') + } + + try { + return execSync(cmd, { cwd, stdio: 'pipe', encoding: 'utf8', maxBuffer: 10 * 1024 * 1024 }) + } catch { + return '' + } +} + +export function getChangedFiles(options: DiffOptions): string[] { + const { cwd, branch, staged, commit, defaultBranch = 'main' } = options + + let cmd: string + + if (commit) { + cmd = `git diff --name-only ${commit}~1 ${commit}` + } else if (branch !== undefined) { + const baseBranch = typeof branch === 'string' ? branch : defaultBranch + const mergeBase = execSync(`git merge-base ${baseBranch} HEAD`, { + cwd, + stdio: 'pipe', + encoding: 'utf8', + }).trim() + cmd = `git diff --name-only ${mergeBase} HEAD` + } else if (staged) { + cmd = 'git diff --cached --name-only' + } else { + cmd = 'git diff HEAD --name-only' + } + + try { + const result = execSync(cmd, { cwd, stdio: 'pipe', encoding: 'utf8' }) + return result + .trim() + .split('\n') + .filter((f) => f.length > 0) + } catch { + return [] + } +} diff --git a/evals/evalbuff/old/cli/src/utils/knowledge.ts b/evals/evalbuff/old/cli/src/utils/knowledge.ts new file mode 100644 index 0000000000..76718c3570 --- /dev/null +++ b/evals/evalbuff/old/cli/src/utils/knowledge.ts @@ -0,0 +1,50 @@ +import fs from 'fs' +import path from 'path' + +const KNOWLEDGE_DIR = '.agents/knowledge' + +export function knowledgeDir(projectRoot: string): string { + return path.join(projectRoot, KNOWLEDGE_DIR) +} + +export function ensureKnowledgeDir(projectRoot: string): void { + const dir = knowledgeDir(projectRoot) + if (!fs.existsSync(dir)) { + fs.mkdirSync(dir, { recursive: true }) + } +} + +export function readKnowledgeFiles( + 
projectRoot: string, +): Record { + const dir = knowledgeDir(projectRoot) + if (!fs.existsSync(dir)) return {} + + const files: Record = {} + try { + const entries = fs.readdirSync(dir) + for (const entry of entries) { + if (!entry.endsWith('.md')) continue + const filePath = path.join(dir, entry) + try { + files[path.join(KNOWLEDGE_DIR, entry)] = fs.readFileSync( + filePath, + 'utf8', + ) + } catch { + // skip unreadable files + } + } + } catch { + // directory doesn't exist or can't be read + } + + return files +} + +export const KNOWLEDGE_FILE_NAMES = [ + 'architecture.md', + 'tech-stack.md', + 'conventions.md', + 'testing.md', +] as const diff --git a/evals/evalbuff/old/cli/src/utils/output.ts b/evals/evalbuff/old/cli/src/utils/output.ts new file mode 100644 index 0000000000..ea4f61d372 --- /dev/null +++ b/evals/evalbuff/old/cli/src/utils/output.ts @@ -0,0 +1,62 @@ +const SPINNER_FRAMES = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'] +const SPINNER_INTERVAL_MS = 80 + +export function isTTY(): boolean { + return process.stderr.isTTY === true +} + +export class Spinner { + private frameIndex = 0 + private timer: ReturnType | null = null + private currentMessage = '' + + start(message: string): void { + this.currentMessage = message + if (!isTTY()) return + + this.render() + this.timer = setInterval(() => { + this.frameIndex = (this.frameIndex + 1) % SPINNER_FRAMES.length + this.render() + }, SPINNER_INTERVAL_MS) + } + + update(message: string): void { + this.currentMessage = message + if (!isTTY()) return + this.render() + } + + stop(): void { + if (this.timer) { + clearInterval(this.timer) + this.timer = null + } + if (isTTY()) { + process.stderr.write('\r\x1b[K') + } + } + + succeed(message: string): void { + this.stop() + process.stderr.write(`✓ ${message}\n`) + } + + fail(message: string): void { + this.stop() + process.stderr.write(`✗ ${message}\n`) + } + + private render(): void { + const frame = SPINNER_FRAMES[this.frameIndex] + 
process.stderr.write(`\r\x1b[K${frame} ${this.currentMessage}`) + } +} + +export function printError(message: string): void { + process.stderr.write(`Error: ${message}\n`) +} + +export function printWarning(message: string): void { + process.stderr.write(`Warning: ${message}\n`) +} diff --git a/evals/evalbuff/old/cli/src/utils/project.ts b/evals/evalbuff/old/cli/src/utils/project.ts new file mode 100644 index 0000000000..7d32f6e074 --- /dev/null +++ b/evals/evalbuff/old/cli/src/utils/project.ts @@ -0,0 +1,9 @@ +import path from 'path' + +import { getGitRoot } from './git' + +export function findProjectRoot(cwd?: string): string { + const startDir = cwd ? path.resolve(cwd) : process.cwd() + const gitRoot = getGitRoot(startDir) + return gitRoot ?? startDir +} diff --git a/evals/evalbuff/old/cli/tsconfig.json b/evals/evalbuff/old/cli/tsconfig.json new file mode 100644 index 0000000000..30b7a1ec13 --- /dev/null +++ b/evals/evalbuff/old/cli/tsconfig.json @@ -0,0 +1,12 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "types": ["bun-types"], + "skipLibCheck": true, + "paths": { + "@codebuff/sdk": ["../../sdk/src/index.ts"] + } + }, + "include": ["src", "../agents"], + "exclude": ["node_modules", "dist"] +} diff --git a/evals/evalbuff/run-evalbuff.ts b/evals/evalbuff/run-evalbuff.ts new file mode 100644 index 0000000000..d7f0405db4 --- /dev/null +++ b/evals/evalbuff/run-evalbuff.ts @@ -0,0 +1,428 @@ +import { execSync } from 'child_process' +import fs from 'fs' +import path from 'path' + +import { CodebuffClient } from '@codebuff/sdk' + +import { runCliAgent } from './cli-runner' +import { + getCriteriaForLevel, + loadCriteria, + maybePromoteCriteria, + saveCriteria, +} from './criteria' +import { + analyzeFailure, + applyDocEdit, + compareScores, + readCurrentDocs, +} from './docs-optimizer' +import { judgeCommitResult } from './judge' +import { + appendLogEntry, + generateMorningReport, +} from './morning-report' +import { withTestRepo } from 
'./test-repo-utils' + +import type { QualityCriteria } from './criteria' +import type { EvalbuffLogEntry } from './morning-report' +import type { EvalCommitV2, EvalDataV2 } from './types' + +export interface EvalbuffOptions { + repoPath: string + agentCommand: string + evalDataPaths: string[] + maxIterations: number + maxCostUsd: number + scoreThreshold: number + agentTimeoutMs: number + criteriaPath?: string +} + +interface EvalbuffState { + completedTaskIds: string[] + totalCostUsd: number + recentScores: number[] +} + +function loadState(statePath: string): EvalbuffState { + if (fs.existsSync(statePath)) { + return JSON.parse(fs.readFileSync(statePath, 'utf-8')) + } + return { completedTaskIds: [], totalCostUsd: 0, recentScores: [] } +} + +function saveState(statePath: string, state: EvalbuffState): void { + fs.writeFileSync(statePath, JSON.stringify(state, null, 2)) +} + +function loadEvalTasks(evalDataPaths: string[]): Array<{ + task: EvalCommitV2 + evalData: EvalDataV2 +}> { + const tasks: Array<{ task: EvalCommitV2; evalData: EvalDataV2 }> = [] + for (const evalPath of evalDataPaths) { + const evalData: EvalDataV2 = JSON.parse( + fs.readFileSync(evalPath, 'utf-8'), + ) + for (const commit of evalData.evalCommits) { + tasks.push({ task: commit, evalData }) + } + } + return tasks +} + +function copyDocsIntoRepo( + sourceRepoPath: string, + targetRepoPath: string, +): void { + const sourceDocsDir = path.join(sourceRepoPath, 'docs') + const sourceAgentsMd = path.join(sourceRepoPath, 'AGENTS.md') + const targetDocsDir = path.join(targetRepoPath, 'docs') + const targetAgentsMd = path.join(targetRepoPath, 'AGENTS.md') + + if (fs.existsSync(sourceDocsDir)) { + fs.cpSync(sourceDocsDir, targetDocsDir, { recursive: true }) + } + if (fs.existsSync(sourceAgentsMd)) { + fs.cpSync(sourceAgentsMd, targetAgentsMd) + } +} + +function getContextFiles( + repoDir: string, + commit: EvalCommitV2, +): Record { + const contextFiles: Record = {} + const contextFilePaths = new Set([ 
+ ...commit.supplementalFiles, + ...commit.fileDiffs.map((fd) => fd.path), + ]) + for (const { status, path: filePath } of commit.fileDiffs) { + if (status === 'added') contextFilePaths.delete(filePath) + } + + for (const filePath of contextFilePaths) { + try { + const content = execSync( + `git show ${commit.parentSha}:${JSON.stringify(filePath)}`, + { cwd: repoDir, encoding: 'utf-8', maxBuffer: 10 * 1024 * 1024 }, + ) + contextFiles[filePath] = content + } catch { + contextFiles[filePath] = '' + } + } + return contextFiles +} + +export async function runEvalbuff(options: EvalbuffOptions): Promise { + const { + repoPath, + agentCommand, + evalDataPaths, + maxIterations, + maxCostUsd, + scoreThreshold, + agentTimeoutMs, + criteriaPath, + } = options + + const statePath = path.join(repoPath, 'evalbuff-state.json') + const logPath = path.join(repoPath, 'evalbuff-log.jsonl') + const defaultCriteriaPath = + criteriaPath || path.join(repoPath, 'evalbuff-criteria.json') + + const state = loadState(statePath) + let criteria = loadCriteria(defaultCriteriaPath) + const tasks = loadEvalTasks(evalDataPaths) + + const client = new CodebuffClient({}) + + console.log(`Evalbuff starting:`) + console.log(` Repo: ${repoPath}`) + console.log(` Agent: ${agentCommand}`) + console.log(` Tasks: ${tasks.length}`) + console.log(` Max iterations: ${maxIterations}`) + console.log(` Max cost: $${maxCostUsd}`) + console.log(` Score threshold: ${scoreThreshold}`) + console.log(` Criteria level: ${criteria.level}/5`) + console.log(` Completed: ${state.completedTaskIds.length} tasks`) + + let iterations = 0 + + for (const { task, evalData } of tasks) { + // Budget checks + if (iterations >= maxIterations) { + console.log(`Reached max iterations (${maxIterations}). Stopping.`) + break + } + if (state.totalCostUsd >= maxCostUsd) { + console.log( + `Reached max cost ($${state.totalCostUsd.toFixed(2)} >= $${maxCostUsd}). 
Stopping.`, + ) + break + } + + // Skip completed tasks + if (state.completedTaskIds.includes(task.id)) { + console.log(`Skipping completed task: ${task.id}`) + continue + } + + iterations++ + const iterationStart = Date.now() + console.log( + `\n${'='.repeat(60)}\n[${iterations}/${maxIterations}] Task: ${task.id}\n${'='.repeat(60)}`, + ) + + let logEntry: EvalbuffLogEntry = { + taskId: task.id, + timestamp: new Date().toISOString(), + oldScore: 0, + newScore: null, + docEdit: null, + scoreComparison: null, + costUsd: 0, + durationMs: 0, + criteriaLevel: criteria.level, + } + + try { + // Step 1: Run agent with current docs + console.log(`Running agent on task ${task.id}...`) + const oldResult = await withTestRepo( + { + repoUrl: evalData.repoUrl, + parentSha: task.parentSha, + initCommand: evalData.initCommand, + env: evalData.env, + }, + async (repoDir) => { + // Copy current docs into the test repo + copyDocsIntoRepo(repoPath, repoDir) + + const result = await runCliAgent({ + command: agentCommand, + prompt: task.prompt, + cwd: repoDir, + timeoutMs: agentTimeoutMs, + env: evalData.env, + }) + + const contextFiles = getContextFiles(repoDir, task) + + return { ...result, contextFiles } + }, + ) + + // Judge the result + console.log(`Judging result...`) + const oldJudging = await judgeCommitResult({ + client, + commit: task, + contextFiles: oldResult.contextFiles, + agentDiff: oldResult.diff, + error: oldResult.exitCode !== 0 ? oldResult.stderr : undefined, + criteria, + }) + + logEntry.oldScore = oldJudging.overallScore + logEntry.costUsd += oldResult.durationMs * 0.001 // rough estimate + + console.log(`Score: ${oldJudging.overallScore.toFixed(1)}/10`) + + // Step 2: If score is low, try to improve docs + if (oldJudging.overallScore < scoreThreshold) { + console.log(`Score below threshold (${scoreThreshold}). 
Analyzing failure...`) + + const groundTruthDiff = task.fileDiffs + .map(({ path: p, diff }) => `--- ${p}\n${diff}`) + .join('\n\n') + + const currentDocs = readCurrentDocs(repoPath) + + const docSuggestion = await analyzeFailure({ + client, + judgeResult: oldJudging, + taskPrompt: task.prompt, + agentDiff: oldResult.diff, + groundTruthDiff, + currentDocs, + scoreThreshold, + }) + + if (docSuggestion) { + console.log( + `Doc suggestion: ${docSuggestion.suggestedDocPath} - ${docSuggestion.reasoning}`, + ) + logEntry.docEdit = { + path: docSuggestion.suggestedDocPath, + reasoning: docSuggestion.reasoning, + } + + // Re-run with updated docs on a FRESH repo + console.log(`Re-running agent with new doc...`) + const newResult = await withTestRepo( + { + repoUrl: evalData.repoUrl, + parentSha: task.parentSha, + initCommand: evalData.initCommand, + env: evalData.env, + }, + async (freshRepoDir) => { + // Copy existing docs + new doc + copyDocsIntoRepo(repoPath, freshRepoDir) + applyDocEdit( + freshRepoDir, + docSuggestion.suggestedDocPath, + docSuggestion.suggestedContent, + ) + + const result = await runCliAgent({ + command: agentCommand, + prompt: task.prompt, + cwd: freshRepoDir, + timeoutMs: agentTimeoutMs, + env: evalData.env, + }) + + const contextFiles = getContextFiles(freshRepoDir, task) + return { ...result, contextFiles } + }, + ) + + // Judge the new result + const newJudging = await judgeCommitResult({ + client, + commit: task, + contextFiles: newResult.contextFiles, + agentDiff: newResult.diff, + error: newResult.exitCode !== 0 ? 
newResult.stderr : undefined, + criteria, + }) + + logEntry.newScore = newJudging.overallScore + logEntry.costUsd += newResult.durationMs * 0.001 + logEntry.scoreComparison = compareScores( + oldJudging.overallScore, + newJudging.overallScore, + ) + + console.log( + `New score: ${newJudging.overallScore.toFixed(1)}/10 (${logEntry.scoreComparison})`, + ) + + // Keep doc if it improved + if (logEntry.scoreComparison === 'improved') { + console.log(`Keeping doc edit: ${docSuggestion.suggestedDocPath}`) + applyDocEdit( + repoPath, + docSuggestion.suggestedDocPath, + docSuggestion.suggestedContent, + ) + + // Commit the doc change + try { + execSync('git add docs/ AGENTS.md', { + cwd: repoPath, + stdio: 'ignore', + }) + execSync( + `git commit -m "evalbuff: add docs for ${task.id}"`, + { + cwd: repoPath, + stdio: 'ignore', + }, + ) + } catch { + console.warn('Failed to commit doc change (may have no changes)') + } + } else { + console.log(`Reverting doc edit (${logEntry.scoreComparison})`) + } + } + } + + // Update scores tracking + state.recentScores.push( + logEntry.newScore !== null ? logEntry.newScore : logEntry.oldScore, + ) + + // Check criteria promotion + const newLevel = maybePromoteCriteria(criteria, state.recentScores) + if (newLevel !== criteria.level) { + criteria = { + ...criteria, + level: newLevel, + criteria: getCriteriaForLevel(newLevel), + } + saveCriteria(defaultCriteriaPath, criteria) + logEntry.criteriaLevel = newLevel + } + } catch (error) { + const errorMsg = + error instanceof Error ? 
error.message : String(error) + console.error(`Error on task ${task.id}:`, errorMsg) + logEntry.error = errorMsg + } + + logEntry.durationMs = Date.now() - iterationStart + state.totalCostUsd += logEntry.costUsd + state.completedTaskIds.push(task.id) + + // Persist state and log + appendLogEntry(logPath, logEntry) + saveState(statePath, state) + } + + // Generate morning report + console.log('\nGenerating morning report...') + const report = generateMorningReport(logPath) + + const reportPath = path.join( + repoPath, + `evalbuff-report-${new Date().toISOString().slice(0, 10)}.md`, + ) + fs.writeFileSync(reportPath, report) + console.log(`Morning report written to: ${reportPath}`) + console.log(report) +} + +// CLI entry point +async function main() { + const args = process.argv.slice(2) + const getArg = (name: string, defaultValue?: string): string => { + const idx = args.indexOf(`--${name}`) + if (idx >= 0 && idx + 1 < args.length) return args[idx + 1] + if (defaultValue !== undefined) return defaultValue + throw new Error(`Missing required argument: --${name}`) + } + + const repoPath = getArg('repo') + const agentCommand = getArg('agent') + const evalDataPaths = getArg('evals').split(',') + const maxIterations = parseInt(getArg('max-iterations', '50')) + const maxCostUsd = parseFloat(getArg('max-cost', '50')) + const scoreThreshold = parseFloat(getArg('score-threshold', '7.0')) + const agentTimeoutMs = parseInt(getArg('agent-timeout', '300000')) + const criteriaPath = args.includes('--criteria') + ? 
getArg('criteria') + : undefined + + await runEvalbuff({ + repoPath, + agentCommand, + evalDataPaths, + maxIterations, + maxCostUsd, + scoreThreshold, + agentTimeoutMs, + criteriaPath, + }) +} + +main().catch((error) => { + console.error('Evalbuff failed:', error) + process.exit(1) +}) diff --git a/evals/evalbuff/runners/claude.ts b/evals/evalbuff/runners/claude.ts new file mode 100644 index 0000000000..1ecd200567 --- /dev/null +++ b/evals/evalbuff/runners/claude.ts @@ -0,0 +1,176 @@ +import { execSync, spawn } from 'child_process' + +import type { Runner, RunnerResult, AgentStep } from './runner' +import type { + PrintModeToolCall, + PrintModeToolResult, +} from '@codebuff/common/types/print-mode' + +export class ClaudeRunner implements Runner { + private cwd: string + private env: Record + + constructor(cwd: string, env: Record = {}) { + this.cwd = cwd + this.env = env + } + + async run(prompt: string): Promise { + const steps: AgentStep[] = [] + let totalCostUsd = 0 + + return new Promise((resolve, reject) => { + const args = [ + '-p', + prompt, + '--output-format', + 'stream-json', + '--verbose', + '--dangerously-skip-permissions', + '--model', + 'claude-opus-4-5-20251101', + ] + + console.log(`[ClaudeRunner] Running: claude ${args.join(' ')}`) + + const child = spawn('claude', args, { + cwd: this.cwd, + env: { + ...process.env, + ...this.env, + // Ensure ANTHROPIC_API_KEY is set from CLAUDE_CODE_KEY if available + ANTHROPIC_API_KEY: + process.env.CLAUDE_CODE_KEY || process.env.ANTHROPIC_API_KEY, + }, + // Use 'ignore' for stdin to prevent the CLI from waiting for input + stdio: ['ignore', 'pipe', 'pipe'], + }) + + let _stdout = '' + let stderr = '' + let responseText = '' + let toolCalls: PrintModeToolCall[] = [] + let toolResults: PrintModeToolResult[] = [] + + function flushStep() { + if (responseText.length > 0) { + steps.push({ type: 'text', text: responseText }) + } + for (const call of toolCalls) { + steps.push(call) + } + for (const result of 
toolResults) { + steps.push(result) + } + responseText = '' + toolCalls = [] + toolResults = [] + } + + child.stdout.on('data', (data: Buffer) => { + const chunk = data.toString() + _stdout += chunk + + // Parse streaming JSON output from Claude CLI + const lines = chunk.split('\n').filter((line) => line.trim()) + for (const line of lines) { + try { + const event = JSON.parse(line) + + if (event.type === 'assistant') { + if (event.message?.content) { + for (const content of event.message.content) { + if (content.type === 'text') { + if (toolResults.length > 0) { + flushStep() + } + responseText += content.text + process.stdout.write(content.text) + } else if (content.type === 'tool_use') { + toolCalls.push({ + type: 'tool_call', + toolName: content.name, + toolCallId: content.id, + input: content.input || {}, + }) + } + } + } + } else if (event.type === 'user') { + if (event.message?.content) { + for (const content of event.message.content) { + if (content.type === 'tool_result') { + toolResults.push({ + type: 'tool_result', + toolName: 'unknown', + toolCallId: content.tool_use_id, + output: [ + { + type: 'json', + value: + typeof content.content === 'string' + ? content.content + : content.content, + }, + ], + }) + } + } + } + } else if (event.type === 'result') { + if (event.total_cost_usd) { + totalCostUsd += event.total_cost_usd + } + } + } catch { + // Not JSON, might be plain text output + responseText += line + } + } + }) + + child.stderr.on('data', (data: Buffer) => { + stderr += data.toString() + process.stderr.write(data) + }) + + child.on('error', (error) => { + reject( + new Error( + `Claude CLI failed to start: ${error.message}. 
Make sure 'claude' is installed and in PATH.`, + ), + ) + }) + + child.on('close', (code) => { + flushStep() + + // Get git diff after Claude has made changes + let diff = '' + try { + execSync('git add .', { cwd: this.cwd, stdio: 'ignore' }) + diff = execSync('git diff HEAD', { + cwd: this.cwd, + encoding: 'utf-8', + maxBuffer: 10 * 1024 * 1024, + }) + } catch { + // Ignore git errors + } + + if (code !== 0) { + reject( + new Error(`Claude CLI exited with code ${code}. stderr: ${stderr}`), + ) + return + } + + resolve({ + steps, + totalCostUsd, + diff, + }) + }) + }) + } +} diff --git a/evals/evalbuff/runners/codebuff.ts b/evals/evalbuff/runners/codebuff.ts new file mode 100644 index 0000000000..867b95ee1a --- /dev/null +++ b/evals/evalbuff/runners/codebuff.ts @@ -0,0 +1,139 @@ +import { execSync } from 'child_process' +import fs from 'fs' +import path from 'path' + +import type { Runner, RunnerResult, AgentStep } from './runner' +import type { CodebuffClient } from '@codebuff/sdk' + + +const DEBUG_ERROR = true + +export class CodebuffRunner implements Runner { + private cwd: string + private env?: Record + private client: CodebuffClient + private agentId: string + private localAgentDefinitions: any[] + private printEvents: boolean + private commitId: string + private parentSha: string + + constructor(options: { + cwd: string + env?: Record + client: CodebuffClient + agentId: string + localAgentDefinitions: any[] + printEvents: boolean + commitId: string + parentSha: string + }) { + this.cwd = options.cwd + this.env = options.env + this.client = options.client + this.agentId = options.agentId + this.localAgentDefinitions = options.localAgentDefinitions + this.printEvents = options.printEvents + this.commitId = options.commitId + this.parentSha = options.parentSha + } + + async run(prompt: string): Promise { + const steps: AgentStep[] = [] + let totalCostUsd = 0 + + const maxAgentSteps = 40 + const result = await this.client.run({ + agent: this.agentId, + prompt, + 
agentDefinitions: this.localAgentDefinitions, + cwd: this.cwd, + env: this.env, + maxAgentSteps, + handleEvent: (event) => { + if ( + (event.type === 'tool_call' || event.type === 'tool_result') && + event.toolName === 'set_messages' + ) { + return + } + if (event.type === 'error') { + console.error( + `[${this.commitId}:${this.agentId}] Error event:`, + event.message, + ) + if (DEBUG_ERROR && !event.message.startsWith('Invalid JSON')) { + // Save errors in a file, but not tool calls with invalid json. + fs.writeFileSync( + path.join( + __dirname, + '..', + `${this.commitId}-${this.agentId}-error-${Math.random().toString(36).substring(2, 6)}.json`, + ), + JSON.stringify( + { + error: event.message, + trace: steps, + }, + null, + 2, + ), + ) + } + } else if (this.printEvents) { + console.log( + `[${this.commitId}:${this.agentId}]`, + JSON.stringify(event, null, 2), + ) + } + steps.push(event) + }, + }) + + if (result.output.type === 'error') { + console.error( + `[${this.commitId}:${this.agentId}] Error:`, + result.output.message, + ) + if (DEBUG_ERROR) { + // Save errors in a file, but not tool calls with invalid json. + fs.writeFileSync( + path.join( + __dirname, + '..', + `${this.commitId}-${this.agentId}-error-${Math.random().toString(36).substring(2, 6)}.json`, + ), + JSON.stringify( + { + ...result.output, + trace: steps, + }, + null, + 2, + ), + ) + } + } + + totalCostUsd = (result.sessionState?.mainAgentState.creditsUsed ?? 
0) / 100 + + // Get git diff after Codebuff has made changes + let diff = '' + try { + execSync('git add .', { cwd: this.cwd, stdio: 'ignore' }) + diff = execSync(`git diff ${this.parentSha}`, { + cwd: this.cwd, + encoding: 'utf-8', + maxBuffer: 10 * 1024 * 1024, + }) + } catch { + // Ignore git errors + } + + return { + steps, + totalCostUsd, + diff, + } + } +} diff --git a/evals/evalbuff/runners/codex.ts b/evals/evalbuff/runners/codex.ts new file mode 100644 index 0000000000..b8a3ad7726 --- /dev/null +++ b/evals/evalbuff/runners/codex.ts @@ -0,0 +1,143 @@ +import { execSync, spawn } from 'child_process' + +import type { Runner, RunnerResult, AgentStep } from './runner' + +export class CodexRunner implements Runner { + private cwd: string + private env: Record + + constructor(cwd: string, env: Record = {}) { + this.cwd = cwd + this.env = env + } + + async run(prompt: string): Promise { + const steps: AgentStep[] = [] + let totalCostUsd = 0 + + return new Promise((resolve, reject) => { + // Codex CLI uses the prompt as a positional argument + // Use exec subcommand with --full-auto for automatic execution + // --full-auto enables -a on-failure and --sandbox workspace-write + // Use --json for structured output that we can parse + const args = [ + 'exec', + '--full-auto', + '--json', + '-m', + 'gpt-5.1-codex', + prompt, + ] + + console.log(`[CodexRunner] Running: codex ${args.join(' ')}`) + + const child = spawn('codex', args, { + cwd: this.cwd, + env: { + ...process.env, + ...this.env, + CODEX_API_KEY: process.env.OPENAI_API_KEY || this.env.OPENAI_API_KEY, + }, + // Use 'ignore' for stdin to prevent the CLI from waiting for input + stdio: ['ignore', 'pipe', 'pipe'], + }) + + let _stdout = '' + let stderr = '' + + child.stdout.on('data', (data: Buffer) => { + const chunk = data.toString() + _stdout += chunk + process.stdout.write(chunk) + + // Codex outputs events as JSON lines in some modes + const lines = chunk.split('\n').filter((line) => line.trim()) + for 
(const line of lines) { + try { + const event = JSON.parse(line) + if (event.type === 'message') { + steps.push({ + type: 'text', + text: event.content || event.message || '', + }) + } else if ( + event.type === 'function_call' || + event.type === 'tool' + ) { + steps.push({ + type: 'tool_call', + toolName: event.name || event.function?.name || 'unknown', + toolCallId: event.id || `codex-${Date.now()}`, + input: event.arguments || event.function?.arguments || {}, + }) + } else if ( + event.type === 'function_result' || + event.type === 'tool_result' + ) { + steps.push({ + type: 'tool_result', + toolName: event.name || 'unknown', + toolCallId: event.id || `codex-${Date.now()}`, + output: [ + { + type: 'json', + value: event.result || event.output || '', + }, + ], + }) + } + } catch { + // Plain text output, add as text step + if (line.trim()) { + steps.push({ + type: 'text', + text: line, + }) + } + } + } + }) + + child.stderr.on('data', (data: Buffer) => { + stderr += data.toString() + process.stderr.write(data) + }) + + child.on('error', (error) => { + reject( + new Error( + `Codex CLI failed to start: ${error.message}. Make sure 'codex' is installed and in PATH.`, + ), + ) + }) + + child.on('close', (code) => { + // Get git diff after Codex has made changes + let diff = '' + try { + execSync('git add .', { cwd: this.cwd, stdio: 'ignore' }) + diff = execSync('git diff HEAD', { + cwd: this.cwd, + encoding: 'utf-8', + maxBuffer: 10 * 1024 * 1024, + }) + } catch { + // Ignore git errors + } + + if (code !== 0) { + reject( + new Error(`Codex CLI exited with code ${code}. 
stderr: ${stderr}`), + ) + return + } + + resolve({ + steps, + totalCostUsd, // Codex doesn't report cost in CLI output + diff, + }) + }) + }) + } +} diff --git a/evals/evalbuff/runners/index.ts b/evals/evalbuff/runners/index.ts new file mode 100644 index 0000000000..99adc3d28a --- /dev/null +++ b/evals/evalbuff/runners/index.ts @@ -0,0 +1,3 @@ +export { ClaudeRunner } from './claude' +export { CodexRunner } from './codex' +export type { Runner, RunnerResult } from './runner' diff --git a/evals/evalbuff/runners/runner.ts b/evals/evalbuff/runners/runner.ts new file mode 100644 index 0000000000..ea450caaab --- /dev/null +++ b/evals/evalbuff/runners/runner.ts @@ -0,0 +1,13 @@ +import type { PrintModeEvent } from '@codebuff/common/types/print-mode' + +export type AgentStep = PrintModeEvent + +export type RunnerResult = { + steps: AgentStep[] + totalCostUsd: number + diff: string +} + +export interface Runner { + run: (prompt: string) => Promise +} diff --git a/evals/evalbuff/test-repo-utils.ts b/evals/evalbuff/test-repo-utils.ts new file mode 100644 index 0000000000..60039a3a62 --- /dev/null +++ b/evals/evalbuff/test-repo-utils.ts @@ -0,0 +1,131 @@ +import { execSync } from 'child_process' +import fs from 'fs' +import * as os from 'os' +import path from 'path' + +import { getErrorObject } from '@codebuff/common/util/error' + +/** + * Helper function to manage test repository lifecycle + * Sets up a test repo, runs a function with the repo cwd, then cleans up + */ +export const withTestRepo = async ( + repoConfig: { + repoUrl: string + // The sha of the commit to checkout. If you have a commit with changes to replicate, you would check out the parent commit. 
+ parentSha: string + initCommand?: string + env?: Record + }, + fn: (cwd: string) => Promise, +): Promise => { + const { repoUrl, parentSha, initCommand, env } = repoConfig + + // Create a temporary directory for the test repo + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codebuff-eval-')) + const repoDir = path.join(tempDir, 'repo') + + try { + execSync(`git clone --depth 1 ${repoUrl} ${repoDir}`, { stdio: 'ignore' }) + + execSync(`git fetch --depth 1 origin ${parentSha}`, { + cwd: repoDir, + stdio: 'ignore', + }) + execSync(`git checkout ${parentSha}`, { cwd: repoDir, stdio: 'ignore' }) + + if (initCommand) { + console.log(`Running init command: ${initCommand}...`) + try { + execSync(initCommand, { + cwd: repoDir, + stdio: 'ignore', + env: { ...process.env, ...env }, + }) + } catch (error) { + console.error( + `Error running init command: ${getErrorObject(error).message}`, + ) + } + } + + // Run the provided function with the repo directory + return await fn(repoDir) + } finally { + // Clean up the temporary directory + try { + fs.rmSync(tempDir, { recursive: true, force: true }) + } catch (error) { + console.warn(`Failed to clean up temporary directory: ${error}`) + } + } +} + +export const withTestRepoAndParent = async ( + repoConfig: { + repoUrl: string + commitSha: string + initCommand?: string + }, + fn: (cwd: string, commitSha: string, parentSha: string) => Promise, +): Promise => { + const { repoUrl, commitSha, initCommand } = repoConfig + + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codebuff-eval-')) + const repoDir = path.join(tempDir, 'repo') + + try { + execSync(`git clone --depth 1 ${repoUrl} ${repoDir}`, { stdio: 'ignore' }) + + execSync(`git fetch --depth 2 origin ${commitSha}`, { + cwd: repoDir, + stdio: 'ignore', + }) + + execSync(`git checkout ${commitSha}`, { cwd: repoDir, stdio: 'ignore' }) + + let parentSha: string + try { + const parents = execSync(`git log --pretty=%P -n 1 ${commitSha}`, { + cwd: repoDir, + encoding: 
'utf-8', + stdio: ['ignore', 'pipe', 'ignore'], + }).trim() + + if (!parents) { + console.warn( + `Commit ${commitSha.slice(0, 8)} has no parent (initial commit)`, + ) + return null + } + + const parentList = parents.split(' ') + if (parentList.length > 1) { + console.warn( + `Commit ${commitSha.slice(0, 8)} is a merge commit (${parentList.length} parents)`, + ) + return null + } + + parentSha = parentList[0] + } catch (error) { + console.error(`Error getting parent for ${commitSha.slice(0, 8)}:`, error) + return null + } + + execSync(`git checkout ${parentSha}`, { cwd: repoDir, stdio: 'ignore' }) + + if (initCommand) { + console.log(`Running init command: ${initCommand}...`) + execSync(initCommand, { cwd: repoDir, stdio: 'ignore' }) + } + + return await fn(repoDir, commitSha, parentSha) + } finally { + try { + fs.rmSync(tempDir, { recursive: true, force: true }) + } catch (error) { + console.warn(`Failed to clean up temporary directory: ${error}`) + } + } +} diff --git a/evals/evalbuff/types.ts b/evals/evalbuff/types.ts new file mode 100644 index 0000000000..52d30196aa --- /dev/null +++ b/evals/evalbuff/types.ts @@ -0,0 +1,83 @@ +import type { JudgingResult } from './judge' + +export interface FileState { + path: string + preContent: string + postContent: string +} + +export interface EvalCommit { + sha: string + parentSha: string + spec: string + fileStates: FileState[] +} + +export interface EvalData { + repoUrl: string + testRepoName?: string + generationDate: string + initCommand?: string + evalCommits: EvalCommit[] +} + +export interface FileDiff { + path: string + status: 'modified' | 'added' | 'deleted' | 'renamed' + oldPath?: string + diff: string +} + +export interface EvalCommitV2 { + id: string + sha: string + parentSha: string + spec: string + prompt: string + supplementalFiles: string[] + fileDiffs: FileDiff[] +} + +export interface BinInstall { + name: string + installScript: string + binPath: string +} + +export interface EvalDataV2 { + repoUrl: 
string + testRepoName?: string + generationDate: string + initCommand?: string + binInstalls?: BinInstall[] + env?: Record + finalCheckCommands?: string[] + evalCommits: EvalCommitV2[] +} + +export interface FinalCheckOutput { + command: string + exitCode: number + stdout: string + stderr: string +} + +export interface EvalRun { + commitSha: string + prompt: string + diff: string + judging: JudgingResult + cost: number + durationMs: number + error?: string + finalCheckOutputs?: FinalCheckOutput[] +} + +export interface AgentEvalResults { + agentId: string + runs: EvalRun[] + averageScore: number + averageScoreExcludingFailures: number + averageCost: number + averageDuration: number +} diff --git a/evals/package.json b/evals/package.json index c27555a957..bf7746dc71 100644 --- a/evals/package.json +++ b/evals/package.json @@ -23,6 +23,7 @@ "run-eval-set": "bun run git-evals/run-eval-set.ts", "run-buffbench": "bun run buffbench/main.ts", "run-buffbench-nightly": "bun run buffbench/main-nightly.ts", + "run-evalbuff": "bun run evalbuff/run-evalbuff.ts", "trigger-buffbench": "bun run scripts/trigger-buffbench.ts", "setup-codebuff-repo": "bun run setup-codebuff-repo.ts" }, From f411a3f1b6e226a597a73465cdf68d51cbde5099 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 26 Mar 2026 11:30:39 -0700 Subject: [PATCH 02/12] Add unit tests for evalbuff (35 tests across 4 files) Tests for criteria (promotion logic, level accumulation), docs-optimizer (apply/overwrite/reject/AGENTS.md creation, compareScores, readCurrentDocs), cli-runner (happy path, diff capture, crash, timeout, CLI not found), and morning-report (normal/empty/error reports, score trajectory, JSONL append). 
// Unit tests for runCliAgent: stdout/exit-code capture, git-diff capture,
// crash handling, missing-binary rejection, and timeout kill behavior.
// Each test runs in a fresh temp git repo so diffs are isolated.
import fs from 'fs'
import os from 'os'
import path from 'path'
import { execSync } from 'child_process'

import { afterEach, beforeEach, describe, expect, it } from 'bun:test'

import { runCliAgent } from '../cli-runner'

let tmpDir: string

beforeEach(() => {
  tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-cli-test-'))
  // Initialize a git repo so git diff works
  execSync('git init && git add . && git commit --allow-empty -m "init"', {
    cwd: tmpDir,
    stdio: 'ignore',
  })
})

afterEach(() => {
  fs.rmSync(tmpDir, { recursive: true, force: true })
})

describe('runCliAgent', () => {
  it('happy path: captures stdout and exit code 0', async () => {
    const result = await runCliAgent({
      command: 'echo',
      prompt: 'hello world',
      cwd: tmpDir,
      timeoutMs: 10_000,
    })

    expect(result.exitCode).toBe(0)
    expect(result.stdout.trim()).toBe('hello world')
    expect(result.durationMs).toBeGreaterThan(0)
  })

  it('captures git diff when agent creates a file', async () => {
    // Use a bash command that creates a file
    const scriptPath = path.join(tmpDir, 'agent.sh')
    fs.writeFileSync(
      scriptPath,
      '#!/bin/bash\necho "new content" > newfile.txt\n',
    )
    fs.chmodSync(scriptPath, '755')

    const result = await runCliAgent({
      command: scriptPath,
      prompt: 'create a file',
      cwd: tmpDir,
      timeoutMs: 10_000,
    })

    expect(result.exitCode).toBe(0)
    expect(result.diff).toContain('newfile.txt')
    expect(result.diff).toContain('new content')
  })

  it('handles agent crash with non-zero exit code', async () => {
    // presumably runCliAgent appends the prompt to the command string,
    // making this `bash -c "exit 42"` — TODO confirm against cli-runner.
    const result = await runCliAgent({
      command: 'bash -c',
      prompt: 'exit 42',
      cwd: tmpDir,
      timeoutMs: 10_000,
    })

    expect(result.exitCode).toBe(42)
  })

  it('returns empty diff when agent makes no changes', async () => {
    const result = await runCliAgent({
      command: 'echo',
      prompt: 'do nothing',
      cwd: tmpDir,
      timeoutMs: 10_000,
    })

    expect(result.diff).toBe('')
  })

  it('rejects when agent CLI is not found', async () => {
    const promise = runCliAgent({
      command: 'nonexistent-agent-binary-xyz',
      prompt: 'test',
      cwd: tmpDir,
      timeoutMs: 10_000,
    })

    await expect(promise).rejects.toThrow('CLI agent failed to start')
    await expect(promise).rejects.toThrow('nonexistent-agent-binary-xyz')
  })

  it('kills agent on timeout', async () => {
    // `sleep 30` would outlive the 500ms timeout; the runner must kill it.
    const result = await runCliAgent({
      command: 'sleep',
      prompt: '30',
      cwd: tmpDir,
      timeoutMs: 500, // 500ms timeout
    })

    // Process should have been killed
    expect(result.durationMs).toBeLessThan(5000)
    // Exit code is null when killed by signal, which becomes 1
    expect(result.exitCode).not.toBe(0)
  })
})
// Unit tests for the living quality criteria: level accumulation (L1-L5),
// promotion logic over a sliding score window, and prompt formatting.
import { describe, expect, it } from 'bun:test'

import {
  formatCriteriaForPrompt,
  getCriteriaForLevel,
  maybePromoteCriteria,
} from '../criteria'

import type { QualityCriteria } from '../criteria'

// Builds a QualityCriteria fixture at the given level with a configurable
// promotion threshold and window size.
function makeCriteria(
  level: number,
  threshold = 8.0,
  window = 10,
): QualityCriteria {
  return {
    level,
    criteria: getCriteriaForLevel(level),
    promotionThreshold: threshold,
    promotionWindow: window,
  }
}

describe('getCriteriaForLevel', () => {
  it('returns only L1 criteria at level 1', () => {
    const criteria = getCriteriaForLevel(1)
    expect(criteria).toHaveLength(3)
    expect(criteria.map((c) => c.name)).toEqual([
      'Correctness',
      'Completeness',
      'Basic Style',
    ])
  })

  it('accumulates criteria up to level 3', () => {
    // Higher levels include all lower-level criteria plus their own.
    const criteria = getCriteriaForLevel(3)
    expect(criteria.map((c) => c.name)).toEqual([
      'Correctness',
      'Completeness',
      'Basic Style',
      'Pattern Consistency',
      'Test Quality',
    ])
  })

  it('includes all criteria at level 5', () => {
    const criteria = getCriteriaForLevel(5)
    expect(criteria).toHaveLength(7)
    expect(criteria[criteria.length - 1].name).toBe('Fluency')
  })

  it('caps at level 5 even if higher number passed', () => {
    const criteria = getCriteriaForLevel(10)
    expect(criteria).toHaveLength(7)
  })
})

describe('maybePromoteCriteria', () => {
  it('promotes when avg above threshold over window', () => {
    const criteria = makeCriteria(1, 8.0, 5)
    const scores = [8.5, 9.0, 8.2, 8.8, 8.6]
    const newLevel = maybePromoteCriteria(criteria, scores)
    expect(newLevel).toBe(2)
  })

  it('does NOT promote when avg below threshold', () => {
    const criteria = makeCriteria(1, 8.0, 5)
    const scores = [7.0, 6.5, 8.0, 7.5, 7.0]
    const newLevel = maybePromoteCriteria(criteria, scores)
    expect(newLevel).toBe(1)
  })

  it('does NOT promote when already at max level (5)', () => {
    const criteria = makeCriteria(5, 8.0, 3)
    const scores = [9.0, 9.5, 9.0]
    const newLevel = maybePromoteCriteria(criteria, scores)
    expect(newLevel).toBe(5)
  })

  it('does NOT promote when fewer iterations than window size', () => {
    const criteria = makeCriteria(1, 8.0, 10)
    const scores = [9.0, 9.5, 9.0]
    const newLevel = maybePromoteCriteria(criteria, scores)
    expect(newLevel).toBe(1)
  })

  it('uses only the last N scores in the window', () => {
    const criteria = makeCriteria(2, 8.0, 3)
    // Old scores are low, but last 3 are high
    const scores = [3.0, 4.0, 5.0, 8.5, 9.0, 8.5]
    const newLevel = maybePromoteCriteria(criteria, scores)
    expect(newLevel).toBe(3)
  })
})

describe('formatCriteriaForPrompt', () => {
  it('includes level and all criteria names', () => {
    const criteria = makeCriteria(2)
    const prompt = formatCriteriaForPrompt(criteria)
    expect(prompt).toContain('Level 2/5')
    expect(prompt).toContain('Correctness')
    expect(prompt).toContain('Pattern Consistency')
  })

  it('includes weights', () => {
    const criteria = makeCriteria(1)
    const prompt = formatCriteriaForPrompt(criteria)
    expect(prompt).toContain('weight: 3')
    expect(prompt).toContain('weight: 1')
  })
})
// Unit tests for the docs optimizer's pure filesystem helpers: applyDocEdit
// (writes under docs/ and maintains the AGENTS.md table of contents, rejecting
// path traversal), compareScores, and readCurrentDocs.
import fs from 'fs'
import os from 'os'
import path from 'path'

import { afterEach, beforeEach, describe, expect, it } from 'bun:test'

import { applyDocEdit, compareScores, readCurrentDocs } from '../docs-optimizer'

let tmpDir: string

beforeEach(() => {
  tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-test-'))
})

afterEach(() => {
  fs.rmSync(tmpDir, { recursive: true, force: true })
})

describe('applyDocEdit', () => {
  it('creates new file under docs/ and updates AGENTS.md TOC', () => {
    const result = applyDocEdit(
      tmpDir,
      'patterns/error-handling.md',
      '# Error Handling\n\nAlways use try/catch.',
    )
    expect(result).toBe(true)

    const docPath = path.join(tmpDir, 'docs', 'patterns', 'error-handling.md')
    expect(fs.existsSync(docPath)).toBe(true)
    expect(fs.readFileSync(docPath, 'utf-8')).toContain('Error Handling')

    const agentsMd = fs.readFileSync(path.join(tmpDir, 'AGENTS.md'), 'utf-8')
    expect(agentsMd).toContain('docs/patterns/error-handling.md')
  })

  it('overwrites existing file content', () => {
    // Create initial doc
    applyDocEdit(tmpDir, 'conventions/naming.md', 'Original content')

    // Overwrite
    applyDocEdit(tmpDir, 'conventions/naming.md', 'Updated content')

    const content = fs.readFileSync(
      path.join(tmpDir, 'docs', 'conventions', 'naming.md'),
      'utf-8',
    )
    expect(content).toBe('Updated content')
  })

  it('does not duplicate AGENTS.md entry on overwrite', () => {
    applyDocEdit(tmpDir, 'test.md', 'v1')
    applyDocEdit(tmpDir, 'test.md', 'v2')

    const agentsMd = fs.readFileSync(path.join(tmpDir, 'AGENTS.md'), 'utf-8')
    // The link format is "- [docs/test.md](docs/test.md)" — one entry has two occurrences of the path
    const entryMatches = agentsMd.match(/- \[docs\/test\.md\]/g)
    expect(entryMatches).toHaveLength(1)
  })

  it('rejects path starting with /', () => {
    const result = applyDocEdit(tmpDir, '/etc/passwd', 'bad')
    expect(result).toBe(false)
  })

  it('rejects path with ..', () => {
    const result = applyDocEdit(tmpDir, '../outside/file.md', 'bad')
    expect(result).toBe(false)
  })

  it('creates AGENTS.md if it does not exist', () => {
    expect(fs.existsSync(path.join(tmpDir, 'AGENTS.md'))).toBe(false)
    applyDocEdit(tmpDir, 'new-doc.md', 'content')
    expect(fs.existsSync(path.join(tmpDir, 'AGENTS.md'))).toBe(true)

    const agentsMd = fs.readFileSync(path.join(tmpDir, 'AGENTS.md'), 'utf-8')
    expect(agentsMd).toContain('# Documentation')
    expect(agentsMd).toContain('docs/new-doc.md')
  })
})

describe('compareScores', () => {
  it('returns improved when new > old', () => {
    expect(compareScores(5.0, 7.0)).toBe('improved')
  })

  it('returns same when new == old', () => {
    expect(compareScores(5.0, 5.0)).toBe('same')
  })

  it('returns worse when new < old', () => {
    expect(compareScores(7.0, 5.0)).toBe('worse')
  })
})

describe('readCurrentDocs', () => {
  it('returns empty object when docs/ does not exist', () => {
    const docs = readCurrentDocs(tmpDir)
    expect(docs).toEqual({})
  })

  it('reads all markdown files recursively', () => {
    const docsDir = path.join(tmpDir, 'docs')
    fs.mkdirSync(path.join(docsDir, 'patterns'), { recursive: true })
    fs.writeFileSync(path.join(docsDir, 'intro.md'), 'intro content')
    fs.writeFileSync(path.join(docsDir, 'patterns', 'api.md'), 'api patterns')
    // Non-md file should be ignored
    fs.writeFileSync(path.join(docsDir, 'notes.txt'), 'ignored')

    const docs = readCurrentDocs(tmpDir)
    expect(Object.keys(docs).sort()).toEqual(['intro.md', 'patterns/api.md'])
    expect(docs['intro.md']).toBe('intro content')
    expect(docs['patterns/api.md']).toBe('api patterns')
  })
})
+import { appendLogEntry, generateMorningReport } from '../morning-report' + +import type { EvalbuffLogEntry } from '../morning-report' + +let tmpDir: string +let logPath: string + +beforeEach(() => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-report-test-')) + logPath = path.join(tmpDir, 'evalbuff-log.jsonl') +}) + +afterEach(() => { + fs.rmSync(tmpDir, { recursive: true, force: true }) +}) + +function makeEntry(overrides: Partial = {}): EvalbuffLogEntry { + return { + taskId: 'task-001', + timestamp: '2026-03-25T08:00:00.000Z', + oldScore: 5.0, + newScore: null, + docEdit: null, + scoreComparison: null, + costUsd: 0.5, + durationMs: 60_000, + criteriaLevel: 1, + ...overrides, + } +} + +describe('generateMorningReport', () => { + it('generates valid report from JSONL log with all stats', () => { + const entries: EvalbuffLogEntry[] = [ + makeEntry({ + taskId: 'task-001', + oldScore: 5.0, + newScore: 7.5, + docEdit: { path: 'patterns/api.md', reasoning: 'Agent missed API pattern' }, + scoreComparison: 'improved', + costUsd: 1.2, + durationMs: 120_000, + }), + makeEntry({ + taskId: 'task-002', + timestamp: '2026-03-25T09:00:00.000Z', + oldScore: 8.0, + costUsd: 0.8, + durationMs: 90_000, + }), + ] + + for (const entry of entries) { + appendLogEntry(logPath, entry) + } + + const report = generateMorningReport(logPath) + + expect(report).toContain('# Evalbuff Morning Report') + expect(report).toContain('Iterations | 2') + expect(report).toContain('$2.00') + expect(report).toContain('Docs Attempted | 1') + expect(report).toContain('Docs Kept (improved score) | 1') + expect(report).toContain('task-001') + expect(report).toContain('task-002') + expect(report).toContain('patterns/api.md') + }) + + it('generates empty report when log file does not exist', () => { + const report = generateMorningReport( + path.join(tmpDir, 'nonexistent.jsonl'), + ) + expect(report).toContain('No iterations were run') + expect(report).toContain('Iterations | 0') + }) + + 
it('generates empty report when log file is empty', () => { + fs.writeFileSync(logPath, '') + const report = generateMorningReport(logPath) + expect(report).toContain('No iterations were run') + }) + + it('shows errors table when iterations have errors', () => { + appendLogEntry( + logPath, + makeEntry({ + taskId: 'task-fail', + error: 'Agent timed out after 300s', + }), + ) + + const report = generateMorningReport(logPath) + expect(report).toContain('## Errors') + expect(report).toContain('task-fail') + expect(report).toContain('Agent timed out') + }) + + it('shows score trajectory section', () => { + appendLogEntry(logPath, makeEntry({ taskId: 'task-a', oldScore: 3.0 })) + appendLogEntry(logPath, makeEntry({ taskId: 'task-b', oldScore: 7.0 })) + + const report = generateMorningReport(logPath) + expect(report).toContain('## Score Trajectory') + expect(report).toContain('task-a') + expect(report).toContain('task-b') + }) + + it('shows doc changes with score impact', () => { + appendLogEntry( + logPath, + makeEntry({ + taskId: 'task-doc', + oldScore: 4.0, + newScore: 6.5, + docEdit: { path: 'conventions/naming.md', reasoning: 'Naming was wrong' }, + scoreComparison: 'improved', + }), + ) + appendLogEntry( + logPath, + makeEntry({ + taskId: 'task-revert', + oldScore: 5.0, + newScore: 4.0, + docEdit: { path: 'patterns/bad.md', reasoning: 'Did not help' }, + scoreComparison: 'worse', + }), + ) + + const report = generateMorningReport(logPath) + expect(report).toContain('## Doc Changes') + expect(report).toContain('4.0 -> 6.5') + expect(report).toContain('Yes') // kept + expect(report).toContain('5.0 -> 4.0') + expect(report).toContain('No') // reverted + }) +}) + +describe('appendLogEntry', () => { + it('appends JSONL entries that can be parsed back', () => { + const entry1 = makeEntry({ taskId: 'a' }) + const entry2 = makeEntry({ taskId: 'b' }) + + appendLogEntry(logPath, entry1) + appendLogEntry(logPath, entry2) + + const lines = fs + .readFileSync(logPath, 'utf-8') 
+ .trim() + .split('\n') + expect(lines).toHaveLength(2) + expect(JSON.parse(lines[0]).taskId).toBe('a') + expect(JSON.parse(lines[1]).taskId).toBe('b') + }) +}) From 884a5653dc91db24b5eff225ecdc97cb8f61e778 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 26 Mar 2026 11:33:44 -0700 Subject: [PATCH 03/12] Add integration tests for evalbuff orchestrator loop 6 integration tests covering: full iteration flow, doc edit attempts, maxIterations budget cap, cost budget cap, resume from state file, and doc revert on score regression. Uses bun mock.module to avoid real LLM calls and remote repo cloning. Also guards run-evalbuff.ts CLI entry point with import.meta.main and adds test:evalbuff script that runs unit + integration tests in separate processes to avoid mock.module leakage. Co-Authored-By: Claude Opus 4.6 --- .../__tests__/loop.integration.test.ts | 340 ++++++++++++++++++ evals/evalbuff/run-evalbuff.ts | 11 +- evals/package.json | 1 + 3 files changed, 348 insertions(+), 4 deletions(-) create mode 100644 evals/evalbuff/__tests__/loop.integration.test.ts diff --git a/evals/evalbuff/__tests__/loop.integration.test.ts b/evals/evalbuff/__tests__/loop.integration.test.ts new file mode 100644 index 0000000000..80182447de --- /dev/null +++ b/evals/evalbuff/__tests__/loop.integration.test.ts @@ -0,0 +1,340 @@ +import { execSync } from 'child_process' +import fs from 'fs' +import os from 'os' +import path from 'path' + +import { afterEach, beforeEach, describe, expect, it, mock } from 'bun:test' + +import type { JudgingResult } from '../judge' +import type { DocSuggestion } from '../docs-optimizer' +import type { EvalDataV2 } from '../types' + +// --- Mocks --- + +// Track calls to mocked functions +let judgeCallCount = 0 +let judgeScores: number[] = [] +let analyzeFailureResult: DocSuggestion | null = null +let cliRunnerCallCount = 0 + +// Mock withTestRepo to use a local temp dir instead of cloning +mock.module('../test-repo-utils', () => ({ + withTestRepo: async 
(_config: any, fn: (cwd: string) => Promise) => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-mock-repo-')) + execSync('git init && git add . && git commit --allow-empty -m "init"', { + cwd: dir, + stdio: 'ignore', + }) + try { + return await fn(dir) + } finally { + fs.rmSync(dir, { recursive: true, force: true }) + } + }, +})) + +// Mock CLI runner to return a fake result +mock.module('../cli-runner', () => ({ + runCliAgent: async () => { + cliRunnerCallCount++ + return { + diff: 'mock diff content', + durationMs: 1000, + exitCode: 0, + stdout: 'mock stdout', + stderr: '', + } + }, +})) + +// Mock judge to return configurable scores +mock.module('../judge', () => ({ + judgeCommitResult: async () => { + const score = judgeScores[judgeCallCount] ?? 5.0 + judgeCallCount++ + return { + analysis: 'Mock analysis', + strengths: ['Good'], + weaknesses: ['Could improve'], + completionScore: score, + codeQualityScore: score, + overallScore: score, + } satisfies JudgingResult + }, +})) + +// Mock docs-optimizer LLM calls but keep pure functions +const actualDocsOptimizer = await import('../docs-optimizer') +mock.module('../docs-optimizer', () => ({ + ...actualDocsOptimizer, + analyzeFailure: async () => analyzeFailureResult, +})) + +// Mock CodebuffClient +mock.module('@codebuff/sdk', () => ({ + CodebuffClient: class { + constructor() {} + async run() { + return { output: { type: 'text', value: '' } } + } + }, +})) + +// Import after mocks are set up +const { runEvalbuff } = await import('../run-evalbuff') + +// --- Test fixtures --- + +let repoDir: string +let evalFilePath: string + +function createEvalFile(taskCount: number): string { + const evalData: EvalDataV2 = { + repoUrl: 'https://github.com/test/repo', + generationDate: '2026-03-25', + evalCommits: Array.from({ length: taskCount }, (_, i) => ({ + id: `task-${i + 1}`, + sha: `sha-${i + 1}`, + parentSha: `parent-${i + 1}`, + spec: `Test task ${i + 1}`, + prompt: `Do task ${i + 1}`, + 
supplementalFiles: [], + fileDiffs: [ + { + path: `src/file${i + 1}.ts`, + status: 'modified' as const, + diff: `@@ -1 +1 @@\n-old\n+new`, + }, + ], + })), + } + + const filePath = path.join(repoDir, `eval-test.json`) + fs.writeFileSync(filePath, JSON.stringify(evalData)) + return filePath +} + +beforeEach(() => { + repoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-integ-')) + execSync('git init && git add . && git commit --allow-empty -m "init"', { + cwd: repoDir, + stdio: 'ignore', + }) + evalFilePath = createEvalFile(5) + + // Reset mock state + judgeCallCount = 0 + judgeScores = [] + analyzeFailureResult = null + cliRunnerCallCount = 0 +}) + +afterEach(() => { + fs.rmSync(repoDir, { recursive: true, force: true }) +}) + +// --- Tests --- + +describe('runEvalbuff integration', () => { + it('completes one full iteration: runs agent, judges, and logs', async () => { + judgeScores = [8.0] // Above threshold, no doc edit attempted + + await runEvalbuff({ + repoPath: repoDir, + agentCommand: 'echo', + evalDataPaths: [evalFilePath], + maxIterations: 1, + maxCostUsd: 100, + scoreThreshold: 7.0, + agentTimeoutMs: 10_000, + }) + + // Verify log was written + const logPath = path.join(repoDir, 'evalbuff-log.jsonl') + expect(fs.existsSync(logPath)).toBe(true) + const logLines = fs + .readFileSync(logPath, 'utf-8') + .trim() + .split('\n') + expect(logLines).toHaveLength(1) + + const entry = JSON.parse(logLines[0]) + expect(entry.taskId).toBe('task-1') + expect(entry.oldScore).toBe(8.0) + expect(entry.docEdit).toBeNull() + + // Verify state was saved + const statePath = path.join(repoDir, 'evalbuff-state.json') + expect(fs.existsSync(statePath)).toBe(true) + const state = JSON.parse(fs.readFileSync(statePath, 'utf-8')) + expect(state.completedTaskIds).toContain('task-1') + + // Verify morning report was generated + const reportFiles = fs + .readdirSync(repoDir) + .filter((f) => f.startsWith('evalbuff-report-')) + expect(reportFiles.length).toBeGreaterThan(0) + }) + 
+ it('attempts doc edit when score is below threshold', async () => { + // First judge call returns low score, second (after doc edit) returns higher + judgeScores = [4.0, 6.0] + analyzeFailureResult = { + reasoning: 'Agent missed error handling patterns', + suggestedDocPath: 'patterns/errors.md', + suggestedContent: '# Error Handling\n\nAlways use try/catch.', + } + + await runEvalbuff({ + repoPath: repoDir, + agentCommand: 'echo', + evalDataPaths: [evalFilePath], + maxIterations: 1, + maxCostUsd: 100, + scoreThreshold: 7.0, + agentTimeoutMs: 10_000, + }) + + const logPath = path.join(repoDir, 'evalbuff-log.jsonl') + const entry = JSON.parse(fs.readFileSync(logPath, 'utf-8').trim()) + expect(entry.oldScore).toBe(4.0) + expect(entry.newScore).toBe(6.0) + expect(entry.scoreComparison).toBe('improved') + expect(entry.docEdit).not.toBeNull() + expect(entry.docEdit.path).toBe('patterns/errors.md') + + // Doc should have been applied to the real repo + const docPath = path.join(repoDir, 'docs', 'patterns', 'errors.md') + expect(fs.existsSync(docPath)).toBe(true) + expect(fs.readFileSync(docPath, 'utf-8')).toContain('Error Handling') + }) + + it('stops at maxIterations', async () => { + judgeScores = [8.0, 8.0, 8.0, 8.0, 8.0] + + await runEvalbuff({ + repoPath: repoDir, + agentCommand: 'echo', + evalDataPaths: [evalFilePath], // 5 tasks available + maxIterations: 2, + maxCostUsd: 100, + scoreThreshold: 7.0, + agentTimeoutMs: 10_000, + }) + + const logPath = path.join(repoDir, 'evalbuff-log.jsonl') + const logLines = fs + .readFileSync(logPath, 'utf-8') + .trim() + .split('\n') + expect(logLines).toHaveLength(2) + + const state = JSON.parse( + fs.readFileSync(path.join(repoDir, 'evalbuff-state.json'), 'utf-8'), + ) + expect(state.completedTaskIds).toHaveLength(2) + }) + + it('stops when cost exceeds maxCostUsd', async () => { + judgeScores = [8.0, 8.0, 8.0, 8.0, 8.0] + + // First run — complete 1 task, which will accumulate some cost + await runEvalbuff({ + repoPath: 
repoDir, + agentCommand: 'echo', + evalDataPaths: [evalFilePath], + maxIterations: 1, + maxCostUsd: 100, + scoreThreshold: 7.0, + agentTimeoutMs: 10_000, + }) + + // Manually set cost in state to be at the limit + const statePath = path.join(repoDir, 'evalbuff-state.json') + const state = JSON.parse(fs.readFileSync(statePath, 'utf-8')) + state.totalCostUsd = 100.0 + fs.writeFileSync(statePath, JSON.stringify(state)) + + // Second run — should stop immediately due to cost (>= maxCostUsd) + await runEvalbuff({ + repoPath: repoDir, + agentCommand: 'echo', + evalDataPaths: [evalFilePath], + maxIterations: 50, + maxCostUsd: 100, + scoreThreshold: 7.0, + agentTimeoutMs: 10_000, + }) + + // Should still only have 1 completed task (cost check prevents new tasks) + const finalState = JSON.parse(fs.readFileSync(statePath, 'utf-8')) + expect(finalState.completedTaskIds).toHaveLength(1) + }) + + it('resumes from state file and skips completed tasks', async () => { + judgeScores = [8.0, 8.0, 8.0, 8.0, 8.0] + + // Pre-populate state with 2 completed tasks + const statePath = path.join(repoDir, 'evalbuff-state.json') + fs.writeFileSync( + statePath, + JSON.stringify({ + completedTaskIds: ['task-1', 'task-2'], + totalCostUsd: 5.0, + recentScores: [7.0, 8.0], + }), + ) + + await runEvalbuff({ + repoPath: repoDir, + agentCommand: 'echo', + evalDataPaths: [evalFilePath], // 5 tasks + maxIterations: 50, + maxCostUsd: 100, + scoreThreshold: 7.0, + agentTimeoutMs: 10_000, + }) + + // Should have processed tasks 3-5 (skipped 1 and 2) + const logPath = path.join(repoDir, 'evalbuff-log.jsonl') + const logLines = fs + .readFileSync(logPath, 'utf-8') + .trim() + .split('\n') + expect(logLines).toHaveLength(3) + + const taskIds = logLines.map((l) => JSON.parse(l).taskId) + expect(taskIds).toEqual(['task-3', 'task-4', 'task-5']) + + const finalState = JSON.parse(fs.readFileSync(statePath, 'utf-8')) + expect(finalState.completedTaskIds).toHaveLength(5) + }) + + it('reverts doc edit when score 
does not improve', async () => { + // First judge: low score, second judge: even lower (doc didn't help) + judgeScores = [4.0, 3.0] + analyzeFailureResult = { + reasoning: 'Tried to help', + suggestedDocPath: 'bad-doc.md', + suggestedContent: '# Bad Doc\n\nThis will not help.', + } + + await runEvalbuff({ + repoPath: repoDir, + agentCommand: 'echo', + evalDataPaths: [evalFilePath], + maxIterations: 1, + maxCostUsd: 100, + scoreThreshold: 7.0, + agentTimeoutMs: 10_000, + }) + + const logPath = path.join(repoDir, 'evalbuff-log.jsonl') + const entry = JSON.parse(fs.readFileSync(logPath, 'utf-8').trim()) + expect(entry.scoreComparison).toBe('worse') + + // Doc should NOT exist in the real repo + const docPath = path.join(repoDir, 'docs', 'bad-doc.md') + expect(fs.existsSync(docPath)).toBe(false) + }) +}) diff --git a/evals/evalbuff/run-evalbuff.ts b/evals/evalbuff/run-evalbuff.ts index d7f0405db4..9a54b4a69b 100644 --- a/evals/evalbuff/run-evalbuff.ts +++ b/evals/evalbuff/run-evalbuff.ts @@ -422,7 +422,10 @@ async function main() { }) } -main().catch((error) => { - console.error('Evalbuff failed:', error) - process.exit(1) -}) +// Only run CLI when executed directly (not when imported) +if (import.meta.main) { + main().catch((error) => { + console.error('Evalbuff failed:', error) + process.exit(1) + }) +} diff --git a/evals/package.json b/evals/package.json index bf7746dc71..8fe669f910 100644 --- a/evals/package.json +++ b/evals/package.json @@ -24,6 +24,7 @@ "run-buffbench": "bun run buffbench/main.ts", "run-buffbench-nightly": "bun run buffbench/main-nightly.ts", "run-evalbuff": "bun run evalbuff/run-evalbuff.ts", + "test:evalbuff": "bun test evalbuff/__tests__/criteria.test.ts evalbuff/__tests__/docs-optimizer.test.ts evalbuff/__tests__/morning-report.test.ts evalbuff/__tests__/cli-runner.test.ts && bun test evalbuff/__tests__/loop.integration.test.ts", "trigger-buffbench": "bun run scripts/trigger-buffbench.ts", "setup-codebuff-repo": "bun run 
setup-codebuff-repo.ts" }, From 1a754cebb768602d80c0c68094c506a36140353c Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 26 Mar 2026 11:35:24 -0700 Subject: [PATCH 04/12] Add E2E test for evalbuff full loop Verifies the complete evalbuff pipeline: 3 eval tasks run through the orchestrator with mock LLM judges, doc edits applied and committed, morning report generated, state tracking, and AGENTS.md TOC created. Total test coverage: 42 tests (35 unit + 6 integration + 1 E2E). Co-Authored-By: Claude Opus 4.6 --- evals/evalbuff/__tests__/e2e.test.ts | 231 +++++++++++++++++++++++++++ evals/package.json | 2 +- 2 files changed, 232 insertions(+), 1 deletion(-) create mode 100644 evals/evalbuff/__tests__/e2e.test.ts diff --git a/evals/evalbuff/__tests__/e2e.test.ts b/evals/evalbuff/__tests__/e2e.test.ts new file mode 100644 index 0000000000..987256b8e0 --- /dev/null +++ b/evals/evalbuff/__tests__/e2e.test.ts @@ -0,0 +1,231 @@ +/** + * E2E test for evalbuff. + * + * This test runs the full evalbuff loop with a real (mock) agent on a local + * git repo with synthetic eval tasks. It verifies: + * - The morning report is generated + * - Log entries are written + * - State file tracks completed tasks + * - Doc edits are committed to the repo when they improve scores + * + * This test uses mock.module to replace LLM calls but runs the full + * orchestrator, CLI runner, and git operations for real. 
+ * + * Run: bun test evals/evalbuff/__tests__/e2e.test.ts + */ +import { execSync } from 'child_process' +import fs from 'fs' +import os from 'os' +import path from 'path' + +import { afterAll, beforeAll, describe, expect, it, mock } from 'bun:test' + +import type { JudgingResult } from '../judge' +import type { DocSuggestion } from '../docs-optimizer' +import type { EvalDataV2 } from '../types' + +// --- Mocks for LLM calls only --- + +let judgeCallCount = 0 + +mock.module('../test-repo-utils', () => ({ + withTestRepo: async (_config: any, fn: (cwd: string) => Promise) => { + // Create a real local git repo for each call + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-e2e-repo-')) + execSync('git init && git add . && git commit --allow-empty -m "init"', { + cwd: dir, + stdio: 'ignore', + env: { ...process.env, GIT_AUTHOR_NAME: 'test', GIT_AUTHOR_EMAIL: 'test@test.com', GIT_COMMITTER_NAME: 'test', GIT_COMMITTER_EMAIL: 'test@test.com' }, + }) + try { + return await fn(dir) + } finally { + fs.rmSync(dir, { recursive: true, force: true }) + } + }, +})) + +// Judge returns alternating scores: low (triggers doc edit), then higher (confirms improvement) +mock.module('../judge', () => ({ + judgeCommitResult: async () => { + const scores = [3.0, 6.0, 8.5, 5.0, 7.0, 9.0] + const score = scores[judgeCallCount % scores.length] + judgeCallCount++ + return { + analysis: `Mock analysis for call ${judgeCallCount}`, + strengths: ['Correctly identified the problem'], + weaknesses: ['Missing error handling', 'No tests added'], + completionScore: score, + codeQualityScore: score, + overallScore: score, + } satisfies JudgingResult + }, +})) + +const actualDocsOptimizer = await import('../docs-optimizer') +mock.module('../docs-optimizer', () => ({ + ...actualDocsOptimizer, + analyzeFailure: async () => + ({ + reasoning: 'Agent consistently misses error handling patterns in async code', + suggestedDocPath: 'patterns/async-error-handling.md', + suggestedContent: + '# Async 
Error Handling\n\nAll async functions should use try/catch blocks.\nPropagate errors with meaningful messages.\n\n## Examples\n\n```ts\nasync function fetchData() {\n try {\n const result = await api.get("/data")\n return result\n } catch (error) {\n throw new Error(`Failed to fetch data: ${error.message}`)\n }\n}\n```\n', + }) satisfies DocSuggestion, +})) + +mock.module('@codebuff/sdk', () => ({ + CodebuffClient: class { + constructor() {} + }, +})) + +const { runEvalbuff } = await import('../run-evalbuff') + +// --- Test setup --- + +let repoDir: string +let evalFilePath: string + +beforeAll(() => { + // Create a "target repo" where docs will be written + repoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-e2e-target-')) + execSync('git init && git add . && git commit --allow-empty -m "init"', { + cwd: repoDir, + stdio: 'ignore', + env: { ...process.env, GIT_AUTHOR_NAME: 'test', GIT_AUTHOR_EMAIL: 'test@test.com', GIT_COMMITTER_NAME: 'test', GIT_COMMITTER_EMAIL: 'test@test.com' }, + }) + + // Create eval file with 3 tasks + const evalData: EvalDataV2 = { + repoUrl: 'https://github.com/test/repo', + generationDate: '2026-03-25', + evalCommits: [ + { + id: 'e2e-task-1', + sha: 'aaa111', + parentSha: 'aaa000', + spec: 'Add error handling to fetchData', + prompt: 'Add try/catch error handling to the fetchData function in src/api.ts', + supplementalFiles: [], + fileDiffs: [ + { + path: 'src/api.ts', + status: 'modified', + diff: '@@ -5,3 +5,7 @@\n-const data = await fetch(url)\n+try {\n+ const data = await fetch(url)\n+} catch (e) {\n+ throw new Error(`Fetch failed: ${e.message}`)\n+}', + }, + ], + }, + { + id: 'e2e-task-2', + sha: 'bbb222', + parentSha: 'bbb000', + spec: 'Add input validation', + prompt: 'Add input validation to the createUser endpoint', + supplementalFiles: [], + fileDiffs: [ + { + path: 'src/routes/users.ts', + status: 'modified', + diff: '@@ -1 +1,5 @@\n+if (!name || !email) {\n+ throw new Error("name and email required")\n+}', + }, + ], + 
}, + { + id: 'e2e-task-3', + sha: 'ccc333', + parentSha: 'ccc000', + spec: 'Refactor logger', + prompt: 'Refactor the logger to use structured JSON output', + supplementalFiles: [], + fileDiffs: [ + { + path: 'src/logger.ts', + status: 'modified', + diff: '@@ -1 +1,3 @@\n-console.log(msg)\n+const entry = { timestamp: Date.now(), message: msg }\n+process.stdout.write(JSON.stringify(entry) + "\\n")', + }, + ], + }, + ], + } + + evalFilePath = path.join(repoDir, 'eval-e2e.json') + fs.writeFileSync(evalFilePath, JSON.stringify(evalData)) + + judgeCallCount = 0 +}) + +afterAll(() => { + fs.rmSync(repoDir, { recursive: true, force: true }) +}) + +// --- E2E tests --- + +describe('evalbuff E2E', () => { + it('runs full loop: agent, judge, doc edit, morning report', async () => { + await runEvalbuff({ + repoPath: repoDir, + agentCommand: 'echo', // echo just prints the prompt and exits + evalDataPaths: [evalFilePath], + maxIterations: 3, + maxCostUsd: 50, + scoreThreshold: 7.0, + agentTimeoutMs: 10_000, + }) + + // 1. Morning report exists + const reportFiles = fs + .readdirSync(repoDir) + .filter((f) => f.startsWith('evalbuff-report-')) + expect(reportFiles.length).toBe(1) + const report = fs.readFileSync( + path.join(repoDir, reportFiles[0]), + 'utf-8', + ) + expect(report).toContain('# Evalbuff Morning Report') + expect(report).toContain('Iterations | 3') + + // 2. Log has 3 entries + const logPath = path.join(repoDir, 'evalbuff-log.jsonl') + expect(fs.existsSync(logPath)).toBe(true) + const logLines = fs + .readFileSync(logPath, 'utf-8') + .trim() + .split('\n') + expect(logLines).toHaveLength(3) + + // 3. State tracks all 3 completed tasks + const statePath = path.join(repoDir, 'evalbuff-state.json') + const state = JSON.parse(fs.readFileSync(statePath, 'utf-8')) + expect(state.completedTaskIds).toEqual([ + 'e2e-task-1', + 'e2e-task-2', + 'e2e-task-3', + ]) + + // 4. 
At least one doc was written (first task scores 3.0, below threshold) + const docsDir = path.join(repoDir, 'docs') + expect(fs.existsSync(docsDir)).toBe(true) + + // 5. AGENTS.md was created with TOC + const agentsMdPath = path.join(repoDir, 'AGENTS.md') + expect(fs.existsSync(agentsMdPath)).toBe(true) + const agentsMd = fs.readFileSync(agentsMdPath, 'utf-8') + expect(agentsMd).toContain('async-error-handling.md') + + // 6. Doc edits were committed to git + const gitLog = execSync('git log --oneline', { + cwd: repoDir, + encoding: 'utf-8', + }) + expect(gitLog).toContain('evalbuff:') + + // 7. Log entries have correct task IDs + const parsedEntries = logLines.map((l) => JSON.parse(l)) + expect(parsedEntries.map((e: any) => e.taskId)).toEqual([ + 'e2e-task-1', + 'e2e-task-2', + 'e2e-task-3', + ]) + }) +}) diff --git a/evals/package.json b/evals/package.json index 8fe669f910..f335804ebc 100644 --- a/evals/package.json +++ b/evals/package.json @@ -24,7 +24,7 @@ "run-buffbench": "bun run buffbench/main.ts", "run-buffbench-nightly": "bun run buffbench/main-nightly.ts", "run-evalbuff": "bun run evalbuff/run-evalbuff.ts", - "test:evalbuff": "bun test evalbuff/__tests__/criteria.test.ts evalbuff/__tests__/docs-optimizer.test.ts evalbuff/__tests__/morning-report.test.ts evalbuff/__tests__/cli-runner.test.ts && bun test evalbuff/__tests__/loop.integration.test.ts", + "test:evalbuff": "bun test evalbuff/__tests__/criteria.test.ts evalbuff/__tests__/docs-optimizer.test.ts evalbuff/__tests__/morning-report.test.ts evalbuff/__tests__/cli-runner.test.ts && bun test evalbuff/__tests__/loop.integration.test.ts && bun test evalbuff/__tests__/e2e.test.ts", "trigger-buffbench": "bun run scripts/trigger-buffbench.ts", "setup-codebuff-repo": "bun run setup-codebuff-repo.ts" }, From bed5094500c4eccb4930efa90b75b18558eda741 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 26 Mar 2026 11:56:31 -0700 Subject: [PATCH 05/12] Overhaul judge and criteria for E2E testing with CLI agent 
reviewers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Major changes: Judge: Replaced CodebuffClient SDK-based LLM judges with real CLI coding agents (Claude Code, Codex, Gemini) that run IN the repo. Reviewer agents can build, run tests, start the dev server, use browser tools, curl endpoints, check logs — actual E2E verification, not just diff reading. Structured output via result file (evalbuff-review-result.json) with fallback to stdout JSON extraction. Criteria: Shifted from code style (correctness, completeness, pattern consistency, fluency) to E2E verification levels: - L1: Builds, existing tests pass, basic completeness - L2: Feature works E2E (browser/curl/client), logs clean - L3: Edge cases & error states tested E2E, UI verification - L4: Cross-component integration, performance, no regressions - L5: Production readiness (migrations, env vars, error recovery) Orchestrator: Judge now runs inside withTestRepo callback so reviewer agents have access to the live repo. CodebuffClient only used for doc writer (analyzeFailure). Added --reviewers CLI flag. 
Co-Authored-By: Claude Opus 4.6 --- evals/evalbuff/__tests__/criteria.test.ts | 40 +- evals/evalbuff/__tests__/e2e.test.ts | 2 + .../__tests__/loop.integration.test.ts | 2 + evals/evalbuff/criteria.ts | 60 +- evals/evalbuff/evalbuff-criteria.json | 14 +- evals/evalbuff/judge.ts | 599 +++++++++++------- evals/evalbuff/run-evalbuff.ts | 81 ++- 7 files changed, 507 insertions(+), 291 deletions(-) diff --git a/evals/evalbuff/__tests__/criteria.test.ts b/evals/evalbuff/__tests__/criteria.test.ts index 1052fcdbac..3b25cfb5c9 100644 --- a/evals/evalbuff/__tests__/criteria.test.ts +++ b/evals/evalbuff/__tests__/criteria.test.ts @@ -26,32 +26,34 @@ describe('getCriteriaForLevel', () => { const criteria = getCriteriaForLevel(1) expect(criteria).toHaveLength(3) expect(criteria.map((c) => c.name)).toEqual([ - 'Correctness', - 'Completeness', - 'Basic Style', + 'Builds & Compiles', + 'Existing Tests Pass', + 'Basic Completeness', ]) }) it('accumulates criteria up to level 3', () => { const criteria = getCriteriaForLevel(3) expect(criteria.map((c) => c.name)).toEqual([ - 'Correctness', - 'Completeness', - 'Basic Style', - 'Pattern Consistency', - 'Test Quality', + 'Builds & Compiles', + 'Existing Tests Pass', + 'Basic Completeness', + 'Feature Works E2E', + 'Logs & Observability', + 'Edge Cases & Error States', + 'UI/UX Verification', ]) }) it('includes all criteria at level 5', () => { const criteria = getCriteriaForLevel(5) - expect(criteria).toHaveLength(7) - expect(criteria[criteria.length - 1].name).toBe('Fluency') + expect(criteria).toHaveLength(10) + expect(criteria[criteria.length - 1].name).toBe('Production Readiness') }) it('caps at level 5 even if higher number passed', () => { const criteria = getCriteriaForLevel(10) - expect(criteria).toHaveLength(7) + expect(criteria).toHaveLength(10) }) }) @@ -86,7 +88,6 @@ describe('maybePromoteCriteria', () => { it('uses only the last N scores in the window', () => { const criteria = makeCriteria(2, 8.0, 3) - // Old scores are 
low, but last 3 are high const scores = [3.0, 4.0, 5.0, 8.5, 9.0, 8.5] const newLevel = maybePromoteCriteria(criteria, scores) expect(newLevel).toBe(3) @@ -94,18 +95,25 @@ describe('maybePromoteCriteria', () => { }) describe('formatCriteriaForPrompt', () => { - it('includes level and all criteria names', () => { + it('includes level and E2E-focused criteria names', () => { const criteria = makeCriteria(2) const prompt = formatCriteriaForPrompt(criteria) expect(prompt).toContain('Level 2/5') - expect(prompt).toContain('Correctness') - expect(prompt).toContain('Pattern Consistency') + expect(prompt).toContain('Builds & Compiles') + expect(prompt).toContain('Feature Works E2E') }) it('includes weights', () => { const criteria = makeCriteria(1) const prompt = formatCriteriaForPrompt(criteria) expect(prompt).toContain('weight: 3') - expect(prompt).toContain('weight: 1') + expect(prompt).toContain('weight: 2') + }) + + it('instructs E2E verification', () => { + const criteria = makeCriteria(1) + const prompt = formatCriteriaForPrompt(criteria) + expect(prompt).toContain('MUST verify') + expect(prompt).toContain('E2E testing') }) }) diff --git a/evals/evalbuff/__tests__/e2e.test.ts b/evals/evalbuff/__tests__/e2e.test.ts index 987256b8e0..b64f142520 100644 --- a/evals/evalbuff/__tests__/e2e.test.ts +++ b/evals/evalbuff/__tests__/e2e.test.ts @@ -55,8 +55,10 @@ mock.module('../judge', () => ({ analysis: `Mock analysis for call ${judgeCallCount}`, strengths: ['Correctly identified the problem'], weaknesses: ['Missing error handling', 'No tests added'], + e2eTestsPerformed: ['Started dev server', 'Tested API endpoint'], completionScore: score, codeQualityScore: score, + e2eScore: score, overallScore: score, } satisfies JudgingResult }, diff --git a/evals/evalbuff/__tests__/loop.integration.test.ts b/evals/evalbuff/__tests__/loop.integration.test.ts index 80182447de..d4e5636d33 100644 --- a/evals/evalbuff/__tests__/loop.integration.test.ts +++ 
b/evals/evalbuff/__tests__/loop.integration.test.ts @@ -56,8 +56,10 @@ mock.module('../judge', () => ({ analysis: 'Mock analysis', strengths: ['Good'], weaknesses: ['Could improve'], + e2eTestsPerformed: ['Mock E2E test'], completionScore: score, codeQualityScore: score, + e2eScore: score, overallScore: score, } satisfies JudgingResult }, diff --git a/evals/evalbuff/criteria.ts b/evals/evalbuff/criteria.ts index aa768baf43..bc3f9cd290 100644 --- a/evals/evalbuff/criteria.ts +++ b/evals/evalbuff/criteria.ts @@ -16,54 +16,72 @@ export interface QualityCriteria { export const DEFAULT_CRITERIA: Record = { 1: [ { - name: 'Correctness', + name: 'Builds & Compiles', weight: 3, description: - 'The code compiles, runs without errors, and produces the expected behavior.', + 'The code compiles, builds, and the project starts without errors. Run the build command and verify it succeeds.', }, { - name: 'Completeness', + name: 'Existing Tests Pass', weight: 3, description: - 'All aspects of the prompt are addressed. No partial implementations or TODO comments.', + 'All pre-existing tests still pass. Run the test suite and confirm no regressions were introduced.', }, { - name: 'Basic Style', - weight: 1, + name: 'Basic Completeness', + weight: 2, description: - 'Code follows basic formatting conventions and is readable.', + 'All aspects of the prompt are addressed. No partial implementations or TODO comments left behind.', }, ], 2: [ { - name: 'Pattern Consistency', - weight: 2, + name: 'Feature Works E2E', + weight: 4, description: - 'New code follows the same patterns, naming conventions, and architectural style as existing code in the codebase.', + 'The new feature or bug fix actually works when you use the application. Start the app, navigate to the relevant page or endpoint, and exercise the feature. 
Use browser tools, curl, or the appropriate client to verify the happy path end-to-end.', + }, + { + name: 'Logs & Observability', + weight: 1, + description: + 'Check application logs for errors, warnings, or stack traces during E2E testing. Verify no unexpected errors appear when exercising the feature.', }, ], 3: [ { - name: 'Test Quality', + name: 'Edge Cases & Error States', + weight: 3, + description: + 'Test error states and edge cases E2E. Submit invalid inputs, trigger error conditions, test boundary values. Verify the app handles them gracefully without crashing.', + }, + { + name: 'UI/UX Verification', weight: 2, description: - 'Tests are meaningful, cover edge cases, and test behavior rather than implementation details.', + 'For UI changes: visually verify the rendered output. Check layout, responsiveness, and that the UI matches expectations. Take screenshots to document.', }, ], 4: [ { - name: 'Optimal Design', + name: 'Cross-Component Integration', weight: 2, description: - 'Code is DRY, uses the right abstractions, and the diff is minimal — no unnecessary changes.', + 'Verify the change works correctly with related features. Test flows that cross component boundaries. If a backend change was made, verify the frontend still works. If a DB migration was added, verify queries work.', + }, + { + name: 'Performance & No Regressions', + weight: 2, + description: + 'Verify no performance regressions. Check page load times, API response times, or resource usage. Ensure the change does not break unrelated features.', }, ], 5: [ { - name: 'Fluency', - weight: 1, + name: 'Production Readiness', + weight: 2, description: - 'Code reads like a senior engineer wrote it. Idiomatic usage of the language and framework. No over-engineering.', + 'Full production readiness check. Verify migrations, environment variable handling, error recovery, and graceful degradation. 
The change should be safe to deploy.', }, ], } @@ -122,13 +140,13 @@ export function maybePromoteCriteria( } /** - * Format criteria as text for injection into judge prompts. + * Format criteria as text for injection into reviewer agent prompts. */ export function formatCriteriaForPrompt(criteria: QualityCriteria): string { const lines = [ `## Quality Criteria (Level ${criteria.level}/5)`, '', - 'Apply these additional quality criteria when scoring. Higher levels add stricter standards:', + 'You MUST verify each of these criteria. Higher levels require deeper E2E testing:', '', ] @@ -138,7 +156,9 @@ export function formatCriteriaForPrompt(criteria: QualityCriteria): string { lines.push( '', - 'Weight these criteria proportionally when computing scores. A violation of a high-weight criterion should have a bigger impact on the score than a low-weight one.', + 'For each criterion, describe what you tested and what you observed. If you cannot test a criterion (e.g., no UI for a backend change), note that and explain why.', + '', + 'Weight these criteria proportionally when computing scores. A failure on a high-weight criterion should have a bigger impact on the score than a low-weight one.', ) return lines.join('\n') diff --git a/evals/evalbuff/evalbuff-criteria.json b/evals/evalbuff/evalbuff-criteria.json index 3d0790abb7..f080586b81 100644 --- a/evals/evalbuff/evalbuff-criteria.json +++ b/evals/evalbuff/evalbuff-criteria.json @@ -2,19 +2,19 @@ "level": 1, "criteria": [ { - "name": "Correctness", + "name": "Builds & Compiles", "weight": 3, - "description": "The code compiles, runs without errors, and produces the expected behavior." + "description": "The code compiles, builds, and the project starts without errors. Run the build command and verify it succeeds." }, { - "name": "Completeness", + "name": "Existing Tests Pass", "weight": 3, - "description": "All aspects of the prompt are addressed. No partial implementations or TODO comments." 
+ "description": "All pre-existing tests still pass. Run the test suite and confirm no regressions were introduced." }, { - "name": "Basic Style", - "weight": 1, - "description": "Code follows basic formatting conventions and is readable." + "name": "Basic Completeness", + "weight": 2, + "description": "All aspects of the prompt are addressed. No partial implementations or TODO comments left behind." } ], "promotionThreshold": 8.0, diff --git a/evals/evalbuff/judge.ts b/evals/evalbuff/judge.ts index d399e600f5..9b09a844b7 100644 --- a/evals/evalbuff/judge.ts +++ b/evals/evalbuff/judge.ts @@ -1,24 +1,25 @@ +import { execSync, spawn } from 'child_process' import fs from 'fs' import path from 'path' -import { withTimeout } from '@codebuff/common/util/promise' import { z } from 'zod/v4' -import type { QualityCriteria } from './criteria' import { formatCriteriaForPrompt } from './criteria' -import type { EvalCommitV2 } from './types' -import type { AgentDefinition, CodebuffClient } from '@codebuff/sdk' -const DEBUG_ERROR = true +import type { QualityCriteria } from './criteria' +import type { EvalCommitV2 } from './types' export const JudgingResultSchema = z.object({ analysis: z .string() - .describe('Detailed analysis comparing agent changes to ground truth'), + .describe('Detailed analysis of what was tested and found'), strengths: z .array(z.string()) .describe('Key strengths of the implementation'), weaknesses: z.array(z.string()).describe('Key weaknesses or issues found'), + e2eTestsPerformed: z + .array(z.string()) + .describe('List of E2E tests that were actually performed'), completionScore: z .number() .min(0) @@ -29,280 +30,450 @@ export const JudgingResultSchema = z.object({ .min(0) .max(10) .describe('Code structure and maintainability'), + e2eScore: z + .number() + .min(0) + .max(10) + .describe('How well the change works when tested end-to-end'), overallScore: z.number().min(0).max(10).describe('Combined assessment'), }) export type JudgingResult = z.infer 
-const judgeAgentBase: Omit = { - displayName: 'Judge', - toolNames: ['set_output'], - inputSchema: { - prompt: { type: 'string', description: 'The evaluation prompt' }, +// --- Reviewer agent types --- + +export type ReviewerAgentType = 'claude' | 'codex' | 'gemini' + +interface ReviewerConfig { + type: ReviewerAgentType + command: string[] + env?: Record + timeoutMs: number +} + +const REVIEWER_CONFIGS: Record = { + claude: { + type: 'claude', + command: [ + 'claude', + '-p', + '__PROMPT__', + '--output-format', + 'stream-json', + '--dangerously-skip-permissions', + ], + timeoutMs: 30 * 60 * 1000, // 30 min — needs time for E2E testing }, - outputMode: 'structured_output', - outputSchema: { - type: 'object', - properties: { - analysis: { - type: 'string', - description: - 'Detailed analysis comparing agent changes to ground truth', - }, - strengths: { - type: 'array', - items: { type: 'string' }, - description: 'Key strengths of the implementation', - }, - weaknesses: { - type: 'array', - items: { type: 'string' }, - description: 'Key weaknesses or issues found', - }, - completionScore: { - type: 'number', - minimum: 0, - maximum: 10, - description: 'How completely the prompt was addressed', - }, - codeQualityScore: { - type: 'number', - minimum: 0, - maximum: 10, - description: 'Code structure and maintainability', - }, - overallScore: { - type: 'number', - minimum: 0, - maximum: 10, - description: 'Combined assessment', - }, - }, - required: [ - 'analysis', - 'strengths', - 'weaknesses', - 'completionScore', - 'codeQualityScore', - 'overallScore', + codex: { + type: 'codex', + command: [ + 'codex', + 'exec', + '--full-auto', + '--json', + '-m', + 'gpt-5.1-codex', + '__PROMPT__', ], + timeoutMs: 30 * 60 * 1000, + }, + gemini: { + type: 'gemini', + command: ['gemini', '--yolo', '-p', '__PROMPT__'], + timeoutMs: 30 * 60 * 1000, }, - systemPrompt: `You are an expert software engineer evaluating AI-generated code changes with empathy for the task given. 
+} -## Your Role +// The result file name the reviewer agent is instructed to write +const RESULT_FILE_NAME = 'evalbuff-review-result.json' -You will receive: -1. The user prompt that the coding agent was given -2. Context files from the codebase -3. The ground truth changes (expected outcome) -4. The agent's actual changes +function buildReviewerPrompt(input: { + commit: EvalCommitV2 + contextFiles: Record + agentDiff: string + error?: string + criteria?: QualityCriteria + docsDir?: string +}): string { + const { commit, contextFiles, agentDiff, error, criteria, docsDir } = input -## Evaluation Philosophy + const groundTruthDiffs = commit.fileDiffs + .map(({ path: p, diff }) => `### ${p}\n\`\`\`diff\n${diff}\n\`\`\``) + .join('\n\n') -**Judge based on what the agent was asked to do, not on perfection.** + const contextFilesContent = Object.entries(contextFiles) + .map(([filePath, content]) => `### ${filePath}\n\`\`\`\n${content}\n\`\`\``) + .join('\n\n') -- If the prompt is vague or high-level (e.g., "add authentication"), be lenient and accept any reasonable implementation that achieves the goal -- If the prompt is specific and detailed, expect the implementation to match those details more closely -- Focus on whether the agent understood and addressed the user's intent -- Consider that there are often multiple valid ways to implement the same feature + const criteriaText = criteria + ? formatCriteriaForPrompt(criteria) + : '' -## Evaluation Criteria + const docsSection = docsDir + ? `\n## Project Docs\nRead the docs in the \`docs/\` directory and \`AGENTS.md\` for project-specific patterns and conventions before reviewing.\n` + : '' -- **Completion** (0-10): How well did the agent address what was asked in the prompt? Consider the specificity of the prompt. -- **Code Quality** (0-10): How well-structured and maintainable is the code? 
-- **Overall** (0-10): Combined assessment of whether the agent successfully completed the task as requested + return `You are a senior engineer performing a thorough code review with E2E testing. -## Ground Truth +## Your Mission -The ground truth shows ONE valid implementation, but it's not the only correct answer. The agent's implementation should be judged on: -- Does it achieve the same functional outcome? -- Is it a reasonable approach given the prompt? -- Does it maintain code quality? +You have been given a coding task, the ground truth solution, and an AI agent's attempt. Your job is to: -Provide detailed analysis, strengths, weaknesses, and numerical scores.`, -} +1. **Read the project docs** (if present) to understand conventions and patterns +2. **Review the agent's diff** against the ground truth +3. **Actually test the changes** end-to-end: + - Start the application if possible (check package.json for start/dev scripts) + - Use browser tools, curl, or the appropriate client to exercise the feature + - Check logs for errors + - Test edge cases and error states + - Take screenshots of UI changes if applicable +4. **Write your judgment** to a JSON file -const judgeAgents: Record = { - 'judge-gpt': { - id: 'judge-gpt', - model: 'openai/gpt-5.1', - ...judgeAgentBase, - }, - 'judge-gemini': { - id: 'judge-gemini', - model: 'google/gemini-3-pro-preview', - ...judgeAgentBase, - }, - 'judge-sonnet': { - id: 'judge-claude', - model: 'anthropic/claude-sonnet-4.5', - ...judgeAgentBase, - }, +## Important: You have full access to the repository and can run any commands. 
+ +Use whatever tools you need to verify the change actually works: +- Run the build/compile step +- Run the test suite +- Start the dev server +- Use browser tools to test the UI +- curl API endpoints +- Check logs +- Use tmux for long-running processes +- Any other verification method appropriate for the change + +${docsSection} +## User Prompt (What the agent was asked to do) +${commit.prompt} + +## Context Files (from parent commit) +${contextFilesContent || '(No context files)'} + +## Ground Truth Changes (One valid implementation) +${groundTruthDiffs} + +## Agent's Changes (What the agent actually did) +\`\`\`diff +${agentDiff || '(No changes made)'} +\`\`\` +${error ? `\n## Error Encountered During Agent Run\n${error}\n` : ''} +${criteriaText} + +## Required Output + +After your review and testing, write your judgment to the file \`${RESULT_FILE_NAME}\` in the current working directory. The JSON must have exactly this structure: + +\`\`\`json +{ + "analysis": "Detailed analysis of what you tested and found...", + "strengths": ["strength 1", "strength 2"], + "weaknesses": ["weakness 1", "weakness 2"], + "e2eTestsPerformed": ["Started dev server and loaded /dashboard", "Submitted form with invalid email", "Checked network tab for API errors"], + "completionScore": 7, + "codeQualityScore": 8, + "e2eScore": 6, + "overallScore": 7 } +\`\`\` -interface JudgeCommitResultInput { - client: CodebuffClient - commit: EvalCommitV2 - contextFiles: Record - agentDiff: string - error?: string - finalCheckOutputs?: string - criteria?: QualityCriteria +All scores are 0-10. The e2eScore specifically measures how well the change works when actually tested, not just how the code looks. + +IMPORTANT: You MUST write the result file. This is the only way your review gets recorded. 
Do it as your very last action.` } -async function runSingleJudge( - input: JudgeCommitResultInput, - judgePrompt: string, - judgeAgentId: string, +/** + * Run a single reviewer agent in the given repo directory. + * The agent writes its judgment to a JSON file which we parse. + */ +async function runReviewerAgent( + agentType: ReviewerAgentType, + prompt: string, + cwd: string, + env?: Record, ): Promise { - const { client } = input + const config = REVIEWER_CONFIGS[agentType] + const args = config.command + .slice(1) + .map((a) => (a === '__PROMPT__' ? prompt : a)) - const judgeAgent = judgeAgents[judgeAgentId] - const agentOutput: string[] = [] - try { - const judgeResult = await withTimeout( - client.run({ - agent: judgeAgent.id, - prompt: judgePrompt, - agentDefinitions: Object.values(judgeAgents), - handleEvent: (event) => { - if (event.type === 'text') { - agentOutput.push(event.text) - } else if (event.type === 'tool_call') { - agentOutput.push(JSON.stringify(event, null, 2)) - } else if (event.type === 'error') { - console.warn(`[Judge ${judgeAgentId}] Error event:`, event.message) - } - }, - }), - 20 * 60 * 1000, - 'Judge agent timed out after 20 minutes', - ) + const cmd = config.command[0] - if (judgeResult.output.type !== 'structuredOutput') { - console.error( - `Judge ${judgeAgentId} - not structured output`, - JSON.stringify(judgeResult.output, null, 2), + console.log(`[Reviewer:${agentType}] Starting review in ${cwd}`) + + return new Promise((resolve) => { + const child = spawn(cmd, args, { + cwd, + env: { ...process.env, ...config.env, ...env }, + stdio: ['ignore', 'pipe', 'pipe'], + }) + + let stdout = '' + let stderr = '' + + const timer = setTimeout(() => { + console.warn( + `[Reviewer:${agentType}] Timed out after ${config.timeoutMs / 1000}s`, ) + child.kill('SIGTERM') + setTimeout(() => { + if (!child.killed) child.kill('SIGKILL') + }, 5000) + }, config.timeoutMs) + + child.stdout.on('data', (data: Buffer) => { + stdout += data.toString() + }) 
+ + child.stderr.on('data', (data: Buffer) => { + stderr += data.toString() + }) + + child.on('error', (error) => { + clearTimeout(timer) console.error( - 'Judge agent output:', - JSON.stringify(judgeResult.output, null, 2), - 'Judge agent output trace:', - agentOutput.join(''), + `[Reviewer:${agentType}] Failed to start: ${error.message}`, ) - if (DEBUG_ERROR) { - fs.writeFileSync( - path.join( - __dirname, - '..', - `${input.commit.id}-${judgeAgentId}-agent-output-error.json`, - ), - JSON.stringify( - { output: judgeResult.output, trace: agentOutput }, - null, - 2, - ), - ) + resolve(null) + }) + + child.on('close', (code) => { + clearTimeout(timer) + console.log( + `[Reviewer:${agentType}] Exited with code ${code}`, + ) + + // Try to read the result file the agent wrote + const resultPath = path.join(cwd, RESULT_FILE_NAME) + const result = parseResultFile(resultPath, agentType) + + if (result) { + resolve(result) + return + } + + // Fallback: try to extract JSON from stdout + const extracted = extractJsonFromOutput(stdout, agentType) + if (extracted) { + resolve(extracted) + return } - return null - } - return judgeResult.output.value as JudgingResult + console.warn( + `[Reviewer:${agentType}] No result file or parseable output found`, + ) + resolve(null) + }) + }) +} + +/** + * Try to parse the result file written by the reviewer agent. 
+ */ +function parseResultFile( + resultPath: string, + agentType: string, +): JudgingResult | null { + try { + if (!fs.existsSync(resultPath)) return null + const raw = JSON.parse(fs.readFileSync(resultPath, 'utf-8')) + const parsed = JudgingResultSchema.safeParse(raw) + if (parsed.success) { + console.log( + `[Reviewer:${agentType}] Parsed result file successfully`, + ) + return parsed.data + } + console.warn( + `[Reviewer:${agentType}] Result file failed validation:`, + parsed.error, + ) + // Try to salvage partial result + return salvagePartialResult(raw) } catch (error) { - console.warn(`Judge ${judgeAgentId} failed:`, error) + console.warn( + `[Reviewer:${agentType}] Failed to parse result file:`, + error, + ) return null } } -export async function judgeCommitResult( - input: JudgeCommitResultInput, -): Promise { - const { commit, contextFiles, agentDiff, error, finalCheckOutputs, criteria } = - input +/** + * Try to extract JSON from the agent's stdout as a fallback. + * Looks for the last JSON block that matches our schema. 
+ */ +function extractJsonFromOutput( + output: string, + agentType: string, +): JudgingResult | null { + // Try to find JSON blocks in the output (between ``` or raw JSON objects) + const jsonPatterns = [ + // Match JSON in code fences + /```(?:json)?\s*\n({[\s\S]*?})\n\s*```/g, + // Match standalone JSON objects (greedy, last match wins) + /(\{[^{}]*"overallScore"[^{}]*\})/g, + ] - const { prompt, fileDiffs } = commit + for (const pattern of jsonPatterns) { + const matches = [...output.matchAll(pattern)] + // Try last match first (most likely to be the final result) + for (let i = matches.length - 1; i >= 0; i--) { + try { + const raw = JSON.parse(matches[i][1]) + const parsed = JudgingResultSchema.safeParse(raw) + if (parsed.success) { + console.log( + `[Reviewer:${agentType}] Extracted result from stdout`, + ) + return parsed.data + } + const salvaged = salvagePartialResult(raw) + if (salvaged) return salvaged + } catch { + continue + } + } + } - const groundTruthDiffs = fileDiffs - .map(({ path, diff }) => { - return `### ${path}\n\`\`\`diff\n${diff}\n\`\`\`` - }) - .join('\n\n') + return null +} - const contextFilesContent = Object.entries(contextFiles) - .map(([filePath, content]) => { - return `### ${filePath}\n\`\`\`\n${content}\n\`\`\`` - }) - .join('\n\n') +/** + * Try to salvage a partially valid result by filling in defaults. + */ +function salvagePartialResult(raw: any): JudgingResult | null { + if (typeof raw !== 'object' || raw === null) return null + if (typeof raw.overallScore !== 'number') return null - const judgePrompt = `## User Prompt (What the agent was asked to do) -${prompt} + return { + analysis: raw.analysis || 'No analysis provided', + strengths: Array.isArray(raw.strengths) ? raw.strengths : [], + weaknesses: Array.isArray(raw.weaknesses) ? raw.weaknesses : [], + e2eTestsPerformed: Array.isArray(raw.e2eTestsPerformed) + ? raw.e2eTestsPerformed + : [], + completionScore: + typeof raw.completionScore === 'number' ? 
raw.completionScore : raw.overallScore, + codeQualityScore: + typeof raw.codeQualityScore === 'number' + ? raw.codeQualityScore + : raw.overallScore, + e2eScore: + typeof raw.e2eScore === 'number' ? raw.e2eScore : raw.overallScore, + overallScore: raw.overallScore, + } +} -## Context Files (from parent commit) -${contextFilesContent || '(No context files)'} +// --- Public API --- -## Ground Truth Changes (One valid implementation) -${groundTruthDiffs} +export interface JudgeCommitResultInput { + commit: EvalCommitV2 + contextFiles: Record + agentDiff: string + repoDir: string // the test repo where the agent's changes live + error?: string + criteria?: QualityCriteria + reviewerAgents?: ReviewerAgentType[] + env?: Record +} -## Agent's Changes (What the agent actually did) -\`\`\`diff -${agentDiff || '(No changes made)'} -\`\`\` -${error ? `\n## Error Encountered\n${error}` : ''} -${finalCheckOutputs ? `\n## Final Check Command Outputs\n${finalCheckOutputs}` : ''} -${criteria ? `\n${formatCriteriaForPrompt(criteria)}` : ''}` - - // Run 2 judges in parallel - const judgePromises = [ - runSingleJudge(input, judgePrompt, 'judge-gpt'), - runSingleJudge(input, judgePrompt, 'judge-gemini'), - ] +/** + * Judge a commit result by running reviewer agents in the repo. + * Each reviewer agent can read docs, run the app, test E2E, and write a result file. + */ +export async function judgeCommitResult( + input: JudgeCommitResultInput, +): Promise { + const { + commit, + contextFiles, + agentDiff, + repoDir, + error, + criteria, + reviewerAgents = ['claude', 'codex'], + env, + } = input + + const prompt = buildReviewerPrompt({ + commit, + contextFiles, + agentDiff, + error, + criteria, + docsDir: fs.existsSync(path.join(repoDir, 'docs')) ? 
repoDir : undefined, + }) + + // Run reviewer agents in parallel, each in their own copy of the repo + const reviewPromises = reviewerAgents.map(async (agentType) => { + // Each reviewer gets its own copy of the repo so they don't interfere + const reviewDir = `${repoDir}-review-${agentType}` + try { + execSync(`cp -r ${repoDir} ${reviewDir}`, { stdio: 'ignore' }) + return await runReviewerAgent(agentType, prompt, reviewDir, env) + } finally { + try { + fs.rmSync(reviewDir, { recursive: true, force: true }) + } catch { + // ignore cleanup errors + } + } + }) - const judgeResults = await Promise.all(judgePromises) - const validResults = judgeResults.filter( - (result): result is JudgingResult => result !== null, + const results = await Promise.all(reviewPromises) + const validResults = results.filter( + (r): r is JudgingResult => r !== null, ) if (validResults.length === 0) { - console.error('All judges failed to provide results') + console.error( + `All reviewer agents failed (${reviewerAgents.join(', ')})`, + ) return { - analysis: 'Error running judge agent - all judges failed', + analysis: 'Error: all reviewer agents failed to provide results', strengths: [], - weaknesses: ['All judges failed to provide structured output'], + weaknesses: ['All reviewer agents failed'], + e2eTestsPerformed: [], completionScore: 0, codeQualityScore: 0, + e2eScore: 0, overallScore: 0, } } - // Sort judges by overall score and select the median for analysis - const sortedResults = validResults.sort( + // Sort by overall score, pick median for analysis + const sorted = validResults.sort( (a, b) => a.overallScore - b.overallScore, ) - const medianIndex = Math.floor(sortedResults.length / 2) - const medianResult = sortedResults[medianIndex] + const medianIdx = Math.floor(sorted.length / 2) + const medianResult = sorted[medianIdx] - // Calculate average scores across all valid judges - const averageCompletionScore = - validResults.reduce((sum, r) => sum + r.completionScore, 0) / - 
validResults.length - const averageCodeQualityScore = - validResults.reduce((sum, r) => sum + r.codeQualityScore, 0) / - validResults.length - const averageOverallScore = - validResults.reduce((sum, r) => sum + r.overallScore, 0) / + // Average scores across all valid reviewers + const avg = (key: keyof JudgingResult) => + validResults.reduce((sum, r) => sum + (r[key] as number), 0) / validResults.length + const avgCompletionScore = avg('completionScore') + const avgCodeQualityScore = avg('codeQualityScore') + const avgE2eScore = avg('e2eScore') + const avgOverallScore = avg('overallScore') + + // Merge e2eTestsPerformed from all reviewers + const allE2eTests = [ + ...new Set(validResults.flatMap((r) => r.e2eTestsPerformed)), + ] + console.log( - `Judging results overall score: ${averageOverallScore.toFixed(1)} (individual scores: ${validResults.map((r) => r.overallScore.toFixed(1)).join(', ')})`, + `Review results: overall=${avgOverallScore.toFixed(1)}, e2e=${avgE2eScore.toFixed(1)} (${validResults.length}/${reviewerAgents.length} reviewers)`, ) - // Return median judge's analysis with averaged scores return { analysis: medianResult.analysis, strengths: medianResult.strengths, weaknesses: medianResult.weaknesses, - completionScore: averageCompletionScore, - codeQualityScore: averageCodeQualityScore, - overallScore: averageOverallScore, + e2eTestsPerformed: allE2eTests, + completionScore: avgCompletionScore, + codeQualityScore: avgCodeQualityScore, + e2eScore: avgE2eScore, + overallScore: avgOverallScore, } } diff --git a/evals/evalbuff/run-evalbuff.ts b/evals/evalbuff/run-evalbuff.ts index 9a54b4a69b..1acd3cb041 100644 --- a/evals/evalbuff/run-evalbuff.ts +++ b/evals/evalbuff/run-evalbuff.ts @@ -25,6 +25,7 @@ import { import { withTestRepo } from './test-repo-utils' import type { QualityCriteria } from './criteria' +import type { ReviewerAgentType } from './judge' import type { EvalbuffLogEntry } from './morning-report' import type { EvalCommitV2, EvalDataV2 } 
from './types' @@ -37,6 +38,7 @@ export interface EvalbuffOptions { scoreThreshold: number agentTimeoutMs: number criteriaPath?: string + reviewerAgents?: ReviewerAgentType[] } interface EvalbuffState { @@ -126,6 +128,7 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise { scoreThreshold, agentTimeoutMs, criteriaPath, + reviewerAgents, } = options const statePath = path.join(repoPath, 'evalbuff-state.json') @@ -137,11 +140,13 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise { let criteria = loadCriteria(defaultCriteriaPath) const tasks = loadEvalTasks(evalDataPaths) + // CodebuffClient is only used for doc writer (analyzeFailure), not for judging const client = new CodebuffClient({}) console.log(`Evalbuff starting:`) console.log(` Repo: ${repoPath}`) console.log(` Agent: ${agentCommand}`) + console.log(` Reviewer agents: ${(reviewerAgents || ['claude', 'codex']).join(', ')}`) console.log(` Tasks: ${tasks.length}`) console.log(` Max iterations: ${maxIterations}`) console.log(` Max cost: $${maxCostUsd}`) @@ -189,9 +194,9 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise { } try { - // Step 1: Run agent with current docs + // Step 1: Run agent with current docs, then judge in the same repo console.log(`Running agent on task ${task.id}...`) - const oldResult = await withTestRepo( + const oldJudging = await withTestRepo( { repoUrl: evalData.repoUrl, parentSha: task.parentSha, @@ -211,26 +216,28 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise { }) const contextFiles = getContextFiles(repoDir, task) + logEntry.costUsd += result.durationMs * 0.001 - return { ...result, contextFiles } + // Judge the result — reviewer agents run IN the repo + // so they can build, test, start the app, use browser tools, etc. 
+ console.log(`Judging result with reviewer agents...`) + const judging = await judgeCommitResult({ + commit: task, + contextFiles, + agentDiff: result.diff, + repoDir, + error: result.exitCode !== 0 ? result.stderr : undefined, + criteria, + reviewerAgents, + env: evalData.env, + }) + + return judging }, ) - // Judge the result - console.log(`Judging result...`) - const oldJudging = await judgeCommitResult({ - client, - commit: task, - contextFiles: oldResult.contextFiles, - agentDiff: oldResult.diff, - error: oldResult.exitCode !== 0 ? oldResult.stderr : undefined, - criteria, - }) - logEntry.oldScore = oldJudging.overallScore - logEntry.costUsd += oldResult.durationMs * 0.001 // rough estimate - - console.log(`Score: ${oldJudging.overallScore.toFixed(1)}/10`) + console.log(`Score: ${oldJudging.overallScore.toFixed(1)}/10 (e2e: ${oldJudging.e2eScore.toFixed(1)})`) // Step 2: If score is low, try to improve docs if (oldJudging.overallScore < scoreThreshold) { @@ -246,7 +253,7 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise { client, judgeResult: oldJudging, taskPrompt: task.prompt, - agentDiff: oldResult.diff, + agentDiff: '', // agent diff not preserved after withTestRepo cleanup groundTruthDiff, currentDocs, scoreThreshold, @@ -261,9 +268,9 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise { reasoning: docSuggestion.reasoning, } - // Re-run with updated docs on a FRESH repo + // Re-run with updated docs on a FRESH repo, judge inside console.log(`Re-running agent with new doc...`) - const newResult = await withTestRepo( + const newJudging = await withTestRepo( { repoUrl: evalData.repoUrl, parentSha: task.parentSha, @@ -271,7 +278,6 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise { env: evalData.env, }, async (freshRepoDir) => { - // Copy existing docs + new doc copyDocsIntoRepo(repoPath, freshRepoDir) applyDocEdit( freshRepoDir, @@ -288,22 +294,23 @@ export async function runEvalbuff(options: 
EvalbuffOptions): Promise { }) const contextFiles = getContextFiles(freshRepoDir, task) - return { ...result, contextFiles } + logEntry.costUsd += result.durationMs * 0.001 + + console.log(`Re-judging with reviewer agents...`) + return await judgeCommitResult({ + commit: task, + contextFiles, + agentDiff: result.diff, + repoDir: freshRepoDir, + error: result.exitCode !== 0 ? result.stderr : undefined, + criteria, + reviewerAgents, + env: evalData.env, + }) }, ) - // Judge the new result - const newJudging = await judgeCommitResult({ - client, - commit: task, - contextFiles: newResult.contextFiles, - agentDiff: newResult.diff, - error: newResult.exitCode !== 0 ? newResult.stderr : undefined, - criteria, - }) - logEntry.newScore = newJudging.overallScore - logEntry.costUsd += newResult.durationMs * 0.001 logEntry.scoreComparison = compareScores( oldJudging.overallScore, newJudging.overallScore, @@ -322,7 +329,6 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise { docSuggestion.suggestedContent, ) - // Commit the doc change try { execSync('git add docs/ AGENTS.md', { cwd: repoPath, @@ -409,6 +415,12 @@ async function main() { const criteriaPath = args.includes('--criteria') ? getArg('criteria') : undefined + const reviewerAgentsArg = args.includes('--reviewers') + ? getArg('reviewers') + : undefined + const reviewerAgents = reviewerAgentsArg + ? 
(reviewerAgentsArg.split(',') as ReviewerAgentType[]) + : undefined await runEvalbuff({ repoPath, @@ -419,6 +431,7 @@ async function main() { scoreThreshold, agentTimeoutMs, criteriaPath, + reviewerAgents, }) } From 7122c0574fdafd20b8fa9ebfc4112f28288d0b0b Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 26 Mar 2026 14:07:24 -0700 Subject: [PATCH 06/12] evalbuff: add docs for add-deep-thinkers --- AGENTS.md | 5 + docs/patterns/handle-steps-generators.md | 180 +++++++++++++++++++++++ 2 files changed, 185 insertions(+) create mode 100644 AGENTS.md create mode 100644 docs/patterns/handle-steps-generators.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000000..6511972cd5 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,5 @@ +# Documentation + +Table of contents for project documentation. + +- [docs/patterns/handle-steps-generators.md](docs/patterns/handle-steps-generators.md) diff --git a/docs/patterns/handle-steps-generators.md b/docs/patterns/handle-steps-generators.md new file mode 100644 index 0000000000..a3db4b672f --- /dev/null +++ b/docs/patterns/handle-steps-generators.md @@ -0,0 +1,180 @@ +# handleSteps Generator Pattern for Programmatic Agents + +When creating agents that use `handleSteps` generators to programmatically execute tool calls, follow these exact patterns to avoid TypeScript compilation errors. + +## Correct handleSteps Signature + +```typescript +import type { AgentDefinition } from '../types/agent-definition' + +const definition: AgentDefinition = { + // ... other fields + + handleSteps: function* ({ agentState, prompt, params }) { + // Generator body + }, +} +``` + +## Yielding Tool Calls + +Yield objects with `toolName` and `input` properties. The input schema must match the tool's expected parameters exactly. + +### spawn_agents Tool + +```typescript +handleSteps: function* ({ agentState, prompt, params }) { + const promptWithDefault = prompt ?? 
'Default prompt' + + yield { + toolName: 'spawn_agents', + input: { + agents: [ + { + agent_type: 'agent-id-1', + prompt: promptWithDefault, + }, + { + agent_type: 'agent-id-2', + prompt: promptWithDefault, + }, + ], + }, + } + + // After tool execution, yield 'STEP' to let the agent process results + yield 'STEP' +}, +``` + +### Common Mistakes + +**WRONG:** Using incorrect property names or nested structures +```typescript +// ❌ Incorrect - wrong tool call structure +yield { + type: 'tool_call', + name: 'spawn_agents', + arguments: { ... } +} +``` + +**WRONG:** Using `think_deeply` or custom tool names that don't exist +```typescript +// ❌ Incorrect - this tool doesn't exist +yield { + toolName: 'think_deeply', + input: { ... } +} +``` + +**CORRECT:** Use `toolName` and `input` at the top level +```typescript +// ✅ Correct +yield { + toolName: 'spawn_agents', + input: { + agents: [{ agent_type: 'my-agent', prompt: 'Do something' }] + } +} +``` + +## Yielding STEP + +After yielding tool calls, yield the string `'STEP'` to let the main agent process the results: + +```typescript +handleSteps: function* ({ prompt }) { + yield { + toolName: 'spawn_agents', + input: { agents: [...] }, + } + + // This tells the runtime to run an LLM step to process spawn results + yield 'STEP' +}, +``` + +## Agent Definition Requirements for Spawning + +Agents that spawn sub-agents must include: + +1. `toolNames: ['spawn_agents']` - Enable the spawn tool +2. `spawnableAgents: ['agent-id-1', 'agent-id-2']` - List allowed sub-agents + +```typescript +const definition: AgentDefinition = { + id: 'coordinator', + model: 'openai/gpt-5', + toolNames: ['spawn_agents'], + spawnableAgents: ['sub-agent-1', 'sub-agent-2', 'sub-agent-3'], + // ... 
+} +``` + +## Complete Example: Multi-Model Coordinator + +See `.agents/deep-thinking/deep-thinker.ts` for a working example: + +```typescript +import type { AgentDefinition } from '../types/agent-definition' + +const definition: AgentDefinition = { + id: 'deep-thinker', + displayName: 'Deep Thinker Agent', + model: 'openai/gpt-5', + + toolNames: ['spawn_agents'], + spawnableAgents: ['gpt5-thinker', 'sonnet-thinker', 'gemini-thinker'], + + inputSchema: { + prompt: { + type: 'string', + description: 'The topic to analyze', + }, + }, + + outputMode: 'last_message', + + handleSteps: function* ({ prompt }) { + const promptWithDefault = prompt ?? 'Think about this topic' + + yield { + toolName: 'spawn_agents', + input: { + agents: [ + { agent_type: 'gpt5-thinker', prompt: promptWithDefault }, + { agent_type: 'sonnet-thinker', prompt: promptWithDefault }, + { agent_type: 'gemini-thinker', prompt: promptWithDefault }, + ], + }, + } + + yield 'STEP' + }, +} + +export default definition +``` + +## Directory Structure + +Place related agents in subdirectories under `.agents/`: + +``` +.agents/ +└── deep-thinking/ + ├── deep-thinker.ts # Coordinator + ├── deepest-thinker.ts # Meta-coordinator + ├── gpt5-thinker.ts # Sub-agent + ├── sonnet-thinker.ts # Sub-agent + └── gemini-thinker.ts # Sub-agent +``` + +## Avoid Over-Engineering + +When implementing agents: +- Only create files that are directly requested +- Don't add documentation files unless explicitly asked +- Keep agent definitions simple - use `AgentDefinition` type, not custom wrappers +- Don't create factory patterns unless there's clear reuse need \ No newline at end of file From b238947a2a3a8e82c8712e1929edc913c33d36f4 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 26 Mar 2026 14:30:47 -0700 Subject: [PATCH 07/12] evalbuff: rewrite docs-optimizer to use Claude CLI instead of CodebuffClient Removes the CodebuffClient/SDK dependency from analyzeFailure. 
Uses Claude CLI with a temp file for the prompt (avoids CLI arg length limits). Adds JSON extraction with markdown fence stripping and validation. Co-Authored-By: Claude Opus 4.6 --- evals/evalbuff/docs-optimizer.ts | 112 ++++++++++++++++--------------- 1 file changed, 59 insertions(+), 53 deletions(-) diff --git a/evals/evalbuff/docs-optimizer.ts b/evals/evalbuff/docs-optimizer.ts index cd9dfde782..cc9b95d0d7 100644 --- a/evals/evalbuff/docs-optimizer.ts +++ b/evals/evalbuff/docs-optimizer.ts @@ -1,10 +1,9 @@ +import { execSync } from 'child_process' import fs from 'fs' +import os from 'os' import path from 'path' -import { withTimeout } from '@codebuff/common/util/promise' - import type { JudgingResult } from './judge' -import type { AgentDefinition, CodebuffClient } from '@codebuff/sdk' export interface DocSuggestion { reasoning: string @@ -12,36 +11,7 @@ export interface DocSuggestion { suggestedContent: string } -const docWriterAgent: AgentDefinition = { - id: 'doc-writer', - model: 'anthropic/claude-sonnet-4.5', - displayName: 'Doc Writer', - toolNames: ['set_output'], - inputSchema: { - prompt: { type: 'string', description: 'The analysis prompt' }, - }, - outputMode: 'structured_output', - outputSchema: { - type: 'object', - properties: { - reasoning: { - type: 'string', - description: - 'Why this doc would help the agent avoid the identified failure', - }, - suggestedDocPath: { - type: 'string', - description: - 'File path relative to docs/ directory, e.g. "patterns/error-handling.md"', - }, - suggestedContent: { - type: 'string', - description: 'The markdown content to write to the doc file', - }, - }, - required: ['reasoning', 'suggestedDocPath', 'suggestedContent'], - }, - systemPrompt: `You are an expert at writing developer documentation that helps AI coding agents perform better. +const DOC_WRITER_SYSTEM_PROMPT = `You are an expert at writing developer documentation that helps AI coding agents perform better. 
Your job: Given a coding agent's failure on a task, write a targeted documentation file that would prevent this class of error in the future. @@ -53,15 +23,23 @@ Your job: Given a coding agent's failure on a task, write a targeted documentati 4. Write docs that a coding agent will read and immediately know what to do differently. 5. Keep docs concise — under 200 lines. Dense information beats verbose explanations. 6. Use a logical file path that groups related docs together (e.g., "patterns/", "conventions/", "architecture/"). -7. Include examples of correct patterns from the codebase when possible.`, -} +7. Include examples of correct patterns from the codebase when possible. + +## Output Format + +You MUST respond with ONLY a JSON object (no markdown fences, no explanation). The JSON must have exactly these fields: +{ + "reasoning": "Why this doc would help", + "suggestedDocPath": "path/relative/to/docs/dir.md", + "suggestedContent": "The markdown content" +}` /** * Analyze a failure and suggest a doc edit to prevent it. + * Uses Claude CLI to generate suggestions. * Returns null if score is above threshold (no improvement needed). 
*/ export async function analyzeFailure({ - client, judgeResult, taskPrompt, agentDiff, @@ -69,13 +47,13 @@ export async function analyzeFailure({ currentDocs, scoreThreshold, }: { - client: CodebuffClient judgeResult: JudgingResult taskPrompt: string agentDiff: string groundTruthDiff: string currentDocs: Record scoreThreshold: number + client?: unknown // kept for backwards compat, ignored }): Promise { if (judgeResult.overallScore >= scoreThreshold) { return null @@ -85,7 +63,9 @@ export async function analyzeFailure({ .map(([docPath, content]) => `### ${docPath}\n\`\`\`\n${content}\n\`\`\``) .join('\n\n') - const prompt = `## Task Prompt + const prompt = `${DOC_WRITER_SYSTEM_PROMPT} + +## Task Prompt ${taskPrompt} ## Judge Analysis @@ -107,26 +87,47 @@ ${agentDiff || '(No changes made)'} ## Current Docs (already available to the agent) ${docsContent || '(No docs yet)'} -Based on the gap between what the agent did and what it should have done, write a doc file that would help the agent get it right next time. Focus on the specific weakness identified by the judge.` +Based on the gap between what the agent did and what it should have done, write a doc file that would help the agent get it right next time. Focus on the specific weakness identified by the judge. 
+ +Respond with ONLY the JSON object.` try { - const result = await withTimeout( - client.run({ - agent: docWriterAgent.id, - prompt, - agentDefinitions: [docWriterAgent], - handleEvent: () => {}, - }), - 10 * 60 * 1000, - 'Doc writer agent timed out after 10 minutes', - ) - - if (result.output.type !== 'structuredOutput') { - console.error('Doc writer did not return structured output') + // Write prompt to temp file to avoid CLI arg length limits + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-docwriter-')) + const promptFile = path.join(tmpDir, 'DOC_WRITER_PROMPT.md') + fs.writeFileSync(promptFile, prompt) + + let output: string + try { + output = execSync( + `claude --dangerously-skip-permissions -p "Read the file ${promptFile} and follow all instructions in it. Respond with ONLY the JSON object as specified."`, + { + encoding: 'utf-8', + timeout: 5 * 60 * 1000, + stdio: ['ignore', 'pipe', 'pipe'], + maxBuffer: 10 * 1024 * 1024, + }, + ).trim() + } finally { + fs.rmSync(tmpDir, { recursive: true, force: true }) + } + + // Try to extract JSON from the output + let jsonStr = output + // Strip markdown code fences if present + const jsonMatch = output.match(/```(?:json)?\s*\n([\s\S]*?)\n\s*```/) + if (jsonMatch) { + jsonStr = jsonMatch[1] + } + // Try to find a JSON object + const objMatch = jsonStr.match(/\{[\s\S]*\}/) + if (!objMatch) { + console.error('Doc writer did not return JSON') return null } - const value = result.output.value as DocSuggestion + const value = JSON.parse(objMatch[0]) as DocSuggestion + // Validate the path is under docs/ if ( value.suggestedDocPath.startsWith('/') || @@ -138,6 +139,11 @@ Based on the gap between what the agent did and what it should have done, write return null } + if (!value.reasoning || !value.suggestedDocPath || !value.suggestedContent) { + console.error('Doc writer returned incomplete suggestion') + return null + } + return value } catch (error) { console.error('Doc writer failed:', error) From 
3100dda8a36ed60807433897863400b390fb3ca3 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 26 Mar 2026 14:30:55 -0700 Subject: [PATCH 08/12] evalbuff: fix reviewer agent invocation and cost tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Write reviewer prompt to file instead of CLI args (avoids length limits) - Use rsync + node_modules symlink instead of cp -r (1.7GB → fast) - Don't pass eval env to reviewers (test API keys break real agents) - Strip API key env vars from coding agent env too - Remove CodebuffClient dependency from orchestrator - Fix cost estimate: was $1/sec, now $0.01/sec - Always log stderr/stdout on reviewer failure - Remove --output-format/--json flags from reviewer commands Co-Authored-By: Claude Opus 4.6 --- evals/evalbuff/judge.ts | 40 ++++++++++++++++++++++++++++------ evals/evalbuff/run-evalbuff.ts | 27 +++++++++++++---------- 2 files changed, 49 insertions(+), 18 deletions(-) diff --git a/evals/evalbuff/judge.ts b/evals/evalbuff/judge.ts index 9b09a844b7..f543afd3dc 100644 --- a/evals/evalbuff/judge.ts +++ b/evals/evalbuff/judge.ts @@ -58,8 +58,6 @@ const REVIEWER_CONFIGS: Record = { 'claude', '-p', '__PROMPT__', - '--output-format', - 'stream-json', '--dangerously-skip-permissions', ], timeoutMs: 30 * 60 * 1000, // 30 min — needs time for E2E testing @@ -70,7 +68,6 @@ const REVIEWER_CONFIGS: Record = { 'codex', 'exec', '--full-auto', - '--json', '-m', 'gpt-5.1-codex', '__PROMPT__', @@ -180,9 +177,14 @@ All scores are 0-10. The e2eScore specifically measures how well the change work IMPORTANT: You MUST write the result file. This is the only way your review gets recorded. Do it as your very last action.` } +const PROMPT_FILE_NAME = 'EVALBUFF_REVIEW_PROMPT.md' + +const BOOTSTRAP_PROMPT = `Read the file ${PROMPT_FILE_NAME} in the current directory and follow all instructions in it exactly. The file contains a code review task. 
After your review and testing, you MUST write your judgment to ${RESULT_FILE_NAME} as specified in the prompt file.` + /** * Run a single reviewer agent in the given repo directory. - * The agent writes its judgment to a JSON file which we parse. + * Writes the full prompt to a file in the repo, then gives the agent + * a short bootstrap prompt to read it (avoids CLI arg length limits). */ async function runReviewerAgent( agentType: ReviewerAgentType, @@ -191,9 +193,13 @@ async function runReviewerAgent( env?: Record, ): Promise { const config = REVIEWER_CONFIGS[agentType] + + // Write the full prompt to a file in the repo + fs.writeFileSync(path.join(cwd, PROMPT_FILE_NAME), prompt) + const args = config.command .slice(1) - .map((a) => (a === '__PROMPT__' ? prompt : a)) + .map((a) => (a === '__PROMPT__' ? BOOTSTRAP_PROMPT : a)) const cmd = config.command[0] @@ -240,6 +246,14 @@ async function runReviewerAgent( console.log( `[Reviewer:${agentType}] Exited with code ${code}`, ) + if (code !== 0) { + console.warn( + `[Reviewer:${agentType}] stderr (last 1000 chars): ${stderr.slice(-1000)}`, + ) + console.warn( + `[Reviewer:${agentType}] stdout (last 500 chars): ${stdout.slice(-500)}`, + ) + } // Try to read the result file the agent wrote const resultPath = path.join(cwd, RESULT_FILE_NAME) @@ -408,8 +422,20 @@ export async function judgeCommitResult( // Each reviewer gets its own copy of the repo so they don't interfere const reviewDir = `${repoDir}-review-${agentType}` try { - execSync(`cp -r ${repoDir} ${reviewDir}`, { stdio: 'ignore' }) - return await runReviewerAgent(agentType, prompt, reviewDir, env) + // Fast copy: use rsync to exclude heavy dirs, then symlink them + const nodeModulesPath = path.join(repoDir, 'node_modules') + const hasNodeModules = fs.existsSync(nodeModulesPath) + if (hasNodeModules) { + execSync( + `rsync -a --exclude node_modules "${repoDir}/" "${reviewDir}/"`, + { stdio: 'ignore' }, + ) + fs.symlinkSync(nodeModulesPath, path.join(reviewDir, 
'node_modules')) + } else { + execSync(`cp -r "${repoDir}" "${reviewDir}"`, { stdio: 'ignore' }) + } + // Don't pass eval env to reviewers — they need real API keys, not test ones + return await runReviewerAgent(agentType, prompt, reviewDir) } finally { try { fs.rmSync(reviewDir, { recursive: true, force: true }) diff --git a/evals/evalbuff/run-evalbuff.ts b/evals/evalbuff/run-evalbuff.ts index 1acd3cb041..07800758ef 100644 --- a/evals/evalbuff/run-evalbuff.ts +++ b/evals/evalbuff/run-evalbuff.ts @@ -2,8 +2,6 @@ import { execSync } from 'child_process' import fs from 'fs' import path from 'path' -import { CodebuffClient } from '@codebuff/sdk' - import { runCliAgent } from './cli-runner' import { getCriteriaForLevel, @@ -133,6 +131,18 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise { const statePath = path.join(repoPath, 'evalbuff-state.json') const logPath = path.join(repoPath, 'evalbuff-log.jsonl') + + // Strip API key env vars — eval data provides test keys for init commands + // but agents need their real API keys to function + const API_KEY_PATTERN = /(_KEY|_SECRET|_TOKEN|_API_KEY)$/i + const stripApiKeys = (env?: Record) => { + if (!env) return undefined + return Object.fromEntries( + Object.entries(env).filter(([k]) => !API_KEY_PATTERN.test(k)), + ) + } + const safeEnv = (evalData: { env?: Record }) => + stripApiKeys(evalData.env) const defaultCriteriaPath = criteriaPath || path.join(repoPath, 'evalbuff-criteria.json') @@ -140,8 +150,6 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise { let criteria = loadCriteria(defaultCriteriaPath) const tasks = loadEvalTasks(evalDataPaths) - // CodebuffClient is only used for doc writer (analyzeFailure), not for judging - const client = new CodebuffClient({}) console.log(`Evalbuff starting:`) console.log(` Repo: ${repoPath}`) @@ -212,11 +220,11 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise { prompt: task.prompt, cwd: repoDir, timeoutMs: 
agentTimeoutMs, - env: evalData.env, + env: safeEnv(evalData), }) const contextFiles = getContextFiles(repoDir, task) - logEntry.costUsd += result.durationMs * 0.001 + logEntry.costUsd += result.durationMs * 0.00001 // ~$0.01/sec rough estimate // Judge the result — reviewer agents run IN the repo // so they can build, test, start the app, use browser tools, etc. @@ -229,7 +237,6 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise { error: result.exitCode !== 0 ? result.stderr : undefined, criteria, reviewerAgents, - env: evalData.env, }) return judging @@ -250,7 +257,6 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise { const currentDocs = readCurrentDocs(repoPath) const docSuggestion = await analyzeFailure({ - client, judgeResult: oldJudging, taskPrompt: task.prompt, agentDiff: '', // agent diff not preserved after withTestRepo cleanup @@ -290,11 +296,11 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise { prompt: task.prompt, cwd: freshRepoDir, timeoutMs: agentTimeoutMs, - env: evalData.env, + env: safeEnv(evalData), }) const contextFiles = getContextFiles(freshRepoDir, task) - logEntry.costUsd += result.durationMs * 0.001 + logEntry.costUsd += result.durationMs * 0.00001 // ~$0.01/sec rough estimate console.log(`Re-judging with reviewer agents...`) return await judgeCommitResult({ @@ -305,7 +311,6 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise { error: result.exitCode !== 0 ? result.stderr : undefined, criteria, reviewerAgents, - env: evalData.env, }) }, ) From 86d3bce05357d160cd4d68cd4806f5dc14f086cb Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 26 Mar 2026 14:32:41 -0700 Subject: [PATCH 09/12] evalbuff: add real E2E test runner script Creates a local git repo with a simple subtract bug, generates an eval task, and runs the full evalbuff loop with real CLI agents. No mocks. 
Usage: bun run evals/evalbuff/run-e2e-test.ts Co-Authored-By: Claude Opus 4.6 --- evals/evalbuff/run-e2e-test.ts | 379 +++++++++++++++++++++++++++++++++ 1 file changed, 379 insertions(+) create mode 100644 evals/evalbuff/run-e2e-test.ts diff --git a/evals/evalbuff/run-e2e-test.ts b/evals/evalbuff/run-e2e-test.ts new file mode 100644 index 0000000000..433dd22865 --- /dev/null +++ b/evals/evalbuff/run-e2e-test.ts @@ -0,0 +1,379 @@ +/** + * Real E2E test for evalbuff. + * + * Creates a local git repo with a simple project, generates an eval task, + * and runs the full evalbuff loop with real CLI coding agents and real + * reviewer agents. No mocks. + * + * Prerequisites: + * - `claude` CLI installed and authenticated + * - (Optional) `codex` CLI installed with OPENAI_API_KEY set + * + * Usage: + * bun run evals/evalbuff/run-e2e-test.ts + */ +import { execSync } from 'child_process' +import fs from 'fs' +import os from 'os' +import path from 'path' + +import { runEvalbuff } from './run-evalbuff' + +import type { ReviewerAgentType } from './judge' +import type { EvalDataV2 } from './types' + +// --- Setup --- + +const BASE_DIR = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-real-e2e-')) +const PROJECT_DIR = path.join(BASE_DIR, 'project') +const BARE_REPO = path.join(BASE_DIR, 'project.git') +const TARGET_DIR = path.join(BASE_DIR, 'target') + +const gitEnv = { + GIT_AUTHOR_NAME: 'evalbuff-test', + GIT_AUTHOR_EMAIL: 'test@evalbuff.dev', + GIT_COMMITTER_NAME: 'evalbuff-test', + GIT_COMMITTER_EMAIL: 'test@evalbuff.dev', +} + +function git(cmd: string, cwd: string) { + return execSync(`git ${cmd}`, { + cwd, + encoding: 'utf-8', + stdio: ['ignore', 'pipe', 'pipe'], + env: { ...process.env, ...gitEnv }, + }).trim() +} + +function setupProject() { + console.log('\n=== Setting up test project ===') + + // Create project directory + fs.mkdirSync(PROJECT_DIR, { recursive: true }) + git('init', PROJECT_DIR) + + // Initial commit: a simple Node.js project with a bug + 
fs.writeFileSync( + path.join(PROJECT_DIR, 'package.json'), + JSON.stringify( + { + name: 'evalbuff-test-project', + version: '1.0.0', + type: 'module', + scripts: { + test: 'node test.js', + start: 'node index.js', + }, + }, + null, + 2, + ), + ) + + fs.writeFileSync( + path.join(PROJECT_DIR, 'index.js'), + `// Simple math utility +export function add(a, b) { + return a + b +} + +export function multiply(a, b) { + return a * b +} + +// BUG: subtract is wrong — it adds instead of subtracting +export function subtract(a, b) { + return a + b +} + +export function divide(a, b) { + if (b === 0) throw new Error('Division by zero') + return a / b +} +`, + ) + + fs.writeFileSync( + path.join(PROJECT_DIR, 'test.js'), + `import { add, subtract, multiply, divide } from './index.js' + +let passed = 0 +let failed = 0 + +function assert(name, actual, expected) { + if (actual === expected) { + console.log(\` ✓ \${name}\`) + passed++ + } else { + console.log(\` ✗ \${name}: expected \${expected}, got \${actual}\`) + failed++ + } +} + +console.log('Running tests...') +assert('add(2, 3)', add(2, 3), 5) +assert('multiply(3, 4)', multiply(3, 4), 12) +assert('subtract(10, 3)', subtract(10, 3), 7) +assert('divide(10, 2)', divide(10, 2), 5) + +try { + divide(1, 0) + console.log(' ✗ divide by zero should throw') + failed++ +} catch (e) { + console.log(' ✓ divide by zero throws') + passed++ +} + +console.log(\`\\n\${passed} passed, \${failed} failed\`) +if (failed > 0) process.exit(1) +`, + ) + + git('add .', PROJECT_DIR) + git('commit -m "Initial project with bug in subtract"', PROJECT_DIR) + const parentSha = git('rev-parse HEAD', PROJECT_DIR) + + console.log(` Parent commit (with bug): ${parentSha.slice(0, 8)}`) + + // Now create the ground truth fix + fs.writeFileSync( + path.join(PROJECT_DIR, 'index.js'), + `// Simple math utility +export function add(a, b) { + return a + b +} + +export function multiply(a, b) { + return a * b +} + +export function subtract(a, b) { + return a - b +} + 
+export function divide(a, b) { + if (b === 0) throw new Error('Division by zero') + return a / b +} +`, + ) + + git('add .', PROJECT_DIR) + git('commit -m "Fix subtract function"', PROJECT_DIR) + const fixSha = git('rev-parse HEAD', PROJECT_DIR) + + console.log(` Fix commit (ground truth): ${fixSha.slice(0, 8)}`) + + // Get the diff for the ground truth + const diff = git(`diff ${parentSha} ${fixSha} -- index.js`, PROJECT_DIR) + + // Create bare clone for withTestRepo to clone from + execSync(`git clone --bare ${PROJECT_DIR} ${BARE_REPO}`, { + stdio: 'ignore', + env: { ...process.env, ...gitEnv }, + }) + console.log(` Bare repo created at: ${BARE_REPO}`) + + return { parentSha, fixSha, diff } +} + +function createEvalFile(parentSha: string, fixSha: string, diff: string) { + console.log('\n=== Creating eval file ===') + + const evalData: EvalDataV2 = { + repoUrl: `file://${BARE_REPO}`, + generationDate: new Date().toISOString(), + evalCommits: [ + { + id: 'fix-subtract-bug', + sha: fixSha, + parentSha, + spec: 'Fix the subtract function which incorrectly adds instead of subtracting', + prompt: + 'The subtract function in index.js has a bug — it adds the two numbers instead of subtracting them. Fix it. 
Then run the tests to make sure they pass.', + supplementalFiles: ['test.js'], + fileDiffs: [ + { + path: 'index.js', + status: 'modified', + diff, + }, + ], + }, + ], + } + + const evalPath = path.join(BASE_DIR, 'eval.json') + fs.writeFileSync(evalPath, JSON.stringify(evalData, null, 2)) + console.log(` Eval file: ${evalPath}`) + return evalPath +} + +function setupTargetRepo() { + console.log('\n=== Setting up target repo (for docs output) ===') + + fs.mkdirSync(TARGET_DIR, { recursive: true }) + git('init', TARGET_DIR) + git('commit --allow-empty -m "init"', TARGET_DIR) + console.log(` Target repo: ${TARGET_DIR}`) + return TARGET_DIR +} + +function detectAvailableReviewers(): ReviewerAgentType[] { + const reviewers: ReviewerAgentType[] = [] + + try { + execSync('which claude', { stdio: 'ignore' }) + reviewers.push('claude') + console.log(' ✓ claude CLI found') + } catch { + console.log(' ✗ claude CLI not found') + } + + try { + execSync('which codex', { stdio: 'ignore' }) + if (process.env.OPENAI_API_KEY) { + reviewers.push('codex') + console.log(' ✓ codex CLI found (OPENAI_API_KEY set)') + } else { + console.log(' ✗ codex CLI found but OPENAI_API_KEY not set') + } + } catch { + console.log(' ✗ codex CLI not found') + } + + return reviewers +} + +async function main() { + console.log('╔══════════════════════════════════════════╗') + console.log('║ Evalbuff Real E2E Test ║') + console.log('╚══════════════════════════════════════════╝') + console.log(`\nBase dir: ${BASE_DIR}`) + + // Detect available agents + console.log('\n=== Detecting available agents ===') + const reviewers = detectAvailableReviewers() + + if (reviewers.length === 0) { + console.error('\nNo reviewer agents available. 
Need at least one of: claude, codex') + process.exit(1) + } + + // Detect coding agent + let agentCommand = '' + try { + execSync('which claude', { stdio: 'ignore' }) + agentCommand = 'claude --dangerously-skip-permissions -p' + console.log(` Using coding agent: ${agentCommand}`) + } catch { + console.error('\nClaude CLI not found. Install with: npm install -g @anthropic-ai/claude-code') + process.exit(1) + } + + // Setup + const { parentSha, fixSha, diff } = setupProject() + const evalPath = createEvalFile(parentSha, fixSha, diff) + const targetDir = setupTargetRepo() + + // Run evalbuff + console.log('\n=== Running evalbuff ===') + console.log(` Agent: ${agentCommand}`) + console.log(` Reviewers: ${reviewers.join(', ')}`) + console.log(` Task: fix-subtract-bug`) + console.log('') + + const startTime = Date.now() + + try { + await runEvalbuff({ + repoPath: targetDir, + agentCommand, + evalDataPaths: [evalPath], + maxIterations: 1, + maxCostUsd: 10, + scoreThreshold: 7.0, + agentTimeoutMs: 5 * 60 * 1000, // 5 min for the coding agent + reviewerAgents: reviewers, + }) + } catch (error) { + console.error('\nEvalbuff failed:', error) + } + + const durationMs = Date.now() - startTime + + // Verify results + console.log('\n=== Verifying results ===') + + const logPath = path.join(targetDir, 'evalbuff-log.jsonl') + if (fs.existsSync(logPath)) { + const logContent = fs.readFileSync(logPath, 'utf-8').trim() + if (logContent) { + const entries = logContent.split('\n').map((l) => JSON.parse(l)) + console.log(` Log entries: ${entries.length}`) + for (const entry of entries) { + console.log(` Task: ${entry.taskId}`) + console.log(` Old score: ${entry.oldScore}`) + console.log(` New score: ${entry.newScore ?? 'N/A'}`) + console.log(` Doc edit: ${entry.docEdit ? entry.docEdit.path : 'none'}`) + console.log(` Score comparison: ${entry.scoreComparison ?? 'N/A'}`) + console.log(` Duration: ${(entry.durationMs / 1000).toFixed(1)}s`) + console.log(` Error: ${entry.error ?? 
'none'}`) + } + } else { + console.log(' ✗ Log file is empty') + } + } else { + console.log(' ✗ Log file not found') + } + + // Check morning report + const reportFiles = fs + .readdirSync(targetDir) + .filter((f) => f.startsWith('evalbuff-report-')) + if (reportFiles.length > 0) { + console.log(`\n ✓ Morning report: ${reportFiles[0]}`) + const report = fs.readFileSync( + path.join(targetDir, reportFiles[0]), + 'utf-8', + ) + console.log('\n--- Morning Report ---') + console.log(report) + console.log('--- End Report ---') + } else { + console.log(' ✗ No morning report generated') + } + + // Check docs + const docsDir = path.join(targetDir, 'docs') + if (fs.existsSync(docsDir)) { + const docFiles = execSync(`find ${docsDir} -name '*.md'`, { + encoding: 'utf-8', + }).trim() + if (docFiles) { + console.log(`\n ✓ Docs generated:`) + for (const f of docFiles.split('\n')) { + console.log(` ${f}`) + } + } + } + + // Check state + const statePath = path.join(targetDir, 'evalbuff-state.json') + if (fs.existsSync(statePath)) { + const state = JSON.parse(fs.readFileSync(statePath, 'utf-8')) + console.log(`\n ✓ State: ${state.completedTaskIds.length} completed, $${state.totalCostUsd.toFixed(2)} spent`) + } + + console.log(`\n=== E2E test completed in ${(durationMs / 1000).toFixed(1)}s ===`) + console.log(`Base dir (for inspection): ${BASE_DIR}`) + + // Cleanup prompt + console.log(`\nTo clean up: rm -rf ${BASE_DIR}`) +} + +main().catch((error) => { + console.error('E2E test failed:', error) + process.exit(1) +}) From 78bba566be412e4600245f8409cde381aa55a93e Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 26 Mar 2026 14:38:43 -0700 Subject: [PATCH 10/12] Move evalbuff from evals/ to top-level package Evalbuff is now its own workspace package (@codebuff/evalbuff) instead of a subdirectory of evals. Adds package.json, tsconfig.json, and updates workspace config. All 42 tests pass from the new location. 
Co-Authored-By: Claude Opus 4.6 --- bun.lock | 11 ++++++++++ evalbuff/package.json | 21 +++++++++++++++++++ {evals/evalbuff => evalbuff/src}/README.md | 6 +++--- .../src}/__tests__/cli-runner.test.ts | 0 .../src}/__tests__/criteria.test.ts | 0 .../src}/__tests__/docs-optimizer.test.ts | 0 .../src}/__tests__/e2e.test.ts | 2 +- .../src}/__tests__/loop.integration.test.ts | 0 .../src}/__tests__/morning-report.test.ts | 0 .../evalbuff => evalbuff/src}/agent-runner.ts | 0 .../evalbuff => evalbuff/src}/cli-runner.ts | 0 {evals/evalbuff => evalbuff/src}/criteria.ts | 0 .../src}/docs-optimizer.ts | 0 .../src}/evalbuff-criteria.json | 0 {evals/evalbuff => evalbuff/src}/judge.ts | 0 .../src}/morning-report.ts | 0 .../src}/old/agents/context-agent.ts | 0 .../src}/old/agents/review-agent.ts | 0 .../src}/old/agents/scan-agent.ts | 0 .../src}/old/cli/package.json | 0 .../src}/old/cli/src/commands/context.ts | 0 .../src}/old/cli/src/commands/init.ts | 0 .../src}/old/cli/src/commands/login.ts | 0 .../src}/old/cli/src/commands/logout.ts | 0 .../src}/old/cli/src/commands/review.ts | 0 .../src}/old/cli/src/index.ts | 0 .../src}/old/cli/src/templates/skill.ts | 0 .../src}/old/cli/src/utils/auth.ts | 0 .../src}/old/cli/src/utils/config.ts | 0 .../src}/old/cli/src/utils/git.ts | 0 .../src}/old/cli/src/utils/knowledge.ts | 0 .../src}/old/cli/src/utils/output.ts | 0 .../src}/old/cli/src/utils/project.ts | 0 .../src}/old/cli/tsconfig.json | 0 .../evalbuff => evalbuff/src}/run-e2e-test.ts | 2 +- .../evalbuff => evalbuff/src}/run-evalbuff.ts | 0 .../src}/runners/claude.ts | 0 .../src}/runners/codebuff.ts | 0 .../src}/runners/codex.ts | 0 .../src}/runners/index.ts | 0 .../src}/runners/runner.ts | 0 .../src}/test-repo-utils.ts | 0 {evals/evalbuff => evalbuff/src}/types.ts | 0 evalbuff/tsconfig.json | 14 +++++++++++++ evals/package.json | 2 -- package.json | 1 + 46 files changed, 52 insertions(+), 7 deletions(-) create mode 100644 evalbuff/package.json rename {evals/evalbuff => 
evalbuff/src}/README.md (98%) rename {evals/evalbuff => evalbuff/src}/__tests__/cli-runner.test.ts (100%) rename {evals/evalbuff => evalbuff/src}/__tests__/criteria.test.ts (100%) rename {evals/evalbuff => evalbuff/src}/__tests__/docs-optimizer.test.ts (100%) rename {evals/evalbuff => evalbuff/src}/__tests__/e2e.test.ts (99%) rename {evals/evalbuff => evalbuff/src}/__tests__/loop.integration.test.ts (100%) rename {evals/evalbuff => evalbuff/src}/__tests__/morning-report.test.ts (100%) rename {evals/evalbuff => evalbuff/src}/agent-runner.ts (100%) rename {evals/evalbuff => evalbuff/src}/cli-runner.ts (100%) rename {evals/evalbuff => evalbuff/src}/criteria.ts (100%) rename {evals/evalbuff => evalbuff/src}/docs-optimizer.ts (100%) rename {evals/evalbuff => evalbuff/src}/evalbuff-criteria.json (100%) rename {evals/evalbuff => evalbuff/src}/judge.ts (100%) rename {evals/evalbuff => evalbuff/src}/morning-report.ts (100%) rename {evals/evalbuff => evalbuff/src}/old/agents/context-agent.ts (100%) rename {evals/evalbuff => evalbuff/src}/old/agents/review-agent.ts (100%) rename {evals/evalbuff => evalbuff/src}/old/agents/scan-agent.ts (100%) rename {evals/evalbuff => evalbuff/src}/old/cli/package.json (100%) rename {evals/evalbuff => evalbuff/src}/old/cli/src/commands/context.ts (100%) rename {evals/evalbuff => evalbuff/src}/old/cli/src/commands/init.ts (100%) rename {evals/evalbuff => evalbuff/src}/old/cli/src/commands/login.ts (100%) rename {evals/evalbuff => evalbuff/src}/old/cli/src/commands/logout.ts (100%) rename {evals/evalbuff => evalbuff/src}/old/cli/src/commands/review.ts (100%) rename {evals/evalbuff => evalbuff/src}/old/cli/src/index.ts (100%) rename {evals/evalbuff => evalbuff/src}/old/cli/src/templates/skill.ts (100%) rename {evals/evalbuff => evalbuff/src}/old/cli/src/utils/auth.ts (100%) rename {evals/evalbuff => evalbuff/src}/old/cli/src/utils/config.ts (100%) rename {evals/evalbuff => evalbuff/src}/old/cli/src/utils/git.ts (100%) rename {evals/evalbuff => 
evalbuff/src}/old/cli/src/utils/knowledge.ts (100%) rename {evals/evalbuff => evalbuff/src}/old/cli/src/utils/output.ts (100%) rename {evals/evalbuff => evalbuff/src}/old/cli/src/utils/project.ts (100%) rename {evals/evalbuff => evalbuff/src}/old/cli/tsconfig.json (100%) rename {evals/evalbuff => evalbuff/src}/run-e2e-test.ts (99%) rename {evals/evalbuff => evalbuff/src}/run-evalbuff.ts (100%) rename {evals/evalbuff => evalbuff/src}/runners/claude.ts (100%) rename {evals/evalbuff => evalbuff/src}/runners/codebuff.ts (100%) rename {evals/evalbuff => evalbuff/src}/runners/codex.ts (100%) rename {evals/evalbuff => evalbuff/src}/runners/index.ts (100%) rename {evals/evalbuff => evalbuff/src}/runners/runner.ts (100%) rename {evals/evalbuff => evalbuff/src}/test-repo-utils.ts (100%) rename {evals/evalbuff => evalbuff/src}/types.ts (100%) create mode 100644 evalbuff/tsconfig.json diff --git a/bun.lock b/bun.lock index 00a9d0d549..cb61364991 100644 --- a/bun.lock +++ b/bun.lock @@ -107,6 +107,15 @@ "@types/parse-path": "^7.1.0", }, }, + "evalbuff": { + "name": "@codebuff/evalbuff", + "version": "1.0.0", + "dependencies": { + "@codebuff/common": "workspace:*", + "@codebuff/sdk": "workspace:*", + "zod": "^4.2.1", + }, + }, "evals": { "name": "@codebuff/evals", "version": "1.0.0", @@ -489,6 +498,8 @@ "@codebuff/common": ["@codebuff/common@workspace:common"], + "@codebuff/evalbuff": ["@codebuff/evalbuff@workspace:evalbuff"], + "@codebuff/evals": ["@codebuff/evals@workspace:evals"], "@codebuff/freebuff": ["@codebuff/freebuff@workspace:freebuff"], diff --git a/evalbuff/package.json b/evalbuff/package.json new file mode 100644 index 0000000000..f3374246dd --- /dev/null +++ b/evalbuff/package.json @@ -0,0 +1,21 @@ +{ + "name": "@codebuff/evalbuff", + "version": "1.0.0", + "description": "Automated docs optimization loop: run agent → judge → analyze failures → propose doc edits", + "private": true, + "type": "module", + "scripts": { + "typecheck": "tsc --noEmit -p .", + "test": 
"bun test src/__tests__/criteria.test.ts src/__tests__/docs-optimizer.test.ts src/__tests__/morning-report.test.ts src/__tests__/cli-runner.test.ts && bun test src/__tests__/loop.integration.test.ts && bun test src/__tests__/e2e.test.ts", + "test:unit": "bun test src/__tests__/criteria.test.ts src/__tests__/docs-optimizer.test.ts src/__tests__/morning-report.test.ts src/__tests__/cli-runner.test.ts", + "test:integration": "bun test src/__tests__/loop.integration.test.ts", + "test:e2e": "bun test src/__tests__/e2e.test.ts", + "test:e2e-real": "bun run src/run-e2e-test.ts", + "run": "bun run src/run-evalbuff.ts" + }, + "dependencies": { + "@codebuff/common": "workspace:*", + "@codebuff/sdk": "workspace:*", + "zod": "^4.2.1" + } +} diff --git a/evals/evalbuff/README.md b/evalbuff/src/README.md similarity index 98% rename from evals/evalbuff/README.md rename to evalbuff/src/README.md index df88d41065..130ba48311 100644 --- a/evals/evalbuff/README.md +++ b/evalbuff/src/README.md @@ -82,7 +82,7 @@ This prevents the system from penalizing an agent for style issues when it can't ### Command Line ```bash -bun run evals/evalbuff/run-evalbuff.ts \ +bun run evalbuff/src/run-evalbuff.ts \ --repo /path/to/target-repo \ --agent "claude -p" \ --evals evals/buffbench/eval-codebuff.json,evals/buffbench/eval-manifold.json \ @@ -95,7 +95,7 @@ bun run evals/evalbuff/run-evalbuff.ts \ Or via the workspace script: ```bash -bun run --filter @codebuff/evals run-evalbuff -- \ +bun run --filter @codebuff/evalbuff run -- \ --repo /path/to/target-repo \ --agent "codex exec --full-auto" \ --evals evals/buffbench/eval-codebuff.json @@ -119,7 +119,7 @@ bun run --filter @codebuff/evals run-evalbuff -- \ For an overnight run, set generous limits and let it go: ```bash -nohup bun run evals/evalbuff/run-evalbuff.ts \ +nohup bun run evalbuff/src/run-evalbuff.ts \ --repo /path/to/repo \ --agent "claude -p" \ --evals evals/buffbench/eval-codebuff.json \ diff --git 
a/evals/evalbuff/__tests__/cli-runner.test.ts b/evalbuff/src/__tests__/cli-runner.test.ts similarity index 100% rename from evals/evalbuff/__tests__/cli-runner.test.ts rename to evalbuff/src/__tests__/cli-runner.test.ts diff --git a/evals/evalbuff/__tests__/criteria.test.ts b/evalbuff/src/__tests__/criteria.test.ts similarity index 100% rename from evals/evalbuff/__tests__/criteria.test.ts rename to evalbuff/src/__tests__/criteria.test.ts diff --git a/evals/evalbuff/__tests__/docs-optimizer.test.ts b/evalbuff/src/__tests__/docs-optimizer.test.ts similarity index 100% rename from evals/evalbuff/__tests__/docs-optimizer.test.ts rename to evalbuff/src/__tests__/docs-optimizer.test.ts diff --git a/evals/evalbuff/__tests__/e2e.test.ts b/evalbuff/src/__tests__/e2e.test.ts similarity index 99% rename from evals/evalbuff/__tests__/e2e.test.ts rename to evalbuff/src/__tests__/e2e.test.ts index b64f142520..646559fa39 100644 --- a/evals/evalbuff/__tests__/e2e.test.ts +++ b/evalbuff/src/__tests__/e2e.test.ts @@ -11,7 +11,7 @@ * This test uses mock.module to replace LLM calls but runs the full * orchestrator, CLI runner, and git operations for real. 
* - * Run: bun test evals/evalbuff/__tests__/e2e.test.ts + * Run: bun test evalbuff/src/__tests__/e2e.test.ts */ import { execSync } from 'child_process' import fs from 'fs' diff --git a/evals/evalbuff/__tests__/loop.integration.test.ts b/evalbuff/src/__tests__/loop.integration.test.ts similarity index 100% rename from evals/evalbuff/__tests__/loop.integration.test.ts rename to evalbuff/src/__tests__/loop.integration.test.ts diff --git a/evals/evalbuff/__tests__/morning-report.test.ts b/evalbuff/src/__tests__/morning-report.test.ts similarity index 100% rename from evals/evalbuff/__tests__/morning-report.test.ts rename to evalbuff/src/__tests__/morning-report.test.ts diff --git a/evals/evalbuff/agent-runner.ts b/evalbuff/src/agent-runner.ts similarity index 100% rename from evals/evalbuff/agent-runner.ts rename to evalbuff/src/agent-runner.ts diff --git a/evals/evalbuff/cli-runner.ts b/evalbuff/src/cli-runner.ts similarity index 100% rename from evals/evalbuff/cli-runner.ts rename to evalbuff/src/cli-runner.ts diff --git a/evals/evalbuff/criteria.ts b/evalbuff/src/criteria.ts similarity index 100% rename from evals/evalbuff/criteria.ts rename to evalbuff/src/criteria.ts diff --git a/evals/evalbuff/docs-optimizer.ts b/evalbuff/src/docs-optimizer.ts similarity index 100% rename from evals/evalbuff/docs-optimizer.ts rename to evalbuff/src/docs-optimizer.ts diff --git a/evals/evalbuff/evalbuff-criteria.json b/evalbuff/src/evalbuff-criteria.json similarity index 100% rename from evals/evalbuff/evalbuff-criteria.json rename to evalbuff/src/evalbuff-criteria.json diff --git a/evals/evalbuff/judge.ts b/evalbuff/src/judge.ts similarity index 100% rename from evals/evalbuff/judge.ts rename to evalbuff/src/judge.ts diff --git a/evals/evalbuff/morning-report.ts b/evalbuff/src/morning-report.ts similarity index 100% rename from evals/evalbuff/morning-report.ts rename to evalbuff/src/morning-report.ts diff --git a/evals/evalbuff/old/agents/context-agent.ts 
b/evalbuff/src/old/agents/context-agent.ts similarity index 100% rename from evals/evalbuff/old/agents/context-agent.ts rename to evalbuff/src/old/agents/context-agent.ts diff --git a/evals/evalbuff/old/agents/review-agent.ts b/evalbuff/src/old/agents/review-agent.ts similarity index 100% rename from evals/evalbuff/old/agents/review-agent.ts rename to evalbuff/src/old/agents/review-agent.ts diff --git a/evals/evalbuff/old/agents/scan-agent.ts b/evalbuff/src/old/agents/scan-agent.ts similarity index 100% rename from evals/evalbuff/old/agents/scan-agent.ts rename to evalbuff/src/old/agents/scan-agent.ts diff --git a/evals/evalbuff/old/cli/package.json b/evalbuff/src/old/cli/package.json similarity index 100% rename from evals/evalbuff/old/cli/package.json rename to evalbuff/src/old/cli/package.json diff --git a/evals/evalbuff/old/cli/src/commands/context.ts b/evalbuff/src/old/cli/src/commands/context.ts similarity index 100% rename from evals/evalbuff/old/cli/src/commands/context.ts rename to evalbuff/src/old/cli/src/commands/context.ts diff --git a/evals/evalbuff/old/cli/src/commands/init.ts b/evalbuff/src/old/cli/src/commands/init.ts similarity index 100% rename from evals/evalbuff/old/cli/src/commands/init.ts rename to evalbuff/src/old/cli/src/commands/init.ts diff --git a/evals/evalbuff/old/cli/src/commands/login.ts b/evalbuff/src/old/cli/src/commands/login.ts similarity index 100% rename from evals/evalbuff/old/cli/src/commands/login.ts rename to evalbuff/src/old/cli/src/commands/login.ts diff --git a/evals/evalbuff/old/cli/src/commands/logout.ts b/evalbuff/src/old/cli/src/commands/logout.ts similarity index 100% rename from evals/evalbuff/old/cli/src/commands/logout.ts rename to evalbuff/src/old/cli/src/commands/logout.ts diff --git a/evals/evalbuff/old/cli/src/commands/review.ts b/evalbuff/src/old/cli/src/commands/review.ts similarity index 100% rename from evals/evalbuff/old/cli/src/commands/review.ts rename to evalbuff/src/old/cli/src/commands/review.ts diff 
--git a/evals/evalbuff/old/cli/src/index.ts b/evalbuff/src/old/cli/src/index.ts similarity index 100% rename from evals/evalbuff/old/cli/src/index.ts rename to evalbuff/src/old/cli/src/index.ts diff --git a/evals/evalbuff/old/cli/src/templates/skill.ts b/evalbuff/src/old/cli/src/templates/skill.ts similarity index 100% rename from evals/evalbuff/old/cli/src/templates/skill.ts rename to evalbuff/src/old/cli/src/templates/skill.ts diff --git a/evals/evalbuff/old/cli/src/utils/auth.ts b/evalbuff/src/old/cli/src/utils/auth.ts similarity index 100% rename from evals/evalbuff/old/cli/src/utils/auth.ts rename to evalbuff/src/old/cli/src/utils/auth.ts diff --git a/evals/evalbuff/old/cli/src/utils/config.ts b/evalbuff/src/old/cli/src/utils/config.ts similarity index 100% rename from evals/evalbuff/old/cli/src/utils/config.ts rename to evalbuff/src/old/cli/src/utils/config.ts diff --git a/evals/evalbuff/old/cli/src/utils/git.ts b/evalbuff/src/old/cli/src/utils/git.ts similarity index 100% rename from evals/evalbuff/old/cli/src/utils/git.ts rename to evalbuff/src/old/cli/src/utils/git.ts diff --git a/evals/evalbuff/old/cli/src/utils/knowledge.ts b/evalbuff/src/old/cli/src/utils/knowledge.ts similarity index 100% rename from evals/evalbuff/old/cli/src/utils/knowledge.ts rename to evalbuff/src/old/cli/src/utils/knowledge.ts diff --git a/evals/evalbuff/old/cli/src/utils/output.ts b/evalbuff/src/old/cli/src/utils/output.ts similarity index 100% rename from evals/evalbuff/old/cli/src/utils/output.ts rename to evalbuff/src/old/cli/src/utils/output.ts diff --git a/evals/evalbuff/old/cli/src/utils/project.ts b/evalbuff/src/old/cli/src/utils/project.ts similarity index 100% rename from evals/evalbuff/old/cli/src/utils/project.ts rename to evalbuff/src/old/cli/src/utils/project.ts diff --git a/evals/evalbuff/old/cli/tsconfig.json b/evalbuff/src/old/cli/tsconfig.json similarity index 100% rename from evals/evalbuff/old/cli/tsconfig.json rename to evalbuff/src/old/cli/tsconfig.json diff 
--git a/evals/evalbuff/run-e2e-test.ts b/evalbuff/src/run-e2e-test.ts similarity index 99% rename from evals/evalbuff/run-e2e-test.ts rename to evalbuff/src/run-e2e-test.ts index 433dd22865..252a65664a 100644 --- a/evals/evalbuff/run-e2e-test.ts +++ b/evalbuff/src/run-e2e-test.ts @@ -10,7 +10,7 @@ * - (Optional) `codex` CLI installed with OPENAI_API_KEY set * * Usage: - * bun run evals/evalbuff/run-e2e-test.ts + * bun run evalbuff/src/run-e2e-test.ts */ import { execSync } from 'child_process' import fs from 'fs' diff --git a/evals/evalbuff/run-evalbuff.ts b/evalbuff/src/run-evalbuff.ts similarity index 100% rename from evals/evalbuff/run-evalbuff.ts rename to evalbuff/src/run-evalbuff.ts diff --git a/evals/evalbuff/runners/claude.ts b/evalbuff/src/runners/claude.ts similarity index 100% rename from evals/evalbuff/runners/claude.ts rename to evalbuff/src/runners/claude.ts diff --git a/evals/evalbuff/runners/codebuff.ts b/evalbuff/src/runners/codebuff.ts similarity index 100% rename from evals/evalbuff/runners/codebuff.ts rename to evalbuff/src/runners/codebuff.ts diff --git a/evals/evalbuff/runners/codex.ts b/evalbuff/src/runners/codex.ts similarity index 100% rename from evals/evalbuff/runners/codex.ts rename to evalbuff/src/runners/codex.ts diff --git a/evals/evalbuff/runners/index.ts b/evalbuff/src/runners/index.ts similarity index 100% rename from evals/evalbuff/runners/index.ts rename to evalbuff/src/runners/index.ts diff --git a/evals/evalbuff/runners/runner.ts b/evalbuff/src/runners/runner.ts similarity index 100% rename from evals/evalbuff/runners/runner.ts rename to evalbuff/src/runners/runner.ts diff --git a/evals/evalbuff/test-repo-utils.ts b/evalbuff/src/test-repo-utils.ts similarity index 100% rename from evals/evalbuff/test-repo-utils.ts rename to evalbuff/src/test-repo-utils.ts diff --git a/evals/evalbuff/types.ts b/evalbuff/src/types.ts similarity index 100% rename from evals/evalbuff/types.ts rename to evalbuff/src/types.ts diff --git 
a/evalbuff/tsconfig.json b/evalbuff/tsconfig.json new file mode 100644 index 0000000000..fcd93ea3e0 --- /dev/null +++ b/evalbuff/tsconfig.json @@ -0,0 +1,14 @@ +{ + "extends": "../tsconfig.base.json", + "compilerOptions": { + "types": ["bun", "node"], + "baseUrl": ".", + "skipLibCheck": true, + "paths": { + "@codebuff/sdk": ["../sdk/src/index.ts"], + "@codebuff/sdk/*": ["../sdk/src/*"] + } + }, + "include": ["src/**/*.ts"], + "exclude": ["node_modules"] +} diff --git a/evals/package.json b/evals/package.json index f335804ebc..c27555a957 100644 --- a/evals/package.json +++ b/evals/package.json @@ -23,8 +23,6 @@ "run-eval-set": "bun run git-evals/run-eval-set.ts", "run-buffbench": "bun run buffbench/main.ts", "run-buffbench-nightly": "bun run buffbench/main-nightly.ts", - "run-evalbuff": "bun run evalbuff/run-evalbuff.ts", - "test:evalbuff": "bun test evalbuff/__tests__/criteria.test.ts evalbuff/__tests__/docs-optimizer.test.ts evalbuff/__tests__/morning-report.test.ts evalbuff/__tests__/cli-runner.test.ts && bun test evalbuff/__tests__/loop.integration.test.ts && bun test evalbuff/__tests__/e2e.test.ts", "trigger-buffbench": "bun run scripts/trigger-buffbench.ts", "setup-codebuff-repo": "bun run setup-codebuff-repo.ts" }, diff --git a/package.json b/package.json index ad1d8002dc..ef4f2ea967 100644 --- a/package.json +++ b/package.json @@ -13,6 +13,7 @@ "packages/*", "scripts", "evals", + "evalbuff", "sdk", "agents", "cli" From 76f18c08e1d14bc5271acfdad5cae47c4eb5f921 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 26 Mar 2026 14:47:45 -0700 Subject: [PATCH 11/12] evalbuff: consolidate old code and planning docs into evalbuff/old/ Co-Authored-By: Claude Opus 4.6 --- evalbuff/{ => old}/BRAINSTORM.md | 0 evalbuff/{ => old}/PHASE-1-SPEC.md | 0 evalbuff/{ => old}/README.md | 0 evalbuff/{src => }/old/agents/context-agent.ts | 0 evalbuff/{src => }/old/agents/review-agent.ts | 0 evalbuff/{src => }/old/agents/scan-agent.ts | 0 evalbuff/{src => }/old/cli/package.json 
| 0 evalbuff/{src => }/old/cli/src/commands/context.ts | 0 evalbuff/{src => }/old/cli/src/commands/init.ts | 0 evalbuff/{src => }/old/cli/src/commands/login.ts | 0 evalbuff/{src => }/old/cli/src/commands/logout.ts | 0 evalbuff/{src => }/old/cli/src/commands/review.ts | 0 evalbuff/{src => }/old/cli/src/index.ts | 0 evalbuff/{src => }/old/cli/src/templates/skill.ts | 0 evalbuff/{src => }/old/cli/src/utils/auth.ts | 0 evalbuff/{src => }/old/cli/src/utils/config.ts | 0 evalbuff/{src => }/old/cli/src/utils/git.ts | 0 evalbuff/{src => }/old/cli/src/utils/knowledge.ts | 0 evalbuff/{src => }/old/cli/src/utils/output.ts | 0 evalbuff/{src => }/old/cli/src/utils/project.ts | 0 evalbuff/{src => }/old/cli/tsconfig.json | 0 21 files changed, 0 insertions(+), 0 deletions(-) rename evalbuff/{ => old}/BRAINSTORM.md (100%) rename evalbuff/{ => old}/PHASE-1-SPEC.md (100%) rename evalbuff/{ => old}/README.md (100%) rename evalbuff/{src => }/old/agents/context-agent.ts (100%) rename evalbuff/{src => }/old/agents/review-agent.ts (100%) rename evalbuff/{src => }/old/agents/scan-agent.ts (100%) rename evalbuff/{src => }/old/cli/package.json (100%) rename evalbuff/{src => }/old/cli/src/commands/context.ts (100%) rename evalbuff/{src => }/old/cli/src/commands/init.ts (100%) rename evalbuff/{src => }/old/cli/src/commands/login.ts (100%) rename evalbuff/{src => }/old/cli/src/commands/logout.ts (100%) rename evalbuff/{src => }/old/cli/src/commands/review.ts (100%) rename evalbuff/{src => }/old/cli/src/index.ts (100%) rename evalbuff/{src => }/old/cli/src/templates/skill.ts (100%) rename evalbuff/{src => }/old/cli/src/utils/auth.ts (100%) rename evalbuff/{src => }/old/cli/src/utils/config.ts (100%) rename evalbuff/{src => }/old/cli/src/utils/git.ts (100%) rename evalbuff/{src => }/old/cli/src/utils/knowledge.ts (100%) rename evalbuff/{src => }/old/cli/src/utils/output.ts (100%) rename evalbuff/{src => }/old/cli/src/utils/project.ts (100%) rename evalbuff/{src => }/old/cli/tsconfig.json (100%) 
diff --git a/evalbuff/BRAINSTORM.md b/evalbuff/old/BRAINSTORM.md similarity index 100% rename from evalbuff/BRAINSTORM.md rename to evalbuff/old/BRAINSTORM.md diff --git a/evalbuff/PHASE-1-SPEC.md b/evalbuff/old/PHASE-1-SPEC.md similarity index 100% rename from evalbuff/PHASE-1-SPEC.md rename to evalbuff/old/PHASE-1-SPEC.md diff --git a/evalbuff/README.md b/evalbuff/old/README.md similarity index 100% rename from evalbuff/README.md rename to evalbuff/old/README.md diff --git a/evalbuff/src/old/agents/context-agent.ts b/evalbuff/old/agents/context-agent.ts similarity index 100% rename from evalbuff/src/old/agents/context-agent.ts rename to evalbuff/old/agents/context-agent.ts diff --git a/evalbuff/src/old/agents/review-agent.ts b/evalbuff/old/agents/review-agent.ts similarity index 100% rename from evalbuff/src/old/agents/review-agent.ts rename to evalbuff/old/agents/review-agent.ts diff --git a/evalbuff/src/old/agents/scan-agent.ts b/evalbuff/old/agents/scan-agent.ts similarity index 100% rename from evalbuff/src/old/agents/scan-agent.ts rename to evalbuff/old/agents/scan-agent.ts diff --git a/evalbuff/src/old/cli/package.json b/evalbuff/old/cli/package.json similarity index 100% rename from evalbuff/src/old/cli/package.json rename to evalbuff/old/cli/package.json diff --git a/evalbuff/src/old/cli/src/commands/context.ts b/evalbuff/old/cli/src/commands/context.ts similarity index 100% rename from evalbuff/src/old/cli/src/commands/context.ts rename to evalbuff/old/cli/src/commands/context.ts diff --git a/evalbuff/src/old/cli/src/commands/init.ts b/evalbuff/old/cli/src/commands/init.ts similarity index 100% rename from evalbuff/src/old/cli/src/commands/init.ts rename to evalbuff/old/cli/src/commands/init.ts diff --git a/evalbuff/src/old/cli/src/commands/login.ts b/evalbuff/old/cli/src/commands/login.ts similarity index 100% rename from evalbuff/src/old/cli/src/commands/login.ts rename to evalbuff/old/cli/src/commands/login.ts diff --git 
a/evalbuff/src/old/cli/src/commands/logout.ts b/evalbuff/old/cli/src/commands/logout.ts similarity index 100% rename from evalbuff/src/old/cli/src/commands/logout.ts rename to evalbuff/old/cli/src/commands/logout.ts diff --git a/evalbuff/src/old/cli/src/commands/review.ts b/evalbuff/old/cli/src/commands/review.ts similarity index 100% rename from evalbuff/src/old/cli/src/commands/review.ts rename to evalbuff/old/cli/src/commands/review.ts diff --git a/evalbuff/src/old/cli/src/index.ts b/evalbuff/old/cli/src/index.ts similarity index 100% rename from evalbuff/src/old/cli/src/index.ts rename to evalbuff/old/cli/src/index.ts diff --git a/evalbuff/src/old/cli/src/templates/skill.ts b/evalbuff/old/cli/src/templates/skill.ts similarity index 100% rename from evalbuff/src/old/cli/src/templates/skill.ts rename to evalbuff/old/cli/src/templates/skill.ts diff --git a/evalbuff/src/old/cli/src/utils/auth.ts b/evalbuff/old/cli/src/utils/auth.ts similarity index 100% rename from evalbuff/src/old/cli/src/utils/auth.ts rename to evalbuff/old/cli/src/utils/auth.ts diff --git a/evalbuff/src/old/cli/src/utils/config.ts b/evalbuff/old/cli/src/utils/config.ts similarity index 100% rename from evalbuff/src/old/cli/src/utils/config.ts rename to evalbuff/old/cli/src/utils/config.ts diff --git a/evalbuff/src/old/cli/src/utils/git.ts b/evalbuff/old/cli/src/utils/git.ts similarity index 100% rename from evalbuff/src/old/cli/src/utils/git.ts rename to evalbuff/old/cli/src/utils/git.ts diff --git a/evalbuff/src/old/cli/src/utils/knowledge.ts b/evalbuff/old/cli/src/utils/knowledge.ts similarity index 100% rename from evalbuff/src/old/cli/src/utils/knowledge.ts rename to evalbuff/old/cli/src/utils/knowledge.ts diff --git a/evalbuff/src/old/cli/src/utils/output.ts b/evalbuff/old/cli/src/utils/output.ts similarity index 100% rename from evalbuff/src/old/cli/src/utils/output.ts rename to evalbuff/old/cli/src/utils/output.ts diff --git a/evalbuff/src/old/cli/src/utils/project.ts 
b/evalbuff/old/cli/src/utils/project.ts similarity index 100% rename from evalbuff/src/old/cli/src/utils/project.ts rename to evalbuff/old/cli/src/utils/project.ts diff --git a/evalbuff/src/old/cli/tsconfig.json b/evalbuff/old/cli/tsconfig.json similarity index 100% rename from evalbuff/src/old/cli/tsconfig.json rename to evalbuff/old/cli/tsconfig.json From 3242bb24a46d22556025627b6e1772b331903d50 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 26 Mar 2026 14:49:11 -0700 Subject: [PATCH 12/12] evalbuff: move README to package root Co-Authored-By: Claude Opus 4.6 --- evalbuff/{src => }/README.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename evalbuff/{src => }/README.md (100%) diff --git a/evalbuff/src/README.md b/evalbuff/README.md similarity index 100% rename from evalbuff/src/README.md rename to evalbuff/README.md