diff --git a/AGENTS.md b/AGENTS.md index 7c87e6110..ca06ab44c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -42,3 +42,4 @@ Make an efficient learning agent that can do anything. - [`docs/environment-variables.md`](docs/environment-variables.md) — Env var rules, DI helpers, loading order - [`docs/agents-and-tools.md`](docs/agents-and-tools.md) — Agent system, shell shims, tool definitions - [`docs/patterns/handle-steps-generators.md`](docs/patterns/handle-steps-generators.md) — handleSteps generator patterns and spawn_agents tool calls +- [docs/evalbuff/interpreting-task-prompts.md](docs/evalbuff/interpreting-task-prompts.md) diff --git a/docs/evalbuff/interpreting-task-prompts.md b/docs/evalbuff/interpreting-task-prompts.md new file mode 100644 index 000000000..0ccaf4e9e --- /dev/null +++ b/docs/evalbuff/interpreting-task-prompts.md @@ -0,0 +1,63 @@ +# Interpreting Task Prompts (Especially Eval-Generated Ones) + +When working with task prompts, especially those auto-generated from commit history for evaluation purposes, the prompt text may not accurately describe the actual work needed. + +## The Problem + +Evalbuff generates task prompts by analyzing commits. Sometimes the prompt will say "create documentation about X" when the actual ground truth is "fix test scripts in package.json and CI workflow files." This happens when: + +1. The commit message is misleading (e.g., "Simplify AGENTS.md" when it actually removes test scripts) +2. The prompt generator focuses on visible file additions rather than the semantic meaning of the change +3. The task is stated in terms of what a developer might ASK for, not what they actually need + +## Solution: Always Check Ground Truth First + +Before implementing ANY task: + +1. **Check if there's a ground truth diff available** - look for references to expected changes, test files, or "what should have been done" +2. 
**Examine file paths and extensions in the ground truth**: + - `.json` files (especially `package.json`) → likely config/dependency changes + - `.yml`/`.yaml` files in `.github/workflows/` → CI/CD configuration changes + - `.md` files → documentation (but could also be removing or editing existing docs) + - `.ts`/`.js` files → code changes +3. **Read the actual diff content, not just the prompt** - the diff shows EXACTLY what changed +4. **Distinguish between creation vs. modification**: + - Does the ground truth show `new file mode` or additions to existing files? + - Is this refactoring, removal, or net-new functionality? + +## Example: The AGENTS.md Confusion + +Prompt said: +> "Can you create an AGENTS.md file at the root that provides an overview..." + +Ground truth showed: +```diff +--- a/.agents/package.json ++++ b/.agents/package.json +- "test:e2e": "bun test e2e" +--- a/.github/workflows/nightly-e2e.yml ++++ b/.github/workflows/nightly-e2e.yml +- run: cd .agents && bun run test:e2e ++ run: cd agents && bun run test:e2e +``` + +The actual task was about: +- Removing a test script from package.json +- Fixing directory references in a CI workflow +- NOT about creating documentation + +The agent should have recognized the ground truth shows `.json` and `.yml` config files, not `.md` documentation files. + +## When In Doubt + +If the prompt seems to conflict with file paths/types in the ground truth: +1. Trust the ground truth diff over the prompt text +2. Read the actual file contents being changed +3. Understand the PURPOSE of the change (fixing tests, updating config, refactoring) before implementing +4. 
Ask clarifying questions if the task is genuinely ambiguous + +## Red Flags + +- Prompt says "create docs" but ground truth shows only config file changes → likely NOT a docs task +- Prompt says "add feature X" but ground truth removes code → likely a cleanup/refactor task +- Prompt uses vague language ("simplify", "improve") → read the diff to understand the specific technical change \ No newline at end of file diff --git a/evalbuff/README.md b/evalbuff/README.md index 130ba4831..518fbce6c 100644 --- a/evalbuff/README.md +++ b/evalbuff/README.md @@ -1,104 +1,80 @@ # Evalbuff -Evalbuff is an automated system that iteratively improves a coding agent's performance by optimizing project documentation. It runs overnight, discovers what an agent gets wrong, writes docs to fix those gaps, and keeps only the changes that measurably improve scores. +Evalbuff improves a coding agent's performance by iteratively optimizing project documentation. It watches an agent fail, writes docs to fix the pattern, and keeps only the changes that measurably help. -## The Idea +## Two Modes -Most coding agents read project documentation before making changes. Better docs lead to better code. But writing good docs is hard — you don't know what an agent needs to know until you watch it fail. +### 1. Commit Learning Mode (default) -Evalbuff closes this loop automatically: +Walks through your repo's git history commit-by-commit, using each commit as a learning opportunity: -1. **Run** a coding agent on real eval tasks (reconstructing git commits) -2. **Judge** the output with AI judges that apply living quality criteria -3. **Analyze** failures — feed the judge's weaknesses to a doc-writer agent -4. **Test** whether a proposed doc edit actually improves the agent's score -5. **Keep** doc changes that help, revert ones that don't -6. **Repeat** until the budget runs out or scores plateau +1. Start at HEAD~500 (configurable) and process commits one at a time, oldest first +2. 
For each commit, craft a human-like prompt that vaguely describes the change (via LLM) +3. Run N agents in parallel (default 5) on that prompt against the parent commit +4. Judge all runs — using the actual commit diff as ground truth +5. Always analyze failures and propose doc changes (ensuring they're generic enough to help future tasks, not just this one) +6. Re-run N agents with the proposed docs +7. If scores improve, keep the docs and try to propose more improvements +8. If scores don't improve, reject the docs and move to the next commit +9. State is saved after each commit — resume at any time -The result: a `docs/` directory and `AGENTS.md` table of contents that encode exactly what the agent needs to know to perform well on your codebase. Any agent that reads project docs benefits — Claude Code, Codex, Codebuff, or anything else with a CLI. +The result: a `docs/` directory that encodes patterns the agent needs to know, learned from real historical changes. -## Why Documentation? +### 2. Prompt Mode -We chose documentation as the improvement lever because: +Run a specific coding prompt and improve docs for it — no git history needed: -- **Agent-agnostic.** Every modern coding agent reads project docs. Improving docs improves all agents, not just one. -- **Interpretable.** Unlike fine-tuning weights or tweaking system prompts, docs are human-readable. You can review what evalbuff learned and decide if it makes sense. -- **Composable.** Doc improvements stack. A doc about error handling patterns doesn't conflict with a doc about naming conventions. -- **Persistent.** Docs live in the repo and benefit every future session, not just the current one. +1. Given a prompt describing a coding task +2. Run N agents in parallel on the prompt against the current HEAD +3. Judge all runs — no ground truth, relies entirely on e2e testing by the judge +4. Analyze and propose doc changes +5. 
Re-run and keep/reject as with learn mode -## Living Quality Criteria - -Evalbuff uses a leveling system so it doesn't try to optimize everything at once: +Useful for targeted doc improvement around known pain points. -| Level | Criteria Added | When | -|-------|---------------|------| -| L1 | Correctness, Completeness, Basic Style | Start | -| L2 | + Pattern Consistency | After L1 avg >= 8.0 over 10 tasks | -| L3 | + Test Quality | After L2 avg >= 8.0 over 10 tasks | -| L4 | + Optimal Design | After L3 avg >= 8.0 over 10 tasks | -| L5 | + Fluency | After L4 avg >= 8.0 over 10 tasks | - -This prevents the system from penalizing an agent for style issues when it can't even get the code to compile. Criteria are injected directly into the AI judge prompts. - -## Architecture +## How It Works ``` -┌─────────────────────────────────────────────────────┐ -│ Orchestrator │ -│ (run-evalbuff.ts) │ -│ │ -│ for each eval task: │ -│ 1. Clone repo into isolated temp dir │ -│ 2. Copy current docs/ into the clone │ -│ 3. Run agent CLI on the task prompt │ -│ 4. Judge the diff against ground truth │ -│ 5. If score < threshold: │ -│ a. Analyze failure → propose doc edit │ -│ b. Re-run agent with new doc │ -│ c. Re-judge → keep doc if score improved │ -│ 6. Update criteria level if scores are high │ -│ 7. Log entry to JSONL, save state │ -│ │ -│ Generate morning report │ -└─────────────────────────────────────────────────────┘ +for each task (commit or prompt): + ┌─────────────────────────────────────────────────────┐ + │ 1. Run N agents in parallel (baseline) │ + │ 2. Judge all N runs → average score │ + │ 3. Analyze worst run → propose generic doc │ + │ 4. Apply doc to repo │ + │ 5. Re-run N agents with new doc │ + │ 6. Score improved? Keep doc, try more improvements │ + │ Score same/worse? 
Reject doc, next task │ + └─────────────────────────────────────────────────────┘ ``` -### Components - -| File | Role | -|------|------| -| `run-evalbuff.ts` | Main orchestrator loop with budget caps and resumable state | -| `cli-runner.ts` | Agent-agnostic CLI runner — spawns any agent command, captures git diff | -| `judge.ts` | AI judging system (GPT-5.1 + Gemini) with criteria injection | -| `docs-optimizer.ts` | Failure analysis, doc writing, doc application, score comparison | -| `criteria.ts` | Living quality criteria with L1-L5 promotion logic | -| `morning-report.ts` | Generates markdown summary from overnight JSONL log | -| `test-repo-utils.ts` | Creates isolated git repos per eval task | -| `agent-runner.ts` | BuffBench-style agent runner (for Codebuff SDK agents) | -| `types.ts` | Shared types (EvalCommitV2, EvalDataV2, etc.) | +Key design decisions: +- **Low-cost agent** (`codebuff --agent base2-free` by default) — runs many times cheaply +- **N parallel runs** for statistical significance — one run is noisy, five gives a decent signal +- **Always analyze** — no score threshold; every task is a learning opportunity +- **Generic docs only** — the doc writer is instructed to skip task-specific advice and focus on patterns +- **Iterative improvement** — keeps proposing docs until one is rejected, then moves on ## Usage -### Command Line +### Commit Learning Mode ```bash bun run evalbuff/src/run-evalbuff.ts \ --repo /path/to/target-repo \ - --agent "claude -p" \ - --evals evals/buffbench/eval-codebuff.json,evals/buffbench/eval-manifold.json \ - --max-iterations 50 \ - --max-cost 50 \ - --score-threshold 7.0 \ - --agent-timeout 300000 + --agent "codebuff --agent base2-free" \ + --commits 500 \ + --parallelism 5 \ + --max-cost 100 ``` -Or via the workspace script: +### Prompt Mode ```bash -bun run --filter @codebuff/evalbuff run -- \ +bun run evalbuff/src/run-evalbuff.ts \ --repo /path/to/target-repo \ - --agent "codex exec --full-auto" \ - --evals 
evals/buffbench/eval-codebuff.json + --agent "codebuff --agent base2-free" \ + --prompt "Add a dark mode toggle to the settings page" \ + --parallelism 5 ``` ### Arguments @@ -106,109 +82,70 @@ bun run --filter @codebuff/evalbuff run -- \ | Argument | Default | Description | |----------|---------|-------------| | `--repo` | required | Path to the target repo where docs/ will be written | -| `--agent` | required | Agent CLI command (prompt is appended as last arg) | -| `--evals` | required | Comma-separated paths to eval JSON files | -| `--max-iterations` | 50 | Stop after this many tasks | -| `--max-cost` | 50 | Stop after spending this many USD (estimated) | -| `--score-threshold` | 7.0 | Only attempt doc edits for scores below this | -| `--agent-timeout` | 300000 | Per-task agent timeout in ms (5 min default) | +| `--agent` | `codebuff --agent base2-free` | Agent CLI command (prompt appended as last arg) | +| `--prompt` | — | If set, runs in prompt mode instead of learn mode | +| `--commits` | 500 | How many commits back to start from (learn mode) | +| `--parallelism` | 5 | Number of agents to run in parallel per task | +| `--max-cost` | 100 | Stop after spending this many USD (estimated) | +| `--agent-timeout` | 300000 | Per-agent timeout in ms (5 min default) | +| `--init-command` | — | Command to run in each test repo (e.g., `npm install`) | | `--criteria` | auto | Path to criteria JSON (auto-created if omitted) | +| `--reviewers` | `claude,codex` | Comma-separated reviewer agent types | -### Overnight Run +### Resuming -For an overnight run, set generous limits and let it go: +State is saved to `evalbuff-state.json` in the target repo after each commit. Re-running with the same `--repo` automatically resumes from where it left off — it knows which commit was last processed and continues from there. 
+ +### Overnight Run ```bash nohup bun run evalbuff/src/run-evalbuff.ts \ --repo /path/to/repo \ - --agent "claude -p" \ - --evals evals/buffbench/eval-codebuff.json \ - --max-iterations 200 \ - --max-cost 100 \ + --commits 500 \ + --parallelism 5 \ + --max-cost 200 \ > evalbuff-overnight.log 2>&1 & ``` -Check results in the morning: -- `/evalbuff-report-YYYY-MM-DD.md` — morning report -- `/evalbuff-log.jsonl` — detailed per-task log -- `/docs/` — the docs that were kept -- `/AGENTS.md` — table of contents - -### Resumable - -Evalbuff saves state to `evalbuff-state.json` in the target repo. If interrupted, re-running with the same arguments will skip completed tasks and continue where it left off. - -## How It Decides What Docs to Write - -When an agent scores below the threshold on a task, evalbuff: - -1. **Feeds the judge's weaknesses** to a doc-writer LLM agent -2. The doc writer sees: the task prompt, ground truth diff, agent's diff, judge analysis, and all current docs -3. It produces a **targeted doc file** — specific to the gap between what the agent did and what it should have done -4. The doc is written to `docs/.md` and `AGENTS.md` is updated - -The doc writer is instructed to be specific and actionable — referencing concrete file paths, function names, and patterns. Generic advice like "follow best practices" is explicitly rejected. 
- ## What Gets Produced -After a run, the target repo will contain: - ``` target-repo/ -├── docs/ +├── docs/ # Generated documentation │ ├── patterns/ -│ │ └── error-handling.md # Evalbuff-generated +│ │ └── error-handling.md │ ├── conventions/ -│ │ └── naming.md # Evalbuff-generated +│ │ └── naming.md │ └── architecture/ -│ └── data-flow.md # Evalbuff-generated -├── AGENTS.md # Table of contents -├── evalbuff-state.json # Resumable state -├── evalbuff-log.jsonl # Per-task log -├── evalbuff-criteria.json # Current criteria level -└── evalbuff-report-2026-03-25.md # Morning report +│ └── data-flow.md +├── AGENTS.md # Table of contents +├── evalbuff-state.json # Resumable state (last commit SHA) +├── evalbuff-log.jsonl # Per-task log +├── evalbuff-criteria.json # Current criteria level +└── evalbuff-report-2026-03-26.md # Report ``` -### Morning Report - -The morning report includes: -- Summary table (iterations, cost, duration, score deltas) -- Doc changes table (which docs were tried, score impact, kept/reverted) -- Error log -- Score trajectory visualization - -## Eval Data Format - -Evalbuff reuses BuffBench's `EvalDataV2` format. Eval tasks are real git commits from open source repos, turned into prompts: - -```json -{ - "repoUrl": "https://github.com/org/repo", - "evalCommits": [ - { - "id": "task-abc123", - "sha": "abc123", - "parentSha": "def456", - "prompt": "Add error handling to the API endpoint...", - "fileDiffs": [{ "path": "src/api.ts", "diff": "..." }], - "supplementalFiles": ["src/types.ts"] - } - ] -} -``` - -Generate new evals with BuffBench's eval generation tools, then point evalbuff at the JSON files. +## Living Quality Criteria -## Relationship to BuffBench +Judges use a leveling system to avoid over-optimizing prematurely: -BuffBench benchmarks agents against each other. Evalbuff improves a single agent's performance over time. 
+| Level | Criteria Added | Promotion | +|-------|---------------|-----------| +| L1 | Builds, tests pass, basic completeness | Start | +| L2 | + Feature works E2E, logs clean | After L1 avg >= 8.0 over 10 tasks | +| L3 | + Edge cases, UI verification | After L2 avg >= 8.0 | +| L4 | + Cross-component integration, performance | After L3 avg >= 8.0 | +| L5 | + Production readiness | After L4 avg >= 8.0 | -| | BuffBench | Evalbuff | -|---|-----------|----------| -| **Goal** | Compare agents | Improve an agent | -| **Output** | Scores + rankings | Documentation | -| **Loop** | Single pass | Iterative | -| **Judges** | 3 (GPT, Gemini, Claude) | 2 (GPT, Gemini) | -| **Agent coupling** | Codebuff SDK | Any CLI agent | +## Architecture -Evalbuff was deep-copied from BuffBench and modified — they share types and eval data format but are independent codebases. +| File | Role | +|------|------| +| `run-evalbuff.ts` | Main orchestrator — learn mode + prompt mode | +| `commit-task-generator.ts` | Extract tasks from git history, generate prompts from commits | +| `cli-runner.ts` | Agent-agnostic CLI runner — spawns any agent, captures diff | +| `judge.ts` | AI judging with/without ground truth, multi-reviewer aggregation | +| `docs-optimizer.ts` | Failure analysis, generic doc writing, doc application/revert | +| `criteria.ts` | Living quality criteria with L1-L5 promotion | +| `morning-report.ts` | Report generation from JSONL log | +| `test-repo-utils.ts` | Isolated git repo lifecycle management | diff --git a/evalbuff/src/__tests__/e2e.test.ts b/evalbuff/src/__tests__/e2e.test.ts index 646559fa3..abc317e99 100644 --- a/evalbuff/src/__tests__/e2e.test.ts +++ b/evalbuff/src/__tests__/e2e.test.ts @@ -1,16 +1,13 @@ /** * E2E test for evalbuff. * - * This test runs the full evalbuff loop with a real (mock) agent on a local - * git repo with synthetic eval tasks. It verifies: + * This test runs the full evalbuff loop with mocked LLM calls but real + * orchestration. 
It verifies: * - The morning report is generated * - Log entries are written - * - State file tracks completed tasks + * - State file tracks processed commits * - Doc edits are committed to the repo when they improve scores * - * This test uses mock.module to replace LLM calls but runs the full - * orchestrator, CLI runner, and git operations for real. - * * Run: bun test evalbuff/src/__tests__/e2e.test.ts */ import { execSync } from 'child_process' @@ -22,7 +19,6 @@ import { afterAll, beforeAll, describe, expect, it, mock } from 'bun:test' import type { JudgingResult } from '../judge' import type { DocSuggestion } from '../docs-optimizer' -import type { EvalDataV2 } from '../types' // --- Mocks for LLM calls only --- @@ -30,7 +26,6 @@ let judgeCallCount = 0 mock.module('../test-repo-utils', () => ({ withTestRepo: async (_config: any, fn: (cwd: string) => Promise) => { - // Create a real local git repo for each call const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-e2e-repo-')) execSync('git init && git add . 
&& git commit --allow-empty -m "init"', { cwd: dir, @@ -45,9 +40,19 @@ mock.module('../test-repo-utils', () => ({ }, })) +mock.module('../cli-runner', () => ({ + runCliAgent: async () => ({ + diff: 'mock diff content', + durationMs: 1000, + exitCode: 0, + stdout: 'mock stdout', + stderr: '', + }), +})) + // Judge returns alternating scores: low (triggers doc edit), then higher (confirms improvement) mock.module('../judge', () => ({ - judgeCommitResult: async () => { + judgeTaskResult: async () => { const scores = [3.0, 6.0, 8.5, 5.0, 7.0, 9.0] const score = scores[judgeCallCount % scores.length] judgeCallCount++ @@ -72,87 +77,40 @@ mock.module('../docs-optimizer', () => ({ reasoning: 'Agent consistently misses error handling patterns in async code', suggestedDocPath: 'patterns/async-error-handling.md', suggestedContent: - '# Async Error Handling\n\nAll async functions should use try/catch blocks.\nPropagate errors with meaningful messages.\n\n## Examples\n\n```ts\nasync function fetchData() {\n try {\n const result = await api.get("/data")\n return result\n } catch (error) {\n throw new Error(`Failed to fetch data: ${error.message}`)\n }\n}\n```\n', + '# Async Error Handling\n\nAll async functions should use try/catch blocks.\nPropagate errors with meaningful messages.\n', }) satisfies DocSuggestion, })) -mock.module('@codebuff/sdk', () => ({ - CodebuffClient: class { - constructor() {} - }, +// Mock commit-task-generator +mock.module('../commit-task-generator', () => ({ + getCommitList: () => ['sha-1', 'sha-2', 'sha-3'], + buildCommitTask: async (_repoPath: string, sha: string) => ({ + sha, + parentSha: `parent-${sha}`, + message: `Commit ${sha}`, + prompt: `Do the thing for ${sha}`, + diff: `mock diff for ${sha}`, + filesChanged: ['src/file.ts'], + }), })) -const { runEvalbuff } = await import('../run-evalbuff') +const { runLearnMode } = await import('../run-evalbuff') // --- Test setup --- let repoDir: string -let evalFilePath: string beforeAll(() => { - // 
Create a "target repo" where docs will be written repoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-e2e-target-')) execSync('git init && git add . && git commit --allow-empty -m "init"', { cwd: repoDir, stdio: 'ignore', env: { ...process.env, GIT_AUTHOR_NAME: 'test', GIT_AUTHOR_EMAIL: 'test@test.com', GIT_COMMITTER_NAME: 'test', GIT_COMMITTER_EMAIL: 'test@test.com' }, }) - - // Create eval file with 3 tasks - const evalData: EvalDataV2 = { - repoUrl: 'https://github.com/test/repo', - generationDate: '2026-03-25', - evalCommits: [ - { - id: 'e2e-task-1', - sha: 'aaa111', - parentSha: 'aaa000', - spec: 'Add error handling to fetchData', - prompt: 'Add try/catch error handling to the fetchData function in src/api.ts', - supplementalFiles: [], - fileDiffs: [ - { - path: 'src/api.ts', - status: 'modified', - diff: '@@ -5,3 +5,7 @@\n-const data = await fetch(url)\n+try {\n+ const data = await fetch(url)\n+} catch (e) {\n+ throw new Error(`Fetch failed: ${e.message}`)\n+}', - }, - ], - }, - { - id: 'e2e-task-2', - sha: 'bbb222', - parentSha: 'bbb000', - spec: 'Add input validation', - prompt: 'Add input validation to the createUser endpoint', - supplementalFiles: [], - fileDiffs: [ - { - path: 'src/routes/users.ts', - status: 'modified', - diff: '@@ -1 +1,5 @@\n+if (!name || !email) {\n+ throw new Error("name and email required")\n+}', - }, - ], - }, - { - id: 'e2e-task-3', - sha: 'ccc333', - parentSha: 'ccc000', - spec: 'Refactor logger', - prompt: 'Refactor the logger to use structured JSON output', - supplementalFiles: [], - fileDiffs: [ - { - path: 'src/logger.ts', - status: 'modified', - diff: '@@ -1 +1,3 @@\n-console.log(msg)\n+const entry = { timestamp: Date.now(), message: msg }\n+process.stdout.write(JSON.stringify(entry) + "\\n")', - }, - ], - }, - ], - } - - evalFilePath = path.join(repoDir, 'eval-e2e.json') - fs.writeFileSync(evalFilePath, JSON.stringify(evalData)) + execSync('git remote add origin https://github.com/test/repo', { + cwd: repoDir, + 
stdio: 'ignore', + }) judgeCallCount = 0 }) @@ -164,15 +122,15 @@ afterAll(() => { // --- E2E tests --- describe('evalbuff E2E', () => { - it('runs full loop: agent, judge, doc edit, morning report', async () => { - await runEvalbuff({ + it('runs full learn loop: processes commits, improves docs, generates report', async () => { + await runLearnMode({ + mode: 'learn', repoPath: repoDir, - agentCommand: 'echo', // echo just prints the prompt and exits - evalDataPaths: [evalFilePath], - maxIterations: 3, + agentCommand: 'echo', + parallelism: 1, maxCostUsd: 50, - scoreThreshold: 7.0, agentTimeoutMs: 10_000, + commitCount: 500, }) // 1. Morning report exists @@ -185,27 +143,23 @@ describe('evalbuff E2E', () => { 'utf-8', ) expect(report).toContain('# Evalbuff Morning Report') - expect(report).toContain('Iterations | 3') - // 2. Log has 3 entries + // 2. Log has entries const logPath = path.join(repoDir, 'evalbuff-log.jsonl') expect(fs.existsSync(logPath)).toBe(true) const logLines = fs .readFileSync(logPath, 'utf-8') .trim() .split('\n') - expect(logLines).toHaveLength(3) + expect(logLines.length).toBeGreaterThan(0) - // 3. State tracks all 3 completed tasks + // 3. State tracks last processed commit const statePath = path.join(repoDir, 'evalbuff-state.json') const state = JSON.parse(fs.readFileSync(statePath, 'utf-8')) - expect(state.completedTaskIds).toEqual([ - 'e2e-task-1', - 'e2e-task-2', - 'e2e-task-3', - ]) + expect(state.lastProcessedCommitSha).toBe('sha-3') + expect(state.processedCommitCount).toBe(3) - // 4. At least one doc was written (first task scores 3.0, below threshold) + // 4. At least one doc was written (first task scores 3.0) const docsDir = path.join(repoDir, 'docs') expect(fs.existsSync(docsDir)).toBe(true) @@ -221,13 +175,5 @@ describe('evalbuff E2E', () => { encoding: 'utf-8', }) expect(gitLog).toContain('evalbuff:') - - // 7. 
Log entries have correct task IDs - const parsedEntries = logLines.map((l) => JSON.parse(l)) - expect(parsedEntries.map((e: any) => e.taskId)).toEqual([ - 'e2e-task-1', - 'e2e-task-2', - 'e2e-task-3', - ]) }) }) diff --git a/evalbuff/src/__tests__/loop.integration.test.ts b/evalbuff/src/__tests__/loop.integration.test.ts index d4e5636d3..334dc545e 100644 --- a/evalbuff/src/__tests__/loop.integration.test.ts +++ b/evalbuff/src/__tests__/loop.integration.test.ts @@ -7,14 +7,13 @@ import { afterEach, beforeEach, describe, expect, it, mock } from 'bun:test' import type { JudgingResult } from '../judge' import type { DocSuggestion } from '../docs-optimizer' -import type { EvalDataV2 } from '../types' // --- Mocks --- -// Track calls to mocked functions let judgeCallCount = 0 let judgeScores: number[] = [] -let analyzeFailureResult: DocSuggestion | null = null +let analyzeCallCount = 0 +let analyzeFailureResults: Array = [] let cliRunnerCallCount = 0 // Mock withTestRepo to use a local temp dir instead of cloning @@ -49,6 +48,20 @@ mock.module('../cli-runner', () => ({ // Mock judge to return configurable scores mock.module('../judge', () => ({ + judgeTaskResult: async () => { + const score = judgeScores[judgeCallCount] ?? 5.0 + judgeCallCount++ + return { + analysis: 'Mock analysis', + strengths: ['Good'], + weaknesses: ['Could improve'], + e2eTestsPerformed: ['Mock E2E test'], + completionScore: score, + codeQualityScore: score, + e2eScore: score, + overallScore: score, + } satisfies JudgingResult + }, judgeCommitResult: async () => { const score = judgeScores[judgeCallCount] ?? 5.0 judgeCallCount++ @@ -69,52 +82,32 @@ mock.module('../judge', () => ({ const actualDocsOptimizer = await import('../docs-optimizer') mock.module('../docs-optimizer', () => ({ ...actualDocsOptimizer, - analyzeFailure: async () => analyzeFailureResult, + analyzeFailure: async () => { + const result = analyzeFailureResults[analyzeCallCount] ?? 
null + analyzeCallCount++ + return result + }, })) -// Mock CodebuffClient -mock.module('@codebuff/sdk', () => ({ - CodebuffClient: class { - constructor() {} - async run() { - return { output: { type: 'text', value: '' } } - } - }, +// Mock commit-task-generator to avoid real git and LLM calls +mock.module('../commit-task-generator', () => ({ + getCommitList: () => ['sha-1', 'sha-2', 'sha-3'], + buildCommitTask: async (_repoPath: string, sha: string) => ({ + sha, + parentSha: `parent-${sha}`, + message: `Commit ${sha}`, + prompt: `Do the thing for ${sha}`, + diff: `mock diff for ${sha}`, + filesChanged: ['src/file.ts'], + }), })) // Import after mocks are set up -const { runEvalbuff } = await import('../run-evalbuff') +const { runLearnMode, runPromptMode } = await import('../run-evalbuff') // --- Test fixtures --- let repoDir: string -let evalFilePath: string - -function createEvalFile(taskCount: number): string { - const evalData: EvalDataV2 = { - repoUrl: 'https://github.com/test/repo', - generationDate: '2026-03-25', - evalCommits: Array.from({ length: taskCount }, (_, i) => ({ - id: `task-${i + 1}`, - sha: `sha-${i + 1}`, - parentSha: `parent-${i + 1}`, - spec: `Test task ${i + 1}`, - prompt: `Do task ${i + 1}`, - supplementalFiles: [], - fileDiffs: [ - { - path: `src/file${i + 1}.ts`, - status: 'modified' as const, - diff: `@@ -1 +1 @@\n-old\n+new`, - }, - ], - })), - } - - const filePath = path.join(repoDir, `eval-test.json`) - fs.writeFileSync(filePath, JSON.stringify(evalData)) - return filePath -} beforeEach(() => { repoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-integ-')) @@ -122,12 +115,17 @@ beforeEach(() => { cwd: repoDir, stdio: 'ignore', }) - evalFilePath = createEvalFile(5) + // Set up a fake remote so git remote get-url works + execSync('git remote add origin https://github.com/test/repo', { + cwd: repoDir, + stdio: 'ignore', + }) // Reset mock state judgeCallCount = 0 judgeScores = [] - analyzeFailureResult = null + analyzeCallCount = 0 
+ analyzeFailureResults = [] cliRunnerCallCount = 0 }) @@ -137,39 +135,37 @@ afterEach(() => { // --- Tests --- -describe('runEvalbuff integration', () => { - it('completes one full iteration: runs agent, judges, and logs', async () => { - judgeScores = [8.0] // Above threshold, no doc edit attempted +describe('runLearnMode integration', () => { + it('processes commits, runs agents in parallel, judges, and logs', async () => { + // With parallelism=1 and 3 commits, we get 3 baseline runs (1 per commit) + // Each baseline run gets judged once + judgeScores = [8.0, 8.0, 8.0] - await runEvalbuff({ + await runLearnMode({ + mode: 'learn', repoPath: repoDir, agentCommand: 'echo', - evalDataPaths: [evalFilePath], - maxIterations: 1, + parallelism: 1, maxCostUsd: 100, - scoreThreshold: 7.0, agentTimeoutMs: 10_000, + commitCount: 500, }) - // Verify log was written + // Verify log was written with entries for each commit const logPath = path.join(repoDir, 'evalbuff-log.jsonl') expect(fs.existsSync(logPath)).toBe(true) const logLines = fs .readFileSync(logPath, 'utf-8') .trim() .split('\n') - expect(logLines).toHaveLength(1) - - const entry = JSON.parse(logLines[0]) - expect(entry.taskId).toBe('task-1') - expect(entry.oldScore).toBe(8.0) - expect(entry.docEdit).toBeNull() + expect(logLines).toHaveLength(3) - // Verify state was saved + // Verify state was saved with lastProcessedCommitSha const statePath = path.join(repoDir, 'evalbuff-state.json') expect(fs.existsSync(statePath)).toBe(true) const state = JSON.parse(fs.readFileSync(statePath, 'utf-8')) - expect(state.completedTaskIds).toContain('task-1') + expect(state.lastProcessedCommitSha).toBe('sha-3') + expect(state.processedCommitCount).toBe(3) // Verify morning report was generated const reportFiles = fs @@ -178,32 +174,40 @@ describe('runEvalbuff integration', () => { expect(reportFiles.length).toBeGreaterThan(0) }) - it('attempts doc edit when score is below threshold', async () => { - // First judge call returns low 
score, second (after doc edit) returns higher - judgeScores = [4.0, 6.0] - analyzeFailureResult = { + it('attempts doc edit and keeps it when score improves', async () => { + // parallelism=1: commit1 baseline=4.0, rerun with doc=7.0 (improved, kept) + // Then analyze returns null to stop loop. commit2 baseline=8.0, analyze returns null. + // commit3 baseline=8.0, analyze returns null. + judgeScores = [4.0, 7.0, 8.0, 8.0, 8.0, 8.0] + const docSuggestion: DocSuggestion = { reasoning: 'Agent missed error handling patterns', suggestedDocPath: 'patterns/errors.md', suggestedContent: '# Error Handling\n\nAlways use try/catch.', } + // First analyze call returns suggestion, then null to stop iterating + analyzeFailureResults = [docSuggestion, null, null, null] - await runEvalbuff({ + await runLearnMode({ + mode: 'learn', repoPath: repoDir, agentCommand: 'echo', - evalDataPaths: [evalFilePath], - maxIterations: 1, + parallelism: 1, maxCostUsd: 100, - scoreThreshold: 7.0, agentTimeoutMs: 10_000, + commitCount: 500, }) const logPath = path.join(repoDir, 'evalbuff-log.jsonl') - const entry = JSON.parse(fs.readFileSync(logPath, 'utf-8').trim()) - expect(entry.oldScore).toBe(4.0) - expect(entry.newScore).toBe(6.0) - expect(entry.scoreComparison).toBe('improved') - expect(entry.docEdit).not.toBeNull() - expect(entry.docEdit.path).toBe('patterns/errors.md') + const entries = fs + .readFileSync(logPath, 'utf-8') + .trim() + .split('\n') + .map((l) => JSON.parse(l)) + + // First entry should show doc improvement + expect(entries[0].oldScore).toBe(4.0) + expect(entries[0].newScore).toBe(7.0) + expect(entries[0].docEdit).not.toBeNull() // Doc should have been applied to the real repo const docPath = path.join(repoDir, 'docs', 'patterns', 'errors.md') @@ -211,132 +215,94 @@ describe('runEvalbuff integration', () => { expect(fs.readFileSync(docPath, 'utf-8')).toContain('Error Handling') }) - it('stops at maxIterations', async () => { - judgeScores = [8.0, 8.0, 8.0, 8.0, 8.0] - - await 
runEvalbuff({ - repoPath: repoDir, - agentCommand: 'echo', - evalDataPaths: [evalFilePath], // 5 tasks available - maxIterations: 2, - maxCostUsd: 100, - scoreThreshold: 7.0, - agentTimeoutMs: 10_000, - }) - - const logPath = path.join(repoDir, 'evalbuff-log.jsonl') - const logLines = fs - .readFileSync(logPath, 'utf-8') - .trim() - .split('\n') - expect(logLines).toHaveLength(2) - - const state = JSON.parse( - fs.readFileSync(path.join(repoDir, 'evalbuff-state.json'), 'utf-8'), - ) - expect(state.completedTaskIds).toHaveLength(2) - }) - it('stops when cost exceeds maxCostUsd', async () => { - judgeScores = [8.0, 8.0, 8.0, 8.0, 8.0] + judgeScores = [8.0, 8.0, 8.0] - // First run — complete 1 task, which will accumulate some cost - await runEvalbuff({ - repoPath: repoDir, - agentCommand: 'echo', - evalDataPaths: [evalFilePath], - maxIterations: 1, - maxCostUsd: 100, - scoreThreshold: 7.0, - agentTimeoutMs: 10_000, - }) - - // Manually set cost in state to be at the limit - const statePath = path.join(repoDir, 'evalbuff-state.json') - const state = JSON.parse(fs.readFileSync(statePath, 'utf-8')) - state.totalCostUsd = 100.0 - fs.writeFileSync(statePath, JSON.stringify(state)) - - // Second run — should stop immediately due to cost (>= maxCostUsd) - await runEvalbuff({ - repoPath: repoDir, - agentCommand: 'echo', - evalDataPaths: [evalFilePath], - maxIterations: 50, - maxCostUsd: 100, - scoreThreshold: 7.0, - agentTimeoutMs: 10_000, - }) - - // Should still only have 1 completed task (cost check prevents new tasks) - const finalState = JSON.parse(fs.readFileSync(statePath, 'utf-8')) - expect(finalState.completedTaskIds).toHaveLength(1) - }) - - it('resumes from state file and skips completed tasks', async () => { - judgeScores = [8.0, 8.0, 8.0, 8.0, 8.0] - - // Pre-populate state with 2 completed tasks + // Pre-set cost at limit const statePath = path.join(repoDir, 'evalbuff-state.json') fs.writeFileSync( statePath, JSON.stringify({ - completedTaskIds: ['task-1', 
'task-2'], - totalCostUsd: 5.0, - recentScores: [7.0, 8.0], + lastProcessedCommitSha: null, + totalCostUsd: 100.0, + recentScores: [], + processedCommitCount: 0, }), ) - await runEvalbuff({ + await runLearnMode({ + mode: 'learn', repoPath: repoDir, agentCommand: 'echo', - evalDataPaths: [evalFilePath], // 5 tasks - maxIterations: 50, + parallelism: 1, maxCostUsd: 100, - scoreThreshold: 7.0, agentTimeoutMs: 10_000, + commitCount: 500, }) - // Should have processed tasks 3-5 (skipped 1 and 2) + // Should not have processed any commits (cost already at limit) const logPath = path.join(repoDir, 'evalbuff-log.jsonl') - const logLines = fs - .readFileSync(logPath, 'utf-8') - .trim() - .split('\n') - expect(logLines).toHaveLength(3) - - const taskIds = logLines.map((l) => JSON.parse(l).taskId) - expect(taskIds).toEqual(['task-3', 'task-4', 'task-5']) - - const finalState = JSON.parse(fs.readFileSync(statePath, 'utf-8')) - expect(finalState.completedTaskIds).toHaveLength(5) + expect(fs.existsSync(logPath)).toBe(false) }) - it('reverts doc edit when score does not improve', async () => { - // First judge: low score, second judge: even lower (doc didn't help) - judgeScores = [4.0, 3.0] - analyzeFailureResult = { - reasoning: 'Tried to help', - suggestedDocPath: 'bad-doc.md', - suggestedContent: '# Bad Doc\n\nThis will not help.', - } - - await runEvalbuff({ + it('rejects doc edit when score does not improve', async () => { + // Commit1: baseline 4.0, rerun 3.0 (worse) — doc rejected, loop stops. + // Commit2: baseline 8.0, analyze returns null. Commit3: baseline 8.0, null. 
+ judgeScores = [4.0, 3.0, 8.0, 8.0] + analyzeFailureResults = [ + { + reasoning: 'Tried to help', + suggestedDocPath: 'bad-doc.md', + suggestedContent: '# Bad Doc\n\nThis will not help.', + }, + null, + null, + ] + + await runLearnMode({ + mode: 'learn', repoPath: repoDir, agentCommand: 'echo', - evalDataPaths: [evalFilePath], - maxIterations: 1, + parallelism: 1, maxCostUsd: 100, - scoreThreshold: 7.0, agentTimeoutMs: 10_000, + commitCount: 500, }) - const logPath = path.join(repoDir, 'evalbuff-log.jsonl') - const entry = JSON.parse(fs.readFileSync(logPath, 'utf-8').trim()) - expect(entry.scoreComparison).toBe('worse') - // Doc should NOT exist in the real repo const docPath = path.join(repoDir, 'docs', 'bad-doc.md') expect(fs.existsSync(docPath)).toBe(false) }) }) + +describe('runPromptMode integration', () => { + it('runs agents on a prompt and attempts doc improvement', async () => { + judgeScores = [5.0, 7.0] + analyzeFailureResults = [ + { + reasoning: 'Agent needs better context', + suggestedDocPath: 'conventions/api.md', + suggestedContent: '# API Conventions\n\nUse REST.', + }, + null, // stop after first improvement + ] + + await runPromptMode({ + mode: 'prompt', + repoPath: repoDir, + agentCommand: 'echo', + parallelism: 1, + maxCostUsd: 100, + agentTimeoutMs: 10_000, + prompt: 'Add a new API endpoint for users', + }) + + // Verify log was written + const logPath = path.join(repoDir, 'evalbuff-log.jsonl') + expect(fs.existsSync(logPath)).toBe(true) + const entry = JSON.parse( + fs.readFileSync(logPath, 'utf-8').trim(), + ) + expect(entry.taskId).toBe('prompt-mode') + }) +}) diff --git a/evalbuff/src/__tests__/trace-compressor.test.ts b/evalbuff/src/__tests__/trace-compressor.test.ts new file mode 100644 index 000000000..7039465fd --- /dev/null +++ b/evalbuff/src/__tests__/trace-compressor.test.ts @@ -0,0 +1,159 @@ +import fs from 'fs' +import os from 'os' +import path from 'path' + +import { afterEach, beforeEach, describe, expect, it } from 'bun:test' 
+ +import { compressTrace, cleanupTraceDir } from '../trace-compressor' + +let traceDir: string + +beforeEach(() => { + traceDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-trace-test-')) +}) + +afterEach(() => { + cleanupTraceDir(traceDir) +}) + +describe('compressTrace', () => { + it('leaves short traces unchanged', () => { + const trace = 'Thinking about the problem...\nLooking at the code.\nDone.' + const result = compressTrace(trace, traceDir) + + expect(result.inline).toBe(trace) + expect(fs.readdirSync(traceDir).filter((f) => f.endsWith('.txt'))).toHaveLength(0) + }) + + it('extracts large code fence blocks to files', () => { + const largeBlock = 'x'.repeat(500) + const trace = `Thinking about the problem... +\`\`\` +${largeBlock} +\`\`\` +Done.` + + const result = compressTrace(trace, traceDir) + + // The inline trace should have a pointer instead of the large block + expect(result.inline).toContain('[Code block stored in:') + expect(result.inline).toMatch(/\d+ chars/) + expect(result.inline).not.toContain(largeBlock) + + // The file should contain the block + const files = fs.readdirSync(traceDir).filter((f) => f.endsWith('.txt')) + expect(files).toHaveLength(1) + const fileContent = fs.readFileSync(path.join(traceDir, files[0]), 'utf-8') + expect(fileContent).toContain(largeBlock) + }) + + it('keeps small code fence blocks inline', () => { + const trace = `Looking at code: +\`\`\` +const x = 1 +\`\`\` +Done.` + + const result = compressTrace(trace, traceDir) + + expect(result.inline).toContain('const x = 1') + expect(result.inline).not.toContain('[Code block stored in:') + expect(fs.readdirSync(traceDir).filter((f) => f.endsWith('.txt'))).toHaveLength(0) + }) + + it('extracts large indented blocks', () => { + const indentedLines = Array.from({ length: 20 }, (_, i) => ` line ${i}: ${'content '.repeat(10)}`).join('\n') + const trace = `Running command:\n${indentedLines}\nDone.` + + const result = compressTrace(trace, traceDir) + + 
expect(result.inline).toContain('[Indented block stored in:') + expect(result.inline).toContain('20 lines') + + const files = fs.readdirSync(traceDir).filter((f) => f.endsWith('.txt')) + expect(files).toHaveLength(1) + }) + + it('handles JSON-lines format (Claude streaming)', () => { + const largeContent = 'x'.repeat(500) + const events = [ + JSON.stringify({ type: 'tool_use', name: 'Read', input: { path: 'src/index.ts' } }), + JSON.stringify({ type: 'tool_result', content: largeContent }), + JSON.stringify({ type: 'text', content: 'Now I understand the code.' }), + ] + const trace = events.join('\n') + + const result = compressTrace(trace, traceDir) + + // Tool use should still be inline + expect(result.inline).toContain('"name":"Read"') + // Large tool result should be extracted + expect(result.inline).toContain('[Stored in:') + expect(result.inline).not.toContain(largeContent) + // Text event should be inline + expect(result.inline).toContain('Now I understand the code') + + const files = fs.readdirSync(traceDir).filter((f) => f.endsWith('.txt')) + expect(files).toHaveLength(1) + }) + + it('keeps small JSON tool results inline', () => { + const events = [ + JSON.stringify({ type: 'tool_use', name: 'Read', input: { path: 'a.ts' } }), + JSON.stringify({ type: 'tool_result', content: 'short result' }), + ] + const trace = events.join('\n') + + const result = compressTrace(trace, traceDir) + + expect(result.inline).toContain('short result') + expect(result.inline).not.toContain('[Stored in:') + }) + + it('extracts multiple large blocks', () => { + const block1 = 'a'.repeat(500) + const block2 = 'b'.repeat(500) + const trace = `Step 1: +\`\`\` +${block1} +\`\`\` +Step 2: +\`\`\` +${block2} +\`\`\` +Done.` + + const result = compressTrace(trace, traceDir) + + const files = fs.readdirSync(traceDir).filter((f) => f.endsWith('.txt')) + expect(files).toHaveLength(2) + expect(result.inline).not.toContain(block1) + expect(result.inline).not.toContain(block2) + }) + + 
it('generates a content summary in the pointer', () => { + const jsonBlock = '{\n "name": "test",\n' + ' "data": "x",\n'.repeat(50) + '}' + const trace = `Result:\n\`\`\`\n${jsonBlock}\n\`\`\`\nDone.` + + const result = compressTrace(trace, traceDir) + + // Should have a summary mentioning it's a code block + expect(result.inline).toContain('code block') + }) +}) + +describe('cleanupTraceDir', () => { + it('removes the directory and all files', () => { + fs.writeFileSync(path.join(traceDir, 'test.txt'), 'content') + expect(fs.existsSync(traceDir)).toBe(true) + + cleanupTraceDir(traceDir) + + expect(fs.existsSync(traceDir)).toBe(false) + }) + + it('does not throw on non-existent directory', () => { + cleanupTraceDir('/tmp/nonexistent-evalbuff-trace-dir-xyz') + // Should not throw + }) +}) diff --git a/evalbuff/src/cli-runner.ts b/evalbuff/src/cli-runner.ts index 07529c0ea..fdd3cd50c 100644 --- a/evalbuff/src/cli-runner.ts +++ b/evalbuff/src/cli-runner.ts @@ -28,23 +28,39 @@ export async function runCliAgent( console.log(`[CliRunner] Running: ${cmd} ${baseArgs.join(' ')} `) + // Use detached + process group so we can kill the entire tree on timeout const child = spawn(cmd, args, { cwd, env: { ...process.env, ...env }, stdio: ['ignore', 'pipe', 'pipe'], + detached: true, }) let stdout = '' let stderr = '' - const timer = setTimeout(() => { - child.kill('SIGTERM') - // Give it 5 seconds to clean up, then force kill - setTimeout(() => { - if (!child.killed) { - child.kill('SIGKILL') + const killTree = () => { + const pid = child.pid + if (pid != null) { + try { + // Kill the entire process group (negative pid) + process.kill(-pid, 'SIGTERM') + } catch { + // Process may already be dead } - }, 5000) + setTimeout(() => { + try { + process.kill(-pid, 'SIGKILL') + } catch { + // ignore + } + }, 5000) + } + } + + const timer = setTimeout(() => { + console.warn(`[CliRunner] Timeout after ${timeoutMs}ms, killing process tree`) + killTree() }, timeoutMs) child.stdout.on('data', 
(data: Buffer) => { @@ -90,5 +106,8 @@ export async function runCliAgent( stderr, }) }) + + // Don't let the detached child keep the parent alive + child.unref() }) } diff --git a/evalbuff/src/commit-task-generator.ts b/evalbuff/src/commit-task-generator.ts new file mode 100644 index 000000000..036f93ef8 --- /dev/null +++ b/evalbuff/src/commit-task-generator.ts @@ -0,0 +1,269 @@ +import { execSync } from 'child_process' +import fs from 'fs' +import os from 'os' +import path from 'path' + +export interface CommitTask { + sha: string + parentSha: string + message: string + prompt: string + diff: string + filesChanged: string[] +} + +const MAX_DIFF_CHARS = 200_000 + +/** + * Get a list of commits from the repo, oldest first. + * Starts from `startAfterSha` (exclusive) or HEAD~commitCount if no state. + */ +export function getCommitList( + repoPath: string, + commitCount: number, + startAfterSha?: string, +): string[] { + if (startAfterSha) { + // Get all commits from startAfterSha (exclusive) to HEAD + const output = execSync( + `git log --format=%H --reverse ${startAfterSha}..HEAD`, + { cwd: repoPath, encoding: 'utf-8', maxBuffer: 10 * 1024 * 1024 }, + ).trim() + return output ? output.split('\n') : [] + } + + // Get last N commits, oldest first + const output = execSync( + `git log --format=%H -n ${commitCount} --reverse`, + { cwd: repoPath, encoding: 'utf-8', maxBuffer: 10 * 1024 * 1024 }, + ).trim() + return output ? output.split('\n') : [] +} + +/** + * Extract commit info needed to build a task. + * Returns null for merge commits or commits with no parent. 
+ */ +export function getCommitInfo( + repoPath: string, + sha: string, +): { parentSha: string; message: string; diff: string; filesChanged: string[] } | null { + try { + // Get parent SHA + const parents = execSync(`git log --pretty=%P -n 1 ${sha}`, { + cwd: repoPath, + encoding: 'utf-8', + }).trim() + + if (!parents) return null // initial commit + + const parentList = parents.split(' ') + if (parentList.length > 1) return null // merge commit + + const parentSha = parentList[0] + + // Get commit message + const message = execSync(`git log --format=%B -n 1 ${sha}`, { + cwd: repoPath, + encoding: 'utf-8', + }).trim() + + // Get diff + const diff = execSync(`git diff ${parentSha} ${sha}`, { + cwd: repoPath, + encoding: 'utf-8', + maxBuffer: 10 * 1024 * 1024, + }) + + // Get files changed + const filesOutput = execSync(`git diff --name-only ${parentSha} ${sha}`, { + cwd: repoPath, + encoding: 'utf-8', + }).trim() + const filesChanged = filesOutput ? filesOutput.split('\n') : [] + + return { parentSha, message, diff, filesChanged } + } catch { + return null + } +} + +/** + * Read a file's content at a specific commit SHA. + * Returns null if the file doesn't exist at that commit. + */ +function readFileAtCommit( + repoPath: string, + sha: string, + filePath: string, +): string | null { + try { + return execSync(`git show ${sha}:${JSON.stringify(filePath)}`, { + cwd: repoPath, + encoding: 'utf-8', + maxBuffer: 10 * 1024 * 1024, + }) + } catch { + return null + } +} + +/** + * Read the full contents of all files being modified at the parent commit. + * This gives the prompt generator context about what the code looks like + * before the change, so it can write a realistic human prompt. 
+ */
+function readFilesAtParent(
+  repoPath: string,
+  parentSha: string,
+  filesChanged: string[],
+): Record<string, string> {
+  const files: Record<string, string> = {}
+  let totalSize = 0
+  const maxTotalSize = 500_000 // 500K total for all files
+
+  for (const filePath of filesChanged) {
+    if (totalSize >= maxTotalSize) break
+
+    const content = readFileAtCommit(repoPath, parentSha, filePath)
+    if (content != null && content.length > 0) {
+      files[filePath] = content
+      totalSize += content.length
+    }
+  }
+
+  return files
+}
+
+const PROMPT_GEN_SYSTEM = `You are generating a task prompt that a human developer would realistically write to ask an AI coding agent to make changes to their codebase.
+
+You will receive:
+- A git diff showing exactly what was changed
+- The full contents of all files being modified (as they looked BEFORE the change)
+- The commit message (as a hint, but don't just copy it)
+
+Your job is to write a natural, human-sounding prompt — the kind of thing a developer would type into a chat with an AI assistant.
+
+## Key Principles
+
+1. Focus on high-level functional requirements, not implementation details
+   - GOOD: "add user authentication to the API"
+   - BAD: "implement an authenticateUser function in src/auth/middleware.ts"
+
+2. Use natural language — like a Slack message or ticket description
+   - GOOD: "the nightly CI is pointing at the wrong directory, it should be agents not .agents"
+   - BAD: "Update the directory reference in .github/workflows/nightly-e2e.yml from .agents to agents"
+
+3. Describe what you WANT or what's WRONG, not how to fix it
+   - GOOD: "the hover state on buttons looks broken"
+   - BAD: "change the CSS hover opacity from 0.5 to 0.8 in Button.tsx"
+
+4. Don't reference specific file paths unless a human naturally would. Humans describe the feature area, not the file tree.
+   - GOOD: "our login page needs to redirect to freebuff.com instead of codebuff.com"
+   - BAD: "update src/auth/login.ts, src/config/urls.ts, and tests/auth.test.ts to change codebuff.com to freebuff.com"
+
+5. Don't over-specify. Leave room for the agent to figure out the implementation.
+
+6. Keep it to 1-4 sentences.
+
+7. Read the FULL file contents to understand context. The diff alone can be misleading — understanding the surrounding code helps you write a prompt that makes sense for this codebase.
+
+## Output
+
+Respond with ONLY the prompt text. No quotes, no preamble, no explanation.`
+
+/**
+ * Generate a human-like task prompt from a commit.
+ * Reads the full files at the parent commit for context, similar to how
+ * buffbench uses file-explorer agents to understand the codebase.
+ */
+export async function generatePromptFromCommit(
+  repoPath: string,
+  parentSha: string,
+  message: string,
+  diff: string,
+  filesChanged: string[],
+): Promise<string> {
+  // Read full file contents at the parent commit for context
+  const fileContents = readFilesAtParent(repoPath, parentSha, filesChanged)
+
+  let filesSection = ''
+  if (Object.keys(fileContents).length > 0) {
+    filesSection = `## File Contents (before the change)\n\n`
+    for (const [filePath, content] of Object.entries(fileContents)) {
+      filesSection += `### ${filePath}\n\`\`\`\n${content}\n\`\`\`\n\n`
+    }
+  }
+
+  const userPrompt = `## Commit Message
+${message}
+
+${filesSection}## Diff
+\`\`\`diff
+${diff}
+\`\`\``
+
+  const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-promptgen-'))
+  const promptFile = path.join(tmpDir, 'PROMPT_GEN.md')
+
+  try {
+    fs.writeFileSync(promptFile, `${PROMPT_GEN_SYSTEM}\n\n---\n\n${userPrompt}`)
+
+    const output = execSync(
+      `claude --dangerously-skip-permissions -p "Read ${promptFile} and follow all instructions. 
Respond with ONLY the task prompt text."`,
+      {
+        encoding: 'utf-8',
+        timeout: 2 * 60 * 1000,
+        stdio: ['ignore', 'pipe', 'pipe'],
+        maxBuffer: 10 * 1024 * 1024,
+      },
+    ).trim()
+
+    return output || message
+  } catch {
+    // Fallback to the commit message itself
+    return message
+  } finally {
+    fs.rmSync(tmpDir, { recursive: true, force: true })
+  }
+}
+
+/**
+ * Build a full CommitTask from a SHA.
+ * Returns null if the commit can't be used (merge, initial, too large diff, etc).
+ */
+export async function buildCommitTask(
+  repoPath: string,
+  sha: string,
+): Promise<CommitTask | null> {
+  const info = getCommitInfo(repoPath, sha)
+  if (!info) return null
+
+  // Skip commits with diffs that exceed our limit
+  if (info.diff.length > MAX_DIFF_CHARS) {
+    console.log(`Skipping ${sha.slice(0, 8)}: diff too large (${info.diff.length} chars)`)
+    return null
+  }
+
+  // Skip commits with no meaningful code changes
+  if (info.filesChanged.length === 0) {
+    return null
+  }
+
+  const prompt = await generatePromptFromCommit(
+    repoPath,
+    info.parentSha,
+    info.message,
+    info.diff,
+    info.filesChanged,
+  )
+
+  return {
+    sha,
+    parentSha: info.parentSha,
+    message: info.message,
+    prompt,
+    diff: info.diff,
+    filesChanged: info.filesChanged,
+  }
+}
diff --git a/evalbuff/src/docs-optimizer.ts b/evalbuff/src/docs-optimizer.ts
index cc9b95d0d..9673eddfe 100644
--- a/evalbuff/src/docs-optimizer.ts
+++ b/evalbuff/src/docs-optimizer.ts
@@ -3,6 +3,8 @@ import fs from 'fs'
 import os from 'os'
 import path from 'path'
 
+import { compressTrace, cleanupTraceDir } from './trace-compressor'
+
 import type { JudgingResult } from './judge'
 
 export interface DocSuggestion {
@@ -13,56 +15,108 @@ const DOC_WRITER_SYSTEM_PROMPT = `You are an expert at writing developer documentation that helps AI coding agents perform better.
 
-Your job: Given a coding agent's failure on a task, write a targeted documentation file that would prevent this class of error in the future.
+Your job: Given the results of an AI coding agent's attempt at a task, write a targeted documentation file that would help the agent perform better on FUTURE tasks — not just this specific one. + +## Critical Rule: Genericity + +The docs you write must be **generic enough to be useful across many future tasks**, not solely useful for the specific task that was just attempted. Think about: +- What general PATTERN does this failure reveal? +- What CONVENTION or ARCHITECTURE knowledge would prevent a whole class of similar errors? +- What would a senior developer tell a new team member on their first day? + +DO NOT write docs that only help with one specific task. If the failure is too task-specific and doesn't reveal a general pattern, respond with: {"skip": true, "reasoning": "Too task-specific to generalize"} + +## Using the Agent Trace + +You may be given the agent's trace (stdout) showing its reasoning process, tool calls, and decisions. This is the most valuable signal — it shows you WHY the agent went wrong, not just WHAT it got wrong. Look for: +- **Wrong assumptions** about the codebase structure or conventions +- **Misunderstood patterns** — the agent tried something that doesn't match how this codebase works +- **Missing context** — the agent didn't know about a key file, config, or convention +- **Wrong approach** — the agent took a fundamentally different approach than needed + +The trace shows the full agent reasoning inline, but large tool results (file contents, command output) have been extracted to separate files. You'll see markers like: + [Stored in: /tmp/evalbuff-traces-xxx/result-003.txt (2847 chars) — file content, 84 lines] +You can read these files if you need the full content to understand what the agent saw. + +Write docs that address the ROOT CAUSE visible in the trace, not just the symptom visible in the diff. ## Rules 1. Be SPECIFIC and ACTIONABLE. Reference concrete file paths, function names, and patterns from the codebase. 2. 
Do NOT write generic advice like "follow best practices" or "write clean code." -3. Focus on the GAP between what the agent did and what it should have done. -4. Write docs that a coding agent will read and immediately know what to do differently. +3. Focus on the general PATTERN behind the gap, not the specific gap itself. +4. Write docs that a coding agent will read and immediately know what to do differently on any similar task. 5. Keep docs concise — under 200 lines. Dense information beats verbose explanations. 6. Use a logical file path that groups related docs together (e.g., "patterns/", "conventions/", "architecture/"). 7. Include examples of correct patterns from the codebase when possible. +8. If a doc already exists on a similar topic, suggest UPDATING it (use the same path) rather than creating a new one. ## Output Format You MUST respond with ONLY a JSON object (no markdown fences, no explanation). The JSON must have exactly these fields: { - "reasoning": "Why this doc would help", + "reasoning": "Why this doc would help (referencing the general pattern, not just this task)", "suggestedDocPath": "path/relative/to/docs/dir.md", "suggestedContent": "The markdown content" -}` +} + +Or if too task-specific: +{"skip": true, "reasoning": "explanation"}` /** - * Analyze a failure and suggest a doc edit to prevent it. - * Uses Claude CLI to generate suggestions. - * Returns null if score is above threshold (no improvement needed). + * Analyze agent run results and suggest a doc edit to improve future performance. + * Always analyzes — no score threshold check. + * Returns null if the doc writer decides the failure is too task-specific to generalize. 
 */
 export async function analyzeFailure({
   judgeResult,
   taskPrompt,
   agentDiff,
+  agentTrace,
   groundTruthDiff,
   currentDocs,
-  scoreThreshold,
 }: {
   judgeResult: JudgingResult
   taskPrompt: string
   agentDiff: string
-  groundTruthDiff: string
+  agentTrace?: string // stdout from the agent — reasoning, tool calls, errors
+  groundTruthDiff?: string // optional — not available in prompt mode
   currentDocs: Record<string, string>
-  scoreThreshold: number
-  client?: unknown // kept for backwards compat, ignored
 }): Promise<DocSuggestion | null> {
-  if (judgeResult.overallScore >= scoreThreshold) {
-    return null
-  }
-
   const docsContent = Object.entries(currentDocs)
     .map(([docPath, content]) => `### ${docPath}\n\`\`\`\n${content}\n\`\`\``)
     .join('\n\n')
 
+  const groundTruthSection = groundTruthDiff
+    ? `## Ground Truth (what should have been done)
+\`\`\`diff
+${groundTruthDiff}
+\`\`\``
+    : '## Ground Truth\n(Not available — judge should have tested the output directly)'
+
+  // Compress agent trace: keep reasoning inline, extract large tool results to files
+  // The doc writer agent can read those files if it needs the full content
+  let compressed: ReturnType<typeof compressTrace> | null = null
+  let traceSection = ''
+
+  if (agentTrace) {
+    const traceDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-traces-'))
+    compressed = compressTrace(agentTrace, traceDir)
+
+    const resultFiles = fs.readdirSync(traceDir).filter((f) => f.endsWith('.txt'))
+
+    traceSection = `## Agent Trace (reasoning, tool calls, and decisions)
+
+This is the agent's stdout showing its reasoning process, tool calls, and decisions.
+Large tool results have been extracted to separate files — you can read them if needed.
+Look for: what the agent misunderstood, wrong assumptions it made, where it went off track.
+
+${resultFiles.length > 0 ? 
`**${resultFiles.length} tool result(s) stored in ${traceDir}/** — read any file for full content.\n` : ''} +\`\`\` +${compressed.inline} +\`\`\`` + } + const prompt = `${DOC_WRITER_SYSTEM_PROMPT} ## Task Prompt @@ -74,25 +128,28 @@ ${judgeResult.analysis} ## Judge Weaknesses Found ${judgeResult.weaknesses.map((w) => `- ${w}`).join('\n')} -## Ground Truth (what should have been done) -\`\`\`diff -${groundTruthDiff} -\`\`\` +## Judge Strengths Found +${judgeResult.strengths.map((s) => `- ${s}`).join('\n')} + +## Overall Score: ${judgeResult.overallScore}/10 + +${groundTruthSection} ## Agent's Changes (what was actually done) \`\`\`diff ${agentDiff || '(No changes made)'} \`\`\` +${traceSection} + ## Current Docs (already available to the agent) ${docsContent || '(No docs yet)'} -Based on the gap between what the agent did and what it should have done, write a doc file that would help the agent get it right next time. Focus on the specific weakness identified by the judge. +Based on the agent's trace (if available), the gap between what the agent did and what it should have done, and the judge's analysis, write a doc file that captures a GENERAL PATTERN that would help the agent across many similar tasks. Focus on what the agent MISUNDERSTOOD (visible in the trace) rather than just what it got wrong (visible in the diff). If this failure doesn't reveal a generalizable pattern, respond with {"skip": true, "reasoning": "..."}. 
Respond with ONLY the JSON object.` try { - // Write prompt to temp file to avoid CLI arg length limits const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-docwriter-')) const promptFile = path.join(tmpDir, 'DOC_WRITER_PROMPT.md') fs.writeFileSync(promptFile, prompt) @@ -110,41 +167,51 @@ Respond with ONLY the JSON object.` ).trim() } finally { fs.rmSync(tmpDir, { recursive: true, force: true }) + // Clean up trace files after doc writer is done + if (compressed) { + cleanupTraceDir(compressed.traceDir) + } } // Try to extract JSON from the output let jsonStr = output - // Strip markdown code fences if present const jsonMatch = output.match(/```(?:json)?\s*\n([\s\S]*?)\n\s*```/) if (jsonMatch) { jsonStr = jsonMatch[1] } - // Try to find a JSON object const objMatch = jsonStr.match(/\{[\s\S]*\}/) if (!objMatch) { console.error('Doc writer did not return JSON') return null } - const value = JSON.parse(objMatch[0]) as DocSuggestion + const value = JSON.parse(objMatch[0]) + + // Check if the doc writer decided to skip + if (value.skip) { + console.log(`Doc writer skipped: ${value.reasoning}`) + return null + } + + const suggestion = value as DocSuggestion // Validate the path is under docs/ if ( - value.suggestedDocPath.startsWith('/') || - value.suggestedDocPath.includes('..') + suggestion.suggestedDocPath.startsWith('/') || + suggestion.suggestedDocPath.includes('..') ) { console.error( - `Doc writer suggested invalid path: ${value.suggestedDocPath}`, + `Doc writer suggested invalid path: ${suggestion.suggestedDocPath}`, ) return null } - if (!value.reasoning || !value.suggestedDocPath || !value.suggestedContent) { + if (!suggestion.reasoning || !suggestion.suggestedDocPath || !suggestion.suggestedContent) { console.error('Doc writer returned incomplete suggestion') return null } - return value + return suggestion } catch (error) { console.error('Doc writer failed:', error) return null @@ -160,7 +227,6 @@ export function applyDocEdit( content: string, 
agentsMdPath?: string, ): boolean { - // Validate path is under docs/ if (docPath.startsWith('/') || docPath.includes('..')) { console.error(`Rejected doc path outside docs/: ${docPath}`) return false @@ -170,16 +236,11 @@ export function applyDocEdit( const fullAgentsMdPath = agentsMdPath || path.join(repoPath, 'AGENTS.md') try { - // Create directory structure fs.mkdirSync(path.dirname(fullDocPath), { recursive: true }) - // Check if this is a new file (for AGENTS.md update) const isNew = !fs.existsSync(fullDocPath) - - // Write the doc file fs.writeFileSync(fullDocPath, content) - // Update AGENTS.md if new file if (isNew) { let agentsMd = '' if (fs.existsSync(fullAgentsMdPath)) { @@ -202,6 +263,39 @@ export function applyDocEdit( } } +/** + * Remove a doc edit from a repo — deletes the file and removes from AGENTS.md. + */ +export function revertDocEdit( + repoPath: string, + docPath: string, + agentsMdPath?: string, +): boolean { + const fullDocPath = path.join(repoPath, 'docs', docPath) + const fullAgentsMdPath = agentsMdPath || path.join(repoPath, 'AGENTS.md') + + try { + if (fs.existsSync(fullDocPath)) { + fs.rmSync(fullDocPath) + } + + // Remove from AGENTS.md + if (fs.existsSync(fullAgentsMdPath)) { + let agentsMd = fs.readFileSync(fullAgentsMdPath, 'utf-8') + const entry = `- [docs/${docPath}](docs/${docPath})\n` + if (agentsMd.includes(entry)) { + agentsMd = agentsMd.replace(entry, '') + fs.writeFileSync(fullAgentsMdPath, agentsMd) + } + } + + return true + } catch (error) { + console.error(`Failed to revert doc edit: ${error}`) + return false + } +} + /** * Compare scores to determine if a doc edit improved things. 
*/ diff --git a/evalbuff/src/judge.ts b/evalbuff/src/judge.ts index f543afd3d..14ef8bebd 100644 --- a/evalbuff/src/judge.ts +++ b/evalbuff/src/judge.ts @@ -60,7 +60,7 @@ const REVIEWER_CONFIGS: Record = { '__PROMPT__', '--dangerously-skip-permissions', ], - timeoutMs: 30 * 60 * 1000, // 30 min — needs time for E2E testing + timeoutMs: 30 * 60 * 1000, }, codex: { type: 'codex', @@ -81,26 +81,44 @@ const REVIEWER_CONFIGS: Record = { }, } -// The result file name the reviewer agent is instructed to write const RESULT_FILE_NAME = 'evalbuff-review-result.json' function buildReviewerPrompt(input: { - commit: EvalCommitV2 - contextFiles: Record + commit?: EvalCommitV2 + taskPrompt: string + contextFiles?: Record agentDiff: string + groundTruthDiff?: string error?: string criteria?: QualityCriteria docsDir?: string }): string { - const { commit, contextFiles, agentDiff, error, criteria, docsDir } = input - - const groundTruthDiffs = commit.fileDiffs - .map(({ path: p, diff }) => `### ${p}\n\`\`\`diff\n${diff}\n\`\`\``) - .join('\n\n') + const { commit, taskPrompt, contextFiles, agentDiff, groundTruthDiff, error, criteria, docsDir } = input + + const groundTruthSection = groundTruthDiff + ? `## Ground Truth Changes (One valid implementation) +${groundTruthDiff}` + : `## Ground Truth +No reference implementation is available. You must judge the agent's work solely by testing it end-to-end. Focus heavily on: +- Does it build and run? +- Does the feature actually work when you test it? +- Are there errors in the logs? +- Does it handle edge cases?` + + const contextFilesContent = contextFiles + ? 
Object.entries(contextFiles) + .map(([filePath, content]) => `### ${filePath}\n\`\`\`\n${content}\n\`\`\``) + .join('\n\n') + : '' - const contextFilesContent = Object.entries(contextFiles) - .map(([filePath, content]) => `### ${filePath}\n\`\`\`\n${content}\n\`\`\``) - .join('\n\n') + // Legacy support: build ground truth from commit fileDiffs if no explicit groundTruthDiff + const groundTruth = groundTruthDiff + ? groundTruthSection + : commit?.fileDiffs + ? `## Ground Truth Changes (One valid implementation)\n${commit.fileDiffs + .map(({ path: p, diff }) => `### ${p}\n\`\`\`diff\n${diff}\n\`\`\``) + .join('\n\n')}` + : groundTruthSection const criteriaText = criteria ? formatCriteriaForPrompt(criteria) @@ -114,10 +132,10 @@ function buildReviewerPrompt(input: { ## Your Mission -You have been given a coding task, the ground truth solution, and an AI agent's attempt. Your job is to: +You have been given a coding task and an AI agent's attempt. Your job is to: 1. **Read the project docs** (if present) to understand conventions and patterns -2. **Review the agent's diff** against the ground truth +2. **Review the agent's diff** ${groundTruthDiff || commit?.fileDiffs ? 'against the ground truth' : 'for correctness and completeness'} 3. **Actually test the changes** end-to-end: - Start the application if possible (check package.json for start/dev scripts) - Use browser tools, curl, or the appropriate client to exercise the feature @@ -140,13 +158,11 @@ Use whatever tools you need to verify the change actually works: ${docsSection} ## User Prompt (What the agent was asked to do) -${commit.prompt} +${taskPrompt} -## Context Files (from parent commit) -${contextFilesContent || '(No context files)'} +${contextFilesContent ? 
`## Context Files (from parent commit)\n${contextFilesContent}` : ''} -## Ground Truth Changes (One valid implementation) -${groundTruthDiffs} +${groundTruth} ## Agent's Changes (What the agent actually did) \`\`\`diff @@ -181,11 +197,6 @@ const PROMPT_FILE_NAME = 'EVALBUFF_REVIEW_PROMPT.md' const BOOTSTRAP_PROMPT = `Read the file ${PROMPT_FILE_NAME} in the current directory and follow all instructions in it exactly. The file contains a code review task. After your review and testing, you MUST write your judgment to ${RESULT_FILE_NAME} as specified in the prompt file.` -/** - * Run a single reviewer agent in the given repo directory. - * Writes the full prompt to a file in the repo, then gives the agent - * a short bootstrap prompt to read it (avoids CLI arg length limits). - */ async function runReviewerAgent( agentType: ReviewerAgentType, prompt: string, @@ -194,7 +205,6 @@ async function runReviewerAgent( ): Promise { const config = REVIEWER_CONFIGS[agentType] - // Write the full prompt to a file in the repo fs.writeFileSync(path.join(cwd, PROMPT_FILE_NAME), prompt) const args = config.command @@ -255,7 +265,6 @@ async function runReviewerAgent( ) } - // Try to read the result file the agent wrote const resultPath = path.join(cwd, RESULT_FILE_NAME) const result = parseResultFile(resultPath, agentType) @@ -264,7 +273,6 @@ async function runReviewerAgent( return } - // Fallback: try to extract JSON from stdout const extracted = extractJsonFromOutput(stdout, agentType) if (extracted) { resolve(extracted) @@ -279,9 +287,6 @@ async function runReviewerAgent( }) } -/** - * Try to parse the result file written by the reviewer agent. 
- */ function parseResultFile( resultPath: string, agentType: string, @@ -300,7 +305,6 @@ function parseResultFile( `[Reviewer:${agentType}] Result file failed validation:`, parsed.error, ) - // Try to salvage partial result return salvagePartialResult(raw) } catch (error) { console.warn( @@ -311,25 +315,17 @@ function parseResultFile( } } -/** - * Try to extract JSON from the agent's stdout as a fallback. - * Looks for the last JSON block that matches our schema. - */ function extractJsonFromOutput( output: string, agentType: string, ): JudgingResult | null { - // Try to find JSON blocks in the output (between ``` or raw JSON objects) const jsonPatterns = [ - // Match JSON in code fences /```(?:json)?\s*\n({[\s\S]*?})\n\s*```/g, - // Match standalone JSON objects (greedy, last match wins) /(\{[^{}]*"overallScore"[^{}]*\})/g, ] for (const pattern of jsonPatterns) { const matches = [...output.matchAll(pattern)] - // Try last match first (most likely to be the final result) for (let i = matches.length - 1; i >= 0; i--) { try { const raw = JSON.parse(matches[i][1]) @@ -351,9 +347,6 @@ function extractJsonFromOutput( return null } -/** - * Try to salvage a partially valid result by filling in defaults. - */ function salvagePartialResult(raw: any): JudgingResult | null { if (typeof raw !== 'object' || raw === null) return null if (typeof raw.overallScore !== 'number') return null @@ -383,7 +376,7 @@ export interface JudgeCommitResultInput { commit: EvalCommitV2 contextFiles: Record agentDiff: string - repoDir: string // the test repo where the agent's changes live + repoDir: string error?: string criteria?: QualityCriteria reviewerAgents?: ReviewerAgentType[] @@ -410,6 +403,7 @@ export async function judgeCommitResult( const prompt = buildReviewerPrompt({ commit, + taskPrompt: commit.prompt, contextFiles, agentDiff, error, @@ -417,12 +411,62 @@ export async function judgeCommitResult( docsDir: fs.existsSync(path.join(repoDir, 'docs')) ? 
repoDir : undefined, }) - // Run reviewer agents in parallel, each in their own copy of the repo + return runReviewersAndAggregate(prompt, repoDir, reviewerAgents, env) +} + +/** + * Judge an agent's work on a task prompt — no ground truth commit needed. + * Used for both commit-learning mode (with ground truth diff) and prompt mode (without). + */ +export interface JudgeTaskResultInput { + taskPrompt: string + agentDiff: string + groundTruthDiff?: string + repoDir: string + error?: string + criteria?: QualityCriteria + reviewerAgents?: ReviewerAgentType[] + env?: Record +} + +export async function judgeTaskResult( + input: JudgeTaskResultInput, +): Promise { + const { + taskPrompt, + agentDiff, + groundTruthDiff, + repoDir, + error, + criteria, + reviewerAgents = ['claude', 'codex'], + env, + } = input + + const prompt = buildReviewerPrompt({ + taskPrompt, + agentDiff, + groundTruthDiff, + error, + criteria, + docsDir: fs.existsSync(path.join(repoDir, 'docs')) ? repoDir : undefined, + }) + + return runReviewersAndAggregate(prompt, repoDir, reviewerAgents, env) +} + +/** + * Shared logic: run reviewer agents in parallel and aggregate results. 
+ */ +async function runReviewersAndAggregate( + prompt: string, + repoDir: string, + reviewerAgents: ReviewerAgentType[], + env?: Record, +): Promise { const reviewPromises = reviewerAgents.map(async (agentType) => { - // Each reviewer gets its own copy of the repo so they don't interfere const reviewDir = `${repoDir}-review-${agentType}` try { - // Fast copy: use rsync to exclude heavy dirs, then symlink them const nodeModulesPath = path.join(repoDir, 'node_modules') const hasNodeModules = fs.existsSync(nodeModulesPath) if (hasNodeModules) { @@ -434,7 +478,6 @@ export async function judgeCommitResult( } else { execSync(`cp -r "${repoDir}" "${reviewDir}"`, { stdio: 'ignore' }) } - // Don't pass eval env to reviewers — they need real API keys, not test ones return await runReviewerAgent(agentType, prompt, reviewDir) } finally { try { @@ -466,14 +509,12 @@ export async function judgeCommitResult( } } - // Sort by overall score, pick median for analysis const sorted = validResults.sort( (a, b) => a.overallScore - b.overallScore, ) const medianIdx = Math.floor(sorted.length / 2) const medianResult = sorted[medianIdx] - // Average scores across all valid reviewers const avg = (key: keyof JudgingResult) => validResults.reduce((sum, r) => sum + (r[key] as number), 0) / validResults.length @@ -483,7 +524,6 @@ export async function judgeCommitResult( const avgE2eScore = avg('e2eScore') const avgOverallScore = avg('overallScore') - // Merge e2eTestsPerformed from all reviewers const allE2eTests = [ ...new Set(validResults.flatMap((r) => r.e2eTestsPerformed)), ] diff --git a/evalbuff/src/run-e2e-test.ts b/evalbuff/src/run-e2e-test.ts index 252a65664..56840ed5e 100644 --- a/evalbuff/src/run-e2e-test.ts +++ b/evalbuff/src/run-e2e-test.ts @@ -1,12 +1,13 @@ /** * Real E2E test for evalbuff. * - * Creates a local git repo with a simple project, generates an eval task, - * and runs the full evalbuff loop with real CLI coding agents and real - * reviewer agents. No mocks. 
+ * Creates a local git repo with a simple project, then runs evalbuff's + * learn mode against it using real CLI coding agents and real reviewer agents. + * No mocks. * * Prerequisites: * - `claude` CLI installed and authenticated + * - `codebuff` CLI installed * - (Optional) `codex` CLI installed with OPENAI_API_KEY set * * Usage: @@ -17,17 +18,14 @@ import fs from 'fs' import os from 'os' import path from 'path' -import { runEvalbuff } from './run-evalbuff' +import { runLearnMode } from './run-evalbuff' import type { ReviewerAgentType } from './judge' -import type { EvalDataV2 } from './types' // --- Setup --- const BASE_DIR = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-real-e2e-')) const PROJECT_DIR = path.join(BASE_DIR, 'project') -const BARE_REPO = path.join(BASE_DIR, 'project.git') -const TARGET_DIR = path.join(BASE_DIR, 'target') const gitEnv = { GIT_AUTHOR_NAME: 'evalbuff-test', @@ -48,11 +46,10 @@ function git(cmd: string, cwd: string) { function setupProject() { console.log('\n=== Setting up test project ===') - // Create project directory fs.mkdirSync(PROJECT_DIR, { recursive: true }) git('init', PROJECT_DIR) - // Initial commit: a simple Node.js project with a bug + // Initial commit fs.writeFileSync( path.join(PROJECT_DIR, 'package.json'), JSON.stringify( @@ -80,32 +77,22 @@ export function add(a, b) { export function multiply(a, b) { return a * b } - -// BUG: subtract is wrong — it adds instead of subtracting -export function subtract(a, b) { - return a + b -} - -export function divide(a, b) { - if (b === 0) throw new Error('Division by zero') - return a / b -} `, ) fs.writeFileSync( path.join(PROJECT_DIR, 'test.js'), - `import { add, subtract, multiply, divide } from './index.js' + `import { add, multiply } from './index.js' let passed = 0 let failed = 0 function assert(name, actual, expected) { if (actual === expected) { - console.log(\` ✓ \${name}\`) + console.log(\` pass: \${name}\`) passed++ } else { - console.log(\` ✗ \${name}: expected 
\${expected}, got \${actual}\`) + console.log(\` fail: \${name}: expected \${expected}, got \${actual}\`) failed++ } } @@ -113,17 +100,6 @@ function assert(name, actual, expected) { console.log('Running tests...') assert('add(2, 3)', add(2, 3), 5) assert('multiply(3, 4)', multiply(3, 4), 12) -assert('subtract(10, 3)', subtract(10, 3), 7) -assert('divide(10, 2)', divide(10, 2), 5) - -try { - divide(1, 0) - console.log(' ✗ divide by zero should throw') - failed++ -} catch (e) { - console.log(' ✓ divide by zero throws') - passed++ -} console.log(\`\\n\${passed} passed, \${failed} failed\`) if (failed > 0) process.exit(1) @@ -131,12 +107,9 @@ if (failed > 0) process.exit(1) ) git('add .', PROJECT_DIR) - git('commit -m "Initial project with bug in subtract"', PROJECT_DIR) - const parentSha = git('rev-parse HEAD', PROJECT_DIR) - - console.log(` Parent commit (with bug): ${parentSha.slice(0, 8)}`) + git('commit -m "Initial project with add and multiply"', PROJECT_DIR) - // Now create the ground truth fix + // Second commit: add subtract (with a bug) fs.writeFileSync( path.join(PROJECT_DIR, 'index.js'), `// Simple math utility @@ -148,76 +121,72 @@ export function multiply(a, b) { return a * b } +// BUG: adds instead of subtracting export function subtract(a, b) { - return a - b -} - -export function divide(a, b) { - if (b === 0) throw new Error('Division by zero') - return a / b + return a + b } `, ) git('add .', PROJECT_DIR) - git('commit -m "Fix subtract function"', PROJECT_DIR) - const fixSha = git('rev-parse HEAD', PROJECT_DIR) - - console.log(` Fix commit (ground truth): ${fixSha.slice(0, 8)}`) + git('commit -m "Add subtract function (has bug)"', PROJECT_DIR) - // Get the diff for the ground truth - const diff = git(`diff ${parentSha} ${fixSha} -- index.js`, PROJECT_DIR) + // Third commit: fix the bug + fs.writeFileSync( + path.join(PROJECT_DIR, 'index.js'), + `// Simple math utility +export function add(a, b) { + return a + b +} - // Create bare clone for 
withTestRepo to clone from - execSync(`git clone --bare ${PROJECT_DIR} ${BARE_REPO}`, { - stdio: 'ignore', - env: { ...process.env, ...gitEnv }, - }) - console.log(` Bare repo created at: ${BARE_REPO}`) +export function multiply(a, b) { + return a * b +} - return { parentSha, fixSha, diff } +export function subtract(a, b) { + return a - b } +`, + ) -function createEvalFile(parentSha: string, fixSha: string, diff: string) { - console.log('\n=== Creating eval file ===') + fs.writeFileSync( + path.join(PROJECT_DIR, 'test.js'), + `import { add, multiply, subtract } from './index.js' - const evalData: EvalDataV2 = { - repoUrl: `file://${BARE_REPO}`, - generationDate: new Date().toISOString(), - evalCommits: [ - { - id: 'fix-subtract-bug', - sha: fixSha, - parentSha, - spec: 'Fix the subtract function which incorrectly adds instead of subtracting', - prompt: - 'The subtract function in index.js has a bug — it adds the two numbers instead of subtracting them. Fix it. Then run the tests to make sure they pass.', - supplementalFiles: ['test.js'], - fileDiffs: [ - { - path: 'index.js', - status: 'modified', - diff, - }, - ], - }, - ], - } +let passed = 0 +let failed = 0 - const evalPath = path.join(BASE_DIR, 'eval.json') - fs.writeFileSync(evalPath, JSON.stringify(evalData, null, 2)) - console.log(` Eval file: ${evalPath}`) - return evalPath +function assert(name, actual, expected) { + if (actual === expected) { + console.log(\` pass: \${name}\`) + passed++ + } else { + console.log(\` fail: \${name}: expected \${expected}, got \${actual}\`) + failed++ + } } -function setupTargetRepo() { - console.log('\n=== Setting up target repo (for docs output) ===') +console.log('Running tests...') +assert('add(2, 3)', add(2, 3), 5) +assert('multiply(3, 4)', multiply(3, 4), 12) +assert('subtract(10, 3)', subtract(10, 3), 7) + +console.log(\`\\n\${passed} passed, \${failed} failed\`) +if (failed > 0) process.exit(1) +`, + ) + + git('add .', PROJECT_DIR) + git('commit -m "Fix subtract bug 
and add test"', PROJECT_DIR) + + // Add a remote pointing to itself (learn mode needs git remote get-url) + git(`remote add origin file://${PROJECT_DIR}`, PROJECT_DIR) - fs.mkdirSync(TARGET_DIR, { recursive: true }) - git('init', TARGET_DIR) - git('commit --allow-empty -m "init"', TARGET_DIR) - console.log(` Target repo: ${TARGET_DIR}`) - return TARGET_DIR + const commitCount = parseInt( + git('rev-list --count HEAD', PROJECT_DIR), + ) + console.log(` Project dir: ${PROJECT_DIR}`) + console.log(` Commits: ${commitCount}`) } function detectAvailableReviewers(): ReviewerAgentType[] { @@ -226,151 +195,99 @@ function detectAvailableReviewers(): ReviewerAgentType[] { try { execSync('which claude', { stdio: 'ignore' }) reviewers.push('claude') - console.log(' ✓ claude CLI found') + console.log(' reviewer: claude') } catch { - console.log(' ✗ claude CLI not found') + console.log(' claude not found') } try { execSync('which codex', { stdio: 'ignore' }) if (process.env.OPENAI_API_KEY) { reviewers.push('codex') - console.log(' ✓ codex CLI found (OPENAI_API_KEY set)') - } else { - console.log(' ✗ codex CLI found but OPENAI_API_KEY not set') + console.log(' reviewer: codex') } } catch { - console.log(' ✗ codex CLI not found') + // skip } return reviewers } async function main() { - console.log('╔══════════════════════════════════════════╗') - console.log('║ Evalbuff Real E2E Test ║') - console.log('╚══════════════════════════════════════════╝') - console.log(`\nBase dir: ${BASE_DIR}`) + console.log('Evalbuff Real E2E Test') + console.log(`Base dir: ${BASE_DIR}`) - // Detect available agents console.log('\n=== Detecting available agents ===') const reviewers = detectAvailableReviewers() if (reviewers.length === 0) { - console.error('\nNo reviewer agents available. Need at least one of: claude, codex') + console.error('No reviewer agents available. 
Need at least: claude') process.exit(1) } - // Detect coding agent - let agentCommand = '' - try { - execSync('which claude', { stdio: 'ignore' }) - agentCommand = 'claude --dangerously-skip-permissions -p' - console.log(` Using coding agent: ${agentCommand}`) - } catch { - console.error('\nClaude CLI not found. Install with: npm install -g @anthropic-ai/claude-code') - process.exit(1) - } + setupProject() - // Setup - const { parentSha, fixSha, diff } = setupProject() - const evalPath = createEvalFile(parentSha, fixSha, diff) - const targetDir = setupTargetRepo() - - // Run evalbuff - console.log('\n=== Running evalbuff ===') - console.log(` Agent: ${agentCommand}`) - console.log(` Reviewers: ${reviewers.join(', ')}`) - console.log(` Task: fix-subtract-bug`) - console.log('') + // Run evalbuff learn mode against the project's own history + console.log('\n=== Running evalbuff learn mode ===') const startTime = Date.now() try { - await runEvalbuff({ - repoPath: targetDir, - agentCommand, - evalDataPaths: [evalPath], - maxIterations: 1, + await runLearnMode({ + mode: 'learn', + repoPath: PROJECT_DIR, + agentCommand: 'codebuff --agent base2-free', + parallelism: 2, maxCostUsd: 10, - scoreThreshold: 7.0, - agentTimeoutMs: 5 * 60 * 1000, // 5 min for the coding agent + agentTimeoutMs: 5 * 60 * 1000, + commitCount: 10, // only 3 commits in this repo reviewerAgents: reviewers, }) } catch (error) { - console.error('\nEvalbuff failed:', error) + console.error('Evalbuff failed:', error) } const durationMs = Date.now() - startTime // Verify results - console.log('\n=== Verifying results ===') + console.log('\n=== Results ===') - const logPath = path.join(targetDir, 'evalbuff-log.jsonl') + const logPath = path.join(PROJECT_DIR, 'evalbuff-log.jsonl') if (fs.existsSync(logPath)) { const logContent = fs.readFileSync(logPath, 'utf-8').trim() if (logContent) { const entries = logContent.split('\n').map((l) => JSON.parse(l)) console.log(` Log entries: ${entries.length}`) for (const 
entry of entries) { - console.log(` Task: ${entry.taskId}`) - console.log(` Old score: ${entry.oldScore}`) - console.log(` New score: ${entry.newScore ?? 'N/A'}`) - console.log(` Doc edit: ${entry.docEdit ? entry.docEdit.path : 'none'}`) - console.log(` Score comparison: ${entry.scoreComparison ?? 'N/A'}`) - console.log(` Duration: ${(entry.durationMs / 1000).toFixed(1)}s`) - console.log(` Error: ${entry.error ?? 'none'}`) + console.log(` Commit: ${entry.taskId}`) + console.log(` Baseline: ${entry.oldScore}`) + console.log(` After docs: ${entry.newScore ?? 'N/A'}`) + console.log(` Docs: ${entry.docEdit ? entry.docEdit.path : 'none'}`) } - } else { - console.log(' ✗ Log file is empty') } - } else { - console.log(' ✗ Log file not found') } - // Check morning report - const reportFiles = fs - .readdirSync(targetDir) - .filter((f) => f.startsWith('evalbuff-report-')) - if (reportFiles.length > 0) { - console.log(`\n ✓ Morning report: ${reportFiles[0]}`) - const report = fs.readFileSync( - path.join(targetDir, reportFiles[0]), - 'utf-8', - ) - console.log('\n--- Morning Report ---') - console.log(report) - console.log('--- End Report ---') - } else { - console.log(' ✗ No morning report generated') + const statePath = path.join(PROJECT_DIR, 'evalbuff-state.json') + if (fs.existsSync(statePath)) { + const state = JSON.parse(fs.readFileSync(statePath, 'utf-8')) + console.log(` Processed: ${state.processedCommitCount} commits`) + console.log(` Cost: $${state.totalCostUsd.toFixed(2)}`) } - // Check docs - const docsDir = path.join(targetDir, 'docs') + const docsDir = path.join(PROJECT_DIR, 'docs') if (fs.existsSync(docsDir)) { - const docFiles = execSync(`find ${docsDir} -name '*.md'`, { - encoding: 'utf-8', - }).trim() - if (docFiles) { - console.log(`\n ✓ Docs generated:`) - for (const f of docFiles.split('\n')) { + const docs = execSync(`find ${docsDir} -name '*.md'`, { encoding: 'utf-8' }).trim() + if (docs) { + console.log(` Docs generated:`) + for (const f of 
docs.split('\n')) { console.log(` ${f}`) } } } - // Check state - const statePath = path.join(targetDir, 'evalbuff-state.json') - if (fs.existsSync(statePath)) { - const state = JSON.parse(fs.readFileSync(statePath, 'utf-8')) - console.log(`\n ✓ State: ${state.completedTaskIds.length} completed, $${state.totalCostUsd.toFixed(2)} spent`) - } - - console.log(`\n=== E2E test completed in ${(durationMs / 1000).toFixed(1)}s ===`) - console.log(`Base dir (for inspection): ${BASE_DIR}`) - - // Cleanup prompt - console.log(`\nTo clean up: rm -rf ${BASE_DIR}`) + console.log(`\nCompleted in ${(durationMs / 1000).toFixed(1)}s`) + console.log(`Inspect: ${PROJECT_DIR}`) + console.log(`Cleanup: rm -rf ${BASE_DIR}`) } main().catch((error) => { diff --git a/evalbuff/src/run-evalbuff.ts b/evalbuff/src/run-evalbuff.ts index 07800758e..19307a6c3 100644 --- a/evalbuff/src/run-evalbuff.ts +++ b/evalbuff/src/run-evalbuff.ts @@ -2,6 +2,7 @@ import { execSync } from 'child_process' import fs from 'fs' import path from 'path' +import { buildCommitTask, getCommitList } from './commit-task-generator' import { runCliAgent } from './cli-runner' import { getCriteriaForLevel, @@ -14,8 +15,9 @@ import { applyDocEdit, compareScores, readCurrentDocs, + revertDocEdit, } from './docs-optimizer' -import { judgeCommitResult } from './judge' +import { judgeTaskResult } from './judge' import { appendLogEntry, generateMorningReport, @@ -25,51 +27,150 @@ import { withTestRepo } from './test-repo-utils' import type { QualityCriteria } from './criteria' import type { ReviewerAgentType } from './judge' import type { EvalbuffLogEntry } from './morning-report' -import type { EvalCommitV2, EvalDataV2 } from './types' +import type { CommitTask } from './commit-task-generator' -export interface EvalbuffOptions { - repoPath: string - agentCommand: string - evalDataPaths: string[] - maxIterations: number - maxCostUsd: number - scoreThreshold: number - agentTimeoutMs: number - criteriaPath?: string - reviewerAgents?: 
ReviewerAgentType[] -} +// --- State --- interface EvalbuffState { - completedTaskIds: string[] + lastProcessedCommitSha: string | null totalCostUsd: number recentScores: number[] + processedCommitCount: number } function loadState(statePath: string): EvalbuffState { if (fs.existsSync(statePath)) { return JSON.parse(fs.readFileSync(statePath, 'utf-8')) } - return { completedTaskIds: [], totalCostUsd: 0, recentScores: [] } + return { + lastProcessedCommitSha: null, + totalCostUsd: 0, + recentScores: [], + processedCommitCount: 0, + } } function saveState(statePath: string, state: EvalbuffState): void { fs.writeFileSync(statePath, JSON.stringify(state, null, 2)) } -function loadEvalTasks(evalDataPaths: string[]): Array<{ - task: EvalCommitV2 - evalData: EvalDataV2 -}> { - const tasks: Array<{ task: EvalCommitV2; evalData: EvalDataV2 }> = [] - for (const evalPath of evalDataPaths) { - const evalData: EvalDataV2 = JSON.parse( - fs.readFileSync(evalPath, 'utf-8'), +// --- Shared options --- + +export interface EvalbuffOptions { + repoPath: string + agentCommand: string + parallelism: number + maxCostUsd: number + agentTimeoutMs: number + criteriaPath?: string + reviewerAgents?: ReviewerAgentType[] + initCommand?: string +} + +export interface LearnOptions extends EvalbuffOptions { + mode: 'learn' + commitCount: number +} + +export interface PromptOptions extends EvalbuffOptions { + mode: 'prompt' + prompt: string +} + +// --- Core: run N agents in parallel, return average score --- + +interface ParallelRunResult { + avgScore: number + scores: number[] + diffs: string[] + agentTraces: string[] // stdout from each agent run (their reasoning/tool calls) + judgings: Array + costEstimate: number +} + +async function runAgentsInParallel(opts: { + agentCommand: string + prompt: string + repoPath: string + repoUrl: string + parentSha: string + initCommand?: string + groundTruthDiff?: string + parallelism: number + agentTimeoutMs: number + criteria: QualityCriteria + 
reviewerAgents?: ReviewerAgentType[] + docsSourcePath: string // path to the repo where docs/ lives +}): Promise { + const { + agentCommand, + prompt, + repoUrl, + parentSha, + initCommand, + groundTruthDiff, + parallelism, + agentTimeoutMs, + criteria, + reviewerAgents, + docsSourcePath, + } = opts + + const runOne = async (idx: number) => { + return withTestRepo( + { repoUrl, parentSha, initCommand }, + async (repoDir) => { + // Copy current docs into the test repo + copyDocsIntoRepo(docsSourcePath, repoDir) + + console.log(` [Run ${idx + 1}/${parallelism}] Running agent...`) + const result = await runCliAgent({ + command: agentCommand, + prompt, + cwd: repoDir, + timeoutMs: agentTimeoutMs, + }) + + const costEstimate = result.durationMs * 0.00001 + + console.log(` [Run ${idx + 1}/${parallelism}] Judging...`) + const judging = await judgeTaskResult({ + taskPrompt: prompt, + agentDiff: result.diff, + groundTruthDiff, + repoDir, + error: result.exitCode !== 0 ? result.stderr : undefined, + criteria, + reviewerAgents, + }) + + return { + score: judging.overallScore, + diff: result.diff, + agentTrace: result.stdout, + judging, + costEstimate, + } + }, ) - for (const commit of evalData.evalCommits) { - tasks.push({ task: commit, evalData }) - } } - return tasks + + const results = await Promise.all( + Array.from({ length: parallelism }, (_, i) => runOne(i)), + ) + + const scores = results.map((r) => r.score) + const avgScore = scores.reduce((a, b) => a + b, 0) / scores.length + const totalCost = results.reduce((a, r) => a + r.costEstimate, 0) + + return { + avgScore, + scores, + diffs: results.map((r) => r.diff), + agentTraces: results.map((r) => r.agentTrace), + judgings: results.map((r) => r.judging), + costEstimate: totalCost, + } } function copyDocsIntoRepo( @@ -89,108 +190,279 @@ function copyDocsIntoRepo( } } -function getContextFiles( - repoDir: string, - commit: EvalCommitV2, -): Record { - const contextFiles: Record = {} - const contextFilePaths = new Set([ - 
...commit.supplementalFiles, - ...commit.fileDiffs.map((fd) => fd.path), - ]) - for (const { status, path: filePath } of commit.fileDiffs) { - if (status === 'added') contextFilePaths.delete(filePath) - } +// --- Iterative doc improvement loop --- - for (const filePath of contextFilePaths) { - try { - const content = execSync( - `git show ${commit.parentSha}:${JSON.stringify(filePath)}`, - { cwd: repoDir, encoding: 'utf-8', maxBuffer: 10 * 1024 * 1024 }, - ) - contextFiles[filePath] = content - } catch { - contextFiles[filePath] = '' +/** + * Run the iterative doc improvement loop for a single task. + * Always analyzes failures. Keeps proposing doc changes until one is rejected. + * Returns the final average score and log info. + */ +async function improveDocs(opts: { + taskId: string + prompt: string + repoPath: string + repoUrl: string + parentSha: string + initCommand?: string + groundTruthDiff?: string + agentCommand: string + parallelism: number + agentTimeoutMs: number + criteria: QualityCriteria + reviewerAgents?: ReviewerAgentType[] +}): Promise<{ + finalScore: number + baselineScore: number + docsKept: Array<{ path: string; reasoning: string }> + docsRejected: Array<{ path: string; reasoning: string }> + totalCost: number +}> { + const { + taskId, + prompt, + repoPath, + repoUrl, + parentSha, + initCommand, + groundTruthDiff, + agentCommand, + parallelism, + agentTimeoutMs, + criteria, + reviewerAgents, + } = opts + + let totalCost = 0 + const docsKept: Array<{ path: string; reasoning: string }> = [] + const docsRejected: Array<{ path: string; reasoning: string }> = [] + + // Step 1: Baseline run + console.log(`\n Running ${parallelism} agents in parallel (baseline)...`) + const baseline = await runAgentsInParallel({ + agentCommand, + prompt, + repoPath, + repoUrl, + parentSha, + initCommand, + groundTruthDiff, + parallelism, + agentTimeoutMs, + criteria, + reviewerAgents, + docsSourcePath: repoPath, + }) + totalCost += baseline.costEstimate + + let 
currentScore = baseline.avgScore + console.log(` Baseline score: ${currentScore.toFixed(1)}/10 (scores: ${baseline.scores.map((s) => s.toFixed(1)).join(', ')})`) + + // Step 2: Iterative doc improvement + let improving = true + while (improving) { + // Pick the worst-scoring judging for analysis + const worstIdx = baseline.judgings.reduce( + (minIdx, j, idx, arr) => + j.overallScore < arr[minIdx].overallScore ? idx : minIdx, + 0, + ) + const worstJudging = baseline.judgings[worstIdx] + const worstDiff = baseline.diffs[worstIdx] + const worstTrace = baseline.agentTraces[worstIdx] + + const currentDocs = readCurrentDocs(repoPath) + + console.log(` Analyzing for doc improvements...`) + const docSuggestion = await analyzeFailure({ + judgeResult: worstJudging, + taskPrompt: prompt, + agentDiff: worstDiff, + agentTrace: worstTrace, + groundTruthDiff, + currentDocs, + }) + + if (!docSuggestion) { + console.log(` No doc suggestion — stopping improvement loop.`) + break + } + + console.log(` Doc suggestion: ${docSuggestion.suggestedDocPath}`) + console.log(` Reasoning: ${docSuggestion.reasoning}`) + + // Save previous content so we can restore on rejection + const docFullPath = path.join(repoPath, 'docs', docSuggestion.suggestedDocPath) + const previousContent = fs.existsSync(docFullPath) + ? 
fs.readFileSync(docFullPath, 'utf-8') + : null + + // Apply doc to the main repo + applyDocEdit(repoPath, docSuggestion.suggestedDocPath, docSuggestion.suggestedContent) + + // Re-run with new docs + console.log(` Re-running ${parallelism} agents with new docs...`) + const rerun = await runAgentsInParallel({ + agentCommand, + prompt, + repoPath, + repoUrl, + parentSha, + initCommand, + groundTruthDiff, + parallelism, + agentTimeoutMs, + criteria, + reviewerAgents, + docsSourcePath: repoPath, + }) + totalCost += rerun.costEstimate + + const comparison = compareScores(currentScore, rerun.avgScore) + console.log(` New score: ${rerun.avgScore.toFixed(1)}/10 (${comparison}) (scores: ${rerun.scores.map((s) => s.toFixed(1)).join(', ')})`) + + if (comparison === 'improved') { + console.log(` Keeping doc: ${docSuggestion.suggestedDocPath}`) + docsKept.push({ + path: docSuggestion.suggestedDocPath, + reasoning: docSuggestion.reasoning, + }) + + // Commit the doc change + try { + execSync('git add docs/ AGENTS.md', { cwd: repoPath, stdio: 'ignore' }) + execSync( + `git commit -m "evalbuff: add ${docSuggestion.suggestedDocPath} (${taskId})"`, + { cwd: repoPath, stdio: 'ignore' }, + ) + } catch { + console.warn('Failed to commit doc change') + } + + currentScore = rerun.avgScore + + // Update baseline data for next iteration + baseline.judgings.splice(0, baseline.judgings.length, ...rerun.judgings) + baseline.diffs.splice(0, baseline.diffs.length, ...rerun.diffs) + baseline.agentTraces.splice(0, baseline.agentTraces.length, ...rerun.agentTraces) + + // Continue loop — try to improve more + } else { + console.log(` Rejecting doc: ${docSuggestion.suggestedDocPath} (score didn't improve)`) + docsRejected.push({ + path: docSuggestion.suggestedDocPath, + reasoning: docSuggestion.reasoning, + }) + + // Revert the doc edit — restore previous content if it existed + if (previousContent !== null) { + // Restore the previously-accepted version + applyDocEdit(repoPath, 
docSuggestion.suggestedDocPath, previousContent) + } else { + revertDocEdit(repoPath, docSuggestion.suggestedDocPath) + } + + // Stop improving for this task + improving = false } } - return contextFiles + + return { + finalScore: currentScore, + baselineScore: baseline.avgScore, + docsKept, + docsRejected, + totalCost, + } } -export async function runEvalbuff(options: EvalbuffOptions): Promise { +// --- Mode: Commit Learning --- + +export async function runLearnMode(options: LearnOptions): Promise { const { repoPath, agentCommand, - evalDataPaths, - maxIterations, + parallelism, maxCostUsd, - scoreThreshold, agentTimeoutMs, criteriaPath, reviewerAgents, + commitCount, + initCommand, } = options const statePath = path.join(repoPath, 'evalbuff-state.json') const logPath = path.join(repoPath, 'evalbuff-log.jsonl') - - // Strip API key env vars — eval data provides test keys for init commands - // but agents need their real API keys to function - const API_KEY_PATTERN = /(_KEY|_SECRET|_TOKEN|_API_KEY)$/i - const stripApiKeys = (env?: Record) => { - if (!env) return undefined - return Object.fromEntries( - Object.entries(env).filter(([k]) => !API_KEY_PATTERN.test(k)), - ) - } - const safeEnv = (evalData: { env?: Record }) => - stripApiKeys(evalData.env) const defaultCriteriaPath = criteriaPath || path.join(repoPath, 'evalbuff-criteria.json') const state = loadState(statePath) let criteria = loadCriteria(defaultCriteriaPath) - const tasks = loadEvalTasks(evalDataPaths) + // Get the repo's remote URL + let repoUrl: string + try { + repoUrl = execSync('git remote get-url origin', { + cwd: repoPath, + encoding: 'utf-8', + }).trim() + } catch { + throw new Error( + `Could not determine remote URL for ${repoPath}. 
Make sure it has an 'origin' remote.`, + ) + } - console.log(`Evalbuff starting:`) + // Get commits to process + const commits = getCommitList( + repoPath, + commitCount, + state.lastProcessedCommitSha || undefined, + ) + + console.log(`Evalbuff Learn Mode:`) console.log(` Repo: ${repoPath}`) + console.log(` Remote: ${repoUrl}`) console.log(` Agent: ${agentCommand}`) + console.log(` Parallelism: ${parallelism}`) console.log(` Reviewer agents: ${(reviewerAgents || ['claude', 'codex']).join(', ')}`) - console.log(` Tasks: ${tasks.length}`) - console.log(` Max iterations: ${maxIterations}`) + console.log(` Commits to process: ${commits.length}`) console.log(` Max cost: $${maxCostUsd}`) - console.log(` Score threshold: ${scoreThreshold}`) console.log(` Criteria level: ${criteria.level}/5`) - console.log(` Completed: ${state.completedTaskIds.length} tasks`) - - let iterations = 0 + console.log( + ` Resumed from: ${state.lastProcessedCommitSha?.slice(0, 8) || '(fresh start)'}`, + ) + console.log(` Previously processed: ${state.processedCommitCount} commits`) - for (const { task, evalData } of tasks) { - // Budget checks - if (iterations >= maxIterations) { - console.log(`Reached max iterations (${maxIterations}). Stopping.`) - break - } + for (const sha of commits) { + // Budget check if (state.totalCostUsd >= maxCostUsd) { console.log( - `Reached max cost ($${state.totalCostUsd.toFixed(2)} >= $${maxCostUsd}). Stopping.`, + `\nReached max cost ($${state.totalCostUsd.toFixed(2)} >= $${maxCostUsd}). 
Stopping.`, ) break } - // Skip completed tasks - if (state.completedTaskIds.includes(task.id)) { - console.log(`Skipping completed task: ${task.id}`) + const shortSha = sha.slice(0, 8) + console.log( + `\n${'='.repeat(60)}\nCommit ${shortSha} (${state.processedCommitCount + 1})\n${'='.repeat(60)}`, + ) + + // Build task from commit + const task = await buildCommitTask(repoPath, sha) + if (!task) { + console.log(`Skipping ${shortSha} (merge commit, initial commit, or too large)`) + state.lastProcessedCommitSha = sha + saveState(statePath, state) continue } - iterations++ + console.log(` Message: ${task.message.split('\n')[0].slice(0, 80)}`) + console.log(` Files: ${task.filesChanged.length}`) + console.log(` Prompt: ${task.prompt.slice(0, 100)}...`) + const iterationStart = Date.now() - console.log( - `\n${'='.repeat(60)}\n[${iterations}/${maxIterations}] Task: ${task.id}\n${'='.repeat(60)}`, - ) let logEntry: EvalbuffLogEntry = { - taskId: task.id, + taskId: shortSha, timestamp: new Date().toISOString(), oldScore: 0, newScore: null, @@ -202,163 +474,36 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise { } try { - // Step 1: Run agent with current docs, then judge in the same repo - console.log(`Running agent on task ${task.id}...`) - const oldJudging = await withTestRepo( - { - repoUrl: evalData.repoUrl, - parentSha: task.parentSha, - initCommand: evalData.initCommand, - env: evalData.env, - }, - async (repoDir) => { - // Copy current docs into the test repo - copyDocsIntoRepo(repoPath, repoDir) - - const result = await runCliAgent({ - command: agentCommand, - prompt: task.prompt, - cwd: repoDir, - timeoutMs: agentTimeoutMs, - env: safeEnv(evalData), - }) - - const contextFiles = getContextFiles(repoDir, task) - logEntry.costUsd += result.durationMs * 0.00001 // ~$0.01/sec rough estimate - - // Judge the result — reviewer agents run IN the repo - // so they can build, test, start the app, use browser tools, etc. 
- console.log(`Judging result with reviewer agents...`) - const judging = await judgeCommitResult({ - commit: task, - contextFiles, - agentDiff: result.diff, - repoDir, - error: result.exitCode !== 0 ? result.stderr : undefined, - criteria, - reviewerAgents, - }) - - return judging - }, - ) - - logEntry.oldScore = oldJudging.overallScore - console.log(`Score: ${oldJudging.overallScore.toFixed(1)}/10 (e2e: ${oldJudging.e2eScore.toFixed(1)})`) - - // Step 2: If score is low, try to improve docs - if (oldJudging.overallScore < scoreThreshold) { - console.log(`Score below threshold (${scoreThreshold}). Analyzing failure...`) - - const groundTruthDiff = task.fileDiffs - .map(({ path: p, diff }) => `--- ${p}\n${diff}`) - .join('\n\n') - - const currentDocs = readCurrentDocs(repoPath) - - const docSuggestion = await analyzeFailure({ - judgeResult: oldJudging, - taskPrompt: task.prompt, - agentDiff: '', // agent diff not preserved after withTestRepo cleanup - groundTruthDiff, - currentDocs, - scoreThreshold, - }) - - if (docSuggestion) { - console.log( - `Doc suggestion: ${docSuggestion.suggestedDocPath} - ${docSuggestion.reasoning}`, - ) - logEntry.docEdit = { - path: docSuggestion.suggestedDocPath, - reasoning: docSuggestion.reasoning, - } - - // Re-run with updated docs on a FRESH repo, judge inside - console.log(`Re-running agent with new doc...`) - const newJudging = await withTestRepo( - { - repoUrl: evalData.repoUrl, - parentSha: task.parentSha, - initCommand: evalData.initCommand, - env: evalData.env, - }, - async (freshRepoDir) => { - copyDocsIntoRepo(repoPath, freshRepoDir) - applyDocEdit( - freshRepoDir, - docSuggestion.suggestedDocPath, - docSuggestion.suggestedContent, - ) - - const result = await runCliAgent({ - command: agentCommand, - prompt: task.prompt, - cwd: freshRepoDir, - timeoutMs: agentTimeoutMs, - env: safeEnv(evalData), - }) - - const contextFiles = getContextFiles(freshRepoDir, task) - logEntry.costUsd += result.durationMs * 0.00001 // ~$0.01/sec 
rough estimate - - console.log(`Re-judging with reviewer agents...`) - return await judgeCommitResult({ - commit: task, - contextFiles, - agentDiff: result.diff, - repoDir: freshRepoDir, - error: result.exitCode !== 0 ? result.stderr : undefined, - criteria, - reviewerAgents, - }) - }, - ) - - logEntry.newScore = newJudging.overallScore - logEntry.scoreComparison = compareScores( - oldJudging.overallScore, - newJudging.overallScore, - ) - - console.log( - `New score: ${newJudging.overallScore.toFixed(1)}/10 (${logEntry.scoreComparison})`, - ) - - // Keep doc if it improved - if (logEntry.scoreComparison === 'improved') { - console.log(`Keeping doc edit: ${docSuggestion.suggestedDocPath}`) - applyDocEdit( - repoPath, - docSuggestion.suggestedDocPath, - docSuggestion.suggestedContent, - ) - - try { - execSync('git add docs/ AGENTS.md', { - cwd: repoPath, - stdio: 'ignore', - }) - execSync( - `git commit -m "evalbuff: add docs for ${task.id}"`, - { - cwd: repoPath, - stdio: 'ignore', - }, - ) - } catch { - console.warn('Failed to commit doc change (may have no changes)') - } - } else { - console.log(`Reverting doc edit (${logEntry.scoreComparison})`) - } + const result = await improveDocs({ + taskId: shortSha, + prompt: task.prompt, + repoPath, + repoUrl, + parentSha: task.parentSha, + initCommand, + groundTruthDiff: task.diff, + agentCommand, + parallelism, + agentTimeoutMs, + criteria, + reviewerAgents, + }) + + logEntry.oldScore = result.baselineScore + logEntry.newScore = + result.docsKept.length > 0 ? result.finalScore : null + logEntry.costUsd = result.totalCost + + if (result.docsKept.length > 0) { + logEntry.docEdit = { + path: result.docsKept.map((d) => d.path).join(', '), + reasoning: result.docsKept.map((d) => d.reasoning).join('; '), } + logEntry.scoreComparison = 'improved' } // Update scores tracking - state.recentScores.push( - logEntry.newScore !== null ? 
logEntry.newScore : logEntry.oldScore, - ) + state.recentScores.push(result.finalScore) // Check criteria promotion const newLevel = maybePromoteCriteria(criteria, state.recentScores) @@ -374,33 +519,142 @@ export async function runEvalbuff(options: EvalbuffOptions): Promise { } catch (error) { const errorMsg = error instanceof Error ? error.message : String(error) - console.error(`Error on task ${task.id}:`, errorMsg) + console.error(`Error on commit ${shortSha}:`, errorMsg) logEntry.error = errorMsg } logEntry.durationMs = Date.now() - iterationStart state.totalCostUsd += logEntry.costUsd - state.completedTaskIds.push(task.id) + state.lastProcessedCommitSha = sha + state.processedCommitCount++ - // Persist state and log appendLogEntry(logPath, logEntry) saveState(statePath, state) } // Generate morning report - console.log('\nGenerating morning report...') + console.log('\nGenerating report...') const report = generateMorningReport(logPath) - const reportPath = path.join( repoPath, `evalbuff-report-${new Date().toISOString().slice(0, 10)}.md`, ) fs.writeFileSync(reportPath, report) - console.log(`Morning report written to: ${reportPath}`) + console.log(`Report written to: ${reportPath}`) console.log(report) } -// CLI entry point +// --- Mode: Prompt --- + +export async function runPromptMode(options: PromptOptions): Promise { + const { + repoPath, + agentCommand, + parallelism, + maxCostUsd, + agentTimeoutMs, + criteriaPath, + reviewerAgents, + prompt, + initCommand, + } = options + + const logPath = path.join(repoPath, 'evalbuff-log.jsonl') + const defaultCriteriaPath = + criteriaPath || path.join(repoPath, 'evalbuff-criteria.json') + + const criteria = loadCriteria(defaultCriteriaPath) + + let repoUrl: string + try { + repoUrl = execSync('git remote get-url origin', { + cwd: repoPath, + encoding: 'utf-8', + }).trim() + } catch { + throw new Error( + `Could not determine remote URL for ${repoPath}. 
Make sure it has an 'origin' remote.`, + ) + } + + // Get current HEAD as the parentSha (agents work on the current state) + const headSha = execSync('git rev-parse HEAD', { + cwd: repoPath, + encoding: 'utf-8', + }).trim() + + console.log(`Evalbuff Prompt Mode:`) + console.log(` Repo: ${repoPath}`) + console.log(` Remote: ${repoUrl}`) + console.log(` Agent: ${agentCommand}`) + console.log(` Parallelism: ${parallelism}`) + console.log(` Reviewer agents: ${(reviewerAgents || ['claude', 'codex']).join(', ')}`) + console.log(` Max cost: $${maxCostUsd}`) + console.log(` Criteria level: ${criteria.level}/5`) + console.log(` Prompt: ${prompt.slice(0, 100)}...`) + + const iterationStart = Date.now() + + const logEntry: EvalbuffLogEntry = { + taskId: 'prompt-mode', + timestamp: new Date().toISOString(), + oldScore: 0, + newScore: null, + docEdit: null, + scoreComparison: null, + costUsd: 0, + durationMs: 0, + criteriaLevel: criteria.level, + } + + try { + const result = await improveDocs({ + taskId: 'prompt-mode', + prompt, + repoPath, + repoUrl, + parentSha: headSha, + initCommand, + // No ground truth diff in prompt mode + agentCommand, + parallelism, + agentTimeoutMs, + criteria, + reviewerAgents, + }) + + logEntry.oldScore = result.baselineScore + logEntry.newScore = + result.docsKept.length > 0 ? 
result.finalScore : null + logEntry.costUsd = result.totalCost + + if (result.docsKept.length > 0) { + logEntry.docEdit = { + path: result.docsKept.map((d) => d.path).join(', '), + reasoning: result.docsKept.map((d) => d.reasoning).join('; '), + } + logEntry.scoreComparison = 'improved' + } + + console.log(`\nResult:`) + console.log(` Baseline score: ${result.baselineScore.toFixed(1)}/10`) + console.log(` Final score: ${result.finalScore.toFixed(1)}/10`) + console.log(` Docs kept: ${result.docsKept.length}`) + console.log(` Docs rejected: ${result.docsRejected.length}`) + console.log(` Cost: $${result.totalCost.toFixed(2)}`) + } catch (error) { + const errorMsg = + error instanceof Error ? error.message : String(error) + console.error(`Error in prompt mode:`, errorMsg) + logEntry.error = errorMsg + } + + logEntry.durationMs = Date.now() - iterationStart + appendLogEntry(logPath, logEntry) +} + +// --- CLI entry point --- + async function main() { const args = process.argv.slice(2) const getArg = (name: string, defaultValue?: string): string => { @@ -409,38 +663,55 @@ async function main() { if (defaultValue !== undefined) return defaultValue throw new Error(`Missing required argument: --${name}`) } + const hasArg = (name: string): boolean => args.includes(`--${name}`) const repoPath = getArg('repo') - const agentCommand = getArg('agent') - const evalDataPaths = getArg('evals').split(',') - const maxIterations = parseInt(getArg('max-iterations', '50')) - const maxCostUsd = parseFloat(getArg('max-cost', '50')) - const scoreThreshold = parseFloat(getArg('score-threshold', '7.0')) + const agentCommand = getArg('agent', 'codebuff --agent base2-free') + const parallelism = parseInt(getArg('parallelism', '5')) + const maxCostUsd = parseFloat(getArg('max-cost', '100')) const agentTimeoutMs = parseInt(getArg('agent-timeout', '300000')) - const criteriaPath = args.includes('--criteria') - ? 
getArg('criteria') - : undefined - const reviewerAgentsArg = args.includes('--reviewers') + const criteriaPath = hasArg('criteria') ? getArg('criteria') : undefined + const initCommand = hasArg('init-command') ? getArg('init-command') : undefined + const reviewerAgentsArg = hasArg('reviewers') ? getArg('reviewers') : undefined const reviewerAgents = reviewerAgentsArg ? (reviewerAgentsArg.split(',') as ReviewerAgentType[]) : undefined - await runEvalbuff({ - repoPath, - agentCommand, - evalDataPaths, - maxIterations, - maxCostUsd, - scoreThreshold, - agentTimeoutMs, - criteriaPath, - reviewerAgents, - }) + if (hasArg('prompt')) { + // Prompt mode + const prompt = getArg('prompt') + await runPromptMode({ + mode: 'prompt', + repoPath, + agentCommand, + parallelism, + maxCostUsd, + agentTimeoutMs, + criteriaPath, + reviewerAgents, + prompt, + initCommand, + }) + } else { + // Learn mode (default) + const commitCount = parseInt(getArg('commits', '500')) + await runLearnMode({ + mode: 'learn', + repoPath, + agentCommand, + parallelism, + maxCostUsd, + agentTimeoutMs, + criteriaPath, + reviewerAgents, + commitCount, + initCommand, + }) + } } -// Only run CLI when executed directly (not when imported) if (import.meta.main) { main().catch((error) => { console.error('Evalbuff failed:', error) diff --git a/evalbuff/src/trace-compressor.ts b/evalbuff/src/trace-compressor.ts new file mode 100644 index 000000000..995f08b2c --- /dev/null +++ b/evalbuff/src/trace-compressor.ts @@ -0,0 +1,284 @@ +import fs from 'fs' +import path from 'path' + +/** + * A compressed trace where large tool results are stored in separate files. 
+ * The inline trace keeps the full reasoning + tool calls but replaces + * tool result bodies with pointers like: + * [Tool result stored in: /tmp/evalbuff-traces-xxx/result-003.txt (2847 chars)] + */ +export interface CompressedTrace { + /** The trace with large tool results replaced by file pointers */ + inline: string + /** Directory containing the extracted result files (caller should clean up) */ + traceDir: string +} + +/** Minimum size (chars) for a tool result body to get extracted to a file */ +const EXTRACT_THRESHOLD = 300 + +/** + * Compress an agent trace by extracting large tool results into files. + * + * Supports multiple trace formats: + * 1. JSON-lines streaming (Claude `--output-format stream-json`) + * 2. Structured text with code blocks / indented output + * + * Returns the compressed inline trace + path to the directory of result files. + */ +export function compressTrace( + rawTrace: string, + traceDir: string, +): CompressedTrace { + fs.mkdirSync(traceDir, { recursive: true }) + + // Try JSON-lines first (Claude streaming format) + const jsonResult = tryCompressJsonLines(rawTrace, traceDir) + if (jsonResult) return jsonResult + + // Fall back to heuristic text compression + return compressTextTrace(rawTrace, traceDir) +} + +/** + * Try to parse as JSON-lines (one JSON object per line). + * Claude CLI with --output-format stream-json emits events like: + * {"type":"tool_use","name":"Read","input":{...}} + * {"type":"tool_result","content":"...huge file contents..."} + */ +function tryCompressJsonLines( + rawTrace: string, + traceDir: string, +): CompressedTrace | null { + const lines = rawTrace.split('\n') + + // Quick check: are most non-empty lines valid JSON? 
+ const nonEmpty = lines.filter((l) => l.trim()) + if (nonEmpty.length < 2) return null + + let jsonCount = 0 + for (const line of nonEmpty.slice(0, 10)) { + try { + JSON.parse(line) + jsonCount++ + } catch { + // not json + } + } + if (jsonCount < nonEmpty.length * 0.5) return null + + // Parse and compress + const outputLines: string[] = [] + let fileIdx = 0 + + for (const line of lines) { + const trimmed = line.trim() + if (!trimmed) { + outputLines.push('') + continue + } + + let parsed: any + try { + parsed = JSON.parse(trimmed) + } catch { + outputLines.push(line) + continue + } + + // Check if this is a tool result with large content + if (isToolResultEvent(parsed)) { + const content = extractToolResultContent(parsed) + if (content && content.length > EXTRACT_THRESHOLD) { + const fileName = `result-${String(fileIdx).padStart(3, '0')}.txt` + const filePath = path.join(traceDir, fileName) + fs.writeFileSync(filePath, content) + fileIdx++ + + // Replace content with pointer, keep the rest of the event + const summary = summarizeContent(content) + const compressed = replaceToolResultContent( + parsed, + `[Stored in: ${filePath} (${content.length} chars) — ${summary}]`, + ) + outputLines.push(JSON.stringify(compressed)) + continue + } + } + + outputLines.push(line) + } + + return { + inline: outputLines.join('\n'), + traceDir, + } +} + +/** + * Heuristic compression for unstructured text traces. + * Detects large blocks (code fences, indented blocks, long output runs) + * and extracts them to files. + */ +function compressTextTrace( + rawTrace: string, + traceDir: string, +): CompressedTrace { + const lines = rawTrace.split('\n') + const outputLines: string[] = [] + let fileIdx = 0 + let i = 0 + + while (i < lines.length) { + // Detect code fence blocks: ``` ... 
``` + if (lines[i].trim().startsWith('```')) { + const blockStart = i + const openFence = lines[i].trim() + i++ + const blockLines: string[] = [lines[blockStart]] + + // Find closing fence + while (i < lines.length) { + blockLines.push(lines[i]) + if (lines[i].trim() === '```' || lines[i].trim() === openFence) { + i++ + break + } + i++ + } + + const blockContent = blockLines.join('\n') + if (blockContent.length > EXTRACT_THRESHOLD) { + const fileName = `result-${String(fileIdx).padStart(3, '0')}.txt` + const filePath = path.join(traceDir, fileName) + fs.writeFileSync(filePath, blockContent) + fileIdx++ + const summary = summarizeContent(blockContent) + outputLines.push( + `[Code block stored in: ${filePath} (${blockContent.length} chars) — ${summary}]`, + ) + } else { + outputLines.push(...blockLines) + } + continue + } + + // Detect indented blocks (4+ spaces or tab) — common for tool output + if (/^(?: |\t)/.test(lines[i]) && i + 1 < lines.length) { + const blockStart = i + const blockLines: string[] = [] + while (i < lines.length && (/^(?: |\t)/.test(lines[i]) || lines[i].trim() === '')) { + blockLines.push(lines[i]) + i++ + } + + // Only extract if it's a large block (not just 2-3 indented lines) + const blockContent = blockLines.join('\n') + if (blockContent.length > EXTRACT_THRESHOLD && blockLines.length > 5) { + const fileName = `result-${String(fileIdx).padStart(3, '0')}.txt` + const filePath = path.join(traceDir, fileName) + fs.writeFileSync(filePath, blockContent) + fileIdx++ + const summary = summarizeContent(blockContent) + outputLines.push( + `[Indented block stored in: ${filePath} (${blockContent.length} chars, ${blockLines.length} lines) — ${summary}]`, + ) + } else { + outputLines.push(...blockLines) + } + continue + } + + outputLines.push(lines[i]) + i++ + } + + return { + inline: outputLines.join('\n'), + traceDir, + } +} + +// --- Helpers --- + +/** Check if a parsed JSON event is a tool result */ +function isToolResultEvent(event: any): boolean 
{ + if (!event || typeof event !== 'object') return false + // Claude streaming: {"type":"tool_result",...} or {"type":"content_block_delta","delta":{"type":"tool_result",...}} + if (event.type === 'tool_result') return true + if (event.type === 'content_block_stop' && event.content_block?.type === 'tool_result') return true + // Codex: {"type":"function_result",...} + if (event.type === 'function_result') return true + // Generic: anything with a large "content" or "output" or "result" field + for (const key of ['content', 'output', 'result', 'text']) { + if (typeof event[key] === 'string' && event[key].length > EXTRACT_THRESHOLD) return true + } + return false +} + +/** Extract the large content body from a tool result event */ +function extractToolResultContent(event: any): string | null { + // Try common field names in order of specificity + for (const key of ['content', 'output', 'result', 'text']) { + if (typeof event[key] === 'string') return event[key] + // Nested: event.content[0].text (Claude format) + if (Array.isArray(event[key])) { + const texts = event[key] + .filter((item: any) => typeof item === 'object' && typeof item.text === 'string') + .map((item: any) => item.text) + if (texts.length > 0) return texts.join('\n') + } + } + // Check nested delta + if (event.delta && typeof event.delta === 'object') { + return extractToolResultContent(event.delta) + } + return null +} + +/** Replace the content body in a tool result event with a pointer string */ +function replaceToolResultContent(event: any, pointer: string): any { + const clone = { ...event } + for (const key of ['content', 'output', 'result', 'text']) { + if (typeof clone[key] === 'string') { + clone[key] = pointer + return clone + } + if (Array.isArray(clone[key])) { + clone[key] = [{ type: 'text', text: pointer }] + return clone + } + } + if (clone.delta) { + clone.delta = replaceToolResultContent({ ...clone.delta }, pointer) + } + return clone +} + +/** Generate a short summary of content 
for the inline pointer */ +function summarizeContent(content: string): string { + const firstLine = content.split('\n').find((l) => l.trim())?.trim() || '' + const lineCount = content.split('\n').length + + // Detect content type + if (content.includes('```')) return `code block, ${lineCount} lines` + if (firstLine.startsWith('{') || firstLine.startsWith('[')) return `JSON, ${lineCount} lines` + if (firstLine.match(/^\s*\d+[→|│:]/)) return `file content, ${lineCount} lines` + if (firstLine.startsWith('diff ') || firstLine.startsWith('---')) return `diff, ${lineCount} lines` + if (firstLine.startsWith('$') || firstLine.startsWith('>')) return `command output, ${lineCount} lines` + + // Use first line as summary, truncated + const short = firstLine.length > 60 ? firstLine.slice(0, 57) + '...' : firstLine + return `${short} (${lineCount} lines)` +} + +/** + * Clean up a trace directory. + */ +export function cleanupTraceDir(traceDir: string): void { + try { + fs.rmSync(traceDir, { recursive: true, force: true }) + } catch { + // ignore + } +}