diff --git a/agents/base2/base2-free-evals.ts b/agents/base2/base2-free-evals.ts new file mode 100644 index 0000000000..a6489c03e2 --- /dev/null +++ b/agents/base2/base2-free-evals.ts @@ -0,0 +1,8 @@ +import { createBase2 } from './base2' + +const definition = { + ...createBase2('free', { noAskUser: true }), + id: 'base2-free-evals', + displayName: 'Buffy the Free Evals Orchestrator', +} +export default definition diff --git a/bun.lock b/bun.lock index cb61364991..3df586afb9 100644 --- a/bun.lock +++ b/bun.lock @@ -111,8 +111,10 @@ "name": "@codebuff/evalbuff", "version": "1.0.0", "dependencies": { + "@ai-sdk/anthropic": "^2.0.50", "@codebuff/common": "workspace:*", "@codebuff/sdk": "workspace:*", + "ai": "^5.0.0", "zod": "^4.2.1", }, }, diff --git a/cli/src/hooks/helpers/__tests__/send-message.test.ts b/cli/src/hooks/helpers/__tests__/send-message.test.ts index 4247695f7b..7f017deb15 100644 --- a/cli/src/hooks/helpers/__tests__/send-message.test.ts +++ b/cli/src/hooks/helpers/__tests__/send-message.test.ts @@ -35,6 +35,7 @@ const { createBatchedMessageUpdater } = await import( '../../../utils/message-updater' ) import { createPaymentRequiredError } from '@codebuff/sdk' +import type { RunState } from '@codebuff/sdk' const createMockTimerController = (): SendMessageTimerController & { startCalls: string[] @@ -348,7 +349,7 @@ describe('handleRunCompletion', () => { let hasReceivedPlanResponse = false const runState = { - sessionState: null, + sessionState: undefined, output: { type: 'lastMessage' as const, value: [] }, } @@ -372,7 +373,7 @@ describe('handleRunCompletion', () => { expect(chainInProgress).toBe(false) expect(canProcessQueue).toBe(true) expect(isProcessingQueueRef.current).toBe(false) - expect(streamStatus).toBe('idle') + expect(streamStatus as StreamStatus).toBe('idle') }) test('does not process server response when wasAbortedByUser is true', () => { @@ -388,7 +389,7 @@ describe('handleRunCompletion', () => { let hasReceivedPlanResponse = false const 
runState = { - sessionState: null, + sessionState: undefined, output: { type: 'lastMessage' as const, value: [{ type: 'text' as const, text: 'Server response that should be ignored' }], @@ -431,7 +432,7 @@ describe('handleRunCompletion', () => { let canProcessQueueCalled = false const runState = { - sessionState: null, + sessionState: undefined, output: { type: 'lastMessage' as const, value: [] }, } @@ -929,7 +930,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves // Abort handler fires synchronously: UI is updated, but chain lock stays held expect(streamRefsA.state.wasAbortedByUser).toBe(true) - expect(streamStatus).toBe('idle') // UI shows idle + expect(streamStatus as StreamStatus).toBe('idle') // UI shows idle expect(chainInProgress).toBe(true) // But chain lock is still held! // --- PHASE 3: User types run B — verify it's BLOCKED --- @@ -952,8 +953,8 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves // Simulate what happens in useSendMessage after `await client.run(runConfig)`: // 1. previousRunStateRef.current = runState (state saved) // 2. 
handleRunCompletion is called - const runStateFromA = { - sessionState: { conversationId: 'conv-123', history: ['user msg A', 'partial assistant response'] }, + const runStateFromA: RunState = { + sessionState: { conversationId: 'conv-123', history: ['user msg A', 'partial assistant response'] } as any, output: { type: 'lastMessage' as const, value: [{ type: 'text' as const, text: 'partial' }] }, } @@ -991,11 +992,11 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves expect(chainInProgress).toBe(false) expect(canProcessQueue).toBe(true) expect(isProcessingQueueRef.current).toBe(false) - expect(streamStatus).toBe('idle') + expect(streamStatus as StreamStatus).toBe('idle') // The crucial state continuity: previousRunState from A is available for B expect(previousRunState).toBe(runStateFromA) - expect(previousRunState.sessionState).toEqual({ + expect(previousRunState.sessionState as any).toEqual({ conversationId: 'conv-123', history: ['user msg A', 'partial assistant response'], }) @@ -1049,7 +1050,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves let chainInProgress = true const isProcessingQueueRef = { current: false } const isQueuePausedRef = { current: false } - let previousRunState: { sessionState: unknown; output: unknown } | null = null + let previousRunState: RunState | null = null const setStreamStatus = (status: StreamStatus) => { streamStatus = status } const setCanProcessQueue = (can: boolean) => { canProcessQueue = can } @@ -1083,14 +1084,14 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves expect(chainInProgress).toBe(true) // Lock held // client.run() resolves for run A - const runStateA = { + const runStateA: RunState = { sessionState: { id: 'session-abc', messages: [ { role: 'user', content: 'first message' }, { role: 'assistant', content: 'partial response before cancel' }, ], - }, + } as any, output: { type: 'lastMessage' as const, value: [] }, } 
previousRunState = runStateA @@ -1146,7 +1147,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves // In the real code, this is: previousRunState: previousRunStateRef.current // passed to createRunConfig expect(previousRunState).toBe(runStateA) - expect(previousRunState!.sessionState).toEqual({ + expect(previousRunState!.sessionState as any).toEqual({ id: 'session-abc', messages: [ { role: 'user', content: 'first message' }, @@ -1155,7 +1156,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves }) // Simulate run B completing normally - const runStateB = { + const runStateB: RunState = { sessionState: { id: 'session-abc', messages: [ @@ -1164,7 +1165,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves { role: 'user', content: 'second message' }, { role: 'assistant', content: 'full response to second message' }, ], - }, + } as any, output: { type: 'lastMessage' as const, value: [{ type: 'text' as const, text: 'full response' }] }, } previousRunState = runStateB @@ -1186,7 +1187,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves }) // Final state: both runs' messages are preserved in session history - expect(previousRunState!.sessionState).toEqual({ + expect(previousRunState!.sessionState as any).toEqual({ id: 'session-abc', messages: [ { role: 'user', content: 'first message' }, diff --git a/docs/patterns/handle-steps-generators.md b/docs/patterns/handle-steps-generators.md deleted file mode 100644 index a3db4b672f..0000000000 --- a/docs/patterns/handle-steps-generators.md +++ /dev/null @@ -1,180 +0,0 @@ -# handleSteps Generator Pattern for Programmatic Agents - -When creating agents that use `handleSteps` generators to programmatically execute tool calls, follow these exact patterns to avoid TypeScript compilation errors. 
- -## Correct handleSteps Signature - -```typescript -import type { AgentDefinition } from '../types/agent-definition' - -const definition: AgentDefinition = { - // ... other fields - - handleSteps: function* ({ agentState, prompt, params }) { - // Generator body - }, -} -``` - -## Yielding Tool Calls - -Yield objects with `toolName` and `input` properties. The input schema must match the tool's expected parameters exactly. - -### spawn_agents Tool - -```typescript -handleSteps: function* ({ agentState, prompt, params }) { - const promptWithDefault = prompt ?? 'Default prompt' - - yield { - toolName: 'spawn_agents', - input: { - agents: [ - { - agent_type: 'agent-id-1', - prompt: promptWithDefault, - }, - { - agent_type: 'agent-id-2', - prompt: promptWithDefault, - }, - ], - }, - } - - // After tool execution, yield 'STEP' to let the agent process results - yield 'STEP' -}, -``` - -### Common Mistakes - -**WRONG:** Using incorrect property names or nested structures -```typescript -// ❌ Incorrect - wrong tool call structure -yield { - type: 'tool_call', - name: 'spawn_agents', - arguments: { ... } -} -``` - -**WRONG:** Using `think_deeply` or custom tool names that don't exist -```typescript -// ❌ Incorrect - this tool doesn't exist -yield { - toolName: 'think_deeply', - input: { ... } -} -``` - -**CORRECT:** Use `toolName` and `input` at the top level -```typescript -// ✅ Correct -yield { - toolName: 'spawn_agents', - input: { - agents: [{ agent_type: 'my-agent', prompt: 'Do something' }] - } -} -``` - -## Yielding STEP - -After yielding tool calls, yield the string `'STEP'` to let the main agent process the results: - -```typescript -handleSteps: function* ({ prompt }) { - yield { - toolName: 'spawn_agents', - input: { agents: [...] }, - } - - // This tells the runtime to run an LLM step to process spawn results - yield 'STEP' -}, -``` - -## Agent Definition Requirements for Spawning - -Agents that spawn sub-agents must include: - -1. 
`toolNames: ['spawn_agents']` - Enable the spawn tool -2. `spawnableAgents: ['agent-id-1', 'agent-id-2']` - List allowed sub-agents - -```typescript -const definition: AgentDefinition = { - id: 'coordinator', - model: 'openai/gpt-5', - toolNames: ['spawn_agents'], - spawnableAgents: ['sub-agent-1', 'sub-agent-2', 'sub-agent-3'], - // ... -} -``` - -## Complete Example: Multi-Model Coordinator - -See `.agents/deep-thinking/deep-thinker.ts` for a working example: - -```typescript -import type { AgentDefinition } from '../types/agent-definition' - -const definition: AgentDefinition = { - id: 'deep-thinker', - displayName: 'Deep Thinker Agent', - model: 'openai/gpt-5', - - toolNames: ['spawn_agents'], - spawnableAgents: ['gpt5-thinker', 'sonnet-thinker', 'gemini-thinker'], - - inputSchema: { - prompt: { - type: 'string', - description: 'The topic to analyze', - }, - }, - - outputMode: 'last_message', - - handleSteps: function* ({ prompt }) { - const promptWithDefault = prompt ?? 'Think about this topic' - - yield { - toolName: 'spawn_agents', - input: { - agents: [ - { agent_type: 'gpt5-thinker', prompt: promptWithDefault }, - { agent_type: 'sonnet-thinker', prompt: promptWithDefault }, - { agent_type: 'gemini-thinker', prompt: promptWithDefault }, - ], - }, - } - - yield 'STEP' - }, -} - -export default definition -``` - -## Directory Structure - -Place related agents in subdirectories under `.agents/`: - -``` -.agents/ -└── deep-thinking/ - ├── deep-thinker.ts # Coordinator - ├── deepest-thinker.ts # Meta-coordinator - ├── gpt5-thinker.ts # Sub-agent - ├── sonnet-thinker.ts # Sub-agent - └── gemini-thinker.ts # Sub-agent -``` - -## Avoid Over-Engineering - -When implementing agents: -- Only create files that are directly requested -- Don't add documentation files unless explicitly asked -- Keep agent definitions simple - use `AgentDefinition` type, not custom wrappers -- Don't create factory patterns unless there's clear reuse need \ No newline at end of file diff 
--git a/evalbuff/package.json b/evalbuff/package.json index f3374246dd..ac8a55395f 100644 --- a/evalbuff/package.json +++ b/evalbuff/package.json @@ -14,8 +14,10 @@ "run": "bun run src/run-evalbuff.ts" }, "dependencies": { + "@ai-sdk/anthropic": "^2.0.50", "@codebuff/common": "workspace:*", "@codebuff/sdk": "workspace:*", + "ai": "^5.0.0", "zod": "^4.2.1" } } diff --git a/evalbuff/src/__tests__/e2e.test.ts b/evalbuff/src/__tests__/e2e.test.ts index abc317e998..f1ca599662 100644 --- a/evalbuff/src/__tests__/e2e.test.ts +++ b/evalbuff/src/__tests__/e2e.test.ts @@ -40,14 +40,25 @@ mock.module('../test-repo-utils', () => ({ }, })) -mock.module('../cli-runner', () => ({ - runCliAgent: async () => ({ - diff: 'mock diff content', - durationMs: 1000, - exitCode: 0, - stdout: 'mock stdout', - stderr: '', - }), +mock.module('../runners/codebuff', () => ({ + CodebuffRunner: class { + constructor() {} + async run() { + return { + steps: [{ type: 'text', content: 'mock trace' }], + totalCostUsd: 0.01, + diff: 'mock diff content', + } + } + }, +})) + +mock.module('@codebuff/sdk', () => ({ + CodebuffClient: class { + constructor() {} + async run() { return { output: { type: 'success' }, sessionState: null } } + }, + loadLocalAgents: async () => ({}), })) // Judge returns alternating scores: low (triggers doc edit), then higher (confirms improvement) @@ -126,7 +137,7 @@ describe('evalbuff E2E', () => { await runLearnMode({ mode: 'learn', repoPath: repoDir, - agentCommand: 'echo', + agentId: 'base2-free-evals', parallelism: 1, maxCostUsd: 50, agentTimeoutMs: 10_000, diff --git a/evalbuff/src/__tests__/loop.integration.test.ts b/evalbuff/src/__tests__/loop.integration.test.ts index 334dc545e9..7246261330 100644 --- a/evalbuff/src/__tests__/loop.integration.test.ts +++ b/evalbuff/src/__tests__/loop.integration.test.ts @@ -32,20 +32,30 @@ mock.module('../test-repo-utils', () => ({ }, })) -// Mock CLI runner to return a fake result -mock.module('../cli-runner', () => ({ - runCliAgent: 
async () => { - cliRunnerCallCount++ - return { - diff: 'mock diff content', - durationMs: 1000, - exitCode: 0, - stdout: 'mock stdout', - stderr: '', +// Mock CodebuffRunner to return a fake result +mock.module('../runners/codebuff', () => ({ + CodebuffRunner: class { + constructor() {} + async run() { + cliRunnerCallCount++ + return { + steps: [{ type: 'text', content: 'mock trace' }], + totalCostUsd: 0.01, + diff: 'mock diff content', + } } }, })) +// Mock SDK client and loadLocalAgents +mock.module('@codebuff/sdk', () => ({ + CodebuffClient: class { + constructor() {} + async run() { return { output: { type: 'success' }, sessionState: null } } + }, + loadLocalAgents: async () => ({}), +})) + // Mock judge to return configurable scores mock.module('../judge', () => ({ judgeTaskResult: async () => { @@ -144,7 +154,7 @@ describe('runLearnMode integration', () => { await runLearnMode({ mode: 'learn', repoPath: repoDir, - agentCommand: 'echo', + agentId: 'base2-free-evals', parallelism: 1, maxCostUsd: 100, agentTimeoutMs: 10_000, @@ -190,7 +200,7 @@ describe('runLearnMode integration', () => { await runLearnMode({ mode: 'learn', repoPath: repoDir, - agentCommand: 'echo', + agentId: 'base2-free-evals', parallelism: 1, maxCostUsd: 100, agentTimeoutMs: 10_000, @@ -233,7 +243,7 @@ describe('runLearnMode integration', () => { await runLearnMode({ mode: 'learn', repoPath: repoDir, - agentCommand: 'echo', + agentId: 'base2-free-evals', parallelism: 1, maxCostUsd: 100, agentTimeoutMs: 10_000, @@ -245,10 +255,10 @@ describe('runLearnMode integration', () => { expect(fs.existsSync(logPath)).toBe(false) }) - it('rejects doc edit when score does not improve', async () => { - // Commit1: baseline 4.0, rerun 3.0 (worse) — doc rejected, loop stops. + it('rejects doc edit when score drops significantly', async () => { + // Commit1: baseline 5.0, rerun 2.0 (3-point drop, past 1.5 threshold) — doc rejected. // Commit2: baseline 8.0, analyze returns null. Commit3: baseline 8.0, null. 
- judgeScores = [4.0, 3.0, 8.0, 8.0] + judgeScores = [5.0, 2.0, 8.0, 8.0] analyzeFailureResults = [ { reasoning: 'Tried to help', @@ -262,7 +272,7 @@ describe('runLearnMode integration', () => { await runLearnMode({ mode: 'learn', repoPath: repoDir, - agentCommand: 'echo', + agentId: 'base2-free-evals', parallelism: 1, maxCostUsd: 100, agentTimeoutMs: 10_000, @@ -290,7 +300,7 @@ describe('runPromptMode integration', () => { await runPromptMode({ mode: 'prompt', repoPath: repoDir, - agentCommand: 'echo', + agentId: 'base2-free-evals', parallelism: 1, maxCostUsd: 100, agentTimeoutMs: 10_000, diff --git a/evalbuff/src/commit-task-generator.ts b/evalbuff/src/commit-task-generator.ts index 51357c8291..e85127699d 100644 --- a/evalbuff/src/commit-task-generator.ts +++ b/evalbuff/src/commit-task-generator.ts @@ -1,8 +1,9 @@ import { execSync } from 'child_process' import fs from 'fs' -import os from 'os' import path from 'path' +import { generatePrompt } from './llm' + export interface CommitTask { sha: string parentSha: string @@ -14,6 +15,55 @@ export interface CommitTask { const MAX_DIFF_CHARS = 200_000 +/** + * Commit message patterns that indicate trivial/automated commits not worth + * running agents on. Saves ~10 agent+judge invocations per skipped commit. + */ +const TRIVIAL_COMMIT_PATTERNS = [ + /^bump\b.*\bversion\b/i, + /^v?\d+\.\d+\.\d+$/, // version-only messages like "1.0.635" + /^release\s+v?\d+/i, + /^chore\(release\)/i, + /^update\s+(change|changelog)/i, + /^merge\s+(branch|pull request)/i, +] + +/** + * Returns true if a commit is trivial and should be skipped. + * Checks commit message patterns and whether only package.json version fields changed. 
+ */ +function isTrivialCommit( + message: string, + filesChanged: string[], + diff: string, +): boolean { + const firstLine = message.split('\n')[0].trim() + + // Check message patterns + if (TRIVIAL_COMMIT_PATTERNS.some((p) => p.test(firstLine))) return true + + // Single package.json change that only touches "version" field + if ( + filesChanged.length === 1 && + filesChanged[0].endsWith('package.json') && + diff.length < 1000 + ) { + const addedLines = diff + .split('\n') + .filter((l) => l.startsWith('+') && !l.startsWith('+++')) + const removedLines = diff + .split('\n') + .filter((l) => l.startsWith('-') && !l.startsWith('---')) + const allVersionChanges = + [...addedLines, ...removedLines].every((l) => + /^\s*[+-]\s*"version"/.test(l), + ) + if (allVersionChanges) return true + } + + return false +} + /** * Files that add noise to diffs without useful signal. * Lockfiles are huge and auto-generated — agents shouldn't replicate them. @@ -231,31 +281,14 @@ ${filesSection}## Diff ${diff} \`\`\`` - const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-promptgen-')) - const promptFile = path.join(tmpDir, 'PROMPT_GEN.md') - try { - fs.writeFileSync(promptFile, `${PROMPT_GEN_SYSTEM}\n\n---\n\n${userPrompt}`) - - // IMPORTANT: Run in tmpDir to avoid Claude reading the repo's CLAUDE.md/AGENTS.md, - // which can confuse prompt generation (e.g., generating prompts about evalbuff itself). - const output = execSync( - `claude --dangerously-skip-permissions -p "Read ${promptFile} and follow all instructions. 
Respond with ONLY the task prompt text."`, - { - cwd: tmpDir, - encoding: 'utf-8', - timeout: 2 * 60 * 1000, - stdio: ['ignore', 'pipe', 'pipe'], - maxBuffer: 10 * 1024 * 1024, - }, - ).trim() - + // Use API directly — faster than spawning Claude CLI (~3s vs ~15s) + // and avoids CLAUDE.md/AGENTS.md context pollution + const output = await generatePrompt(PROMPT_GEN_SYSTEM, userPrompt) return output || message } catch { // Fallback to the commit message itself return message - } finally { - fs.rmSync(tmpDir, { recursive: true, force: true }) } } @@ -270,6 +303,12 @@ export async function buildCommitTask( const info = getCommitInfo(repoPath, sha) if (!info) return null + // Skip trivial/automated commits (version bumps, releases, etc.) + if (isTrivialCommit(info.message, info.filesChanged, info.diff)) { + console.log(`Skipping ${sha.slice(0, 8)}: trivial commit (${info.message.split('\n')[0].slice(0, 50)})`) + return null + } + // Skip commits with diffs that exceed our limit if (info.diff.length > MAX_DIFF_CHARS) { console.log(`Skipping ${sha.slice(0, 8)}: diff too large (${info.diff.length} chars)`) diff --git a/evalbuff/src/docs-optimizer.ts b/evalbuff/src/docs-optimizer.ts index 697a0c1b7b..408dffc4c1 100644 --- a/evalbuff/src/docs-optimizer.ts +++ b/evalbuff/src/docs-optimizer.ts @@ -1,8 +1,8 @@ -import { execSync } from 'child_process' import fs from 'fs' import os from 'os' import path from 'path' +import { analyzeFailureViaApi } from './llm' import { compressTrace, cleanupTraceDir } from './trace-compressor' import type { JudgingResult } from './judge' @@ -26,6 +26,20 @@ The docs you write must be **generic enough to be useful across many future task DO NOT write docs that only help with one specific task. If the failure is too task-specific and doesn't reveal a general pattern, respond with: {"skip": true, "reasoning": "Too task-specific to generalize"} +## What Makes Good Agent Docs + +The best docs for AI coding agents are: +1. 
**Maps, not essays** — tell the agent WHERE things are and HOW they connect. "Feature X lives in src/x/, uses the Y pattern from src/shared/y.ts, and must be registered in src/registry.ts" +2. **Decision trees, not philosophy** — "If modifying auth, check src/middleware/auth.ts AND update tests in __tests__/auth.test.ts. If adding a new route, register it in routes.ts." +3. **Anti-patterns with fixes** — "DON'T create new files in the root. DO put utilities in src/shared/. DON'T import from '../../../', DO use the path alias @/" +4. **Concrete examples** — Show a before/after or a correct pattern from the actual codebase. + +Bad docs that HURT agent performance (avoid these): +- Vague principles like "keep code clean" or "follow SOLID" +- Long explanations without actionable takeaways +- Docs that duplicate what's already in the code (comments, types, etc.) +- Over-scoped docs that try to cover everything + ## Using the Agent Trace You may be given the agent's trace (stdout) showing its reasoning process, tool calls, and decisions. This is the most valuable signal — it shows you WHY the agent went wrong, not just WHAT it got wrong. Look for: @@ -34,10 +48,6 @@ You may be given the agent's trace (stdout) showing its reasoning process, tool - **Missing context** — the agent didn't know about a key file, config, or convention - **Wrong approach** — the agent took a fundamentally different approach than needed -The trace shows the full agent reasoning inline, but large tool results (file contents, command output) have been extracted to separate files. You'll see markers like: - [Stored in: /tmp/evalbuff-traces-xxx/result-003.txt (2847 chars) — file content, 84 lines] -You can read these files if you need the full content to understand what the agent saw. - Write docs that address the ROOT CAUSE visible in the trace, not just the symptom visible in the diff. ## Rules @@ -46,10 +56,11 @@ Write docs that address the ROOT CAUSE visible in the trace, not just the sympto 2. 
Do NOT write generic advice like "follow best practices" or "write clean code." 3. Focus on the general PATTERN behind the gap, not the specific gap itself. 4. Write docs that a coding agent will read and immediately know what to do differently on any similar task. -5. Keep docs concise — under 200 lines. Dense information beats verbose explanations. +5. Keep docs concise — under 100 lines. Dense information beats verbose explanations. Every line should be actionable. 6. Use a logical file path that groups related docs together (e.g., "patterns/", "conventions/", "architecture/"). 7. Include examples of correct patterns from the codebase when possible. 8. If a doc already exists on a similar topic, suggest UPDATING it (use the same path) rather than creating a new one. +9. Start the doc with a 1-2 sentence TL;DR that tells the agent the key rule. ## Output Format @@ -102,6 +113,7 @@ export async function analyzeFailure({ groundTruthDiff, currentDocs, editHistory, + commitMessage, }: { judgeResult: JudgingResult taskPrompt: string @@ -110,6 +122,7 @@ export async function analyzeFailure({ groundTruthDiff?: string // optional — not available in prompt mode currentDocs: Record editHistory?: DocEditHistoryEntry[] + commitMessage?: string // original commit message — helps identify patterns }): Promise { const docsContent = Object.entries(currentDocs) .map(([docPath, content]) => `### ${docPath}\n\`\`\`\n${content}\n\`\`\``) @@ -123,7 +136,7 @@ ${groundTruthDiff} : '## Ground Truth\n(Not available — judge should have tested the output directly)' // Compress agent trace: keep reasoning inline, extract large tool results to files - // The doc writer agent can read those files if it needs the full content + // We inline the extracted files into the prompt to avoid extra tool-call roundtrips let compressed: ReturnType | null = null let traceSection = '' @@ -131,26 +144,44 @@ ${groundTruthDiff} const traceDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-traces-')) 
compressed = compressTrace(agentTrace, traceDir) + // Inline extracted trace files to avoid tool-call roundtrips const resultFiles = fs.readdirSync(traceDir).filter((f) => f.endsWith('.txt')) + let inlinedResults = '' + for (const file of resultFiles) { + const content = fs.readFileSync(path.join(traceDir, file), 'utf-8') + // Cap each file to 5KB to avoid bloating the prompt + const capped = content.length > 5000 ? content.slice(0, 5000) + '\n... (truncated)' : content + inlinedResults += `\n### ${file}\n\`\`\`\n${capped}\n\`\`\`\n` + } traceSection = `## Agent Trace (reasoning, tool calls, and decisions) This is the agent's stdout showing its reasoning process, tool calls, and decisions. -Large tool results have been extracted to separate files — you can read them if needed. Look for: what the agent misunderstood, wrong assumptions it made, where it went off track. -${resultFiles.length > 0 ? `**${resultFiles.length} tool result(s) stored in ${traceDir}/** — read any file for full content.\n` : ''} \`\`\` ${compressed.inline} -\`\`\`` +\`\`\` +${inlinedResults ? `\n## Extracted Tool Results\n${inlinedResults}` : ''}` + + // Clean up trace dir immediately since we've inlined everything + cleanupTraceDir(compressed.traceDir) + compressed = null } + const commitSection = commitMessage + ? 
`## Original Commit Message (for pattern context) +${commitMessage} + +` + : '' + const prompt = `${DOC_WRITER_SYSTEM_PROMPT} ## Task Prompt ${taskPrompt} -## Judge Analysis +${commitSection}## Judge Analysis ${judgeResult.analysis} ## Judge Weaknesses Found @@ -180,31 +211,8 @@ Based on the agent's trace (if available), the gap between what the agent did an Respond with ONLY the JSON object.` try { - const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-docwriter-')) - const promptFile = path.join(tmpDir, 'DOC_WRITER_PROMPT.md') - fs.writeFileSync(promptFile, prompt) - - let output: string - try { - // IMPORTANT: Run in tmpDir to avoid Claude reading the repo's CLAUDE.md/AGENTS.md, - // which can pollute the doc writer's analysis with unrelated project context. - output = execSync( - `claude --dangerously-skip-permissions -p "Read the file ${promptFile} and follow all instructions in it. Respond with ONLY the JSON object as specified."`, - { - cwd: tmpDir, - encoding: 'utf-8', - timeout: 5 * 60 * 1000, - stdio: ['ignore', 'pipe', 'pipe'], - maxBuffer: 10 * 1024 * 1024, - }, - ).trim() - } finally { - fs.rmSync(tmpDir, { recursive: true, force: true }) - // Clean up trace files after doc writer is done - if (compressed) { - cleanupTraceDir(compressed.traceDir) - } - } + // Use API directly — faster than spawning Claude CLI and avoids cwd/CLAUDE.md pollution + const output = await analyzeFailureViaApi(prompt) // Try to extract JSON from the output let jsonStr = output @@ -331,18 +339,20 @@ export function revertDocEdit( /** * Compare scores to determine if a doc edit improved things. - * Requires a minimum improvement of 0.3 points to count as "improved" - * to avoid accepting docs based on noise (especially with low parallelism). + * + * With parallelism=5, averages are reasonably stable. A 0.3 threshold + * catches real improvements without being too sensitive to noise. 
 */ -const MIN_IMPROVEMENT_THRESHOLD = 0.3 - export function compareScores( oldScore: number, newScore: number, ): 'improved' | 'same' | 'worse' { const delta = newScore - oldScore - if (delta >= MIN_IMPROVEMENT_THRESHOLD) return 'improved' - if (delta <= -MIN_IMPROVEMENT_THRESHOLD) return 'worse' + const threshold = 0.3 + + if (delta >= threshold) return 'improved' + if (delta <= -threshold) return 'worse' + return 'same' } diff --git a/evalbuff/src/judge.ts b/evalbuff/src/judge.ts index 14ef8bebd6..50cd02fdd7 100644 --- a/evalbuff/src/judge.ts +++ b/evalbuff/src/judge.ts @@ -509,6 +509,10 @@ async function runReviewersAndAggregate( } } + // Use median for qualitative analysis (pick the most representative reviewer) + // but average for scores. Averaging is better because models have consistent + // scoring biases (e.g. GPT-5 scores lower) — median would always pick the + // same model's score, while average blends them. const sorted = validResults.sort( (a, b) => a.overallScore - b.overallScore, ) diff --git a/evalbuff/src/llm.ts b/evalbuff/src/llm.ts new file mode 100644 index 0000000000..36e5eee61e --- /dev/null +++ b/evalbuff/src/llm.ts @@ -0,0 +1,49 @@ +/** + * Direct LLM API calls for evalbuff, replacing Claude CLI spawning. + * + * Using the API directly is 2-5x faster than spawning `claude` CLI: + * - No process startup overhead (~5s saved per call) + * - No CLAUDE.md/AGENTS.md context pollution + * - Plain-text responses; callers parse and validate any JSON themselves + * - Better error handling and retry logic + */ +import { createAnthropic } from '@ai-sdk/anthropic' +import { generateText } from 'ai' + +const anthropic = createAnthropic() + +const DEFAULT_MODEL = 'claude-sonnet-4-6' + +/** + * Generate a task prompt from a commit diff using the LLM API directly. + * Replaces the `claude --dangerously-skip-permissions -p` call in commit-task-generator.ts. 
+ */ +export async function generatePrompt( + systemPrompt: string, + userPrompt: string, +): Promise { + const result = await generateText({ + model: anthropic(DEFAULT_MODEL), + system: systemPrompt, + prompt: userPrompt, + }) + + return result.text.trim() +} + +/** + * Analyze a failure and suggest a doc edit using the LLM API directly. + * Replaces the `claude --dangerously-skip-permissions -p` call in docs-optimizer.ts. + * + * Returns raw JSON string (caller handles parsing). + */ +export async function analyzeFailureViaApi( + prompt: string, +): Promise { + const result = await generateText({ + model: anthropic(DEFAULT_MODEL), + prompt, + }) + + return result.text.trim() +} diff --git a/evalbuff/src/run-e2e-test.ts b/evalbuff/src/run-e2e-test.ts index 56840ed5ee..bb6f576f12 100644 --- a/evalbuff/src/run-e2e-test.ts +++ b/evalbuff/src/run-e2e-test.ts @@ -236,7 +236,7 @@ async function main() { await runLearnMode({ mode: 'learn', repoPath: PROJECT_DIR, - agentCommand: 'codebuff --agent base2-free', + agentId: 'base2-free-evals', parallelism: 2, maxCostUsd: 10, agentTimeoutMs: 5 * 60 * 1000, diff --git a/evalbuff/src/run-evalbuff.ts b/evalbuff/src/run-evalbuff.ts index 54b257c2a6..cac655a1d6 100644 --- a/evalbuff/src/run-evalbuff.ts +++ b/evalbuff/src/run-evalbuff.ts @@ -2,8 +2,9 @@ import { execSync } from 'child_process' import fs from 'fs' import path from 'path' +import { CodebuffClient, loadLocalAgents } from '@codebuff/sdk' + import { buildCommitTask, getCommitList } from './commit-task-generator' -import { runCliAgent } from './cli-runner' import { getCriteriaForLevel, loadCriteria, @@ -22,6 +23,7 @@ import { appendLogEntry, generateMorningReport, } from './morning-report' +import { CodebuffRunner } from './runners/codebuff' import { withTestRepo } from './test-repo-utils' import type { QualityCriteria } from './criteria' @@ -58,7 +60,8 @@ function saveState(statePath: string, state: EvalbuffState): void { export interface EvalbuffOptions { repoPath: 
string - agentCommand: string + agentCommand?: string // deprecated — kept for backward compat with CLI runner + agentId: string // codebuff agent ID, e.g. 'base2-free-evals' parallelism: number maxCostUsd: number agentTimeoutMs: number @@ -89,10 +92,13 @@ interface ParallelRunResult { } async function runAgentsInParallel(opts: { - agentCommand: string + client: CodebuffClient + agentId: string + agentDefinitions: any[] prompt: string repoPath: string repoUrl: string + localRepoPath?: string parentSha: string initCommand?: string groundTruthDiff?: string @@ -103,9 +109,12 @@ async function runAgentsInParallel(opts: { docsSourcePath: string // path to the repo where docs/ lives }): Promise { const { - agentCommand, + client, + agentId, + agentDefinitions, prompt, repoUrl, + localRepoPath, parentSha, initCommand, groundTruthDiff, @@ -118,20 +127,53 @@ async function runAgentsInParallel(opts: { const runOne = async (idx: number) => { return withTestRepo( - { repoUrl, parentSha, initCommand }, + { repoUrl, localRepoPath, parentSha, initCommand }, async (repoDir) => { // Copy current docs into the test repo copyDocsIntoRepo(docsSourcePath, repoDir) - console.log(` [Run ${idx + 1}/${parallelism}] Running agent...`) - const result = await runCliAgent({ - command: agentCommand, - prompt, + console.log(` [Run ${idx + 1}/${parallelism}] Running agent via SDK...`) + const shortSha = parentSha.slice(0, 8) + const runner = new CodebuffRunner({ cwd: repoDir, - timeoutMs: agentTimeoutMs, + client, + agentId, + localAgentDefinitions: agentDefinitions, + printEvents: false, + commitId: shortSha, + parentSha, }) - const costEstimate = result.durationMs * 0.00001 + let result: Awaited> + try { + result = await runner.run(prompt) + } catch (runError) { + // Infrastructure errors (503s, timeouts) should not produce a 0 score. + // Return a sentinel so the caller can detect and handle it. + const errMsg = runError instanceof Error ? 
runError.message : String(runError) + console.warn(` [Run ${idx + 1}/${parallelism}] Agent failed: ${errMsg.slice(0, 200)}`) + return { + score: -1, // sentinel: infrastructure failure + diff: '', + agentTrace: `Agent error: ${errMsg}`, + judging: { + analysis: `Agent failed: ${errMsg.slice(0, 500)}`, + strengths: [], + weaknesses: ['Agent failed due to infrastructure error'], + e2eTestsPerformed: [], + completionScore: -1, + codeQualityScore: -1, + e2eScore: -1, + overallScore: -1, + }, + costEstimate: 0, + } + } + + // Serialize trace steps as JSON for the doc writer to analyze + const agentTrace = result.steps + .map((step) => JSON.stringify(step)) + .join('\n') console.log(` [Run ${idx + 1}/${parallelism}] Judging...`) const judging = await judgeTaskResult({ @@ -139,7 +181,7 @@ async function runAgentsInParallel(opts: { agentDiff: result.diff, groundTruthDiff, repoDir, - error: result.exitCode !== 0 ? result.stderr : undefined, + error: result.diff === '' ? 'Agent made no changes' : undefined, criteria, reviewerAgents, }) @@ -147,21 +189,40 @@ async function runAgentsInParallel(opts: { return { score: judging.overallScore, diff: result.diff, - agentTrace: result.stdout, + agentTrace, judging, - costEstimate, + costEstimate: result.totalCostUsd, } }, ) } - const results = await Promise.all( + const allResults = await Promise.all( Array.from({ length: parallelism }, (_, i) => runOne(i)), ) + // Filter out infrastructure failures (score === -1) + const results = allResults.filter((r) => r.score >= 0) + const totalCost = allResults.reduce((a, r) => a + r.costEstimate, 0) + + if (results.length === 0) { + console.warn(` All ${parallelism} agent runs failed (infrastructure errors)`) + return { + avgScore: -1, + scores: [], + diffs: [], + agentTraces: allResults.map((r) => r.agentTrace), + judgings: [], + costEstimate: totalCost, + } + } + + if (results.length < allResults.length) { + console.warn(` ${allResults.length - results.length}/${allResults.length} runs 
failed, using ${results.length} valid results`) + } + const scores = results.map((r) => r.score) const avgScore = scores.reduce((a, b) => a + b, 0) / scores.length - const totalCost = results.reduce((a, r) => a + r.costEstimate, 0) return { avgScore, @@ -227,12 +288,16 @@ function copyDocsIntoRepo( async function improveDocs(opts: { taskId: string prompt: string + commitMessage?: string repoPath: string repoUrl: string + localRepoPath?: string parentSha: string initCommand?: string groundTruthDiff?: string - agentCommand: string + client: CodebuffClient + agentId: string + agentDefinitions: any[] parallelism: number agentTimeoutMs: number criteria: QualityCriteria @@ -247,12 +312,16 @@ async function improveDocs(opts: { const { taskId, prompt, + commitMessage, repoPath, repoUrl, + localRepoPath, parentSha, initCommand, groundTruthDiff, - agentCommand, + client, + agentId, + agentDefinitions, parallelism, agentTimeoutMs, criteria, @@ -266,10 +335,13 @@ async function improveDocs(opts: { // Step 1: Baseline run console.log(`\n Running ${parallelism} agents in parallel (baseline)...`) const baseline = await runAgentsInParallel({ - agentCommand, + client, + agentId, + agentDefinitions, prompt, repoPath, repoUrl, + localRepoPath, parentSha, initCommand, groundTruthDiff, @@ -284,6 +356,31 @@ async function improveDocs(opts: { let currentScore = baseline.avgScore console.log(` Baseline score: ${currentScore.toFixed(1)}/10 (scores: ${baseline.scores.map((s) => s.toFixed(1)).join(', ')})`) + // All agents failed — skip this task entirely + if (currentScore < 0) { + console.log(` All agent runs failed, skipping task.`) + return { + finalScore: 0, + baselineScore: 0, + docsKept: [], + docsRejected: [], + totalCost, + } + } + + // Early stopping: if baseline is already excellent, skip improvement loop + const EARLY_STOP_THRESHOLD = 9.0 + if (currentScore >= EARLY_STOP_THRESHOLD) { + console.log(` Baseline score ${currentScore.toFixed(1)} >= ${EARLY_STOP_THRESHOLD}, skipping 
improvement loop.`) + return { + finalScore: currentScore, + baselineScore: baseline.avgScore, + docsKept: [], + docsRejected: [], + totalCost: totalCost, + } + } + // Step 2: Iterative doc improvement let improving = true const MAX_IMPROVEMENT_ITERATIONS = 5 @@ -319,6 +416,7 @@ async function improveDocs(opts: { groundTruthDiff, currentDocs, editHistory, + commitMessage, }) if (!docSuggestion) { @@ -341,10 +439,13 @@ async function improveDocs(opts: { // Re-run with new docs console.log(` Re-running ${parallelism} agents with new docs...`) const rerun = await runAgentsInParallel({ - agentCommand, + client, + agentId, + agentDefinitions, prompt, repoPath, repoUrl, + localRepoPath, parentSha, initCommand, groundTruthDiff, @@ -356,11 +457,25 @@ async function improveDocs(opts: { }) totalCost += rerun.costEstimate + // If re-run failed entirely, don't count it as a rejection + if (rerun.avgScore < 0) { + console.log(` Re-run failed (infrastructure errors), reverting doc and retrying later.`) + if (previousContent !== null) { + applyDocEdit(repoPath, docSuggestion.suggestedDocPath, previousContent) + } else { + revertDocEdit(repoPath, docSuggestion.suggestedDocPath) + } + break + } + const comparison = compareScores(currentScore, rerun.avgScore) console.log(` New score: ${rerun.avgScore.toFixed(1)}/10 (${comparison}) (scores: ${rerun.scores.map((s) => s.toFixed(1)).join(', ')})`) - if (comparison === 'improved') { - console.log(` Keeping doc: ${docSuggestion.suggestedDocPath}`) + if (comparison === 'improved' || comparison === 'same') { + // 'improved' = clear signal the doc helps + // 'same' = within noise range — keep it (benefit of the doubt) + const reason = comparison === 'improved' ? 
'score improved' : 'within noise range, keeping' + console.log(` Keeping doc: ${docSuggestion.suggestedDocPath} (${reason})`) docsKept.push({ path: docSuggestion.suggestedDocPath, reasoning: docSuggestion.reasoning, @@ -388,7 +503,7 @@ async function improveDocs(opts: { // Continue loop — try to improve more } else { - console.log(` Rejecting doc: ${docSuggestion.suggestedDocPath} (score didn't improve)`) + console.log(` Rejecting doc: ${docSuggestion.suggestedDocPath} (score dropped significantly)`) docsRejected.push({ path: docSuggestion.suggestedDocPath, reasoning: docSuggestion.reasoning, @@ -423,7 +538,7 @@ async function improveDocs(opts: { export async function runLearnMode(options: LearnOptions): Promise { const { repoPath, - agentCommand, + agentId, parallelism, maxCostUsd, agentTimeoutMs, @@ -441,6 +556,13 @@ export async function runLearnMode(options: LearnOptions): Promise { const state = loadState(statePath) let criteria = loadCriteria(defaultCriteriaPath) + // Initialize codebuff SDK client and load agent definitions + const client = new CodebuffClient({ cwd: repoPath }) + const agentsDir = path.resolve(__dirname, '../../agents') + const loadedAgents = await loadLocalAgents({ agentsPath: agentsDir }) + const agentDefinitions = Object.values(loadedAgents) + console.log(`Loaded ${agentDefinitions.length} agent definitions from ${agentsDir}`) + // Get the repo's remote URL let repoUrl: string try { @@ -464,7 +586,7 @@ export async function runLearnMode(options: LearnOptions): Promise { console.log(`Evalbuff Learn Mode:`) console.log(` Repo: ${repoPath}`) console.log(` Remote: ${repoUrl}`) - console.log(` Agent: ${agentCommand}`) + console.log(` Agent: ${agentId}`) console.log(` Parallelism: ${parallelism}`) console.log(` Reviewer agents: ${(reviewerAgents || ['claude', 'codex']).join(', ')}`) console.log(` Commits to process: ${commits.length}`) @@ -520,12 +642,16 @@ export async function runLearnMode(options: LearnOptions): Promise { const result = 
await improveDocs({ taskId: shortSha, prompt: task.prompt, + commitMessage: task.message, repoPath, repoUrl, + localRepoPath: repoPath, parentSha: task.parentSha, initCommand, groundTruthDiff: task.diff, - agentCommand, + client, + agentId, + agentDefinitions, parallelism, agentTimeoutMs, criteria, @@ -592,7 +718,7 @@ export async function runLearnMode(options: LearnOptions): Promise { export async function runPromptMode(options: PromptOptions): Promise { const { repoPath, - agentCommand, + agentId, parallelism, maxCostUsd, agentTimeoutMs, @@ -608,6 +734,12 @@ export async function runPromptMode(options: PromptOptions): Promise { const criteria = loadCriteria(defaultCriteriaPath) + // Initialize codebuff SDK client and load agent definitions + const client = new CodebuffClient({ cwd: repoPath }) + const agentsDir = path.resolve(__dirname, '../../agents') + const loadedAgents = await loadLocalAgents({ agentsPath: agentsDir }) + const agentDefinitions = Object.values(loadedAgents) + let repoUrl: string try { repoUrl = execSync('git remote get-url origin', { @@ -629,7 +761,7 @@ export async function runPromptMode(options: PromptOptions): Promise { console.log(`Evalbuff Prompt Mode:`) console.log(` Repo: ${repoPath}`) console.log(` Remote: ${repoUrl}`) - console.log(` Agent: ${agentCommand}`) + console.log(` Agent: ${agentId}`) console.log(` Parallelism: ${parallelism}`) console.log(` Reviewer agents: ${(reviewerAgents || ['claude', 'codex']).join(', ')}`) console.log(` Max cost: $${maxCostUsd}`) @@ -656,10 +788,13 @@ export async function runPromptMode(options: PromptOptions): Promise { prompt, repoPath, repoUrl, + localRepoPath: repoPath, parentSha: headSha, initCommand, // No ground truth diff in prompt mode - agentCommand, + client, + agentId, + agentDefinitions, parallelism, agentTimeoutMs, criteria, @@ -709,7 +844,7 @@ async function main() { const hasArg = (name: string): boolean => args.includes(`--${name}`) const repoPath = getArg('repo') - const agentCommand 
= getArg('agent', 'codebuff --agent base2-free') + const agentId = getArg('agent', 'base2-free-evals') const parallelism = parseInt(getArg('parallelism', '5')) const maxCostUsd = parseFloat(getArg('max-cost', '100')) const agentTimeoutMs = parseInt(getArg('agent-timeout', '300000')) @@ -728,7 +863,7 @@ async function main() { await runPromptMode({ mode: 'prompt', repoPath, - agentCommand, + agentId, parallelism, maxCostUsd, agentTimeoutMs, @@ -743,7 +878,7 @@ async function main() { await runLearnMode({ mode: 'learn', repoPath, - agentCommand, + agentId, parallelism, maxCostUsd, agentTimeoutMs, diff --git a/evalbuff/src/test-repo-utils.ts b/evalbuff/src/test-repo-utils.ts index 60039a3a62..7c1ba6700e 100644 --- a/evalbuff/src/test-repo-utils.ts +++ b/evalbuff/src/test-repo-utils.ts @@ -7,11 +7,16 @@ import { getErrorObject } from '@codebuff/common/util/error' /** * Helper function to manage test repository lifecycle - * Sets up a test repo, runs a function with the repo cwd, then cleans up + * Sets up a test repo, runs a function with the repo cwd, then cleans up. + * + * When localRepoPath is provided, uses a local clone (near-instant via hardlinks) + * instead of a remote clone (5-30s per clone). This is the single biggest + * speedup in evalbuff — with parallelism=5, saves 10-30 remote clones per commit. */ export const withTestRepo = async ( repoConfig: { repoUrl: string + localRepoPath?: string // The sha of the commit to checkout. If you have a commit with changes to replicate, you would check out the parent commit. 
parentSha: string initCommand?: string @@ -19,20 +24,27 @@ export const withTestRepo = async ( }, fn: (cwd: string) => Promise, ): Promise => { - const { repoUrl, parentSha, initCommand, env } = repoConfig + const { repoUrl, localRepoPath, parentSha, initCommand, env } = repoConfig // Create a temporary directory for the test repo const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codebuff-eval-')) const repoDir = path.join(tempDir, 'repo') try { - execSync(`git clone --depth 1 ${repoUrl} ${repoDir}`, { stdio: 'ignore' }) - - execSync(`git fetch --depth 1 origin ${parentSha}`, { - cwd: repoDir, - stdio: 'ignore', - }) - execSync(`git checkout ${parentSha}`, { cwd: repoDir, stdio: 'ignore' }) + if (localRepoPath) { + // Local clone: uses hardlinks for objects, nearly instant + execSync(`git clone --no-checkout "${localRepoPath}" "${repoDir}"`, { stdio: 'ignore' }) + execSync(`git checkout ${parentSha}`, { cwd: repoDir, stdio: 'ignore' }) + } else { + // Remote clone: slow but works without local repo + execSync(`git clone --depth 1 ${repoUrl} ${repoDir}`, { stdio: 'ignore' }) + + execSync(`git fetch --depth 1 origin ${parentSha}`, { + cwd: repoDir, + stdio: 'ignore', + }) + execSync(`git checkout ${parentSha}`, { cwd: repoDir, stdio: 'ignore' }) + } if (initCommand) { console.log(`Running init command: ${initCommand}...`)