Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 47 additions & 10 deletions evalbuff/src/commit-task-generator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,28 @@ export interface CommitTask {

// Size limit for diff text, in characters (200,000).
// NOTE(review): the code that consumes this limit is outside this view —
// presumably longer diffs are truncated or skipped; confirm at the use site.
const MAX_DIFF_CHARS = 200_000

/**
 * Basenames of files that bloat a diff without carrying useful signal.
 * Every entry is a dependency lockfile: large, auto-generated, and not
 * something an agent should be asked to reproduce.
 */
const NOISE_FILE_PATTERNS = [
  'bun.lock',
  'bun.lockb',
  'package-lock.json',
  'yarn.lock',
  'pnpm-lock.yaml',
  'Gemfile.lock',
  'Cargo.lock',
  'poetry.lock',
  'composer.lock',
  'go.sum',
]

/**
 * Report whether a path refers to a noise file.
 *
 * Only the last '/'-separated segment is compared, so a lockfile nested in
 * any directory still matches. The comparison is exact and case-sensitive.
 *
 * @param filePath - Path using '/' separators.
 * @returns `true` when the path's basename is listed in NOISE_FILE_PATTERNS.
 */
function isNoiseFile(filePath: string): boolean {
  // Everything after the final '/' (or the whole string when there is none).
  const lastSlash = filePath.lastIndexOf('/')
  const leaf = filePath.slice(lastSlash + 1)
  return NOISE_FILE_PATTERNS.some((name) => name === leaf)
}

/**
* Get a list of commits from the repo, oldest first.
* Starts from `startAfterSha` (exclusive) or HEAD~commitCount if no state.
Expand Down Expand Up @@ -68,19 +90,24 @@ export function getCommitInfo(
encoding: 'utf-8',
}).trim()

// Get diff
const diff = execSync(`git diff ${parentSha} ${sha}`, {
cwd: repoPath,
encoding: 'utf-8',
maxBuffer: 10 * 1024 * 1024,
})

// Get files changed
// Get files changed (filter out noise files like lockfiles)
const filesOutput = execSync(`git diff --name-only ${parentSha} ${sha}`, {
cwd: repoPath,
encoding: 'utf-8',
}).trim()
const filesChanged = filesOutput ? filesOutput.split('\n') : []
const allFiles = filesOutput ? filesOutput.split('\n') : []
const filesChanged = allFiles.filter((f) => !isNoiseFile(f))

// Get diff, excluding noise files (lockfiles etc.)
const excludeArgs = NOISE_FILE_PATTERNS.map((p) => `':!${p}'`).join(' ')
const diff = execSync(
`git diff ${parentSha} ${sha} -- . ${excludeArgs}`,
{
cwd: repoPath,
encoding: 'utf-8',
maxBuffer: 10 * 1024 * 1024,
},
)

return { parentSha, message, diff, filesChanged }
} catch {
Expand Down Expand Up @@ -124,6 +151,7 @@ function readFilesAtParent(

for (const filePath of filesChanged) {
if (totalSize >= maxTotalSize) break
if (isNoiseFile(filePath)) continue

const content = readFileAtCommit(repoPath, parentSha, filePath)
if (content != null && content.length > 0) {
Expand Down Expand Up @@ -209,9 +237,12 @@ ${diff}
try {
fs.writeFileSync(promptFile, `${PROMPT_GEN_SYSTEM}\n\n---\n\n${userPrompt}`)

// IMPORTANT: Run in tmpDir to avoid Claude reading the repo's CLAUDE.md/AGENTS.md,
// which can confuse prompt generation (e.g., generating prompts about evalbuff itself).
const output = execSync(
`claude --dangerously-skip-permissions -p "Read ${promptFile} and follow all instructions. Respond with ONLY the task prompt text."`,
{
cwd: tmpDir,
encoding: 'utf-8',
timeout: 2 * 60 * 1000,
stdio: ['ignore', 'pipe', 'pipe'],
Expand Down Expand Up @@ -245,11 +276,17 @@ export async function buildCommitTask(
return null
}

// Skip commits with no meaningful code changes
// Skip commits with no meaningful code changes (after filtering noise files)
if (info.filesChanged.length === 0) {
return null
}

// Skip commits where the diff is empty after filtering noise files
if (info.diff.trim().length === 0) {
console.log(`Skipping ${sha.slice(0, 8)}: only noise files changed (lockfiles, etc.)`)
return null
}

const prompt = await generatePromptFromCommit(
repoPath,
info.parentSha,
Expand Down
42 changes: 40 additions & 2 deletions evalbuff/src/docs-optimizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,25 +63,53 @@ You MUST respond with ONLY a JSON object (no markdown fences, no explanation). T
Or if too task-specific:
{"skip": true, "reasoning": "explanation"}`

/**
 * Render earlier doc-edit attempts as a markdown section for the prompt.
 *
 * Each entry becomes one bullet showing its outcome (upper-cased), the doc
 * path, an optional "before → after" score pair, and the recorded reasoning.
 *
 * @param history - Previously tried edits; may be omitted or empty.
 * @returns The markdown section, or an empty string when there is no history.
 */
function formatEditHistory(history?: DocEditHistoryEntry[]): string {
  if (history == null || history.length < 1) {
    return ''
  }

  const bullets: string[] = []
  for (const { outcome, path: docPath, reasoning, scoreBefore, scoreAfter } of history) {
    // The score pair is shown only when both ends were recorded.
    let scoreNote = ''
    if (scoreBefore != null && scoreAfter != null) {
      scoreNote = ` (score: ${scoreBefore.toFixed(1)} → ${scoreAfter.toFixed(1)})`
    }
    bullets.push(
      `- **${outcome.toUpperCase()}**: \`${docPath}\`${scoreNote}\n Reasoning: ${reasoning}`,
    )
  }

  // Heading, blank line, guidance, blank line, then one bullet per entry.
  return [
    '## Edit History (previous doc edits tried this session)',
    '',
    'Use this history to avoid repeating rejected approaches and to build on what worked.',
    '',
    ...bullets,
  ].join('\n')
}

/**
* Record of one doc edit that was already tried, together with how it fared.
* A list of these is passed to analyzeFailure as `editHistory` and rendered
* into the prompt by formatEditHistory.
*/
export interface DocEditHistoryEntry {
/** Path of the doc file the edit targeted. */
path: string
/** The reasoning that was recorded for the edit. */
reasoning: string
/** Whether the edit was kept ('accepted') or not ('rejected'). */
outcome: 'accepted' | 'rejected'
/** Score before the edit was applied, when known. */
scoreBefore?: number
/** Score measured after the edit was applied, when known. */
scoreAfter?: number
}

/**
* Analyze agent run results and suggest a doc edit to improve future performance.
* Always analyzes — no score threshold check.
* Returns null if the doc writer decides the failure is too task-specific to generalize.
*/

export async function analyzeFailure({
judgeResult,
taskPrompt,
agentDiff,
agentTrace,
groundTruthDiff,
currentDocs,
editHistory,
}: {
judgeResult: JudgingResult
taskPrompt: string
agentDiff: string
agentTrace?: string // stdout from the agent — reasoning, tool calls, errors
groundTruthDiff?: string // optional — not available in prompt mode
currentDocs: Record<string, string>
editHistory?: DocEditHistoryEntry[]
}): Promise<DocSuggestion | null> {
const docsContent = Object.entries(currentDocs)
.map(([docPath, content]) => `### ${docPath}\n\`\`\`\n${content}\n\`\`\``)
Expand Down Expand Up @@ -145,6 +173,8 @@ ${traceSection}
## Current Docs (already available to the agent)
${docsContent || '(No docs yet)'}

${formatEditHistory(editHistory)}

Based on the agent's trace (if available), the gap between what the agent did and what it should have done, and the judge's analysis, write a doc file that captures a GENERAL PATTERN that would help the agent across many similar tasks. Focus on what the agent MISUNDERSTOOD (visible in the trace) rather than just what it got wrong (visible in the diff). If this failure doesn't reveal a generalizable pattern, respond with {"skip": true, "reasoning": "..."}.

Respond with ONLY the JSON object.`
Expand All @@ -156,9 +186,12 @@ Respond with ONLY the JSON object.`

let output: string
try {
// IMPORTANT: Run in tmpDir to avoid Claude reading the repo's CLAUDE.md/AGENTS.md,
// which can pollute the doc writer's analysis with unrelated project context.
output = execSync(
`claude --dangerously-skip-permissions -p "Read the file ${promptFile} and follow all instructions in it. Respond with ONLY the JSON object as specified."`,
{
cwd: tmpDir,
encoding: 'utf-8',
timeout: 5 * 60 * 1000,
stdio: ['ignore', 'pipe', 'pipe'],
Expand Down Expand Up @@ -298,13 +331,18 @@ export function revertDocEdit(

/**
 * Smallest score change, in points, that is treated as a real difference
 * rather than run-to-run noise (especially with low parallelism).
 */
const MIN_IMPROVEMENT_THRESHOLD = 0.3

/**
 * Compare scores to determine if a doc edit improved things.
 *
 * The change must reach MIN_IMPROVEMENT_THRESHOLD in either direction to be
 * classified; anything smaller is reported as 'same' so that docs are not
 * accepted or rejected on noise.
 *
 * @param oldScore - Score before the doc edit.
 * @param newScore - Score after the doc edit.
 * @returns 'improved' when newScore is at least the threshold above oldScore,
 *   'worse' when it is at least the threshold below, otherwise 'same'
 *   (including when either score is NaN).
 */
export function compareScores(
  oldScore: number,
  newScore: number,
): 'improved' | 'same' | 'worse' {
  // Tolerance for binary floating-point error: 5.3 - 5.0 evaluates to
  // 0.2999999999999998, which must still count as a 0.3 step.
  const tolerance = 1e-9
  const delta = newScore - oldScore

  // Classify on the thresholded delta only. A plain `newScore > oldScore`
  // test ahead of this would return early and bypass the threshold entirely.
  if (delta >= MIN_IMPROVEMENT_THRESHOLD - tolerance) return 'improved'
  if (delta <= -(MIN_IMPROVEMENT_THRESHOLD - tolerance)) return 'worse'
  return 'same'
}

Expand Down
47 changes: 45 additions & 2 deletions evalbuff/src/run-evalbuff.ts
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,13 @@ async function runAgentsInParallel(opts: {
}
}

/**
* Copy docs into a test repo and commit them so they don't appear in the agent's diff.
*
* Without this commit, `git diff HEAD` after the agent runs would include
* the pre-copied docs as "new files", corrupting the diff attribution —
* the judge would penalize or credit the agent for docs it didn't create.
*/
function copyDocsIntoRepo(
sourceRepoPath: string,
targetRepoPath: string,
Expand All @@ -182,11 +189,31 @@ function copyDocsIntoRepo(
const targetDocsDir = path.join(targetRepoPath, 'docs')
const targetAgentsMd = path.join(targetRepoPath, 'AGENTS.md')

let copied = false
if (fs.existsSync(sourceDocsDir)) {
fs.cpSync(sourceDocsDir, targetDocsDir, { recursive: true })
copied = true
}
if (fs.existsSync(sourceAgentsMd)) {
fs.cpSync(sourceAgentsMd, targetAgentsMd)
copied = true
}

// Commit the docs so they become part of HEAD — otherwise git diff HEAD
// after the agent runs will include these docs as agent-created changes.
if (copied) {
try {
execSync('git add docs/ AGENTS.md 2>/dev/null; git add -u docs/ AGENTS.md 2>/dev/null', {
cwd: targetRepoPath,
stdio: 'ignore',
})
execSync('git commit -m "evalbuff: pre-load docs" --allow-empty', {
cwd: targetRepoPath,
stdio: 'ignore',
})
} catch {
// If nothing to commit, that's fine
}
}
}

Expand All @@ -213,8 +240,8 @@ async function improveDocs(opts: {
}): Promise<{
finalScore: number
baselineScore: number
docsKept: Array<{ path: string; reasoning: string }>
docsRejected: Array<{ path: string; reasoning: string }>
docsKept: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }>
docsRejected: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }>
totalCost: number
}> {
const {
Expand Down Expand Up @@ -259,7 +286,14 @@ async function improveDocs(opts: {

// Step 2: Iterative doc improvement
let improving = true
const MAX_IMPROVEMENT_ITERATIONS = 5
let iterationCount = 0
while (improving) {
iterationCount++
if (iterationCount > MAX_IMPROVEMENT_ITERATIONS) {
console.log(` Hit max improvement iterations (${MAX_IMPROVEMENT_ITERATIONS}), stopping.`)
break
}
// Pick the worst-scoring judging for analysis
const worstIdx = baseline.judgings.reduce(
(minIdx, j, idx, arr) =>
Expand All @@ -273,13 +307,18 @@ async function improveDocs(opts: {
const currentDocs = readCurrentDocs(repoPath)

console.log(` Analyzing for doc improvements...`)
const editHistory = [
...docsKept.map((d) => ({ ...d, outcome: 'accepted' as const })),
...docsRejected.map((d) => ({ ...d, outcome: 'rejected' as const })),
]
const docSuggestion = await analyzeFailure({
judgeResult: worstJudging,
taskPrompt: prompt,
agentDiff: worstDiff,
agentTrace: worstTrace,
groundTruthDiff,
currentDocs,
editHistory,
})

if (!docSuggestion) {
Expand Down Expand Up @@ -325,6 +364,8 @@ async function improveDocs(opts: {
docsKept.push({
path: docSuggestion.suggestedDocPath,
reasoning: docSuggestion.reasoning,
scoreBefore: currentScore,
scoreAfter: rerun.avgScore,
})

// Commit the doc change
Expand All @@ -351,6 +392,8 @@ async function improveDocs(opts: {
docsRejected.push({
path: docSuggestion.suggestedDocPath,
reasoning: docSuggestion.reasoning,
scoreBefore: currentScore,
scoreAfter: rerun.avgScore,
})

// Revert the doc edit — restore previous content if it existed
Expand Down
Loading