From a5b5a2bf1c2a24f8ba221e715c0da272b317385b Mon Sep 17 00:00:00 2001 From: James Grugett Date: Sun, 29 Mar 2026 16:18:07 -0700 Subject: [PATCH 01/12] evalbuff: add patterns/task-completion-validation.md (fde408c6) --- AGENTS.md | 1 + docs/patterns/task-completion-validation.md | 130 ++++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 docs/patterns/task-completion-validation.md diff --git a/AGENTS.md b/AGENTS.md index ca06ab44c..e5080eda7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -43,3 +43,4 @@ Make an efficient learning agent that can do anything. - [`docs/agents-and-tools.md`](docs/agents-and-tools.md) — Agent system, shell shims, tool definitions - [`docs/patterns/handle-steps-generators.md`](docs/patterns/handle-steps-generators.md) — handleSteps generator patterns and spawn_agents tool calls - [docs/evalbuff/interpreting-task-prompts.md](docs/evalbuff/interpreting-task-prompts.md) +- [docs/patterns/task-completion-validation.md](docs/patterns/task-completion-validation.md) diff --git a/docs/patterns/task-completion-validation.md b/docs/patterns/task-completion-validation.md new file mode 100644 index 000000000..44668fb4c --- /dev/null +++ b/docs/patterns/task-completion-validation.md @@ -0,0 +1,130 @@ +# Task Completion Validation Pattern + +When implementing features, always validate whether the requested functionality already exists before making changes. Many tasks may already be complete or partially implemented. + +## The Validation Process + +### 1. Search for Existing Implementation + +Before implementing any feature, search the codebase for existing implementations: + +```bash +# Search for relevant patterns +code_search "pattern related to feature" +# Check configuration files +read_files ["config-file.js", "next.config.mjs"] +``` + +### 2. Verify Implementation Completeness + +When you find existing code, verify it matches the requirements: + +- **Check the exact pattern**: Does `/b/:hash` match the requested route? 
+- **Verify the destination**: Does `https://go.trybeluga.ai/:hash` match the target? +- **Confirm the behavior**: Is `permanent: false` the correct redirect type? + +### 3. Test the Implementation + +If implementation exists, verify it works: + +```bash +# Look for existing tests +code_search "test.*redirect.*b/" +# Run relevant tests if they exist +run_terminal_command "bun test path/to/redirect/tests" +``` + +### 4. Communicate Task Status Clearly + +Always provide explicit feedback about task completion: + +**✅ When feature is already implemented:** +``` +The redirect for /b/:hash is already implemented in next.config.mjs: + +[show the relevant code] + +This configuration: +- Redirects /b/:hash to https://go.trybeluga.ai/:hash +- Uses temporary redirects (permanent: false) +- Preserves query parameters + +No additional changes needed - the feature is complete! +``` + +**✅ When making changes:** +``` +I need to add the redirect configuration to next.config.mjs: + +[show the changes being made] + +This will enable the requested /b/:hash → go.trybeluga.ai functionality. 
+``` + +## Common Scenarios + +### Next.js Redirects + +For Next.js redirect tasks, always check `next.config.mjs` first: + +```javascript +// Look for existing redirects() function +async redirects() { + return [ + { + source: '/pattern', + destination: 'https://target.com/pattern', + permanent: false, + }, + ] +} +``` + +### API Routes + +For API endpoint tasks, check existing route files: + +```bash +# Check if route already exists +list_directory "src/app/api/target-path" +read_files ["src/app/api/target-path/route.ts"] +``` + +### Component Features + +For UI feature tasks, search for existing components: + +```bash +code_search "component.*feature.*name" +glob "**/*ComponentName*" +``` + +## Anti-Patterns to Avoid + +❌ **Silent failure**: Making no changes without explanation +❌ **Duplicate implementation**: Adding code when it already exists +❌ **Incomplete verification**: Finding partial implementation but not checking if it's complete +❌ **No status communication**: Leaving the user unsure whether the task succeeded + +## Test Validation + +When tests exist for the feature: + +1. **Read the test file** to understand expected behavior +2. **Run the tests** to verify current implementation works +3. **Report test results** as evidence of completion + +Example from redirect tests: +```typescript +test('redirects to go.trybeluga.ai with the hash', async ({ request }) => { + const response = await request.get('/b/test123', { maxRedirects: 0 }) + expect(response.status()).toBe(307) + expect(response.headers()['location']).toBe('https://go.trybeluga.ai/test123') +}) +``` + +If these tests exist and pass, the feature is confirmed working. + +## Key Principle + +**Always explicitly state whether a task is complete, incomplete, or already done.** Never leave the user guessing about the status of their request. 
\ No newline at end of file From 1d598f0157610472f52fed3922da3036c8c69865 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Sun, 29 Mar 2026 18:33:36 -0700 Subject: [PATCH 02/12] evalbuff: add patterns/template-literal-escaping.md (6d8bf394) --- AGENTS.md | 2 + docs/patterns/template-literal-escaping.md | 88 ++++++++ .../terminal-alternate-screen-buffer.md | 191 ++++++++++++++++++ 3 files changed, 281 insertions(+) create mode 100644 docs/patterns/template-literal-escaping.md create mode 100644 docs/patterns/terminal-alternate-screen-buffer.md diff --git a/AGENTS.md b/AGENTS.md index e5080eda7..1c02e5b5d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -44,3 +44,5 @@ Make an efficient learning agent that can do anything. - [`docs/patterns/handle-steps-generators.md`](docs/patterns/handle-steps-generators.md) — handleSteps generator patterns and spawn_agents tool calls - [docs/evalbuff/interpreting-task-prompts.md](docs/evalbuff/interpreting-task-prompts.md) - [docs/patterns/task-completion-validation.md](docs/patterns/task-completion-validation.md) +- [docs/patterns/terminal-alternate-screen-buffer.md](docs/patterns/terminal-alternate-screen-buffer.md) +- [docs/patterns/template-literal-escaping.md](docs/patterns/template-literal-escaping.md) diff --git a/docs/patterns/template-literal-escaping.md b/docs/patterns/template-literal-escaping.md new file mode 100644 index 000000000..d4ce5c447 --- /dev/null +++ b/docs/patterns/template-literal-escaping.md @@ -0,0 +1,88 @@ +# Template Literal Escaping Pattern + +When modifying JavaScript/TypeScript code that contains template literals (backtick strings), always escape backticks that appear within the template literal content to prevent syntax errors. + +## The Problem + +Template literals use backticks (`) as delimiters. When you have backticks inside the template literal content, they must be escaped or they will break the JavaScript syntax. 
+ +**WRONG:** +```typescript +const message = `Use \`wait-idle\` with send** (e.g., `--wait-idle 3`) to wait for output` +// ^ unescaped backtick breaks syntax +``` + +**CORRECT:** +```typescript +const message = `Use \`wait-idle\` with send** (e.g., \`--wait-idle 3\`) to wait for output` +// ^ properly escaped backticks +``` + +## When This Happens + +This issue commonly occurs when: +- Modifying documentation strings that contain code examples with backticks +- Updating help text or error messages that reference command-line syntax +- Changing template literals that contain markdown-style code formatting +- Replacing text that includes shell command examples + +## The Fix + +When working inside template literals, escape all backticks in the content: + +1. **Find all backticks** in the string content (not the template literal delimiters) +2. **Escape each one** with a backslash: `` ` `` becomes `` \` `` +3. **Verify the syntax** - the opening and closing backticks of the template literal should be the only unescaped ones + +## Examples from Real Code + +### Helper Script Documentation +```typescript +// WRONG - breaks compilation +const helperScript = ` + echo "Commands: send, capture, wait-idle" + # Usage example: helper wait-idle "session" 3 + echo "Use \`--wait-idle 3\` for timing" +` + +// CORRECT - properly escaped +const helperScript = ` + echo "Commands: send, capture, wait-idle" + # Usage example: helper wait-idle "session" 3 + echo "Use \\\`--wait-idle 3\\\` for timing" +` +``` + +### Quick Reference Strings +```typescript +// WRONG +const quickRef = + '- Send + wait: `' + helperPath + ' send "' + sessionName + '" "..." --wait-idle 3`\n' + + '- Example usage: `--wait-idle 3` waits for output\n' +// ^ unescaped backticks in concatenated string + +// CORRECT +const quickRef = + '- Send + wait: `' + helperPath + ' send "' + sessionName + '" "..." 
--wait-idle 3`\n' + + '- Example usage: \`--wait-idle 3\` waits for output\n' +// ^ properly escaped backticks +``` + +## Detection + +Syntax errors from unescaped backticks typically show: +- `TS1005: ',' expected` or `TS1005: ';' expected` +- `TS1003: Identifier expected` +- `error: Expected "}" but found ""` +- Compilation errors pointing to the line with unescaped backticks + +## Prevention + +1. **When modifying template literals**, scan for all backticks in the content +2. **Use find-and-replace** to systematically escape backticks: find `` ` `` replace with `` \` `` +3. **Test compilation** after making changes to catch syntax errors early +4. **Be extra careful** with documentation strings, help text, and code examples + +## Key Rule + +Inside template literals, the only unescaped backticks should be the opening and closing delimiters of the template literal itself. All backticks in the content must be escaped with backslashes. \ No newline at end of file diff --git a/docs/patterns/terminal-alternate-screen-buffer.md b/docs/patterns/terminal-alternate-screen-buffer.md new file mode 100644 index 000000000..9f7d8fc0b --- /dev/null +++ b/docs/patterns/terminal-alternate-screen-buffer.md @@ -0,0 +1,191 @@ +# Terminal Alternate Screen Buffer Pattern + +When building CLI applications with full-screen UIs (like TUI apps), use the alternate screen buffer to prevent UI output from polluting the user's terminal scrollback when the app exits. + +## The Problem + +By default, terminal applications write to the main screen buffer. When a full-screen CLI app exits, all its UI output remains in the terminal scrollback, cluttering the user's terminal history. This is annoying for users who expect clean terminal behavior like vim, less, htop, and other well-behaved CLI tools. 
+
+## The Solution: Alternate Screen Buffer
+
+Terminals support an alternate screen buffer that can be entered/exited using ANSI escape sequences:
+
+- **Enter alternate screen:** `\x1b[?1049h` (smcup)
+- **Exit alternate screen:** `\x1b[?1049l` (rmcup)
+
+When you enter the alternate screen buffer, the terminal saves the current screen content. When you exit, it restores the original content, leaving the scrollback clean.
+
+## Implementation Pattern
+
+### 1. Define the Escape Sequences
+
+```typescript
+// Terminal alternate screen buffer escape sequences
+export const ENTER_ALT_BUFFER = '\x1b[?1049h'
+export const EXIT_ALT_BUFFER = '\x1b[?1049l'
+```
+
+### 2. Enter Before Rendering
+
+Enter the alternate screen buffer BEFORE initializing your UI renderer:
+
+```typescript
+export function enterAlternateScreen(): void {
+  if (process.stdout.isTTY) {
+    process.stdout.write(ENTER_ALT_BUFFER)
+  }
+}
+
+async function main(): Promise<void> {
+  // Enter alternate screen buffer BEFORE rendering the app
+  if (process.stdout.isTTY) {
+    enterAlternateScreen()
+  }
+
+  // Initialize your UI renderer after entering alternate buffer
+  const renderer = await createCliRenderer({ ... })
+  // ... rest of app initialization
+}
+```
+
+### 3. Exit During Cleanup
+
+Ensure the alternate screen buffer is exited during all cleanup scenarios:
+
+```typescript
+const TERMINAL_RESET_SEQUENCES =
+  EXIT_ALT_BUFFER + // Exit alternate screen buffer (restores main screen)
+  '\x1b[?1000l' + // Disable X10 mouse mode
+  '\x1b[?1002l' + // Disable button event mouse mode
+  // ... other terminal reset sequences
+  '\x1b[?25h' // Show cursor
+
+function resetTerminalState(): void {
+  try {
+    process.stdout.write(TERMINAL_RESET_SEQUENCES)
+  } catch {
+    // Ignore errors - stdout may already be closed
+  }
+}
+```
+
+### 4. 
Handle All Exit Scenarios + +Register cleanup handlers for all possible exit scenarios: + +```typescript +process.on('SIGTERM', cleanup) +process.on('SIGHUP', cleanup) +process.on('SIGINT', cleanup) +process.on('beforeExit', cleanup) +process.on('exit', cleanup) +process.on('uncaughtException', cleanup) +process.on('unhandledRejection', cleanup) +``` + +## Key Considerations + +### TTY Detection + +Only enter alternate screen buffer in interactive terminals: + +```typescript +if (process.stdout.isTTY) { + enterAlternateScreen() +} +``` + +This prevents issues when: +- Output is piped to a file (`app > output.txt`) +- Running in CI/automated environments +- Output is redirected or captured + +### Timing is Critical + +1. **Enter alternate buffer FIRST** - before any UI initialization +2. **Exit alternate buffer LAST** - as part of terminal reset sequences +3. **Write exit sequence directly to stdout** - don't rely on UI renderer cleanup + +### Terminal Compatibility + +The `?1049` sequence is widely supported by modern terminals: +- xterm, gnome-terminal, iTerm2, Terminal.app +- tmux, screen (with proper configuration) +- Windows Terminal, ConEmu + +Very old terminals may not support it, but the TTY check provides a reasonable fallback. 
+
+## Integration with UI Frameworks
+
+### OpenTUI Example
+
+```typescript
+import { createCliRenderer } from '@opentui/core'
+
+async function main(): Promise<void> {
+  // Enter alternate screen BEFORE creating renderer
+  if (process.stdout.isTTY) {
+    enterAlternateScreen()
+  }
+
+  const renderer = await createCliRenderer({
+    backgroundColor: 'transparent',
+    exitOnCtrlC: false,
+  })
+
+  // Install cleanup handlers that exit alternate screen
+  installProcessCleanupHandlers(renderer)
+
+  // Render your app
+  createRoot(renderer).render(<App />)
+}
+```
+
+### Ink.js Example
+
+```typescript
+import { render } from 'ink'
+
+function main() {
+  if (process.stdout.isTTY) {
+    enterAlternateScreen()
+  }
+
+  const { unmount } = render(<App />)
+
+  // Ensure cleanup on exit
+  process.on('exit', () => {
+    unmount()
+    resetTerminalState()
+  })
+}
+```
+
+## Testing
+
+To verify alternate screen buffer works correctly:
+
+1. **Before running your CLI:** Note some text in your terminal scrollback
+2. **Run your CLI:** The UI should appear in a clean screen
+3. **Exit your CLI:** You should return to the exact terminal state from step 1
+4. 
**Check scrollback:** The UI output should not appear in your scrollback history + +## Common Mistakes + +❌ **Entering alternate buffer too late** - after UI initialization +❌ **Not checking TTY status** - breaks piped output +❌ **Forgetting exit sequences** - leaves terminal in alternate buffer +❌ **Not handling all exit scenarios** - cleanup only works for normal exit +❌ **Relying on UI framework cleanup** - may not run if framework crashes + +## When to Use + +Use alternate screen buffer for: +- Full-screen TUI applications +- Interactive CLI tools with complex UIs +- Any CLI that renders multiple lines of output that users don't need to reference later + +Don't use for: +- Simple command-line tools with minimal output +- Tools where users need to reference output after exit +- Log viewers or tools that should integrate with terminal scrollback \ No newline at end of file From 624c237a6a18eade4fc457a263f04641ceab782a Mon Sep 17 00:00:00 2001 From: James Grugett Date: Sun, 29 Mar 2026 18:50:34 -0700 Subject: [PATCH 03/12] evalbuff: add patterns/task-scope-adherence.md (6d8bf394) --- AGENTS.md | 1 + docs/patterns/task-scope-adherence.md | 139 ++++++++++++++++++++++++++ 2 files changed, 140 insertions(+) create mode 100644 docs/patterns/task-scope-adherence.md diff --git a/AGENTS.md b/AGENTS.md index 1c02e5b5d..a6e77ec87 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -46,3 +46,4 @@ Make an efficient learning agent that can do anything. 
- [docs/patterns/task-completion-validation.md](docs/patterns/task-completion-validation.md) - [docs/patterns/terminal-alternate-screen-buffer.md](docs/patterns/terminal-alternate-screen-buffer.md) - [docs/patterns/template-literal-escaping.md](docs/patterns/template-literal-escaping.md) +- [docs/patterns/task-scope-adherence.md](docs/patterns/task-scope-adherence.md) diff --git a/docs/patterns/task-scope-adherence.md b/docs/patterns/task-scope-adherence.md new file mode 100644 index 000000000..d10aae54c --- /dev/null +++ b/docs/patterns/task-scope-adherence.md @@ -0,0 +1,139 @@ +# Task Scope Adherence Pattern + +When given a specific technical task, implement ONLY what is explicitly requested. Do not add "helpful" extras, documentation, or related improvements unless specifically asked. + +## The Problem + +Agents often interpret tasks broadly and add related work they think would be helpful: +- Task: "Remove the wait-for command" → Agent also creates 3 new documentation files +- Task: "Fix the redirect" → Agent also refactors related code and adds tests +- Task: "Update the config" → Agent also adds validation and error handling + +While well-intentioned, this scope creep can: +- Introduce unintended changes that break existing functionality +- Make code review more difficult by mixing requested changes with unrequested additions +- Violate the principle of least change in production systems +- Create maintenance burden for code the user didn't ask for + +## The Solution: Strict Scope Adherence + +### 1. Parse the Task Precisely + +Identify the exact scope from the task description: +- "Remove the wait-for command" = delete wait-for implementation and references +- "Add a redirect" = add one specific redirect rule +- "Fix the test" = make the failing test pass + +### 2. 
Implement Only What's Requested + +**DO:** +- Remove the specific command/feature mentioned +- Update direct references to use the replacement +- Fix compilation errors caused by the removal +- Update usage examples that directly reference the removed feature + +**DON'T:** +- Add new documentation files unless specifically requested +- Refactor related code "while you're at it" +- Add validation, error handling, or tests unless they're breaking +- Create helper utilities or abstractions +- Update tangentially related files + +### 3. Resist the Urge to "Improve" + +Common scope creep patterns to avoid: + +```typescript +// Task: Remove deprecated function +// WRONG - adding documentation +const changes = [ + 'Remove oldFunction()', + 'Update all references', + 'Add migration guide', // ❌ Not requested + 'Create best practices doc', // ❌ Not requested + 'Add usage examples' // ❌ Not requested +] + +// CORRECT - minimal scope +const changes = [ + 'Remove oldFunction()', + 'Update direct references to use newFunction()' +] +``` + +## Examples from Real Tasks + +### Task: "Remove wait-for command from tmux CLI agent" + +**Correct scope:** +- Remove wait-for case from bash script +- Update documentation strings to reference wait-idle instead +- Update error message examples +- Fix any compilation errors + +**Scope creep (avoid):** +- Creating new documentation files about task validation +- Adding template literal escaping guides +- Creating terminal buffer management docs +- Updating AGENTS.md with new doc references + +### Task: "Add redirect for /b/:hash" + +**Correct scope:** +- Add one redirect rule to next.config.js +- Verify it compiles + +**Scope creep (avoid):** +- Adding tests for the redirect +- Creating redirect management utilities +- Adding analytics tracking +- Documenting redirect patterns + +## When Additional Work IS Appropriate + +**Exception 1: Compilation/Runtime Errors** +If your minimal change breaks compilation or causes runtime errors, fix those: 
+```typescript +// If removing wait-for breaks template literals, fix the escaping +// If removing a function breaks imports, update the imports +``` + +**Exception 2: Direct Dependencies** +If the change requires updating direct references: +```typescript +// If removing wait-for, update help text that mentions it +// If renaming a function, update its direct callers +``` + +**Exception 3: Explicit "and" in Task** +``` +"Remove wait-for command and add documentation" // Two explicit tasks +"Fix the bug and add a test" // Two explicit tasks +``` + +## Validation Questions + +Before adding anything beyond the core task, ask: +1. **Was this explicitly requested?** If no, don't add it. +2. **Does the minimal change break without this?** If no, don't add it. +3. **Is this a direct reference that must be updated?** If no, don't add it. + +## Communication Pattern + +When completing a task, clearly separate what was requested vs. what you considered: + +``` +✅ Completed: Removed wait-for command from tmux CLI agent +- Removed wait-for case from helper script +- Updated documentation to use wait-idle +- Fixed template literal escaping issues + +💭 Considered but did not implement (not requested): +- Adding comprehensive documentation about wait patterns +- Creating validation guides +- Refactoring related timing code +``` + +## Key Principle + +**The best code change is the smallest one that accomplishes the exact goal.** Resist the urge to "improve while you're there" unless explicitly asked. Production systems value predictability and minimal change over comprehensive improvements. 
\ No newline at end of file From b62f461e37b65f100ff5d604b26cd94ae12a83f5 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Sun, 29 Mar 2026 19:00:07 -0700 Subject: [PATCH 04/12] evalbuff: add patterns/task-scope-adherence.md (6d8bf394) --- docs/patterns/task-scope-adherence.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/patterns/task-scope-adherence.md b/docs/patterns/task-scope-adherence.md index d10aae54c..4b5731f62 100644 --- a/docs/patterns/task-scope-adherence.md +++ b/docs/patterns/task-scope-adherence.md @@ -5,7 +5,7 @@ When given a specific technical task, implement ONLY what is explicitly requeste ## The Problem Agents often interpret tasks broadly and add related work they think would be helpful: -- Task: "Remove the wait-for command" → Agent also creates 3 new documentation files +- Task: "Remove the wait-for command" → Agent also creates 4 new documentation files (548 lines) - Task: "Fix the redirect" → Agent also refactors related code and adds tests - Task: "Update the config" → Agent also adds validation and error handling From c8da981dc6a7ac9ac31a979251b7d4eaa3bea8ab Mon Sep 17 00:00:00 2001 From: James Grugett Date: Sun, 29 Mar 2026 19:23:20 -0700 Subject: [PATCH 05/12] evalbuff: add patterns/task-type-identification.md (fde408c6) --- AGENTS.md | 1 + docs/patterns/task-type-identification.md | 160 ++++++++++++++++++++++ 2 files changed, 161 insertions(+) create mode 100644 docs/patterns/task-type-identification.md diff --git a/AGENTS.md b/AGENTS.md index a6e77ec87..718640fd7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -47,3 +47,4 @@ Make an efficient learning agent that can do anything. 
- [docs/patterns/terminal-alternate-screen-buffer.md](docs/patterns/terminal-alternate-screen-buffer.md) - [docs/patterns/template-literal-escaping.md](docs/patterns/template-literal-escaping.md) - [docs/patterns/task-scope-adherence.md](docs/patterns/task-scope-adherence.md) +- [docs/patterns/task-type-identification.md](docs/patterns/task-type-identification.md) diff --git a/docs/patterns/task-type-identification.md b/docs/patterns/task-type-identification.md new file mode 100644 index 000000000..f2ac84018 --- /dev/null +++ b/docs/patterns/task-type-identification.md @@ -0,0 +1,160 @@ +# Task Type Identification: Implementation vs Documentation + +When given a task, correctly identify whether you're being asked to IMPLEMENT functionality or CREATE documentation. Don't assume a task is about documentation just because you found existing code. + +## The Problem + +Agents often misclassify implementation tasks as documentation tasks when they discover existing code that seems to match the requirements. This leads to: +- Creating unnecessary documentation instead of implementing features +- Claiming tasks are "already complete" without proper validation +- Ignoring the actual user need (working functionality) + +## Task Classification Rules + +### Implementation Tasks (require code changes) + +Keywords that indicate implementation work: +- "Add redirects for..." +- "Implement feature X" +- "Create endpoint that..." +- "Make it so that when..." +- "Fix the bug where..." +- "Update the config to..." + +**Action required:** Write/modify code, test functionality, verify it works. + +### Documentation Tasks (require writing docs) + +Keywords that indicate documentation work: +- "Document how to..." +- "Write a guide for..." +- "Create documentation explaining..." +- "Add README section about..." +- "Explain the architecture of..." + +**Action required:** Write markdown/text files, create examples, explain concepts. 
+ +## When You Find Existing Code + +If you discover code that appears to implement the requested functionality: + +### 1. Verify It Actually Works + +```bash +# For Next.js redirects, test the actual redirect +curl -I http://localhost:3000/b/test123 +# Should return 307/308 with Location header + +# For API endpoints, test the endpoint +curl http://localhost:3000/api/endpoint +# Should return expected response + +# For features, test the user flow +# Navigate to the feature and verify it behaves as expected +``` + +### 2. Check Configuration vs Runtime + +Just because code exists doesn't mean it's active: +- **Next.js redirects:** Must be in `next.config.mjs` redirects() function AND server must be restarted +- **API routes:** File must exist in correct location AND export correct HTTP methods +- **Features:** Code must be imported/called from the right places + +### 3. Validate Against Requirements + +Even if code exists, check if it matches the exact requirements: +- **Correct URL pattern:** `/b/:hash` vs `/b/:id` vs `/buffer/:hash` +- **Correct destination:** `https://go.trybeluga.ai/:hash` vs other domains +- **Correct behavior:** temporary vs permanent redirects, query parameter handling + +## Response Patterns + +### ✅ When Implementation Already Works + +``` +I found the redirect configuration in next.config.mjs and tested it: + +[show the existing code] + +Testing confirms it works: +$ curl -I http://localhost:3000/b/test123 +HTTP/1.1 307 Temporary Redirect +Location: https://go.trybeluga.ai/test123 + +The feature is already implemented and functional. No changes needed. +``` + +### ✅ When Implementation Exists But Doesn't Work + +``` +I found redirect configuration in next.config.mjs, but testing shows it's not working: + +[show the existing code] + +Testing reveals the issue: +$ curl -I http://localhost:3000/b/test123 +HTTP/1.1 404 Not Found + +The configuration looks correct but the server needs to be restarted. 
After restart: +[show working test results] + +The feature now works correctly. +``` + +### ✅ When Implementation Is Missing + +``` +I need to add the redirect configuration to next.config.mjs: + +[show the changes being made] + +This will redirect /b/:hash to https://go.trybeluga.ai/:hash as requested. +``` + +## Anti-Patterns to Avoid + +❌ **Assuming task completion without testing:** +``` +"The redirect is already implemented in next.config.mjs. No changes needed." +// Without actually testing if it works +``` + +❌ **Creating documentation for implementation tasks:** +``` +// Task: "Add redirects for short URLs" +// WRONG: Creating docs/patterns/redirect-patterns.md +// RIGHT: Modifying next.config.mjs +``` + +❌ **Confusing configuration with functionality:** +``` +// Code exists in config file ≠ feature works +// Must test the actual user-facing behavior +``` + +## Testing Implementation Tasks + +For common implementation types: + +**Web redirects:** +```bash +curl -I http://localhost:PORT/path +# Check status code (307/308) and Location header +``` + +**API endpoints:** +```bash +curl http://localhost:PORT/api/endpoint +# Check response status and body +``` + +**UI features:** +```bash +# Start dev server and manually test in browser +# Or run existing test suite +bun test path/to/feature.test.ts +``` + +## Key Principle + +**Implementation tasks require working code, not documentation about code.** When you find existing code, your job is to verify it works and fix it if it doesn't, not to document how it should work. 
\ No newline at end of file From 0596fdc783c63bc3e1455c9fc1ceb324fb9b4503 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Sun, 29 Mar 2026 19:33:15 -0700 Subject: [PATCH 06/12] evalbuff: add patterns/implementation-validation.md (fde408c6) --- AGENTS.md | 1 + docs/patterns/implementation-validation.md | 171 +++++++++++++++++++++ 2 files changed, 172 insertions(+) create mode 100644 docs/patterns/implementation-validation.md diff --git a/AGENTS.md b/AGENTS.md index 718640fd7..752830638 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -48,3 +48,4 @@ Make an efficient learning agent that can do anything. - [docs/patterns/template-literal-escaping.md](docs/patterns/template-literal-escaping.md) - [docs/patterns/task-scope-adherence.md](docs/patterns/task-scope-adherence.md) - [docs/patterns/task-type-identification.md](docs/patterns/task-type-identification.md) +- [docs/patterns/implementation-validation.md](docs/patterns/implementation-validation.md) diff --git a/docs/patterns/implementation-validation.md b/docs/patterns/implementation-validation.md new file mode 100644 index 000000000..da1f3a829 --- /dev/null +++ b/docs/patterns/implementation-validation.md @@ -0,0 +1,171 @@ +# Implementation Validation Pattern + +When you find existing code that appears to implement the requested functionality, always validate that it actually works before claiming the task is complete. + +## The Problem + +Agents often discover existing implementations and immediately conclude the task is done without testing functionality. This leads to: +- Claiming features work when they don't +- Missing broken configurations that look correct in code +- Failing to communicate actual task status clearly +- Users left uncertain whether their request was fulfilled + +## The Validation Process + +### 1. 
Find the Implementation + +When you discover code that seems to match the requirements: + +```bash +# Found redirect in next.config.mjs +{ + source: '/b/:hash', + destination: 'https://go.trybeluga.ai/:hash', + permanent: false, +} +``` + +### 2. Test the Actual Functionality + +Don't assume code works just because it exists. Test the user-facing behavior: + +```bash +# For Next.js redirects - test the actual redirect +curl -I http://localhost:3000/b/test123 +# Should return 307/308 with Location header + +# For API endpoints - test the endpoint +curl http://localhost:3000/api/endpoint +# Should return expected response + +# For UI features - verify the user flow works +``` + +### 3. Check Runtime vs Configuration + +Code existing ≠ code working. Verify the implementation is active: + +- **Next.js redirects**: Must be in active config AND server restarted +- **API routes**: File must exist in correct location AND export right methods +- **Features**: Code must be imported/called from the right places +- **Environment**: Required env vars, database state, etc. + +### 4. Validate Against Exact Requirements + +Even working code might not match the specific requirements: + +- **URL patterns**: `/b/:hash` vs `/b/:id` vs `/buffer/:hash` +- **Destinations**: `https://go.trybeluga.ai/:hash` vs other domains +- **Behavior**: temporary vs permanent redirects, query handling + +## Response Patterns + +### ✅ When Implementation Works + +``` +I found the redirect configuration in next.config.mjs and verified it works: + +[show the existing code] + +Testing confirms functionality: +$ curl -I http://localhost:3000/b/test123 +HTTP/1.1 307 Temporary Redirect +Location: https://go.trybeluga.ai/test123 + +The feature is already implemented and working correctly. 
+``` + +### ✅ When Implementation Exists But Broken + +``` +I found redirect configuration in next.config.mjs, but testing shows it's not working: + +[show the existing code] + +Testing reveals the issue: +$ curl -I http://localhost:3000/b/test123 +HTTP/1.1 404 Not Found + +The server needs to be restarted for redirects to take effect. +After restart, the redirect works correctly. +``` + +### ✅ When Implementation Is Missing + +``` +I need to add the redirect configuration to next.config.mjs: + +[show the changes being made] + +This will redirect /b/:hash to https://go.trybeluga.ai/:hash as requested. +``` + +## Testing Strategies by Feature Type + +### Web Redirects +```bash +curl -I http://localhost:PORT/path +# Check status code (307/308) and Location header +``` + +### API Endpoints +```bash +curl http://localhost:PORT/api/endpoint +# Check response status and body +``` + +### Database Features +```bash +# Check if tables/columns exist +# Verify data can be inserted/queried +# Test constraints and relationships +``` + +### UI Components +```bash +# Start dev server and manually test +# Or run existing test suite +bun test path/to/feature.test.ts +``` + +## Use Existing Tests When Available + +If tests exist for the feature: + +1. **Read the test file** to understand expected behavior +2. **Run the tests** to verify current implementation +3. **Report test results** as evidence of functionality + +Example: +```typescript +test('redirects to go.trybeluga.ai with the hash', async ({ request }) => { + const response = await request.get('/b/test123', { maxRedirects: 0 }) + expect(response.status()).toBe(307) + expect(response.headers()['location']).toBe('https://go.trybeluga.ai/test123') +}) +``` + +If these tests exist and pass, the feature is confirmed working. + +## Anti-Patterns to Avoid + +❌ **Assuming functionality without testing**: +``` +"The redirect is already implemented in next.config.mjs. No changes needed." 
+// Without actually verifying it works +``` + +❌ **Confusing configuration with functionality**: +``` +// Code exists in config file ≠ feature works at runtime +``` + +❌ **Silent completion without communication**: +``` +// Making no changes and not explaining why +// Leaving user unsure if their request was handled +``` + +## Key Principle + +**Code that looks right might not work right.** Always test the actual user-facing behavior before claiming a feature is complete. When in doubt, verify through testing rather than code inspection alone. \ No newline at end of file From 694ae0bf6fed63c6cc1fe74f4d05d3e169050ec6 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Sun, 29 Mar 2026 21:11:01 -0700 Subject: [PATCH 07/12] evalbuff: add patterns/existing-implementation-validation.md (fde408c6) --- AGENTS.md | 1 + .../existing-implementation-validation.md | 150 ++++++++++++++++++ 2 files changed, 151 insertions(+) create mode 100644 docs/patterns/existing-implementation-validation.md diff --git a/AGENTS.md b/AGENTS.md index 752830638..721b5575e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -49,3 +49,4 @@ Make an efficient learning agent that can do anything. 
- [docs/patterns/task-scope-adherence.md](docs/patterns/task-scope-adherence.md) - [docs/patterns/task-type-identification.md](docs/patterns/task-type-identification.md) - [docs/patterns/implementation-validation.md](docs/patterns/implementation-validation.md) +- [docs/patterns/existing-implementation-validation.md](docs/patterns/existing-implementation-validation.md) diff --git a/docs/patterns/existing-implementation-validation.md b/docs/patterns/existing-implementation-validation.md new file mode 100644 index 000000000..fa54821ef --- /dev/null +++ b/docs/patterns/existing-implementation-validation.md @@ -0,0 +1,150 @@ +# Existing Implementation Validation Pattern + +When you discover existing code that appears to implement the requested functionality, validate three things: location correctness, functionality, and clear communication to the user. + +## The Problem + +Agents often find existing implementations and immediately conclude the task is done without: +- Verifying the implementation is in the correct location/context +- Testing that it actually works as expected +- Clearly communicating the task status to the user + +This leads to: +- Missing implementations in the wrong location (main app vs specific sub-app) +- Assuming code works without verification +- Silent completion that leaves users uncertain + +## The Three-Step Validation Process + +### 1. Location Validation + +Even if the functionality exists somewhere in the codebase, verify it's in the RIGHT place: + +```bash +# Task: "Add redirects to freebuff/web/next.config.mjs" +# Found: Redirects in web/next.config.mjs +# Problem: Wrong location - freebuff/web is a separate app +``` + +**Check:** +- Is this the correct file path mentioned in the task? +- Is this the right application/service (monorepo apps are separate) +- Does the implementation serve the intended use case? + +### 2. Functionality Validation + +Don't assume existing code works. 
Test the actual behavior: + +```bash +# For Next.js redirects +curl -I http://localhost:PORT/b/test123 +# Should return 307/308 with correct Location header + +# For API endpoints +curl http://localhost:PORT/api/endpoint +# Should return expected response + +# For features +# Test the actual user-facing behavior +``` + +**Verify:** +- The implementation is active (not just configured) +- It handles the exact patterns requested +- Edge cases work (query parameters, special characters) + +### 3. Clear Communication + +Always explicitly tell the user what you found and what the status is: + +**✅ When implementation exists and works in the right place:** +``` +I found the redirect configuration in freebuff/web/next.config.mjs: + +[show the code] + +Testing confirms it works correctly: +$ curl -I http://localhost:3001/b/test123 +HTTP/1.1 307 Temporary Redirect +Location: https://go.trybeluga.ai/test123 + +The feature is already implemented and functional. No changes needed. +``` + +**✅ When implementation exists but in wrong location:** +``` +I found redirect configuration in the main web/next.config.mjs, but the task requires it in freebuff/web/next.config.mjs (separate app). + +Adding the redirect to freebuff/web/next.config.mjs: + +[show the changes] + +This will enable the redirect specifically for the freebuff web app. +``` + +**✅ When implementation doesn't exist:** +``` +I need to add the redirect configuration to freebuff/web/next.config.mjs: + +[show the changes] + +This will redirect /b/:hash to https://go.trybeluga.ai/:hash as requested. +``` + +## Context-Specific Validation + +### Monorepo Applications + +In monorepos, similar functionality may exist in multiple apps: +- `web/` - main application +- `freebuff/web/` - separate free tier app +- Each has its own `next.config.mjs`, `package.json`, etc. 
+ +**Always check the specific path mentioned in the task.** + +### Configuration vs Runtime + +Configuration existing ≠ feature working: +- **Next.js redirects:** Must be in active config AND server restarted +- **API routes:** Must be in correct location AND export right methods +- **Environment variables:** Must be set in runtime environment + +### Test Coverage + +If tests exist, use them as validation: +```typescript +// If you find tests like this: +test('redirects to go.trybeluga.ai with the hash', async ({ request }) => { + const response = await request.get('/b/test123', { maxRedirects: 0 }) + expect(response.status()).toBe(307) + expect(response.headers()['location']).toBe('https://go.trybeluga.ai/test123') +}) + +// Run them to verify functionality +bun test path/to/redirect.test.ts +``` + +## Anti-Patterns to Avoid + +❌ **Silent assumption:** +``` +// Found redirect in web/next.config.mjs, task must be done +// (Without checking if it's the right location or communicating status) +``` + +❌ **Location confusion:** +``` +// Task asks for freebuff/web/next.config.mjs +// Found in web/next.config.mjs +// Assumed they're the same thing +``` + +❌ **No status communication:** +``` +// Making no changes without explaining why +// User left wondering if their request was handled +``` + +## Key Principle + +**Existing code is only a solution if it's in the right place, works correctly, and serves the intended use case.** Always validate all three before claiming task completion. \ No newline at end of file From 146eddf4fc8662e4ab9fbf4fbabc79e524e9298c Mon Sep 17 00:00:00 2001 From: James Grugett Date: Mon, 30 Mar 2026 11:19:02 -0700 Subject: [PATCH 08/12] evalbuff: use Codebuff SDK, direct LLM API, and improve quality Replace CLI spawning with Codebuff SDK for agent execution and Vercel AI SDK for LLM calls (5x faster prompt generation). Add base2-free-evals agent with noAskUser. Use local git clones with hardlinks for near-instant repo setup. 
Filter trivial commits, use average reviewer scores, inline traces into doc writer prompts, and add adaptive improvement thresholds. Co-Authored-By: Claude Opus 4.6 --- agents/base2/base2-free-evals.ts | 8 + bun.lock | 2 + evalbuff/package.json | 2 + evalbuff/src/__tests__/e2e.test.ts | 29 ++- .../src/__tests__/loop.integration.test.ts | 46 ++-- evalbuff/src/commit-task-generator.ts | 81 +++++-- evalbuff/src/docs-optimizer.ts | 107 ++++++---- evalbuff/src/judge.ts | 4 + evalbuff/src/llm.ts | 53 +++++ evalbuff/src/run-e2e-test.ts | 2 +- evalbuff/src/run-evalbuff.ts | 202 +++++++++++++++--- evalbuff/src/test-repo-utils.ts | 30 ++- 12 files changed, 433 insertions(+), 133 deletions(-) create mode 100644 agents/base2/base2-free-evals.ts create mode 100644 evalbuff/src/llm.ts diff --git a/agents/base2/base2-free-evals.ts b/agents/base2/base2-free-evals.ts new file mode 100644 index 000000000..a6489c03e --- /dev/null +++ b/agents/base2/base2-free-evals.ts @@ -0,0 +1,8 @@ +import { createBase2 } from './base2' + +const definition = { + ...createBase2('free', { noAskUser: true }), + id: 'base2-free-evals', + displayName: 'Buffy the Free Evals Orchestrator', +} +export default definition diff --git a/bun.lock b/bun.lock index cb6136499..3df586afb 100644 --- a/bun.lock +++ b/bun.lock @@ -111,8 +111,10 @@ "name": "@codebuff/evalbuff", "version": "1.0.0", "dependencies": { + "@ai-sdk/anthropic": "^2.0.50", "@codebuff/common": "workspace:*", "@codebuff/sdk": "workspace:*", + "ai": "^5.0.0", "zod": "^4.2.1", }, }, diff --git a/evalbuff/package.json b/evalbuff/package.json index f3374246d..ac8a55395 100644 --- a/evalbuff/package.json +++ b/evalbuff/package.json @@ -14,8 +14,10 @@ "run": "bun run src/run-evalbuff.ts" }, "dependencies": { + "@ai-sdk/anthropic": "^2.0.50", "@codebuff/common": "workspace:*", "@codebuff/sdk": "workspace:*", + "ai": "^5.0.0", "zod": "^4.2.1" } } diff --git a/evalbuff/src/__tests__/e2e.test.ts b/evalbuff/src/__tests__/e2e.test.ts index 
abc317e99..f1ca59966 100644 --- a/evalbuff/src/__tests__/e2e.test.ts +++ b/evalbuff/src/__tests__/e2e.test.ts @@ -40,14 +40,25 @@ mock.module('../test-repo-utils', () => ({ }, })) -mock.module('../cli-runner', () => ({ - runCliAgent: async () => ({ - diff: 'mock diff content', - durationMs: 1000, - exitCode: 0, - stdout: 'mock stdout', - stderr: '', - }), +mock.module('../runners/codebuff', () => ({ + CodebuffRunner: class { + constructor() {} + async run() { + return { + steps: [{ type: 'text', content: 'mock trace' }], + totalCostUsd: 0.01, + diff: 'mock diff content', + } + } + }, +})) + +mock.module('@codebuff/sdk', () => ({ + CodebuffClient: class { + constructor() {} + async run() { return { output: { type: 'success' }, sessionState: null } } + }, + loadLocalAgents: async () => ({}), })) // Judge returns alternating scores: low (triggers doc edit), then higher (confirms improvement) @@ -126,7 +137,7 @@ describe('evalbuff E2E', () => { await runLearnMode({ mode: 'learn', repoPath: repoDir, - agentCommand: 'echo', + agentId: 'base2-free-evals', parallelism: 1, maxCostUsd: 50, agentTimeoutMs: 10_000, diff --git a/evalbuff/src/__tests__/loop.integration.test.ts b/evalbuff/src/__tests__/loop.integration.test.ts index 334dc545e..724626133 100644 --- a/evalbuff/src/__tests__/loop.integration.test.ts +++ b/evalbuff/src/__tests__/loop.integration.test.ts @@ -32,20 +32,30 @@ mock.module('../test-repo-utils', () => ({ }, })) -// Mock CLI runner to return a fake result -mock.module('../cli-runner', () => ({ - runCliAgent: async () => { - cliRunnerCallCount++ - return { - diff: 'mock diff content', - durationMs: 1000, - exitCode: 0, - stdout: 'mock stdout', - stderr: '', +// Mock CodebuffRunner to return a fake result +mock.module('../runners/codebuff', () => ({ + CodebuffRunner: class { + constructor() {} + async run() { + cliRunnerCallCount++ + return { + steps: [{ type: 'text', content: 'mock trace' }], + totalCostUsd: 0.01, + diff: 'mock diff content', + } } }, })) 
+// Mock SDK client and loadLocalAgents +mock.module('@codebuff/sdk', () => ({ + CodebuffClient: class { + constructor() {} + async run() { return { output: { type: 'success' }, sessionState: null } } + }, + loadLocalAgents: async () => ({}), +})) + // Mock judge to return configurable scores mock.module('../judge', () => ({ judgeTaskResult: async () => { @@ -144,7 +154,7 @@ describe('runLearnMode integration', () => { await runLearnMode({ mode: 'learn', repoPath: repoDir, - agentCommand: 'echo', + agentId: 'base2-free-evals', parallelism: 1, maxCostUsd: 100, agentTimeoutMs: 10_000, @@ -190,7 +200,7 @@ describe('runLearnMode integration', () => { await runLearnMode({ mode: 'learn', repoPath: repoDir, - agentCommand: 'echo', + agentId: 'base2-free-evals', parallelism: 1, maxCostUsd: 100, agentTimeoutMs: 10_000, @@ -233,7 +243,7 @@ describe('runLearnMode integration', () => { await runLearnMode({ mode: 'learn', repoPath: repoDir, - agentCommand: 'echo', + agentId: 'base2-free-evals', parallelism: 1, maxCostUsd: 100, agentTimeoutMs: 10_000, @@ -245,10 +255,10 @@ describe('runLearnMode integration', () => { expect(fs.existsSync(logPath)).toBe(false) }) - it('rejects doc edit when score does not improve', async () => { - // Commit1: baseline 4.0, rerun 3.0 (worse) — doc rejected, loop stops. + it('rejects doc edit when score drops significantly', async () => { + // Commit1: baseline 5.0, rerun 2.0 (3-point drop, past 1.5 threshold) — doc rejected. // Commit2: baseline 8.0, analyze returns null. Commit3: baseline 8.0, null. 
- judgeScores = [4.0, 3.0, 8.0, 8.0] + judgeScores = [5.0, 2.0, 8.0, 8.0] analyzeFailureResults = [ { reasoning: 'Tried to help', @@ -262,7 +272,7 @@ describe('runLearnMode integration', () => { await runLearnMode({ mode: 'learn', repoPath: repoDir, - agentCommand: 'echo', + agentId: 'base2-free-evals', parallelism: 1, maxCostUsd: 100, agentTimeoutMs: 10_000, @@ -290,7 +300,7 @@ describe('runPromptMode integration', () => { await runPromptMode({ mode: 'prompt', repoPath: repoDir, - agentCommand: 'echo', + agentId: 'base2-free-evals', parallelism: 1, maxCostUsd: 100, agentTimeoutMs: 10_000, diff --git a/evalbuff/src/commit-task-generator.ts b/evalbuff/src/commit-task-generator.ts index 51357c829..e85127699 100644 --- a/evalbuff/src/commit-task-generator.ts +++ b/evalbuff/src/commit-task-generator.ts @@ -1,8 +1,9 @@ import { execSync } from 'child_process' import fs from 'fs' -import os from 'os' import path from 'path' +import { generatePrompt } from './llm' + export interface CommitTask { sha: string parentSha: string @@ -14,6 +15,55 @@ export interface CommitTask { const MAX_DIFF_CHARS = 200_000 +/** + * Commit message patterns that indicate trivial/automated commits not worth + * running agents on. Saves ~10 agent+judge invocations per skipped commit. + */ +const TRIVIAL_COMMIT_PATTERNS = [ + /^bump\b.*\bversion\b/i, + /^v?\d+\.\d+\.\d+$/, // version-only messages like "1.0.635" + /^release\s+v?\d+/i, + /^chore\(release\)/i, + /^update\s+(change|changelog)/i, + /^merge\s+(branch|pull request)/i, +] + +/** + * Returns true if a commit is trivial and should be skipped. + * Checks commit message patterns and whether only package.json version fields changed. 
+ */ +function isTrivialCommit( + message: string, + filesChanged: string[], + diff: string, +): boolean { + const firstLine = message.split('\n')[0].trim() + + // Check message patterns + if (TRIVIAL_COMMIT_PATTERNS.some((p) => p.test(firstLine))) return true + + // Single package.json change that only touches "version" field + if ( + filesChanged.length === 1 && + filesChanged[0].endsWith('package.json') && + diff.length < 1000 + ) { + const addedLines = diff + .split('\n') + .filter((l) => l.startsWith('+') && !l.startsWith('+++')) + const removedLines = diff + .split('\n') + .filter((l) => l.startsWith('-') && !l.startsWith('---')) + const allVersionChanges = + [...addedLines, ...removedLines].every((l) => + /^\s*[+-]\s*"version"/.test(l), + ) + if (allVersionChanges) return true + } + + return false +} + /** * Files that add noise to diffs without useful signal. * Lockfiles are huge and auto-generated — agents shouldn't replicate them. @@ -231,31 +281,14 @@ ${filesSection}## Diff ${diff} \`\`\`` - const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-promptgen-')) - const promptFile = path.join(tmpDir, 'PROMPT_GEN.md') - try { - fs.writeFileSync(promptFile, `${PROMPT_GEN_SYSTEM}\n\n---\n\n${userPrompt}`) - - // IMPORTANT: Run in tmpDir to avoid Claude reading the repo's CLAUDE.md/AGENTS.md, - // which can confuse prompt generation (e.g., generating prompts about evalbuff itself). - const output = execSync( - `claude --dangerously-skip-permissions -p "Read ${promptFile} and follow all instructions. 
Respond with ONLY the task prompt text."`, - { - cwd: tmpDir, - encoding: 'utf-8', - timeout: 2 * 60 * 1000, - stdio: ['ignore', 'pipe', 'pipe'], - maxBuffer: 10 * 1024 * 1024, - }, - ).trim() - + // Use API directly — faster than spawning Claude CLI (~3s vs ~15s) + // and avoids CLAUDE.md/AGENTS.md context pollution + const output = await generatePrompt(PROMPT_GEN_SYSTEM, userPrompt) return output || message } catch { // Fallback to the commit message itself return message - } finally { - fs.rmSync(tmpDir, { recursive: true, force: true }) } } @@ -270,6 +303,12 @@ export async function buildCommitTask( const info = getCommitInfo(repoPath, sha) if (!info) return null + // Skip trivial/automated commits (version bumps, releases, etc.) + if (isTrivialCommit(info.message, info.filesChanged, info.diff)) { + console.log(`Skipping ${sha.slice(0, 8)}: trivial commit (${info.message.split('\n')[0].slice(0, 50)})`) + return null + } + // Skip commits with diffs that exceed our limit if (info.diff.length > MAX_DIFF_CHARS) { console.log(`Skipping ${sha.slice(0, 8)}: diff too large (${info.diff.length} chars)`) diff --git a/evalbuff/src/docs-optimizer.ts b/evalbuff/src/docs-optimizer.ts index 697a0c1b7..e6d5fbae8 100644 --- a/evalbuff/src/docs-optimizer.ts +++ b/evalbuff/src/docs-optimizer.ts @@ -1,8 +1,8 @@ -import { execSync } from 'child_process' import fs from 'fs' import os from 'os' import path from 'path' +import { analyzeFailureViaApi } from './llm' import { compressTrace, cleanupTraceDir } from './trace-compressor' import type { JudgingResult } from './judge' @@ -26,6 +26,20 @@ The docs you write must be **generic enough to be useful across many future task DO NOT write docs that only help with one specific task. If the failure is too task-specific and doesn't reveal a general pattern, respond with: {"skip": true, "reasoning": "Too task-specific to generalize"} +## What Makes Good Agent Docs + +The best docs for AI coding agents are: +1. 
**Maps, not essays** — tell the agent WHERE things are and HOW they connect. "Feature X lives in src/x/, uses the Y pattern from src/shared/y.ts, and must be registered in src/registry.ts" +2. **Decision trees, not philosophy** — "If modifying auth, check src/middleware/auth.ts AND update tests in __tests__/auth.test.ts. If adding a new route, register it in routes.ts." +3. **Anti-patterns with fixes** — "DON'T create new files in the root. DO put utilities in src/shared/. DON'T import from '../../../', DO use the path alias @/" +4. **Concrete examples** — Show a before/after or a correct pattern from the actual codebase. + +Bad docs that HURT agent performance (avoid these): +- Vague principles like "keep code clean" or "follow SOLID" +- Long explanations without actionable takeaways +- Docs that duplicate what's already in the code (comments, types, etc.) +- Over-scoped docs that try to cover everything + ## Using the Agent Trace You may be given the agent's trace (stdout) showing its reasoning process, tool calls, and decisions. This is the most valuable signal — it shows you WHY the agent went wrong, not just WHAT it got wrong. Look for: @@ -34,10 +48,6 @@ You may be given the agent's trace (stdout) showing its reasoning process, tool - **Missing context** — the agent didn't know about a key file, config, or convention - **Wrong approach** — the agent took a fundamentally different approach than needed -The trace shows the full agent reasoning inline, but large tool results (file contents, command output) have been extracted to separate files. You'll see markers like: - [Stored in: /tmp/evalbuff-traces-xxx/result-003.txt (2847 chars) — file content, 84 lines] -You can read these files if you need the full content to understand what the agent saw. - Write docs that address the ROOT CAUSE visible in the trace, not just the symptom visible in the diff. ## Rules @@ -46,10 +56,11 @@ Write docs that address the ROOT CAUSE visible in the trace, not just the sympto 2. 
Do NOT write generic advice like "follow best practices" or "write clean code." 3. Focus on the general PATTERN behind the gap, not the specific gap itself. 4. Write docs that a coding agent will read and immediately know what to do differently on any similar task. -5. Keep docs concise — under 200 lines. Dense information beats verbose explanations. +5. Keep docs concise — under 100 lines. Dense information beats verbose explanations. Every line should be actionable. 6. Use a logical file path that groups related docs together (e.g., "patterns/", "conventions/", "architecture/"). 7. Include examples of correct patterns from the codebase when possible. 8. If a doc already exists on a similar topic, suggest UPDATING it (use the same path) rather than creating a new one. +9. Start the doc with a 1-2 sentence TL;DR that tells the agent the key rule. ## Output Format @@ -102,6 +113,7 @@ export async function analyzeFailure({ groundTruthDiff, currentDocs, editHistory, + commitMessage, }: { judgeResult: JudgingResult taskPrompt: string @@ -110,6 +122,7 @@ export async function analyzeFailure({ groundTruthDiff?: string // optional — not available in prompt mode currentDocs: Record editHistory?: DocEditHistoryEntry[] + commitMessage?: string // original commit message — helps identify patterns }): Promise { const docsContent = Object.entries(currentDocs) .map(([docPath, content]) => `### ${docPath}\n\`\`\`\n${content}\n\`\`\``) @@ -123,7 +136,7 @@ ${groundTruthDiff} : '## Ground Truth\n(Not available — judge should have tested the output directly)' // Compress agent trace: keep reasoning inline, extract large tool results to files - // The doc writer agent can read those files if it needs the full content + // We inline the extracted files into the prompt to avoid extra tool-call roundtrips let compressed: ReturnType | null = null let traceSection = '' @@ -131,26 +144,44 @@ ${groundTruthDiff} const traceDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-traces-')) 
compressed = compressTrace(agentTrace, traceDir) + // Inline extracted trace files to avoid tool-call roundtrips const resultFiles = fs.readdirSync(traceDir).filter((f) => f.endsWith('.txt')) + let inlinedResults = '' + for (const file of resultFiles) { + const content = fs.readFileSync(path.join(traceDir, file), 'utf-8') + // Cap each file to 5KB to avoid bloating the prompt + const capped = content.length > 5000 ? content.slice(0, 5000) + '\n... (truncated)' : content + inlinedResults += `\n### ${file}\n\`\`\`\n${capped}\n\`\`\`\n` + } traceSection = `## Agent Trace (reasoning, tool calls, and decisions) This is the agent's stdout showing its reasoning process, tool calls, and decisions. -Large tool results have been extracted to separate files — you can read them if needed. Look for: what the agent misunderstood, wrong assumptions it made, where it went off track. -${resultFiles.length > 0 ? `**${resultFiles.length} tool result(s) stored in ${traceDir}/** — read any file for full content.\n` : ''} \`\`\` ${compressed.inline} -\`\`\`` +\`\`\` +${inlinedResults ? `\n## Extracted Tool Results\n${inlinedResults}` : ''}` + + // Clean up trace dir immediately since we've inlined everything + cleanupTraceDir(compressed.traceDir) + compressed = null } + const commitSection = commitMessage + ? 
`## Original Commit Message (for pattern context) +${commitMessage} + +` + : '' + const prompt = `${DOC_WRITER_SYSTEM_PROMPT} ## Task Prompt ${taskPrompt} -## Judge Analysis +${commitSection}## Judge Analysis ${judgeResult.analysis} ## Judge Weaknesses Found @@ -180,31 +211,8 @@ Based on the agent's trace (if available), the gap between what the agent did an Respond with ONLY the JSON object.` try { - const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-docwriter-')) - const promptFile = path.join(tmpDir, 'DOC_WRITER_PROMPT.md') - fs.writeFileSync(promptFile, prompt) - - let output: string - try { - // IMPORTANT: Run in tmpDir to avoid Claude reading the repo's CLAUDE.md/AGENTS.md, - // which can pollute the doc writer's analysis with unrelated project context. - output = execSync( - `claude --dangerously-skip-permissions -p "Read the file ${promptFile} and follow all instructions in it. Respond with ONLY the JSON object as specified."`, - { - cwd: tmpDir, - encoding: 'utf-8', - timeout: 5 * 60 * 1000, - stdio: ['ignore', 'pipe', 'pipe'], - maxBuffer: 10 * 1024 * 1024, - }, - ).trim() - } finally { - fs.rmSync(tmpDir, { recursive: true, force: true }) - // Clean up trace files after doc writer is done - if (compressed) { - cleanupTraceDir(compressed.traceDir) - } - } + // Use API directly — faster than spawning Claude CLI and avoids cwd/CLAUDE.md pollution + const output = await analyzeFailureViaApi(prompt) // Try to extract JSON from the output let jsonStr = output @@ -331,18 +339,33 @@ export function revertDocEdit( /** * Compare scores to determine if a doc edit improved things. - * Requires a minimum improvement of 0.3 points to count as "improved" - * to avoid accepting docs based on noise (especially with low parallelism). + * + * With parallelism=1, score variance is very high (often 3+ points on + * the same task). 
To avoid rejecting good docs due to noise: + * - Require only small improvement to accept (0.3 threshold) + * - Require large decline to reject (1.5 threshold) — benefit of the doubt + * + * With higher parallelism, averages are more stable so we can use + * tighter thresholds. */ -const MIN_IMPROVEMENT_THRESHOLD = 0.3 - export function compareScores( oldScore: number, newScore: number, + parallelism: number = 1, ): 'improved' | 'same' | 'worse' { const delta = newScore - oldScore - if (delta >= MIN_IMPROVEMENT_THRESHOLD) return 'improved' - if (delta <= -MIN_IMPROVEMENT_THRESHOLD) return 'worse' + const improveThreshold = 0.3 + // With parallelism=1, require a bigger drop before rejecting + const worseThreshold = parallelism <= 1 ? 1.5 : 0.3 + + if (delta >= improveThreshold) return 'improved' + if (delta <= -worseThreshold) return 'worse' + + // Don't give benefit of the doubt when score is very low — + // if the doc didn't produce a clear improvement from near-zero, + // it's not helping. Require actual improvement. + if (oldScore < 2.0 && delta < improveThreshold) return 'worse' + return 'same' } diff --git a/evalbuff/src/judge.ts b/evalbuff/src/judge.ts index 14ef8bebd..50cd02fdd 100644 --- a/evalbuff/src/judge.ts +++ b/evalbuff/src/judge.ts @@ -509,6 +509,10 @@ async function runReviewersAndAggregate( } } + // Use median for qualitative analysis (pick the most representative reviewer) + // but average for scores. Averaging is better because models have consistent + // scoring biases (e.g. GPT-5 scores lower) — median would always pick the + // same model's score, while average blends them. const sorted = validResults.sort( (a, b) => a.overallScore - b.overallScore, ) diff --git a/evalbuff/src/llm.ts b/evalbuff/src/llm.ts new file mode 100644 index 000000000..39ca39329 --- /dev/null +++ b/evalbuff/src/llm.ts @@ -0,0 +1,53 @@ +/** + * Direct LLM API calls for evalbuff, replacing Claude CLI spawning. 
+ * + * Using the API directly is 2-5x faster than spawning `claude` CLI: + * - No process startup overhead (~5s saved per call) + * - No CLAUDE.md/AGENTS.md context pollution + * - Structured JSON output with schema validation + * - Better error handling and retry logic + */ +import { createAnthropic } from '@ai-sdk/anthropic' +import { generateText } from 'ai' + +const anthropic = createAnthropic() + +const DEFAULT_MODEL = 'claude-sonnet-4-20250514' + +/** + * Generate a task prompt from a commit diff using the LLM API directly. + * Replaces the `claude --dangerously-skip-permissions -p` call in commit-task-generator.ts. + */ +export async function generatePrompt( + systemPrompt: string, + userPrompt: string, +): Promise { + const result = await generateText({ + model: anthropic(DEFAULT_MODEL), + system: systemPrompt, + prompt: userPrompt, + maxOutputTokens: 500, + temperature: 0.3, + }) + + return result.text.trim() +} + +/** + * Analyze a failure and suggest a doc edit using the LLM API directly. + * Replaces the `claude --dangerously-skip-permissions -p` call in docs-optimizer.ts. + * + * Returns raw JSON string (caller handles parsing). 
+ */ +export async function analyzeFailureViaApi( + prompt: string, +): Promise { + const result = await generateText({ + model: anthropic(DEFAULT_MODEL), + prompt, + maxOutputTokens: 4096, + temperature: 0.2, + }) + + return result.text.trim() +} diff --git a/evalbuff/src/run-e2e-test.ts b/evalbuff/src/run-e2e-test.ts index 56840ed5e..bb6f576f1 100644 --- a/evalbuff/src/run-e2e-test.ts +++ b/evalbuff/src/run-e2e-test.ts @@ -236,7 +236,7 @@ async function main() { await runLearnMode({ mode: 'learn', repoPath: PROJECT_DIR, - agentCommand: 'codebuff --agent base2-free', + agentId: 'base2-free-evals', parallelism: 2, maxCostUsd: 10, agentTimeoutMs: 5 * 60 * 1000, diff --git a/evalbuff/src/run-evalbuff.ts b/evalbuff/src/run-evalbuff.ts index 54b257c2a..40ebcdfa8 100644 --- a/evalbuff/src/run-evalbuff.ts +++ b/evalbuff/src/run-evalbuff.ts @@ -2,8 +2,9 @@ import { execSync } from 'child_process' import fs from 'fs' import path from 'path' +import { CodebuffClient, loadLocalAgents } from '@codebuff/sdk' + import { buildCommitTask, getCommitList } from './commit-task-generator' -import { runCliAgent } from './cli-runner' import { getCriteriaForLevel, loadCriteria, @@ -22,6 +23,7 @@ import { appendLogEntry, generateMorningReport, } from './morning-report' +import { CodebuffRunner } from './runners/codebuff' import { withTestRepo } from './test-repo-utils' import type { QualityCriteria } from './criteria' @@ -58,7 +60,8 @@ function saveState(statePath: string, state: EvalbuffState): void { export interface EvalbuffOptions { repoPath: string - agentCommand: string + agentCommand?: string // deprecated — kept for backward compat with CLI runner + agentId: string // codebuff agent ID, e.g. 
'base2-free-evals' parallelism: number maxCostUsd: number agentTimeoutMs: number @@ -89,10 +92,13 @@ interface ParallelRunResult { } async function runAgentsInParallel(opts: { - agentCommand: string + client: CodebuffClient + agentId: string + agentDefinitions: any[] prompt: string repoPath: string repoUrl: string + localRepoPath?: string parentSha: string initCommand?: string groundTruthDiff?: string @@ -103,9 +109,12 @@ async function runAgentsInParallel(opts: { docsSourcePath: string // path to the repo where docs/ lives }): Promise { const { - agentCommand, + client, + agentId, + agentDefinitions, prompt, repoUrl, + localRepoPath, parentSha, initCommand, groundTruthDiff, @@ -118,20 +127,53 @@ async function runAgentsInParallel(opts: { const runOne = async (idx: number) => { return withTestRepo( - { repoUrl, parentSha, initCommand }, + { repoUrl, localRepoPath, parentSha, initCommand }, async (repoDir) => { // Copy current docs into the test repo copyDocsIntoRepo(docsSourcePath, repoDir) - console.log(` [Run ${idx + 1}/${parallelism}] Running agent...`) - const result = await runCliAgent({ - command: agentCommand, - prompt, + console.log(` [Run ${idx + 1}/${parallelism}] Running agent via SDK...`) + const shortSha = parentSha.slice(0, 8) + const runner = new CodebuffRunner({ cwd: repoDir, - timeoutMs: agentTimeoutMs, + client, + agentId, + localAgentDefinitions: agentDefinitions, + printEvents: false, + commitId: shortSha, + parentSha, }) - const costEstimate = result.durationMs * 0.00001 + let result: Awaited> + try { + result = await runner.run(prompt) + } catch (runError) { + // Infrastructure errors (503s, timeouts) should not produce a 0 score. + // Return a sentinel so the caller can detect and handle it. + const errMsg = runError instanceof Error ? 
runError.message : String(runError) + console.warn(` [Run ${idx + 1}/${parallelism}] Agent failed: ${errMsg.slice(0, 200)}`) + return { + score: -1, // sentinel: infrastructure failure + diff: '', + agentTrace: `Agent error: ${errMsg}`, + judging: { + analysis: `Agent failed: ${errMsg.slice(0, 500)}`, + strengths: [], + weaknesses: ['Agent failed due to infrastructure error'], + e2eTestsPerformed: [], + completionScore: -1, + codeQualityScore: -1, + e2eScore: -1, + overallScore: -1, + }, + costEstimate: 0, + } + } + + // Serialize trace steps as JSON for the doc writer to analyze + const agentTrace = result.steps + .map((step) => JSON.stringify(step)) + .join('\n') console.log(` [Run ${idx + 1}/${parallelism}] Judging...`) const judging = await judgeTaskResult({ @@ -139,7 +181,7 @@ async function runAgentsInParallel(opts: { agentDiff: result.diff, groundTruthDiff, repoDir, - error: result.exitCode !== 0 ? result.stderr : undefined, + error: result.diff === '' ? 'Agent made no changes' : undefined, criteria, reviewerAgents, }) @@ -147,21 +189,40 @@ async function runAgentsInParallel(opts: { return { score: judging.overallScore, diff: result.diff, - agentTrace: result.stdout, + agentTrace, judging, - costEstimate, + costEstimate: result.totalCostUsd, } }, ) } - const results = await Promise.all( + const allResults = await Promise.all( Array.from({ length: parallelism }, (_, i) => runOne(i)), ) + // Filter out infrastructure failures (score === -1) + const results = allResults.filter((r) => r.score >= 0) + const totalCost = allResults.reduce((a, r) => a + r.costEstimate, 0) + + if (results.length === 0) { + console.warn(` All ${parallelism} agent runs failed (infrastructure errors)`) + return { + avgScore: -1, + scores: [], + diffs: [], + agentTraces: allResults.map((r) => r.agentTrace), + judgings: [], + costEstimate: totalCost, + } + } + + if (results.length < allResults.length) { + console.warn(` ${allResults.length - results.length}/${allResults.length} runs 
failed, using ${results.length} valid results`) + } + const scores = results.map((r) => r.score) const avgScore = scores.reduce((a, b) => a + b, 0) / scores.length - const totalCost = results.reduce((a, r) => a + r.costEstimate, 0) return { avgScore, @@ -227,12 +288,16 @@ function copyDocsIntoRepo( async function improveDocs(opts: { taskId: string prompt: string + commitMessage?: string repoPath: string repoUrl: string + localRepoPath?: string parentSha: string initCommand?: string groundTruthDiff?: string - agentCommand: string + client: CodebuffClient + agentId: string + agentDefinitions: any[] parallelism: number agentTimeoutMs: number criteria: QualityCriteria @@ -247,12 +312,16 @@ async function improveDocs(opts: { const { taskId, prompt, + commitMessage, repoPath, repoUrl, + localRepoPath, parentSha, initCommand, groundTruthDiff, - agentCommand, + client, + agentId, + agentDefinitions, parallelism, agentTimeoutMs, criteria, @@ -266,10 +335,13 @@ async function improveDocs(opts: { // Step 1: Baseline run console.log(`\n Running ${parallelism} agents in parallel (baseline)...`) const baseline = await runAgentsInParallel({ - agentCommand, + client, + agentId, + agentDefinitions, prompt, repoPath, repoUrl, + localRepoPath, parentSha, initCommand, groundTruthDiff, @@ -284,6 +356,31 @@ async function improveDocs(opts: { let currentScore = baseline.avgScore console.log(` Baseline score: ${currentScore.toFixed(1)}/10 (scores: ${baseline.scores.map((s) => s.toFixed(1)).join(', ')})`) + // All agents failed — skip this task entirely + if (currentScore < 0) { + console.log(` All agent runs failed, skipping task.`) + return { + finalScore: 0, + baselineScore: 0, + docsKept: [], + docsRejected: [], + totalCost, + } + } + + // Early stopping: if baseline is already excellent, skip improvement loop + const EARLY_STOP_THRESHOLD = 9.0 + if (currentScore >= EARLY_STOP_THRESHOLD) { + console.log(` Baseline score ${currentScore.toFixed(1)} >= ${EARLY_STOP_THRESHOLD}, skipping 
improvement loop.`) + return { + finalScore: currentScore, + baselineScore: baseline.avgScore, + docsKept: [], + docsRejected: [], + totalCost: totalCost, + } + } + // Step 2: Iterative doc improvement let improving = true const MAX_IMPROVEMENT_ITERATIONS = 5 @@ -319,6 +416,7 @@ async function improveDocs(opts: { groundTruthDiff, currentDocs, editHistory, + commitMessage, }) if (!docSuggestion) { @@ -341,10 +439,13 @@ async function improveDocs(opts: { // Re-run with new docs console.log(` Re-running ${parallelism} agents with new docs...`) const rerun = await runAgentsInParallel({ - agentCommand, + client, + agentId, + agentDefinitions, prompt, repoPath, repoUrl, + localRepoPath, parentSha, initCommand, groundTruthDiff, @@ -356,11 +457,26 @@ async function improveDocs(opts: { }) totalCost += rerun.costEstimate - const comparison = compareScores(currentScore, rerun.avgScore) + // If re-run failed entirely, don't count it as a rejection + if (rerun.avgScore < 0) { + console.log(` Re-run failed (infrastructure errors), reverting doc and retrying later.`) + if (previousContent !== null) { + applyDocEdit(repoPath, docSuggestion.suggestedDocPath, previousContent) + } else { + revertDocEdit(repoPath, docSuggestion.suggestedDocPath) + } + break + } + + const comparison = compareScores(currentScore, rerun.avgScore, parallelism) console.log(` New score: ${rerun.avgScore.toFixed(1)}/10 (${comparison}) (scores: ${rerun.scores.map((s) => s.toFixed(1)).join(', ')})`) - if (comparison === 'improved') { - console.log(` Keeping doc: ${docSuggestion.suggestedDocPath}`) + if (comparison === 'improved' || comparison === 'same') { + // 'improved' = clear signal the doc helps + // 'same' = within noise range — keep it (benefit of the doubt, + // especially at low parallelism where variance is high) + const reason = comparison === 'improved' ? 
'score improved' : 'within noise range, keeping' + console.log(` Keeping doc: ${docSuggestion.suggestedDocPath} (${reason})`) docsKept.push({ path: docSuggestion.suggestedDocPath, reasoning: docSuggestion.reasoning, @@ -388,7 +504,7 @@ async function improveDocs(opts: { // Continue loop — try to improve more } else { - console.log(` Rejecting doc: ${docSuggestion.suggestedDocPath} (score didn't improve)`) + console.log(` Rejecting doc: ${docSuggestion.suggestedDocPath} (score dropped significantly)`) docsRejected.push({ path: docSuggestion.suggestedDocPath, reasoning: docSuggestion.reasoning, @@ -423,7 +539,7 @@ async function improveDocs(opts: { export async function runLearnMode(options: LearnOptions): Promise { const { repoPath, - agentCommand, + agentId, parallelism, maxCostUsd, agentTimeoutMs, @@ -441,6 +557,13 @@ export async function runLearnMode(options: LearnOptions): Promise { const state = loadState(statePath) let criteria = loadCriteria(defaultCriteriaPath) + // Initialize codebuff SDK client and load agent definitions + const client = new CodebuffClient({ cwd: repoPath }) + const agentsDir = path.resolve(__dirname, '../../agents') + const loadedAgents = await loadLocalAgents({ agentsPath: agentsDir }) + const agentDefinitions = Object.values(loadedAgents) + console.log(`Loaded ${agentDefinitions.length} agent definitions from ${agentsDir}`) + // Get the repo's remote URL let repoUrl: string try { @@ -464,7 +587,7 @@ export async function runLearnMode(options: LearnOptions): Promise { console.log(`Evalbuff Learn Mode:`) console.log(` Repo: ${repoPath}`) console.log(` Remote: ${repoUrl}`) - console.log(` Agent: ${agentCommand}`) + console.log(` Agent: ${agentId}`) console.log(` Parallelism: ${parallelism}`) console.log(` Reviewer agents: ${(reviewerAgents || ['claude', 'codex']).join(', ')}`) console.log(` Commits to process: ${commits.length}`) @@ -520,12 +643,16 @@ export async function runLearnMode(options: LearnOptions): Promise { const result = 
await improveDocs({ taskId: shortSha, prompt: task.prompt, + commitMessage: task.message, repoPath, repoUrl, + localRepoPath: repoPath, parentSha: task.parentSha, initCommand, groundTruthDiff: task.diff, - agentCommand, + client, + agentId, + agentDefinitions, parallelism, agentTimeoutMs, criteria, @@ -592,7 +719,7 @@ export async function runLearnMode(options: LearnOptions): Promise { export async function runPromptMode(options: PromptOptions): Promise { const { repoPath, - agentCommand, + agentId, parallelism, maxCostUsd, agentTimeoutMs, @@ -608,6 +735,12 @@ export async function runPromptMode(options: PromptOptions): Promise { const criteria = loadCriteria(defaultCriteriaPath) + // Initialize codebuff SDK client and load agent definitions + const client = new CodebuffClient({ cwd: repoPath }) + const agentsDir = path.resolve(__dirname, '../../agents') + const loadedAgents = await loadLocalAgents({ agentsPath: agentsDir }) + const agentDefinitions = Object.values(loadedAgents) + let repoUrl: string try { repoUrl = execSync('git remote get-url origin', { @@ -629,7 +762,7 @@ export async function runPromptMode(options: PromptOptions): Promise { console.log(`Evalbuff Prompt Mode:`) console.log(` Repo: ${repoPath}`) console.log(` Remote: ${repoUrl}`) - console.log(` Agent: ${agentCommand}`) + console.log(` Agent: ${agentId}`) console.log(` Parallelism: ${parallelism}`) console.log(` Reviewer agents: ${(reviewerAgents || ['claude', 'codex']).join(', ')}`) console.log(` Max cost: $${maxCostUsd}`) @@ -656,10 +789,13 @@ export async function runPromptMode(options: PromptOptions): Promise { prompt, repoPath, repoUrl, + localRepoPath: repoPath, parentSha: headSha, initCommand, // No ground truth diff in prompt mode - agentCommand, + client, + agentId, + agentDefinitions, parallelism, agentTimeoutMs, criteria, @@ -709,7 +845,7 @@ async function main() { const hasArg = (name: string): boolean => args.includes(`--${name}`) const repoPath = getArg('repo') - const agentCommand 
= getArg('agent', 'codebuff --agent base2-free') + const agentId = getArg('agent', 'base2-free-evals') const parallelism = parseInt(getArg('parallelism', '5')) const maxCostUsd = parseFloat(getArg('max-cost', '100')) const agentTimeoutMs = parseInt(getArg('agent-timeout', '300000')) @@ -728,7 +864,7 @@ async function main() { await runPromptMode({ mode: 'prompt', repoPath, - agentCommand, + agentId, parallelism, maxCostUsd, agentTimeoutMs, @@ -743,7 +879,7 @@ async function main() { await runLearnMode({ mode: 'learn', repoPath, - agentCommand, + agentId, parallelism, maxCostUsd, agentTimeoutMs, diff --git a/evalbuff/src/test-repo-utils.ts b/evalbuff/src/test-repo-utils.ts index 60039a3a6..7c1ba6700 100644 --- a/evalbuff/src/test-repo-utils.ts +++ b/evalbuff/src/test-repo-utils.ts @@ -7,11 +7,16 @@ import { getErrorObject } from '@codebuff/common/util/error' /** * Helper function to manage test repository lifecycle - * Sets up a test repo, runs a function with the repo cwd, then cleans up + * Sets up a test repo, runs a function with the repo cwd, then cleans up. + * + * When localRepoPath is provided, uses a local clone (near-instant via hardlinks) + * instead of a remote clone (5-30s per clone). This is the single biggest + * speedup in evalbuff — with parallelism=5, saves 10-30 remote clones per commit. */ export const withTestRepo = async ( repoConfig: { repoUrl: string + localRepoPath?: string // The sha of the commit to checkout. If you have a commit with changes to replicate, you would check out the parent commit. 
parentSha: string initCommand?: string @@ -19,20 +24,27 @@ export const withTestRepo = async ( }, fn: (cwd: string) => Promise, ): Promise => { - const { repoUrl, parentSha, initCommand, env } = repoConfig + const { repoUrl, localRepoPath, parentSha, initCommand, env } = repoConfig // Create a temporary directory for the test repo const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codebuff-eval-')) const repoDir = path.join(tempDir, 'repo') try { - execSync(`git clone --depth 1 ${repoUrl} ${repoDir}`, { stdio: 'ignore' }) - - execSync(`git fetch --depth 1 origin ${parentSha}`, { - cwd: repoDir, - stdio: 'ignore', - }) - execSync(`git checkout ${parentSha}`, { cwd: repoDir, stdio: 'ignore' }) + if (localRepoPath) { + // Local clone: uses hardlinks for objects, nearly instant + execSync(`git clone --no-checkout "${localRepoPath}" "${repoDir}"`, { stdio: 'ignore' }) + execSync(`git checkout ${parentSha}`, { cwd: repoDir, stdio: 'ignore' }) + } else { + // Remote clone: slow but works without local repo + execSync(`git clone --depth 1 ${repoUrl} ${repoDir}`, { stdio: 'ignore' }) + + execSync(`git fetch --depth 1 origin ${parentSha}`, { + cwd: repoDir, + stdio: 'ignore', + }) + execSync(`git checkout ${parentSha}`, { cwd: repoDir, stdio: 'ignore' }) + } if (initCommand) { console.log(`Running init command: ${initCommand}...`) From a845783db753a8482137924a8ca3f6b92c6bbd2c Mon Sep 17 00:00:00 2001 From: James Grugett Date: Mon, 30 Mar 2026 11:52:29 -0700 Subject: [PATCH 09/12] Remove overfit pattern docs and simplify compareScores MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Delete all docs/patterns/** files generated by evalbuff — they overfit to specific commits rather than teaching generalizable principles. Simplify compareScores now that parallelism is always 5. 
Co-Authored-By: Claude Opus 4.6 --- AGENTS.md | 7 - .../existing-implementation-validation.md | 150 -------------- docs/patterns/handle-steps-generators.md | 180 ----------------- docs/patterns/implementation-validation.md | 171 ---------------- docs/patterns/task-completion-validation.md | 130 ------------ docs/patterns/task-scope-adherence.md | 139 ------------- docs/patterns/task-type-identification.md | 160 --------------- docs/patterns/template-literal-escaping.md | 88 -------- .../terminal-alternate-screen-buffer.md | 191 ------------------ evalbuff/src/docs-optimizer.ts | 14 +- evalbuff/src/run-evalbuff.ts | 2 +- 11 files changed, 4 insertions(+), 1228 deletions(-) delete mode 100644 docs/patterns/existing-implementation-validation.md delete mode 100644 docs/patterns/handle-steps-generators.md delete mode 100644 docs/patterns/implementation-validation.md delete mode 100644 docs/patterns/task-completion-validation.md delete mode 100644 docs/patterns/task-scope-adherence.md delete mode 100644 docs/patterns/task-type-identification.md delete mode 100644 docs/patterns/template-literal-escaping.md delete mode 100644 docs/patterns/terminal-alternate-screen-buffer.md diff --git a/AGENTS.md b/AGENTS.md index 721b5575e..ca06ab44c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -43,10 +43,3 @@ Make an efficient learning agent that can do anything. 
- [`docs/agents-and-tools.md`](docs/agents-and-tools.md) — Agent system, shell shims, tool definitions - [`docs/patterns/handle-steps-generators.md`](docs/patterns/handle-steps-generators.md) — handleSteps generator patterns and spawn_agents tool calls - [docs/evalbuff/interpreting-task-prompts.md](docs/evalbuff/interpreting-task-prompts.md) -- [docs/patterns/task-completion-validation.md](docs/patterns/task-completion-validation.md) -- [docs/patterns/terminal-alternate-screen-buffer.md](docs/patterns/terminal-alternate-screen-buffer.md) -- [docs/patterns/template-literal-escaping.md](docs/patterns/template-literal-escaping.md) -- [docs/patterns/task-scope-adherence.md](docs/patterns/task-scope-adherence.md) -- [docs/patterns/task-type-identification.md](docs/patterns/task-type-identification.md) -- [docs/patterns/implementation-validation.md](docs/patterns/implementation-validation.md) -- [docs/patterns/existing-implementation-validation.md](docs/patterns/existing-implementation-validation.md) diff --git a/docs/patterns/existing-implementation-validation.md b/docs/patterns/existing-implementation-validation.md deleted file mode 100644 index fa54821ef..000000000 --- a/docs/patterns/existing-implementation-validation.md +++ /dev/null @@ -1,150 +0,0 @@ -# Existing Implementation Validation Pattern - -When you discover existing code that appears to implement the requested functionality, validate three things: location correctness, functionality, and clear communication to the user. 
- -## The Problem - -Agents often find existing implementations and immediately conclude the task is done without: -- Verifying the implementation is in the correct location/context -- Testing that it actually works as expected -- Clearly communicating the task status to the user - -This leads to: -- Missing implementations in the wrong location (main app vs specific sub-app) -- Assuming code works without verification -- Silent completion that leaves users uncertain - -## The Three-Step Validation Process - -### 1. Location Validation - -Even if the functionality exists somewhere in the codebase, verify it's in the RIGHT place: - -```bash -# Task: "Add redirects to freebuff/web/next.config.mjs" -# Found: Redirects in web/next.config.mjs -# Problem: Wrong location - freebuff/web is a separate app -``` - -**Check:** -- Is this the correct file path mentioned in the task? -- Is this the right application/service (monorepo apps are separate) -- Does the implementation serve the intended use case? - -### 2. Functionality Validation - -Don't assume existing code works. Test the actual behavior: - -```bash -# For Next.js redirects -curl -I http://localhost:PORT/b/test123 -# Should return 307/308 with correct Location header - -# For API endpoints -curl http://localhost:PORT/api/endpoint -# Should return expected response - -# For features -# Test the actual user-facing behavior -``` - -**Verify:** -- The implementation is active (not just configured) -- It handles the exact patterns requested -- Edge cases work (query parameters, special characters) - -### 3. 
Clear Communication - -Always explicitly tell the user what you found and what the status is: - -**✅ When implementation exists and works in the right place:** -``` -I found the redirect configuration in freebuff/web/next.config.mjs: - -[show the code] - -Testing confirms it works correctly: -$ curl -I http://localhost:3001/b/test123 -HTTP/1.1 307 Temporary Redirect -Location: https://go.trybeluga.ai/test123 - -The feature is already implemented and functional. No changes needed. -``` - -**✅ When implementation exists but in wrong location:** -``` -I found redirect configuration in the main web/next.config.mjs, but the task requires it in freebuff/web/next.config.mjs (separate app). - -Adding the redirect to freebuff/web/next.config.mjs: - -[show the changes] - -This will enable the redirect specifically for the freebuff web app. -``` - -**✅ When implementation doesn't exist:** -``` -I need to add the redirect configuration to freebuff/web/next.config.mjs: - -[show the changes] - -This will redirect /b/:hash to https://go.trybeluga.ai/:hash as requested. -``` - -## Context-Specific Validation - -### Monorepo Applications - -In monorepos, similar functionality may exist in multiple apps: -- `web/` - main application -- `freebuff/web/` - separate free tier app -- Each has its own `next.config.mjs`, `package.json`, etc. 
- -**Always check the specific path mentioned in the task.** - -### Configuration vs Runtime - -Configuration existing ≠ feature working: -- **Next.js redirects:** Must be in active config AND server restarted -- **API routes:** Must be in correct location AND export right methods -- **Environment variables:** Must be set in runtime environment - -### Test Coverage - -If tests exist, use them as validation: -```typescript -// If you find tests like this: -test('redirects to go.trybeluga.ai with the hash', async ({ request }) => { - const response = await request.get('/b/test123', { maxRedirects: 0 }) - expect(response.status()).toBe(307) - expect(response.headers()['location']).toBe('https://go.trybeluga.ai/test123') -}) - -// Run them to verify functionality -bun test path/to/redirect.test.ts -``` - -## Anti-Patterns to Avoid - -❌ **Silent assumption:** -``` -// Found redirect in web/next.config.mjs, task must be done -// (Without checking if it's the right location or communicating status) -``` - -❌ **Location confusion:** -``` -// Task asks for freebuff/web/next.config.mjs -// Found in web/next.config.mjs -// Assumed they're the same thing -``` - -❌ **No status communication:** -``` -// Making no changes without explaining why -// User left wondering if their request was handled -``` - -## Key Principle - -**Existing code is only a solution if it's in the right place, works correctly, and serves the intended use case.** Always validate all three before claiming task completion. \ No newline at end of file diff --git a/docs/patterns/handle-steps-generators.md b/docs/patterns/handle-steps-generators.md deleted file mode 100644 index a3db4b672..000000000 --- a/docs/patterns/handle-steps-generators.md +++ /dev/null @@ -1,180 +0,0 @@ -# handleSteps Generator Pattern for Programmatic Agents - -When creating agents that use `handleSteps` generators to programmatically execute tool calls, follow these exact patterns to avoid TypeScript compilation errors. 
- -## Correct handleSteps Signature - -```typescript -import type { AgentDefinition } from '../types/agent-definition' - -const definition: AgentDefinition = { - // ... other fields - - handleSteps: function* ({ agentState, prompt, params }) { - // Generator body - }, -} -``` - -## Yielding Tool Calls - -Yield objects with `toolName` and `input` properties. The input schema must match the tool's expected parameters exactly. - -### spawn_agents Tool - -```typescript -handleSteps: function* ({ agentState, prompt, params }) { - const promptWithDefault = prompt ?? 'Default prompt' - - yield { - toolName: 'spawn_agents', - input: { - agents: [ - { - agent_type: 'agent-id-1', - prompt: promptWithDefault, - }, - { - agent_type: 'agent-id-2', - prompt: promptWithDefault, - }, - ], - }, - } - - // After tool execution, yield 'STEP' to let the agent process results - yield 'STEP' -}, -``` - -### Common Mistakes - -**WRONG:** Using incorrect property names or nested structures -```typescript -// ❌ Incorrect - wrong tool call structure -yield { - type: 'tool_call', - name: 'spawn_agents', - arguments: { ... } -} -``` - -**WRONG:** Using `think_deeply` or custom tool names that don't exist -```typescript -// ❌ Incorrect - this tool doesn't exist -yield { - toolName: 'think_deeply', - input: { ... } -} -``` - -**CORRECT:** Use `toolName` and `input` at the top level -```typescript -// ✅ Correct -yield { - toolName: 'spawn_agents', - input: { - agents: [{ agent_type: 'my-agent', prompt: 'Do something' }] - } -} -``` - -## Yielding STEP - -After yielding tool calls, yield the string `'STEP'` to let the main agent process the results: - -```typescript -handleSteps: function* ({ prompt }) { - yield { - toolName: 'spawn_agents', - input: { agents: [...] }, - } - - // This tells the runtime to run an LLM step to process spawn results - yield 'STEP' -}, -``` - -## Agent Definition Requirements for Spawning - -Agents that spawn sub-agents must include: - -1. 
`toolNames: ['spawn_agents']` - Enable the spawn tool -2. `spawnableAgents: ['agent-id-1', 'agent-id-2']` - List allowed sub-agents - -```typescript -const definition: AgentDefinition = { - id: 'coordinator', - model: 'openai/gpt-5', - toolNames: ['spawn_agents'], - spawnableAgents: ['sub-agent-1', 'sub-agent-2', 'sub-agent-3'], - // ... -} -``` - -## Complete Example: Multi-Model Coordinator - -See `.agents/deep-thinking/deep-thinker.ts` for a working example: - -```typescript -import type { AgentDefinition } from '../types/agent-definition' - -const definition: AgentDefinition = { - id: 'deep-thinker', - displayName: 'Deep Thinker Agent', - model: 'openai/gpt-5', - - toolNames: ['spawn_agents'], - spawnableAgents: ['gpt5-thinker', 'sonnet-thinker', 'gemini-thinker'], - - inputSchema: { - prompt: { - type: 'string', - description: 'The topic to analyze', - }, - }, - - outputMode: 'last_message', - - handleSteps: function* ({ prompt }) { - const promptWithDefault = prompt ?? 'Think about this topic' - - yield { - toolName: 'spawn_agents', - input: { - agents: [ - { agent_type: 'gpt5-thinker', prompt: promptWithDefault }, - { agent_type: 'sonnet-thinker', prompt: promptWithDefault }, - { agent_type: 'gemini-thinker', prompt: promptWithDefault }, - ], - }, - } - - yield 'STEP' - }, -} - -export default definition -``` - -## Directory Structure - -Place related agents in subdirectories under `.agents/`: - -``` -.agents/ -└── deep-thinking/ - ├── deep-thinker.ts # Coordinator - ├── deepest-thinker.ts # Meta-coordinator - ├── gpt5-thinker.ts # Sub-agent - ├── sonnet-thinker.ts # Sub-agent - └── gemini-thinker.ts # Sub-agent -``` - -## Avoid Over-Engineering - -When implementing agents: -- Only create files that are directly requested -- Don't add documentation files unless explicitly asked -- Keep agent definitions simple - use `AgentDefinition` type, not custom wrappers -- Don't create factory patterns unless there's clear reuse need \ No newline at end of file diff 
--git a/docs/patterns/implementation-validation.md b/docs/patterns/implementation-validation.md deleted file mode 100644 index da1f3a829..000000000 --- a/docs/patterns/implementation-validation.md +++ /dev/null @@ -1,171 +0,0 @@ -# Implementation Validation Pattern - -When you find existing code that appears to implement the requested functionality, always validate that it actually works before claiming the task is complete. - -## The Problem - -Agents often discover existing implementations and immediately conclude the task is done without testing functionality. This leads to: -- Claiming features work when they don't -- Missing broken configurations that look correct in code -- Failing to communicate actual task status clearly -- Users left uncertain whether their request was fulfilled - -## The Validation Process - -### 1. Find the Implementation - -When you discover code that seems to match the requirements: - -```bash -# Found redirect in next.config.mjs -{ - source: '/b/:hash', - destination: 'https://go.trybeluga.ai/:hash', - permanent: false, -} -``` - -### 2. Test the Actual Functionality - -Don't assume code works just because it exists. Test the user-facing behavior: - -```bash -# For Next.js redirects - test the actual redirect -curl -I http://localhost:3000/b/test123 -# Should return 307/308 with Location header - -# For API endpoints - test the endpoint -curl http://localhost:3000/api/endpoint -# Should return expected response - -# For UI features - verify the user flow works -``` - -### 3. Check Runtime vs Configuration - -Code existing ≠ code working. Verify the implementation is active: - -- **Next.js redirects**: Must be in active config AND server restarted -- **API routes**: File must exist in correct location AND export right methods -- **Features**: Code must be imported/called from the right places -- **Environment**: Required env vars, database state, etc. - -### 4. 
Validate Against Exact Requirements - -Even working code might not match the specific requirements: - -- **URL patterns**: `/b/:hash` vs `/b/:id` vs `/buffer/:hash` -- **Destinations**: `https://go.trybeluga.ai/:hash` vs other domains -- **Behavior**: temporary vs permanent redirects, query handling - -## Response Patterns - -### ✅ When Implementation Works - -``` -I found the redirect configuration in next.config.mjs and verified it works: - -[show the existing code] - -Testing confirms functionality: -$ curl -I http://localhost:3000/b/test123 -HTTP/1.1 307 Temporary Redirect -Location: https://go.trybeluga.ai/test123 - -The feature is already implemented and working correctly. -``` - -### ✅ When Implementation Exists But Broken - -``` -I found redirect configuration in next.config.mjs, but testing shows it's not working: - -[show the existing code] - -Testing reveals the issue: -$ curl -I http://localhost:3000/b/test123 -HTTP/1.1 404 Not Found - -The server needs to be restarted for redirects to take effect. -After restart, the redirect works correctly. -``` - -### ✅ When Implementation Is Missing - -``` -I need to add the redirect configuration to next.config.mjs: - -[show the changes being made] - -This will redirect /b/:hash to https://go.trybeluga.ai/:hash as requested. -``` - -## Testing Strategies by Feature Type - -### Web Redirects -```bash -curl -I http://localhost:PORT/path -# Check status code (307/308) and Location header -``` - -### API Endpoints -```bash -curl http://localhost:PORT/api/endpoint -# Check response status and body -``` - -### Database Features -```bash -# Check if tables/columns exist -# Verify data can be inserted/queried -# Test constraints and relationships -``` - -### UI Components -```bash -# Start dev server and manually test -# Or run existing test suite -bun test path/to/feature.test.ts -``` - -## Use Existing Tests When Available - -If tests exist for the feature: - -1. 
**Read the test file** to understand expected behavior -2. **Run the tests** to verify current implementation -3. **Report test results** as evidence of functionality - -Example: -```typescript -test('redirects to go.trybeluga.ai with the hash', async ({ request }) => { - const response = await request.get('/b/test123', { maxRedirects: 0 }) - expect(response.status()).toBe(307) - expect(response.headers()['location']).toBe('https://go.trybeluga.ai/test123') -}) -``` - -If these tests exist and pass, the feature is confirmed working. - -## Anti-Patterns to Avoid - -❌ **Assuming functionality without testing**: -``` -"The redirect is already implemented in next.config.mjs. No changes needed." -// Without actually verifying it works -``` - -❌ **Confusing configuration with functionality**: -``` -// Code exists in config file ≠ feature works at runtime -``` - -❌ **Silent completion without communication**: -``` -// Making no changes and not explaining why -// Leaving user unsure if their request was handled -``` - -## Key Principle - -**Code that looks right might not work right.** Always test the actual user-facing behavior before claiming a feature is complete. When in doubt, verify through testing rather than code inspection alone. \ No newline at end of file diff --git a/docs/patterns/task-completion-validation.md b/docs/patterns/task-completion-validation.md deleted file mode 100644 index 44668fb4c..000000000 --- a/docs/patterns/task-completion-validation.md +++ /dev/null @@ -1,130 +0,0 @@ -# Task Completion Validation Pattern - -When implementing features, always validate whether the requested functionality already exists before making changes. Many tasks may already be complete or partially implemented. - -## The Validation Process - -### 1. 
Search for Existing Implementation - -Before implementing any feature, search the codebase for existing implementations: - -```bash -# Search for relevant patterns -code_search "pattern related to feature" -# Check configuration files -read_files ["config-file.js", "next.config.mjs"] -``` - -### 2. Verify Implementation Completeness - -When you find existing code, verify it matches the requirements: - -- **Check the exact pattern**: Does `/b/:hash` match the requested route? -- **Verify the destination**: Does `https://go.trybeluga.ai/:hash` match the target? -- **Confirm the behavior**: Is `permanent: false` the correct redirect type? - -### 3. Test the Implementation - -If implementation exists, verify it works: - -```bash -# Look for existing tests -code_search "test.*redirect.*b/" -# Run relevant tests if they exist -run_terminal_command "bun test path/to/redirect/tests" -``` - -### 4. Communicate Task Status Clearly - -Always provide explicit feedback about task completion: - -**✅ When feature is already implemented:** -``` -The redirect for /b/:hash is already implemented in next.config.mjs: - -[show the relevant code] - -This configuration: -- Redirects /b/:hash to https://go.trybeluga.ai/:hash -- Uses temporary redirects (permanent: false) -- Preserves query parameters - -No additional changes needed - the feature is complete! -``` - -**✅ When making changes:** -``` -I need to add the redirect configuration to next.config.mjs: - -[show the changes being made] - -This will enable the requested /b/:hash → go.trybeluga.ai functionality. 
-``` - -## Common Scenarios - -### Next.js Redirects - -For Next.js redirect tasks, always check `next.config.mjs` first: - -```javascript -// Look for existing redirects() function -async redirects() { - return [ - { - source: '/pattern', - destination: 'https://target.com/pattern', - permanent: false, - }, - ] -} -``` - -### API Routes - -For API endpoint tasks, check existing route files: - -```bash -# Check if route already exists -list_directory "src/app/api/target-path" -read_files ["src/app/api/target-path/route.ts"] -``` - -### Component Features - -For UI feature tasks, search for existing components: - -```bash -code_search "component.*feature.*name" -glob "**/*ComponentName*" -``` - -## Anti-Patterns to Avoid - -❌ **Silent failure**: Making no changes without explanation -❌ **Duplicate implementation**: Adding code when it already exists -❌ **Incomplete verification**: Finding partial implementation but not checking if it's complete -❌ **No status communication**: Leaving the user unsure whether the task succeeded - -## Test Validation - -When tests exist for the feature: - -1. **Read the test file** to understand expected behavior -2. **Run the tests** to verify current implementation works -3. **Report test results** as evidence of completion - -Example from redirect tests: -```typescript -test('redirects to go.trybeluga.ai with the hash', async ({ request }) => { - const response = await request.get('/b/test123', { maxRedirects: 0 }) - expect(response.status()).toBe(307) - expect(response.headers()['location']).toBe('https://go.trybeluga.ai/test123') -}) -``` - -If these tests exist and pass, the feature is confirmed working. - -## Key Principle - -**Always explicitly state whether a task is complete, incomplete, or already done.** Never leave the user guessing about the status of their request. 
\ No newline at end of file diff --git a/docs/patterns/task-scope-adherence.md b/docs/patterns/task-scope-adherence.md deleted file mode 100644 index 4b5731f62..000000000 --- a/docs/patterns/task-scope-adherence.md +++ /dev/null @@ -1,139 +0,0 @@ -# Task Scope Adherence Pattern - -When given a specific technical task, implement ONLY what is explicitly requested. Do not add "helpful" extras, documentation, or related improvements unless specifically asked. - -## The Problem - -Agents often interpret tasks broadly and add related work they think would be helpful: -- Task: "Remove the wait-for command" → Agent also creates 4 new documentation files (548 lines) -- Task: "Fix the redirect" → Agent also refactors related code and adds tests -- Task: "Update the config" → Agent also adds validation and error handling - -While well-intentioned, this scope creep can: -- Introduce unintended changes that break existing functionality -- Make code review more difficult by mixing requested changes with unrequested additions -- Violate the principle of least change in production systems -- Create maintenance burden for code the user didn't ask for - -## The Solution: Strict Scope Adherence - -### 1. Parse the Task Precisely - -Identify the exact scope from the task description: -- "Remove the wait-for command" = delete wait-for implementation and references -- "Add a redirect" = add one specific redirect rule -- "Fix the test" = make the failing test pass - -### 2. 
Implement Only What's Requested - -**DO:** -- Remove the specific command/feature mentioned -- Update direct references to use the replacement -- Fix compilation errors caused by the removal -- Update usage examples that directly reference the removed feature - -**DON'T:** -- Add new documentation files unless specifically requested -- Refactor related code "while you're at it" -- Add validation, error handling, or tests unless they're breaking -- Create helper utilities or abstractions -- Update tangentially related files - -### 3. Resist the Urge to "Improve" - -Common scope creep patterns to avoid: - -```typescript -// Task: Remove deprecated function -// WRONG - adding documentation -const changes = [ - 'Remove oldFunction()', - 'Update all references', - 'Add migration guide', // ❌ Not requested - 'Create best practices doc', // ❌ Not requested - 'Add usage examples' // ❌ Not requested -] - -// CORRECT - minimal scope -const changes = [ - 'Remove oldFunction()', - 'Update direct references to use newFunction()' -] -``` - -## Examples from Real Tasks - -### Task: "Remove wait-for command from tmux CLI agent" - -**Correct scope:** -- Remove wait-for case from bash script -- Update documentation strings to reference wait-idle instead -- Update error message examples -- Fix any compilation errors - -**Scope creep (avoid):** -- Creating new documentation files about task validation -- Adding template literal escaping guides -- Creating terminal buffer management docs -- Updating AGENTS.md with new doc references - -### Task: "Add redirect for /b/:hash" - -**Correct scope:** -- Add one redirect rule to next.config.js -- Verify it compiles - -**Scope creep (avoid):** -- Adding tests for the redirect -- Creating redirect management utilities -- Adding analytics tracking -- Documenting redirect patterns - -## When Additional Work IS Appropriate - -**Exception 1: Compilation/Runtime Errors** -If your minimal change breaks compilation or causes runtime errors, fix those: 
-```typescript -// If removing wait-for breaks template literals, fix the escaping -// If removing a function breaks imports, update the imports -``` - -**Exception 2: Direct Dependencies** -If the change requires updating direct references: -```typescript -// If removing wait-for, update help text that mentions it -// If renaming a function, update its direct callers -``` - -**Exception 3: Explicit "and" in Task** -``` -"Remove wait-for command and add documentation" // Two explicit tasks -"Fix the bug and add a test" // Two explicit tasks -``` - -## Validation Questions - -Before adding anything beyond the core task, ask: -1. **Was this explicitly requested?** If no, don't add it. -2. **Does the minimal change break without this?** If no, don't add it. -3. **Is this a direct reference that must be updated?** If no, don't add it. - -## Communication Pattern - -When completing a task, clearly separate what was requested vs. what you considered: - -``` -✅ Completed: Removed wait-for command from tmux CLI agent -- Removed wait-for case from helper script -- Updated documentation to use wait-idle -- Fixed template literal escaping issues - -💭 Considered but did not implement (not requested): -- Adding comprehensive documentation about wait patterns -- Creating validation guides -- Refactoring related timing code -``` - -## Key Principle - -**The best code change is the smallest one that accomplishes the exact goal.** Resist the urge to "improve while you're there" unless explicitly asked. Production systems value predictability and minimal change over comprehensive improvements. 
\ No newline at end of file diff --git a/docs/patterns/task-type-identification.md b/docs/patterns/task-type-identification.md deleted file mode 100644 index f2ac84018..000000000 --- a/docs/patterns/task-type-identification.md +++ /dev/null @@ -1,160 +0,0 @@ -# Task Type Identification: Implementation vs Documentation - -When given a task, correctly identify whether you're being asked to IMPLEMENT functionality or CREATE documentation. Don't assume a task is about documentation just because you found existing code. - -## The Problem - -Agents often misclassify implementation tasks as documentation tasks when they discover existing code that seems to match the requirements. This leads to: -- Creating unnecessary documentation instead of implementing features -- Claiming tasks are "already complete" without proper validation -- Ignoring the actual user need (working functionality) - -## Task Classification Rules - -### Implementation Tasks (require code changes) - -Keywords that indicate implementation work: -- "Add redirects for..." -- "Implement feature X" -- "Create endpoint that..." -- "Make it so that when..." -- "Fix the bug where..." -- "Update the config to..." - -**Action required:** Write/modify code, test functionality, verify it works. - -### Documentation Tasks (require writing docs) - -Keywords that indicate documentation work: -- "Document how to..." -- "Write a guide for..." -- "Create documentation explaining..." -- "Add README section about..." -- "Explain the architecture of..." - -**Action required:** Write markdown/text files, create examples, explain concepts. - -## When You Find Existing Code - -If you discover code that appears to implement the requested functionality: - -### 1. 
Verify It Actually Works - -```bash -# For Next.js redirects, test the actual redirect -curl -I http://localhost:3000/b/test123 -# Should return 307/308 with Location header - -# For API endpoints, test the endpoint -curl http://localhost:3000/api/endpoint -# Should return expected response - -# For features, test the user flow -# Navigate to the feature and verify it behaves as expected -``` - -### 2. Check Configuration vs Runtime - -Just because code exists doesn't mean it's active: -- **Next.js redirects:** Must be in `next.config.mjs` redirects() function AND server must be restarted -- **API routes:** File must exist in correct location AND export correct HTTP methods -- **Features:** Code must be imported/called from the right places - -### 3. Validate Against Requirements - -Even if code exists, check if it matches the exact requirements: -- **Correct URL pattern:** `/b/:hash` vs `/b/:id` vs `/buffer/:hash` -- **Correct destination:** `https://go.trybeluga.ai/:hash` vs other domains -- **Correct behavior:** temporary vs permanent redirects, query parameter handling - -## Response Patterns - -### ✅ When Implementation Already Works - -``` -I found the redirect configuration in next.config.mjs and tested it: - -[show the existing code] - -Testing confirms it works: -$ curl -I http://localhost:3000/b/test123 -HTTP/1.1 307 Temporary Redirect -Location: https://go.trybeluga.ai/test123 - -The feature is already implemented and functional. No changes needed. -``` - -### ✅ When Implementation Exists But Doesn't Work - -``` -I found redirect configuration in next.config.mjs, but testing shows it's not working: - -[show the existing code] - -Testing reveals the issue: -$ curl -I http://localhost:3000/b/test123 -HTTP/1.1 404 Not Found - -The configuration looks correct but the server needs to be restarted. After restart: -[show working test results] - -The feature now works correctly. 
-``` - -### ✅ When Implementation Is Missing - -``` -I need to add the redirect configuration to next.config.mjs: - -[show the changes being made] - -This will redirect /b/:hash to https://go.trybeluga.ai/:hash as requested. -``` - -## Anti-Patterns to Avoid - -❌ **Assuming task completion without testing:** -``` -"The redirect is already implemented in next.config.mjs. No changes needed." -// Without actually testing if it works -``` - -❌ **Creating documentation for implementation tasks:** -``` -// Task: "Add redirects for short URLs" -// WRONG: Creating docs/patterns/redirect-patterns.md -// RIGHT: Modifying next.config.mjs -``` - -❌ **Confusing configuration with functionality:** -``` -// Code exists in config file ≠ feature works -// Must test the actual user-facing behavior -``` - -## Testing Implementation Tasks - -For common implementation types: - -**Web redirects:** -```bash -curl -I http://localhost:PORT/path -# Check status code (307/308) and Location header -``` - -**API endpoints:** -```bash -curl http://localhost:PORT/api/endpoint -# Check response status and body -``` - -**UI features:** -```bash -# Start dev server and manually test in browser -# Or run existing test suite -bun test path/to/feature.test.ts -``` - -## Key Principle - -**Implementation tasks require working code, not documentation about code.** When you find existing code, your job is to verify it works and fix it if it doesn't, not to document how it should work. \ No newline at end of file diff --git a/docs/patterns/template-literal-escaping.md b/docs/patterns/template-literal-escaping.md deleted file mode 100644 index d4ce5c447..000000000 --- a/docs/patterns/template-literal-escaping.md +++ /dev/null @@ -1,88 +0,0 @@ -# Template Literal Escaping Pattern - -When modifying JavaScript/TypeScript code that contains template literals (backtick strings), always escape backticks that appear within the template literal content to prevent syntax errors. 
- -## The Problem - -Template literals use backticks (`) as delimiters. When you have backticks inside the template literal content, they must be escaped or they will break the JavaScript syntax. - -**WRONG:** -```typescript -const message = `Use \`wait-idle\` with send** (e.g., `--wait-idle 3`) to wait for output` -// ^ unescaped backtick breaks syntax -``` - -**CORRECT:** -```typescript -const message = `Use \`wait-idle\` with send** (e.g., \`--wait-idle 3\`) to wait for output` -// ^ properly escaped backticks -``` - -## When This Happens - -This issue commonly occurs when: -- Modifying documentation strings that contain code examples with backticks -- Updating help text or error messages that reference command-line syntax -- Changing template literals that contain markdown-style code formatting -- Replacing text that includes shell command examples - -## The Fix - -When working inside template literals, escape all backticks in the content: - -1. **Find all backticks** in the string content (not the template literal delimiters) -2. **Escape each one** with a backslash: `` ` `` becomes `` \` `` -3. **Verify the syntax** - the opening and closing backticks of the template literal should be the only unescaped ones - -## Examples from Real Code - -### Helper Script Documentation -```typescript -// WRONG - breaks compilation -const helperScript = ` - echo "Commands: send, capture, wait-idle" - # Usage example: helper wait-idle "session" 3 - echo "Use \`--wait-idle 3\` for timing" -` - -// CORRECT - properly escaped -const helperScript = ` - echo "Commands: send, capture, wait-idle" - # Usage example: helper wait-idle "session" 3 - echo "Use \\\`--wait-idle 3\\\` for timing" -` -``` - -### Quick Reference Strings -```typescript -// WRONG -const quickRef = - '- Send + wait: `' + helperPath + ' send "' + sessionName + '" "..." 
--wait-idle 3`\n' + - '- Example usage: `--wait-idle 3` waits for output\n' -// ^ unescaped backticks in concatenated string - -// CORRECT -const quickRef = - '- Send + wait: `' + helperPath + ' send "' + sessionName + '" "..." --wait-idle 3`\n' + - '- Example usage: \`--wait-idle 3\` waits for output\n' -// ^ properly escaped backticks -``` - -## Detection - -Syntax errors from unescaped backticks typically show: -- `TS1005: ',' expected` or `TS1005: ';' expected` -- `TS1003: Identifier expected` -- `error: Expected "}" but found ""` -- Compilation errors pointing to the line with unescaped backticks - -## Prevention - -1. **When modifying template literals**, scan for all backticks in the content -2. **Use find-and-replace** to systematically escape backticks: find `` ` `` replace with `` \` `` -3. **Test compilation** after making changes to catch syntax errors early -4. **Be extra careful** with documentation strings, help text, and code examples - -## Key Rule - -Inside template literals, the only unescaped backticks should be the opening and closing delimiters of the template literal itself. All backticks in the content must be escaped with backslashes. \ No newline at end of file diff --git a/docs/patterns/terminal-alternate-screen-buffer.md b/docs/patterns/terminal-alternate-screen-buffer.md deleted file mode 100644 index 9f7d8fc0b..000000000 --- a/docs/patterns/terminal-alternate-screen-buffer.md +++ /dev/null @@ -1,191 +0,0 @@ -# Terminal Alternate Screen Buffer Pattern - -When building CLI applications with full-screen UIs (like TUI apps), use the alternate screen buffer to prevent UI output from polluting the user's terminal scrollback when the app exits. - -## The Problem - -By default, terminal applications write to the main screen buffer. When a full-screen CLI app exits, all its UI output remains in the terminal scrollback, cluttering the user's terminal history. 
This is annoying for users who expect clean terminal behavior like vim, less, htop, and other well-behaved CLI tools. - -## The Solution: Alternate Screen Buffer - -Terminals support an alternate screen buffer that can be entered/exited using ANSI escape sequences: - -- **Enter alternate screen:** `\x1b[?1049h` (smcup) -- **Exit alternate screen:** `\x1b[?1049l` (rmcup) - -When you enter the alternate screen buffer, the terminal saves the current screen content. When you exit, it restores the original content, leaving the scrollback clean. - -## Implementation Pattern - -### 1. Define the Escape Sequences - -```typescript -// Terminal alternate screen buffer escape sequences -export const ENTER_ALT_BUFFER = '\x1b[?1049h' -export const EXIT_ALT_BUFFER = '\x1b[?1049l' -``` - -### 2. Enter Before Rendering - -Enter the alternate screen buffer BEFORE initializing your UI renderer: - -```typescript -export function enterAlternateScreen(): void { - if (process.stdout.isTTY) { - process.stdout.write(ENTER_ALT_BUFFER) - } -} - -async function main(): Promise { - // Enter alternate screen buffer BEFORE rendering the app - if (process.stdout.isTTY) { - enterAlternateScreen() - } - - // Initialize your UI renderer after entering alternate buffer - const renderer = await createCliRenderer({ ... }) - // ... rest of app initialization -} -``` - -### 3. Exit During Cleanup - -Ensure the alternate screen buffer is exited during all cleanup scenarios: - -```typescript -const TERMINAL_RESET_SEQUENCES = - EXIT_ALT_BUFFER + // Exit alternate screen buffer (restores main screen) - '\x1b[?1000l' + // Disable X10 mouse mode - '\x1b[?1002l' + // Disable button event mouse mode - // ... other terminal reset sequences - '\x1b[?25h' // Show cursor - -function resetTerminalState(): void { - try { - process.stdout.write(TERMINAL_RESET_SEQUENCES) - } catch { - // Ignore errors - stdout may already be closed - } -} -``` - -### 4. 
Handle All Exit Scenarios - -Register cleanup handlers for all possible exit scenarios: - -```typescript -process.on('SIGTERM', cleanup) -process.on('SIGHUP', cleanup) -process.on('SIGINT', cleanup) -process.on('beforeExit', cleanup) -process.on('exit', cleanup) -process.on('uncaughtException', cleanup) -process.on('unhandledRejection', cleanup) -``` - -## Key Considerations - -### TTY Detection - -Only enter alternate screen buffer in interactive terminals: - -```typescript -if (process.stdout.isTTY) { - enterAlternateScreen() -} -``` - -This prevents issues when: -- Output is piped to a file (`app > output.txt`) -- Running in CI/automated environments -- Output is redirected or captured - -### Timing is Critical - -1. **Enter alternate buffer FIRST** - before any UI initialization -2. **Exit alternate buffer LAST** - as part of terminal reset sequences -3. **Write exit sequence directly to stdout** - don't rely on UI renderer cleanup - -### Terminal Compatibility - -The `?1049` sequence is widely supported by modern terminals: -- xterm, gnome-terminal, iTerm2, Terminal.app -- tmux, screen (with proper configuration) -- Windows Terminal, ConEmu - -Very old terminals may not support it, but the TTY check provides a reasonable fallback. 
- -## Integration with UI Frameworks - -### OpenTUI Example - -```typescript -import { createCliRenderer } from '@opentui/core' - -async function main(): Promise { - // Enter alternate screen BEFORE creating renderer - if (process.stdout.isTTY) { - enterAlternateScreen() - } - - const renderer = await createCliRenderer({ - backgroundColor: 'transparent', - exitOnCtrlC: false, - }) - - // Install cleanup handlers that exit alternate screen - installProcessCleanupHandlers(renderer) - - // Render your app - createRoot(renderer).render() -} -``` - -### Ink.js Example - -```typescript -import { render } from 'ink' - -function main() { - if (process.stdout.isTTY) { - enterAlternateScreen() - } - - const { unmount } = render() - - // Ensure cleanup on exit - process.on('exit', () => { - unmount() - resetTerminalState() - }) -} -``` - -## Testing - -To verify alternate screen buffer works correctly: - -1. **Before running your CLI:** Note some text in your terminal scrollback -2. **Run your CLI:** The UI should appear in a clean screen -3. **Exit your CLI:** You should return to the exact terminal state from step 1 -4. 
**Check scrollback:** The UI output should not appear in your scrollback history - -## Common Mistakes - -❌ **Entering alternate buffer too late** - after UI initialization -❌ **Not checking TTY status** - breaks piped output -❌ **Forgetting exit sequences** - leaves terminal in alternate buffer -❌ **Not handling all exit scenarios** - cleanup only works for normal exit -❌ **Relying on UI framework cleanup** - may not run if framework crashes - -## When to Use - -Use alternate screen buffer for: -- Full-screen TUI applications -- Interactive CLI tools with complex UIs -- Any CLI that renders multiple lines of output that users don't need to reference later - -Don't use for: -- Simple command-line tools with minimal output -- Tools where users need to reference output after exit -- Log viewers or tools that should integrate with terminal scrollback \ No newline at end of file diff --git a/evalbuff/src/docs-optimizer.ts b/evalbuff/src/docs-optimizer.ts index e6d5fbae8..38e041752 100644 --- a/evalbuff/src/docs-optimizer.ts +++ b/evalbuff/src/docs-optimizer.ts @@ -351,20 +351,12 @@ export function revertDocEdit( export function compareScores( oldScore: number, newScore: number, - parallelism: number = 1, ): 'improved' | 'same' | 'worse' { const delta = newScore - oldScore - const improveThreshold = 0.3 - // With parallelism=1, require a bigger drop before rejecting - const worseThreshold = parallelism <= 1 ? 1.5 : 0.3 + const threshold = 0.3 - if (delta >= improveThreshold) return 'improved' - if (delta <= -worseThreshold) return 'worse' - - // Don't give benefit of the doubt when score is very low — - // if the doc didn't produce a clear improvement from near-zero, - // it's not helping. Require actual improvement. 
- if (oldScore < 2.0 && delta < improveThreshold) return 'worse' + if (delta >= threshold) return 'improved' + if (delta <= -threshold) return 'worse' return 'same' } diff --git a/evalbuff/src/run-evalbuff.ts b/evalbuff/src/run-evalbuff.ts index 40ebcdfa8..1b9405cc1 100644 --- a/evalbuff/src/run-evalbuff.ts +++ b/evalbuff/src/run-evalbuff.ts @@ -468,7 +468,7 @@ async function improveDocs(opts: { break } - const comparison = compareScores(currentScore, rerun.avgScore, parallelism) + const comparison = compareScores(currentScore, rerun.avgScore) console.log(` New score: ${rerun.avgScore.toFixed(1)}/10 (${comparison}) (scores: ${rerun.scores.map((s) => s.toFixed(1)).join(', ')})`) if (comparison === 'improved' || comparison === 'same') { From 6a7de4c2ff20b91836e1623d411b8d87c68eab25 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Mon, 30 Mar 2026 11:53:58 -0700 Subject: [PATCH 10/12] evalbuff: use sonnet 4.6, remove maxOutputTokens and temperature Co-Authored-By: Claude Opus 4.6 --- evalbuff/src/llm.ts | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/evalbuff/src/llm.ts b/evalbuff/src/llm.ts index 39ca39329..9861d75bf 100644 --- a/evalbuff/src/llm.ts +++ b/evalbuff/src/llm.ts @@ -12,7 +12,7 @@ import { generateText } from 'ai' const anthropic = createAnthropic() -const DEFAULT_MODEL = 'claude-sonnet-4-20250514' +const DEFAULT_MODEL = 'claude-sonnet-4-6-20250415' /** * Generate a task prompt from a commit diff using the LLM API directly. 
@@ -26,8 +26,6 @@ export async function generatePrompt( model: anthropic(DEFAULT_MODEL), system: systemPrompt, prompt: userPrompt, - maxOutputTokens: 500, - temperature: 0.3, }) return result.text.trim() @@ -45,8 +43,6 @@ export async function analyzeFailureViaApi( const result = await generateText({ model: anthropic(DEFAULT_MODEL), prompt, - maxOutputTokens: 4096, - temperature: 0.2, }) return result.text.trim() From af14ae3fac47495d6eb80e10634c43a843dfd9a6 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Mon, 30 Mar 2026 13:08:45 -0700 Subject: [PATCH 11/12] Fix model ID and clean up parallelism comments Co-Authored-By: Claude Opus 4.6 --- evalbuff/src/docs-optimizer.ts | 9 ++------- evalbuff/src/llm.ts | 2 +- evalbuff/src/run-evalbuff.ts | 3 +-- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/evalbuff/src/docs-optimizer.ts b/evalbuff/src/docs-optimizer.ts index 38e041752..408dffc4c 100644 --- a/evalbuff/src/docs-optimizer.ts +++ b/evalbuff/src/docs-optimizer.ts @@ -340,13 +340,8 @@ export function revertDocEdit( /** * Compare scores to determine if a doc edit improved things. * - * With parallelism=1, score variance is very high (often 3+ points on - * the same task). To avoid rejecting good docs due to noise: - * - Require only small improvement to accept (0.3 threshold) - * - Require large decline to reject (1.5 threshold) — benefit of the doubt - * - * With higher parallelism, averages are more stable so we can use - * tighter thresholds. + * With parallelism=5, averages are reasonably stable. A 0.3 threshold + * catches real improvements without being too sensitive to noise. 
*/ export function compareScores( oldScore: number, diff --git a/evalbuff/src/llm.ts b/evalbuff/src/llm.ts index 9861d75bf..36e5eee61 100644 --- a/evalbuff/src/llm.ts +++ b/evalbuff/src/llm.ts @@ -12,7 +12,7 @@ import { generateText } from 'ai' const anthropic = createAnthropic() -const DEFAULT_MODEL = 'claude-sonnet-4-6-20250415' +const DEFAULT_MODEL = 'claude-sonnet-4-6' /** * Generate a task prompt from a commit diff using the LLM API directly. diff --git a/evalbuff/src/run-evalbuff.ts b/evalbuff/src/run-evalbuff.ts index 1b9405cc1..cac655a1d 100644 --- a/evalbuff/src/run-evalbuff.ts +++ b/evalbuff/src/run-evalbuff.ts @@ -473,8 +473,7 @@ async function improveDocs(opts: { if (comparison === 'improved' || comparison === 'same') { // 'improved' = clear signal the doc helps - // 'same' = within noise range — keep it (benefit of the doubt, - // especially at low parallelism where variance is high) + // 'same' = within noise range — keep it (benefit of the doubt) const reason = comparison === 'improved' ? 'score improved' : 'within noise range, keeping' console.log(` Keeping doc: ${docSuggestion.suggestedDocPath} (${reason})`) docsKept.push({ From 2c7b1b6d7cacfaafc9a5a6963d4740c6c1e50caa Mon Sep 17 00:00:00 2001 From: James Grugett Date: Mon, 30 Mar 2026 13:29:58 -0700 Subject: [PATCH 12/12] Fix type errors in send-message tests after merge from main MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes RunState type mismatches: sessionState: null → undefined, fake session state objects cast as any, StreamStatus narrowing. 
Co-Authored-By: Claude Opus 4.6 --- .../helpers/__tests__/send-message.test.ts | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/cli/src/hooks/helpers/__tests__/send-message.test.ts b/cli/src/hooks/helpers/__tests__/send-message.test.ts index 4247695f7..7f017deb1 100644 --- a/cli/src/hooks/helpers/__tests__/send-message.test.ts +++ b/cli/src/hooks/helpers/__tests__/send-message.test.ts @@ -35,6 +35,7 @@ const { createBatchedMessageUpdater } = await import( '../../../utils/message-updater' ) import { createPaymentRequiredError } from '@codebuff/sdk' +import type { RunState } from '@codebuff/sdk' const createMockTimerController = (): SendMessageTimerController & { startCalls: string[] @@ -348,7 +349,7 @@ describe('handleRunCompletion', () => { let hasReceivedPlanResponse = false const runState = { - sessionState: null, + sessionState: undefined, output: { type: 'lastMessage' as const, value: [] }, } @@ -372,7 +373,7 @@ describe('handleRunCompletion', () => { expect(chainInProgress).toBe(false) expect(canProcessQueue).toBe(true) expect(isProcessingQueueRef.current).toBe(false) - expect(streamStatus).toBe('idle') + expect(streamStatus as StreamStatus).toBe('idle') }) test('does not process server response when wasAbortedByUser is true', () => { @@ -388,7 +389,7 @@ describe('handleRunCompletion', () => { let hasReceivedPlanResponse = false const runState = { - sessionState: null, + sessionState: undefined, output: { type: 'lastMessage' as const, value: [{ type: 'text' as const, text: 'Server response that should be ignored' }], @@ -431,7 +432,7 @@ describe('handleRunCompletion', () => { let canProcessQueueCalled = false const runState = { - sessionState: null, + sessionState: undefined, output: { type: 'lastMessage' as const, value: [] }, } @@ -929,7 +930,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves // Abort handler fires synchronously: UI is updated, but chain lock stays held 
expect(streamRefsA.state.wasAbortedByUser).toBe(true) - expect(streamStatus).toBe('idle') // UI shows idle + expect(streamStatus as StreamStatus).toBe('idle') // UI shows idle expect(chainInProgress).toBe(true) // But chain lock is still held! // --- PHASE 3: User types run B — verify it's BLOCKED --- @@ -952,8 +953,8 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves // Simulate what happens in useSendMessage after `await client.run(runConfig)`: // 1. previousRunStateRef.current = runState (state saved) // 2. handleRunCompletion is called - const runStateFromA = { - sessionState: { conversationId: 'conv-123', history: ['user msg A', 'partial assistant response'] }, + const runStateFromA: RunState = { + sessionState: { conversationId: 'conv-123', history: ['user msg A', 'partial assistant response'] } as any, output: { type: 'lastMessage' as const, value: [{ type: 'text' as const, text: 'partial' }] }, } @@ -991,11 +992,11 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves expect(chainInProgress).toBe(false) expect(canProcessQueue).toBe(true) expect(isProcessingQueueRef.current).toBe(false) - expect(streamStatus).toBe('idle') + expect(streamStatus as StreamStatus).toBe('idle') // The crucial state continuity: previousRunState from A is available for B expect(previousRunState).toBe(runStateFromA) - expect(previousRunState.sessionState).toEqual({ + expect(previousRunState.sessionState as any).toEqual({ conversationId: 'conv-123', history: ['user msg A', 'partial assistant response'], }) @@ -1049,7 +1050,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves let chainInProgress = true const isProcessingQueueRef = { current: false } const isQueuePausedRef = { current: false } - let previousRunState: { sessionState: unknown; output: unknown } | null = null + let previousRunState: RunState | null = null const setStreamStatus = (status: StreamStatus) => { streamStatus = status 
} const setCanProcessQueue = (can: boolean) => { canProcessQueue = can } @@ -1083,14 +1084,14 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves expect(chainInProgress).toBe(true) // Lock held // client.run() resolves for run A - const runStateA = { + const runStateA: RunState = { sessionState: { id: 'session-abc', messages: [ { role: 'user', content: 'first message' }, { role: 'assistant', content: 'partial response before cancel' }, ], - }, + } as any, output: { type: 'lastMessage' as const, value: [] }, } previousRunState = runStateA @@ -1146,7 +1147,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves // In the real code, this is: previousRunState: previousRunStateRef.current // passed to createRunConfig expect(previousRunState).toBe(runStateA) - expect(previousRunState!.sessionState).toEqual({ + expect(previousRunState!.sessionState as any).toEqual({ id: 'session-abc', messages: [ { role: 'user', content: 'first message' }, @@ -1155,7 +1156,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves }) // Simulate run B completing normally - const runStateB = { + const runStateB: RunState = { sessionState: { id: 'session-abc', messages: [ @@ -1164,7 +1165,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves { role: 'user', content: 'second message' }, { role: 'assistant', content: 'full response to second message' }, ], - }, + } as any, output: { type: 'lastMessage' as const, value: [{ type: 'text' as const, text: 'full response' }] }, } previousRunState = runStateB @@ -1186,7 +1187,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves }) // Final state: both runs' messages are preserved in session history - expect(previousRunState!.sessionState).toEqual({ + expect(previousRunState!.sessionState as any).toEqual({ id: 'session-abc', messages: [ { role: 'user', content: 'first message' },