diff --git a/.gitignore b/.gitignore index a00be42b73..77ee39bd33 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,6 @@ node_modules/ *.log # Desktop Service Store *.DS_Store + +# JetBrains IDEs +.idea diff --git a/bugbug/tools/build_repair/__init__.py b/bugbug/tools/build_repair/__init__.py new file mode 100644 index 0000000000..2f3caac2c6 --- /dev/null +++ b/bugbug/tools/build_repair/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +from bugbug.tools.build_repair.agent import AgentResponse, BuildFailure, BuildRepairTool + +__all__ = ["AgentResponse", "BuildFailure", "BuildRepairTool"] diff --git a/bugbug/tools/build_repair/agent.py b/bugbug/tools/build_repair/agent.py new file mode 100644 index 0000000000..0845315871 --- /dev/null +++ b/bugbug/tools/build_repair/agent.py @@ -0,0 +1,407 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import subprocess +import traceback +from collections.abc import Callable +from logging import getLogger +from pathlib import Path + +from claude_agent_sdk import ClaudeAgentOptions, ResultMessage, query +from pydantic import BaseModel, Field + +from bugbug.tools.base import GenerativeModelTool +from bugbug.tools.build_repair.config import ( + ADDITIONAL_DIRS, + ALLOWED_TOOLS, + ANALYSIS_MODEL, + FIREFOX_MCP_URL, + FIX_MODEL, + SANDBOX_CONFIG, +) +from bugbug.tools.build_repair.prompts import ( + ANALYSIS_TEMPLATE, + EVAL_PROMPT, + FIX_TEMPLATE, +) + +logger = getLogger(__name__) + + +class BuildFailure(BaseModel): + """Input describing a build failure from the dataset.""" + + bug_id: int = Field(description="The ID of the bug in Bugzilla.") + bug_title: str | None = Field(default=None, description="Optional bug title.") + bug_comments: list[str] | None = Field( + default=None, description="Optional bug comments." + ) + git_commit: str = Field(description="Git revision to checkout.") + failure_tasks: list[dict] = Field( + description="List of {task_name, task_id, retry_id, failure_lines}." 
+ ) + + +class AgentResponse(BaseModel): + """Output from a build repair run, including analysis, diff, cost, and build results.""" + + summary: str = Field(default="") + analysis: str = Field(default="") + diff: str = Field(default="") + error: str | None = Field(default=None) + error_traceback: str | None = Field(default=None) + failure_stage: str | None = Field(default=None) + cost_usd: float = Field(default=0.0) + num_turns: int = Field(default=0) + input_tokens: int = Field(default=0) + output_tokens: int = Field(default=0) + cache_read_input_tokens: int = Field(default=0) + cache_creation_input_tokens: int = Field(default=0) + local_build_passed: bool | None = Field(default=None) + try_build_passed: bool | None = Field(default=None) + lando_job_id: str | None = Field(default=None) + treeherder_url: str | None = Field(default=None) + stage1_transcript: list[dict] = Field(default_factory=list) + stage2_transcript: list[dict] = Field(default_factory=list) + + +class BuildRepairTool(GenerativeModelTool): + """Two-stage build repair agent using Claude Agent SDK. + + Stage 1: Analyzes the failure and produces analysis/planning/summary docs. + Stage 2: Reads the analysis and implements a fix. Skipped in analysis-only mode. + After Stage 2, commits the fix, runs ./mach build, and optionally submits to try. 
+ """ + + def __init__( + self, + target_software: str = "Mozilla Firefox", + analysis_only: bool = False, + eval_mode: bool = False, + analysis_model: str = ANALYSIS_MODEL, + fix_model: str = FIX_MODEL, + ) -> None: + self.eval_mode = eval_mode + self.target_software = target_software + self.analysis_only = analysis_only + self.analysis_model = analysis_model + self.fix_model = fix_model + + @classmethod + def create(cls, **kwargs): + return cls(**kwargs) + + @staticmethod + def _usage_fields(usage: dict) -> dict: + return { + "input_tokens": usage.get("input_tokens", 0), + "output_tokens": usage.get("output_tokens", 0), + "cache_read_input_tokens": usage.get("cache_read_input_tokens", 0), + "cache_creation_input_tokens": usage.get("cache_creation_input_tokens", 0), + } + + @staticmethod + def _serialize_message(message) -> dict: + data = {"type": type(message).__name__} + if hasattr(message, "model_dump"): + data.update(message.model_dump()) + elif hasattr(message, "__dict__"): + data.update(vars(message)) + else: + data["raw"] = str(message) + return data + + async def _run_stage( + self, + stage_name: str, + prompt: str, + model: str, + options: ClaudeAgentOptions, + bug_id: int, + on_message: Callable[[str, dict], None] | None = None, + ) -> tuple[list[dict], float, int, dict]: + transcript: list[dict] = [] + cost = 0.0 + turns = 0 + result_data: dict = {} + usage: dict = {} + + if on_message: + on_message( + stage_name, + { + "type": "stage_start", + "prompt": prompt, + "model": model, + }, + ) + try: + async for message in query(prompt=prompt, options=options): + serialized = self._serialize_message(message) + transcript.append(serialized) + logger.info(f"Bug {bug_id}: {stage_name} [{serialized['type']}]") + logger.debug(f"Bug {bug_id}: {stage_name} detail: {serialized}") + if on_message: + on_message(stage_name, serialized) + if isinstance(message, ResultMessage): + cost += message.total_cost_usd or 0 + turns += message.num_turns or 0 + usage = 
getattr(message, "usage", {}) or {} + result_data = serialized + finally: + if on_message: + on_message( + stage_name, + { + "type": "stage_end", + "cost_usd": cost, + "num_turns": turns, + "result_data": result_data, + }, + ) + + return transcript, cost, turns, usage + + def _prepare_input_files(self, failure: BuildFailure, worktree_path: Path) -> None: + in_dir = worktree_path / "repair_agent" / "in" / str(failure.bug_id) + in_dir.mkdir(parents=True, exist_ok=True) + + (in_dir / "bug_description.md").write_text( + f"# Bug {failure.bug_id}: {failure.bug_title}\n\n" + + "\n\n---\n\n".join(failure.bug_comments or []) + ) + + logs_content = "" + for task in failure.failure_tasks: + logs_content += f"## {task['task_name']} (task_id: {task['task_id']})\n\n" + logs_content += "\n".join(task["failure_lines"]) + "\n\n" + (in_dir / "build_failure_logs.md").write_text(logs_content) + + out_dir = worktree_path / "repair_agent" / "out" / str(failure.bug_id) + out_dir.mkdir(parents=True, exist_ok=True) + + logger.info( + f"Prepared input files for bug {failure.bug_id} at {in_dir} " + f"({len(failure.failure_tasks)} failure tasks)" + ) + + def _read_output(self, failure: BuildFailure, worktree_path: Path, key: str) -> str: + path = ( + worktree_path / "repair_agent" / "out" / str(failure.bug_id) / f"{key}.md" + ) + if path.exists(): + return path.read_text() + return "" + + async def run( + self, + failure: BuildFailure, + worktree_path: Path, + skip_try_push: bool = False, + on_message: Callable[[str, dict], None] | None = None, + ) -> AgentResponse: + logger.info( + f"Starting build repair for bug {failure.bug_id} " + f"(commit={failure.git_commit}, worktree={worktree_path}, " + f"analysis_only={self.analysis_only}, skip_try_push={skip_try_push})" + ) + self._prepare_input_files(failure, worktree_path) + + mcp_servers = {"firefox": {"type": "http", "url": FIREFOX_MCP_URL}} + disallowed = ["AskUserQuestion", "Task"] + total_cost = 0.0 + total_turns = 0 + total_usage: dict = {} 
+ + logger.info( + f"Bug {failure.bug_id}: starting Stage 1 (analysis) " + f"with model={self.analysis_model}" + ) + stage1_options = ClaudeAgentOptions( + model=self.analysis_model, + cwd=str(worktree_path), + allowed_tools=ALLOWED_TOOLS, + disallowed_tools=disallowed, + add_dirs=ADDITIONAL_DIRS, + sandbox=SANDBOX_CONFIG, + permission_mode="acceptEdits", + effort="high", + mcp_servers=mcp_servers, + ) + analysis_prompt = ANALYSIS_TEMPLATE.format( + bug_id=failure.bug_id, + target_software=self.target_software, + eval=EVAL_PROMPT if self.eval_mode else "", + ) + try: + ( + stage1_transcript, + stage1_cost, + stage1_turns, + stage1_usage, + ) = await self._run_stage( + "analysis", + analysis_prompt, + self.analysis_model, + stage1_options, + failure.bug_id, + on_message, + ) + total_cost += stage1_cost + total_turns += stage1_turns + for k, v in stage1_usage.items(): + if isinstance(v, (int, float)): + total_usage[k] = total_usage.get(k, 0) + v + except Exception as e: + logger.error( + f"Bug {failure.bug_id}: Stage 1 (analysis) failed: {e}", exc_info=True + ) + return AgentResponse( + error=str(e), + error_traceback=traceback.format_exc(), + failure_stage="analysis", + cost_usd=total_cost, + num_turns=total_turns, + **self._usage_fields(total_usage), + ) + + logger.info( + f"Bug {failure.bug_id}: Stage 1 complete " + f"(cost=${total_cost:.4f}, turns={total_turns})" + ) + + summary = self._read_output(failure, worktree_path, "summary") + analysis = self._read_output(failure, worktree_path, "analysis") + logger.info( + f"Bug {failure.bug_id}: read output files " + f"(summary={len(summary)} chars, analysis={len(analysis)} chars)" + ) + + if self.analysis_only: + logger.info(f"Bug {failure.bug_id}: analysis-only mode, skipping Stage 2") + return AgentResponse( + summary=summary, + analysis=analysis, + cost_usd=total_cost, + num_turns=total_turns, + **self._usage_fields(total_usage), + stage1_transcript=stage1_transcript, + ) + + logger.info( + f"Bug {failure.bug_id}: 
starting Stage 2 (fix) with model={self.fix_model}" + ) + stage2_options = ClaudeAgentOptions( + model=self.fix_model, + cwd=str(worktree_path), + allowed_tools=ALLOWED_TOOLS, + disallowed_tools=disallowed, + add_dirs=ADDITIONAL_DIRS, + sandbox=SANDBOX_CONFIG, + permission_mode="acceptEdits", + effort="low", + mcp_servers=mcp_servers, + ) + fix_prompt = FIX_TEMPLATE.format( + bug_id=failure.bug_id, eval=EVAL_PROMPT if self.eval_mode else "" + ) + try: + ( + stage2_transcript, + stage2_cost, + stage2_turns, + stage2_usage, + ) = await self._run_stage( + "fix", + fix_prompt, + self.fix_model, + stage2_options, + failure.bug_id, + on_message, + ) + total_cost += stage2_cost + total_turns += stage2_turns + for k, v in stage2_usage.items(): + if isinstance(v, (int, float)): + total_usage[k] = total_usage.get(k, 0) + v + except Exception as e: + logger.error( + f"Bug {failure.bug_id}: Stage 2 (fix) failed: {e}", exc_info=True + ) + return AgentResponse( + summary=summary, + analysis=analysis, + error=str(e), + error_traceback=traceback.format_exc(), + failure_stage="fix", + cost_usd=total_cost, + num_turns=total_turns, + **self._usage_fields(total_usage), + ) + + logger.info( + f"Bug {failure.bug_id}: Stage 2 complete " + f"(cost=${total_cost:.4f}, turns={total_turns})" + ) + + diff_result = subprocess.run( + ["git", "diff", "HEAD"], + cwd=worktree_path, + capture_output=True, + text=True, + ) + diff = diff_result.stdout + logger.info(f"Bug {failure.bug_id}: git diff produced {len(diff)} chars") + + if not diff.strip(): + logger.warning(f"Bug {failure.bug_id}: no diff produced, returning early") + return AgentResponse( + summary=summary, + analysis=analysis, + diff=diff, + cost_usd=total_cost, + num_turns=total_turns, + **self._usage_fields(total_usage), + stage1_transcript=stage1_transcript, + stage2_transcript=stage2_transcript, + ) + + from bugbug.tools.build_repair.try_server import run_try_verification + + task_name = ( + failure.failure_tasks[0]["task_name"] if 
failure.failure_tasks else "" + ) + logger.info( + f"Bug {failure.bug_id}: starting try verification " + f"(task={task_name}, skip_try_push={skip_try_push})" + ) + try_result = run_try_verification( + worktree_path=worktree_path, + bug_id=failure.bug_id, + task_name=task_name, + skip_try_push=skip_try_push, + ) + + logger.info( + f"Bug {failure.bug_id}: try verification done " + f"(local_build={try_result.local_build_passed}, " + f"try_build={try_result.try_build_passed}, " + f"lando_job={try_result.lando_job_id}, " + f"total_cost=${total_cost:.4f}, total_turns={total_turns})" + ) + return AgentResponse( + summary=summary, + analysis=analysis, + diff=diff, + cost_usd=total_cost, + num_turns=total_turns, + **self._usage_fields(total_usage), + local_build_passed=try_result.local_build_passed, + try_build_passed=try_result.try_build_passed, + lando_job_id=try_result.lando_job_id, + treeherder_url=try_result.treeherder_url, + stage1_transcript=stage1_transcript, + stage2_transcript=stage2_transcript, + ) diff --git a/bugbug/tools/build_repair/config.py b/bugbug/tools/build_repair/config.py new file mode 100644 index 0000000000..f4bde13a3a --- /dev/null +++ b/bugbug/tools/build_repair/config.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +from datetime import date + +from claude_agent_sdk import SandboxNetworkConfig, SandboxSettings + +ANALYSIS_MODEL = "claude-opus-4-6" +FIX_MODEL = "claude-opus-4-6" +DEFAULT_MAX_TURNS = 80 +WORKTREE_BASE_DIR = "/tmp/build_repair_worktrees" +TRY_PUSH_TIMEOUT_SECONDS = 7200 +TRY_PUSH_POLL_INTERVAL_SECONDS = 60 +TREEHERDER_BASE_URL = "https://treeherder.mozilla.org" + +FIREFOX_MCP_URL = "https://mcp-dev.moz.tools/mcp" + +# Training data cutoff dates per model, for data contamination filtering. 
+# Examples with fix_commit_date before the cutoff may have been in training data.
+# Source: https://platform.claude.com/docs/en/about-claude/models/overview
+MODEL_CUTOFF_DATES = {
+    "claude-opus-4-6": date(2025, 8, 1),
+    "claude-sonnet-4-6": date(2026, 1, 1),
+    "claude-haiku-4-5-20251001": date(2025, 7, 1),
+    "claude-sonnet-4-5-20250929": date(2025, 7, 1),
+    "claude-opus-4-5-20251101": date(2025, 8, 1),
+    "claude-opus-4-1-20250805": date(2025, 3, 1),
+    "claude-sonnet-4-20250514": date(2025, 3, 1),
+    "claude-3-7-sonnet-20250219": date(2024, 11, 1),
+    "claude-opus-4-20250514": date(2025, 3, 1),
+}
+
+ALLOWED_TOOLS = [
+    "Edit(~/.mozbuild)",
+    "Edit(~/.cache/uv)",
+    "Bash(./mach build:*)",
+    "Bash(./mach clobber:*)",
+    "Bash(./mach configure:*)",
+    "Bash(./mach run:*)",
+    "Bash(./mach test:*)",
+    "Bash(./mach wpt:*)",
+    "Bash(./mach lint:*)",
+    "Bash(./mach format:*)",
+    "Bash(./mach clang-format:*)",
+    "Bash(./mach try:*)",
+    "Bash(./mach help:*)",
+    "Bash(./mach vendor:*)",
+    "Bash(./mach bootstrap:*)",
+    "Bash(./mach artifact:*)",
+    "Bash(clang++:*)",
+    "Bash(rm:*)",
+    "Bash(timeout:*)",
+    "Bash(find:*)",
+    "Bash(grep:*)",
+    "Bash(tee:*)",
+    "Bash(kill:*)",
+    "Bash(searchfox-cli:*)",
+    "Bash(treeherder-cli:*)",
+    "Bash(jj:*)",
+    "WebFetch(domain:firefox-source-docs.mozilla.org)",
+    "WebFetch(domain:treeherder.mozilla.org)",
+    "WebFetch(domain:searchfox.org)",
+    "WebFetch(domain:o1069899.ingest.sentry.io)",
+]
+
+ADDITIONAL_DIRS = [
+    "~/.mozbuild",
+    "~/.cache/uv/",
+]
+
+SANDBOX_CONFIG = SandboxSettings(
+    enabled=True,
+    autoAllowBashIfSandboxed=True,
+    allowUnsandboxedCommands=False,
+    network=SandboxNetworkConfig(allowLocalBinding=True),
+)
diff --git a/bugbug/tools/build_repair/prompts.py b/bugbug/tools/build_repair/prompts.py
new file mode 100644
index 0000000000..cdab7e11e0
--- /dev/null
+++ b/bugbug/tools/build_repair/prompts.py
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0.
If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +"""Prompt templates for build repair agent.""" + +ANALYSIS_TEMPLATE = """You are an expert {target_software} engineer tasked with analyzing and fixing a build failure. + +Investigate why the last commit broke {target_software} build. + +The last commit attempted to fix a bug from Bugzilla. + +Analyze the following: +1. Git diff for the last commit +2. Bugzilla bug description +3. Taskcluster build failure logs +The files with bug description and logs are located at @repair_agent/in/{bug_id} + +Create three separate documents: +1. repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues +2. repair_agent/out/{bug_id}/planning.md with a fixing plan +3. repair_agent/out/{bug_id}/summary.md with a brief one paragraph summary of analysis and planning that can point a developer in the right direction + +Do not prompt to edit those documents. +{eval} + +Do not write any code yet. Work fully autonomously, do not ask any questions. Think hard. +""" + +FIX_TEMPLATE = """Read the following files and implement a fix of the failure: +1. repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues +2. repair_agent/out/{bug_id}/planning.md with a fixing plan +{eval} + +Do not prompt to edit files. Work fully autonomously, do not ask any questions. Use all allowed tools without prompting. +""" + +EVAL_PROMPT = """ +Do not request bug info from Bugzilla or Phabricator. Use only the provided file with bug description. +Do not look at git commits other than the specified last commit. +""" diff --git a/bugbug/tools/build_repair/scorer.py b/bugbug/tools/build_repair/scorer.py new file mode 100644 index 0000000000..566b384a6a --- /dev/null +++ b/bugbug/tools/build_repair/scorer.py @@ -0,0 +1,163 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 
2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +from logging import getLogger + +import weave + +logger = getLogger(__name__) + + +def _pass_at_k( + score_rows: list[dict], + num_trials: int, + metric: str, +) -> dict[str, float]: + """Compute pass@k from scorer rows ordered by trial. + + Rows are ordered: first num_examples = trial 0, next = trial 1, etc. + Rows may be empty dicts when the model raised an exception. + """ + num_examples = len(score_rows) // num_trials + pass_at: dict[str, float] = {} + for n in sorted({1, 3, num_trials}): + if n > num_trials: + continue + successes = sum( + any(score_rows[t * num_examples + i].get(metric) is True for t in range(n)) + for i in range(num_examples) + ) + pass_at[f"pass@{n}"] = successes / num_examples if num_examples else 0 + + all_pass = sum( + all( + score_rows[t * num_examples + i].get(metric) is True + for t in range(num_trials) + ) + for i in range(num_examples) + ) + pass_at[f"pass^{num_trials}"] = all_pass / num_examples if num_examples else 0 + + return pass_at + + +class BasicMetricsScorer(weave.Scorer): + """Scores success rate, diff production rate, cost, and turn count.""" + + num_trials: int = 1 + + @weave.op() + def score(self, output: dict | None) -> dict: + if output is None: + return { + "successful": False, + "has_diff": False, + "cost_usd": 0, + "num_turns": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_input_tokens": 0, + "cache_creation_input_tokens": 0, + } + return { + "successful": output.get("error") is None, + "has_diff": bool(output.get("diff", "").strip()), + "cost_usd": output.get("cost_usd", 0), + "num_turns": output.get("num_turns", 0), + "input_tokens": output.get("input_tokens", 0), + "output_tokens": output.get("output_tokens", 0), + "cache_read_input_tokens": output.get("cache_read_input_tokens", 0), + "cache_creation_input_tokens": output.get("cache_creation_input_tokens", 0), + } + + def 
summarize(self, score_rows: list[dict]) -> dict: + n = len(score_rows) + costs = [r.get("cost_usd", 0) for r in score_rows] + input_toks = [r.get("input_tokens", 0) for r in score_rows] + output_toks = [r.get("output_tokens", 0) for r in score_rows] + summary = { + "success_rate": sum(r.get("successful", False) for r in score_rows) / n + if n + else 0, + "diff_rate": sum(r.get("has_diff", False) for r in score_rows) / n + if n + else 0, + "avg_cost_usd": sum(costs) / n if n else 0, + "total_cost_usd": sum(costs), + "total_input_tokens": sum(input_toks), + "total_output_tokens": sum(output_toks), + "total_cache_read_tokens": sum( + r.get("cache_read_input_tokens", 0) for r in score_rows + ), + "total_cache_creation_tokens": sum( + r.get("cache_creation_input_tokens", 0) for r in score_rows + ), + "num_examples": n, + } + if self.num_trials > 1: + summary.update(_pass_at_k(score_rows, self.num_trials, "successful")) + logger.info(f"BasicMetrics summary: {summary}") + return summary + + +class BuildPassRateScorer(weave.Scorer): + """Scores local ./mach build and try push pass rates.""" + + num_trials: int = 1 + + @weave.op() + def score(self, output: dict | None) -> dict: + if output is None: + return { + "local_build_passed": None, + "try_build_passed": None, + } + return { + "local_build_passed": output.get("local_build_passed"), + "try_build_passed": output.get("try_build_passed"), + } + + def summarize(self, score_rows: list[dict]) -> dict: + n = len(score_rows) + local_passed = sum(1 for r in score_rows if r.get("local_build_passed") is True) + try_known = [r for r in score_rows if r.get("try_build_passed") is not None] + try_passed = sum(1 for r in try_known if r.get("try_build_passed") is True) + summary = { + "local_build_pass_rate": local_passed / n if n else 0, + "local_builds_passed": local_passed, + "try_build_pass_rate": try_passed / len(try_known) if try_known else 0, + "try_builds_passed": try_passed, + "try_builds_timed_out": n - len(try_known), + 
"num_examples": n, + } + if self.num_trials > 1: + summary.update( + _pass_at_k(score_rows, self.num_trials, "local_build_passed") + ) + logger.info(f"BuildPassRate summary: {summary}") + return summary + + +class LLMFixMatchingScorer(weave.Scorer): + """Scaffold for LLM-as-a-judge comparing agent fix to ground truth. + + Implementation deferred. Will use a non-Claude LLM to semantically + compare the agent's diff against the ground truth fix commit. + """ + + @weave.op() + async def score(self, output: dict | None, gh_fix_commits: list[str]) -> dict: + if output is None: + return { + "match_score": None, + "match_category": "errored", + } + return { + "match_score": None, + "match_category": "not_implemented", + } + + def summarize(self, score_rows: list[dict]) -> dict: + return {"status": "not_implemented"} diff --git a/bugbug/tools/build_repair/try_server.py b/bugbug/tools/build_repair/try_server.py new file mode 100644 index 0000000000..6ef558556e --- /dev/null +++ b/bugbug/tools/build_repair/try_server.py @@ -0,0 +1,286 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import logging +import os +import re +import subprocess +import time +from dataclasses import dataclass +from logging import getLogger +from pathlib import Path + +import requests + +from bugbug.tools.build_repair.config import ( + TREEHERDER_BASE_URL, + TRY_PUSH_POLL_INTERVAL_SECONDS, + TRY_PUSH_TIMEOUT_SECONDS, +) + +logger = getLogger(__name__) + +_HEADERS = {"User-Agent": "bugbug-build-repair-eval/1.0"} +_LANDO_JOB_ID_RE = re.compile(r"landoCommitID=([A-Za-z0-9_-]+)") + + +def _mach_env(worktree_path: Path) -> dict[str, str]: + env = os.environ.copy() + env["MOZBUILD_STATE_PATH"] = str(worktree_path / ".mozbuild") + return env + + +@dataclass +class TryPushResult: + """Result of local build verification and optional try push submission.""" + + local_build_passed: bool + try_build_passed: bool | None + lando_job_id: str | None + treeherder_url: str | None + + +def _commit_fix(worktree_path: Path, bug_id: int) -> None: + logger.info(f"Committing fix for bug {bug_id} in {worktree_path}") + subprocess.run( + ["git", "add", "-A"], + cwd=worktree_path, + check=True, + ) + subprocess.run( + [ + "git", + "-c", + "user.name=bugbug", + "-c", + "user.email=bugbug@mozilla.com", + "commit", + "-m", + f"Build repair fix for bug {bug_id}", + ], + cwd=worktree_path, + check=True, + ) + logger.info(f"Bug {bug_id}: fix committed") + + +def _run_subprocess( + cmd: list[str], worktree_path: Path, capture: bool +) -> subprocess.CompletedProcess[str]: + if capture: + return subprocess.run( + cmd, + cwd=worktree_path, + env=_mach_env(worktree_path), + capture_output=True, + text=True, + ) + return subprocess.run( + cmd, + cwd=worktree_path, + env=_mach_env(worktree_path), + text=True, + ) + + +def _run_local_build(worktree_path: Path) -> bool: + capture = not logger.isEnabledFor(logging.DEBUG) + + logger.info(f"Running bootstrap in {worktree_path}") + result = _run_subprocess( + ["./mach", "--no-interactive", "bootstrap"], worktree_path, capture + ) + if result.returncode != 0: + 
if capture and result.stderr: + logger.warning(f"Bootstrap stderr:\n{result.stderr[-2000:]}") + raise RuntimeError( + f"Local bootstrap failed with return code {result.returncode}" + ) + + logger.info(f"Running local build in {worktree_path}") + result = _run_subprocess(["./mach", "build"], worktree_path, capture) + passed = result.returncode == 0 + status = "passed" if passed else "failed" + logger.info(f"Local build {status} (returncode={result.returncode})") + if not passed and capture and result.stderr: + logger.warning(f"Build stderr:\n{result.stderr[-2000:]}") + return passed + + +def _submit_try(worktree_path: Path, task_name: str) -> tuple[str | None, str | None]: + logger.info(f"Submitting try push for task={task_name} in {worktree_path}") + result = subprocess.run( + ["./mach", "try", "fuzzy", "--query", task_name], + cwd=worktree_path, + capture_output=True, + text=True, + env=_mach_env(worktree_path), + ) + stdout = result.stdout + result.stderr + logger.debug(f"Try push output: {stdout}") + match = _LANDO_JOB_ID_RE.search(stdout) + if not match: + logger.warning(f"Could not parse Lando job ID from try output: {stdout}") + return None, None + + lando_job_id = match.group(1) + treeherder_url = f"{TREEHERDER_BASE_URL}/jobs?repo=try&landoCommitID={lando_job_id}" + logger.info( + f"Try push submitted: lando_job_id={lando_job_id}, treeherder={treeherder_url}" + ) + return lando_job_id, treeherder_url + + +def _get_push_revision(lando_job_id: str) -> str | None: + try: + resp = requests.get( + f"{TREEHERDER_BASE_URL}/api/project/try/push/", + params={"lando_commit_id": lando_job_id}, + headers=_HEADERS, + timeout=30, + ) + resp.raise_for_status() + results = resp.json().get("results", []) + if results: + return results[0].get("revision") + except Exception: + logger.exception(f"Error fetching push revision for lando job {lando_job_id}") + return None + + +def _get_push_by_revision(revision: str) -> dict | None: + try: + resp = requests.get( + 
f"{TREEHERDER_BASE_URL}/api/project/try/push/", + params={"revision": revision}, + headers=_HEADERS, + timeout=30, + ) + resp.raise_for_status() + results = resp.json().get("results", []) + return results[0] if results else None + except Exception: + logger.exception(f"Error fetching push by revision {revision}") + return None + + +def _get_build_job_result(push_id: int, task_name: str) -> str | None: + try: + resp = requests.get( + f"{TREEHERDER_BASE_URL}/api/project/try/jobs/", + params={"push_id": push_id, "count": 2000}, + headers=_HEADERS, + timeout=30, + ) + resp.raise_for_status() + for job in resp.json().get("results", []): + if task_name in job.get("job_type_name", ""): + if job["state"] != "completed": + return job["state"] + return job["result"] + except Exception: + logger.exception(f"Error fetching build job result for push {push_id}") + return None + + +def _poll_treeherder(lando_job_id: str, task_name: str) -> bool | None: + logger.info( + f"Polling Treeherder for lando_job_id={lando_job_id}, task={task_name} " + f"(timeout={TRY_PUSH_TIMEOUT_SECONDS}s, " + f"interval={TRY_PUSH_POLL_INTERVAL_SECONDS}s)" + ) + deadline = time.monotonic() + TRY_PUSH_TIMEOUT_SECONDS + push_id: int | None = None + poll_count = 0 + + while time.monotonic() < deadline: + poll_count += 1 + if push_id is None: + revision = _get_push_revision(lando_job_id) + if revision: + logger.info( + f"Resolved revision={revision} for lando_job_id={lando_job_id}" + ) + push = _get_push_by_revision(revision) + if push: + push_id = push["id"] + logger.info(f"Resolved push_id={push_id} for revision={revision}") + + if push_id is not None: + result = _get_build_job_result(push_id, task_name) + logger.debug( + f"Poll #{poll_count}: job result={result} for push_id={push_id}" + ) + if result == "success": + logger.info(f"Try build succeeded for lando_job_id={lando_job_id}") + return True + if result in ("busted", "testfailed", "exception"): + logger.info( + f"Try build failed ({result}) for 
lando_job_id={lando_job_id}" + ) + return False + else: + logger.debug( + f"Poll #{poll_count}: push not yet available for " + f"lando_job_id={lando_job_id}" + ) + + time.sleep(TRY_PUSH_POLL_INTERVAL_SECONDS) + + logger.warning( + f"Try push polling timed out after {poll_count} polls " + f"for lando job {lando_job_id}" + ) + return None + + +def run_try_verification( + worktree_path: Path, + bug_id: int, + task_name: str, + skip_try_push: bool = False, +) -> TryPushResult: + logger.info( + f"Starting try verification for bug {bug_id} " + f"(task={task_name}, skip_try_push={skip_try_push})" + ) + _commit_fix(worktree_path, bug_id) + + local_passed = _run_local_build(worktree_path) + if not local_passed: + logger.warning(f"Bug {bug_id}: local build failed, skipping try push") + return TryPushResult( + local_build_passed=False, + try_build_passed=None, + lando_job_id=None, + treeherder_url=None, + ) + + if skip_try_push: + logger.info(f"Bug {bug_id}: local build passed, skipping try push as requested") + return TryPushResult( + local_build_passed=True, + try_build_passed=None, + lando_job_id=None, + treeherder_url=None, + ) + + lando_job_id, treeherder_url = _submit_try(worktree_path, task_name) + if not lando_job_id: + logger.warning(f"Bug {bug_id}: try push submission failed, no lando job ID") + return TryPushResult( + local_build_passed=True, + try_build_passed=None, + lando_job_id=None, + treeherder_url=None, + ) + + try_passed = _poll_treeherder(lando_job_id, task_name) + return TryPushResult( + local_build_passed=True, + try_build_passed=try_passed, + lando_job_id=lando_job_id, + treeherder_url=treeherder_url, + ) diff --git a/bugbug/tools/build_repair/worktree.py b/bugbug/tools/build_repair/worktree.py new file mode 100644 index 0000000000..8ae10ea7da --- /dev/null +++ b/bugbug/tools/build_repair/worktree.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. 
If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +import subprocess +from logging import getLogger +from pathlib import Path + +from bugbug.tools.build_repair.config import WORKTREE_BASE_DIR + +logger = getLogger(__name__) + + +class WorktreeManager: + """Manages git worktrees for parallel evaluation runs against a Firefox repo.""" + + def __init__( + self, + firefox_repo_path: str | Path, + base_dir: str = WORKTREE_BASE_DIR, + ): + self.repo = Path(firefox_repo_path) + self.base_dir = Path(base_dir) + self.base_dir.mkdir(parents=True, exist_ok=True) + + def create(self, commit_hash: str, name: str) -> Path: + worktree_path = self.base_dir / name + logger.info( + f"Creating worktree {name} at {worktree_path} (commit={commit_hash})" + ) + if worktree_path.exists(): + self.cleanup(name) + subprocess.run( + [ + "git", + "worktree", + "add", + "--force", + "--force", + str(worktree_path), + commit_hash, + ], + cwd=self.repo, + check=True, + ) + logger.info(f"Worktree {name} created") + return worktree_path + + def cleanup(self, name: str) -> None: + logger.info(f"Cleaning up worktree {name}") + subprocess.run( + [ + "git", + "worktree", + "remove", + "--force", + "--force", + str(self.base_dir / name), + ], + cwd=self.repo, + check=True, + ) + logger.info(f"Worktree {name} removed") + + def cleanup_all(self) -> None: + logger.info(f"Cleaning up all worktrees in {self.base_dir}") + for entry in self.base_dir.iterdir(): + if entry.is_dir(): + logger.info(f"Removing worktree {entry}") + subprocess.run( + ["git", "worktree", "remove", "--force", "--force", str(entry)], + cwd=self.repo, + check=False, + ) diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml new file mode 100644 index 0000000000..ff72a232b1 --- /dev/null +++ b/docker-compose.dev.yml @@ -0,0 +1,18 @@ +services: + # Base Docker image requires running `./mach taskgraph load-image debian12-amd64-build:latest` + build-repair: + # TO minimize 
rebuilding use `DOCKER_DEFAULT_PLATFORM=linux/amd64 docker build -t build-repair-debian-base -f docker/build_repair/Dockerfile .` + # image: build-repair-debian-base + build: + context: . + dockerfile: docker/build_repair/Dockerfile + volumes: + - .:/app # live code editing + - ${FIREFOX_REPO}:/workspace/firefox # Firefox repo + - build-repair-tmp:/tmp/build_repair_worktrees + environment: + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} + - WANDB_API_KEY=${WANDB_API_KEY} # for weave + - FIREFOX_GIT_REPO=/workspace/firefox +volumes: + build-repair-tmp: diff --git a/docker/build_repair/Dockerfile b/docker/build_repair/Dockerfile new file mode 100644 index 0000000000..597713be73 --- /dev/null +++ b/docker/build_repair/Dockerfile @@ -0,0 +1,16 @@ +# ./mach taskgraph load-image +FROM debian12-amd64-build + +WORKDIR /app + +RUN apt-get install -y python3-pip python3-venv + +RUN python3 -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +RUN apt-get install -y git nodejs npm && rm -rf /var/lib/apt/lists/* +RUN pip install "weave>=0.52.29" pydantic claude-agent-sdk requests + +COPY . 
/app + +ENV FIREFOX_GIT_REPO=/workspace/firefox diff --git a/notebooks/build_repair_create_dataset.ipynb b/notebooks/build_repair_create_dataset.ipynb new file mode 100644 index 0000000000..f1da8fee77 --- /dev/null +++ b/notebooks/build_repair_create_dataset.ipynb @@ -0,0 +1,271 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "af4f2030", + "metadata": {}, + "source": "# Create Dataset for Build Repairs" + }, + { + "cell_type": "markdown", + "id": "f0f37fd7", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f57608e", + "metadata": {}, + "outputs": [], + "source": [ + "import weave\n", + "\n", + "PROJECT_NAME = \"bugbug-build-repair-eval\"\n", + "DATASET_NAME = \"build_repair_one_commit_eval\"\n", + "\n", + "_ = weave.init(PROJECT_NAME)" + ] + }, + { + "cell_type": "markdown", + "id": "255ccaac", + "metadata": {}, + "source": [ + "## Prepare the Data\n", + "\n", + "### Load one commit build failures" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5bd97ef4", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_json(\n", + " \"https://community-tc.services.mozilla.com/api/queue/v1/task/Ra5r2qSyS8G-9pjLrS6l6Q/runs/0/artifacts/public%2Fci_failures.json.zst\",\n", + " lines=True,\n", + ")\n", + "df = df[(df.failure_commits.apply(len) == 1) & (df.fix_commits.apply(len) == 1)]\n", + "print(f\"One commit fail and fix: {len(df)}\")\n", + "df = df[\n", + " df.failures.apply(\n", + " lambda fails: any(\n", + " \"build\" in f[\"task_name\"] and \"test\" not in f[\"task_name\"] for f in fails\n", + " )\n", + " )\n", + "]\n", + "print(f\"Build fails only: {len(df)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "3d5e260dc3245c36", + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-13T23:52:42.195048Z", + "start_time": "2026-02-13T23:52:42.191945Z" + } + }, + "source": "### Get GitHub revisions" + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "196561e8d8b0659f", + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "\n", + "\n", + "def get_git_rev(hg_revs):\n", + " for rev in hg_revs:\n", + " convert_url = f\"https://lando.moz.tools/api/hg2git/firefox/{rev}\"\n", + " resp = requests.get(convert_url)\n", + " if resp.status_code != 200:\n", + " raise ValueError(f\"Unexpected HTTP status code: {resp.status_code}. {resp}\")\n", + " yield resp.json()[\"git_hash\"]\n", + "\n", + "\n", + "df[\"gh_failure_commits\"] = df.failure_commits.apply(\n", + " lambda commits: list(get_git_rev(commits))\n", + ")\n", + "df[\"gh_fix_commits\"] = df.fix_commits.apply(lambda commits: list(get_git_rev(commits)))\n", + "df = df.rename(\n", + " columns={\"failure_commits\": \"hg_failure_commits\", \"fix_commits\": \"hg_fix_commits\"}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9dc12cf3f6f9b844", + "metadata": {}, + "source": "### Get bugzilla comments before the fix" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d43ac43a96c50c6", + "metadata": {}, + "outputs": [], + "source": [ + "from libmozdata import config\n", + "\n", + "from bugbug.tools.core.platforms.bugzilla import Bug\n", + "\n", + "config.set_default_value(\"User-Agent\", \"name\", \"bugbug/1.0\")\n", + "\n", + "\n", + "def _get_comments(bug, fail_commit):\n", + " for comment in bug._metadata[\"comments\"]:\n", + " if comment[\"creator\"] == \"pulsebot@bmo.tld\":\n", + " if fail_commit[:12] in comment[\"raw_text\"]:\n", + " # stop adding comments at failure commit push\n", + " yield comment[\"raw_text\"]\n", + " break\n", + " else:\n", + " continue\n", + " if comment[\"raw_text\"]:\n", + " yield comment[\"raw_text\"]\n", + "\n", + "\n", + "def _get_fix_commit_date(bug, fix_commit):\n", + " for comment in bug._metadata[\"comments\"]:\n", + " if (\n", + " comment[\"creator\"] == \"pulsebot@bmo.tld\"\n", + " and fix_commit[:12] in comment[\"raw_text\"]\n", + " ):\n", + " 
return comment[\"time\"]\n", + " return None\n", + "\n", + "\n", + "def get_bug_info_and_fix_date(build_fail):\n", + " bug_id = build_fail[\"bug_id\"]\n", + " fail_commit = build_fail[\"hg_failure_commits\"][0]\n", + " fix_commit = build_fail[\"hg_fix_commits\"][0]\n", + "\n", + " try:\n", + " bug = Bug.get(bug_id)\n", + " except ValueError as ex:\n", + " print(ex)\n", + " return pd.Series([None, None])\n", + "\n", + " return pd.Series(\n", + " [\n", + " {\"title\": bug.summary, \"comments\": list(_get_comments(bug, fail_commit))},\n", + " _get_fix_commit_date(bug, fix_commit),\n", + " ]\n", + " )\n", + "\n", + "\n", + "df[[\"pre_fix_bug\", \"fix_commit_date\"]] = df.apply(get_bug_info_and_fix_date, axis=1)\n", + "df = df[df[\"pre_fix_bug\"].notnull() & df[\"fix_commit_date\"].notnull()]\n", + "print(f\"With bug info: {len(df)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "ab9m8qftc2q", + "metadata": {}, + "source": "### Filter out data before model cutoff (data contamination prevention)" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0f792f5b775076d", + "metadata": {}, + "outputs": [], + "source": [ + "CONTAMINATION_CUTOFF = \"2025-09-01\"\n", + "\n", + "before = len(df)\n", + "df = df[\n", + " df[\"fix_commit_date\"].apply(lambda d: d is not None and d >= CONTAMINATION_CUTOFF)\n", + "]\n", + "print(\n", + " f\"Filtered {before - len(df)} examples with fix date before {CONTAMINATION_CUTOFF}\"\n", + ")\n", + "print(f\"Final number of examples: {len(df)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fde92cac54949c08", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "id": "1695783abf03124f", + "metadata": {}, + "source": "## Save the Dataset" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6096ee230ee902a", + "metadata": {}, + "outputs": [], + "source": [ + "examples = df.to_dict(orient=\"records\")\n", + "examples[0]" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "79d08ee1b92ca6de", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = weave.Dataset(\n", + " name=DATASET_NAME,\n", + " description=\"Build repair evaluation dataset with failure logs, ground truth fix commits and pre fix Bugzilla comments.\",\n", + " rows=examples,\n", + ")\n", + "\n", + "_ = weave.publish(dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "384f041b61c6c2e7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bugbug (3.12.7)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/requirements.txt b/requirements.txt index f431d8fa49..7ee624df7f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ amqp==5.3.1 async-lru==2.1.0 beautifulsoup4==4.14.3 boto3==1.42.49 +claude-agent-sdk>=0.1.30 httpx==0.28.1 imbalanced-learn==0.14.1 langchain==1.2.10 diff --git a/scripts/build_repair_eval.py b/scripts/build_repair_eval.py new file mode 100644 index 0000000000..aa7d91cc1f --- /dev/null +++ b/scripts/build_repair_eval.py @@ -0,0 +1,355 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +"""Standalone CLI for build repair evaluation. 
+ +Usage: + python scripts/build_repair_eval.py + python scripts/build_repair_eval.py --analysis-only + python scripts/build_repair_eval.py --trials 3 + python scripts/build_repair_eval.py --limit 5 + python scripts/build_repair_eval.py --parallelism 4 + python scripts/build_repair_eval.py --no-try-push + python scripts/build_repair_eval.py --verbose +""" + +import argparse +import asyncio +import json +import logging +import os +import uuid +from datetime import datetime +from functools import cached_property +from typing import Any + +import weave + +from bugbug.tools.build_repair.agent import AgentResponse, BuildFailure, BuildRepairTool +from bugbug.tools.build_repair.config import MODEL_CUTOFF_DATES +from bugbug.tools.build_repair.scorer import ( + BasicMetricsScorer, + BuildPassRateScorer, + LLMFixMatchingScorer, +) +from bugbug.tools.build_repair.worktree import WorktreeManager + +logger = logging.getLogger(__name__) + +# TODO: replace with native tracing for Anthropic Agents SDK when released by W&B + + +def _attr(obj, key, default=None): + if isinstance(obj, dict): + return obj.get(key, default) + return getattr(obj, key, default) + + +def _to_chat_message(data: dict) -> dict | None: + """Convert a serialized claude_agent_sdk message to OpenAI chat format. + + Content blocks may be dicts (from model_dump) or dataclass instances + (from vars), so we use _attr() for uniform access. 
+ """ + msg_type = data.get("type", "") + + if msg_type == "AssistantMessage": + blocks = data.get("content", []) + text_parts = [] + tool_calls = [] + for block in blocks: + text = _attr(block, "text") + if text is not None: + text_parts.append(text) + continue + name = _attr(block, "name") + block_id = _attr(block, "id") + if name is not None and block_id is not None: + tool_calls.append( + { + "id": block_id, + "type": "function", + "function": { + "name": name, + "arguments": json.dumps(_attr(block, "input", {})), + }, + } + ) + if not text_parts and not tool_calls: + return None + msg: dict = {"role": "assistant"} + if text_parts: + msg["content"] = "\n".join(text_parts) + if tool_calls: + msg["tool_calls"] = tool_calls + return msg + + if msg_type == "UserMessage": + content = data.get("content", "") + if isinstance(content, list): + for block in content: + tool_use_id = _attr(block, "tool_use_id") + if tool_use_id: + block_content = _attr(block, "content", "") + return { + "role": "tool", + "tool_call_id": tool_use_id, + "content": str(block_content) if block_content else "", + } + + return None + + +@weave.op(kind="llm") +def trace_llm_stage( + stage: str, + messages: list[dict], + model: str, + result_data: dict | None = None, +) -> dict: + last_assistant = "" + for msg in reversed(messages): + if msg.get("role") == "assistant" and msg.get("content"): + last_assistant = msg["content"] + break + + result: dict[str, Any] = { + "model": model, + "choices": [ + { + "message": {"role": "assistant", "content": last_assistant}, + } + ], + } + if result_data: + raw_usage = result_data.get("usage", {}) or {} + input_tokens = raw_usage.get("input_tokens", 0) + output_tokens = raw_usage.get("output_tokens", 0) + result["usage"] = { + "prompt_tokens": input_tokens, + "completion_tokens": output_tokens, + "total_tokens": input_tokens + output_tokens, + "cache_read_input_tokens": raw_usage.get("cache_read_input_tokens", 0), + "cache_creation_input_tokens": 
raw_usage.get( + "cache_creation_input_tokens", 0 + ), + "total_cost_usd": result_data.get("total_cost_usd", 0), + "num_turns": result_data.get("num_turns", 0), + } + return result + + +# Per-token costs in USD (standard, non-cached rates). +# Weave uses these for its built-in cost UI; the SDK's total_cost_usd +# (which accounts for cache pricing) is tracked separately as the authoritative cost. +ANTHROPIC_TOKEN_COSTS: dict[str, tuple[float, float]] = { + "claude-opus-4-6": (15.0e-6, 75.0e-6), + "claude-sonnet-4-6": (3.0e-6, 15.0e-6), + "claude-haiku-4-5-20251001": (0.8e-6, 4.0e-6), + "claude-sonnet-4-5-20250929": (3.0e-6, 15.0e-6), + "claude-opus-4-5-20251101": (15.0e-6, 75.0e-6), + "claude-opus-4-1-20250805": (15.0e-6, 75.0e-6), + "claude-sonnet-4-20250514": (3.0e-6, 15.0e-6), + "claude-3-7-sonnet-20250219": (3.0e-6, 15.0e-6), + "claude-opus-4-20250514": (15.0e-6, 75.0e-6), +} + + +def _register_model_costs(client) -> None: + for model_id, (prompt_cost, completion_cost) in ANTHROPIC_TOKEN_COSTS.items(): + try: + client.add_cost( + llm_id=model_id, + prompt_token_cost=prompt_cost, + completion_token_cost=completion_cost, + ) + except Exception as e: + logger.debug(f"Could not register cost for {model_id}: {e}") + + +def _make_weave_callback(): + stages: dict[str, dict] = {} + + def on_message(stage: str, data: dict) -> None: + msg_type = data["type"] + if msg_type == "stage_start": + messages = [] + if "system_prompt" in data: + messages.append({"role": "system", "content": data["system_prompt"]}) + messages.append({"role": "user", "content": data["prompt"]}) + + stages[stage] = { + "model": data["model"], + "messages": messages, + } + elif msg_type == "stage_end": + if stage in stages: + s = stages.pop(stage) + trace_llm_stage( + stage=stage, + messages=s["messages"], + model=s["model"], + result_data=data.get("result_data") or None, + ) + else: + if stage in stages: + chat_msg = _to_chat_message(data) + if chat_msg: + stages[stage]["messages"].append(chat_msg) + 
+ return on_message + + +class BuildRepairError(Exception): + """Raised when the agent completes but reports an error.""" + + def __init__(self, output: dict): + self.output = output + super().__init__(output.get("error", "Unknown error")) + + +class BuildRepairModel(weave.Model): + """Weave Model wrapper that creates a worktree per example and runs BuildRepairTool.""" + + firefox_repo: str + analysis_only: bool = False + no_try_push: bool = False + + @cached_property + def tool(self) -> BuildRepairTool: + return BuildRepairTool.create(analysis_only=self.analysis_only, eval_mode=True) + + @cached_property + def worktree_mgr(self) -> WorktreeManager: + return WorktreeManager(self.firefox_repo) + + @weave.op() + async def invoke( + self, + bug_id: int, + pre_fix_bug: dict, + gh_failure_commits: list[str], + failures: list[dict], + fix_commit_date: str, + **kwargs, + ) -> dict: + wt_name = f"bug-{bug_id}-{uuid.uuid4().hex[:8]}" + logger.info( + f"Invoking bug {bug_id} " + f"(commit={gh_failure_commits[0][:12]}, {len(failures)} failures)" + ) + + worktree_created = False + try: + cutoff = max( + MODEL_CUTOFF_DATES[self.tool.analysis_model], + MODEL_CUTOFF_DATES[self.tool.fix_model], + ) + if datetime.fromisoformat(fix_commit_date).date() < cutoff: + logger.warning( + f"Skipping bug {bug_id}: fix date {fix_commit_date} " + f"is before model cutoff {cutoff}" + ) + raise ValueError("skipped_data_contamination") + + worktree_path = self.worktree_mgr.create(gh_failure_commits[0], wt_name) + worktree_created = True + + failure = BuildFailure( + bug_id=bug_id, + bug_title=pre_fix_bug["title"], + bug_comments=pre_fix_bug["comments"], + git_commit=gh_failure_commits[0], + failure_tasks=failures, + ) + result: AgentResponse = await self.tool.run( + failure, + worktree_path=worktree_path, + skip_try_push=self.no_try_push, + on_message=_make_weave_callback(), + ) + logger.info( + f"Bug {bug_id} completed: error={result.error}, " + f"diff_len={len(result.diff)}, 
cost=${result.cost_usd:.4f}, " + f"turns={result.num_turns}, " + f"local_build={result.local_build_passed}, " + f"try_build={result.try_build_passed}" + ) + + output = result.model_dump() + if result.error: + raise BuildRepairError(output) + return output + finally: + if worktree_created: + logger.info(f"Bug {bug_id}: cleaning up worktree {wt_name}") + self.worktree_mgr.cleanup(wt_name) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Build repair evaluation") + parser.add_argument("--limit", type=int, default=None) + parser.add_argument("--trials", type=int, default=1) + parser.add_argument("--parallelism", type=int, default=8) + parser.add_argument("--firefox-repo", default=os.environ.get("FIREFOX_GIT_REPO")) + parser.add_argument("--dataset", default="build_repair_one_commit_eval") + parser.add_argument("--analysis-only", action="store_true") + parser.add_argument("--no-try-push", action="store_true") + parser.add_argument("--verbose", action="store_true", help="Enable DEBUG logging") + args = parser.parse_args() + + if not args.firefox_repo: + parser.error("--firefox-repo or FIREFOX_GIT_REPO env var is required") + + log_level = logging.DEBUG if args.verbose else logging.INFO + logging.basicConfig( + level=log_level, + format="%(asctime)s %(levelname)s %(name)s: %(message)s", + ) + if not args.verbose: + logging.getLogger("httpx").setLevel(logging.WARNING) + logging.getLogger("httpcore").setLevel(logging.WARNING) + logging.getLogger("hgitaly").setLevel(logging.WARNING) + logging.getLogger("urllib3").setLevel(logging.WARNING) + + logger.info( + f"Starting evaluation: dataset={args.dataset}, limit={args.limit}, " + f"trials={args.trials}, parallelism={args.parallelism}, " + f"analysis_only={args.analysis_only}, no_try_push={args.no_try_push}, " + f"firefox_repo={args.firefox_repo}" + ) + + os.environ["WEAVE_PARALLELISM"] = str(args.parallelism) + client = weave.init("bugbug-build-repair-eval") + _register_model_costs(client) + + dataset = 
weave.ref(args.dataset).get() + logger.info(f"Loaded dataset {args.dataset} with {len(dataset.rows)} rows") + if args.limit: + dataset.rows = dataset.rows[: args.limit] + logger.info(f"Limited to {len(dataset.rows)} rows") + + scorers = [BasicMetricsScorer(num_trials=args.trials), LLMFixMatchingScorer()] + if not args.analysis_only: + scorers.insert(1, BuildPassRateScorer(num_trials=args.trials)) + logger.info(f"Scorers: {[type(s).__name__ for s in scorers]}") + + model = BuildRepairModel( + firefox_repo=args.firefox_repo, + analysis_only=args.analysis_only, + no_try_push=args.no_try_push, + ) + evaluation = weave.Evaluation( + name="build-repair", + dataset=dataset, + scorers=scorers, + trials=args.trials, + ) + results = asyncio.run(evaluation.evaluate(model)) + logger.info(f"Evaluation results: {results}") + + +if __name__ == "__main__": + main()