diff --git a/.gitignore b/.gitignore index a00be42b73..77ee39bd33 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,6 @@ node_modules/ *.log # Desktop Service Store *.DS_Store + +# JetBrains IDEs +.idea diff --git a/bugbug/tools/build_repair/__init__.py b/bugbug/tools/build_repair/__init__.py new file mode 100644 index 0000000000..2f3caac2c6 --- /dev/null +++ b/bugbug/tools/build_repair/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +from bugbug.tools.build_repair.agent import AgentResponse, BuildFailure, BuildRepairTool + +__all__ = ["AgentResponse", "BuildFailure", "BuildRepairTool"] diff --git a/bugbug/tools/build_repair/agent.py b/bugbug/tools/build_repair/agent.py new file mode 100644 index 0000000000..0845315871 --- /dev/null +++ b/bugbug/tools/build_repair/agent.py @@ -0,0 +1,407 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import subprocess +import traceback +from collections.abc import Callable +from logging import getLogger +from pathlib import Path + +from claude_agent_sdk import ClaudeAgentOptions, ResultMessage, query +from pydantic import BaseModel, Field + +from bugbug.tools.base import GenerativeModelTool +from bugbug.tools.build_repair.config import ( + ADDITIONAL_DIRS, + ALLOWED_TOOLS, + ANALYSIS_MODEL, + FIREFOX_MCP_URL, + FIX_MODEL, + SANDBOX_CONFIG, +) +from bugbug.tools.build_repair.prompts import ( + ANALYSIS_TEMPLATE, + EVAL_PROMPT, + FIX_TEMPLATE, +) + +logger = getLogger(__name__) + + +class BuildFailure(BaseModel): + """Input describing a build failure from the dataset.""" + + bug_id: int = Field(description="The ID of the bug in Bugzilla.") + bug_title: str | None = Field(default=None, description="Optional bug title.") + bug_comments: list[str] | None = Field( + default=None, description="Optional bug comments." + ) + git_commit: str = Field(description="Git revision to checkout.") + failure_tasks: list[dict] = Field( + description="List of {task_name, task_id, retry_id, failure_lines}." 
+ ) + + +class AgentResponse(BaseModel): + """Output from a build repair run, including analysis, diff, cost, and build results.""" + + summary: str = Field(default="") + analysis: str = Field(default="") + diff: str = Field(default="") + error: str | None = Field(default=None) + error_traceback: str | None = Field(default=None) + failure_stage: str | None = Field(default=None) + cost_usd: float = Field(default=0.0) + num_turns: int = Field(default=0) + input_tokens: int = Field(default=0) + output_tokens: int = Field(default=0) + cache_read_input_tokens: int = Field(default=0) + cache_creation_input_tokens: int = Field(default=0) + local_build_passed: bool | None = Field(default=None) + try_build_passed: bool | None = Field(default=None) + lando_job_id: str | None = Field(default=None) + treeherder_url: str | None = Field(default=None) + stage1_transcript: list[dict] = Field(default_factory=list) + stage2_transcript: list[dict] = Field(default_factory=list) + + +class BuildRepairTool(GenerativeModelTool): + """Two-stage build repair agent using Claude Agent SDK. + + Stage 1: Analyzes the failure and produces analysis/planning/summary docs. + Stage 2: Reads the analysis and implements a fix. Skipped in analysis-only mode. + After Stage 2, commits the fix, runs ./mach build, and optionally submits to try. 
+ """ + + def __init__( + self, + target_software: str = "Mozilla Firefox", + analysis_only: bool = False, + eval_mode: bool = False, + analysis_model: str = ANALYSIS_MODEL, + fix_model: str = FIX_MODEL, + ) -> None: + self.eval_mode = eval_mode + self.target_software = target_software + self.analysis_only = analysis_only + self.analysis_model = analysis_model + self.fix_model = fix_model + + @classmethod + def create(cls, **kwargs): + return cls(**kwargs) + + @staticmethod + def _usage_fields(usage: dict) -> dict: + return { + "input_tokens": usage.get("input_tokens", 0), + "output_tokens": usage.get("output_tokens", 0), + "cache_read_input_tokens": usage.get("cache_read_input_tokens", 0), + "cache_creation_input_tokens": usage.get("cache_creation_input_tokens", 0), + } + + @staticmethod + def _serialize_message(message) -> dict: + data = {"type": type(message).__name__} + if hasattr(message, "model_dump"): + data.update(message.model_dump()) + elif hasattr(message, "__dict__"): + data.update(vars(message)) + else: + data["raw"] = str(message) + return data + + async def _run_stage( + self, + stage_name: str, + prompt: str, + model: str, + options: ClaudeAgentOptions, + bug_id: int, + on_message: Callable[[str, dict], None] | None = None, + ) -> tuple[list[dict], float, int, dict]: + transcript: list[dict] = [] + cost = 0.0 + turns = 0 + result_data: dict = {} + usage: dict = {} + + if on_message: + on_message( + stage_name, + { + "type": "stage_start", + "prompt": prompt, + "model": model, + }, + ) + try: + async for message in query(prompt=prompt, options=options): + serialized = self._serialize_message(message) + transcript.append(serialized) + logger.info(f"Bug {bug_id}: {stage_name} [{serialized['type']}]") + logger.debug(f"Bug {bug_id}: {stage_name} detail: {serialized}") + if on_message: + on_message(stage_name, serialized) + if isinstance(message, ResultMessage): + cost += message.total_cost_usd or 0 + turns += message.num_turns or 0 + usage = 
getattr(message, "usage", {}) or {} + result_data = serialized + finally: + if on_message: + on_message( + stage_name, + { + "type": "stage_end", + "cost_usd": cost, + "num_turns": turns, + "result_data": result_data, + }, + ) + + return transcript, cost, turns, usage + + def _prepare_input_files(self, failure: BuildFailure, worktree_path: Path) -> None: + in_dir = worktree_path / "repair_agent" / "in" / str(failure.bug_id) + in_dir.mkdir(parents=True, exist_ok=True) + + (in_dir / "bug_description.md").write_text( + f"# Bug {failure.bug_id}: {failure.bug_title}\n\n" + + "\n\n---\n\n".join(failure.bug_comments or []) + ) + + logs_content = "" + for task in failure.failure_tasks: + logs_content += f"## {task['task_name']} (task_id: {task['task_id']})\n\n" + logs_content += "\n".join(task["failure_lines"]) + "\n\n" + (in_dir / "build_failure_logs.md").write_text(logs_content) + + out_dir = worktree_path / "repair_agent" / "out" / str(failure.bug_id) + out_dir.mkdir(parents=True, exist_ok=True) + + logger.info( + f"Prepared input files for bug {failure.bug_id} at {in_dir} " + f"({len(failure.failure_tasks)} failure tasks)" + ) + + def _read_output(self, failure: BuildFailure, worktree_path: Path, key: str) -> str: + path = ( + worktree_path / "repair_agent" / "out" / str(failure.bug_id) / f"{key}.md" + ) + if path.exists(): + return path.read_text() + return "" + + async def run( + self, + failure: BuildFailure, + worktree_path: Path, + skip_try_push: bool = False, + on_message: Callable[[str, dict], None] | None = None, + ) -> AgentResponse: + logger.info( + f"Starting build repair for bug {failure.bug_id} " + f"(commit={failure.git_commit}, worktree={worktree_path}, " + f"analysis_only={self.analysis_only}, skip_try_push={skip_try_push})" + ) + self._prepare_input_files(failure, worktree_path) + + mcp_servers = {"firefox": {"type": "http", "url": FIREFOX_MCP_URL}} + disallowed = ["AskUserQuestion", "Task"] + total_cost = 0.0 + total_turns = 0 + total_usage: dict = {} 
+ + logger.info( + f"Bug {failure.bug_id}: starting Stage 1 (analysis) " + f"with model={self.analysis_model}" + ) + stage1_options = ClaudeAgentOptions( + model=self.analysis_model, + cwd=str(worktree_path), + allowed_tools=ALLOWED_TOOLS, + disallowed_tools=disallowed, + add_dirs=ADDITIONAL_DIRS, + sandbox=SANDBOX_CONFIG, + permission_mode="acceptEdits", + effort="high", + mcp_servers=mcp_servers, + ) + analysis_prompt = ANALYSIS_TEMPLATE.format( + bug_id=failure.bug_id, + target_software=self.target_software, + eval=EVAL_PROMPT if self.eval_mode else "", + ) + try: + ( + stage1_transcript, + stage1_cost, + stage1_turns, + stage1_usage, + ) = await self._run_stage( + "analysis", + analysis_prompt, + self.analysis_model, + stage1_options, + failure.bug_id, + on_message, + ) + total_cost += stage1_cost + total_turns += stage1_turns + for k, v in stage1_usage.items(): + if isinstance(v, (int, float)): + total_usage[k] = total_usage.get(k, 0) + v + except Exception as e: + logger.error( + f"Bug {failure.bug_id}: Stage 1 (analysis) failed: {e}", exc_info=True + ) + return AgentResponse( + error=str(e), + error_traceback=traceback.format_exc(), + failure_stage="analysis", + cost_usd=total_cost, + num_turns=total_turns, + **self._usage_fields(total_usage), + ) + + logger.info( + f"Bug {failure.bug_id}: Stage 1 complete " + f"(cost=${total_cost:.4f}, turns={total_turns})" + ) + + summary = self._read_output(failure, worktree_path, "summary") + analysis = self._read_output(failure, worktree_path, "analysis") + logger.info( + f"Bug {failure.bug_id}: read output files " + f"(summary={len(summary)} chars, analysis={len(analysis)} chars)" + ) + + if self.analysis_only: + logger.info(f"Bug {failure.bug_id}: analysis-only mode, skipping Stage 2") + return AgentResponse( + summary=summary, + analysis=analysis, + cost_usd=total_cost, + num_turns=total_turns, + **self._usage_fields(total_usage), + stage1_transcript=stage1_transcript, + ) + + logger.info( + f"Bug {failure.bug_id}: 
starting Stage 2 (fix) with model={self.fix_model}" + ) + stage2_options = ClaudeAgentOptions( + model=self.fix_model, + cwd=str(worktree_path), + allowed_tools=ALLOWED_TOOLS, + disallowed_tools=disallowed, + add_dirs=ADDITIONAL_DIRS, + sandbox=SANDBOX_CONFIG, + permission_mode="acceptEdits", + effort="low", + mcp_servers=mcp_servers, + ) + fix_prompt = FIX_TEMPLATE.format( + bug_id=failure.bug_id, eval=EVAL_PROMPT if self.eval_mode else "" + ) + try: + ( + stage2_transcript, + stage2_cost, + stage2_turns, + stage2_usage, + ) = await self._run_stage( + "fix", + fix_prompt, + self.fix_model, + stage2_options, + failure.bug_id, + on_message, + ) + total_cost += stage2_cost + total_turns += stage2_turns + for k, v in stage2_usage.items(): + if isinstance(v, (int, float)): + total_usage[k] = total_usage.get(k, 0) + v + except Exception as e: + logger.error( + f"Bug {failure.bug_id}: Stage 2 (fix) failed: {e}", exc_info=True + ) + return AgentResponse( + summary=summary, + analysis=analysis, + error=str(e), + error_traceback=traceback.format_exc(), + failure_stage="fix", + cost_usd=total_cost, + num_turns=total_turns, + **self._usage_fields(total_usage), + ) + + logger.info( + f"Bug {failure.bug_id}: Stage 2 complete " + f"(cost=${total_cost:.4f}, turns={total_turns})" + ) + + diff_result = subprocess.run( + ["git", "diff", "HEAD"], + cwd=worktree_path, + capture_output=True, + text=True, + ) + diff = diff_result.stdout + logger.info(f"Bug {failure.bug_id}: git diff produced {len(diff)} chars") + + if not diff.strip(): + logger.warning(f"Bug {failure.bug_id}: no diff produced, returning early") + return AgentResponse( + summary=summary, + analysis=analysis, + diff=diff, + cost_usd=total_cost, + num_turns=total_turns, + **self._usage_fields(total_usage), + stage1_transcript=stage1_transcript, + stage2_transcript=stage2_transcript, + ) + + from bugbug.tools.build_repair.try_server import run_try_verification + + task_name = ( + failure.failure_tasks[0]["task_name"] if 
failure.failure_tasks else "" + ) + logger.info( + f"Bug {failure.bug_id}: starting try verification " + f"(task={task_name}, skip_try_push={skip_try_push})" + ) + try_result = run_try_verification( + worktree_path=worktree_path, + bug_id=failure.bug_id, + task_name=task_name, + skip_try_push=skip_try_push, + ) + + logger.info( + f"Bug {failure.bug_id}: try verification done " + f"(local_build={try_result.local_build_passed}, " + f"try_build={try_result.try_build_passed}, " + f"lando_job={try_result.lando_job_id}, " + f"total_cost=${total_cost:.4f}, total_turns={total_turns})" + ) + return AgentResponse( + summary=summary, + analysis=analysis, + diff=diff, + cost_usd=total_cost, + num_turns=total_turns, + **self._usage_fields(total_usage), + local_build_passed=try_result.local_build_passed, + try_build_passed=try_result.try_build_passed, + lando_job_id=try_result.lando_job_id, + treeherder_url=try_result.treeherder_url, + stage1_transcript=stage1_transcript, + stage2_transcript=stage2_transcript, + ) diff --git a/bugbug/tools/build_repair/config.py b/bugbug/tools/build_repair/config.py new file mode 100644 index 0000000000..f4bde13a3a --- /dev/null +++ b/bugbug/tools/build_repair/config.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +from datetime import date + +from claude_agent_sdk import SandboxNetworkConfig, SandboxSettings + +ANALYSIS_MODEL = "claude-opus-4-6" +FIX_MODEL = "claude-opus-4-6" +DEFAULT_MAX_TURNS = 80 +WORKTREE_BASE_DIR = "/tmp/build_repair_worktrees" +TRY_PUSH_TIMEOUT_SECONDS = 7200 +TRY_PUSH_POLL_INTERVAL_SECONDS = 60 +TREEHERDER_BASE_URL = "https://treeherder.mozilla.org" + +FIREFOX_MCP_URL = "https://mcp-dev.moz.tools/mcp" + +# Training data cutoff dates per model, for data contamination filtering. 
+# Examples with fix_commit_date before the cutoff may have been in training data.
+# Source: https://platform.claude.com/docs/en/about-claude/models/overview
+MODEL_CUTOFF_DATES = {
+    "claude-opus-4-6": date(2025, 8, 1),
+    "claude-sonnet-4-6": date(2026, 1, 1),
+    "claude-haiku-4-5-20251001": date(2025, 7, 1),
+    "claude-sonnet-4-5-20250929": date(2025, 7, 1),
+    "claude-opus-4-5-20251101": date(2025, 8, 1),
+    "claude-opus-4-1-20250805": date(2025, 3, 1),
+    "claude-sonnet-4-20250514": date(2025, 3, 1),
+    "claude-3-7-sonnet-20250219": date(2024, 11, 1),
+    "claude-opus-4-20250514": date(2025, 3, 1),
+}
+
+ALLOWED_TOOLS = [
+    "Edit(~/.mozbuild)",
+    "Edit(~/.cache/uv)",
+    "Bash(./mach build:*)",
+    "Bash(./mach clobber:*)",
+    "Bash(./mach configure:*)",
+    "Bash(./mach run:*)",
+    "Bash(./mach test:*)",
+    "Bash(./mach wpt:*)",
+    "Bash(./mach lint:*)",
+    "Bash(./mach format:*)",
+    "Bash(./mach clang-format:*)",
+    "Bash(./mach try:*)",
+    "Bash(./mach help:*)",
+    "Bash(./mach vendor:*)",
+    "Bash(./mach bootstrap:*)",
+    "Bash(./mach artifact:*)",
+    "Bash(clang++:*)",
+    "Bash(rm:*)",
+    "Bash(timeout:*)",
+    "Bash(find:*)",
+    "Bash(grep:*)",
+    "Bash(tee:*)",
+    "Bash(kill:*)",
+    "Bash(searchfox-cli:*)",
+    "Bash(treeherder-cli:*)",
+    "Bash(jj:*)",
+    "WebFetch(domain:firefox-source-docs.mozilla.org)",
+    "WebFetch(domain:treeherder.mozilla.org)",
+    "WebFetch(domain:searchfox.org)",
+    "WebFetch(domain:o1069899.ingest.sentry.io)",
+]
+
+ADDITIONAL_DIRS = [
+    "~/.mozbuild",
+    "~/.cache/uv/",
+]
+
+SANDBOX_CONFIG = SandboxSettings(
+    enabled=True,
+    autoAllowBashIfSandboxed=True,
+    allowUnsandboxedCommands=False,
+    network=SandboxNetworkConfig(allowLocalBinding=True),
+)
diff --git a/bugbug/tools/build_repair/prompts.py b/bugbug/tools/build_repair/prompts.py
new file mode 100644
index 0000000000..cdab7e11e0
--- /dev/null
+++ b/bugbug/tools/build_repair/prompts.py
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0.
If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +"""Prompt templates for build repair agent.""" + +ANALYSIS_TEMPLATE = """You are an expert {target_software} engineer tasked with analyzing and fixing a build failure. + +Investigate why the last commit broke {target_software} build. + +The last commit attempted to fix a bug from Bugzilla. + +Analyze the following: +1. Git diff for the last commit +2. Bugzilla bug description +3. Taskcluster build failure logs +The files with bug description and logs are located at @repair_agent/in/{bug_id} + +Create three separate documents: +1. repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues +2. repair_agent/out/{bug_id}/planning.md with a fixing plan +3. repair_agent/out/{bug_id}/summary.md with a brief one paragraph summary of analysis and planning that can point a developer in the right direction + +Do not prompt to edit those documents. +{eval} + +Do not write any code yet. Work fully autonomously, do not ask any questions. Think hard. +""" + +FIX_TEMPLATE = """Read the following files and implement a fix of the failure: +1. repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues +2. repair_agent/out/{bug_id}/planning.md with a fixing plan +{eval} + +Do not prompt to edit files. Work fully autonomously, do not ask any questions. Use all allowed tools without prompting. +""" + +EVAL_PROMPT = """ +Do not request bug info from Bugzilla or Phabricator. Use only the provided file with bug description. +Do not look at git commits other than the specified last commit. +""" diff --git a/bugbug/tools/build_repair/scorer.py b/bugbug/tools/build_repair/scorer.py new file mode 100644 index 0000000000..566b384a6a --- /dev/null +++ b/bugbug/tools/build_repair/scorer.py @@ -0,0 +1,163 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 
2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +from logging import getLogger + +import weave + +logger = getLogger(__name__) + + +def _pass_at_k( + score_rows: list[dict], + num_trials: int, + metric: str, +) -> dict[str, float]: + """Compute pass@k from scorer rows ordered by trial. + + Rows are ordered: first num_examples = trial 0, next = trial 1, etc. + Rows may be empty dicts when the model raised an exception. + """ + num_examples = len(score_rows) // num_trials + pass_at: dict[str, float] = {} + for n in sorted({1, 3, num_trials}): + if n > num_trials: + continue + successes = sum( + any(score_rows[t * num_examples + i].get(metric) is True for t in range(n)) + for i in range(num_examples) + ) + pass_at[f"pass@{n}"] = successes / num_examples if num_examples else 0 + + all_pass = sum( + all( + score_rows[t * num_examples + i].get(metric) is True + for t in range(num_trials) + ) + for i in range(num_examples) + ) + pass_at[f"pass^{num_trials}"] = all_pass / num_examples if num_examples else 0 + + return pass_at + + +class BasicMetricsScorer(weave.Scorer): + """Scores success rate, diff production rate, cost, and turn count.""" + + num_trials: int = 1 + + @weave.op() + def score(self, output: dict | None) -> dict: + if output is None: + return { + "successful": False, + "has_diff": False, + "cost_usd": 0, + "num_turns": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_input_tokens": 0, + "cache_creation_input_tokens": 0, + } + return { + "successful": output.get("error") is None, + "has_diff": bool(output.get("diff", "").strip()), + "cost_usd": output.get("cost_usd", 0), + "num_turns": output.get("num_turns", 0), + "input_tokens": output.get("input_tokens", 0), + "output_tokens": output.get("output_tokens", 0), + "cache_read_input_tokens": output.get("cache_read_input_tokens", 0), + "cache_creation_input_tokens": output.get("cache_creation_input_tokens", 0), + } + + def 
summarize(self, score_rows: list[dict]) -> dict: + n = len(score_rows) + costs = [r.get("cost_usd", 0) for r in score_rows] + input_toks = [r.get("input_tokens", 0) for r in score_rows] + output_toks = [r.get("output_tokens", 0) for r in score_rows] + summary = { + "success_rate": sum(r.get("successful", False) for r in score_rows) / n + if n + else 0, + "diff_rate": sum(r.get("has_diff", False) for r in score_rows) / n + if n + else 0, + "avg_cost_usd": sum(costs) / n if n else 0, + "total_cost_usd": sum(costs), + "total_input_tokens": sum(input_toks), + "total_output_tokens": sum(output_toks), + "total_cache_read_tokens": sum( + r.get("cache_read_input_tokens", 0) for r in score_rows + ), + "total_cache_creation_tokens": sum( + r.get("cache_creation_input_tokens", 0) for r in score_rows + ), + "num_examples": n, + } + if self.num_trials > 1: + summary.update(_pass_at_k(score_rows, self.num_trials, "successful")) + logger.info(f"BasicMetrics summary: {summary}") + return summary + + +class BuildPassRateScorer(weave.Scorer): + """Scores local ./mach build and try push pass rates.""" + + num_trials: int = 1 + + @weave.op() + def score(self, output: dict | None) -> dict: + if output is None: + return { + "local_build_passed": None, + "try_build_passed": None, + } + return { + "local_build_passed": output.get("local_build_passed"), + "try_build_passed": output.get("try_build_passed"), + } + + def summarize(self, score_rows: list[dict]) -> dict: + n = len(score_rows) + local_passed = sum(1 for r in score_rows if r.get("local_build_passed") is True) + try_known = [r for r in score_rows if r.get("try_build_passed") is not None] + try_passed = sum(1 for r in try_known if r.get("try_build_passed") is True) + summary = { + "local_build_pass_rate": local_passed / n if n else 0, + "local_builds_passed": local_passed, + "try_build_pass_rate": try_passed / len(try_known) if try_known else 0, + "try_builds_passed": try_passed, + "try_builds_timed_out": n - len(try_known), + 
"num_examples": n, + } + if self.num_trials > 1: + summary.update( + _pass_at_k(score_rows, self.num_trials, "local_build_passed") + ) + logger.info(f"BuildPassRate summary: {summary}") + return summary + + +class LLMFixMatchingScorer(weave.Scorer): + """Scaffold for LLM-as-a-judge comparing agent fix to ground truth. + + Implementation deferred. Will use a non-Claude LLM to semantically + compare the agent's diff against the ground truth fix commit. + """ + + @weave.op() + async def score(self, output: dict | None, gh_fix_commits: list[str]) -> dict: + if output is None: + return { + "match_score": None, + "match_category": "errored", + } + return { + "match_score": None, + "match_category": "not_implemented", + } + + def summarize(self, score_rows: list[dict]) -> dict: + return {"status": "not_implemented"} diff --git a/bugbug/tools/build_repair/try_server.py b/bugbug/tools/build_repair/try_server.py new file mode 100644 index 0000000000..6ef558556e --- /dev/null +++ b/bugbug/tools/build_repair/try_server.py @@ -0,0 +1,286 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import logging +import os +import re +import subprocess +import time +from dataclasses import dataclass +from logging import getLogger +from pathlib import Path + +import requests + +from bugbug.tools.build_repair.config import ( + TREEHERDER_BASE_URL, + TRY_PUSH_POLL_INTERVAL_SECONDS, + TRY_PUSH_TIMEOUT_SECONDS, +) + +logger = getLogger(__name__) + +_HEADERS = {"User-Agent": "bugbug-build-repair-eval/1.0"} +_LANDO_JOB_ID_RE = re.compile(r"landoCommitID=([A-Za-z0-9_-]+)") + + +def _mach_env(worktree_path: Path) -> dict[str, str]: + env = os.environ.copy() + env["MOZBUILD_STATE_PATH"] = str(worktree_path / ".mozbuild") + return env + + +@dataclass +class TryPushResult: + """Result of local build verification and optional try push submission.""" + + local_build_passed: bool + try_build_passed: bool | None + lando_job_id: str | None + treeherder_url: str | None + + +def _commit_fix(worktree_path: Path, bug_id: int) -> None: + logger.info(f"Committing fix for bug {bug_id} in {worktree_path}") + subprocess.run( + ["git", "add", "-A"], + cwd=worktree_path, + check=True, + ) + subprocess.run( + [ + "git", + "-c", + "user.name=bugbug", + "-c", + "user.email=bugbug@mozilla.com", + "commit", + "-m", + f"Build repair fix for bug {bug_id}", + ], + cwd=worktree_path, + check=True, + ) + logger.info(f"Bug {bug_id}: fix committed") + + +def _run_subprocess( + cmd: list[str], worktree_path: Path, capture: bool +) -> subprocess.CompletedProcess[str]: + if capture: + return subprocess.run( + cmd, + cwd=worktree_path, + env=_mach_env(worktree_path), + capture_output=True, + text=True, + ) + return subprocess.run( + cmd, + cwd=worktree_path, + env=_mach_env(worktree_path), + text=True, + ) + + +def _run_local_build(worktree_path: Path) -> bool: + capture = not logger.isEnabledFor(logging.DEBUG) + + logger.info(f"Running bootstrap in {worktree_path}") + result = _run_subprocess( + ["./mach", "--no-interactive", "bootstrap"], worktree_path, capture + ) + if result.returncode != 0: + 
if capture and result.stderr: + logger.warning(f"Bootstrap stderr:\n{result.stderr[-2000:]}") + raise RuntimeError( + f"Local bootstrap failed with return code {result.returncode}" + ) + + logger.info(f"Running local build in {worktree_path}") + result = _run_subprocess(["./mach", "build"], worktree_path, capture) + passed = result.returncode == 0 + status = "passed" if passed else "failed" + logger.info(f"Local build {status} (returncode={result.returncode})") + if not passed and capture and result.stderr: + logger.warning(f"Build stderr:\n{result.stderr[-2000:]}") + return passed + + +def _submit_try(worktree_path: Path, task_name: str) -> tuple[str | None, str | None]: + logger.info(f"Submitting try push for task={task_name} in {worktree_path}") + result = subprocess.run( + ["./mach", "try", "fuzzy", "--query", task_name], + cwd=worktree_path, + capture_output=True, + text=True, + env=_mach_env(worktree_path), + ) + stdout = result.stdout + result.stderr + logger.debug(f"Try push output: {stdout}") + match = _LANDO_JOB_ID_RE.search(stdout) + if not match: + logger.warning(f"Could not parse Lando job ID from try output: {stdout}") + return None, None + + lando_job_id = match.group(1) + treeherder_url = f"{TREEHERDER_BASE_URL}/jobs?repo=try&landoCommitID={lando_job_id}" + logger.info( + f"Try push submitted: lando_job_id={lando_job_id}, treeherder={treeherder_url}" + ) + return lando_job_id, treeherder_url + + +def _get_push_revision(lando_job_id: str) -> str | None: + try: + resp = requests.get( + f"{TREEHERDER_BASE_URL}/api/project/try/push/", + params={"lando_commit_id": lando_job_id}, + headers=_HEADERS, + timeout=30, + ) + resp.raise_for_status() + results = resp.json().get("results", []) + if results: + return results[0].get("revision") + except Exception: + logger.exception(f"Error fetching push revision for lando job {lando_job_id}") + return None + + +def _get_push_by_revision(revision: str) -> dict | None: + try: + resp = requests.get( + 
f"{TREEHERDER_BASE_URL}/api/project/try/push/", + params={"revision": revision}, + headers=_HEADERS, + timeout=30, + ) + resp.raise_for_status() + results = resp.json().get("results", []) + return results[0] if results else None + except Exception: + logger.exception(f"Error fetching push by revision {revision}") + return None + + +def _get_build_job_result(push_id: int, task_name: str) -> str | None: + try: + resp = requests.get( + f"{TREEHERDER_BASE_URL}/api/project/try/jobs/", + params={"push_id": push_id, "count": 2000}, + headers=_HEADERS, + timeout=30, + ) + resp.raise_for_status() + for job in resp.json().get("results", []): + if task_name in job.get("job_type_name", ""): + if job["state"] != "completed": + return job["state"] + return job["result"] + except Exception: + logger.exception(f"Error fetching build job result for push {push_id}") + return None + + +def _poll_treeherder(lando_job_id: str, task_name: str) -> bool | None: + logger.info( + f"Polling Treeherder for lando_job_id={lando_job_id}, task={task_name} " + f"(timeout={TRY_PUSH_TIMEOUT_SECONDS}s, " + f"interval={TRY_PUSH_POLL_INTERVAL_SECONDS}s)" + ) + deadline = time.monotonic() + TRY_PUSH_TIMEOUT_SECONDS + push_id: int | None = None + poll_count = 0 + + while time.monotonic() < deadline: + poll_count += 1 + if push_id is None: + revision = _get_push_revision(lando_job_id) + if revision: + logger.info( + f"Resolved revision={revision} for lando_job_id={lando_job_id}" + ) + push = _get_push_by_revision(revision) + if push: + push_id = push["id"] + logger.info(f"Resolved push_id={push_id} for revision={revision}") + + if push_id is not None: + result = _get_build_job_result(push_id, task_name) + logger.debug( + f"Poll #{poll_count}: job result={result} for push_id={push_id}" + ) + if result == "success": + logger.info(f"Try build succeeded for lando_job_id={lando_job_id}") + return True + if result in ("busted", "testfailed", "exception"): + logger.info( + f"Try build failed ({result}) for 
lando_job_id={lando_job_id}" + ) + return False + else: + logger.debug( + f"Poll #{poll_count}: push not yet available for " + f"lando_job_id={lando_job_id}" + ) + + time.sleep(TRY_PUSH_POLL_INTERVAL_SECONDS) + + logger.warning( + f"Try push polling timed out after {poll_count} polls " + f"for lando job {lando_job_id}" + ) + return None + + +def run_try_verification( + worktree_path: Path, + bug_id: int, + task_name: str, + skip_try_push: bool = False, +) -> TryPushResult: + logger.info( + f"Starting try verification for bug {bug_id} " + f"(task={task_name}, skip_try_push={skip_try_push})" + ) + _commit_fix(worktree_path, bug_id) + + local_passed = _run_local_build(worktree_path) + if not local_passed: + logger.warning(f"Bug {bug_id}: local build failed, skipping try push") + return TryPushResult( + local_build_passed=False, + try_build_passed=None, + lando_job_id=None, + treeherder_url=None, + ) + + if skip_try_push: + logger.info(f"Bug {bug_id}: local build passed, skipping try push as requested") + return TryPushResult( + local_build_passed=True, + try_build_passed=None, + lando_job_id=None, + treeherder_url=None, + ) + + lando_job_id, treeherder_url = _submit_try(worktree_path, task_name) + if not lando_job_id: + logger.warning(f"Bug {bug_id}: try push submission failed, no lando job ID") + return TryPushResult( + local_build_passed=True, + try_build_passed=None, + lando_job_id=None, + treeherder_url=None, + ) + + try_passed = _poll_treeherder(lando_job_id, task_name) + return TryPushResult( + local_build_passed=True, + try_build_passed=try_passed, + lando_job_id=lando_job_id, + treeherder_url=treeherder_url, + ) diff --git a/bugbug/tools/build_repair/worktree.py b/bugbug/tools/build_repair/worktree.py new file mode 100644 index 0000000000..8ae10ea7da --- /dev/null +++ b/bugbug/tools/build_repair/worktree.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. 
If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +import subprocess +from logging import getLogger +from pathlib import Path + +from bugbug.tools.build_repair.config import WORKTREE_BASE_DIR + +logger = getLogger(__name__) + + +class WorktreeManager: + """Manages git worktrees for parallel evaluation runs against a Firefox repo.""" + + def __init__( + self, + firefox_repo_path: str | Path, + base_dir: str = WORKTREE_BASE_DIR, + ): + self.repo = Path(firefox_repo_path) + self.base_dir = Path(base_dir) + self.base_dir.mkdir(parents=True, exist_ok=True) + + def create(self, commit_hash: str, name: str) -> Path: + worktree_path = self.base_dir / name + logger.info( + f"Creating worktree {name} at {worktree_path} (commit={commit_hash})" + ) + if worktree_path.exists(): + self.cleanup(name) + subprocess.run( + [ + "git", + "worktree", + "add", + "--force", + "--force", + str(worktree_path), + commit_hash, + ], + cwd=self.repo, + check=True, + ) + logger.info(f"Worktree {name} created") + return worktree_path + + def cleanup(self, name: str) -> None: + logger.info(f"Cleaning up worktree {name}") + subprocess.run( + [ + "git", + "worktree", + "remove", + "--force", + "--force", + str(self.base_dir / name), + ], + cwd=self.repo, + check=True, + ) + logger.info(f"Worktree {name} removed") + + def cleanup_all(self) -> None: + logger.info(f"Cleaning up all worktrees in {self.base_dir}") + for entry in self.base_dir.iterdir(): + if entry.is_dir(): + logger.info(f"Removing worktree {entry}") + subprocess.run( + ["git", "worktree", "remove", "--force", "--force", str(entry)], + cwd=self.repo, + check=False, + ) diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml new file mode 100644 index 0000000000..ff72a232b1 --- /dev/null +++ b/docker-compose.dev.yml @@ -0,0 +1,18 @@ +services: + # Base Docker image requires running `./mach taskgraph load-image debian12-amd64-build:latest` + build-repair: + # TO minimize 
rebuilding use `DOCKER_DEFAULT_PLATFORM=linux/amd64 docker build -t build-repair-debian-base -f docker/build_repair/Dockerfile .` + # image: build-repair-debian-base + build: + context: . + dockerfile: docker/build_repair/Dockerfile + volumes: + - .:/app # live code editing + - ${FIREFOX_REPO}:/workspace/firefox # Firefox repo + - build-repair-tmp:/tmp/build_repair_worktrees + environment: + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} + - WANDB_API_KEY=${WANDB_API_KEY} # for weave + - FIREFOX_GIT_REPO=/workspace/firefox +volumes: + build-repair-tmp: diff --git a/docker/build_repair/Dockerfile b/docker/build_repair/Dockerfile new file mode 100644 index 0000000000..597713be73 --- /dev/null +++ b/docker/build_repair/Dockerfile @@ -0,0 +1,16 @@ +# ./mach taskgraph load-image +FROM debian12-amd64-build + +WORKDIR /app + +RUN apt-get install -y python3-pip python3-venv + +RUN python3 -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +RUN apt-get install -y git nodejs npm && rm -rf /var/lib/apt/lists/* +RUN pip install "weave>=0.52.29" pydantic claude-agent-sdk requests + +COPY . 
/app + +ENV FIREFOX_GIT_REPO=/workspace/firefox diff --git a/notebooks/build_repair_create_dataset.ipynb b/notebooks/build_repair_create_dataset.ipynb new file mode 100644 index 0000000000..f1da8fee77 --- /dev/null +++ b/notebooks/build_repair_create_dataset.ipynb @@ -0,0 +1,271 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "af4f2030", + "metadata": {}, + "source": "# Create Dataset for Build Repairs" + }, + { + "cell_type": "markdown", + "id": "f0f37fd7", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f57608e", + "metadata": {}, + "outputs": [], + "source": [ + "import weave\n", + "\n", + "PROJECT_NAME = \"bugbug-build-repair-eval\"\n", + "DATASET_NAME = \"build_repair_one_commit_eval\"\n", + "\n", + "_ = weave.init(PROJECT_NAME)" + ] + }, + { + "cell_type": "markdown", + "id": "255ccaac", + "metadata": {}, + "source": [ + "## Prepare the Data\n", + "\n", + "### Load one commit build failures" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5bd97ef4", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_json(\n", + " \"https://community-tc.services.mozilla.com/api/queue/v1/task/Ra5r2qSyS8G-9pjLrS6l6Q/runs/0/artifacts/public%2Fci_failures.json.zst\",\n", + " lines=True,\n", + ")\n", + "df = df[(df.failure_commits.apply(len) == 1) & (df.fix_commits.apply(len) == 1)]\n", + "print(f\"One commit fail and fix: {len(df)}\")\n", + "df = df[\n", + " df.failures.apply(\n", + " lambda fails: any(\n", + " \"build\" in f[\"task_name\"] and \"test\" not in f[\"task_name\"] for f in fails\n", + " )\n", + " )\n", + "]\n", + "print(f\"Build fails only: {len(df)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "3d5e260dc3245c36", + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-13T23:52:42.195048Z", + "start_time": "2026-02-13T23:52:42.191945Z" + } + }, + "source": "### Get GitHub revisions" + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "196561e8d8b0659f", + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "\n", + "\n", + "def get_git_rev(hg_revs):\n", + " for rev in hg_revs:\n", + " convert_url = f\"https://lando.moz.tools/api/hg2git/firefox/{rev}\"\n", + " resp = requests.get(convert_url)\n", + " if resp.status_code != 200:\n", + " raise ValueError(f\"Unexpected HTTP status code: {resp.status_code}. {resp}\")\n", + " yield resp.json()[\"git_hash\"]\n", + "\n", + "\n", + "df[\"gh_failure_commits\"] = df.failure_commits.apply(\n", + " lambda commits: list(get_git_rev(commits))\n", + ")\n", + "df[\"gh_fix_commits\"] = df.fix_commits.apply(lambda commits: list(get_git_rev(commits)))\n", + "df = df.rename(\n", + " columns={\"failure_commits\": \"hg_failure_commits\", \"fix_commits\": \"hg_fix_commits\"}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9dc12cf3f6f9b844", + "metadata": {}, + "source": "### Get bugzilla comments before the fix" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d43ac43a96c50c6", + "metadata": {}, + "outputs": [], + "source": [ + "from libmozdata import config\n", + "\n", + "from bugbug.tools.core.platforms.bugzilla import Bug\n", + "\n", + "config.set_default_value(\"User-Agent\", \"name\", \"bugbug/1.0\")\n", + "\n", + "\n", + "def _get_comments(bug, fail_commit):\n", + " for comment in bug._metadata[\"comments\"]:\n", + " if comment[\"creator\"] == \"pulsebot@bmo.tld\":\n", + " if fail_commit[:12] in comment[\"raw_text\"]:\n", + " # stop adding comments at failure commit push\n", + " yield comment[\"raw_text\"]\n", + " break\n", + " else:\n", + " continue\n", + " if comment[\"raw_text\"]:\n", + " yield comment[\"raw_text\"]\n", + "\n", + "\n", + "def _get_fix_commit_date(bug, fix_commit):\n", + " for comment in bug._metadata[\"comments\"]:\n", + " if (\n", + " comment[\"creator\"] == \"pulsebot@bmo.tld\"\n", + " and fix_commit[:12] in comment[\"raw_text\"]\n", + " ):\n", + " 
return comment[\"time\"]\n", + " return None\n", + "\n", + "\n", + "def get_bug_info_and_fix_date(build_fail):\n", + " bug_id = build_fail[\"bug_id\"]\n", + " fail_commit = build_fail[\"hg_failure_commits\"][0]\n", + " fix_commit = build_fail[\"hg_fix_commits\"][0]\n", + "\n", + " try:\n", + " bug = Bug.get(bug_id)\n", + " except ValueError as ex:\n", + " print(ex)\n", + " return pd.Series([None, None])\n", + "\n", + " return pd.Series(\n", + " [\n", + " {\"title\": bug.summary, \"comments\": list(_get_comments(bug, fail_commit))},\n", + " _get_fix_commit_date(bug, fix_commit),\n", + " ]\n", + " )\n", + "\n", + "\n", + "df[[\"pre_fix_bug\", \"fix_commit_date\"]] = df.apply(get_bug_info_and_fix_date, axis=1)\n", + "df = df[df[\"pre_fix_bug\"].notnull() & df[\"fix_commit_date\"].notnull()]\n", + "print(f\"With bug info: {len(df)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "ab9m8qftc2q", + "metadata": {}, + "source": "### Filter out data before model cutoff (data contamination prevention)" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0f792f5b775076d", + "metadata": {}, + "outputs": [], + "source": [ + "CONTAMINATION_CUTOFF = \"2025-09-01\"\n", + "\n", + "before = len(df)\n", + "df = df[\n", + " df[\"fix_commit_date\"].apply(lambda d: d is not None and d >= CONTAMINATION_CUTOFF)\n", + "]\n", + "print(\n", + " f\"Filtered {before - len(df)} examples with fix date before {CONTAMINATION_CUTOFF}\"\n", + ")\n", + "print(f\"Final number of examples: {len(df)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fde92cac54949c08", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "id": "1695783abf03124f", + "metadata": {}, + "source": "## Save the Dataset" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6096ee230ee902a", + "metadata": {}, + "outputs": [], + "source": [ + "examples = df.to_dict(orient=\"records\")\n", + "examples[0]" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "79d08ee1b92ca6de", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = weave.Dataset(\n", + " name=DATASET_NAME,\n", + " description=\"Build repair evaluation dataset with failure logs, ground truth fix commits and pre fix Bugzilla comments.\",\n", + " rows=examples,\n", + ")\n", + "\n", + "_ = weave.publish(dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "384f041b61c6c2e7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bugbug (3.12.7)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/requirements.txt b/requirements.txt index f431d8fa49..7ee624df7f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ amqp==5.3.1 async-lru==2.1.0 beautifulsoup4==4.14.3 boto3==1.42.49 +claude-agent-sdk>=0.1.30 httpx==0.28.1 imbalanced-learn==0.14.1 langchain==1.2.10 diff --git a/scripts/build_repair_eval.py b/scripts/build_repair_eval.py new file mode 100644 index 0000000000..aa7d91cc1f --- /dev/null +++ b/scripts/build_repair_eval.py @@ -0,0 +1,355 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +"""Standalone CLI for build repair evaluation. 
+ +Usage: + python scripts/build_repair_eval.py + python scripts/build_repair_eval.py --analysis-only + python scripts/build_repair_eval.py --trials 3 + python scripts/build_repair_eval.py --limit 5 + python scripts/build_repair_eval.py --parallelism 4 + python scripts/build_repair_eval.py --no-try-push + python scripts/build_repair_eval.py --verbose +""" + +import argparse +import asyncio +import json +import logging +import os +import uuid +from datetime import datetime +from functools import cached_property +from typing import Any + +import weave + +from bugbug.tools.build_repair.agent import AgentResponse, BuildFailure, BuildRepairTool +from bugbug.tools.build_repair.config import MODEL_CUTOFF_DATES +from bugbug.tools.build_repair.scorer import ( + BasicMetricsScorer, + BuildPassRateScorer, + LLMFixMatchingScorer, +) +from bugbug.tools.build_repair.worktree import WorktreeManager + +logger = logging.getLogger(__name__) + +# TODO: replace with native tracing for Anthropic Agents SDK when released by W&B + + +def _attr(obj, key, default=None): + if isinstance(obj, dict): + return obj.get(key, default) + return getattr(obj, key, default) + + +def _to_chat_message(data: dict) -> dict | None: + """Convert a serialized claude_agent_sdk message to OpenAI chat format. + + Content blocks may be dicts (from model_dump) or dataclass instances + (from vars), so we use _attr() for uniform access. 
+ """ + msg_type = data.get("type", "") + + if msg_type == "AssistantMessage": + blocks = data.get("content", []) + text_parts = [] + tool_calls = [] + for block in blocks: + text = _attr(block, "text") + if text is not None: + text_parts.append(text) + continue + name = _attr(block, "name") + block_id = _attr(block, "id") + if name is not None and block_id is not None: + tool_calls.append( + { + "id": block_id, + "type": "function", + "function": { + "name": name, + "arguments": json.dumps(_attr(block, "input", {})), + }, + } + ) + if not text_parts and not tool_calls: + return None + msg: dict = {"role": "assistant"} + if text_parts: + msg["content"] = "\n".join(text_parts) + if tool_calls: + msg["tool_calls"] = tool_calls + return msg + + if msg_type == "UserMessage": + content = data.get("content", "") + if isinstance(content, list): + for block in content: + tool_use_id = _attr(block, "tool_use_id") + if tool_use_id: + block_content = _attr(block, "content", "") + return { + "role": "tool", + "tool_call_id": tool_use_id, + "content": str(block_content) if block_content else "", + } + + return None + + +@weave.op(kind="llm") +def trace_llm_stage( + stage: str, + messages: list[dict], + model: str, + result_data: dict | None = None, +) -> dict: + last_assistant = "" + for msg in reversed(messages): + if msg.get("role") == "assistant" and msg.get("content"): + last_assistant = msg["content"] + break + + result: dict[str, Any] = { + "model": model, + "choices": [ + { + "message": {"role": "assistant", "content": last_assistant}, + } + ], + } + if result_data: + raw_usage = result_data.get("usage", {}) or {} + input_tokens = raw_usage.get("input_tokens", 0) + output_tokens = raw_usage.get("output_tokens", 0) + result["usage"] = { + "prompt_tokens": input_tokens, + "completion_tokens": output_tokens, + "total_tokens": input_tokens + output_tokens, + "cache_read_input_tokens": raw_usage.get("cache_read_input_tokens", 0), + "cache_creation_input_tokens": 
raw_usage.get( + "cache_creation_input_tokens", 0 + ), + "total_cost_usd": result_data.get("total_cost_usd", 0), + "num_turns": result_data.get("num_turns", 0), + } + return result + + +# Per-token costs in USD (standard, non-cached rates). +# Weave uses these for its built-in cost UI; the SDK's total_cost_usd +# (which accounts for cache pricing) is tracked separately as the authoritative cost. +ANTHROPIC_TOKEN_COSTS: dict[str, tuple[float, float]] = { + "claude-opus-4-6": (15.0e-6, 75.0e-6), + "claude-sonnet-4-6": (3.0e-6, 15.0e-6), + "claude-haiku-4-5-20251001": (0.8e-6, 4.0e-6), + "claude-sonnet-4-5-20250929": (3.0e-6, 15.0e-6), + "claude-opus-4-5-20251101": (15.0e-6, 75.0e-6), + "claude-opus-4-1-20250805": (15.0e-6, 75.0e-6), + "claude-sonnet-4-20250514": (3.0e-6, 15.0e-6), + "claude-3-7-sonnet-20250219": (3.0e-6, 15.0e-6), + "claude-opus-4-20250514": (15.0e-6, 75.0e-6), +} + + +def _register_model_costs(client) -> None: + for model_id, (prompt_cost, completion_cost) in ANTHROPIC_TOKEN_COSTS.items(): + try: + client.add_cost( + llm_id=model_id, + prompt_token_cost=prompt_cost, + completion_token_cost=completion_cost, + ) + except Exception as e: + logger.debug(f"Could not register cost for {model_id}: {e}") + + +def _make_weave_callback(): + stages: dict[str, dict] = {} + + def on_message(stage: str, data: dict) -> None: + msg_type = data["type"] + if msg_type == "stage_start": + messages = [] + if "system_prompt" in data: + messages.append({"role": "system", "content": data["system_prompt"]}) + messages.append({"role": "user", "content": data["prompt"]}) + + stages[stage] = { + "model": data["model"], + "messages": messages, + } + elif msg_type == "stage_end": + if stage in stages: + s = stages.pop(stage) + trace_llm_stage( + stage=stage, + messages=s["messages"], + model=s["model"], + result_data=data.get("result_data") or None, + ) + else: + if stage in stages: + chat_msg = _to_chat_message(data) + if chat_msg: + stages[stage]["messages"].append(chat_msg) + 
+ return on_message + + +class BuildRepairError(Exception): + """Raised when the agent completes but reports an error.""" + + def __init__(self, output: dict): + self.output = output + super().__init__(output.get("error", "Unknown error")) + + +class BuildRepairModel(weave.Model): + """Weave Model wrapper that creates a worktree per example and runs BuildRepairTool.""" + + firefox_repo: str + analysis_only: bool = False + no_try_push: bool = False + + @cached_property + def tool(self) -> BuildRepairTool: + return BuildRepairTool.create(analysis_only=self.analysis_only, eval_mode=True) + + @cached_property + def worktree_mgr(self) -> WorktreeManager: + return WorktreeManager(self.firefox_repo) + + @weave.op() + async def invoke( + self, + bug_id: int, + pre_fix_bug: dict, + gh_failure_commits: list[str], + failures: list[dict], + fix_commit_date: str, + **kwargs, + ) -> dict: + wt_name = f"bug-{bug_id}-{uuid.uuid4().hex[:8]}" + logger.info( + f"Invoking bug {bug_id} " + f"(commit={gh_failure_commits[0][:12]}, {len(failures)} failures)" + ) + + worktree_created = False + try: + cutoff = max( + MODEL_CUTOFF_DATES[self.tool.analysis_model], + MODEL_CUTOFF_DATES[self.tool.fix_model], + ) + if datetime.fromisoformat(fix_commit_date).date() < cutoff: + logger.warning( + f"Skipping bug {bug_id}: fix date {fix_commit_date} " + f"is before model cutoff {cutoff}" + ) + raise ValueError("skipped_data_contamination") + + worktree_path = self.worktree_mgr.create(gh_failure_commits[0], wt_name) + worktree_created = True + + failure = BuildFailure( + bug_id=bug_id, + bug_title=pre_fix_bug["title"], + bug_comments=pre_fix_bug["comments"], + git_commit=gh_failure_commits[0], + failure_tasks=failures, + ) + result: AgentResponse = await self.tool.run( + failure, + worktree_path=worktree_path, + skip_try_push=self.no_try_push, + on_message=_make_weave_callback(), + ) + logger.info( + f"Bug {bug_id} completed: error={result.error}, " + f"diff_len={len(result.diff)}, 
cost=${result.cost_usd:.4f}, " + f"turns={result.num_turns}, " + f"local_build={result.local_build_passed}, " + f"try_build={result.try_build_passed}" + ) + + output = result.model_dump() + if result.error: + raise BuildRepairError(output) + return output + finally: + if worktree_created: + logger.info(f"Bug {bug_id}: cleaning up worktree {wt_name}") + self.worktree_mgr.cleanup(wt_name) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Build repair evaluation") + parser.add_argument("--limit", type=int, default=None) + parser.add_argument("--trials", type=int, default=1) + parser.add_argument("--parallelism", type=int, default=8) + parser.add_argument("--firefox-repo", default=os.environ.get("FIREFOX_GIT_REPO")) + parser.add_argument("--dataset", default="build_repair_one_commit_eval") + parser.add_argument("--analysis-only", action="store_true") + parser.add_argument("--no-try-push", action="store_true") + parser.add_argument("--verbose", action="store_true", help="Enable DEBUG logging") + args = parser.parse_args() + + if not args.firefox_repo: + parser.error("--firefox-repo or FIREFOX_GIT_REPO env var is required") + + log_level = logging.DEBUG if args.verbose else logging.INFO + logging.basicConfig( + level=log_level, + format="%(asctime)s %(levelname)s %(name)s: %(message)s", + ) + if not args.verbose: + logging.getLogger("httpx").setLevel(logging.WARNING) + logging.getLogger("httpcore").setLevel(logging.WARNING) + logging.getLogger("hgitaly").setLevel(logging.WARNING) + logging.getLogger("urllib3").setLevel(logging.WARNING) + + logger.info( + f"Starting evaluation: dataset={args.dataset}, limit={args.limit}, " + f"trials={args.trials}, parallelism={args.parallelism}, " + f"analysis_only={args.analysis_only}, no_try_push={args.no_try_push}, " + f"firefox_repo={args.firefox_repo}" + ) + + os.environ["WEAVE_PARALLELISM"] = str(args.parallelism) + client = weave.init("bugbug-build-repair-eval") + _register_model_costs(client) + + dataset = 
weave.ref(args.dataset).get() + logger.info(f"Loaded dataset {args.dataset} with {len(dataset.rows)} rows") + if args.limit: + dataset.rows = dataset.rows[: args.limit] + logger.info(f"Limited to {len(dataset.rows)} rows") + + scorers = [BasicMetricsScorer(num_trials=args.trials), LLMFixMatchingScorer()] + if not args.analysis_only: + scorers.insert(1, BuildPassRateScorer(num_trials=args.trials)) + logger.info(f"Scorers: {[type(s).__name__ for s in scorers]}") + + model = BuildRepairModel( + firefox_repo=args.firefox_repo, + analysis_only=args.analysis_only, + no_try_push=args.no_try_push, + ) + evaluation = weave.Evaluation( + name="build-repair", + dataset=dataset, + scorers=scorers, + trials=args.trials, + ) + results = asyncio.run(evaluation.evaluate(model)) + logger.info(f"Evaluation results: {results}") + + +if __name__ == "__main__": + main()