From adafd6a1f9399bdd9acf5f41bfee430ef8973d46 Mon Sep 17 00:00:00 2001 From: labkey-jeckels Date: Sat, 21 Mar 2026 08:24:31 -0700 Subject: [PATCH 1/5] Claude Code pull request reviewer and eval tool --- .claude/commands/review-pr.md | 59 +++++ .claude/review-pr-eval/README.md | 59 +++++ .claude/review-pr-eval/eval.py | 319 +++++++++++++++++++++++ .claude/review-pr-eval/training_set.json | 25 ++ CLAUDE.md | 2 +- 5 files changed, 463 insertions(+), 1 deletion(-) create mode 100644 .claude/commands/review-pr.md create mode 100644 .claude/review-pr-eval/README.md create mode 100755 .claude/review-pr-eval/eval.py create mode 100644 .claude/review-pr-eval/training_set.json diff --git a/.claude/commands/review-pr.md b/.claude/commands/review-pr.md new file mode 100644 index 0000000000..c9d6b070df --- /dev/null +++ b/.claude/commands/review-pr.md @@ -0,0 +1,59 @@ +Use the `gh` CLI to fetch the PR details and diff, then perform a systematic code review. + +Steps: +1. Run `gh pr view $ARGUMENTS` to get the PR title, description, and author. +2. Run `gh pr diff $ARGUMENTS` to get the full diff. +3. For each file changed, if you need more context than the diff provides, read the relevant file(s). + +Then perform a thorough review in this exact order: + +--- + +## Phase 1: Understand the Intent + +Summarize in 2-3 sentences what this PR is supposed to do, based on the title, description, and diff. This is your baseline for correctness checks. + +## Phase 2: Logic Analysis (Most Critical) + +For **each changed function or method**, work through it mechanically: + +- **Trace the execution**: Walk through what the code does step by step in plain English. Do not just restate the code — describe what values flow through and what decisions are made. +- **Check conditions**: For every `if`, `while`, `for`, ternary, or boolean expression: is the condition correct? Could it be inverted? Are the operands in the right order? +- **Check edge cases**: What happens with null/empty/zero/negative/maximum inputs? Are bounds correct (off-by-one)? +- **Check missing cases**: Are there code paths the change forgot to handle? +- **Check state mutations**: If the code modifies shared state, is the order of operations correct? Could this cause incorrect behavior if called multiple times or concurrently? + +Do not skip this phase for "simple-looking" changes. Many bugs hide in code that appears straightforward. + +## Phase 3: Correctness Against Intent + +Compare what the code *actually does* (from Phase 2) against what it *should do* (from Phase 1). Call out any gaps. + +## Phase 4: Security + +- Input validation and sanitization +- Authentication and authorization checks +- SQL injection, XSS, path traversal +- Sensitive data in logs or responses +- Insecure defaults + +## Phase 5: Interactions and Side Effects + +- Could this change break existing callers that depend on the old behavior? +- Are there other places in the codebase that should have been updated alongside this change? +- Are tests updated to cover the new behavior? + +--- + +## Output Format + +For each issue found, report: + +**Finding #*IncrementingNumber* - [Severity: Critical/High/Medium/Low]** — *Category* — `file:line` +> **Issue**: What is wrong. +> **Why it matters**: The impact if unfixed. +> **Suggestion**: How to fix it. + +Lead with Critical and High severity issues. After all issues, give a one-paragraph overall assessment. + +ultrathink diff --git a/.claude/review-pr-eval/README.md b/.claude/review-pr-eval/README.md new file mode 100644 index 0000000000..da8cd57f55 --- /dev/null +++ b/.claude/review-pr-eval/README.md @@ -0,0 +1,59 @@ +# review-pr eval + +Evaluates variants of the `review-pr` prompt against a training set of GitHub PRs that contain known bugs, measuring how often the prompt catches them. + +## Prerequisites + +- Python 3.10+ +- `claude` CLI authenticated (`claude --version` should work) +- `gh` CLI authenticated (`gh auth status` should confirm) + +## Running + +```bash +# Evaluate the live prompt (../commands/review-pr.md) +python eval.py + +# Evaluate a specific variant +python eval.py prompts/my-variant.md + +# Compare the live prompt against a variant side by side +python eval.py --compare current my-variant +``` + +**Warning:** Each run invokes Claude on every PR in the training set. With the current 3-PR training set, expect **10+ minutes** per evaluation. A `--compare` with two names runs both sequentially, so plan for 20+ minutes. + +## Training set + +`training_set.json` lists GitHub PR URLs and the specific bugs that are expected to be caught. The judge (Claude Haiku) scores each review as `CAUGHT`, `PARTIAL`, or `MISSED` for each expected issue. + +To add a PR to the training set, append an entry: + +```json +{ + "url": "https://github.com/org/repo/pull/123", + "expected_issues": [ + "Description of the specific bug that should be caught" + ] +} +``` + +## Prompt variants + +The live prompt is always `../commands/review-pr.md`. Named variants live in `prompts/`. To create a variant: + +```bash +cp ../commands/review-pr.md prompts/my-variant.md +# edit prompts/my-variant.md +python eval.py --compare current my-variant +``` + +## Repo cache + +When evaluating, the script checks out each PR's merge commit so Claude has access to the full repository context. Clones are stored at `~/pr-eval-repos/` and reused across runs. Fetches are only performed if the required commit is not already present locally. These clones use `--filter=blob:none` (blobless) so they are relatively lightweight. + +## Results + +Results are saved as JSON files in the repo root `build/` directory, named `_.json`. Each file contains the full review text, per-issue verdicts, and a summary score. + +The catch rate counts `CAUGHT` as 1 and `PARTIAL` as 0.5. diff --git a/.claude/review-pr-eval/eval.py b/.claude/review-pr-eval/eval.py new file mode 100755 index 0000000000..b09a521308 --- /dev/null +++ b/.claude/review-pr-eval/eval.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 +""" +Evaluate review-pr prompt variants against a training set of PRs with known critical bugs. + +Usage: + python eval.py # evaluate ../commands/review-pr.md + python eval.py prompts/variant1.md # evaluate a specific variant + python eval.py --compare current variant1 # compare two variants side by side + +Requires: + claude CLI (Claude Code) authenticated + gh CLI authenticated +""" + +import json +import subprocess +import sys +from contextlib import contextmanager +from pathlib import Path +from datetime import datetime + +SCRIPT_DIR = Path(__file__).parent +TRAINING_SET_FILE = SCRIPT_DIR / "training_set.json" +PROMPTS_DIR = SCRIPT_DIR / "prompts" +RESULTS_DIR = SCRIPT_DIR.parent.parent / "build" / "review-pr-output" +REPOS_DIR = Path.home() / "pr-eval-repos" +LIVE_PROMPT = SCRIPT_DIR.parent / "commands" / "review-pr.md" + +JUDGE_MODEL = "claude-haiku-4-5" + + +JUDGE_PROMPT = """You are evaluating whether a code review successfully identified a known critical issue. + +Known critical issue to find: +{expected_issue} + +Code review output to evaluate: +{review_output} + +Did the code review identify this issue or a substantially equivalent problem? + +Respond with exactly one of these verdicts, followed by a colon and brief explanation: +CAUGHT: — the review clearly identified this issue or its root cause +PARTIAL: — the review hinted at related concerns but didn't pinpoint the specific issue +MISSED: — the review did not identify this issue""" + + +def pr_label(url: str) -> str: + """Return 'owner/repo#number' from a full GitHub PR URL.""" + parts = url.rstrip("/").split("/") + # https://github.com/owner/repo/pull/number + return f"{parts[-4]}/{parts[-3]}#{parts[-1]}" + + +def get_pr_data(url: str) -> tuple[dict, str]: + view_json = subprocess.check_output( + ["gh", "pr", "view", url, "--json", "title,body,author,number,url,mergeCommit"], + text=True, + ) + pr_view = json.loads(view_json) + pr_diff = subprocess.check_output( + ["gh", "pr", "diff", url], + text=True, + ) + return pr_view, pr_diff + + +def repo_key(url: str) -> str: + """Return 'owner/repo' from a GitHub PR URL.""" + parts = url.rstrip("/").split("/") + return f"{parts[-4]}/{parts[-3]}" + + +@contextmanager +def get_merge_commit(pr_view: dict, url: str): + """Context manager that checks out the PR's merge commit in ~/pr-eval-repos/.""" + parts = url.rstrip("/").split("/") + org, repo_name = parts[-4], parts[-3] + repo_path = REPOS_DIR / repo_name + merge_commit = (pr_view.get("mergeCommit") or {}).get("oid") + + if not merge_commit: + print(f" (PR has no merge commit, skipping checkout)") + yield None + return + + if not repo_path.exists(): + REPOS_DIR.mkdir(parents=True, exist_ok=True) + print(f" cloning {org}/{repo_name}... ", end="", flush=True) + subprocess.check_call( + ["gh", "repo", "clone", f"{org}/{repo_name}", str(repo_path), "--", "--filter=blob:none"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + print("done") + + # Ensure commit is present (may need a fetch if repo is stale) + commit_known = subprocess.call( + ["git", "-C", str(repo_path), "cat-file", "-e", merge_commit], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + ) == 0 + if not commit_known: + print(f" fetching {org}/{repo_name}... ", end="", flush=True) + subprocess.check_call( + ["git", "-C", str(repo_path), "fetch", "--filter=blob:none", "origin"], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + ) + print("done") + + print(f" checking out {merge_commit[:12]}... ", end="", flush=True) + subprocess.check_call( + ["git", "-C", str(repo_path), "checkout", "--detach", merge_commit], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + ) + print("done") + try: + yield str(repo_path) + finally: + pass # Leave detached HEAD; next run will checkout the right commit + + +def run_claude(prompt: str, extra_args: list[str] = None, stream: bool = False, cwd: str = None) -> str: + cmd = ["claude", "-p", "--dangerously-skip-permissions"] # Run in headless mode. Trust Claude won't try anything dangerous + if extra_args: + cmd.extend(extra_args) + + if stream: + process = subprocess.Popen( + cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + cwd=cwd, + ) + process.stdin.write(prompt) + process.stdin.close() + lines = [] + for line in process.stdout: + print(line, end="", flush=True) + lines.append(line) + process.wait(timeout=1200) + if process.returncode != 0: + raise RuntimeError(process.stderr.read().strip() or f"claude -p exited with code {process.returncode}") + return "".join(lines).strip() + else: + result = subprocess.run( + cmd, + input=prompt, + capture_output=True, + text=True, + timeout=1200, + cwd=cwd, + ) + if result.returncode != 0: + raise RuntimeError(result.stderr.strip() or f"claude -p exited with code {result.returncode}") + return result.stdout.strip() + + +def run_review(prompt_template: str, pr_view: dict, pr_diff: str, cwd: str = None) -> str: + pr_summary = ( + f"PR #{pr_view['number']}: {pr_view['title']}\n" + f"URL: {pr_view.get('url', '')}\n" + f"Author: {pr_view['author']['login']}\n\n" + f"Description:\n{pr_view.get('body', '(no description)')}" + ) + # Inject the PR data in place of the gh CLI calls the prompt would normally make + full_prompt = f"""{prompt_template} + +--- +Note: The PR data has already been fetched. Use the following instead of running gh commands: + +## PR Details +{pr_summary} + +## Diff +{pr_diff}""" + + print() + return run_claude(full_prompt, stream=True, cwd=cwd) + + +def judge_review(review_output: str, expected_issue: str) -> tuple[str, str]: + prompt = JUDGE_PROMPT.format( + expected_issue=expected_issue, + review_output=review_output, + ) + text = run_claude(prompt, extra_args=["--model", JUDGE_MODEL]) + verdict = text.split(":")[0].strip().upper() + if verdict not in ("CAUGHT", "PARTIAL", "MISSED"): + verdict = "UNKNOWN" + return verdict, text + + +def evaluate_prompt(prompt_file: Path, training_set: list) -> dict: + prompt_template = prompt_file.read_text() + results = [] + + for entry in training_set: + url = entry["url"] + short = pr_label(url) + print(f" [{short}] fetching... ", end="", flush=True) + + try: + pr_view, pr_diff = get_pr_data(url) + print(f"{pr_view['title']}") + print(f" diff: {len(pr_diff):,} chars") + with get_merge_commit(pr_view, url) as cwd: + review = run_review(prompt_template, pr_view, pr_diff, cwd=cwd) + print(f"\n--- judging ---") + + findings = [] + for issue in entry["expected_issues"]: + print(f" {issue[:80]}... ", end="", flush=True) + verdict, judge_explanation = judge_review(review, issue) + print(verdict) + findings.append({ + "expected_issue": issue, + "verdict": verdict, + "judge_explanation": judge_explanation, + }) + + results.append({ + "url": url, + "title": pr_view["title"], + "findings": findings, + "review": review, + }) + except Exception as e: + print(f"ERROR: {e}") + results.append({ + "url": url, + "findings": [{"expected_issue": issue, "verdict": "ERROR", "error": str(e)} + for issue in entry["expected_issues"]], + }) + + all_findings = [f for r in results for f in r["findings"]] + total = len(all_findings) + caught = sum(1 for f in all_findings if f["verdict"] == "CAUGHT") + partial = sum(1 for f in all_findings if f["verdict"] == "PARTIAL") + missed = sum(1 for f in all_findings if f["verdict"] == "MISSED") + + return { + "prompt_file": str(prompt_file), + "timestamp": datetime.now().isoformat(), + "score": { + "caught": caught, + "partial": partial, + "missed": missed, + "total": total, + # Partial counts as 0.5 — it found something but wasn't precise + "catch_rate": round((caught + 0.5 * partial) / total, 2) if total > 0 else 0.0, + }, + "results": results, + } + + +def print_summary(evaluation: dict): + s = evaluation["score"] + name = Path(evaluation["prompt_file"]).stem + print(f"\n{name}: {s['caught']} caught, {s['partial']} partial, {s['missed']} missed / {s['total']} total (catch rate: {s['catch_rate']:.0%})") + for r in evaluation["results"]: + short = pr_label(r["url"]) + title = r.get("title", "") + for f in r["findings"]: + if f["verdict"] in ("MISSED", "PARTIAL", "ERROR"): + print(f" [{f['verdict']}] {short} {title[:30]}: {f.get('judge_explanation', f.get('error', ''))}") + + +def main(): + RESULTS_DIR.mkdir(exist_ok=True) + + if not TRAINING_SET_FILE.exists(): + print(f"Error: training set not found at {TRAINING_SET_FILE}") + sys.exit(1) + + training_set = json.loads(TRAINING_SET_FILE.read_text()) + args = sys.argv[1:] + + print(f"Warning: this evaluation runs Claude on {len(training_set)} PRs and will take 10+ minutes.") + + if "--compare" in args: + idx = args.index("--compare") + names = args[idx + 1:] + if not names: + print("Usage: eval.py --compare ...") + sys.exit(1) + + all_results = [] + for name in names: + prompt_file = LIVE_PROMPT if name == "current" else PROMPTS_DIR / f"{name}.md" + print(f"\nEvaluating {name}...") + result = evaluate_prompt(prompt_file, training_set) + all_results.append(result) + out_file = RESULTS_DIR / f"{name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" + out_file.write_text(json.dumps(result, indent=2)) + print_summary(result) + + print("\n--- Comparison ---") + for r in all_results: + s = r["score"] + prompt_name = Path(r["prompt_file"]).stem + print(f" {prompt_name:25s} {s['catch_rate']:.0%} ({s['caught']}C {s['partial']}P {s['missed']}M)") + + else: + prompt_file = Path(args[0]) if args else LIVE_PROMPT + if not prompt_file.exists(): + print(f"Error: prompt file not found at {prompt_file}") + sys.exit(1) + print(f"Evaluating {prompt_file.name} against {len(training_set)} PRs...") + result = evaluate_prompt(prompt_file, training_set) + print_summary(result) + out_file = RESULTS_DIR / f"{prompt_file.stem}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" + out_file.write_text(json.dumps(result, indent=2)) + print(f"\nResults saved to {out_file}") + + +if __name__ == "__main__": + main() diff --git a/.claude/review-pr-eval/training_set.json b/.claude/review-pr-eval/training_set.json new file mode 100644 index 0000000000..6ead00209e --- /dev/null +++ b/.claude/review-pr-eval/training_set.json @@ -0,0 +1,25 @@ +[ + { + "url": "https://github.com/LabKey/limsModules/pull/1792", + "expected_issues": [ + "The SQLServer version of sampleManagement-25.001-25.002.sql has the wrong WHERE clause and should use role='org.labkey.samplemanagement.security.roles.WorkflowEditorRole'", + "WorkflowController.SetDefaultEmailPrefAction unconditionally sets the success status for the response to false", + "WorkflowManager.addWorkflowJobAuditEvent() fetches originalJobMetadata but fails to pass it to encodeForDataMap() " + ] + }, { + "url": "https://github.com/LabKey/labkey-ui-components/pull/1144", + "expected_issues": [ + "The change to BulkUpdateForm.tsx displayFieldUpdates.merge(data) unconditionally includes pre-populated display values for StoredAmount/Units in every non-empty form submission" + ] + }, { + "url": "https://github.com/LabKey/platform/pull/5703", + "expected_issues": [ + "TsvDataSerializer.exportData() fails to write the first row of data for all files except the first due to a misplaced writeRow() call" + ] + }, { + "url": "https://github.com/LabKey/platform/pull/3949", + "expected_issues": [ + "SampleTypeUpdateServiceDI.getMaterialMapsWithInput() uses `filter` instead of `cfilter` in the call to TableSelector" + ] + } +] \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md index eabbf4b886..9fb4f2a666 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -89,7 +89,7 @@ All external library versions are centralized in `gradle.properties` (200+ versi - **Java Streams**: Prefer `Stream` API over traditional for-loops for collection processing. - **Resources**: Use try-with-resources for automatic resource management. - **Nullability**: Use `org.jetbrains.annotations.NotNull` and `org.jetbrains.annotations.Nullable` annotations. Be explicit in public API signatures. -- **Logging**: Use Log4J2. Name the static logger `LOG`, initialized via `LogHelper.getLogger()`: +- **Logging**: Use Log4J2. Never use System.out or System.err. Name the static logger `LOG`, initialized via `LogHelper.getLogger()`: ```java private static final Logger LOG = LogHelper.getLogger(MyClass.class, "optional description"); ``` From 105e0d3dd9fbf9a534d43d9ef0c2069a35f2087d Mon Sep 17 00:00:00 2001 From: labkey-jeckels Date: Sat, 21 Mar 2026 08:56:42 -0700 Subject: [PATCH 2/5] Self-review improvements --- .claude/review-pr-eval/README.md | 7 ++-- .claude/review-pr-eval/eval.py | 34 +++++++++++-------- .../review-pr-eval/prompts/no-logic-trace.md | 33 ++++++++++++++++++ .claude/review-pr-eval/training_set.json | 2 +- 4 files changed, 59 insertions(+), 17 deletions(-) create mode 100644 .claude/review-pr-eval/prompts/no-logic-trace.md diff --git a/.claude/review-pr-eval/README.md b/.claude/review-pr-eval/README.md index da8cd57f55..ff5cffd5cb 100644 --- a/.claude/review-pr-eval/README.md +++ b/.claude/review-pr-eval/README.md @@ -2,6 +2,11 @@ Evaluates variants of the `review-pr` prompt against a training set of GitHub PRs that contain known bugs, measuring how often the prompt catches them. +Each run invokes Claude on every PR in the training set. With the current training set, expect **10+ minutes** per evaluation. A `--compare` with two names runs both sequentially, so plan for double that. + +**Security warning:** The eval script runs Claude with `--dangerously-skip-permissions` so it can read files from the checked-out repo. PR diffs are injected verbatim into Claude's prompt, so a PR containing adversarial instructions in its diff (e.g. in code comments or string literals) could act as a prompt injection attack and cause Claude to execute arbitrary commands without confirmation. Only add PRs from trusted sources — ideally already-merged, internal PRs where the diff content is known. + + ## Prerequisites - Python 3.10+ @@ -21,8 +26,6 @@ python eval.py prompts/my-variant.md python eval.py --compare current my-variant ``` -**Warning:** Each run invokes Claude on every PR in the training set. With the current 3-PR training set, expect **10+ minutes** per evaluation. A `--compare` with two names runs both sequentially, so plan for 20+ minutes. - ## Training set `training_set.json` lists GitHub PR URLs and the specific bugs that are expected to be caught. The judge (Claude Haiku) scores each review as `CAUGHT`, `PARTIAL`, or `MISSED` for each expected issue. diff --git a/.claude/review-pr-eval/eval.py b/.claude/review-pr-eval/eval.py index b09a521308..56a29fa7f3 100755 --- a/.claude/review-pr-eval/eval.py +++ b/.claude/review-pr-eval/eval.py @@ -23,7 +23,7 @@ TRAINING_SET_FILE = SCRIPT_DIR / "training_set.json" PROMPTS_DIR = SCRIPT_DIR / "prompts" RESULTS_DIR = SCRIPT_DIR.parent.parent / "build" / "review-pr-output" -REPOS_DIR = Path.home() / "pr-eval-repos" +REPOS_DIR = SCRIPT_DIR.parent.parent / "build" / "pr-eval-repos" LIVE_PROMPT = SCRIPT_DIR.parent / "commands" / "review-pr.md" JUDGE_MODEL = "claude-haiku-4-5" @@ -65,18 +65,13 @@ def get_pr_data(url: str) -> tuple[dict, str]: return pr_view, pr_diff -def repo_key(url: str) -> str: - """Return 'owner/repo' from a GitHub PR URL.""" - parts = url.rstrip("/").split("/") - return f"{parts[-4]}/{parts[-3]}" - @contextmanager def get_merge_commit(pr_view: dict, url: str): """Context manager that checks out the PR's merge commit in ~/pr-eval-repos/.""" parts = url.rstrip("/").split("/") org, repo_name = parts[-4], parts[-3] - repo_path = REPOS_DIR / repo_name + repo_path = REPOS_DIR / org / repo_name merge_commit = (pr_view.get("mergeCommit") or {}).get("oid") if not merge_commit: @@ -119,8 +114,10 @@ def get_merge_commit(pr_view: dict, url: str): pass # Leave detached HEAD; next run will checkout the right commit -def run_claude(prompt: str, extra_args: list[str] = None, stream: bool = False, cwd: str = None) -> str: - cmd = ["claude", "-p", "--dangerously-skip-permissions"] # Run in headless mode. Trust Claude won't try anything dangerous +def run_claude(prompt: str, extra_args: list[str] = None, stream: bool = False, cwd: str = None, skip_permissions: bool = False) -> str: + cmd = ["claude", "-p"] + if skip_permissions: + cmd.append("--dangerously-skip-permissions") if extra_args: cmd.extend(extra_args) @@ -139,7 +136,11 @@ def run_claude(prompt: str, extra_args: list[str] = None, stream: bool = False, for line in process.stdout: print(line, end="", flush=True) lines.append(line) - process.wait(timeout=1200) + try: + process.wait(timeout=1200) + except subprocess.TimeoutExpired: + process.kill() + raise RuntimeError("claude -p timed out after 20 minutes") if process.returncode != 0: raise RuntimeError(process.stderr.read().strip() or f"claude -p exited with code {process.returncode}") return "".join(lines).strip() @@ -177,7 +178,7 @@ def run_review(prompt_template: str, pr_view: dict, pr_diff: str, cwd: str = Non {pr_diff}""" print() - return run_claude(full_prompt, stream=True, cwd=cwd) + return run_claude(full_prompt, stream=True, cwd=cwd, skip_permissions=True) def judge_review(review_output: str, expected_issue: str) -> tuple[str, str]: @@ -268,7 +269,7 @@ def print_summary(evaluation: dict): def main(): - RESULTS_DIR.mkdir(exist_ok=True) + RESULTS_DIR.mkdir(parents=True, exist_ok=True) if not TRAINING_SET_FILE.exists(): print(f"Error: training set not found at {TRAINING_SET_FILE}") @@ -286,9 +287,14 @@ def main(): print("Usage: eval.py --compare ...") sys.exit(1) + prompt_files = {name: LIVE_PROMPT if name == "current" else PROMPTS_DIR / f"{name}.md" for name in names} + for name, prompt_file in prompt_files.items(): + if not prompt_file.exists(): + print(f"Error: prompt file not found at {prompt_file}") + sys.exit(1) + all_results = [] - for name in names: - prompt_file = LIVE_PROMPT if name == "current" else PROMPTS_DIR / f"{name}.md" + for name, prompt_file in prompt_files.items(): print(f"\nEvaluating {name}...") result = evaluate_prompt(prompt_file, training_set) all_results.append(result) diff --git a/.claude/review-pr-eval/prompts/no-logic-trace.md b/.claude/review-pr-eval/prompts/no-logic-trace.md new file mode 100644 index 0000000000..5e5d643a20 --- /dev/null +++ b/.claude/review-pr-eval/prompts/no-logic-trace.md @@ -0,0 +1,33 @@ +[//]: # (Intentionally degraded version of the prompt. Useful for testing the comparison feature of eval.py.) + +Use the `gh` CLI to fetch the PR details and diff, then perform a code review. + +Steps: +1. Run `gh pr view $ARGUMENTS` to get the PR title, description, and author. +2. Run `gh pr diff $ARGUMENTS` to get the full diff. + +Then review the PR: + +## Step 1: Understand the Intent + +Summarize in 2-3 sentences what this PR is supposed to do. + +## Step 2: General Impressions + +Look over the changed code and note anything that seems off or could be improved. Focus on obvious issues like missing null checks, unclear variable names, or missing tests. + +## Step 3: Security + +- Input validation and sanitization +- Authentication and authorization checks +- SQL injection, XSS, path traversal + +## Output Format + +For each issue found, report: + +**Finding #*IncrementingNumber* - [Severity: Critical/High/Medium/Low]** — *Category* — `file:line` +> **Issue**: What is wrong. +> **Suggestion**: How to fix it. + +Give a one-paragraph overall assessment. diff --git a/.claude/review-pr-eval/training_set.json b/.claude/review-pr-eval/training_set.json index 6ead00209e..ccac6fa156 100644 --- a/.claude/review-pr-eval/training_set.json +++ b/.claude/review-pr-eval/training_set.json @@ -22,4 +22,4 @@ "SampleTypeUpdateServiceDI.getMaterialMapsWithInput() uses `filter` instead of `cfilter` in the call to TableSelector" ] } -] \ No newline at end of file +] From d8a049c77a46a0b26a121cc6b9b14c30e4d775e6 Mon Sep 17 00:00:00 2001 From: labkey-jeckels Date: Sat, 21 Mar 2026 09:04:32 -0700 Subject: [PATCH 3/5] Self-review improvements --- .claude/review-pr-eval/README.md | 2 +- .claude/review-pr-eval/eval.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.claude/review-pr-eval/README.md b/.claude/review-pr-eval/README.md index ff5cffd5cb..5279682270 100644 --- a/.claude/review-pr-eval/README.md +++ b/.claude/review-pr-eval/README.md @@ -53,7 +53,7 @@ python eval.py --compare current my-variant ## Repo cache -When evaluating, the script checks out each PR's merge commit so Claude has access to the full repository context. Clones are stored at `~/pr-eval-repos/` and reused across runs. Fetches are only performed if the required commit is not already present locally. These clones use `--filter=blob:none` (blobless) so they are relatively lightweight. +When evaluating, the script checks out each PR's merge commit so Claude has access to the full repository context. Clones are stored at `build/pr-eval-repos//` (relative to the server repo root) and reused across runs. Fetches are only performed if the required commit is not already present locally. These clones use `--filter=blob:none` (blobless) so they are relatively lightweight. Note that running `./gradlew clean` will delete the cached clones. ## Results diff --git a/.claude/review-pr-eval/eval.py b/.claude/review-pr-eval/eval.py index 56a29fa7f3..3c73f0a0f3 100755 --- a/.claude/review-pr-eval/eval.py +++ b/.claude/review-pr-eval/eval.py @@ -15,6 +15,7 @@ import json import subprocess import sys +import threading from contextlib import contextmanager from pathlib import Path from datetime import datetime @@ -68,7 +69,7 @@ def get_pr_data(url: str) -> tuple[dict, str]: @contextmanager def get_merge_commit(pr_view: dict, url: str): - """Context manager that checks out the PR's merge commit in ~/pr-eval-repos/.""" + """Context manager that checks out the PR's merge commit in /build/pr-eval-repos//.""" parts = url.rstrip("/").split("/") org, repo_name = parts[-4], parts[-3] repo_path = REPOS_DIR / org / repo_name @@ -132,17 +133,21 @@ def run_claude(prompt: str, extra_args: list[str] = None, stream: bool = False, ) process.stdin.write(prompt) process.stdin.close() + stderr_lines = [] + stderr_thread = threading.Thread(target=lambda: stderr_lines.extend(process.stderr), daemon=True) + stderr_thread.start() lines = [] for line in process.stdout: print(line, end="", flush=True) lines.append(line) + stderr_thread.join() try: process.wait(timeout=1200) except subprocess.TimeoutExpired: process.kill() raise RuntimeError("claude -p timed out after 20 minutes") if process.returncode != 0: - raise RuntimeError(process.stderr.read().strip() or f"claude -p exited with code {process.returncode}") + raise RuntimeError("".join(stderr_lines).strip() or f"claude -p exited with code {process.returncode}") return "".join(lines).strip() else: result = subprocess.run( From ada8a0b3edcc60d91649876f94db12e9a2bb2311 Mon Sep 17 00:00:00 2001 From: labkey-jeckels Date: Sat, 21 Mar 2026 15:11:40 -0700 Subject: [PATCH 4/5] Caching, model comparison, and more --- .claude/review-pr-eval/README.md | 12 ++ .claude/review-pr-eval/eval.py | 262 ++++++++++++++++------- .claude/review-pr-eval/training_set.json | 17 -- 3 files changed, 196 insertions(+), 95 deletions(-) diff --git a/.claude/review-pr-eval/README.md b/.claude/review-pr-eval/README.md index 5279682270..a97c8031fd 100644 --- a/.claude/review-pr-eval/README.md +++ b/.claude/review-pr-eval/README.md @@ -22,10 +22,21 @@ python eval.py # Evaluate a specific variant python eval.py prompts/my-variant.md +# Evaluate using a specific model +python eval.py --model claude-opus-4-6 + # Compare the live prompt against a variant side by side python eval.py --compare current my-variant + +# Compare the same prompt across two models +python eval.py --compare current@claude-opus-4-6 current@claude-sonnet-4-6 + +# Compare a variant on a specific model against the live prompt +python eval.py --compare current my-variant@claude-opus-4-6 ``` +The `name@model` syntax in `--compare` specifies which Claude model to use for the review step. Cache keys include the model, so results for different models are stored separately. + ## Training set `training_set.json` lists GitHub PR URLs and the specific bugs that are expected to be caught. The judge (Claude Haiku) scores each review as `CAUGHT`, `PARTIAL`, or `MISSED` for each expected issue. @@ -49,6 +60,7 @@ The live prompt is always `../commands/review-pr.md`. Named variants live in `pr cp ../commands/review-pr.md prompts/my-variant.md # edit prompts/my-variant.md python eval.py --compare current my-variant +python eval.py --compare current my-variant@claude-opus-4-6 ``` ## Repo cache diff --git a/.claude/review-pr-eval/eval.py b/.claude/review-pr-eval/eval.py index 3c73f0a0f3..7572b21a24 100755 --- a/.claude/review-pr-eval/eval.py +++ b/.claude/review-pr-eval/eval.py @@ -12,10 +12,13 @@ gh CLI authenticated """ +import hashlib import json import subprocess import sys import threading +import time +from collections import Counter from contextlib import contextmanager from pathlib import Path from datetime import datetime @@ -24,6 +27,7 @@ TRAINING_SET_FILE = SCRIPT_DIR / "training_set.json" PROMPTS_DIR = SCRIPT_DIR / "prompts" RESULTS_DIR = SCRIPT_DIR.parent.parent / "build" / "review-pr-output" +CACHE_DIR = RESULTS_DIR / "cache" REPOS_DIR = SCRIPT_DIR.parent.parent / "build" / "pr-eval-repos" LIVE_PROMPT = SCRIPT_DIR.parent / "commands" / "review-pr.md" @@ -115,55 +119,50 @@ def get_merge_commit(pr_view: dict, url: str): pass # Leave detached HEAD; next run will checkout the right commit -def run_claude(prompt: str, extra_args: list[str] = None, stream: bool = False, cwd: str = None, skip_permissions: bool = False) -> str: - cmd = ["claude", "-p"] +def _format_usage(usage: dict) -> str: + if not usage: + return "" + parts = [] + if "input_tokens" in usage: + parts.append(f"in: {usage['input_tokens']:,}") + if "cache_read_input_tokens" in usage and usage["cache_read_input_tokens"]: + parts.append(f"cache_read: {usage['cache_read_input_tokens']:,}") + if "output_tokens" in usage: + parts.append(f"out: {usage['output_tokens']:,}") + return f" [{', '.join(parts)}]" if parts else "" + + +def run_claude(prompt: str, extra_args: list[str] = None, cwd: str = None, skip_permissions: bool = False) -> tuple[str, dict]: + cmd = ["claude", "-p", "--output-format", "json"] if skip_permissions: cmd.append("--dangerously-skip-permissions") if extra_args: cmd.extend(extra_args) - if stream: - process = subprocess.Popen( - cmd, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - cwd=cwd, - ) - process.stdin.write(prompt) - process.stdin.close() - stderr_lines = [] - stderr_thread = threading.Thread(target=lambda: stderr_lines.extend(process.stderr), daemon=True) - stderr_thread.start() - lines = [] - for line in process.stdout: - print(line, end="", flush=True) - lines.append(line) - stderr_thread.join() - try: - process.wait(timeout=1200) - except subprocess.TimeoutExpired: - process.kill() - raise RuntimeError("claude -p timed out after 20 minutes") - if process.returncode != 0: - raise RuntimeError("".join(stderr_lines).strip() or f"claude -p exited with code {process.returncode}") - return "".join(lines).strip() - else: - result = subprocess.run( - cmd, - input=prompt, - capture_output=True, - text=True, - timeout=1200, - cwd=cwd, - ) - if result.returncode != 0: - raise RuntimeError(result.stderr.strip() or f"claude -p exited with code {result.returncode}") - return result.stdout.strip() + process = subprocess.Popen( + cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + cwd=cwd, + ) + timer = threading.Timer(1800, lambda: process.kill()) + try: + timer.start() + stdout, stderr = process.communicate(input=prompt) + finally: + timer.cancel() + if process.returncode != 0: + raise RuntimeError(stderr.strip() or f"claude -p exited with code {process.returncode}") + try: + data = json.loads(stdout) + return data.get("result", "").strip(), data.get("usage", {}) + except (json.JSONDecodeError, KeyError): + return stdout.strip(), {} -def run_review(prompt_template: str, pr_view: dict, pr_diff: str, cwd: str = None) -> str: +def run_review(prompt_template: str, pr_view: dict, pr_diff: str, cwd: str = None, model: str = None) -> str: pr_summary = ( f"PR #{pr_view['number']}: {pr_view['title']}\n" f"URL: {pr_view.get('url', '')}\n" @@ -182,8 +181,14 @@ def run_review(prompt_template: str, pr_view: dict, pr_diff: str, cwd: str = Non ## Diff {pr_diff}""" - print() - return run_claude(full_prompt, stream=True, cwd=cwd, skip_permissions=True) + model_label = f" [{model}]" if model else "" + print(f" [{datetime.now().strftime('%H:%M:%S')}] reviewing{model_label}... ", end="", flush=True) + t0 = time.time() + extra_args = ["--model", model] if model else None + result, usage = run_claude(full_prompt, extra_args=extra_args, cwd=cwd, skip_permissions=True) + elapsed = int(time.time() - t0) + print(f"done in {elapsed}s ({len(result.splitlines())} lines){_format_usage(usage)}") + return result def judge_review(review_output: str, expected_issue: str) -> tuple[str, str]: @@ -191,46 +196,104 @@ def judge_review(review_output: str, expected_issue: str) -> tuple[str, str]: expected_issue=expected_issue, review_output=review_output, ) - text = run_claude(prompt, extra_args=["--model", JUDGE_MODEL]) + print(f" [{datetime.now().strftime('%H:%M:%S')}] {expected_issue[:70]}... ", end="", flush=True) + t0 = time.time() + text, usage = run_claude(prompt, extra_args=["--model", JUDGE_MODEL]) + elapsed = int(time.time() - t0) verdict = text.split(":")[0].strip().upper() if verdict not in ("CAUGHT", "PARTIAL", "MISSED"): verdict = "UNKNOWN" + print(f"{verdict} ({elapsed}s){_format_usage(usage)}") return verdict, text -def evaluate_prompt(prompt_file: Path, training_set: list) -> dict: +def _cache_key(prompt_template: str, url: str, model: str = "") -> str: + return hashlib.sha256((prompt_template + "\n" + url + "\n" + model).encode()).hexdigest()[:32] + + +def load_cached_pr_result(prompt_template: str, url: str, model: str = "") -> dict | None: + cache_file = CACHE_DIR / f"{_cache_key(prompt_template, url, model)}.json" + return json.loads(cache_file.read_text()) if cache_file.exists() else None + + +def save_cached_pr_result(prompt_template: str, url: str, result: dict, model: str = ""): + CACHE_DIR.mkdir(parents=True, exist_ok=True) + cache_file = CACHE_DIR / f"{_cache_key(prompt_template, url, model)}.json" + cache_file.write_text(json.dumps(result, indent=2)) + + +def _aggregate_findings(all_run_findings: list[list[dict]]) -> list[dict]: + """Merge per-run findings into one finding per issue with verdict breakdown.""" + aggregated = [] + for i, base in enumerate(all_run_findings[0]): + verdicts = [run[i]["verdict"] for run in all_run_findings] + catch_rate = round( + (verdicts.count("CAUGHT") + 0.5 * verdicts.count("PARTIAL")) / len(verdicts), 2 + ) + majority = Counter(verdicts).most_common(1)[0][0] + aggregated.append({ + "expected_issue": base["expected_issue"], + "verdict": majority, + "verdicts": verdicts, + "multi_run_catch_rate": catch_rate, + "judge_explanation": all_run_findings[-1][i]["judge_explanation"], + }) + return aggregated + + +def evaluate_prompt(prompt_file: Path, training_set: list, num_runs: int = 1, model: str = None, use_cache: bool = True) -> dict: prompt_template = prompt_file.read_text() results = [] for entry in training_set: url = entry["url"] short = pr_label(url) + + if use_cache and num_runs == 1: + cached = load_cached_pr_result(prompt_template, url, model or "") + if cached: + print(f" [{short}] using cached result") + results.append(cached) + continue + print(f" [{short}] fetching... ", end="", flush=True) try: pr_view, pr_diff = get_pr_data(url) print(f"{pr_view['title']}") print(f" diff: {len(pr_diff):,} chars") - with get_merge_commit(pr_view, url) as cwd: - review = run_review(prompt_template, pr_view, pr_diff, cwd=cwd) - print(f"\n--- judging ---") - - findings = [] - for issue in entry["expected_issues"]: - print(f" {issue[:80]}... ", end="", flush=True) - verdict, judge_explanation = judge_review(review, issue) - print(verdict) - findings.append({ - "expected_issue": issue, - "verdict": verdict, - "judge_explanation": judge_explanation, - }) + all_run_findings = [] + last_review = None + with get_merge_commit(pr_view, url) as cwd: + for run_idx in range(num_runs): + if num_runs > 1: + print(f" run {run_idx + 1}/{num_runs}:") + last_review = run_review(prompt_template, pr_view, pr_diff, cwd=cwd, model=model) + + print(f" --- judging ---") + run_findings = [] + for issue in entry["expected_issues"]: + verdict, judge_explanation = judge_review(last_review, issue) + run_findings.append({ + "expected_issue": issue, + "verdict": verdict, + "judge_explanation": judge_explanation, + }) + all_run_findings.append(run_findings) + save_cached_pr_result(prompt_template, url, { + "url": url, + "title": pr_view["title"], + "findings": run_findings, + "review": last_review, + }, model or "") + + findings = _aggregate_findings(all_run_findings) if num_runs > 1 else all_run_findings[0] results.append({ "url": url, "title": pr_view["title"], "findings": findings, - "review": review, + "review": last_review, }) except Exception as e: print(f"ERROR: {e}") @@ -242,20 +305,32 @@ def evaluate_prompt(prompt_file: Path, training_set: list) -> dict: all_findings = [f for r in results for f in r["findings"]] total = len(all_findings) - caught = sum(1 for f in all_findings if f["verdict"] == "CAUGHT") - partial = sum(1 for f in all_findings if f["verdict"] == "PARTIAL") - missed = sum(1 for f in all_findings if f["verdict"] == "MISSED") + if num_runs > 1: + # Average the per-issue catch rates across runs + catch_rate = round(sum(f.get("multi_run_catch_rate", 0) for f in all_findings) / total, 2) if total > 0 else 0.0 + caught = round(sum(f.get("multi_run_catch_rate", 0) for f in all_findings)) + partial = 0 + missed = total - caught + else: + caught = sum(1 for f in all_findings if f["verdict"] == "CAUGHT") + partial = sum(1 for f in all_findings if f["verdict"] == "PARTIAL") + missed = sum(1 for f in all_findings if f["verdict"] == "MISSED") + catch_rate = round((caught + 0.5 * partial) / total, 2) if total > 0 else 0.0 + stem = prompt_file.stem + model_label = f"{stem}@{model}" if model else stem return { "prompt_file": str(prompt_file), + "model": model, + "model_label": model_label, "timestamp": datetime.now().isoformat(), + "num_runs": num_runs, "score": { "caught": caught, "partial": partial, "missed": missed, "total": total, - # Partial counts as 0.5 — it found something but wasn't precise - "catch_rate": round((caught + 0.5 * partial) / total, 2) if total > 0 else 0.0, + "catch_rate": catch_rate, }, "results": results, } @@ -263,13 +338,18 @@ def evaluate_prompt(prompt_file: Path, training_set: list) -> dict: def print_summary(evaluation: dict): s = evaluation["score"] - name = Path(evaluation["prompt_file"]).stem - print(f"\n{name}: {s['caught']} caught, {s['partial']} partial, {s['missed']} missed / {s['total']} total (catch rate: {s['catch_rate']:.0%})") + name = evaluation.get("model_label") or Path(evaluation["prompt_file"]).stem + num_runs = evaluation.get("num_runs", 1) + run_label = f" over {num_runs} runs" if num_runs > 1 else "" + print(f"\n{name}{run_label}: {s['caught']} caught, {s['partial']} partial, {s['missed']} missed / {s['total']} total (catch rate: {s['catch_rate']:.0%})") for r in evaluation["results"]: short = pr_label(r["url"]) title = r.get("title", "") for f in r["findings"]: - if f["verdict"] in ("MISSED", "PARTIAL", "ERROR"): + if "verdicts" in f: + breakdown = ", ".join(f"{v}×{f['verdicts'].count(v)}" for v in ("CAUGHT", "PARTIAL", "MISSED") if f["verdicts"].count(v)) + print(f" [{f['verdict']}] {short} {title[:30]}: {breakdown} — {f.get('judge_explanation', '')}") + elif f["verdict"] in ("MISSED", "PARTIAL", "ERROR"): print(f" [{f['verdict']}] {short} {title[:30]}: {f.get('judge_explanation', f.get('error', ''))}") @@ -283,7 +363,27 @@ def main(): training_set = json.loads(TRAINING_SET_FILE.read_text()) args = sys.argv[1:] - print(f"Warning: this evaluation runs Claude on {len(training_set)} PRs and will take 10+ minutes.") + num_runs = 1 + runs_specified = "--runs" in args + if runs_specified: + idx = args.index("--runs") + if idx + 1 >= len(args): + print("Usage: eval.py --runs ") + sys.exit(1) + num_runs = int(args[idx + 1]) + args = args[:idx] + args[idx + 2:] + + run_label = f" ({num_runs} runs each)" if num_runs > 1 else "" + print(f"Warning: this evaluation runs Claude on {len(training_set)} PRs{run_label} and will take 10+ minutes.") + + single_model = None + if "--model" in args: + idx = args.index("--model") + if idx + 1 >= len(args): + print("Usage: eval.py --model ") + sys.exit(1) + single_model = args[idx + 1] + args = args[:idx] + args[idx + 2:] if "--compare" in args: idx = args.index("--compare") @@ -292,26 +392,32 @@ def main(): print("Usage: eval.py --compare ...") sys.exit(1) - prompt_files = {name: LIVE_PROMPT if name == "current" else PROMPTS_DIR / f"{name}.md" for name in names} - for name, prompt_file in prompt_files.items(): + entries = [] + for name in names: + prompt_name, _, model = name.partition("@") + prompt_file = LIVE_PROMPT if prompt_name == "current" else PROMPTS_DIR / f"{prompt_name}.md" + entries.append((name, prompt_file, model or None)) + + for name, prompt_file, _ in entries: if not prompt_file.exists(): print(f"Error: prompt file not found at {prompt_file}") sys.exit(1) all_results = [] - for name, prompt_file in prompt_files.items(): + for name, prompt_file, model in entries: print(f"\nEvaluating {name}...") - result = evaluate_prompt(prompt_file, training_set) + result = evaluate_prompt(prompt_file, training_set, num_runs=num_runs, model=model, use_cache=not runs_specified) all_results.append(result) - out_file = RESULTS_DIR / f"{name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" + safe_name = name.replace("@", "_at_") + out_file = RESULTS_DIR / f"{safe_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" out_file.write_text(json.dumps(result, indent=2)) print_summary(result) print("\n--- Comparison ---") for r in all_results: s = r["score"] - prompt_name = Path(r["prompt_file"]).stem - print(f" {prompt_name:25s} {s['catch_rate']:.0%} ({s['caught']}C {s['partial']}P {s['missed']}M)") + label = r.get("model_label") or Path(r["prompt_file"]).stem + print(f" {label:35s} {s['catch_rate']:.0%} ({s['caught']}C {s['partial']}P {s['missed']}M)") else: prompt_file = Path(args[0]) if args else LIVE_PROMPT @@ -319,7 +425,7 @@ def main(): print(f"Error: prompt file not found at {prompt_file}") sys.exit(1) print(f"Evaluating {prompt_file.name} against {len(training_set)} PRs...") - result = evaluate_prompt(prompt_file, training_set) + result = evaluate_prompt(prompt_file, training_set, num_runs=num_runs, model=single_model, use_cache=not runs_specified) print_summary(result) out_file = RESULTS_DIR / f"{prompt_file.stem}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" out_file.write_text(json.dumps(result, indent=2)) diff --git a/.claude/review-pr-eval/training_set.json b/.claude/review-pr-eval/training_set.json index ccac6fa156..db921ee311 100644 --- a/.claude/review-pr-eval/training_set.json +++ b/.claude/review-pr-eval/training_set.json @@ -1,25 +1,8 @@ [ { - "url": "https://github.com/LabKey/limsModules/pull/1792", - "expected_issues": [ - "The SQLServer version of sampleManagement-25.001-25.002.sql has the wrong WHERE clause and should use role='org.labkey.samplemanagement.security.roles.WorkflowEditorRole'", - "WorkflowController.SetDefaultEmailPrefAction unconditionally sets the success status for the response to false", - "WorkflowManager.addWorkflowJobAuditEvent() fetches originalJobMetadata but fails to pass it to encodeForDataMap() " - ] - }, { - "url": "https://github.com/LabKey/labkey-ui-components/pull/1144", - "expected_issues": [ - "The change to BulkUpdateForm.tsx displayFieldUpdates.merge(data) unconditionally includes pre-populated display values for StoredAmount/Units in every non-empty form submission" - ] - }, { "url": "https://github.com/LabKey/platform/pull/5703", "expected_issues": [ "TsvDataSerializer.exportData() fails to write the first row of data for all files except the first due to a misplaced writeRow() call" ] - }, { - "url": "https://github.com/LabKey/platform/pull/3949", - "expected_issues": [ - "SampleTypeUpdateServiceDI.getMaterialMapsWithInput() uses `filter` instead of `cfilter` in the call to TableSelector" - ] } ] From 0838a0fda740e770423d52dd78457305be27b508 Mon Sep 17 00:00:00 2001 From: labkey-jeckels Date: Sat, 21 Mar 2026 15:34:20 -0700 Subject: [PATCH 5/5] Model args --- .claude/review-pr-eval/eval.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.claude/review-pr-eval/eval.py b/.claude/review-pr-eval/eval.py index 7572b21a24..8c911cc2ca 100755 --- a/.claude/review-pr-eval/eval.py +++ b/.claude/review-pr-eval/eval.py @@ -394,9 +394,12 @@ def main(): entries = [] for name in names: - prompt_name, _, model = name.partition("@") + prompt_name, has_at, inline_model = name.partition("@") + if has_at and single_model: + print(f"Warning: --model {single_model!r} and @model syntax both specified for {name!r}; @model takes precedence") + effective_model = (inline_model if has_at else None) or single_model prompt_file = LIVE_PROMPT if prompt_name == "current" else PROMPTS_DIR / f"{prompt_name}.md" - entries.append((name, prompt_file, model or None)) + entries.append((name, prompt_file, effective_model)) for name, prompt_file, _ in entries: if not prompt_file.exists():