From adafd6a1f9399bdd9acf5f41bfee430ef8973d46 Mon Sep 17 00:00:00 2001
From: labkey-jeckels <jeckels@labkey.com>
Date: Sat, 21 Mar 2026 08:24:31 -0700
Subject: [PATCH 1/5] Claude Code pull request reviewer and eval tool

---
 .claude/commands/review-pr.md            |  59 +++++
 .claude/review-pr-eval/README.md         |  59 +++++
 .claude/review-pr-eval/eval.py           | 319 +++++++++++++++++++++++
 .claude/review-pr-eval/training_set.json |  25 ++
 CLAUDE.md                                |   2 +-
 5 files changed, 463 insertions(+), 1 deletion(-)
 create mode 100644 .claude/commands/review-pr.md
 create mode 100644 .claude/review-pr-eval/README.md
 create mode 100755 .claude/review-pr-eval/eval.py
 create mode 100644 .claude/review-pr-eval/training_set.json

diff --git a/.claude/commands/review-pr.md b/.claude/commands/review-pr.md
new file mode 100644
index 0000000000..c9d6b070df
--- /dev/null
+++ b/.claude/commands/review-pr.md
@@ -0,0 +1,59 @@
+Use the `gh` CLI to fetch the PR details and diff, then perform a systematic code review.
+
+Steps:
+1. Run `gh pr view $ARGUMENTS` to get the PR title, description, and author.
+2. Run `gh pr diff $ARGUMENTS` to get the full diff.
+3. For each file changed, if you need more context than the diff provides, read the relevant file(s).
+
+Then perform a thorough review in this exact order:
+
+---
+
+## Phase 1: Understand the Intent
+
+Summarize in 2-3 sentences what this PR is supposed to do, based on the title, description, and diff. This is your baseline for correctness checks.
+
+## Phase 2: Logic Analysis (Most Critical)
+
+For **each changed function or method**, work through it mechanically:
+
+- **Trace the execution**: Walk through what the code does step by step in plain English. Do not just restate the code — describe what values flow through and what decisions are made.
+- **Check conditions**: For every `if`, `while`, `for`, ternary, or boolean expression: is the condition correct? Could it be inverted? Are the operands in the right order?
+- **Check edge cases**: What happens with null/empty/zero/negative/maximum inputs? Are bounds correct (off-by-one)?
+- **Check missing cases**: Are there code paths the change forgot to handle?
+- **Check state mutations**: If the code modifies shared state, is the order of operations correct? Could this cause incorrect behavior if called multiple times or concurrently?
+
+Do not skip this phase for "simple-looking" changes. Many bugs hide in code that appears straightforward.
+
+## Phase 3: Correctness Against Intent
+
+Compare what the code *actually does* (from Phase 2) against what it *should do* (from Phase 1). Call out any gaps.
+
+## Phase 4: Security
+
+- Input validation and sanitization
+- Authentication and authorization checks
+- SQL injection, XSS, path traversal
+- Sensitive data in logs or responses
+- Insecure defaults
+
+## Phase 5: Interactions and Side Effects
+
+- Could this change break existing callers that depend on the old behavior?
+- Are there other places in the codebase that should have been updated alongside this change?
+- Are tests updated to cover the new behavior?
+
+---
+
+## Output Format
+
+For each issue found, report:
+
+**Finding #*IncrementingNumber* -  [Severity: Critical/High/Medium/Low]** — *Category* — `file:line`
+> **Issue**: What is wrong.
+> **Why it matters**: The impact if unfixed.
+> **Suggestion**: How to fix it.
+
+Lead with Critical and High severity issues. After all issues, give a one-paragraph overall assessment.
+
+ultrathink
diff --git a/.claude/review-pr-eval/README.md b/.claude/review-pr-eval/README.md
new file mode 100644
index 0000000000..da8cd57f55
--- /dev/null
+++ b/.claude/review-pr-eval/README.md
@@ -0,0 +1,59 @@
+# review-pr eval
+
+Evaluates variants of the `review-pr` prompt against a training set of GitHub PRs that contain known bugs, measuring how often the prompt catches them.
+
+## Prerequisites
+
+- Python 3.10+
+- `claude` CLI authenticated (`claude --version` should work)
+- `gh` CLI authenticated (`gh auth status` should confirm)
+
+## Running
+
+```bash
+# Evaluate the live prompt (../commands/review-pr.md)
+python eval.py
+
+# Evaluate a specific variant
+python eval.py prompts/my-variant.md
+
+# Compare the live prompt against a variant side by side
+python eval.py --compare current my-variant
+```
+
+**Warning:** Each run invokes Claude on every PR in the training set. With the current 3-PR training set, expect **10+ minutes** per evaluation. A `--compare` with two names runs both sequentially, so plan for 20+ minutes.
+
+## Training set
+
+`training_set.json` lists GitHub PR URLs and the specific bugs that are expected to be caught. The judge (Claude Haiku) scores each review as `CAUGHT`, `PARTIAL`, or `MISSED` for each expected issue.
+
+To add a PR to the training set, append an entry:
+
+```json
+{
+  "url": "https://github.com/org/repo/pull/123",
+  "expected_issues": [
+    "Description of the specific bug that should be caught"
+  ]
+}
+```
+
+## Prompt variants
+
+The live prompt is always `../commands/review-pr.md`. Named variants live in `prompts/`. To create a variant:
+
+```bash
+cp ../commands/review-pr.md prompts/my-variant.md
+# edit prompts/my-variant.md
+python eval.py --compare current my-variant
+```
+
+## Repo cache
+
+When evaluating, the script checks out each PR's merge commit so Claude has access to the full repository context. Clones are stored at `~/pr-eval-repos/<repo-name>` and reused across runs. Fetches are only performed if the required commit is not already present locally. These clones use `--filter=blob:none` (blobless) so they are relatively lightweight.
+
+## Results
+
+Results are saved as JSON files in the repo root `build/` directory, named `<prompt-stem>_<timestamp>.json`. Each file contains the full review text, per-issue verdicts, and a summary score.
+
+The catch rate counts `CAUGHT` as 1 and `PARTIAL` as 0.5.
diff --git a/.claude/review-pr-eval/eval.py b/.claude/review-pr-eval/eval.py
new file mode 100755
index 0000000000..b09a521308
--- /dev/null
+++ b/.claude/review-pr-eval/eval.py
@@ -0,0 +1,319 @@
+#!/usr/bin/env python3
+"""
+Evaluate review-pr prompt variants against a training set of PRs with known critical bugs.
+
+Usage:
+  python eval.py                              # evaluate ../commands/review-pr.md
+  python eval.py prompts/variant1.md          # evaluate a specific variant
+  python eval.py --compare current variant1   # compare two variants side by side
+
+Requires:
+  claude CLI (Claude Code) authenticated
+  gh CLI authenticated
+"""
+
+import json
+import subprocess
+import sys
+from contextlib import contextmanager
+from pathlib import Path
+from datetime import datetime
+
+SCRIPT_DIR = Path(__file__).parent
+TRAINING_SET_FILE = SCRIPT_DIR / "training_set.json"
+PROMPTS_DIR = SCRIPT_DIR / "prompts"
+RESULTS_DIR = SCRIPT_DIR.parent.parent / "build" / "review-pr-output"
+REPOS_DIR = Path.home() / "pr-eval-repos"
+LIVE_PROMPT = SCRIPT_DIR.parent / "commands" / "review-pr.md"
+
+JUDGE_MODEL = "claude-haiku-4-5"
+
+
+JUDGE_PROMPT = """You are evaluating whether a code review successfully identified a known critical issue.
+
+Known critical issue to find:
+{expected_issue}
+
+Code review output to evaluate:
+{review_output}
+
+Did the code review identify this issue or a substantially equivalent problem?
+
+Respond with exactly one of these verdicts, followed by a colon and brief explanation:
+CAUGHT: <explanation> — the review clearly identified this issue or its root cause
+PARTIAL: <explanation> — the review hinted at related concerns but didn't pinpoint the specific issue
+MISSED: <explanation> — the review did not identify this issue"""
+
+
+def pr_label(url: str) -> str:
+    """Return 'owner/repo#number' from a full GitHub PR URL."""
+    parts = url.rstrip("/").split("/")
+    # https://github.com/owner/repo/pull/number
+    return f"{parts[-4]}/{parts[-3]}#{parts[-1]}"
+
+
+def get_pr_data(url: str) -> tuple[dict, str]:
+    view_json = subprocess.check_output(
+        ["gh", "pr", "view", url, "--json", "title,body,author,number,url,mergeCommit"],
+        text=True,
+    )
+    pr_view = json.loads(view_json)
+    pr_diff = subprocess.check_output(
+        ["gh", "pr", "diff", url],
+        text=True,
+    )
+    return pr_view, pr_diff
+
+
+def repo_key(url: str) -> str:
+    """Return 'owner/repo' from a GitHub PR URL."""
+    parts = url.rstrip("/").split("/")
+    return f"{parts[-4]}/{parts[-3]}"
+
+
+@contextmanager
+def get_merge_commit(pr_view: dict, url: str):
+    """Context manager that checks out the PR's merge commit in ~/pr-eval-repos/<repo>."""
+    parts = url.rstrip("/").split("/")
+    org, repo_name = parts[-4], parts[-3]
+    repo_path = REPOS_DIR / repo_name
+    merge_commit = (pr_view.get("mergeCommit") or {}).get("oid")
+
+    if not merge_commit:
+        print(f"  (PR has no merge commit, skipping checkout)")
+        yield None
+        return
+
+    if not repo_path.exists():
+        REPOS_DIR.mkdir(parents=True, exist_ok=True)
+        print(f"  cloning {org}/{repo_name}... ", end="", flush=True)
+        subprocess.check_call(
+            ["gh", "repo", "clone", f"{org}/{repo_name}", str(repo_path), "--", "--filter=blob:none"],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+        print("done")
+
+    # Ensure commit is present (may need a fetch if repo is stale)
+    commit_known = subprocess.call(
+        ["git", "-C", str(repo_path), "cat-file", "-e", merge_commit],
+        stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
+    ) == 0
+    if not commit_known:
+        print(f"  fetching {org}/{repo_name}... ", end="", flush=True)
+        subprocess.check_call(
+            ["git", "-C", str(repo_path), "fetch", "--filter=blob:none", "origin"],
+            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
+        )
+        print("done")
+
+    print(f"  checking out {merge_commit[:12]}... ", end="", flush=True)
+    subprocess.check_call(
+        ["git", "-C", str(repo_path), "checkout", "--detach", merge_commit],
+        stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
+    )
+    print("done")
+    try:
+        yield str(repo_path)
+    finally:
+        pass  # Leave detached HEAD; next run will checkout the right commit
+
+
+def run_claude(prompt: str, extra_args: list[str] = None, stream: bool = False, cwd: str = None) -> str:
+    cmd = ["claude", "-p", "--dangerously-skip-permissions"]  # Run in headless mode. Trust Claude won't try anything dangerous
+    if extra_args:
+        cmd.extend(extra_args)
+
+    if stream:
+        process = subprocess.Popen(
+            cmd,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            cwd=cwd,
+        )
+        process.stdin.write(prompt)
+        process.stdin.close()
+        lines = []
+        for line in process.stdout:
+            print(line, end="", flush=True)
+            lines.append(line)
+        process.wait(timeout=1200)
+        if process.returncode != 0:
+            raise RuntimeError(process.stderr.read().strip() or f"claude -p exited with code {process.returncode}")
+        return "".join(lines).strip()
+    else:
+        result = subprocess.run(
+            cmd,
+            input=prompt,
+            capture_output=True,
+            text=True,
+            timeout=1200,
+            cwd=cwd,
+        )
+        if result.returncode != 0:
+            raise RuntimeError(result.stderr.strip() or f"claude -p exited with code {result.returncode}")
+        return result.stdout.strip()
+
+
+def run_review(prompt_template: str, pr_view: dict, pr_diff: str, cwd: str = None) -> str:
+    pr_summary = (
+        f"PR #{pr_view['number']}: {pr_view['title']}\n"
+        f"URL: {pr_view.get('url', '')}\n"
+        f"Author: {pr_view['author']['login']}\n\n"
+        f"Description:\n{pr_view.get('body', '(no description)')}"
+    )
+    # Inject the PR data in place of the gh CLI calls the prompt would normally make
+    full_prompt = f"""{prompt_template}
+
+---
+Note: The PR data has already been fetched. Use the following instead of running gh commands:
+
+## PR Details
+{pr_summary}
+
+## Diff
+{pr_diff}"""
+
+    print()
+    return run_claude(full_prompt, stream=True, cwd=cwd)
+
+
+def judge_review(review_output: str, expected_issue: str) -> tuple[str, str]:
+    prompt = JUDGE_PROMPT.format(
+        expected_issue=expected_issue,
+        review_output=review_output,
+    )
+    text = run_claude(prompt, extra_args=["--model", JUDGE_MODEL])
+    verdict = text.split(":")[0].strip().upper()
+    if verdict not in ("CAUGHT", "PARTIAL", "MISSED"):
+        verdict = "UNKNOWN"
+    return verdict, text
+
+
+def evaluate_prompt(prompt_file: Path, training_set: list) -> dict:
+    prompt_template = prompt_file.read_text()
+    results = []
+
+    for entry in training_set:
+        url = entry["url"]
+        short = pr_label(url)
+        print(f"  [{short}] fetching... ", end="", flush=True)
+
+        try:
+            pr_view, pr_diff = get_pr_data(url)
+            print(f"{pr_view['title']}")
+            print(f"  diff: {len(pr_diff):,} chars")
+            with get_merge_commit(pr_view, url) as cwd:
+                review = run_review(prompt_template, pr_view, pr_diff, cwd=cwd)
+            print(f"\n--- judging ---")
+
+            findings = []
+            for issue in entry["expected_issues"]:
+                print(f"  {issue[:80]}... ", end="", flush=True)
+                verdict, judge_explanation = judge_review(review, issue)
+                print(verdict)
+                findings.append({
+                    "expected_issue": issue,
+                    "verdict": verdict,
+                    "judge_explanation": judge_explanation,
+                })
+
+            results.append({
+                "url": url,
+                "title": pr_view["title"],
+                "findings": findings,
+                "review": review,
+            })
+        except Exception as e:
+            print(f"ERROR: {e}")
+            results.append({
+                "url": url,
+                "findings": [{"expected_issue": issue, "verdict": "ERROR", "error": str(e)}
+                             for issue in entry["expected_issues"]],
+            })
+
+    all_findings = [f for r in results for f in r["findings"]]
+    total = len(all_findings)
+    caught = sum(1 for f in all_findings if f["verdict"] == "CAUGHT")
+    partial = sum(1 for f in all_findings if f["verdict"] == "PARTIAL")
+    missed = sum(1 for f in all_findings if f["verdict"] == "MISSED")
+
+    return {
+        "prompt_file": str(prompt_file),
+        "timestamp": datetime.now().isoformat(),
+        "score": {
+            "caught": caught,
+            "partial": partial,
+            "missed": missed,
+            "total": total,
+            # Partial counts as 0.5 — it found something but wasn't precise
+            "catch_rate": round((caught + 0.5 * partial) / total, 2) if total > 0 else 0.0,
+        },
+        "results": results,
+    }
+
+
+def print_summary(evaluation: dict):
+    s = evaluation["score"]
+    name = Path(evaluation["prompt_file"]).stem
+    print(f"\n{name}: {s['caught']} caught, {s['partial']} partial, {s['missed']} missed / {s['total']} total  (catch rate: {s['catch_rate']:.0%})")
+    for r in evaluation["results"]:
+        short = pr_label(r["url"])
+        title = r.get("title", "")
+        for f in r["findings"]:
+            if f["verdict"] in ("MISSED", "PARTIAL", "ERROR"):
+                print(f"  [{f['verdict']}] {short} {title[:30]}: {f.get('judge_explanation', f.get('error', ''))}")
+
+
+def main():
+    RESULTS_DIR.mkdir(exist_ok=True)
+
+    if not TRAINING_SET_FILE.exists():
+        print(f"Error: training set not found at {TRAINING_SET_FILE}")
+        sys.exit(1)
+
+    training_set = json.loads(TRAINING_SET_FILE.read_text())
+    args = sys.argv[1:]
+
+    print(f"Warning: this evaluation runs Claude on {len(training_set)} PRs and will take 10+ minutes.")
+
+    if "--compare" in args:
+        idx = args.index("--compare")
+        names = args[idx + 1:]
+        if not names:
+            print("Usage: eval.py --compare <name1> <name2> ...")
+            sys.exit(1)
+
+        all_results = []
+        for name in names:
+            prompt_file = LIVE_PROMPT if name == "current" else PROMPTS_DIR / f"{name}.md"
+            print(f"\nEvaluating {name}...")
+            result = evaluate_prompt(prompt_file, training_set)
+            all_results.append(result)
+            out_file = RESULTS_DIR / f"{name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+            out_file.write_text(json.dumps(result, indent=2))
+            print_summary(result)
+
+        print("\n--- Comparison ---")
+        for r in all_results:
+            s = r["score"]
+            prompt_name = Path(r["prompt_file"]).stem
+            print(f"  {prompt_name:25s}  {s['catch_rate']:.0%}  ({s['caught']}C {s['partial']}P {s['missed']}M)")
+
+    else:
+        prompt_file = Path(args[0]) if args else LIVE_PROMPT
+        if not prompt_file.exists():
+            print(f"Error: prompt file not found at {prompt_file}")
+            sys.exit(1)
+        print(f"Evaluating {prompt_file.name} against {len(training_set)} PRs...")
+        result = evaluate_prompt(prompt_file, training_set)
+        print_summary(result)
+        out_file = RESULTS_DIR / f"{prompt_file.stem}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+        out_file.write_text(json.dumps(result, indent=2))
+        print(f"\nResults saved to {out_file}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.claude/review-pr-eval/training_set.json b/.claude/review-pr-eval/training_set.json
new file mode 100644
index 0000000000..6ead00209e
--- /dev/null
+++ b/.claude/review-pr-eval/training_set.json
@@ -0,0 +1,25 @@
+[
+  {
+    "url": "https://github.com/LabKey/limsModules/pull/1792",
+    "expected_issues": [
+      "The SQLServer version of sampleManagement-25.001-25.002.sql has the wrong WHERE clause and should use role='org.labkey.samplemanagement.security.roles.WorkflowEditorRole'",
+      "WorkflowController.SetDefaultEmailPrefAction unconditionally sets the success status for the response to false",
+      "WorkflowManager.addWorkflowJobAuditEvent() fetches originalJobMetadata but fails to pass it to encodeForDataMap() "
+    ]
+  }, {
+    "url": "https://github.com/LabKey/labkey-ui-components/pull/1144",
+    "expected_issues": [
+      "The change to BulkUpdateForm.tsx displayFieldUpdates.merge(data) unconditionally includes pre-populated display values for StoredAmount/Units in every non-empty form submission"
+    ]
+  }, {
+    "url": "https://github.com/LabKey/platform/pull/5703",
+    "expected_issues": [
+      "TsvDataSerializer.exportData() fails to write the first row of data for all files except the first due to a misplaced writeRow() call"
+    ]
+  }, {
+    "url": "https://github.com/LabKey/platform/pull/3949",
+    "expected_issues": [
+      "SampleTypeUpdateServiceDI.getMaterialMapsWithInput() uses `filter` instead of `cfilter` in the call to TableSelector"
+    ]
+  }
+]
\ No newline at end of file
diff --git a/CLAUDE.md b/CLAUDE.md
index eabbf4b886..9fb4f2a666 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -89,7 +89,7 @@ All external library versions are centralized in `gradle.properties` (200+ versi
 - **Java Streams**: Prefer `Stream` API over traditional for-loops for collection processing.
 - **Resources**: Use try-with-resources for automatic resource management.
 - **Nullability**: Use `org.jetbrains.annotations.NotNull` and `org.jetbrains.annotations.Nullable` annotations. Be explicit in public API signatures.
-- **Logging**: Use Log4J2. Name the static logger `LOG`, initialized via `LogHelper.getLogger()`:
+- **Logging**: Use Log4J2. Never use System.out or System.err. Name the static logger `LOG`, initialized via `LogHelper.getLogger()`:
   ```java
   private static final Logger LOG = LogHelper.getLogger(MyClass.class, "optional description");
   ```

From 105e0d3dd9fbf9a534d43d9ef0c2069a35f2087d Mon Sep 17 00:00:00 2001
From: labkey-jeckels <jeckels@labkey.com>
Date: Sat, 21 Mar 2026 08:56:42 -0700
Subject: [PATCH 2/5] Self-review improvements

---
 .claude/review-pr-eval/README.md              |  7 ++--
 .claude/review-pr-eval/eval.py                | 34 +++++++++++--------
 .../review-pr-eval/prompts/no-logic-trace.md  | 33 ++++++++++++++++++
 .claude/review-pr-eval/training_set.json      |  2 +-
 4 files changed, 59 insertions(+), 17 deletions(-)
 create mode 100644 .claude/review-pr-eval/prompts/no-logic-trace.md

diff --git a/.claude/review-pr-eval/README.md b/.claude/review-pr-eval/README.md
index da8cd57f55..ff5cffd5cb 100644
--- a/.claude/review-pr-eval/README.md
+++ b/.claude/review-pr-eval/README.md
@@ -2,6 +2,11 @@
 
 Evaluates variants of the `review-pr` prompt against a training set of GitHub PRs that contain known bugs, measuring how often the prompt catches them.
 
+Each run invokes Claude on every PR in the training set. With the current training set, expect **10+ minutes** per evaluation. A `--compare` with two names runs both sequentially, so plan for double that.
+
+**Security warning:** The eval script runs Claude with `--dangerously-skip-permissions` so it can read files from the checked-out repo. PR diffs are injected verbatim into Claude's prompt, so a PR containing adversarial instructions in its diff (e.g. in code comments or string literals) could act as a prompt injection attack and cause Claude to execute arbitrary commands without confirmation. Only add PRs from trusted sources — ideally already-merged, internal PRs where the diff content is known.
+
+
 ## Prerequisites
 
 - Python 3.10+
@@ -21,8 +26,6 @@ python eval.py prompts/my-variant.md
 python eval.py --compare current my-variant
 ```
 
-**Warning:** Each run invokes Claude on every PR in the training set. With the current 3-PR training set, expect **10+ minutes** per evaluation. A `--compare` with two names runs both sequentially, so plan for 20+ minutes.
-
 ## Training set
 
 `training_set.json` lists GitHub PR URLs and the specific bugs that are expected to be caught. The judge (Claude Haiku) scores each review as `CAUGHT`, `PARTIAL`, or `MISSED` for each expected issue.
diff --git a/.claude/review-pr-eval/eval.py b/.claude/review-pr-eval/eval.py
index b09a521308..56a29fa7f3 100755
--- a/.claude/review-pr-eval/eval.py
+++ b/.claude/review-pr-eval/eval.py
@@ -23,7 +23,7 @@
 TRAINING_SET_FILE = SCRIPT_DIR / "training_set.json"
 PROMPTS_DIR = SCRIPT_DIR / "prompts"
 RESULTS_DIR = SCRIPT_DIR.parent.parent / "build" / "review-pr-output"
-REPOS_DIR = Path.home() / "pr-eval-repos"
+REPOS_DIR = SCRIPT_DIR.parent.parent / "build" / "pr-eval-repos"
 LIVE_PROMPT = SCRIPT_DIR.parent / "commands" / "review-pr.md"
 
 JUDGE_MODEL = "claude-haiku-4-5"
@@ -65,18 +65,13 @@ def get_pr_data(url: str) -> tuple[dict, str]:
     return pr_view, pr_diff
 
 
-def repo_key(url: str) -> str:
-    """Return 'owner/repo' from a GitHub PR URL."""
-    parts = url.rstrip("/").split("/")
-    return f"{parts[-4]}/{parts[-3]}"
-
 
 @contextmanager
 def get_merge_commit(pr_view: dict, url: str):
     """Context manager that checks out the PR's merge commit in ~/pr-eval-repos/<repo>."""
     parts = url.rstrip("/").split("/")
     org, repo_name = parts[-4], parts[-3]
-    repo_path = REPOS_DIR / repo_name
+    repo_path = REPOS_DIR / org / repo_name
     merge_commit = (pr_view.get("mergeCommit") or {}).get("oid")
 
     if not merge_commit:
@@ -119,8 +114,10 @@ def get_merge_commit(pr_view: dict, url: str):
         pass  # Leave detached HEAD; next run will checkout the right commit
 
 
-def run_claude(prompt: str, extra_args: list[str] = None, stream: bool = False, cwd: str = None) -> str:
-    cmd = ["claude", "-p", "--dangerously-skip-permissions"]  # Run in headless mode. Trust Claude won't try anything dangerous
+def run_claude(prompt: str, extra_args: list[str] = None, stream: bool = False, cwd: str = None, skip_permissions: bool = False) -> str:
+    cmd = ["claude", "-p"]
+    if skip_permissions:
+        cmd.append("--dangerously-skip-permissions")
     if extra_args:
         cmd.extend(extra_args)
 
@@ -139,7 +136,11 @@ def run_claude(prompt: str, extra_args: list[str] = None, stream: bool = False,
         for line in process.stdout:
             print(line, end="", flush=True)
             lines.append(line)
-        process.wait(timeout=1200)
+        try:
+            process.wait(timeout=1200)
+        except subprocess.TimeoutExpired:
+            process.kill()
+            raise RuntimeError("claude -p timed out after 20 minutes")
         if process.returncode != 0:
             raise RuntimeError(process.stderr.read().strip() or f"claude -p exited with code {process.returncode}")
         return "".join(lines).strip()
@@ -177,7 +178,7 @@ def run_review(prompt_template: str, pr_view: dict, pr_diff: str, cwd: str = Non
 {pr_diff}"""
 
     print()
-    return run_claude(full_prompt, stream=True, cwd=cwd)
+    return run_claude(full_prompt, stream=True, cwd=cwd, skip_permissions=True)
 
 
 def judge_review(review_output: str, expected_issue: str) -> tuple[str, str]:
@@ -268,7 +269,7 @@ def print_summary(evaluation: dict):
 
 
 def main():
-    RESULTS_DIR.mkdir(exist_ok=True)
+    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
 
     if not TRAINING_SET_FILE.exists():
         print(f"Error: training set not found at {TRAINING_SET_FILE}")
@@ -286,9 +287,14 @@ def main():
             print("Usage: eval.py --compare <name1> <name2> ...")
             sys.exit(1)
 
+        prompt_files = {name: LIVE_PROMPT if name == "current" else PROMPTS_DIR / f"{name}.md" for name in names}
+        for name, prompt_file in prompt_files.items():
+            if not prompt_file.exists():
+                print(f"Error: prompt file not found at {prompt_file}")
+                sys.exit(1)
+
         all_results = []
-        for name in names:
-            prompt_file = LIVE_PROMPT if name == "current" else PROMPTS_DIR / f"{name}.md"
+        for name, prompt_file in prompt_files.items():
             print(f"\nEvaluating {name}...")
             result = evaluate_prompt(prompt_file, training_set)
             all_results.append(result)
diff --git a/.claude/review-pr-eval/prompts/no-logic-trace.md b/.claude/review-pr-eval/prompts/no-logic-trace.md
new file mode 100644
index 0000000000..5e5d643a20
--- /dev/null
+++ b/.claude/review-pr-eval/prompts/no-logic-trace.md
@@ -0,0 +1,33 @@
+[//]: # (Intentionally degraded version of the prompt. Useful for testing the comparison feature of eval.py.)
+
+Use the `gh` CLI to fetch the PR details and diff, then perform a code review.
+
+Steps:
+1. Run `gh pr view $ARGUMENTS` to get the PR title, description, and author.
+2. Run `gh pr diff $ARGUMENTS` to get the full diff.
+
+Then review the PR:
+
+## Step 1: Understand the Intent
+
+Summarize in 2-3 sentences what this PR is supposed to do.
+
+## Step 2: General Impressions
+
+Look over the changed code and note anything that seems off or could be improved. Focus on obvious issues like missing null checks, unclear variable names, or missing tests.
+
+## Step 3: Security
+
+- Input validation and sanitization
+- Authentication and authorization checks
+- SQL injection, XSS, path traversal
+
+## Output Format
+
+For each issue found, report:
+
+**Finding #*IncrementingNumber* -  [Severity: Critical/High/Medium/Low]** — *Category* — `file:line`
+> **Issue**: What is wrong.
+> **Suggestion**: How to fix it.
+
+Give a one-paragraph overall assessment.
diff --git a/.claude/review-pr-eval/training_set.json b/.claude/review-pr-eval/training_set.json
index 6ead00209e..ccac6fa156 100644
--- a/.claude/review-pr-eval/training_set.json
+++ b/.claude/review-pr-eval/training_set.json
@@ -22,4 +22,4 @@
       "SampleTypeUpdateServiceDI.getMaterialMapsWithInput() uses `filter` instead of `cfilter` in the call to TableSelector"
     ]
   }
-]
\ No newline at end of file
+]

From d8a049c77a46a0b26a121cc6b9b14c30e4d775e6 Mon Sep 17 00:00:00 2001
From: labkey-jeckels <jeckels@labkey.com>
Date: Sat, 21 Mar 2026 09:04:32 -0700
Subject: [PATCH 3/5] Self-review improvements

---
 .claude/review-pr-eval/README.md | 2 +-
 .claude/review-pr-eval/eval.py   | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/.claude/review-pr-eval/README.md b/.claude/review-pr-eval/README.md
index ff5cffd5cb..5279682270 100644
--- a/.claude/review-pr-eval/README.md
+++ b/.claude/review-pr-eval/README.md
@@ -53,7 +53,7 @@ python eval.py --compare current my-variant
 
 ## Repo cache
 
-When evaluating, the script checks out each PR's merge commit so Claude has access to the full repository context. Clones are stored at `~/pr-eval-repos/<repo-name>` and reused across runs. Fetches are only performed if the required commit is not already present locally. These clones use `--filter=blob:none` (blobless) so they are relatively lightweight.
+When evaluating, the script checks out each PR's merge commit so Claude has access to the full repository context. Clones are stored at `build/pr-eval-repos/<org>/<repo-name>` (relative to the server repo root) and reused across runs. Fetches are only performed if the required commit is not already present locally. These clones use `--filter=blob:none` (blobless) so they are relatively lightweight. Note that running `./gradlew clean` will delete the cached clones.
 
 ## Results
 
diff --git a/.claude/review-pr-eval/eval.py b/.claude/review-pr-eval/eval.py
index 56a29fa7f3..3c73f0a0f3 100755
--- a/.claude/review-pr-eval/eval.py
+++ b/.claude/review-pr-eval/eval.py
@@ -15,6 +15,7 @@
 import json
 import subprocess
 import sys
+import threading
 from contextlib import contextmanager
 from pathlib import Path
 from datetime import datetime
@@ -68,7 +69,7 @@ def get_pr_data(url: str) -> tuple[dict, str]:
 
 @contextmanager
 def get_merge_commit(pr_view: dict, url: str):
-    """Context manager that checks out the PR's merge commit in ~/pr-eval-repos/<repo>."""
+    """Context manager that checks out the PR's merge commit in <repo-root>/build/pr-eval-repos/<org>/<repo>."""
     parts = url.rstrip("/").split("/")
     org, repo_name = parts[-4], parts[-3]
     repo_path = REPOS_DIR / org / repo_name
@@ -132,17 +133,21 @@ def run_claude(prompt: str, extra_args: list[str] = None, stream: bool = False,
         )
         process.stdin.write(prompt)
         process.stdin.close()
+        stderr_lines = []
+        stderr_thread = threading.Thread(target=lambda: stderr_lines.extend(process.stderr), daemon=True)
+        stderr_thread.start()
         lines = []
         for line in process.stdout:
             print(line, end="", flush=True)
             lines.append(line)
+        stderr_thread.join()
         try:
             process.wait(timeout=1200)
         except subprocess.TimeoutExpired:
             process.kill()
             raise RuntimeError("claude -p timed out after 20 minutes")
         if process.returncode != 0:
-            raise RuntimeError(process.stderr.read().strip() or f"claude -p exited with code {process.returncode}")
+            raise RuntimeError("".join(stderr_lines).strip() or f"claude -p exited with code {process.returncode}")
         return "".join(lines).strip()
     else:
         result = subprocess.run(

From ada8a0b3edcc60d91649876f94db12e9a2bb2311 Mon Sep 17 00:00:00 2001
From: labkey-jeckels <jeckels@labkey.com>
Date: Sat, 21 Mar 2026 15:11:40 -0700
Subject: [PATCH 4/5] Caching, model comparison, and more

---
 .claude/review-pr-eval/README.md         |  12 ++
 .claude/review-pr-eval/eval.py           | 262 ++++++++++++++++-------
 .claude/review-pr-eval/training_set.json |  17 --
 3 files changed, 196 insertions(+), 95 deletions(-)

diff --git a/.claude/review-pr-eval/README.md b/.claude/review-pr-eval/README.md
index 5279682270..a97c8031fd 100644
--- a/.claude/review-pr-eval/README.md
+++ b/.claude/review-pr-eval/README.md
@@ -22,10 +22,21 @@ python eval.py
 # Evaluate a specific variant
 python eval.py prompts/my-variant.md
 
+# Evaluate using a specific model
+python eval.py --model claude-opus-4-6
+
 # Compare the live prompt against a variant side by side
 python eval.py --compare current my-variant
+
+# Compare the same prompt across two models
+python eval.py --compare current@claude-opus-4-6 current@claude-sonnet-4-6
+
+# Compare a variant on a specific model against the live prompt
+python eval.py --compare current my-variant@claude-opus-4-6
 ```
 
+The `name@model` syntax in `--compare` specifies which Claude model to use for the review step. Cache keys include the model, so results for different models are stored separately.
+
 ## Training set
 
 `training_set.json` lists GitHub PR URLs and the specific bugs that are expected to be caught. The judge (Claude Haiku) scores each review as `CAUGHT`, `PARTIAL`, or `MISSED` for each expected issue.
@@ -49,6 +60,7 @@ The live prompt is always `../commands/review-pr.md`. Named variants live in `pr
 cp ../commands/review-pr.md prompts/my-variant.md
 # edit prompts/my-variant.md
 python eval.py --compare current my-variant
+python eval.py --compare current my-variant@claude-opus-4-6
 ```
 
 ## Repo cache
diff --git a/.claude/review-pr-eval/eval.py b/.claude/review-pr-eval/eval.py
index 3c73f0a0f3..7572b21a24 100755
--- a/.claude/review-pr-eval/eval.py
+++ b/.claude/review-pr-eval/eval.py
@@ -12,10 +12,13 @@
   gh CLI authenticated
 """
 
+import hashlib
 import json
 import subprocess
 import sys
 import threading
+import time
+from collections import Counter
 from contextlib import contextmanager
 from pathlib import Path
 from datetime import datetime
@@ -24,6 +27,7 @@
 TRAINING_SET_FILE = SCRIPT_DIR / "training_set.json"
 PROMPTS_DIR = SCRIPT_DIR / "prompts"
 RESULTS_DIR = SCRIPT_DIR.parent.parent / "build" / "review-pr-output"
+CACHE_DIR = RESULTS_DIR / "cache"
 REPOS_DIR = SCRIPT_DIR.parent.parent / "build" / "pr-eval-repos"
 LIVE_PROMPT = SCRIPT_DIR.parent / "commands" / "review-pr.md"
 
@@ -115,55 +119,50 @@ def get_merge_commit(pr_view: dict, url: str):
         pass  # Leave detached HEAD; next run will checkout the right commit
 
 
-def run_claude(prompt: str, extra_args: list[str] = None, stream: bool = False, cwd: str = None, skip_permissions: bool = False) -> str:
-    cmd = ["claude", "-p"]
+def _format_usage(usage: dict) -> str:
+    if not usage:
+        return ""
+    parts = []
+    if "input_tokens" in usage:
+        parts.append(f"in: {usage['input_tokens']:,}")
+    if "cache_read_input_tokens" in usage and usage["cache_read_input_tokens"]:
+        parts.append(f"cache_read: {usage['cache_read_input_tokens']:,}")
+    if "output_tokens" in usage:
+        parts.append(f"out: {usage['output_tokens']:,}")
+    return f"  [{', '.join(parts)}]" if parts else ""
+
+
+def run_claude(prompt: str, extra_args: list[str] = None, cwd: str = None, skip_permissions: bool = False) -> tuple[str, dict]:
+    cmd = ["claude", "-p", "--output-format", "json"]
     if skip_permissions:
         cmd.append("--dangerously-skip-permissions")
     if extra_args:
         cmd.extend(extra_args)
 
-    if stream:
-        process = subprocess.Popen(
-            cmd,
-            stdin=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True,
-            cwd=cwd,
-        )
-        process.stdin.write(prompt)
-        process.stdin.close()
-        stderr_lines = []
-        stderr_thread = threading.Thread(target=lambda: stderr_lines.extend(process.stderr), daemon=True)
-        stderr_thread.start()
-        lines = []
-        for line in process.stdout:
-            print(line, end="", flush=True)
-            lines.append(line)
-        stderr_thread.join()
-        try:
-            process.wait(timeout=1200)
-        except subprocess.TimeoutExpired:
-            process.kill()
-            raise RuntimeError("claude -p timed out after 20 minutes")
-        if process.returncode != 0:
-            raise RuntimeError("".join(stderr_lines).strip() or f"claude -p exited with code {process.returncode}")
-        return "".join(lines).strip()
-    else:
-        result = subprocess.run(
-            cmd,
-            input=prompt,
-            capture_output=True,
-            text=True,
-            timeout=1200,
-            cwd=cwd,
-        )
-        if result.returncode != 0:
-            raise RuntimeError(result.stderr.strip() or f"claude -p exited with code {result.returncode}")
-        return result.stdout.strip()
+    process = subprocess.Popen(
+        cmd,
+        stdin=subprocess.PIPE,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        cwd=cwd,
+    )
+    timer = threading.Timer(1800, lambda: process.kill())
+    try:
+        timer.start()
+        stdout, stderr = process.communicate(input=prompt)
+    finally:
+        timer.cancel()
+    if process.returncode != 0:
+        raise RuntimeError(stderr.strip() or f"claude -p exited with code {process.returncode}")
+    try:
+        data = json.loads(stdout)
+        return data.get("result", "").strip(), data.get("usage", {})
+    except (json.JSONDecodeError, KeyError):
+        return stdout.strip(), {}
 
 
-def run_review(prompt_template: str, pr_view: dict, pr_diff: str, cwd: str = None) -> str:
+def run_review(prompt_template: str, pr_view: dict, pr_diff: str, cwd: str = None, model: str = None) -> str:
     pr_summary = (
         f"PR #{pr_view['number']}: {pr_view['title']}\n"
         f"URL: {pr_view.get('url', '')}\n"
@@ -182,8 +181,14 @@ def run_review(prompt_template: str, pr_view: dict, pr_diff: str, cwd: str = Non
 ## Diff
 {pr_diff}"""
 
-    print()
-    return run_claude(full_prompt, stream=True, cwd=cwd, skip_permissions=True)
+    model_label = f" [{model}]" if model else ""
+    print(f"  [{datetime.now().strftime('%H:%M:%S')}] reviewing{model_label}... ", end="", flush=True)
+    t0 = time.time()
+    extra_args = ["--model", model] if model else None
+    result, usage = run_claude(full_prompt, extra_args=extra_args, cwd=cwd, skip_permissions=True)
+    elapsed = int(time.time() - t0)
+    print(f"done in {elapsed}s ({len(result.splitlines())} lines){_format_usage(usage)}")
+    return result
 
 
 def judge_review(review_output: str, expected_issue: str) -> tuple[str, str]:
@@ -191,46 +196,104 @@ def judge_review(review_output: str, expected_issue: str) -> tuple[str, str]:
         expected_issue=expected_issue,
         review_output=review_output,
     )
-    text = run_claude(prompt, extra_args=["--model", JUDGE_MODEL])
+    print(f"    [{datetime.now().strftime('%H:%M:%S')}] {expected_issue[:70]}... ", end="", flush=True)
+    t0 = time.time()
+    text, usage = run_claude(prompt, extra_args=["--model", JUDGE_MODEL])
+    elapsed = int(time.time() - t0)
     verdict = text.split(":")[0].strip().upper()
     if verdict not in ("CAUGHT", "PARTIAL", "MISSED"):
         verdict = "UNKNOWN"
+    print(f"{verdict} ({elapsed}s){_format_usage(usage)}")
     return verdict, text
 
 
-def evaluate_prompt(prompt_file: Path, training_set: list) -> dict:
+def _cache_key(prompt_template: str, url: str, model: str = "") -> str:
+    return hashlib.sha256((prompt_template + "\n" + url + "\n" + model).encode()).hexdigest()[:32]
+
+
+def load_cached_pr_result(prompt_template: str, url: str, model: str = "") -> dict | None:
+    cache_file = CACHE_DIR / f"{_cache_key(prompt_template, url, model)}.json"
+    return json.loads(cache_file.read_text()) if cache_file.exists() else None
+
+
+def save_cached_pr_result(prompt_template: str, url: str, result: dict, model: str = ""):
+    CACHE_DIR.mkdir(parents=True, exist_ok=True)
+    cache_file = CACHE_DIR / f"{_cache_key(prompt_template, url, model)}.json"
+    cache_file.write_text(json.dumps(result, indent=2))
+
+
+def _aggregate_findings(all_run_findings: list[list[dict]]) -> list[dict]:
+    """Merge per-run findings into one finding per issue with verdict breakdown."""
+    aggregated = []
+    for i, base in enumerate(all_run_findings[0]):
+        verdicts = [run[i]["verdict"] for run in all_run_findings]
+        catch_rate = round(
+            (verdicts.count("CAUGHT") + 0.5 * verdicts.count("PARTIAL")) / len(verdicts), 2
+        )
+        majority = Counter(verdicts).most_common(1)[0][0]
+        aggregated.append({
+            "expected_issue": base["expected_issue"],
+            "verdict": majority,
+            "verdicts": verdicts,
+            "multi_run_catch_rate": catch_rate,
+            "judge_explanation": all_run_findings[-1][i]["judge_explanation"],
+        })
+    return aggregated
+
+
+def evaluate_prompt(prompt_file: Path, training_set: list, num_runs: int = 1, model: str = None, use_cache: bool = True) -> dict:
     prompt_template = prompt_file.read_text()
     results = []
 
     for entry in training_set:
         url = entry["url"]
         short = pr_label(url)
+
+        if use_cache and num_runs == 1:
+            cached = load_cached_pr_result(prompt_template, url, model or "")
+            if cached:
+                print(f"  [{short}] using cached result")
+                results.append(cached)
+                continue
+
         print(f"  [{short}] fetching... ", end="", flush=True)
 
         try:
             pr_view, pr_diff = get_pr_data(url)
             print(f"{pr_view['title']}")
             print(f"  diff: {len(pr_diff):,} chars")
-            with get_merge_commit(pr_view, url) as cwd:
-                review = run_review(prompt_template, pr_view, pr_diff, cwd=cwd)
-            print(f"\n--- judging ---")
-
-            findings = []
-            for issue in entry["expected_issues"]:
-                print(f"  {issue[:80]}... ", end="", flush=True)
-                verdict, judge_explanation = judge_review(review, issue)
-                print(verdict)
-                findings.append({
-                    "expected_issue": issue,
-                    "verdict": verdict,
-                    "judge_explanation": judge_explanation,
-                })
 
+            all_run_findings = []
+            last_review = None
+            with get_merge_commit(pr_view, url) as cwd:
+                for run_idx in range(num_runs):
+                    if num_runs > 1:
+                        print(f"  run {run_idx + 1}/{num_runs}:")
+                    last_review = run_review(prompt_template, pr_view, pr_diff, cwd=cwd, model=model)
+
+                    print(f"  --- judging ---")
+                    run_findings = []
+                    for issue in entry["expected_issues"]:
+                        verdict, judge_explanation = judge_review(last_review, issue)
+                        run_findings.append({
+                            "expected_issue": issue,
+                            "verdict": verdict,
+                            "judge_explanation": judge_explanation,
+                        })
+                    all_run_findings.append(run_findings)
+                    save_cached_pr_result(prompt_template, url, {
+                        "url": url,
+                        "title": pr_view["title"],
+                        "findings": run_findings,
+                        "review": last_review,
+                    }, model or "")
+
+            findings = _aggregate_findings(all_run_findings) if num_runs > 1 else all_run_findings[0]
             results.append({
                 "url": url,
                 "title": pr_view["title"],
                 "findings": findings,
-                "review": review,
+                "review": last_review,
             })
         except Exception as e:
             print(f"ERROR: {e}")
@@ -242,20 +305,32 @@ def evaluate_prompt(prompt_file: Path, training_set: list) -> dict:
 
     all_findings = [f for r in results for f in r["findings"]]
     total = len(all_findings)
-    caught = sum(1 for f in all_findings if f["verdict"] == "CAUGHT")
-    partial = sum(1 for f in all_findings if f["verdict"] == "PARTIAL")
-    missed = sum(1 for f in all_findings if f["verdict"] == "MISSED")
+    if num_runs > 1:
+        # Average the per-issue catch rates across runs
+        catch_rate = round(sum(f.get("multi_run_catch_rate", 0) for f in all_findings) / total, 2) if total > 0 else 0.0
+        caught = round(sum(f.get("multi_run_catch_rate", 0) for f in all_findings))
+        partial = 0
+        missed = total - caught
+    else:
+        caught = sum(1 for f in all_findings if f["verdict"] == "CAUGHT")
+        partial = sum(1 for f in all_findings if f["verdict"] == "PARTIAL")
+        missed = sum(1 for f in all_findings if f["verdict"] == "MISSED")
+        catch_rate = round((caught + 0.5 * partial) / total, 2) if total > 0 else 0.0
 
+    stem = prompt_file.stem
+    model_label = f"{stem}@{model}" if model else stem
     return {
         "prompt_file": str(prompt_file),
+        "model": model,
+        "model_label": model_label,
         "timestamp": datetime.now().isoformat(),
+        "num_runs": num_runs,
         "score": {
             "caught": caught,
             "partial": partial,
             "missed": missed,
             "total": total,
-            # Partial counts as 0.5 — it found something but wasn't precise
-            "catch_rate": round((caught + 0.5 * partial) / total, 2) if total > 0 else 0.0,
+            "catch_rate": catch_rate,
         },
         "results": results,
     }
@@ -263,13 +338,18 @@ def evaluate_prompt(prompt_file: Path, training_set: list) -> dict:
 
 def print_summary(evaluation: dict):
     s = evaluation["score"]
-    name = Path(evaluation["prompt_file"]).stem
-    print(f"\n{name}: {s['caught']} caught, {s['partial']} partial, {s['missed']} missed / {s['total']} total  (catch rate: {s['catch_rate']:.0%})")
+    name = evaluation.get("model_label") or Path(evaluation["prompt_file"]).stem
+    num_runs = evaluation.get("num_runs", 1)
+    run_label = f" over {num_runs} runs" if num_runs > 1 else ""
+    print(f"\n{name}{run_label}: {s['caught']} caught, {s['partial']} partial, {s['missed']} missed / {s['total']} total  (catch rate: {s['catch_rate']:.0%})")
     for r in evaluation["results"]:
         short = pr_label(r["url"])
         title = r.get("title", "")
         for f in r["findings"]:
-            if f["verdict"] in ("MISSED", "PARTIAL", "ERROR"):
+            if "verdicts" in f:
+                breakdown = ", ".join(f"{v}×{f['verdicts'].count(v)}" for v in ("CAUGHT", "PARTIAL", "MISSED") if f["verdicts"].count(v))
+                print(f"  [{f['verdict']}] {short} {title[:30]}: {breakdown} — {f.get('judge_explanation', '')}")
+            elif f["verdict"] in ("MISSED", "PARTIAL", "ERROR"):
                 print(f"  [{f['verdict']}] {short} {title[:30]}: {f.get('judge_explanation', f.get('error', ''))}")
 
 
@@ -283,7 +363,27 @@ def main():
     training_set = json.loads(TRAINING_SET_FILE.read_text())
     args = sys.argv[1:]
 
-    print(f"Warning: this evaluation runs Claude on {len(training_set)} PRs and will take 10+ minutes.")
+    num_runs = 1
+    runs_specified = "--runs" in args
+    if runs_specified:
+        idx = args.index("--runs")
+        if idx + 1 >= len(args):
+            print("Usage: eval.py --runs <N>")
+            sys.exit(1)
+        num_runs = int(args[idx + 1])
+        args = args[:idx] + args[idx + 2:]
+
+    run_label = f" ({num_runs} runs each)" if num_runs > 1 else ""
+    print(f"Warning: this evaluation runs Claude on {len(training_set)} PRs{run_label} and will take 10+ minutes.")
+
+    single_model = None
+    if "--model" in args:
+        idx = args.index("--model")
+        if idx + 1 >= len(args):
+            print("Usage: eval.py --model <model>")
+            sys.exit(1)
+        single_model = args[idx + 1]
+        args = args[:idx] + args[idx + 2:]
 
     if "--compare" in args:
         idx = args.index("--compare")
@@ -292,26 +392,32 @@ def main():
             print("Usage: eval.py --compare <name1> <name2> ...")
             sys.exit(1)
 
-        prompt_files = {name: LIVE_PROMPT if name == "current" else PROMPTS_DIR / f"{name}.md" for name in names}
-        for name, prompt_file in prompt_files.items():
+        entries = []
+        for name in names:
+            prompt_name, _, model = name.partition("@")
+            prompt_file = LIVE_PROMPT if prompt_name == "current" else PROMPTS_DIR / f"{prompt_name}.md"
+            entries.append((name, prompt_file, model or None))
+
+        for name, prompt_file, _ in entries:
             if not prompt_file.exists():
                 print(f"Error: prompt file not found at {prompt_file}")
                 sys.exit(1)
 
         all_results = []
-        for name, prompt_file in prompt_files.items():
+        for name, prompt_file, model in entries:
             print(f"\nEvaluating {name}...")
-            result = evaluate_prompt(prompt_file, training_set)
+            result = evaluate_prompt(prompt_file, training_set, num_runs=num_runs, model=model, use_cache=not runs_specified)
             all_results.append(result)
-            out_file = RESULTS_DIR / f"{name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+            safe_name = name.replace("@", "_at_")
+            out_file = RESULTS_DIR / f"{safe_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
             out_file.write_text(json.dumps(result, indent=2))
             print_summary(result)
 
         print("\n--- Comparison ---")
         for r in all_results:
             s = r["score"]
-            prompt_name = Path(r["prompt_file"]).stem
-            print(f"  {prompt_name:25s}  {s['catch_rate']:.0%}  ({s['caught']}C {s['partial']}P {s['missed']}M)")
+            label = r.get("model_label") or Path(r["prompt_file"]).stem
+            print(f"  {label:35s}  {s['catch_rate']:.0%}  ({s['caught']}C {s['partial']}P {s['missed']}M)")
 
     else:
         prompt_file = Path(args[0]) if args else LIVE_PROMPT
@@ -319,7 +425,7 @@ def main():
             print(f"Error: prompt file not found at {prompt_file}")
             sys.exit(1)
         print(f"Evaluating {prompt_file.name} against {len(training_set)} PRs...")
-        result = evaluate_prompt(prompt_file, training_set)
+        result = evaluate_prompt(prompt_file, training_set, num_runs=num_runs, model=single_model, use_cache=not runs_specified)
         print_summary(result)
         out_file = RESULTS_DIR / f"{prompt_file.stem}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
         out_file.write_text(json.dumps(result, indent=2))
diff --git a/.claude/review-pr-eval/training_set.json b/.claude/review-pr-eval/training_set.json
index ccac6fa156..db921ee311 100644
--- a/.claude/review-pr-eval/training_set.json
+++ b/.claude/review-pr-eval/training_set.json
@@ -1,25 +1,8 @@
 [
   {
-    "url": "https://github.com/LabKey/limsModules/pull/1792",
-    "expected_issues": [
-      "The SQLServer version of sampleManagement-25.001-25.002.sql has the wrong WHERE clause and should use role='org.labkey.samplemanagement.security.roles.WorkflowEditorRole'",
-      "WorkflowController.SetDefaultEmailPrefAction unconditionally sets the success status for the response to false",
-      "WorkflowManager.addWorkflowJobAuditEvent() fetches originalJobMetadata but fails to pass it to encodeForDataMap() "
-    ]
-  }, {
-    "url": "https://github.com/LabKey/labkey-ui-components/pull/1144",
-    "expected_issues": [
-      "The change to BulkUpdateForm.tsx displayFieldUpdates.merge(data) unconditionally includes pre-populated display values for StoredAmount/Units in every non-empty form submission"
-    ]
-  }, {
     "url": "https://github.com/LabKey/platform/pull/5703",
     "expected_issues": [
       "TsvDataSerializer.exportData() fails to write the first row of data for all files except the first due to a misplaced writeRow() call"
     ]
-  }, {
-    "url": "https://github.com/LabKey/platform/pull/3949",
-    "expected_issues": [
-      "SampleTypeUpdateServiceDI.getMaterialMapsWithInput() uses `filter` instead of `cfilter` in the call to TableSelector"
-    ]
   }
 ]

From 0838a0fda740e770423d52dd78457305be27b508 Mon Sep 17 00:00:00 2001
From: labkey-jeckels <jeckels@labkey.com>
Date: Sat, 21 Mar 2026 15:34:20 -0700
Subject: [PATCH 5/5] Model args

---
 .claude/review-pr-eval/eval.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.claude/review-pr-eval/eval.py b/.claude/review-pr-eval/eval.py
index 7572b21a24..8c911cc2ca 100755
--- a/.claude/review-pr-eval/eval.py
+++ b/.claude/review-pr-eval/eval.py
@@ -394,9 +394,12 @@ def main():
 
         entries = []
         for name in names:
-            prompt_name, _, model = name.partition("@")
+            prompt_name, has_at, inline_model = name.partition("@")
+            if has_at and single_model:
+                print(f"Warning: --model {single_model!r} and @model syntax both specified for {name!r}; @model takes precedence")
+            effective_model = (inline_model if has_at else None) or single_model
             prompt_file = LIVE_PROMPT if prompt_name == "current" else PROMPTS_DIR / f"{prompt_name}.md"
-            entries.append((name, prompt_file, model or None))
+            entries.append((name, prompt_file, effective_model))
 
         for name, prompt_file, _ in entries:
             if not prompt_file.exists():