40 changes: 40 additions & 0 deletions .claude/skills/dev-cluster/SKILL.md
@@ -382,6 +382,46 @@ npm run dev
- Backend, operator, or runner changes (those still need image rebuild + load)
- Testing changes to container configuration or deployment manifests

## Benchmarking Developer Loops

Use the benchmark harness when the user wants measured cold-start or rebuild timing rather than ad hoc impressions.

### Commands

```bash
# Human-friendly local summary
make benchmark

# Agent/automation-friendly output
make benchmark FORMAT=tsv

# Single component
make benchmark COMPONENT=frontend MODE=cold
make benchmark COMPONENT=backend MODE=warm
```

### Agent Guidance

- Prefer `FORMAT=tsv` when another agent, script, or evaluation harness will consume the output.
- Prefer the default `human` format for interactive local use in a terminal.
- `frontend` benchmarking requires **Node.js 20+**.
- `warm` currently measures **rebuild proxies**, not browser-observed hot reload latency.
- If `reports/benchmarks/` is not writable in the current environment, the harness will fall back to a temp directory and print a warning.
- Session benchmarking is **contract-only** in v1 (`bench_session_*` stubs in `scripts/benchmarks/bench-manifest.sh`).
- Start with the **smallest relevant benchmark**:
  - backend/operator/public-api change -> `MODE=warm COMPONENT=<component> REPEATS=1`
  - frontend contributor setup -> `MODE=cold COMPONENT=frontend REPEATS=1`
  - Run all components only when you explicitly need the whole matrix.
- Treat preflight failures as useful environment signals; do not work around them unless the user asks.
- Use full-sweep benchmarking sparingly because each component still performs untimed setup before the measured warm rebuild.
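When another tool consumes the report, a filter like the one the CI summary step uses can be applied directly. A minimal sketch with fabricated sample rows — the column positions (`$1` component, `$2` mode, `$4` seconds, `$8` budget_ok) mirror the CI workflow's awk filters, but the header names here are assumptions:

```shell
# Fabricated sample report, shaped like reports/benchmarks/results.tsv.
printf 'component\tmode\truns\tseconds\tbaseline\tdelta_pct\tnotes\tbudget_ok\n' > results.tsv
printf 'frontend\tcold\t1\t72.4\t58.0\t24.8\t-\tfalse\n' >> results.tsv
printf 'backend\twarm\t1\t9.1\t8.9\t2.2\t-\ttrue\n' >> results.tsv

# Flag cold runs that exceeded the 60s contributor budget
# (same column positions as the CI summary step).
awk -F'\t' 'NR > 1 && $2 == "cold" && $8 == "false" { print $1 " (" $4 "s)" }' results.tsv
# -> frontend (72.4s)
```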

### Interpreting Results

- `cold`: approximates first-contributor setup/install cost with isolated caches
- `warm`: approximates incremental rebuild cost after setup has already completed
- `budget_ok=false` on cold runs means the component exceeded the 60-second contributor budget
- Large deltas on a single repeat should be treated cautiously; use more repeats before drawing conclusions
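The single-repeat caveat can be handled by taking the median of several repeats; a sketch over sample warm timings (values invented for illustration — substitute the seconds column from the TSV report):

```shell
# Median of repeated warm timings: sort numerically, take the middle line.
# Sample values only; a single 12.7s outlier no longer dominates the estimate.
printf '9.4\n8.9\n12.7\n9.1\n9.0\n' | sort -n \
  | awk '{ a[NR] = $1 } END { print a[int((NR + 1) / 2)] }'
# -> 9.1
```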

## Best Practices

1. **Use local dev server for frontend**: Fastest feedback loop, no image rebuilds needed
17 changes: 17 additions & 0 deletions .env.local.example
@@ -1 +1,18 @@
# Root `.env.local` (optional, loaded by Makefile via `-include .env.local`)
# Used for kind-on-remote-host and other repo-wide overrides.
#
# Example: point tools at a remote machine running kind (Tailscale, etc.)
KIND_HOST=100.x.x.x

# --- Frontend local dev (`components/frontend/.env.local`) ---
# Prefer generating this file with `make dev-env` after the cluster is up; it sets:
# BACKEND_URL=http://localhost:<KIND_FWD_BACKEND_PORT>/api
# OC_TOKEN=<from cluster test-user-token>
# ENABLE_OC_WHOAMI=0
#
# `make dev COMPONENT=frontend` writes/updates that file and runs `npm run dev`.
# `make dev COMPONENT=frontend,backend` uses BACKEND_URL=http://localhost:8080/api (local go run).
#
# Makefile variables for the dev workflow (pass on the command line):
# COMPONENT=frontend|backend|frontend,backend
# AUTO_CLUSTER=true # run kind-up without prompting if cluster is missing
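As a sketch, the generated frontend file looks roughly like this — all values below are placeholders, since the real `make dev-env` fills in the kind-forwarded backend port and a cluster token:

```shell
# Hand-rolled approximation of what `make dev-env` writes; prefer the
# make target against a real cluster. Every value here is a placeholder.
mkdir -p components/frontend
cat > components/frontend/.env.local <<'EOF'
BACKEND_URL=http://localhost:8080/api
OC_TOKEN=replace-with-cluster-test-user-token
ENABLE_OC_WHOAMI=0
EOF
```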
123 changes: 123 additions & 0 deletions .github/workflows/component-benchmarks.yml
@@ -0,0 +1,123 @@
name: Component Benchmarks

permissions:
  contents: read

on:
  workflow_dispatch:
    inputs:
      components:
        description: Components to benchmark (comma-separated or all)
        required: false
        default: all
        type: string
      mode:
        description: cold, warm, or both
        required: false
        default: both
        type: string
      baseline_ref:
        description: Optional baseline git ref
        required: false
        default: ""
        type: string
  pull_request:
    types: [labeled]

jobs:
  benchmark:
    if: github.event_name == 'workflow_dispatch' || github.event.label.name == 'benchmark'
    runs-on: ubuntu-latest
    timeout-minutes: 90

    steps:
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          fetch-depth: 0

      - name: Set up Go
        uses: actions/setup-go@v6
        with:
          go-version-file: components/backend/go.mod
          cache: false

      - name: Set up Node.js
        uses: actions/setup-node@v6
        with:
          node-version: "20"

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.11"

      - name: Run benchmark harness self-tests
        run: |
          bash tests/bench-test.sh

      - name: Run component benchmarks
        env:
          COMPONENTS: ${{ inputs.components }}
          MODE: ${{ inputs.mode }}
          BASELINE_REF_INPUT: ${{ inputs.baseline_ref }}
        run: |
          set -euo pipefail

          ARGS=()
          if [[ -n "${COMPONENTS:-}" && "${COMPONENTS}" != "all" ]]; then
            ARGS+=(--components "${COMPONENTS}")
          fi
          if [[ -n "${MODE:-}" ]]; then
            ARGS+=(--mode "${MODE}")
          fi
          if [[ -n "${BASELINE_REF_INPUT:-}" ]]; then
            ARGS+=(--baseline-ref "${BASELINE_REF_INPUT}")
          fi

          bash scripts/benchmarks/component-bench.sh --ci "${ARGS[@]}" >/dev/null

      - name: Publish benchmark summary
        if: always()
        run: |
          {
            echo "### Component Benchmarks"
            echo

            if [[ -f reports/benchmarks/results.tsv ]]; then
              OVER_BUDGET=$(awk -F'\t' 'NR > 1 && $2 == "cold" && $8 == "false" { print $1 " (" $4 "s)" }' reports/benchmarks/results.tsv)
              REGRESSIONS=$(awk -F'\t' 'NR > 1 && ($6 + 0) > 10.0 { print $1 " " $2 " (" $6 "%)" }' reports/benchmarks/results.tsv)

              if [[ -n "${OVER_BUDGET}" ]]; then
                echo "**Over 60s budget:**"
                while IFS= read -r line; do
                  [[ -n "$line" ]] && echo "- $line"
                done <<<"${OVER_BUDGET}"
                echo
              fi

              if [[ -n "${REGRESSIONS}" ]]; then
                echo "**Regressions over 10%:**"
                while IFS= read -r line; do
                  [[ -n "$line" ]] && echo "- $line"
                done <<<"${REGRESSIONS}"
                echo
              fi
            fi

            echo '```text'
            if [[ -f reports/benchmarks/results.human.txt ]]; then
              cat reports/benchmarks/results.human.txt
            else
              echo "No human-readable benchmark report was generated."
            fi
            echo '```'
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Upload benchmark artifacts
        if: always()
        uses: actions/upload-artifact@v6
        with:
          name: component-benchmarks-${{ github.run_id }}
          path: reports/benchmarks/
          retention-days: 7
1 change: 1 addition & 0 deletions .gitignore
@@ -60,6 +60,7 @@ venv.bak/
# Environment files
.env
.env.local
components/frontend/.env.local
.env.uat
.dev-bootstrap.env

21 changes: 21 additions & 0 deletions CLAUDE.md
@@ -42,6 +42,7 @@ make test # Run tests
make lint # Lint code
make kind-up # Start local Kind cluster
make test-e2e-local # Run E2E tests against Kind
make benchmark # Run component benchmark harness
```

### Per-Component
@@ -61,6 +62,26 @@ cd components/runners/ambient-runner && uv venv && uv pip install -e .
cd docs && npm run dev # http://localhost:4321
```

### Benchmarking

```shell
# Human-friendly summary
make benchmark

# Agent/automation-friendly output
make benchmark FORMAT=tsv

# Single component
make benchmark COMPONENT=frontend MODE=cold
```

Benchmark notes:

- `frontend` requires **Node.js 20+**
- `FORMAT=tsv` is preferred for agents to minimize token usage
- `warm` measures rebuild proxies, not browser-observed hot reload latency
- See `scripts/benchmarks/README.md` for semantics and caveats
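The Node.js 20+ requirement can be checked before a frontend run; a preflight sketch that parses a `node --version`-style string (the sample value is hard-coded here for illustration):

```shell
# Gate frontend benchmarks on Node.js 20+. In a real preflight, replace
# the sample string with "$(node --version)".
ver="v20.11.1"
major=${ver#v}       # strip the leading "v"
major=${major%%.*}   # keep only the major component
if [ "$major" -ge 20 ]; then
  echo "frontend benchmark ok"
else
  echo "need Node.js 20+ (found $ver)"
fi
# -> frontend benchmark ok
```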

## Critical Context

- **User token auth required**: All user-facing API ops use `GetK8sClientsForRequest(c)`, never the backend service account