From 53d4a06bc7749c95f8e9b390b1239202303d7732 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 25 Mar 2026 09:56:42 +0000 Subject: [PATCH 01/42] test: add granularity marker taxonomy infrastructure (#727) Register unit/integration/e2e markers in conftest and pyproject.toml. Add unit auto-apply hook in pytest_collection_modifyitems. Deprecate llm marker (synonym for e2e). Remove dead plugins marker. Rewrite MARKERS_GUIDE.md as authoritative marker reference. Sync AGENTS.md Section 3 with new taxonomy. --- AGENTS.md | 31 ++- pyproject.toml | 9 +- test/MARKERS_GUIDE.md | 543 ++++++++++++++---------------------------- test/conftest.py | 25 +- 4 files changed, 227 insertions(+), 381 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index c5768d81d..6c0dbb723 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -47,9 +47,18 @@ uv run mypy . # Type check | `scratchpad/` | Experiments (git-ignored) | ## 3. Test Markers -All tests and examples use markers to indicate requirements. The test infrastructure automatically skips tests based on system capabilities. +All tests and examples use markers to indicate requirements. The test infrastructure automatically skips tests based on system capabilities. See `test/MARKERS_GUIDE.md` for the full reference. + +**Granularity Tiers** (every test belongs to exactly one): + +- `unit` — Self-contained, no services, no I/O. **Auto-applied by conftest** — never write explicitly. +- `@pytest.mark.integration` — Multiple components wired together, may need fixture-managed services. +- `@pytest.mark.e2e` — Real backends (cloud APIs, local servers, GPU models). Always paired with backend markers. +- `@pytest.mark.qualitative` — Subset of e2e with non-deterministic assertions. Per-function only. +- `@pytest.mark.llm` — **Deprecated**, synonym for `e2e`. Use `e2e` in new tests. 
+ +**Backend Markers** (e2e/qualitative only): -**Backend Markers:** - `@pytest.mark.ollama` — Requires Ollama running (local, lightweight) - `@pytest.mark.huggingface` — Requires HuggingFace backend (local, heavy) - `@pytest.mark.vllm` — Requires vLLM backend (local, GPU required) @@ -57,20 +66,17 @@ All tests and examples use markers to indicate requirements. The test infrastruc - `@pytest.mark.watsonx` — Requires Watsonx API (requires API key) - `@pytest.mark.litellm` — Requires LiteLLM backend -**Capability Markers:** +**Resource/Capability Markers** (e2e/qualitative only): + - `@pytest.mark.requires_gpu` — Requires GPU - `@pytest.mark.requires_heavy_ram` — Requires 48GB+ RAM - `@pytest.mark.requires_api_key` — Requires external API keys -- `@pytest.mark.qualitative` — LLM output quality tests (skipped in CI via `CICD=1`) -- `@pytest.mark.llm` — Makes LLM calls (needs at least Ollama) -- `@pytest.mark.slow` — Tests taking >5 minutes (skipped via `SKIP_SLOW=1`) - -**Execution Strategy Markers:** - `@pytest.mark.requires_gpu_isolation` — Requires OS-level process isolation to clear CUDA memory (use with `--isolate-heavy` or `CICD=1`) +- `@pytest.mark.slow` — Tests taking >1 minute (skipped by default) **Examples in `docs/examples/`** use comment-based markers for clean code: ```python -# pytest: ollama, llm, requires_heavy_ram +# pytest: e2e, ollama, qualitative """Example description...""" # Your clean example code here @@ -79,12 +85,13 @@ All tests and examples use markers to indicate requirements. The test infrastruc Tests/examples automatically skip if system lacks required resources. Heavy examples (e.g., HuggingFace) are skipped during collection to prevent memory issues. 
**Default behavior:** -- `uv run pytest` skips slow tests (>5 min) but runs qualitative tests +- `uv run pytest` skips slow tests (>1 min) but runs qualitative tests - Use `pytest -m "not qualitative"` for fast tests only (~2 min) -- Use `pytest -m slow` or `pytest` (without config) to include slow tests +- Use `pytest -m unit` for self-contained tests only (fastest) +- Use `pytest -m slow` to include slow tests ⚠️ Don't add `qualitative` to trivial tests—keep the fast loop fast. -⚠️ Mark tests taking >5 minutes with `slow` (e.g., dataset loading, extensive evaluations). +⚠️ Mark tests taking >1 minute with `slow` (e.g., dataset loading, extensive evaluations). ## 4. Coding Standards - **Types required** on all core functions diff --git a/pyproject.toml b/pyproject.toml index f64a15d2a..628b02527 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -327,6 +327,11 @@ skip = 'requirements.txt,uv.lock,CHANGELOG.md,test/formatters/granite/testdata/t [tool.pytest.ini_options] testpaths = ["test", "docs"] # Run test/ first (fail fast), then docs/ markers = [ + # Granularity markers + "unit: Self-contained tests — no services, no I/O (auto-applied when no other granularity marker present)", + "integration: Tests needing additional services or multi-component wiring (may use fixture-managed dependencies)", + "e2e: Tests against real backends — cloud APIs, local servers, or GPU-loaded models", + # Backend markers "ollama: Tests requiring Ollama backend (local, light)", "openai: Tests requiring OpenAI API (requires API key)", @@ -342,8 +347,8 @@ markers = [ "qualitative: Non-deterministic quality tests", "slow: Tests taking >1 minute (e.g., multi-step pipelines like researcher)", - # Composite markers - "llm: Tests that make LLM calls (needs at least Ollama)", + # Composite markers (llm is deprecated — use e2e instead) + "llm: Tests that make LLM calls (deprecated — use e2e instead)", ] asyncio_mode = "auto" # Don't require explicitly marking async tests. 
addopts = [ diff --git a/test/MARKERS_GUIDE.md b/test/MARKERS_GUIDE.md index 49253bab9..aec85a104 100644 --- a/test/MARKERS_GUIDE.md +++ b/test/MARKERS_GUIDE.md @@ -1,447 +1,264 @@ -# Pytest Markers Guide for Mellea Tests +# Pytest Markers Guide -## Overview - -This guide explains the pytest marker system for categorizing and running mellea tests based on backend requirements, resource availability, and test characteristics. - -## 🎯 What's Automatic vs Manual - -### ✅ Automatic (No Configuration Needed) -When you run `pytest`, the system **automatically detects** and skips tests based on: -- **Ollama availability** - Checks if port 11434 is listening -- **API keys** - Checks environment variables (`OPENAI_API_KEY`, `WATSONX_API_KEY`, etc.) -- **GPU availability** - Checks for CUDA (NVIDIA) or MPS (Apple Silicon) via torch -- **System RAM** - Checks via `psutil.virtual_memory()` (if psutil installed) - -**You don't need to configure anything!** Just run `pytest` and tests will automatically skip with helpful messages if requirements aren't met. - -**Note:** -- GPU detection requires `torch` (included in `mellea[hf]` and `mellea[vllm]`) -- RAM detection requires `psutil` (included in dev dependencies) -- If you're not using dev dependencies, install with: `pip install psutil` - -### ⚠️ Manual (Developer Adds to Test Files) -Developers must **add markers** to test files to indicate what each test needs: -```python -# Developer adds these markers once per test file -pytestmark = [pytest.mark.ollama, pytest.mark.llm] -``` - -**Summary:** Markers are manual (one-time setup per test file), detection is automatic (every test run). - -### 🔧 Override Auto-Detection (Advanced) -Want to try running tests even when requirements aren't met? 
Use these pytest options: +## Quick Reference ```bash -# Try GPU tests without GPU (will use CPU, may be slow/fail) -pytest --ignore-gpu-check test/backends/test_vllm.py +# By granularity tier +pytest -m unit # Self-contained, no services (fast) +pytest -m integration # Multi-component, fixture-managed deps +pytest -m e2e # Real backends (ollama, APIs, GPU models) +pytest -m "e2e and not qualitative" # Deterministic real-backend tests only + +# By backend +pytest -m ollama # Ollama tests +pytest -m huggingface # HuggingFace tests +pytest -m "openai or watsonx" # Cloud API tests + +# By characteristics +pytest -m "not qualitative" # Fast, deterministic tests (~2 min) +pytest -m qualitative # Non-deterministic output quality tests +pytest -m slow # Long-running tests (>1 min) + +# Default (configured in pyproject.toml): skips slow, includes qualitative +pytest +``` -# Try with less RAM than recommended -pytest --ignore-ram-check test/backends/test_huggingface.py +## Granularity Tiers -# Try without Ollama running -pytest --ignore-ollama-check test/backends/test_ollama.py +Every test belongs to exactly one tier. The tier determines what infrastructure +the test needs and how fast/heavy it is to run. -# Try without API keys (will fail at API call) -pytest --ignore-api-key-check test/backends/test_openai.py +### Unit (auto-applied) -# Ignore all checks at once (convenience flag) -pytest --ignore-all-checks +**Entirely self-contained** — no services, no I/O, no fixtures that connect +to anything external. Pure logic testing. 
-# Enable GPU process isolation (opt-in, slower but guarantees CUDA memory release) -pytest --isolate-heavy test/backends/test_vllm.py test/backends/test_huggingface.py +- Auto-applied by conftest hook when no other granularity marker is present +- **Never write `@pytest.mark.unit` on files** — it is implicit +- Runs in milliseconds to low seconds, minimal memory +- Would pass on any machine with just Python and project deps -# Combine multiple overrides -pytest --ignore-gpu-check --ignore-ram-check -m "huggingface" +```python +# No markers needed — auto-applied as unit +def test_cblock_repr(): + assert str(CBlock(value="hi")) == "hi" ``` -**Use Cases:** -- Testing with CPU when GPU tests might work (slower but functional) -- Trying with less RAM (might work for smaller models) -- Debugging test infrastructure +### Integration (explicit) -**Warning:** Tests will likely fail if requirements aren't actually met! +**Multiple components wired together**, potentially needing additional services +or fixture-managed dependencies. Backends may be mocked, stubbed, or stood up +by test fixtures. The test controls or provides its own dependencies. 
-## Quick Start +- Add `@pytest.mark.integration` explicitly +- Slower than unit (fixture setup, service lifecycle), may consume more memory +- No backend markers needed — integration tests don't use real backends -```bash -# Default: qualitative tests, skip slow tests -pytest - -# Fast tests only (no qualitative, no slow) -pytest -m "not qualitative" - -# Run only slow tests -pytest -m "slow" +```python +@pytest.mark.integration +def test_session_chains_components(mock_backend): + session = start_session(backend=mock_backend) + result = session.instruct("hello") + assert mock_backend.generate.called +``` -# Run ALL tests including slow (bypass config) -pytest --co -q +### E2E (explicit) -# Run only fast unit tests (no LLM calls) -pytest -m "not llm" +**Tests against real backends** — cloud APIs, local servers (ollama), or +GPU-loaded models (huggingface, vllm). No mocks on the critical path. -# Run Ollama tests only (local, light resources) -pytest -m "ollama" +- Add `@pytest.mark.e2e` explicitly, always combined with backend marker(s) +- Resource/capability markers (`requires_gpu`, `requires_heavy_ram`, etc.) + only apply to e2e and qualitative tests +- Assertions are **deterministic** — structural, type-based, or functional -# Run tests that don't require API keys -pytest -m "not requires_api_key" +```python +pytestmark = [pytest.mark.e2e, pytest.mark.ollama] -# Run quality tests for Ollama -pytest -m "ollama and qualitative" +def test_structured_output(session): + result = session.format(Person, "Make up a person") + assert isinstance(json.loads(result.value), dict) ``` -**Note:** By default, `pytest` excludes slow tests (>5 min) but includes qualitative tests (configured in `pyproject.toml`). Use `pytest --co -q` to run all tests including slow ones. 
- -## Marker Categories +### Qualitative (explicit, per-function) -### Backend Markers +**Subset of e2e.** Same infrastructure requirements, but assertions check +**non-deterministic output content** that may vary across model versions or runs. -Specify which backend the test uses: +- Add `@pytest.mark.qualitative` per-function (not module-level) +- Module must also carry `e2e` + backend markers at module level +- Skipped in CI when `CICD=1` +- Included by default in local runs -- **`@pytest.mark.ollama`**: Tests requiring Ollama backend - - Local execution - - Light resources (CPU, ~2-4GB RAM) - - No API key required - - Example: `test/backends/test_ollama.py` - -- **`@pytest.mark.openai`**: Tests requiring OpenAI API - - Requires `OPENAI_API_KEY` environment variable - - Light resources (API calls only) - - Incurs API costs - - Example: `test/backends/test_vision_openai.py` +```python +pytestmark = [pytest.mark.e2e, pytest.mark.ollama] -- **`@pytest.mark.watsonx`**: Tests requiring Watsonx API - - Requires `WATSONX_API_KEY`, `WATSONX_URL`, and `WATSONX_PROJECT_ID` environment variables - - Light resources (API calls only) - - Incurs API costs - - Example: `test/backends/test_watsonx.py` +@pytest.mark.qualitative +def test_greeting_content(session): + result = session.instruct("Write a greeting") + assert "hello" in result.value.lower() +``` -- **`@pytest.mark.huggingface`**: Tests requiring HuggingFace backend - - Local execution - - Heavy resources (GPU recommended, 16-32GB RAM, ~8GB VRAM) - - Downloads models (~3-8GB) - - No API key required - - Example: `test/backends/test_huggingface.py` +**Decision rule:** If swapping the model version could break the assertion +despite the system working correctly, it is `qualitative`. If the assertion +checks structure, types, or functional correctness, it is `e2e`. 
-- **`@pytest.mark.vllm`**: Tests requiring vLLM backend - - Local execution - - Heavy resources (GPU required, 16-32GB RAM, 8GB+ VRAM) - - Example: `test/backends/test_vllm.py` +### The `llm` marker (deprecated) -- **`@pytest.mark.litellm`**: Tests requiring LiteLLM backend - - Requirements depend on underlying backend - - Example: `test/backends/test_litellm_ollama.py` +`llm` is a legacy marker equivalent to `e2e`. It remains registered for +backward compatibility but should not be used in new tests. Use `e2e` instead. -### Capability Markers +The conftest auto-apply hook treats `llm` the same as `e2e` — tests marked +`llm` will not receive the `unit` marker. -Specify resource or authentication requirements: +## Backend Markers -- **`@pytest.mark.requires_api_key`**: Tests requiring external API keys - - Auto-skipped if required API key not found - - Use with backend markers (openai, watsonx) +Backend markers identify which backend a test needs. They enable selective +test runs (`pytest -m ollama`) and drive auto-skip logic. -- **`@pytest.mark.requires_gpu`**: Tests requiring GPU - - Auto-skipped if no GPU detected - - Typically used with huggingface, vllm +**Backend markers only go on e2e and qualitative tests.** Unit and integration +tests don't need real backends. 
-- **`@pytest.mark.requires_heavy_ram`**: Tests requiring 48GB+ RAM - - Auto-skipped if insufficient RAM detected - - Typically used with huggingface, vllm +| Marker | Backend | Resources | +| -------------- | ----------------------------- | ------------------------------------- | +| `ollama` | Ollama (port 11434) | Local, light (~2-4GB RAM) | +| `openai` | OpenAI API or compatible | API calls (may use Ollama `/v1`) | +| `watsonx` | Watsonx API | API calls, requires credentials | +| `huggingface` | HuggingFace transformers | Local, GPU, 48GB+ RAM | +| `vllm` | vLLM | Local, GPU required, 48GB+ RAM | +| `litellm` | LiteLLM (wraps other backends)| Depends on underlying backend | -- **`@pytest.mark.qualitative`**: Non-deterministic quality tests - - Tests LLM output quality rather than infrastructure - - **Included by default** (run with standard `pytest`) - - Skipped in CI (when `CICD=1`) - - May be flaky due to model variability - - Use `pytest -m "not qualitative"` to exclude these tests +### OpenAI-via-Ollama pattern -- **`@pytest.mark.slow`**: Tests taking >5 minutes - - Tests that load large datasets, run extensive evaluations, etc. - - **Excluded by default** (configured in `pyproject.toml` addopts) - - Use `pytest -m slow` or `pytest --co -q` to include these tests +Some tests use the OpenAI client pointed at Ollama's `/v1` endpoint. 
Mark +these with **both** `openai` and `ollama`, but **not** `requires_api_key`: -### Execution Strategy Markers +```python +pytestmark = [pytest.mark.e2e, pytest.mark.openai, pytest.mark.ollama] +``` -- **`@pytest.mark.requires_gpu_isolation`**: Tests requiring OS-level process isolation - - Used for heavy GPU tests (vLLM, HuggingFace) that need CUDA memory fully released between modules - - Only activates when `--isolate-heavy` flag is used or `CICD=1` is set - - Runs each marked module in a separate subprocess to guarantee GPU memory release - - **Not a capability check** - this is an execution strategy, not a resource requirement - - Separate from `requires_gpu` (which checks if GPU exists) - - Example: `test/backends/test_vllm.py`, `test/backends/test_huggingface.py` +## Resource / Capability Markers -### Composite Markers +These markers gate tests on hardware or credentials. They only apply to +e2e and qualitative tests — unit and integration tests should never need them. +Use sparingly. 
-- **`@pytest.mark.llm`**: Tests that make LLM calls - - Requires at least Ollama to be available - - Use to distinguish from pure unit tests +| Marker | Gate | Auto-skip when | +| ------------------------ | ------------------------------------- | ------------------------------------------------- | +| `requires_gpu` | CUDA or MPS | `torch.cuda.is_available()` is False | +| `requires_heavy_ram` | 48GB+ system RAM | `psutil` reports < 48GB | +| `requires_gpu_isolation` | Subprocess isolation for CUDA memory | `--isolate-heavy` not set and `CICD != 1` | +| `requires_api_key` | External API credentials | Env vars missing (checked per backend) | +| `slow` | Tests taking >1 minute | Excluded by default via `pyproject.toml` addopts | +| `qualitative` | Non-deterministic output | Skipped when `CICD=1` | -## Auto-Detection and Skipping +### Typical combinations -The test suite automatically detects your system capabilities and skips tests that cannot run: +- `huggingface` → `requires_gpu` + `requires_heavy_ram` + `requires_gpu_isolation` +- `vllm` → `requires_gpu` + `requires_heavy_ram` + `requires_gpu_isolation` +- `watsonx` → `requires_api_key` +- `openai` → `requires_api_key` only when using real OpenAI API (not Ollama-compatible) -### API Key Detection -```python -# Automatically checks for: -OPENAI_API_KEY # For OpenAI tests -WATSONX_API_KEY # For Watsonx tests -WATSONX_URL # For Watsonx tests -WATSONX_PROJECT_ID # For Watsonx tests -``` +## Auto-Detection -### Backend Availability Detection -```python -# Automatically detects: -- Ollama availability (checks if port 11434 is listening) -``` +The test suite automatically detects system capabilities and skips tests +whose requirements are not met. No configuration needed. 
-### Resource Detection -```python -# Automatically detects: -- GPU availability (via torch.cuda.is_available()) -- GPU memory (via torch.cuda.get_device_properties()) -- System RAM (via psutil.virtual_memory()) -``` +| Capability | How detected | Override flag | +| ---------- | ----------------------------- | ------------------------ | +| Ollama | Port 11434 check | `--ignore-ollama-check` | +| GPU | `torch.cuda.is_available()` | `--ignore-gpu-check` | +| RAM | `psutil.virtual_memory()` | `--ignore-ram-check` | +| API keys | Environment variable check | `--ignore-api-key-check` | +| All | — | `--ignore-all-checks` | -### Skip Messages -When a test is skipped, you'll see helpful messages (use `-rs` flag to show skip reasons): +Use `-rs` with pytest to see skip reasons: ```bash pytest -rs - -# Output: -SKIPPED [1] test/conftest.py:120: Skipping test: OPENAI_API_KEY not found in environment -SKIPPED [1] test/conftest.py:125: Skipping test: GPU not available -SKIPPED [1] test/conftest.py:130: Skipping test: Insufficient RAM (16.0GB < 32GB) -SKIPPED [1] test/conftest.py:165: Skipping test: Ollama not available (port 11434 not listening) ``` -## Usage Examples - -### Module-Level Markers - -Apply markers to all tests in a module using `pytestmark`: +## Common Marker Patterns ```python -# test/backends/test_ollama.py -import pytest - -# All tests in this module require Ollama and make LLM calls -pytestmark = [pytest.mark.ollama, pytest.mark.llm] - -def test_simple_instruct(session): - # This test inherits ollama and llm markers - ... 
-``` - -### Multiple Markers - -Combine markers for complex requirements: +# Unit — no markers needed (auto-applied by conftest) +def test_cblock_repr(): + assert str(CBlock(value="hi")) == "hi" + +# Integration — mocked backend +@pytest.mark.integration +def test_session_with_mock(mock_backend): + session = start_session(backend=mock_backend) + result = session.instruct("hello") + assert mock_backend.generate.called + +# E2E — real Ollama backend, deterministic +pytestmark = [pytest.mark.e2e, pytest.mark.ollama] +def test_structured_output(session): + result = session.format(Person, "Make up a person") + assert isinstance(json.loads(result.value), dict) + +# E2E + qualitative — real backend, non-deterministic +pytestmark = [pytest.mark.e2e, pytest.mark.ollama] +@pytest.mark.qualitative +def test_greeting_content(session): + result = session.instruct("Write a greeting") + assert "hello" in result.value.lower() -```python -# test/backends/test_huggingface.py +# Heavy GPU e2e pytestmark = [ + pytest.mark.e2e, pytest.mark.huggingface, - pytest.mark.llm, pytest.mark.requires_gpu, pytest.mark.requires_heavy_ram, - pytest.mark.requires_gpu_isolation, # Needs process isolation for GPU memory + pytest.mark.requires_gpu_isolation, ] ``` -### Individual Test Markers +## Example Files (`docs/examples/`) -Add markers to specific tests: +Examples use a comment-based marker format instead of `pytestmark`: ```python -@pytest.mark.qualitative -def test_output_quality(session): - # This test checks LLM output quality - result = session.instruct("Write a poem") - assert "poem" in result.value.lower() -``` - -## Running Tests by Category - -### By Backend -```bash -# Ollama only -pytest -m "ollama" - -# HuggingFace only -pytest -m "huggingface" - -# All API-based backends -pytest -m "openai or watsonx" -``` - -### By Resource Requirements -```bash -# Light tests only (no GPU, no heavy RAM) -pytest -m "not (requires_gpu or requires_heavy_ram)" - -# Tests that work without API keys 
-pytest -m "not requires_api_key" - -# GPU tests only -pytest -m "requires_gpu" +# pytest: e2e, ollama, qualitative +"""Example description...""" ``` -### By Test Type -```bash -# Infrastructure tests only (deterministic) -pytest -m "not qualitative" - -# Quality tests only (non-deterministic) -pytest -m "qualitative" +Same classification rules apply. The comment must appear in the first few +lines before non-comment code. Parser: `docs/examples/conftest.py` +(`_extract_markers_from_file`). -# Fast unit tests (no LLM calls) -pytest -m "not llm" -``` - -### Complex Combinations -```bash -# Ollama infrastructure tests -pytest -m "ollama and not qualitative" +## Adding Markers to New Tests -# All tests that work with just Ollama (no API keys, no GPU) -pytest -m "not (requires_api_key or requires_gpu or requires_heavy_ram)" +1. **Classify the test** — unit, integration, e2e, or qualitative? +2. **Add granularity marker** — integration and e2e are explicit; unit is auto-applied +3. **Add backend marker(s)** — only for e2e/qualitative +4. **Add resource markers** — only for e2e/qualitative, only when needed +5. **Verify** — `pytest --collect-only -m "your_marker"` to check -# Quality tests for local backends only -pytest -m "qualitative and (ollama or huggingface or vllm)" -``` +Use the `/audit-markers` skill to validate markers on existing or new test files. 
## CI/CD Integration -### Current Behavior -- `CICD=1` environment variable skips all qualitative tests -- Module-level skips for heavy backends (huggingface, vllm, watsonx) - -### Recommended CI Matrix ```yaml -# .github/workflows/test.yml jobs: unit-tests: - # Fast unit tests, no LLM - run: pytest -m "not llm" + run: pytest -m unit # Fast, no services needed ollama-tests: - # Ollama infrastructure tests - run: pytest -m "ollama and not qualitative" + run: pytest -m "e2e and ollama and not qualitative" quality-tests: - # Optional: Run quality tests on schedule if: github.event_name == 'schedule' run: pytest -m "qualitative and ollama" ``` -## Adding Markers to New Tests - -### Step 1: Identify Requirements -Ask yourself: -1. Which backend does this test use? -2. Does it require an API key? -3. Does it need a GPU? -4. Does it need heavy RAM (48GB+)? -5. Is it testing output quality (qualitative) or infrastructure? - -### Step 2: Add Appropriate Markers - -For a new Ollama test: -```python -# Use module-level marker if all tests use same backend -pytestmark = [pytest.mark.ollama, pytest.mark.llm] - -@pytest.mark.qualitative # Add if testing output quality -def test_my_new_feature(session): - ... -``` - -For a new HuggingFace test: -```python -pytestmark = [ - pytest.mark.huggingface, - pytest.mark.llm, - pytest.mark.requires_gpu, - pytest.mark.requires_heavy_ram, - pytest.mark.requires_gpu_isolation, # Add if needs GPU memory isolation -] - -@pytest.mark.qualitative -def test_my_new_feature(session): - ... 
-``` - -### Step 3: Test Your Markers -```bash -# Verify your test is properly marked -pytest --collect-only -m "your_marker" - -# Run just your test -pytest -k "test_my_new_feature" -``` - -## Troubleshooting - -### Test Not Running -```bash -# Check which markers are applied -pytest --collect-only test/path/to/test.py - -# Check why test is being skipped -pytest -v test/path/to/test.py - -# Force run despite auto-skip (will likely fail if requirements not met) -pytest test/path/to/test.py --runxfail -``` - -### Marker Not Recognized -```bash -# List all registered markers -pytest --markers - -# Check pytest.ini configuration -cat pytest.ini -``` - -### Auto-Skip Not Working -```bash -# Debug system capabilities -pytest --setup-show test/path/to/test.py - -# Check conftest.py detection logic -# See test/conftest.py:get_system_capabilities() - -# Run with verbose output to see skip reasons -pytest -v -s test/path/to/test.py -``` - -### Force Run Tests (Override Auto-Skip) -```bash -# Run specific test ignoring auto-skip (useful for debugging) -pytest test/backends/test_ollama.py --runxfail - -# Run with specific marker, will fail if requirements not met -pytest -m "ollama" -v - -# Note: Tests will fail if actual requirements (Ollama, GPU, etc.) aren't met -# This is useful for testing the test infrastructure itself -``` - -## Best Practices - -1. **Use module-level markers** for consistent backend requirements -2. **Combine markers** to accurately describe test requirements -3. **Keep qualitative marker** for non-deterministic tests -4. **Test locally** before pushing to ensure markers work correctly -5. 
**Document special requirements** in test docstrings +- `CICD=1` skips qualitative tests +- `CICD=1` enables GPU process isolation (`--isolate-heavy` behaviour) +- `slow` tests excluded by default (add `-m slow` to include) ## Related Files -- `test/conftest.py`: Auto-detection and skip logic -- `pyproject.toml`: Marker definitions and pytest configuration - -## Questions? - -For questions or issues with the marker system: -1. Check this guide first -2. Open an issue on GitHub with the `testing` label +- `test/conftest.py` — marker registration, auto-detection, skip logic, unit auto-apply hook +- `docs/examples/conftest.py` — example marker parser (`_extract_markers_from_file`) +- `pyproject.toml` — marker definitions and pytest configuration +- `.agents/skills/audit-markers/SKILL.md` — skill for auditing and fixing markers diff --git a/test/conftest.py b/test/conftest.py index 17cfc2489..3c7b78814 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -336,14 +336,23 @@ def pytest_configure(config): ) config.addinivalue_line("markers", "qualitative: Non-deterministic quality tests") - # Composite markers + # Granularity markers config.addinivalue_line( - "markers", "llm: Tests that make LLM calls (needs at least Ollama)" + "markers", + "unit: Self-contained tests — no services, no I/O (auto-applied when no other granularity marker present)", + ) + config.addinivalue_line( + "markers", + "integration: Tests needing additional services or multi-component wiring (may use fixture-managed dependencies)", + ) + config.addinivalue_line( + "markers", + "e2e: Tests against real backends — cloud APIs, local servers, or GPU-loaded models", ) - # Plugin acceptance markers + # Composite markers (llm is deprecated — use e2e instead) config.addinivalue_line( - "markers", "plugins: Acceptance tests that register all built-in plugin sets" + "markers", "llm: Tests that make LLM calls (deprecated — use e2e instead)" ) # Store vLLM isolation flag in config @@ -661,12 +670,20 @@ def 
pytest_collection_modifyitems(config, items): reason="Ollama not available (port 11434 not listening)" ) + # Auto-apply 'unit' marker to tests without explicit granularity markers. + # This enables `pytest -m unit` without per-file maintenance burden. + _NON_UNIT = {"integration", "e2e", "qualitative", "llm"} + for item in items: # Skip ollama tests if ollama not available if item.get_closest_marker("ollama") and not ignore_ollama: if not capabilities["has_ollama"]: item.add_marker(skip_ollama) + # Auto-apply unit marker + if not any(item.get_closest_marker(m) for m in _NON_UNIT): + item.add_marker(pytest.mark.unit) + # Reorder tests by backend if requested if config.getoption("--group-by-backend", default=False): logger = FancyLogger.get_logger() From 4ea0c50b390e0f1c1ae1c427176d098a2f5c2ff0 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 25 Mar 2026 09:56:52 +0000 Subject: [PATCH 02/42] test: add audit-markers skill for test classification (#728) Skill classifies tests as unit/integration/e2e/qualitative using general heuristics (Part 1) and project-specific rules (Part 2). Includes fixture chain tracing guidance, backend detection heuristics, and example file handling. References MARKERS_GUIDE.md for tables. --- .agents/skills/audit-markers/SKILL.md | 366 ++++++++++++++++++++++++++ AGENTS.md | 43 +-- CONTRIBUTING.md | 45 +--- 3 files changed, 377 insertions(+), 77 deletions(-) create mode 100644 .agents/skills/audit-markers/SKILL.md diff --git a/.agents/skills/audit-markers/SKILL.md b/.agents/skills/audit-markers/SKILL.md new file mode 100644 index 000000000..b42a86228 --- /dev/null +++ b/.agents/skills/audit-markers/SKILL.md @@ -0,0 +1,366 @@ +--- +name: audit-markers +description: > + Audit and fix pytest markers on test files and examples. Classifies tests as + unit/integration/e2e/qualitative using general heuristics and project-specific + marker rules. Use when reviewing markers, auditing test files, or checking + before commit. 
References test/MARKERS_GUIDE.md for project conventions. +argument-hint: "[file-or-directory] [--dry-run]" +compatibility: "Claude Code, IBM Bob" +metadata: + version: "2026-03-25" + capabilities: [read_file, write_file, bash, grep, glob] +--- + +# Audit & Fix Pytest Markers + +Classify tests, validate markers, and fix issues. Works in two layers: +general test classification (applicable to any project) plus project-specific +marker rules for **mellea**. + +## Inputs + +- `$ARGUMENTS` — file path, directory, or glob. If empty, audit `test/` and `docs/examples/`. +- `--dry-run` — report only, do not edit files. + +## Project References + +Read these before auditing — they are the authoritative source for marker conventions: + +- **Marker guide:** `test/MARKERS_GUIDE.md` +- **Marker registration:** `test/conftest.py` (`pytest_configure`) and `pyproject.toml` (`[tool.pytest.ini_options]`) +- **Example marker format:** `docs/examples/conftest.py` (`_extract_markers_from_file`) +- **Epic context:** GitHub issues #726 (epic), #727 (granularity), #728 (backend/resource) + +--- + +# Part 1: General Test Classification + +These principles apply to any test suite, not just mellea. Use them as the +foundation for classifying every test function. + +## The Four Granularity Tiers + +Tests fall into exactly one tier based on what they exercise and what they need: + +### Unit + +**Entirely self-contained** — no services, no I/O, no fixtures that connect +to anything external. Pure logic testing. 
**Recognise by:**
- Imports only from the project and stdlib — no external service clients
- Creates objects directly, calls methods, checks return values
- If it uses test doubles, they replace external boundaries (network, DB, services)
- No fixture that starts/connects to a real or fixture-managed service
- Runs in milliseconds to low seconds
- Would pass on any machine with just the language runtime and project deps

**Examples of unit assertions:**
```python
assert str(cb) == "hello"
assert len(items) == 3
with pytest.raises(ValueError):
    parse(bad_input)
mock_backend.generate.assert_called_once()
```

### Integration

Tests **multiple components working together**, potentially needing additional
services or fixture-managed dependencies. Backends may be mocked, stubbed, or
stood up by test fixtures.

**Recognise by:**
- Creates real instances of multiple project components and wires them together
- External service boundaries may be mocked, stubbed, or managed by fixtures
- Tests that the components interact correctly — data flows, callbacks fire, errors propagate
- May need additional services, but the test controls or provides its dependencies
- Slower than unit (fixture setup, service lifecycle) and may consume more memory

**Key distinction from unit:** Unit is entirely self-contained with no services.
Integration wires up components and may need services (even fixture-managed ones).

**Key distinction from e2e:** Integration controls its dependencies (mocks, stubs,
fixture-managed services). E2E uses real backends that exist independently.

### E2E (End-to-End)

Tests against **real backends** — cloud APIs, local servers, or GPU-loaded
models. No mocks on the critical path.
+ +**Recognise by:** +- Uses a real backend (however started — cloud API, local server, script-launched, GPU-loaded) +- Needs infrastructure: running server, API key, GPU, sufficient RAM +- Fixtures create real service connections, not mocks +- Assertions check that the real service behaved correctly +- Assertions are **deterministic** — structural, type-based, or functional + +**Examples of e2e assertions:** +```python +assert isinstance(result, CBlock) # type check +assert json.loads(result.value) # valid JSON +assert result._meta["status"] == "complete" # status check +assert tool_call.function.name == "get_weather" # tool was invoked +assert result.parsed_repr is not None # output exists +``` + +### Qualitative + +Subset of e2e. Same infrastructure requirements, but assertions are on +**non-deterministic output content** that may vary across model versions, +temperatures, or runs. + +**Recognise by:** +- Same as e2e (real backend, real calls) +- Assertions check semantic content, natural language output, or quality +- A different model version could break the assertion even if the system works correctly + +**Examples of qualitative assertions:** +```python +assert "hello" in result.value.lower() # content check +assert result.value.startswith("Subject") # format of generated text +assert len(result.value.split()) > 50 # output length +assert "error" not in result.value.lower() # absence of bad content +``` + +**The decision rule:** If swapping the model version could break the assertion +despite the system working correctly → `qualitative`. If the assertion checks +structure, types, or functional correctness → `e2e`. + +## When to Ask the User + +Some classifications are ambiguous. 
**Ask for confirmation** when: + +- A test mixes structural and content assertions (e2e vs qualitative) +- A test uses a real backend but only checks that no exception was raised (could be e2e or integration if the backend call is incidental) +- A test patches some but not all external boundaries (partial mock — unit or integration?) +- An assertion is borderline: `assert len(result.value) > 0` could be structural (e2e) or content-dependent (qualitative) + +When asking, present the test code and your reasoning so the user can make an informed decision. + +--- + +# Part 2: Project-Specific Rules + +Read `test/MARKERS_GUIDE.md` for the full marker reference (marker tables, +resource gates, auto-skip logic, common patterns). This section covers only +the **code analysis heuristics** the skill needs to classify tests — things +that require reading the test source code rather than looking up a table. + +## Key project rules + +- `unit` is auto-applied by conftest — **never write it explicitly** +- `llm` is deprecated (synonym for `e2e`) — **flag and recommend replacing** +- Backend/resource markers only go on `e2e`/`qualitative` tests +- `qualitative` is always per-function; module carries `e2e` + backend markers +- If a file mixes unit and non-unit tests, apply markers per-function, not module-level + +## Backend detection heuristics + +When classifying a test file, check ALL of the following to determine which +backend(s) it uses: + +- **Imports:** `from mellea.backends.ollama import ...` → `ollama` +- **Session creation:** `start_session("ollama", ...)` → `ollama`; bare `start_session()` with no backend arg → `ollama` (default backend) +- **Backend constructors:** `OllamaModelBackend(...)` → `ollama`; `OpenAIBackend(...)` → `openai` +- **Environment variables checked:** `OPENAI_API_KEY` → `openai`; `WATSONX_API_KEY` → `watsonx` +- **Dual backends:** `OpenAIBackend` pointed at Ollama's `/v1` endpoint → both `openai` AND `ollama` (but NOT `requires_api_key`) + +## 
Fixture chain tracing + +**This is the most important analysis step.** A test's tier depends on what its +fixtures actually provide. The test function signature alone is not enough — you +must trace each fixture back to its definition to determine whether it connects +to a real backend, a mock, or nothing external. + +### How to trace + +1. **Read the test function signature.** List every fixture parameter + (e.g., `session`, `backend`, `m_session`, `gh_run`). +2. **Locate each fixture definition.** Check (in order): + - Same file (local `@pytest.fixture` functions) + - Nearest `conftest.py` in the same directory + - Parent `conftest.py` files up to `test/conftest.py` + - Root `conftest.py` or plugin-provided fixtures +3. **Follow the chain recursively.** If a fixture depends on another fixture, + trace that one too. Stop when you reach a leaf: a constructor, a mock, or + a conftest-provided value. +4. **Classify the leaf.** The leaf determines the tier: + - **Real backend constructor** (`OllamaModelBackend()`, `LocalHFBackend()`, + `LocalVLLMBackend()`, `OpenAIBackend()`, `WatsonxAIBackend()`, + `LiteLLMBackend()`) → **e2e** + - **`start_session()`** (no mock involved) → **e2e** (default backend is ollama) + - **Subprocess that starts a server** (`subprocess.Popen(["vllm", "serve", ...])`) → **e2e** + - **Mock/MagicMock/patch** replacing the backend → **unit** (if self-contained) + or **integration** (if wiring multiple real components around the mock) + - **No external dependency at all** → **unit** + +### Common fixture chain patterns in this project + +**Pattern 1 — Direct session creation (e2e):** +``` +test_func(session) → session fixture → start_session() → real ollama +``` +Backend: `ollama`. Tier: e2e. + +**Pattern 2 — Backend → session chain (e2e):** +``` +test_func(session) → session(backend) → backend fixture → LocalHFBackend(...) +``` +Backend: `huggingface`. Tier: e2e. 
+ +**Pattern 3 — Process → backend → session chain (e2e):** +``` +test_func(m_session) → m_session(backend) → backend(vllm_process) → vllm_process spawns subprocess +``` +Backend: `vllm` (via OpenAI client). Tier: e2e. + +**Pattern 4 — OpenAI-via-Ollama (e2e, dual markers):** +``` +test_func(m_session) → m_session(backend) → OpenAIBackend(base_url="...ollama.../v1", api_key="ollama") +``` +Backend markers: `openai` + `ollama`. NOT `requires_api_key`. + +**Pattern 5 — Mocked backend (unit or integration):** +``` +test_func(session) → session uses MagicMock/MockBackend/patch +``` +If the test only checks the mock was called → **unit**. +If the test wires real components around the mock → **integration**. + +**Pattern 6 — No backend at all (unit):** +``` +test_func() — or test_func(tmp_path, capsys, ...) +``` +Only uses pytest built-in fixtures. Tier: **unit**. + +### What to watch for + +- **`gh_run` fixture** (from root conftest) — provides CI flag, does NOT indicate + a backend. Ignore for classification purposes. +- **`autouse` fixtures** — `aggressive_cleanup`, `normalize_ollama_host`, + `auto_register_acceptance_sets` are infrastructure. They do not affect tier. +- **Conditional fixture bodies** — some fixtures branch on `gh_run` to choose + model IDs or options. The backend is still real in both branches → still e2e. +- **`pytest.skip()` inside fixtures** — a fixture that skips on CI + (e.g., watsonx) is still e2e when it runs. +- **`MagicMock` vs real instance** — if a fixture returns `MagicMock(spec=Backend)`, + the test is NOT e2e regardless of what the test function does with it. +- **Mixed files** — a file might define both a real `backend` fixture (used by + some tests) and have other tests that don't use any fixture. Classify + per-function, not per-file. 
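The trace in steps 1–4 can be sketched as a walk over a toy fixture-dependency graph. The fixture names and graph shape below are illustrative stand-ins, not the project's real fixtures:

```python
from unittest.mock import MagicMock

# Toy graph: fixture name -> (dependencies, leaf factory or None)
FIXTURES = {
    "m_session": (["backend"], None),
    "backend": (["vllm_process"], None),
    "vllm_process": ([], lambda: "subprocess: vllm serve"),  # real server leaf
    "mock_session": (["mock_backend"], None),
    "mock_backend": ([], lambda: MagicMock()),               # mock leaf
}

def leaf(name):
    """Follow a fixture's dependency chain down to its leaf value."""
    deps, factory = FIXTURES[name]
    return leaf(deps[0]) if deps else factory()

def tier(fixture_name):
    """Classify the leaf: a mock is never e2e; a real connection always is."""
    value = leaf(fixture_name)
    return "unit-or-integration" if isinstance(value, MagicMock) else "e2e"

assert tier("m_session") == "e2e"                     # Pattern 3: subprocess leaf
assert tier("mock_session") == "unit-or-integration"  # Pattern 5: mock leaf
```

A real audit does this by reading conftest files rather than building a graph, but the stopping condition is the same: classify whatever sits at the end of the chain.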
+ +## Resource marker inference + +These are not automatic — verify by reading the code: + +- `huggingface` usually → `requires_gpu` + `requires_heavy_ram` + `requires_gpu_isolation` +- `vllm` usually → `requires_gpu` + `requires_heavy_ram` + `requires_gpu_isolation` +- `watsonx` usually → `requires_api_key` +- `openai` → `requires_api_key` ONLY when using the real OpenAI API (not Ollama-compatible) + +## Example files (`docs/examples/`) + +Examples use a comment-based marker format (not `pytestmark`): + +```python +# pytest: e2e, ollama, qualitative +``` + +Same classification rules apply. Parser: `docs/examples/conftest.py` +(`_extract_markers_from_file`). + +--- + +# Audit Procedure + +## Step 1 — Read and identify + +Read the file fully. Identify: +- Module-level `pytestmark` (test files) or `# pytest:` comment (examples) +- Per-function `@pytest.mark.*` decorators +- Fixtures and their backend dependencies (trace the fixture chain — see above) +- Any use of the deprecated `llm` marker + +**For example files (`docs/examples/`):** Examples are standalone scripts, not +fixture-based tests. Classification comes from reading the code directly — +look for backend imports, `start_session()` calls, and constructor usage. +The `# pytest:` comment is the only marker mechanism (no `pytestmark`). + +## Step 2 — Classify each test function + +For each `def test_*` or `async def test_*`, apply the general classification +from Part 1 using the project-specific heuristics from Part 2: + +1. **Real backend or mocked?** → determines unit/integration vs e2e +2. **Which backend(s)?** → backend markers (e2e only) +3. **Deterministic or content-dependent assertions?** → e2e vs qualitative +4. **What resources?** → resource markers + +If uncertain about a classification (especially qualitative vs e2e), note it +and ask the user to confirm. 
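The decision sequence in steps 1–3 can be distilled into a toy predicate. This is a deliberate simplification (the inputs are assumed flags; resource markers from step 4 are handled separately):

```python
def classify(real_backend: bool, real_components: int, content_asserts: bool) -> str:
    """Toy distillation of the tier decision; a real audit reads code, not flags."""
    if real_backend:
        # Real backend on the critical path: e2e family
        return "qualitative" if content_asserts else "e2e"
    # No real backend: count the real (non-mock) project components exercised
    return "integration" if real_components > 1 else "unit"

assert classify(False, 1, False) == "unit"          # one component, collaborators faked
assert classify(False, 3, False) == "integration"   # real wiring, mocked perimeter
assert classify(True, 2, False) == "e2e"            # real backend, structural asserts
assert classify(True, 2, True) == "qualitative"     # real backend, content asserts
```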
+ +## Step 3 — Compare and report + +Per-file report format: + +``` +## test/backends/test_ollama.py + +Module markers — Current: [llm, ollama] → Proposed: [e2e, ollama] + Note: replace deprecated `llm` with `e2e` + + test_simple_instruct — qualitative ✓ + test_structured_output — Current: qualitative → WRONG: asserts JSON schema, remove qualitative + test_chat — qualitative ✓ +``` + +## Step 4 — Apply fixes (unless `--dry-run`) + +Surgical edits only — change specific marker lines, do not reformat surrounding code. + +When replacing `llm` with `e2e` in `pytestmark` lists, keep the same list structure. + +## Step 5 — Flag infrastructure notes + +Report issues outside marker-edit scope as **notes**. Do NOT fix these: +- Missing conftest skip logic for a backend +- Unregistered markers in pyproject.toml +- MARKERS_GUIDE.md gaps +- Tests with no assertions +- Files mixing unit and e2e tests that could be split + +## Output Summary + +``` +## Audit Summary + +Files audited: N +Files correct: N +Files with issues: N + +Issues by type: + Missing markers: N + Wrong markers: N + Over-marked: N + Deprecated (llm): N + +Changes: N applied / N dry-run +Infrastructure notes: N (see notes section) +``` + +## Infrastructure Note (not part of this skill's scope) + +For `pytest -m unit` to work, the project needs a conftest hook: + +```python +# In test/conftest.py pytest_collection_modifyitems: +_NON_UNIT = ("integration", "e2e", "qualitative", "llm") +for item in items: + if not any(item.get_closest_marker(m) for m in _NON_UNIT): + item.add_marker(pytest.mark.unit) +``` + +The `e2e` and `integration` markers also need registering in `pytest_configure` +and `pyproject.toml`. These are one-time infrastructure changes tracked in +issue #727, not performed by this skill. diff --git a/AGENTS.md b/AGENTS.md index 6c0dbb723..bcb437958 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -47,51 +47,18 @@ uv run mypy . # Type check | `scratchpad/` | Experiments (git-ignored) | ## 3. 
Test Markers -All tests and examples use markers to indicate requirements. The test infrastructure automatically skips tests based on system capabilities. See `test/MARKERS_GUIDE.md` for the full reference. +Tests use a four-tier granularity system (`unit`, `integration`, `e2e`, `qualitative`) plus backend and resource markers. The `unit` marker is auto-applied by conftest — never write it explicitly. The `llm` marker is deprecated; use `e2e` instead. -**Granularity Tiers** (every test belongs to exactly one): +See **[test/MARKERS_GUIDE.md](test/MARKERS_GUIDE.md)** for the full marker reference (tier definitions, backend markers, resource gates, auto-skip logic, common patterns). -- `unit` — Self-contained, no services, no I/O. **Auto-applied by conftest** — never write explicitly. -- `@pytest.mark.integration` — Multiple components wired together, may need fixture-managed services. -- `@pytest.mark.e2e` — Real backends (cloud APIs, local servers, GPU models). Always paired with backend markers. -- `@pytest.mark.qualitative` — Subset of e2e with non-deterministic assertions. Per-function only. -- `@pytest.mark.llm` — **Deprecated**, synonym for `e2e`. Use `e2e` in new tests. 
- -**Backend Markers** (e2e/qualitative only): - -- `@pytest.mark.ollama` — Requires Ollama running (local, lightweight) -- `@pytest.mark.huggingface` — Requires HuggingFace backend (local, heavy) -- `@pytest.mark.vllm` — Requires vLLM backend (local, GPU required) -- `@pytest.mark.openai` — Requires OpenAI API (requires API key) -- `@pytest.mark.watsonx` — Requires Watsonx API (requires API key) -- `@pytest.mark.litellm` — Requires LiteLLM backend - -**Resource/Capability Markers** (e2e/qualitative only): - -- `@pytest.mark.requires_gpu` — Requires GPU -- `@pytest.mark.requires_heavy_ram` — Requires 48GB+ RAM -- `@pytest.mark.requires_api_key` — Requires external API keys -- `@pytest.mark.requires_gpu_isolation` — Requires OS-level process isolation to clear CUDA memory (use with `--isolate-heavy` or `CICD=1`) -- `@pytest.mark.slow` — Tests taking >1 minute (skipped by default) - -**Examples in `docs/examples/`** use comment-based markers for clean code: +**Examples in `docs/examples/`** use comment-based markers: ```python # pytest: e2e, ollama, qualitative """Example description...""" - -# Your clean example code here ``` -Tests/examples automatically skip if system lacks required resources. Heavy examples (e.g., HuggingFace) are skipped during collection to prevent memory issues. - -**Default behavior:** -- `uv run pytest` skips slow tests (>1 min) but runs qualitative tests -- Use `pytest -m "not qualitative"` for fast tests only (~2 min) -- Use `pytest -m unit` for self-contained tests only (fastest) -- Use `pytest -m slow` to include slow tests - -⚠️ Don't add `qualitative` to trivial tests—keep the fast loop fast. -⚠️ Mark tests taking >1 minute with `slow` (e.g., dataset loading, extensive evaluations). +⚠️ Don't add `qualitative` to trivial tests — keep the fast loop fast. +⚠️ Mark tests taking >1 minute with `slow`. ## 4. 
Coding Standards - **Types required** on all core functions diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 31edc181b..c086053f1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -333,19 +333,16 @@ uv run pytest # Fast tests only (no qualitative, ~2 min) uv run pytest -m "not qualitative" -# Run only slow tests (>5 min) -uv run pytest -m slow +# Unit tests only (self-contained, no services) +uv run pytest -m unit -# Run ALL tests including slow (bypass config) -uv run pytest --co -q +# Run only slow tests (>1 min) +uv run pytest -m slow # Run specific backend tests uv run pytest -m "ollama" uv run pytest -m "openai" -# Run tests without LLM calls (unit tests only) -uv run pytest -m "not llm" - # CI/CD mode (skips qualitative tests) CICD=1 uv run pytest @@ -366,37 +363,7 @@ _Note: ollama models can be obtained by running `ollama pull `_ ### Test Markers -Tests are categorized using pytest markers: - -**Backend Markers:** -- `@pytest.mark.ollama` - Requires Ollama running (local, lightweight) -- `@pytest.mark.huggingface` - Requires HuggingFace backend (local, heavy) -- `@pytest.mark.vllm` - Requires vLLM backend (local, GPU required) -- `@pytest.mark.openai` - Requires OpenAI API (requires API key) -- `@pytest.mark.watsonx` - Requires Watsonx API (requires API key) -- `@pytest.mark.litellm` - Requires LiteLLM backend - -**Capability Markers:** -- `@pytest.mark.requires_gpu` - Requires GPU -- `@pytest.mark.requires_heavy_ram` - Requires 48GB+ RAM -- `@pytest.mark.requires_api_key` - Requires external API keys -- `@pytest.mark.qualitative` - LLM output quality tests (skipped in CI via `CICD=1`) -- `@pytest.mark.llm` - Makes LLM calls (needs at least Ollama) -- `@pytest.mark.slow` - Tests taking >5 minutes (skipped via `SKIP_SLOW=1`) - -**Execution Strategy Markers:** -- `@pytest.mark.requires_gpu_isolation` - Requires OS-level process isolation to clear CUDA memory (use with `--isolate-heavy` or `CICD=1`) - -**Default behavior:** -- `uv run pytest` skips slow 
tests (>5 min) but runs qualitative tests -- Use `pytest -m "not qualitative"` for fast tests only (~2 min) -- Use `pytest -m slow` or `pytest --co -q` to include slow tests - -⚠️ **Don't add `qualitative` to trivial tests** - keep the fast loop fast. -⚠️ **Mark tests taking >5 minutes with `slow`** (e.g., dataset loading, extensive evaluations). - -For detailed information about test markers, resource requirements, and running specific -test categories, see [test/MARKERS_GUIDE.md](test/MARKERS_GUIDE.md). +Tests use a four-tier granularity system (`unit`, `integration`, `e2e`, `qualitative`) plus backend and resource markers. See [test/MARKERS_GUIDE.md](test/MARKERS_GUIDE.md) for the full marker reference, including tier definitions, backend markers, resource gates, and auto-skip logic. ### CI/CD Tests @@ -417,7 +384,7 @@ CICD=1 uv run pytest - Fast tests (`-m "not qualitative"`): ~2 minutes - Default tests (qualitative, no slow): Several minutes -- Slow tests (`-m slow`): >5 minutes +- Slow tests (`-m slow`): >1 minute each - Pre-commit hooks: 1-5 minutes ⚠️ **Don't cancel mid-run** - canceling `pytest` or `pre-commit` can corrupt state. From 4f248dc2f5365c1bf55cba65f97994b7c3c553f7 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 25 Mar 2026 10:00:13 +0000 Subject: [PATCH 03/42] chore: add CLAUDE.md and agent skills infrastructure Add CLAUDE.md referencing AGENTS.md for project directives. Add skill-author meta-skill for cross-compatible skill creation. The audit-markers skill was added in the previous commit. 
---
 .agents/skills/skill-author/SKILL.md | 119 ++++++++++++++++++
 .../skill-author/scripts/validate_skill.py | 39 ++++++
 AGENTS.md | 38 ++++--
 CLAUDE.md | 5 +
 4 files changed, 193 insertions(+), 8 deletions(-)
 create mode 100644 .agents/skills/skill-author/SKILL.md
 create mode 100755 .agents/skills/skill-author/scripts/validate_skill.py
 create mode 100644 CLAUDE.md

diff --git a/.agents/skills/skill-author/SKILL.md b/.agents/skills/skill-author/SKILL.md
new file mode 100644
index 000000000..04c1cbe2a
--- /dev/null
+++ b/.agents/skills/skill-author/SKILL.md
@@ -0,0 +1,119 @@
---
name: skill-author
description: >
  Draft, validate, and install new agent skills. Use when asked to create a new
  skill, automate a workflow, or add a capability. Produces cross-compatible
  SKILL.md files that work in both Claude Code and IBM Bob.
argument-hint: "[skill-name]"
compatibility: "Claude Code, IBM Bob"
metadata:
  version: "2026-03-25"
  capabilities: [bash, read_file, write_file]
---

# Skill Authoring Meta-Skill

Create new agent skills that work across Claude Code (CLI/IDE) and IBM Bob.

## Skill Location

Skills live under `.agents/skills/<skill-name>/SKILL.md`.

Discovery configuration varies by tool:
- **Claude Code:** Add `"skillLocations": [".agents/skills"]` to `.claude/settings.json`.
  Without this, Claude Code looks in `.claude/skills/` by default.
- **IBM Bob:** Reads `.bob/skills/`; create a one-time symlink to `.agents/skills/` (see AGENTS.md, "Agent Skills").

Both tools read the same `SKILL.md` format. Use the frontmatter schema below
to maximise compatibility.

## Workflow

1. **Name the skill** — kebab-case, max 64 chars (e.g. `api-tester`, `audit-markers`).

2. **Scaffold the directory:**
   ```
   .agents/skills/<skill-name>/
   ├── SKILL.md # Required — frontmatter + instructions
   ├── scripts/ # Optional — helper scripts
   └── templates/ # Optional — output templates
   ```

3. **Write SKILL.md** — YAML frontmatter + markdown body (see schema below).

4.
**Validate:** + - Check the skill is discoverable: list files in `.agents/skills/`. + - Confirm no frontmatter warnings from the IDE. + - Verify the skill does not conflict with existing skills or `AGENTS.md`. + +## SKILL.md Frontmatter Schema + +Use only fields from the **cross-compatible** set to avoid IDE warnings. + +### Cross-compatible fields (use these) + +| Field | Type | Purpose | +|-------|------|---------| +| `name` | string | Kebab-case identifier. Becomes the `/slash-command`. Max 64 chars. | +| `description` | string | What the skill does and when to trigger it. Be specific — agents use this to decide whether to invoke the skill automatically. | +| `argument-hint` | string | Autocomplete hint. E.g. `"[file] [--dry-run]"`, `"[issue-number]"`. | +| `compatibility` | string | Which tools support this skill. E.g. `"Claude Code, IBM Bob"`. | +| `disable-model-invocation` | boolean | `true` = manual `/name` only, no auto-invocation. | +| `user-invocable` | boolean | `false` = hidden from `/` menu. Use for background knowledge skills. | +| `license` | string | SPDX identifier if publishing. E.g. `"Apache-2.0"`. | +| `metadata` | object | Free-form key-value pairs for tool-specific or custom fields. | + +### Tool-specific fields (put under `metadata`) + +These are useful but not universally supported — nest them under `metadata`: + +```yaml +metadata: + version: "2026-03-25" + capabilities: [bash, read_file, write_file] # Bob/agentskills.io +``` + +Claude Code's `allowed-tools` and `context`/`agent` fields are recognised by +Claude Code but may trigger warnings in Bob's validator. If needed, add them +to `metadata` or accept the warnings. + +### Example frontmatter + +```yaml +--- +name: my-skill +description: > + Does X when Y. Use when asked to Z. 
+argument-hint: "[target] [--flag]" +compatibility: "Claude Code, IBM Bob" +metadata: + version: "2026-03-25" + capabilities: [bash, read_file, write_file] +--- +``` + +## SKILL.md Body Structure + +After frontmatter, write clear markdown instructions the agent follows: + +1. **Context section** — what the skill operates on, key reference files. +2. **Procedure** — numbered steps the agent follows. Be explicit about decisions and edge cases. +3. **Rules / constraints** — hard rules the agent must not break. +4. **Output format** — what the agent should produce (report, edits, summary). + +### Guidelines + +- **Be specific.** Vague instructions produce inconsistent results across models. + "Check if markers are correct" is worse than "Compare the test's assertions + to the qualitative decision rule in section 3." +- **Reference project files.** Point to docs, configs, and examples by relative + path so the agent can read them. E.g. "See `test/MARKERS_GUIDE.md` for the + full marker taxonomy." +- **Declare scope boundaries.** State what the skill does NOT do. E.g. "This + skill does not modify conftest.py — flag infrastructure issues as notes." +- **Use `$ARGUMENTS`** for user input. `$ARGUMENTS` is the full argument string; + `$1`, `$2` etc. are positional. +- **Keep SKILL.md under 500 lines.** Use supporting files for large reference + material (link to them from the body). +- **Portability:** use relative paths from the repo root, never absolute paths. +- **Formatting:** use YYYY-MM-DD for dates, 24-hour clock for times, metric units. 
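Steps 1–3 of the workflow can be scripted in one pass. Everything below is a placeholder sketch (`my-skill` and the field values are assumptions to replace):

```shell
mkdir -p .agents/skills/my-skill/scripts
cat > .agents/skills/my-skill/SKILL.md <<'EOF'
---
name: my-skill
description: >
  Does X when Y. Use when asked to Z.
argument-hint: "[target]"
compatibility: "Claude Code, IBM Bob"
metadata:
  version: "2026-03-25"
---

# My Skill

1. Read the target.
2. Apply the change.
3. Report the result.
EOF
```

The quoted heredoc delimiter (`'EOF'`) keeps the frontmatter literal, so no shell expansion touches the YAML.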
diff --git a/.agents/skills/skill-author/scripts/validate_skill.py b/.agents/skills/skill-author/scripts/validate_skill.py
new file mode 100755
index 000000000..d3a270178
--- /dev/null
+++ b/.agents/skills/skill-author/scripts/validate_skill.py
@@ -0,0 +1,39 @@
"""Validate SKILL.md frontmatter for agent skills."""

import json
import os
import sys

import yaml  # Ensure this is in the agent's environment


def validate_skill(skill_path):
    """Check that a skill directory has valid SKILL.md with required frontmatter keys."""
    skill_file = os.path.join(skill_path, "SKILL.md")

    if not os.path.exists(skill_file):
        return {"status": "error", "message": "Missing SKILL.md"}

    try:
        with open(skill_file) as f:
            content = f.read()
        # Split YAML frontmatter from the body
        if not content.startswith("---"):
            return {"status": "error", "message": "No YAML frontmatter found"}
        parts = content.split("---", 2)
        metadata = yaml.safe_load(parts[1])
        # Validation logic (`version` lives under `metadata`, not at top level)
        required_keys = ["name", "description"]
        for key in required_keys:
            if key not in metadata:
                return {"status": "error", "message": f"Missing key: {key}"}

        return {"status": "success", "data": metadata}
    except Exception as e:
        return {"status": "error", "message": str(e)}


if __name__ == "__main__":
    # Example usage: python3 validate_skill.py ./.agents/skills/new-skill
    result = validate_skill(sys.argv[1])
    print(json.dumps(result))
diff --git a/AGENTS.md b/AGENTS.md
index bcb437958..c329c3bf4 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -44,6 +44,7 @@ uv run mypy . # Type check
| `cli/` | CLI commands (`m serve`, `m alora`, `m decompose`, `m eval`) |
| `test/` | All tests (run from repo root) |
| `docs/examples/` | Example code (run as tests via pytest) |
+| `.agents/skills/` | Agent skills ([agentskills.io](https://agentskills.io) standard) |
| `scratchpad/` | Experiments (git-ignored) |

## 3.
Test Markers @@ -60,7 +61,27 @@ See **[test/MARKERS_GUIDE.md](test/MARKERS_GUIDE.md)** for the full marker refer ⚠️ Don't add `qualitative` to trivial tests — keep the fast loop fast. ⚠️ Mark tests taking >1 minute with `slow`. -## 4. Coding Standards +## 4. Agent Skills + +Skills live in `.agents/skills/` following the [agentskills.io](https://agentskills.io) open standard. Each skill is a directory with a `SKILL.md` file (YAML frontmatter + markdown instructions). + +**Tool discovery:** + +| Tool | Project skills | Global skills | Config needed | +| ----------------- | ----------------- | ------------------- | ------------------------------------------------------------------ | +| Claude Code | `.agents/skills/` | `~/.claude/skills/` | `"skillLocations": [".agents/skills"]` in `.claude/settings.json` | +| IBM Bob | `.bob/skills/` | `~/.bob/skills/` | Symlink: `.bob/skills` → `.agents/skills` | +| VS Code / Copilot | `.agents/skills/` | — | None (auto-discovered) | + +**Bob users:** create the symlink once per clone: + +```bash +mkdir -p .bob && ln -s ../.agents/skills .bob/skills +``` + +**Available skills:** `/audit-markers`, `/skill-author` + +## 5. Coding Standards - **Types required** on all core functions - **Docstrings are prompts** — be specific, the LLM reads them - **Google-style docstrings** — `Args:` on the **class docstring only**; `__init__` gets a single summary sentence. Add `Attributes:` only when a stored value differs in type/behaviour from its constructor input (type transforms, computed values, class constants). See CONTRIBUTING.md for a full example. @@ -70,15 +91,15 @@ See **[test/MARKERS_GUIDE.md](test/MARKERS_GUIDE.md)** for the full marker refer - **Friendly Dependency Errors**: Wraps optional backend imports in `try/except ImportError` with a helpful message (e.g., "Please pip install mellea[hf]"). See `mellea/stdlib/session.py` for examples. 
- **Backend telemetry fields**: All backends must populate `mot.usage` (dict with `prompt_tokens`, `completion_tokens`, `total_tokens`), `mot.model` (str), and `mot.provider` (str) in their `post_processing()` method. Metrics are automatically recorded by `TokenMetricsPlugin` — don't add manual `record_token_usage_metrics()` calls. -## 5. Commits & Hooks +## 6. Commits & Hooks [Angular format](https://github.com/angular/angular/blob/main/CONTRIBUTING.md#commit): `feat:`, `fix:`, `docs:`, `test:`, `refactor:`, `release:` Pre-commit runs: ruff, mypy, uv-lock, codespell -## 6. Timing +## 7. Timing > **Don't cancel**: `pytest` (full) and `pre-commit --all-files` may take minutes. Canceling mid-run can corrupt state. -## 7. Common Issues +## 8. Common Issues | Problem | Fix | |---------|-----| | `ComponentParseError` | Add examples to docstring | @@ -86,21 +107,22 @@ Pre-commit runs: ruff, mypy, uv-lock, codespell | Ollama refused | Run `ollama serve` | | Telemetry import errors | Run `uv sync` to install OpenTelemetry deps | -## 8. Self-Review (before notifying user) +## 9. Self-Review (before notifying user) 1. `uv run pytest test/ -m "not qualitative"` passes? 2. `ruff format` and `ruff check` clean? 3. New functions typed with concise docstrings? 4. Unit tests added for new functionality? 5. Avoided over-engineering? -## 9. Writing Tests +## 10. Writing Tests + - Place tests in `test/` mirroring source structure - Name files `test_*.py` (required for pydocstyle) - Use `gh_run` fixture for CI-aware tests (see `test/conftest.py`) - Mark tests checking LLM output quality with `@pytest.mark.qualitative` - If a test fails, fix the **code**, not the test (unless the test was wrong) -## 10. Writing Docs +## 11. Writing Docs If you are modifying or creating pages under `docs/docs/`, follow the writing conventions in [`docs/docs/guide/CONTRIBUTING.md`](docs/docs/guide/CONTRIBUTING.md). 
@@ -118,7 +140,7 @@ Key rules that differ from typical Markdown habits: mellea source; mark forward-looking content with `> **Coming soon:**` - **No visible TODOs** — if content is missing, open a GitHub issue instead -## 11. Feedback Loop +## 12. Feedback Loop Found a bug, workaround, or pattern? Update the docs: diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..2c1cd8d46 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,5 @@ +# Claude Code Directives +@AGENTS.md + +## Execution +- If instructed to create a new capability, strictly trigger the `skill-author` meta-skill to ensure cross-compatibility. From 9c82f82212c91cd194c1095d583d831095c05adb Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 25 Mar 2026 11:35:17 +0000 Subject: [PATCH 04/42] test: improve audit-markers skill quality and add resource predicates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolve 8 quality issues from dry-run review of the audit-markers skill: - Add behavioural signal detection tables and Step 0 triage procedure for scaling to full-repo audits (grep for backend behaviour, not just existing markers) - Clarify unit/integration boundary with scope-of-mocks rule - Allow module-level qualitative when every function qualifies - Replace resource marker inference with predicate factory pattern - Make llm→e2e rule explicit for # pytest: comments in examples - Redesign report format: 3-tier output (summary table, issues-only detail, batch groups) instead of per-function listing - Remove stale infrastructure note (conftest hook already exists) Add test/predicates.py with reusable skipif decorators: require_gpu, require_ram, require_gpu_isolation, require_api_key, require_package, require_ollama, require_python. Update skill-author with dry-run review step and 4 new authoring guidelines (variable scope, category boundaries, temporal assertions, qualifying absolutes). 
Refs: #727, #728 --- .agents/skills/audit-markers/SKILL.md | 307 ++++++++++++++++++++++---- .agents/skills/skill-author/SKILL.md | 42 +++- test/predicates.py | 213 ++++++++++++++++++ 3 files changed, 519 insertions(+), 43 deletions(-) create mode 100644 test/predicates.py diff --git a/.agents/skills/audit-markers/SKILL.md b/.agents/skills/audit-markers/SKILL.md index b42a86228..18a9784f9 100644 --- a/.agents/skills/audit-markers/SKILL.md +++ b/.agents/skills/audit-markers/SKILL.md @@ -29,6 +29,7 @@ Read these before auditing — they are the authoritative source for marker conv - **Marker guide:** `test/MARKERS_GUIDE.md` - **Marker registration:** `test/conftest.py` (`pytest_configure`) and `pyproject.toml` (`[tool.pytest.ini_options]`) +- **Resource predicates:** `test/predicates.py` (predicate functions for resource gating) - **Example marker format:** `docs/examples/conftest.py` (`_extract_markers_from_file`) - **Epic context:** GitHub issues #726 (epic), #727 (granularity), #728 (backend/resource) @@ -77,12 +78,31 @@ stood up by test fixtures. - May need additional services, but the test controls or provides its dependencies - Slower than unit (fixture setup, service lifecycle) and may consume more memory -**Key distinction from unit:** Unit is entirely self-contained with no services. -Integration wires up components and may need services (even fixture-managed ones). +**Key distinction from unit:** Count the real (non-mock) project components +being exercised. Unit isolates **one** class or function — all collaborators +are faked. Integration wires up **multiple** real components and mocks only +at the external perimeter (network, backend, database). **Key distinction from e2e:** Integration controls its dependencies (mocks, stubs, fixture-managed services). E2E uses real backends that exist independently. 
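That distinction can be sketched with toy stand-ins (hypothetical `Plugin` and `PluginManager` classes, not mellea's real APIs):

```python
from unittest.mock import MagicMock

# Toy stand-ins for illustration; not the project's real classes.
class Plugin:
    def __init__(self, backend):
        self.backend = backend

    def on_generate(self, prompt):
        return self.backend.generate(prompt)


class PluginManager:
    def __init__(self):
        self.plugins = []

    def register(self, plugin):
        self.plugins.append(plugin)

    def invoke_hook(self, prompt):
        return [p.on_generate(prompt) for p in self.plugins]


def test_manager_dispatches_to_plugin():
    # Integration: two real components wired together; only the external
    # backend call is mocked. Breaking register()/invoke_hook() fails this.
    backend = MagicMock()
    backend.generate.return_value = "ok"
    manager = PluginManager()
    manager.register(Plugin(backend))
    assert manager.invoke_hook("hi") == ["ok"]


def test_plugin_forwards_prompt():
    # Unit: only the Plugin's own logic runs; its collaborator is a mock.
    backend = MagicMock()
    Plugin(backend).on_generate("hi")
    backend.generate.assert_called_once_with("hi")
```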
+**Borderline: unit vs integration (the "scope of mocks" rule)** + +When a test uses mocks, look at *what* is mocked to decide: + +- **Mock replaces external I/O only, multiple real internal components wired + together** → **integration**. Example: a test that registers a real `Plugin`, + calls real `invoke_hook()` and `register()`, but passes `MagicMock()` for + the backend. The plugin-manager wiring executes for real; only the LLM call + is faked. +- **Mock replaces internal collaborators too, only one real component under + test** → **unit**. Example: a test that instantiates one `Plugin` but + passes `MagicMock()` for the session, the backend, and the hook dispatcher. + Only the plugin's own logic executes. + +When in doubt, ask: "if I broke the *wiring* between two components, would +this test catch it?" If yes → integration. If no → unit. + ### E2E (End-to-End) Tests against **real backends** — cloud APIs, local servers, or GPU-loaded @@ -127,6 +147,52 @@ assert "error" not in result.value.lower() # absence of bad content despite the system working correctly → `qualitative`. If the assertion checks structure, types, or functional correctness → `e2e`. +## Behavioural Signal Detection + +Before deep-reading a file, grep-able signals reveal whether it is likely unit +or non-unit. Use these to triage at scale (see Audit Procedure, Step 0). 
+ +### Live-backend signals (test is likely NOT unit) + +| Category | Grep patterns | Notes | +|---|---|---| +| Network literals | `localhost`, `127.0.0.1`, `0.0.0.0`, port numbers (`:11434`, `:8000`) | Direct infra dependency | +| HTTP clients | `requests.get(`, `httpx.`, `aiohttp.ClientSession`, `urllib.request.urlopen` | Real network unless mocked | +| Raw networking | `socket.socket(`, `socket.connect(` | Low-level network | +| Subprocess | `subprocess.Popen(`, `subprocess.run(`, `subprocess.call(`, `os.system(` | Spawns external process | +| API credentials | `_API_KEY`, `_TOKEN`, `_SECRET` in `os.environ`/`os.getenv` calls | Credential dependency | +| GPU / model loading | `import torch`, `.to("cuda")`, `.from_pretrained(` | Hardware dependency | +| External downloads | URL literals (`http://`, `https://`), `urlopen`, `requests.get` with URLs | Network fetch | + +### Mock signals (test is likely unit) + +| Category | Grep patterns | +|---|---| +| Mock objects | `MagicMock`, `Mock(`, `AsyncMock`, `create_autospec` | +| Patching | `@patch(`, `@mock.patch`, `monkeypatch`, `mocker` fixture | +| HTTP mocks | `responses`, `respx`, `httpx_mock`, `aioresponses`, `vcr` | + +### Fixture signals (need chain tracing to resolve) + +| Signal | Likely tier | +|---|---| +| `tmp_path`, `capsys`, `monkeypatch`, `caplog` only | Unit | +| Custom fixtures named `session`, `backend`, `m_session` | Could be real or mock — trace the chain | +| Session/module-scoped fixtures (`scope="session"`) | Usually infra setup → e2e | +| Fixture name starts with `mock_`, `fake_`, `stub_` | Unit | + +### Cross-referencing signals + +A single file may contain both live and mock signals. Cross-reference to +determine the correct bucket: + +| Live signals? | Mock signals? 
| Classification | +|---|---|---| +| Yes | No | Almost certainly e2e — deep-read to confirm | +| Yes | Yes | Needs inspection — partial mock = integration, or mixed file | +| No | Yes | Likely unit — skip deep read | +| No | No | Likely unit — skip deep read | + ## When to Ask the User Some classifications are ambiguous. **Ask for confirmation** when: @@ -150,9 +216,12 @@ that require reading the test source code rather than looking up a table. ## Key project rules - `unit` is auto-applied by conftest — **never write it explicitly** -- `llm` is deprecated (synonym for `e2e`) — **flag and recommend replacing** +- `llm` is deprecated (synonym for `e2e`) — **flag and recommend replacing**. + This applies to both `pytestmark` lists and `# pytest:` comments in examples. - Backend/resource markers only go on `e2e`/`qualitative` tests -- `qualitative` is always per-function; module carries `e2e` + backend markers +- `qualitative` is per-function; module carries `e2e` + backend markers. + **Exception:** if every test function in the file is qualitative, module-level + `qualitative` is acceptable to avoid repetitive per-function decorators. 
- If a file mixes unit and non-unit tests, apply markers per-function, not module-level ## Backend detection heuristics @@ -166,6 +235,18 @@ backend(s) it uses: - **Environment variables checked:** `OPENAI_API_KEY` → `openai`; `WATSONX_API_KEY` → `watsonx` - **Dual backends:** `OpenAIBackend` pointed at Ollama's `/v1` endpoint → both `openai` AND `ollama` (but NOT `requires_api_key`) +### Project-specific triage signals + +These supplement the general behavioural signals (Part 1) with mellea patterns: + +| Signal | Grep pattern | Implies | +|---|---|---| +| Backend import | `from mellea.backends.` | e2e (which backend depends on module) | +| Session creation | `start_session(` | e2e, default ollama | +| Backend constructor | `OllamaModelBackend(\|OpenAIBackend(\|LocalHFBackend(\|LocalVLLMBackend(\|WatsonxAIBackend(\|LiteLLMBackend(` | e2e | +| Example marker comment | `# pytest:` | Already classified — validate | +| Ollama port | `11434` | e2e, ollama | + ## Fixture chain tracing **This is the most important analysis step.** A test's tier depends on what its @@ -250,14 +331,82 @@ Only uses pytest built-in fixtures. Tier: **unit**. some tests) and have other tests that don't use any fixture. Classify per-function, not per-file. -## Resource marker inference - -These are not automatic — verify by reading the code: - -- `huggingface` usually → `requires_gpu` + `requires_heavy_ram` + `requires_gpu_isolation` -- `vllm` usually → `requires_gpu` + `requires_heavy_ram` + `requires_gpu_isolation` -- `watsonx` usually → `requires_api_key` -- `openai` → `requires_api_key` ONLY when using the real OpenAI API (not Ollama-compatible) +## Resource gating + +E2e and qualitative tests need gating so they skip cleanly when the required +infrastructure is absent. The preferred mechanism is **predicate functions** +— reusable decorators that encapsulate availability checks. Test authors +apply the predicate that matches their test's actual requirements. 
+ +### The predicate factory pattern (general) + +Projects should provide a shared module of predicate functions that return +`pytest.mark.skipif(...)` decorators. This gives test authors precision +(exact thresholds, specific env vars) without ad-hoc `skipif` or blunt +resource markers scattered across files. + +### What to audit + +Check the project's predicate module (see Project References) for available +predicates, then apply the following checks to every e2e/qualitative file: + +1. **Legacy resource markers → migrate to predicates.** If a test uses + `@pytest.mark.requires_gpu`, `@pytest.mark.requires_heavy_ram`, + `@pytest.mark.requires_api_key`, or `@pytest.mark.requires_gpu_isolation`, + recommend replacing with the equivalent predicate from the project's + predicate module. Resource markers are deprecated in favour of predicates. +2. **Ad-hoc `skipif` → migrate to predicate.** If a predicate exists for + the same check (e.g., `require_gpu()` exists but the test has a raw + `skipif(not torch.cuda.is_available())`), recommend the predicate. +3. **Missing gating.** A test that uses a GPU backend but has no GPU + predicate and no `skipif` — recommend adding the appropriate predicate. +4. **Imprecise gating.** A predicate that's too broad (e.g., `require_ram(48)` + on a test that only needs 16 GB) — suggest tightening the threshold. +5. **Redundant CICD `skipif`.** `skipif(CICD == 1)` is usually redundant + when conftest auto-skip or predicates already handle the condition. + Flag as removable. + +### What NOT to flag + +Not every `skipif` needs migrating. Leave these alone: + +- **Python version gates** (`skipif(sys.version_info < (3, 11))`) — one-off, + or use `require_python()` predicate if available. +- **`importorskip` for optional deps** — idiomatic pytest, or use + `require_package()` predicate if available and a decorator style is preferred. +- **Truly one-off conditions** with no predicate equivalent and no pattern + of recurrence across files. 
+ +For any inline `skipif` that IS NOT covered above, check whether a matching +predicate exists. If it does → recommend migration. If it doesn't and the +same condition appears in multiple files → flag as an infrastructure note +("consider adding a predicate for this condition"). + +Resource gating is orthogonal to tier classification — a test gated by +`require_gpu()` is still e2e/qualitative based on what it exercises. + +### Project predicates (`test/predicates.py`) + +Read `test/predicates.py` for the available predicates. Expected patterns: + +| Predicate | Use when test needs | +|---|---| +| `require_gpu()` | Any GPU (CUDA or MPS) | +| `require_gpu(min_vram_gb=N)` | GPU with at least N GB VRAM | +| `require_ram(min_gb=N)` | N GB+ system RAM | +| `require_gpu_isolation()` | Subprocess isolation for CUDA memory | +| `require_api_key("OPENAI_API_KEY")` | Specific API credentials | +| `require_api_key("WATSONX_API_KEY", "WATSONX_URL", "WATSONX_PROJECT_ID")` | Multiple credentials | +| `require_package("cpex.framework")` | Optional dependency | +| `require_ollama()` | Running Ollama server | +| `require_python((3, 11))` | Minimum Python version | + +Typical combinations for backends: + +- `huggingface` → `require_gpu()` + `require_ram(48)` (adjust RAM per model) +- `vllm` → `require_gpu(min_vram_gb=24)` + `require_ram(48)` +- `watsonx` → `require_api_key("WATSONX_API_KEY", "WATSONX_URL", "WATSONX_PROJECT_ID")` +- `openai` → `require_api_key("OPENAI_API_KEY")` only for real OpenAI (not Ollama-compat) ## Example files (`docs/examples/`) @@ -274,6 +423,48 @@ Same classification rules apply. Parser: `docs/examples/conftest.py` # Audit Procedure +## Step 0 — Triage (for scopes larger than ~5 files) + +When auditing a directory or the full repo, do NOT deep-read every file. +Use behavioural signal detection (Part 1) to bucket files first, then +deep-read only files that need inspection. 
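A compact sketch of the whole triage loop, assuming two hypothetical files and a trimmed-down subset of the signal patterns from the tables in Part 1:

```python
import re
import tempfile
from pathlib import Path

# Trimmed signal sets; a real audit would use the full tables from Part 1.
LIVE = re.compile(r"from mellea\.backends\.|start_session\(|localhost|11434")
MOCK = re.compile(r"MagicMock|AsyncMock|@patch\(")

demo = Path(tempfile.mkdtemp())
(demo / "test_live.py").write_text("from mellea.backends.ollama import OllamaModelBackend\n")
(demo / "test_mock.py").write_text("from unittest.mock import MagicMock\n")

buckets = {}
for path in sorted(demo.glob("test_*.py")):
    text = path.read_text()
    live, mock = bool(LIVE.search(text)), bool(MOCK.search(text))
    if live and mock:
        buckets[path.name] = "P2: inspect"
    elif live:
        buckets[path.name] = "P1/P3: deep-read"  # existing markers decide which
    else:
        buckets[path.name] = "P4: assume unit"

print(buckets)
# {'test_live.py': 'P1/P3: deep-read', 'test_mock.py': 'P4: assume unit'}
```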
+ +### Phase 0: Fixture discovery + +Read `conftest.py` files in the target scope to catalog fixture names. +Classify each fixture as **live** (returns a real backend/connection) or +**mock** (returns a MagicMock, patch, or fake). Record these lists — they +become additional grep patterns for the next phase. + +### Phase 1: Signal grep + +Run grep across all target files for: + +1. **Live-backend signals** — backend imports, constructors, `start_session(`, + network literals (`localhost`, `127.0.0.1`, port numbers), HTTP client + usage, subprocess calls, `_API_KEY`/`_TOKEN`/`_SECRET` in env var checks, + GPU/model loading (`torch`, `.from_pretrained(`), URL literals. +2. **Mock signals** — `MagicMock`, `Mock(`, `AsyncMock`, `@patch(`, + `monkeypatch`, `mocker`, HTTP mock libraries. +3. **Existing markers** — `pytestmark`, `@pytest.mark.`, `# pytest:`. +4. **Live/mock fixture names** from Phase 0. + +### Phase 2: Bucket and prioritise + +Cross-reference the signal hits into four priority buckets: + +| Priority | Condition | Action | +|---|---|---| +| **P1 — Missing markers** | Live signals present, NO existing markers | Deep-read and classify. These are the most likely gaps. | +| **P2 — Mixed signals** | Both live AND mock signals present | Deep-read to determine if integration, partial mock, or mixed file. | +| **P3 — Validate existing** | Live signals present, markers already exist | Spot-check that markers match the actual backend. Replace deprecated `llm`. | +| **P4 — Skip** | No live signals (mock-only or no signals at all) | Likely unit. Report as clean without deep-reading. Spot-check a sample if the count is large. | + +### Phase 3: Deep-read + +Process P1 → P2 → P3 files using Steps 1–5 below. For P4 files, list them +in the summary as "N files — no live-backend signals, assumed unit." + ## Step 1 — Read and identify Read the file fully. Identify: @@ -302,19 +493,59 @@ and ask the user to confirm. 
## Step 3 — Compare and report -Per-file report format: +### Output tiers + +Scale the report detail to the scope of the audit: + +**Tier 1 — Summary table (always).** Print first so the user sees the big +picture before any detail: + +``` +| Category | Files | Functions | +|----------|------:|----------:| +| Correct (no changes) | 42 | — | +| Deprecated `llm` → `e2e` (simple) | 27 | — | +| Missing tier marker | 8 | 12 | +| Wrong marker | 3 | 5 | +| Over-marked | 2 | 4 | +| Missing resource gating | 4 | 6 | +| Legacy resource marker → predicate | 5 | 9 | +| Infrastructure notes | 3 | — | +``` + +**Tier 2 — Issues-only detail.** For each file with at least one issue, +print the file header and **only the functions that need changes**. Omit +functions that are already correct — they are noise at scale: ``` ## test/backends/test_ollama.py Module markers — Current: [llm, ollama] → Proposed: [e2e, ollama] - Note: replace deprecated `llm` with `e2e` + ↳ replace deprecated `llm` with `e2e` + + test_structured_output — WRONG: asserts JSON schema, remove `qualitative` +``` - test_simple_instruct — qualitative ✓ - test_structured_output — Current: qualitative → WRONG: asserts JSON schema, remove qualitative - test_chat — qualitative ✓ +Functions without issues (`test_simple_instruct ✓`, `test_chat ✓`) are +**not listed**. Files where everything is correct appear only in the Tier 1 +count. + +**Tier 3 — Batch groups (for mechanical fixes).** When many files share the +same fix (e.g. `llm` → `e2e` in `pytestmark`), collapse them into a single +block instead of repeating the per-file template: + +``` +### Deprecated `llm` → `e2e` (27 files, module-level pytestmark) + +test/backends/test_ollama.py +test/backends/test_openai.py +test/backends/test_watsonx.py +... (24 more) ``` +The agent should list all files (not truncate) so the user can review before +applying, but one line per file is sufficient when the fix is identical. 
+ ## Step 4 — Apply fixes (unless `--dry-run`) Surgical edits only — change specific marker lines, do not reformat surrounding code. @@ -332,35 +563,27 @@ Report issues outside marker-edit scope as **notes**. Do NOT fix these: ## Output Summary -``` -## Audit Summary - -Files audited: N -Files correct: N -Files with issues: N - -Issues by type: - Missing markers: N - Wrong markers: N - Over-marked: N - Deprecated (llm): N +The output is the Tier 1 summary table (always printed first) followed by +Tier 2 issues-only detail and Tier 3 batch groups as described in Step 3. +End the report with: +``` +--- +Files audited: N | Correct: N | With issues: N Changes: N applied / N dry-run Infrastructure notes: N (see notes section) ``` -## Infrastructure Note (not part of this skill's scope) +## Infrastructure (already in place — do not re-add) -For `pytest -m unit` to work, the project needs a conftest hook: - -```python -# In test/conftest.py pytest_collection_modifyitems: -_NON_UNIT = ("integration", "e2e", "qualitative", "llm") -for item in items: - if not any(item.get_closest_marker(m) for m in _NON_UNIT): - item.add_marker(pytest.mark.unit) -``` +The following infrastructure was set up in #727 and should NOT be recreated +by this skill. If an audit finds these missing, something has regressed — +flag as a blocker, don't silently re-add: -The `e2e` and `integration` markers also need registering in `pytest_configure` -and `pyproject.toml`. These are one-time infrastructure changes tracked in -issue #727, not performed by this skill. +- **Auto-unit hook:** `test/conftest.py` `pytest_collection_modifyitems` adds + `pytest.mark.unit` to any test without `integration`, `e2e`, or `qualitative`. +- **Marker registration:** all tier, backend, and resource markers registered in + `pytest_configure` and `pyproject.toml`. 
+- **Resource predicates:** `test/predicates.py` provides `require_gpu`, + `require_ram`, `require_gpu_isolation`, `require_api_key`, `require_package`, + `require_ollama`, `require_python`. diff --git a/.agents/skills/skill-author/SKILL.md b/.agents/skills/skill-author/SKILL.md index 04c1cbe2a..ca1e5ca8b 100644 --- a/.agents/skills/skill-author/SKILL.md +++ b/.agents/skills/skill-author/SKILL.md @@ -41,7 +41,34 @@ to maximise compatibility. 3. **Write SKILL.md** — YAML frontmatter + markdown body (see schema below). -4. **Validate:** +4. **Dry-run review** — mentally execute the skill against a realistic scenario + before finalising. Walk through the procedure on a concrete example (a real + file in the repo, not a hypothetical) and check for: + - **Scaling gaps:** Does the procedure work for 1 file AND 100 files? If the + skill accepts a directory or glob, it needs a triage strategy (e.g., "grep + first to find candidates, then deep-read only files with issues") — not + just "read every file fully." + - **Boundary ambiguity:** If the skill defines categories or classifications, + test the boundaries between adjacent categories with a real example. The + edges are where agents will disagree or ask the user. Sharpen definitions + until two agents reading the same test would classify it the same way. + - **Stale references:** If the skill describes project state ("this hook needs + to be added", "this marker is not yet registered"), verify those statements + are still true. Embed checks ("read conftest.py to confirm") rather than + assertions that rot. + - **Output format at scale:** Run the report template mentally against the + largest expected input. A per-function report for 5 files is fine; for 165 + files it's unusable. Design output for the largest scope — summary table + first, per-item detail only where issues exist. 
+ - **Format coverage:** If the skill operates on multiple input formats (e.g., + `pytestmark` lists AND `# pytest:` comments), verify each format is + explicitly addressed in the procedure. Implicit coverage causes agents to + skip or guess. + - **Rigid rules:** If you wrote "always X" or "never Y", find the edge case + where the rule is wrong. Add the escape hatch. E.g., "per-function only" + should say "module-level is acceptable when every function qualifies." + +5. **Validate:** - Check the skill is discoverable: list files in `.agents/skills/`. - Confirm no frontmatter warnings from the IDE. - Verify the skill does not conflict with existing skills or `AGENTS.md`. @@ -117,3 +144,16 @@ After frontmatter, write clear markdown instructions the agent follows: material (link to them from the body). - **Portability:** use relative paths from the repo root, never absolute paths. - **Formatting:** use YYYY-MM-DD for dates, 24-hour clock for times, metric units. +- **Design for variable scope.** If the skill can operate on a single file or an + entire directory, provide a triage strategy for the large case. Agents given + "audit everything" with no prioritisation will either read every file (slow) + or skip files (incomplete). +- **Sharpen category boundaries.** When defining classifications, the boundary + between adjacent categories causes the most disagreement. Add a "key + distinction from X" sentence for each pair of adjacent tiers. +- **Avoid temporal assertions.** Don't write "this conftest hook needs to be + added" — write "check whether conftest.py already has the hook." State that + goes stale silently is worse than no guidance at all. +- **Qualify absolutes.** "Always X" and "never Y" rules need escape hatches for + the common exception. E.g., "per-function only — unless every function in the + file qualifies, in which case module-level is acceptable." 
diff --git a/test/predicates.py b/test/predicates.py new file mode 100644 index 000000000..b7d10d085 --- /dev/null +++ b/test/predicates.py @@ -0,0 +1,213 @@ +"""Reusable test predicates for resource-gated test skipping. + +These return ``pytest.mark.skipif`` decorators that test authors apply directly. +Each predicate encapsulates a specific availability check so that: + +- Test authors specify *exactly* what their test needs (not a vague tier). +- Skip reasons are self-documenting. +- No marker registration or conftest hook is required. + +Usage:: + + from test.predicates import require_gpu, require_ram, require_api_key + + @require_gpu() + def test_cuda_basic(): + ... + + @require_gpu(min_vram_gb=24) + def test_large_model(): + ... + + # Module-level gating (applies to all tests in the file): + pytestmark = [pytest.mark.e2e, pytest.mark.huggingface, require_gpu(), require_ram(48)] +""" + +from __future__ import annotations + +import os +import sys + +import pytest + +# --------------------------------------------------------------------------- +# GPU +# --------------------------------------------------------------------------- + + +def _gpu_available() -> bool: + try: + import torch + + return torch.cuda.is_available() or ( + hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + ) + except ImportError: + return False + + +def _gpu_vram_gb() -> float: + """Return VRAM in GB for the first CUDA device, or 0 if unavailable.""" + try: + import torch + + if torch.cuda.is_available(): + return torch.cuda.get_device_properties(0).total_memory / (1024**3) + except (ImportError, RuntimeError): + pass + return 0.0 + + +def require_gpu(*, min_vram_gb: int | None = None): + """Skip unless a GPU is available, optionally with minimum VRAM. + + Args: + min_vram_gb: Minimum VRAM in GB. When ``None``, any GPU suffices. 
+ """ + if not _gpu_available(): + return pytest.mark.skipif(True, reason="No GPU available (CUDA/MPS)") + + if min_vram_gb is not None: + vram = _gpu_vram_gb() + if vram < min_vram_gb: + return pytest.mark.skipif( + True, + reason=f"Insufficient VRAM: {vram:.0f} GB < {min_vram_gb} GB required", + ) + + return pytest.mark.skipif(False, reason="") + + +# --------------------------------------------------------------------------- +# System RAM +# --------------------------------------------------------------------------- + + +def _system_ram_gb() -> float: + try: + import psutil + + return psutil.virtual_memory().total / (1024**3) + except ImportError: + return 0.0 + + +def require_ram(min_gb: int): + """Skip unless the system has at least *min_gb* GB of RAM.""" + ram = _system_ram_gb() + if ram > 0 and ram < min_gb: + return pytest.mark.skipif( + True, reason=f"Insufficient RAM: {ram:.0f} GB < {min_gb} GB required" + ) + return pytest.mark.skipif(False, reason="") + + +# --------------------------------------------------------------------------- +# GPU process isolation +# --------------------------------------------------------------------------- + + +def require_gpu_isolation(): + """Skip unless GPU process isolation is enabled. + + Isolation is active when ``--isolate-heavy`` is passed or ``CICD=1``. + Tests marked with this predicate will be run in separate subprocesses + to prevent CUDA OOM from cross-test memory leaks. + """ + isolate = os.environ.get("CICD", "0") == "1" + # Note: --isolate-heavy is a pytest CLI flag checked at collection time + # by conftest.py. At import time we can only check the env var. 
+ return pytest.mark.skipif( + not (isolate or _gpu_available()), + reason="GPU isolation requires CICD=1 or --isolate-heavy with a GPU", + ) + + +# --------------------------------------------------------------------------- +# API keys / credentials +# --------------------------------------------------------------------------- + + +def require_api_key(*env_vars: str): + """Skip unless all specified environment variables are set. + + Usage:: + + @require_api_key("OPENAI_API_KEY") + def test_openai_chat(): ... + + @require_api_key("WATSONX_API_KEY", "WATSONX_URL", "WATSONX_PROJECT_ID") + def test_watsonx_generate(): ... + """ + missing = [v for v in env_vars if not os.environ.get(v)] + if missing: + return pytest.mark.skipif( + True, reason=f"Missing environment variables: {', '.join(missing)}" + ) + return pytest.mark.skipif(False, reason="") + + +# --------------------------------------------------------------------------- +# Optional dependencies +# --------------------------------------------------------------------------- + + +def require_package(package: str): + """Skip unless *package* is importable. + + For simple cases, ``pytest.importorskip(package)`` at module level is + equivalent and more idiomatic. This predicate is useful when you want + a decorator rather than a module-level call:: + + @require_package("cpex.framework") + def test_plugin_registration(): ... 
+ """ + try: + __import__(package) + available = True + except ImportError: + available = False + + return pytest.mark.skipif(not available, reason=f"{package} not installed") + + +# --------------------------------------------------------------------------- +# Service reachability +# --------------------------------------------------------------------------- + + +def require_ollama(*, host: str = "localhost", port: int = 11434): + """Skip unless Ollama is reachable on the given host/port.""" + import socket + + try: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(1) + result = sock.connect_ex((host, port)) + sock.close() + reachable = result == 0 + except Exception: + reachable = False + + return pytest.mark.skipif( + not reachable, reason=f"Ollama not available at {host}:{port}" + ) + + +# --------------------------------------------------------------------------- +# Python version +# --------------------------------------------------------------------------- + + +def require_python(min_version: tuple[int, ...]): + """Skip unless running on at least the given Python version. + + Usage:: + + @require_python((3, 11)) + async def test_asyncio_timeout(): ... + """ + version_str = ".".join(str(v) for v in min_version) + return pytest.mark.skipif( + sys.version_info < min_version, reason=f"Requires Python {version_str}+" + ) From 4f1db520dde5f086290125ead6b62fc44b5b6d18 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 25 Mar 2026 11:36:34 +0000 Subject: [PATCH 05/42] chore: remove issue references from audit-markers skill Epic/issue numbers are task context, not permanent skill knowledge. 
--- .agents/skills/audit-markers/SKILL.md | 1 - 1 file changed, 1 deletion(-) diff --git a/.agents/skills/audit-markers/SKILL.md b/.agents/skills/audit-markers/SKILL.md index 18a9784f9..b7cbed69e 100644 --- a/.agents/skills/audit-markers/SKILL.md +++ b/.agents/skills/audit-markers/SKILL.md @@ -31,7 +31,6 @@ Read these before auditing — they are the authoritative source for marker conv - **Marker registration:** `test/conftest.py` (`pytest_configure`) and `pyproject.toml` (`[tool.pytest.ini_options]`) - **Resource predicates:** `test/predicates.py` (predicate functions for resource gating) - **Example marker format:** `docs/examples/conftest.py` (`_extract_markers_from_file`) -- **Epic context:** GitHub issues #726 (epic), #727 (granularity), #728 (backend/resource) --- From fc72f3f3ccf0b90e03ceff4af545785f374ad50a Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 25 Mar 2026 11:41:13 +0000 Subject: [PATCH 06/42] docs: align MARKERS_GUIDE.md with predicate factory pattern MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MARKERS_GUIDE.md documented legacy resource markers (requires_gpu, etc.) as the active convention while SKILL.md instructed migration to predicates — a direct conflict that would cause the audit agent to stall or produce incorrect edits. 
- Replace resource markers section with predicate-first documentation - Move legacy markers to deprecated subsection (conftest still handles them) - Update common patterns example to use predicate imports - Add test/predicates.py to related files - Add explicit dry-run enforcement to SKILL.md Step 4 Refs: #727, #728 --- .agents/skills/audit-markers/SKILL.md | 3 ++ test/MARKERS_GUIDE.md | 76 +++++++++++++++++---------- 2 files changed, 51 insertions(+), 28 deletions(-) diff --git a/.agents/skills/audit-markers/SKILL.md b/.agents/skills/audit-markers/SKILL.md index b7cbed69e..032975e52 100644 --- a/.agents/skills/audit-markers/SKILL.md +++ b/.agents/skills/audit-markers/SKILL.md @@ -547,6 +547,9 @@ applying, but one line per file is sufficient when the fix is identical. ## Step 4 — Apply fixes (unless `--dry-run`) +**If `--dry-run` is active, do NOT execute any file writes or bash commands that +modify code. Output the report only.** + Surgical edits only — change specific marker lines, do not reformat surrounding code. When replacing `llm` with `e2e` in `pytestmark` lists, keep the same list structure. diff --git a/test/MARKERS_GUIDE.md b/test/MARKERS_GUIDE.md index aec85a104..95aebc002 100644 --- a/test/MARKERS_GUIDE.md +++ b/test/MARKERS_GUIDE.md @@ -68,8 +68,8 @@ def test_session_chains_components(mock_backend): GPU-loaded models (huggingface, vllm). No mocks on the critical path. - Add `@pytest.mark.e2e` explicitly, always combined with backend marker(s) -- Resource/capability markers (`requires_gpu`, `requires_heavy_ram`, etc.) - only apply to e2e and qualitative tests +- Resource predicates (`require_gpu()`, `require_ram()`, etc.) only apply to + e2e and qualitative tests — see "Resource Gating" section below - Assertions are **deterministic** — structural, type-based, or functional ```python @@ -131,33 +131,55 @@ tests don't need real backends. ### OpenAI-via-Ollama pattern Some tests use the OpenAI client pointed at Ollama's `/v1` endpoint. 
Mark -these with **both** `openai` and `ollama`, but **not** `requires_api_key`: +these with **both** `openai` and `ollama`, but do **not** add `require_api_key`: ```python pytestmark = [pytest.mark.e2e, pytest.mark.openai, pytest.mark.ollama] ``` -## Resource / Capability Markers +## Resource Gating (Predicates) -These markers gate tests on hardware or credentials. They only apply to -e2e and qualitative tests — unit and integration tests should never need them. -Use sparingly. +E2E and qualitative tests need gating so they skip cleanly when required +infrastructure is absent. Use **predicate decorators** from `test/predicates.py` +— they give test authors precise control over skip conditions. -| Marker | Gate | Auto-skip when | -| ------------------------ | ------------------------------------- | ------------------------------------------------- | -| `requires_gpu` | CUDA or MPS | `torch.cuda.is_available()` is False | -| `requires_heavy_ram` | 48GB+ system RAM | `psutil` reports < 48GB | -| `requires_gpu_isolation` | Subprocess isolation for CUDA memory | `--isolate-heavy` not set and `CICD != 1` | -| `requires_api_key` | External API credentials | Env vars missing (checked per backend) | -| `slow` | Tests taking >1 minute | Excluded by default via `pyproject.toml` addopts | -| `qualitative` | Non-deterministic output | Skipped when `CICD=1` | +```python +from test.predicates import require_gpu, require_ram, require_api_key +``` + +| Predicate | Use when test needs | +| --------- | ------------------- | +| `require_gpu()` | Any GPU (CUDA or MPS) | +| `require_gpu(min_vram_gb=N)` | GPU with at least N GB VRAM | +| `require_ram(min_gb=N)` | N GB+ system RAM | +| `require_gpu_isolation()` | Subprocess isolation for CUDA memory | +| `require_api_key("ENV_VAR")` | Specific API credentials | +| `require_package("pkg")` | Optional dependency | +| `require_ollama()` | Running Ollama server | +| `require_python((3, 11))` | Minimum Python version | ### Typical 
combinations -- `huggingface` → `requires_gpu` + `requires_heavy_ram` + `requires_gpu_isolation` -- `vllm` → `requires_gpu` + `requires_heavy_ram` + `requires_gpu_isolation` -- `watsonx` → `requires_api_key` -- `openai` → `requires_api_key` only when using real OpenAI API (not Ollama-compatible) +- `huggingface` → `require_gpu()` + `require_ram(min_gb=48)` (adjust per model) +- `vllm` → `require_gpu(min_vram_gb=24)` + `require_ram(min_gb=48)` +- `watsonx` → `require_api_key("WATSONX_API_KEY", "WATSONX_URL", "WATSONX_PROJECT_ID")` +- `openai` → `require_api_key("OPENAI_API_KEY")` only for real OpenAI (not Ollama-compat) + +### Other gating markers + +These are not resource predicates but still control test selection: + +| Marker | Gate | Auto-skip when | +| -------------- | -------------------------------- | ------------------------------------------------ | +| `slow` | Tests taking >1 minute | Excluded by default via `pyproject.toml` addopts | +| `qualitative` | Non-deterministic output | Skipped when `CICD=1` | + +### Legacy resource markers (deprecated) + +The markers `requires_gpu`, `requires_heavy_ram`, `requires_api_key`, and +`requires_gpu_isolation` are deprecated. Existing tests using them still work +(conftest auto-skip logic handles them) but new tests should use predicates. +Migrate legacy markers to predicates when touching those files. 
## Auto-Detection @@ -204,14 +226,11 @@ def test_greeting_content(session): result = session.instruct("Write a greeting") assert "hello" in result.value.lower() -# Heavy GPU e2e -pytestmark = [ - pytest.mark.e2e, - pytest.mark.huggingface, - pytest.mark.requires_gpu, - pytest.mark.requires_heavy_ram, - pytest.mark.requires_gpu_isolation, -] +# Heavy GPU e2e (predicates for resource gating) +from test.predicates import require_gpu, require_ram, require_gpu_isolation + +pytestmark = [pytest.mark.e2e, pytest.mark.huggingface, + require_gpu(), require_ram(min_gb=48), require_gpu_isolation()] ``` ## Example Files (`docs/examples/`) @@ -232,7 +251,7 @@ lines before non-comment code. Parser: `docs/examples/conftest.py` 1. **Classify the test** — unit, integration, e2e, or qualitative? 2. **Add granularity marker** — integration and e2e are explicit; unit is auto-applied 3. **Add backend marker(s)** — only for e2e/qualitative -4. **Add resource markers** — only for e2e/qualitative, only when needed +4. **Add resource predicates** — only for e2e/qualitative, use `test/predicates.py` 5. **Verify** — `pytest --collect-only -m "your_marker"` to check Use the `/audit-markers` skill to validate markers on existing or new test files. @@ -259,6 +278,7 @@ jobs: ## Related Files - `test/conftest.py` — marker registration, auto-detection, skip logic, unit auto-apply hook +- `test/predicates.py` — resource gating predicates (`require_gpu`, `require_ram`, etc.) 
- `docs/examples/conftest.py` — example marker parser (`_extract_markers_from_file`) - `pyproject.toml` — marker definitions and pytest configuration - `.agents/skills/audit-markers/SKILL.md` — skill for auditing and fixing markers From 845c6adcc965ace1f0f78942993091180748bbb2 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 25 Mar 2026 11:41:57 +0000 Subject: [PATCH 07/42] fix: validate_skill.py schema mismatch and brittle YAML parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs: - Required `version` at root level but skill-author guide nests it under `metadata` — guaranteed failure on valid skills - Naive `content.split('---')` breaks on markdown horizontal rules Fix: use yaml.safe_load_all for robust frontmatter extraction, check `name`/`description` at root and `version` under `metadata.version`. --- .../skill-author/scripts/validate_skill.py | 45 ++++++++++++------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/.agents/skills/skill-author/scripts/validate_skill.py b/.agents/skills/skill-author/scripts/validate_skill.py index d3a270178..170e646c1 100755 --- a/.agents/skills/skill-author/scripts/validate_skill.py +++ b/.agents/skills/skill-author/scripts/validate_skill.py @@ -4,10 +4,10 @@ import os import sys -import yaml # Ensure this is in the agent's environment +import yaml -def validate_skill(skill_path): +def validate_skill(skill_path: str) -> dict: """Check that a skill directory has valid SKILL.md with required frontmatter keys.""" skill_file = os.path.join(skill_path, "SKILL.md") @@ -16,24 +16,37 @@ def validate_skill(skill_path): try: with open(skill_file) as f: - content = f.read() - # Split YAML frontmatter - if content.startswith("---"): - parts = content.split("---") - metadata = yaml.safe_load(parts[1]) + # safe_load_all handles the --- delimiters correctly and won't + # break on markdown horizontal rules later in the file. 
+            frontmatter = next(yaml.safe_load_all(f))
 
-            # Validation Logic
-            required_keys = ["name", "description", "version"]
-            for key in required_keys:
-                if key not in metadata:
-                    return {"status": "error", "message": f"Missing key: {key}"}
+            if not isinstance(frontmatter, dict):
+                return {"status": "error", "message": "Frontmatter is not a YAML mapping"}
 
-            return {"status": "success", "data": metadata}
-    except Exception as e:
-        return {"status": "error", "message": str(e)}
+            # Root-level required keys
+            for key in ("name", "description"):
+                if key not in frontmatter:
+                    return {"status": "error", "message": f"Missing root key: {key}"}
+
+            # version lives under metadata (per skill-author guide)
+            meta = frontmatter.get("metadata")
+            if not isinstance(meta, dict) or "version" not in meta:
+                return {
+                    "status": "error",
+                    "message": "Missing nested key: metadata.version",
+                }
+
+            return {"status": "success", "data": frontmatter}
+
+    except yaml.YAMLError as e:
+        return {"status": "error", "message": f"Invalid YAML: {e}"}
+    except StopIteration:
+        return {"status": "error", "message": "No YAML frontmatter found"}
 
 
 if __name__ == "__main__":
-    # Example usage: python3 validate_skill.py ./.agents/skills/new-skill
+    if len(sys.argv) < 2:
+        print("Usage: python3 validate_skill.py <skill_path>", file=sys.stderr)
+        sys.exit(1)
     result = validate_skill(sys.argv[1])
     print(json.dumps(result))

From 62d4bbde42b62cabd2b54d9903eee752ed4aa75b Mon Sep 17 00:00:00 2001
From: Nigel Jones
Date: Wed, 25 Mar 2026 12:02:48 +0000
Subject: fix: migrate deprecated llm markers to e2e, add backend registry,
 update audit-markers skill

- Replace all `pytest.mark.llm` with `pytest.mark.e2e` across 34 test
  files and 87 example files (comment-based markers)
- Add `BACKEND_MARKERS` data-driven registry in test/conftest.py as
  single source of truth for backend marker registration
- Register `bedrock` backend marker in conftest.py, pyproject.toml,
  MARKERS_GUIDE.md, and add missing marker to test_bedrock.py
- Reclassify test_alora_train.py as integration (was unit); add importorskip for peft dependency - Add missing `e2e` tier markers to test_tracing.py and test_tracing_backend.py - Update audit-markers skill: report-first default, predicate migration as fix (not recommendation), backend registry gap detection --- .agents/skills/audit-markers/SKILL.md | 72 ++++++++++++++----- docs/examples/aLora/101_example.py | 2 +- docs/examples/aLora/102_example.py | 2 +- .../aLora/example_readme_generator.py | 2 +- docs/examples/aLora/make_training_data.py | 2 +- docs/examples/aLora/stembolts_intrinsic.py | 2 +- .../agents/react/react_from_scratch/react.py | 2 +- .../react_from_scratch/react_instruct.py | 2 +- .../agents/react/react_using_mellea.py | 2 +- .../context/contexts_with_sampling.py | 2 +- .../generative_slots/generate_with_context.py | 2 +- .../generative_slots/generative_gsm8k.py | 2 +- .../generative_slots/generative_slots.py | 2 +- .../generative_slots_with_requirements.py | 2 +- .../decision_aides.py | 2 +- .../summarize_and_decide.py | 2 +- .../inter_module_composition/summarizers.py | 2 +- .../generative_slots/investment_advice.py | 2 +- docs/examples/hello_world.py | 2 +- docs/examples/helper/helpers.py | 2 +- .../vision_litellm_backend.py | 2 +- .../image_text_models/vision_ollama_chat.py | 2 +- .../vision_openai_examples.py | 2 +- .../101_with_gen_slots.py | 2 +- .../advanced_with_m_instruct.py | 2 +- .../instruct_validate_repair/101_email.py | 2 +- .../101_email_comparison.py | 2 +- .../101_email_with_requirements.py | 2 +- .../101_email_with_validate.py | 2 +- .../advanced_email_with_validate_function.py | 2 +- .../multiturn_strategy_example.py | 2 +- .../qiskit_code_validation.py | 2 +- docs/examples/intrinsics/answerability.py | 2 +- docs/examples/intrinsics/citations.py | 2 +- .../intrinsics/context_attribution.py | 2 +- docs/examples/intrinsics/context_relevance.py | 2 +- .../intrinsics/factuality_correction.py | 2 +- 
.../intrinsics/factuality_detection.py | 2 +- docs/examples/intrinsics/guardian_core.py | 2 +- .../intrinsics/hallucination_detection.py | 2 +- docs/examples/intrinsics/intrinsics.py | 2 +- docs/examples/intrinsics/policy_guardrails.py | 2 +- .../intrinsics/query_clarification.py | 2 +- docs/examples/intrinsics/query_rewrite.py | 2 +- docs/examples/intrinsics/requirement_check.py | 2 +- docs/examples/intrinsics/uncertainty.py | 2 +- .../library_interop/langchain_messages.py | 2 +- .../python/python_decompose_example.py | 2 +- .../m_serve/m_serve_example_simple.py | 2 +- docs/examples/melp/lazy.py | 2 +- docs/examples/melp/lazy_fib.py | 2 +- docs/examples/melp/lazy_fib_sample.py | 2 +- docs/examples/melp/simple_example.py | 2 +- docs/examples/melp/states.py | 2 +- docs/examples/mify/mify.py | 2 +- docs/examples/mify/rich_document_advanced.py | 2 +- .../examples/mify/rich_table_execute_basic.py | 2 +- docs/examples/mini_researcher/context_docs.py | 2 +- docs/examples/mini_researcher/researcher.py | 2 +- docs/examples/mobject/table.py | 2 +- docs/examples/plugins/class_plugin.py | 2 +- docs/examples/plugins/execution_modes.py | 2 +- docs/examples/plugins/payload_modification.py | 2 +- docs/examples/plugins/plugin_scoped.py | 2 +- .../plugins/plugin_set_composition.py | 2 +- docs/examples/plugins/quickstart.py | 2 +- docs/examples/plugins/session_scoped.py | 2 +- docs/examples/plugins/standalone_hooks.py | 2 +- docs/examples/plugins/tool_hooks.py | 2 +- docs/examples/safety/guardian.py | 2 +- docs/examples/safety/guardian_huggingface.py | 2 +- docs/examples/safety/repair_with_guardian.py | 2 +- .../creating_a_new_type_of_session.py | 2 +- docs/examples/sofai/sofai_graph_coloring.py | 2 +- docs/examples/telemetry/metrics_example.py | 2 +- docs/examples/telemetry/telemetry_example.py | 2 +- docs/examples/tools/interpreter_example.py | 2 +- docs/examples/tools/smolagents_example.py | 2 +- docs/examples/tools/tool_decorator_example.py | 2 +- 
.../compositionality_with_generative_slots.py | 2 +- docs/examples/tutorial/context_example.py | 2 +- docs/examples/tutorial/document_mobject.py | 2 +- docs/examples/tutorial/example.py | 2 +- .../tutorial/instruct_validate_repair.py | 2 +- .../tutorial/model_options_example.py | 2 +- .../examples/tutorial/sentiment_classifier.py | 2 +- docs/examples/tutorial/simple_email.py | 2 +- docs/examples/tutorial/table_mobject.py | 2 +- pyproject.toml | 1 + test/MARKERS_GUIDE.md | 1 + test/backends/test_bedrock.py | 3 +- test/backends/test_huggingface.py | 2 +- test/backends/test_huggingface_tools.py | 2 +- test/backends/test_litellm_ollama.py | 2 +- test/backends/test_litellm_watsonx.py | 2 +- test/backends/test_mellea_tool.py | 4 +- test/backends/test_ollama.py | 2 +- test/backends/test_openai_ollama.py | 2 +- test/backends/test_openai_vllm.py | 2 +- test/backends/test_tool_calls.py | 2 +- test/backends/test_vision_ollama.py | 2 +- test/backends/test_vision_openai.py | 2 +- test/backends/test_vllm.py | 2 +- test/backends/test_vllm_tools.py | 2 +- test/backends/test_watsonx.py | 2 +- test/cli/test_alora_train.py | 10 +-- test/cli/test_alora_train_integration.py | 2 +- test/conftest.py | 36 +++++----- test/core/test_astream_incremental.py | 12 ++-- test/core/test_component_typing.py | 6 +- test/core/test_model_output_thunk.py | 2 +- test/stdlib/components/intrinsic/test_core.py | 2 +- .../components/intrinsic/test_guardian.py | 2 +- test/stdlib/components/intrinsic/test_rag.py | 2 +- test/stdlib/components/test_genslot.py | 2 +- test/stdlib/requirements/test_requirement.py | 2 +- test/stdlib/sampling/test_majority_voting.py | 2 +- test/stdlib/sampling/test_sampling_ctx.py | 2 +- .../sampling/test_sofai_graph_coloring.py | 2 +- test/stdlib/sampling/test_sofai_sampling.py | 2 +- .../sampling/test_think_budget_forcing.py | 2 +- test/stdlib/test_chat_view.py | 2 +- test/stdlib/test_functional.py | 2 +- test/stdlib/test_session.py | 2 +- test/stdlib/test_spans.py | 7 +- 
test/telemetry/test_metrics_backend.py | 10 +-- test/telemetry/test_tracing.py | 4 ++ test/telemetry/test_tracing_backend.py | 8 ++- 128 files changed, 229 insertions(+), 175 deletions(-) diff --git a/.agents/skills/audit-markers/SKILL.md b/.agents/skills/audit-markers/SKILL.md index 032975e52..3ecad67f6 100644 --- a/.agents/skills/audit-markers/SKILL.md +++ b/.agents/skills/audit-markers/SKILL.md @@ -5,10 +5,10 @@ description: > unit/integration/e2e/qualitative using general heuristics and project-specific marker rules. Use when reviewing markers, auditing test files, or checking before commit. References test/MARKERS_GUIDE.md for project conventions. -argument-hint: "[file-or-directory] [--dry-run]" +argument-hint: "[file-or-directory] [--dry-run | --apply]" compatibility: "Claude Code, IBM Bob" metadata: - version: "2026-03-25" + version: "2026-03-26" capabilities: [read_file, write_file, bash, grep, glob] --- @@ -21,7 +21,9 @@ marker rules for **mellea**. ## Inputs - `$ARGUMENTS` — file path, directory, or glob. If empty, audit `test/` and `docs/examples/`. -- `--dry-run` — report only, do not edit files. +- **No flags (default)** — produce report, then ask user to confirm before applying. +- `--apply` — produce report and apply fixes without asking. +- `--dry-run` — report only, do not offer to apply. ## Project References @@ -352,15 +354,19 @@ predicates, then apply the following checks to every e2e/qualitative file: 1. **Legacy resource markers → migrate to predicates.** If a test uses `@pytest.mark.requires_gpu`, `@pytest.mark.requires_heavy_ram`, `@pytest.mark.requires_api_key`, or `@pytest.mark.requires_gpu_isolation`, - recommend replacing with the equivalent predicate from the project's - predicate module. Resource markers are deprecated in favour of predicates. + replace with the equivalent predicate from the project's predicate module. + Resource markers are deprecated in favour of predicates. 
This is a **fix** + (same priority as `llm` → `e2e`), not just a recommendation — apply it in + Step 4 like any other marker fix. The replacement requires adding an import + for the predicate and swapping the marker in the `pytestmark` list or + decorator. 2. **Ad-hoc `skipif` → migrate to predicate.** If a predicate exists for the same check (e.g., `require_gpu()` exists but the test has a raw - `skipif(not torch.cuda.is_available())`), recommend the predicate. + `skipif(not torch.cuda.is_available())`), replace with the predicate. 3. **Missing gating.** A test that uses a GPU backend but has no GPU - predicate and no `skipif` — recommend adding the appropriate predicate. + predicate and no `skipif` — add the appropriate predicate. 4. **Imprecise gating.** A predicate that's too broad (e.g., `require_ram(48)` - on a test that only needs 16 GB) — suggest tightening the threshold. + on a test that only needs 16 GB) — tighten the threshold. 5. **Redundant CICD `skipif`.** `skipif(CICD == 1)` is usually redundant when conftest auto-skip or predicates already handle the condition. Flag as removable. @@ -545,21 +551,46 @@ test/backends/test_watsonx.py The agent should list all files (not truncate) so the user can review before applying, but one line per file is sufficient when the fix is identical. -## Step 4 — Apply fixes (unless `--dry-run`) +## Step 4 — Apply fixes -**If `--dry-run` is active, do NOT execute any file writes or bash commands that -modify code. Output the report only.** +The apply behaviour depends on the flags passed: -Surgical edits only — change specific marker lines, do not reformat surrounding code. +| Flag | Behaviour | +|------|-----------| +| *(none)* | Output the full report (Steps 1–3 + Output Summary), then **ask the user** "Apply these N changes?" before writing any files. | +| `--apply` | Output the full report, then apply all fixes **without asking**. | +| `--dry-run` | Output the full report. Do NOT write any files or offer to apply. 
| + +**When applying fixes:** +Surgical edits only — change specific marker lines, do not reformat surrounding code. When replacing `llm` with `e2e` in `pytestmark` lists, keep the same list structure. +When replacing legacy resource markers with predicates, add the necessary import +(`from test.predicates import ...`) at the top of the file and swap the marker +in the `pytestmark` list or decorator. + +## Step 5 — Backend registry audit + +Check that every backend used in test files has a registered marker. +The project's backend registry is `BACKEND_MARKERS` in `test/conftest.py` +(single source of truth). Markers must also appear in `pyproject.toml` +`[tool.pytest.ini_options].markers` and in `test/MARKERS_GUIDE.md`. -## Step 5 — Flag infrastructure notes +For each backend constructor or `start_session(backend_name=...)` call +found during classification, verify: + +1. A marker exists in `BACKEND_MARKERS` for that backend. +2. The marker appears in `pyproject.toml`. +3. The marker appears in the MARKERS_GUIDE.md backend table. + +If a backend is used in tests but has no registered marker, flag it as +a **missing backend marker** issue and add it to the registry, pyproject.toml, +and MARKERS_GUIDE.md (same apply/confirm rules as other fixes in Step 4). + +## Step 6 — Flag infrastructure notes Report issues outside marker-edit scope as **notes**. Do NOT fix these: - Missing conftest skip logic for a backend -- Unregistered markers in pyproject.toml -- MARKERS_GUIDE.md gaps - Tests with no assertions - Files mixing unit and e2e tests that could be split @@ -569,10 +600,13 @@ The output is the Tier 1 summary table (always printed first) followed by Tier 2 issues-only detail and Tier 3 batch groups as described in Step 3. End the report with: +The summary table should include a row for `Missing backend marker` when +backends are used in tests but not registered in `BACKEND_MARKERS`. 
+ ``` --- Files audited: N | Correct: N | With issues: N -Changes: N applied / N dry-run +Changes: N applied / N pending confirmation / N dry-run Infrastructure notes: N (see notes section) ``` @@ -584,8 +618,10 @@ flag as a blocker, don't silently re-add: - **Auto-unit hook:** `test/conftest.py` `pytest_collection_modifyitems` adds `pytest.mark.unit` to any test without `integration`, `e2e`, or `qualitative`. -- **Marker registration:** all tier, backend, and resource markers registered in - `pytest_configure` and `pyproject.toml`. +- **Backend marker registry:** `BACKEND_MARKERS` dict in `test/conftest.py` is + the single source of truth for backend markers. `pytest_configure` iterates + over it. New backends are added by inserting one entry into the dict. + `pyproject.toml` and `test/MARKERS_GUIDE.md` must stay in sync manually. - **Resource predicates:** `test/predicates.py` provides `require_gpu`, `require_ram`, `require_gpu_isolation`, `require_api_key`, `require_package`, `require_ollama`, `require_python`. diff --git a/docs/examples/aLora/101_example.py b/docs/examples/aLora/101_example.py index b25d93746..b1538f4f4 100644 --- a/docs/examples/aLora/101_example.py +++ b/docs/examples/aLora/101_example.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, llm +# pytest: huggingface, requires_heavy_ram, e2e import time diff --git a/docs/examples/aLora/102_example.py b/docs/examples/aLora/102_example.py index c2bf86a3a..05c9780fc 100644 --- a/docs/examples/aLora/102_example.py +++ b/docs/examples/aLora/102_example.py @@ -1,4 +1,4 @@ -# pytest: skip, huggingface, requires_heavy_ram, llm +# pytest: skip, huggingface, requires_heavy_ram, e2e # SKIP REASON: Requires user input; tests same functionality as 101_example.py. 
from stembolts_intrinsic import ( diff --git a/docs/examples/aLora/example_readme_generator.py b/docs/examples/aLora/example_readme_generator.py index fbfe5e025..3c586aa5e 100644 --- a/docs/examples/aLora/example_readme_generator.py +++ b/docs/examples/aLora/example_readme_generator.py @@ -1,4 +1,4 @@ -# pytest: skip, huggingface, requires_heavy_ram, llm +# pytest: skip, huggingface, requires_heavy_ram, e2e # SKIP REASON: documentation only. from cli.alora.readme_generator import generate_readme, make_readme_jinja_dict diff --git a/docs/examples/aLora/make_training_data.py b/docs/examples/aLora/make_training_data.py index 864183fef..e3b25fb36 100644 --- a/docs/examples/aLora/make_training_data.py +++ b/docs/examples/aLora/make_training_data.py @@ -1,4 +1,4 @@ -# pytest: skip, huggingface, requires_heavy_ram, llm +# pytest: skip, huggingface, requires_heavy_ram, e2e # SKIP REASON: documentation only.import argparse import argparse import json diff --git a/docs/examples/aLora/stembolts_intrinsic.py b/docs/examples/aLora/stembolts_intrinsic.py index 2559c132f..f3af5f42b 100644 --- a/docs/examples/aLora/stembolts_intrinsic.py +++ b/docs/examples/aLora/stembolts_intrinsic.py @@ -1,5 +1,5 @@ # type: ignore -# pytest: skip, huggingface, requires_heavy_ram, llm +# pytest: skip, huggingface, requires_heavy_ram, e2e # SKIP REASON: needs to update. 
import mellea.stdlib.functional as mfuncs diff --git a/docs/examples/agents/react/react_from_scratch/react.py b/docs/examples/agents/react/react_from_scratch/react.py index fecf72f82..351bce08b 100644 --- a/docs/examples/agents/react/react_from_scratch/react.py +++ b/docs/examples/agents/react/react_from_scratch/react.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm, qualitative +# pytest: ollama, e2e, qualitative import datetime import inspect diff --git a/docs/examples/agents/react/react_from_scratch/react_instruct.py b/docs/examples/agents/react/react_from_scratch/react_instruct.py index a1a4ec787..8970d45b7 100644 --- a/docs/examples/agents/react/react_from_scratch/react_instruct.py +++ b/docs/examples/agents/react/react_from_scratch/react_instruct.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm, qualitative +# pytest: ollama, e2e, qualitative import datetime import inspect diff --git a/docs/examples/agents/react/react_using_mellea.py b/docs/examples/agents/react/react_using_mellea.py index b2a7ab589..8e5b93a6e 100644 --- a/docs/examples/agents/react/react_using_mellea.py +++ b/docs/examples/agents/react/react_using_mellea.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e """React examples using the Mellea library's framework.""" diff --git a/docs/examples/context/contexts_with_sampling.py b/docs/examples/context/contexts_with_sampling.py index d9a7c2809..724eb440c 100644 --- a/docs/examples/context/contexts_with_sampling.py +++ b/docs/examples/context/contexts_with_sampling.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from mellea.stdlib.sampling import RejectionSamplingStrategy from mellea.stdlib.session import start_session diff --git a/docs/examples/generative_slots/generate_with_context.py b/docs/examples/generative_slots/generate_with_context.py index 1cc352623..6921f353a 100644 --- a/docs/examples/generative_slots/generate_with_context.py +++ b/docs/examples/generative_slots/generate_with_context.py @@ -1,4 +1,4 @@ -# pytest: ollama, 
llm +# pytest: ollama, e2e from mellea import generative, start_session from mellea.backends import ModelOption diff --git a/docs/examples/generative_slots/generative_gsm8k.py b/docs/examples/generative_slots/generative_gsm8k.py index 44da109df..d3cceae51 100644 --- a/docs/examples/generative_slots/generative_gsm8k.py +++ b/docs/examples/generative_slots/generative_gsm8k.py @@ -1,4 +1,4 @@ -# pytest: ollama, qualitative, llm, slow +# pytest: ollama, qualitative, e2e, slow """Example of chain-of-thought reasoning on a mathematical question from the GSM8K dataset, structured as code for improved performance with Granite 4 models. The original accuracy in standard "thinking" mode is approximately 80%, while this implementation achieves 85-89% accuracy—up to 9 points higher. diff --git a/docs/examples/generative_slots/generative_slots.py b/docs/examples/generative_slots/generative_slots.py index c5d24f4e9..0f7544ecb 100644 --- a/docs/examples/generative_slots/generative_slots.py +++ b/docs/examples/generative_slots/generative_slots.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from typing import Literal diff --git a/docs/examples/generative_slots/generative_slots_with_requirements.py b/docs/examples/generative_slots/generative_slots_with_requirements.py index b7eeff8e9..25ab83842 100644 --- a/docs/examples/generative_slots/generative_slots_with_requirements.py +++ b/docs/examples/generative_slots/generative_slots_with_requirements.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from typing import Literal diff --git a/docs/examples/generative_slots/inter_module_composition/decision_aides.py b/docs/examples/generative_slots/inter_module_composition/decision_aides.py index b5edc0056..deec21651 100644 --- a/docs/examples/generative_slots/inter_module_composition/decision_aides.py +++ b/docs/examples/generative_slots/inter_module_composition/decision_aides.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from mellea import 
generative diff --git a/docs/examples/generative_slots/inter_module_composition/summarize_and_decide.py b/docs/examples/generative_slots/inter_module_composition/summarize_and_decide.py index 3a5d91ade..95bb23bd8 100644 --- a/docs/examples/generative_slots/inter_module_composition/summarize_and_decide.py +++ b/docs/examples/generative_slots/inter_module_composition/summarize_and_decide.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from typing import Literal diff --git a/docs/examples/generative_slots/inter_module_composition/summarizers.py b/docs/examples/generative_slots/inter_module_composition/summarizers.py index 01656db32..99de7adda 100644 --- a/docs/examples/generative_slots/inter_module_composition/summarizers.py +++ b/docs/examples/generative_slots/inter_module_composition/summarizers.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from mellea import generative diff --git a/docs/examples/generative_slots/investment_advice.py b/docs/examples/generative_slots/investment_advice.py index 7685e197c..33cafe358 100644 --- a/docs/examples/generative_slots/investment_advice.py +++ b/docs/examples/generative_slots/investment_advice.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from typing import Literal diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py index d00766d61..46c27861d 100644 --- a/docs/examples/hello_world.py +++ b/docs/examples/hello_world.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e import mellea diff --git a/docs/examples/helper/helpers.py b/docs/examples/helper/helpers.py index 726f0ec04..6609c8942 100644 --- a/docs/examples/helper/helpers.py +++ b/docs/examples/helper/helpers.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from textwrap import fill from typing import Any diff --git a/docs/examples/image_text_models/vision_litellm_backend.py b/docs/examples/image_text_models/vision_litellm_backend.py index 1ae85c11b..5c7e302a8 100644 --- 
a/docs/examples/image_text_models/vision_litellm_backend.py +++ b/docs/examples/image_text_models/vision_litellm_backend.py @@ -1,4 +1,4 @@ -# pytest: litellm, llm, ollama +# pytest: litellm, e2e, ollama """Examples of using vision models with LiteLLM backend.""" diff --git a/docs/examples/image_text_models/vision_ollama_chat.py b/docs/examples/image_text_models/vision_ollama_chat.py index a190e5b19..006e55687 100644 --- a/docs/examples/image_text_models/vision_ollama_chat.py +++ b/docs/examples/image_text_models/vision_ollama_chat.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e """Example of using Ollama with vision models with linear context.""" diff --git a/docs/examples/image_text_models/vision_openai_examples.py b/docs/examples/image_text_models/vision_openai_examples.py index 46136afa5..f1d423cd0 100644 --- a/docs/examples/image_text_models/vision_openai_examples.py +++ b/docs/examples/image_text_models/vision_openai_examples.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm, requires_heavy_ram +# pytest: ollama, e2e, requires_heavy_ram """Examples using vision models with OpenAI backend.""" diff --git a/docs/examples/information_extraction/101_with_gen_slots.py b/docs/examples/information_extraction/101_with_gen_slots.py index 83a088033..9ab6ac5e6 100644 --- a/docs/examples/information_extraction/101_with_gen_slots.py +++ b/docs/examples/information_extraction/101_with_gen_slots.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e """Simple Example of information extraction with Mellea using generative slots.""" diff --git a/docs/examples/information_extraction/advanced_with_m_instruct.py b/docs/examples/information_extraction/advanced_with_m_instruct.py index 4084469cc..58cee91c2 100644 --- a/docs/examples/information_extraction/advanced_with_m_instruct.py +++ b/docs/examples/information_extraction/advanced_with_m_instruct.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e """Advanced Example of information extraction with 
Mellea using m.instruct() and constraints.""" diff --git a/docs/examples/instruct_validate_repair/101_email.py b/docs/examples/instruct_validate_repair/101_email.py index 6eb905d16..ee5eee664 100644 --- a/docs/examples/instruct_validate_repair/101_email.py +++ b/docs/examples/instruct_validate_repair/101_email.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e # This is the 101 example for using `session` and `instruct`. # helper function to wrap text diff --git a/docs/examples/instruct_validate_repair/101_email_comparison.py b/docs/examples/instruct_validate_repair/101_email_comparison.py index e35e676e9..de0b346ac 100644 --- a/docs/examples/instruct_validate_repair/101_email_comparison.py +++ b/docs/examples/instruct_validate_repair/101_email_comparison.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from docs.examples.helper import w from mellea import start_session diff --git a/docs/examples/instruct_validate_repair/101_email_with_requirements.py b/docs/examples/instruct_validate_repair/101_email_with_requirements.py index 21e726150..9c21347b5 100644 --- a/docs/examples/instruct_validate_repair/101_email_with_requirements.py +++ b/docs/examples/instruct_validate_repair/101_email_with_requirements.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from docs.examples.helper import w from mellea import start_session diff --git a/docs/examples/instruct_validate_repair/101_email_with_validate.py b/docs/examples/instruct_validate_repair/101_email_with_validate.py index b510cf260..00949d837 100644 --- a/docs/examples/instruct_validate_repair/101_email_with_validate.py +++ b/docs/examples/instruct_validate_repair/101_email_with_validate.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from docs.examples.helper import req_print, w from mellea import start_session diff --git a/docs/examples/instruct_validate_repair/advanced_email_with_validate_function.py 
b/docs/examples/instruct_validate_repair/advanced_email_with_validate_function.py index a6acafc32..2c4ef23d8 100644 --- a/docs/examples/instruct_validate_repair/advanced_email_with_validate_function.py +++ b/docs/examples/instruct_validate_repair/advanced_email_with_validate_function.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from docs.examples.helper import w from mellea import start_session diff --git a/docs/examples/instruct_validate_repair/multiturn_strategy_example.py b/docs/examples/instruct_validate_repair/multiturn_strategy_example.py index 46cf46ba8..86cada2c4 100644 --- a/docs/examples/instruct_validate_repair/multiturn_strategy_example.py +++ b/docs/examples/instruct_validate_repair/multiturn_strategy_example.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm, qualitative +# pytest: ollama, e2e, qualitative """MultiTurnStrategy Example with Validation Functions diff --git a/docs/examples/instruct_validate_repair/qiskit_code_validation/qiskit_code_validation.py b/docs/examples/instruct_validate_repair/qiskit_code_validation/qiskit_code_validation.py index 73e6b3155..228601ec2 100644 --- a/docs/examples/instruct_validate_repair/qiskit_code_validation/qiskit_code_validation.py +++ b/docs/examples/instruct_validate_repair/qiskit_code_validation/qiskit_code_validation.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm, qualitative, skip +# pytest: ollama, e2e, qualitative, skip # /// script # dependencies = [ # "mellea", diff --git a/docs/examples/intrinsics/answerability.py b/docs/examples/intrinsics/answerability.py index e88f9ea43..5875286d3 100644 --- a/docs/examples/intrinsics/answerability.py +++ b/docs/examples/intrinsics/answerability.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, llm +# pytest: huggingface, requires_heavy_ram, e2e """Example usage of the answerability intrinsic for RAG applications. 
diff --git a/docs/examples/intrinsics/citations.py b/docs/examples/intrinsics/citations.py index 90d395678..43b634792 100644 --- a/docs/examples/intrinsics/citations.py +++ b/docs/examples/intrinsics/citations.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, llm +# pytest: huggingface, requires_heavy_ram, e2e """Example usage of the citations intrinsic for RAG applications. diff --git a/docs/examples/intrinsics/context_attribution.py b/docs/examples/intrinsics/context_attribution.py index 1085a07ff..b31eba88a 100644 --- a/docs/examples/intrinsics/context_attribution.py +++ b/docs/examples/intrinsics/context_attribution.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, llm +# pytest: huggingface, requires_heavy_ram, e2e """Example usage of the context attribution intrinsic. diff --git a/docs/examples/intrinsics/context_relevance.py b/docs/examples/intrinsics/context_relevance.py index 3eef6b31d..a89effeb7 100644 --- a/docs/examples/intrinsics/context_relevance.py +++ b/docs/examples/intrinsics/context_relevance.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, llm +# pytest: huggingface, requires_heavy_ram, e2e """Example usage of the context relevance intrinsic for RAG applications. diff --git a/docs/examples/intrinsics/factuality_correction.py b/docs/examples/intrinsics/factuality_correction.py index cdf911ee1..d3902024a 100644 --- a/docs/examples/intrinsics/factuality_correction.py +++ b/docs/examples/intrinsics/factuality_correction.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, llm +# pytest: huggingface, requires_heavy_ram, e2e """Example usage of the factuality correction intrinsic. 
diff --git a/docs/examples/intrinsics/factuality_detection.py b/docs/examples/intrinsics/factuality_detection.py index e1c9cab48..511ceaab6 100644 --- a/docs/examples/intrinsics/factuality_detection.py +++ b/docs/examples/intrinsics/factuality_detection.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, llm +# pytest: huggingface, requires_heavy_ram, e2e """Example usage of the factuality detection intrinsic. diff --git a/docs/examples/intrinsics/guardian_core.py b/docs/examples/intrinsics/guardian_core.py index f4f22afbe..d80ad9de5 100644 --- a/docs/examples/intrinsics/guardian_core.py +++ b/docs/examples/intrinsics/guardian_core.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, llm +# pytest: huggingface, requires_heavy_ram, e2e """Example usage of the guardian-core intrinsic for safety and hallucination detection. diff --git a/docs/examples/intrinsics/hallucination_detection.py b/docs/examples/intrinsics/hallucination_detection.py index 247f755d2..6904d96b5 100644 --- a/docs/examples/intrinsics/hallucination_detection.py +++ b/docs/examples/intrinsics/hallucination_detection.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, llm +# pytest: huggingface, requires_heavy_ram, e2e """Example usage of the hallucination detection intrinsic for RAG applications. 
diff --git a/docs/examples/intrinsics/intrinsics.py b/docs/examples/intrinsics/intrinsics.py index 26d4abeba..b798039c3 100644 --- a/docs/examples/intrinsics/intrinsics.py +++ b/docs/examples/intrinsics/intrinsics.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, llm +# pytest: huggingface, requires_heavy_ram, e2e import mellea.stdlib.functional as mfuncs from mellea.backends.adapters.adapter import AdapterType, IntrinsicAdapter diff --git a/docs/examples/intrinsics/policy_guardrails.py b/docs/examples/intrinsics/policy_guardrails.py index 80da68de9..782f2cab6 100644 --- a/docs/examples/intrinsics/policy_guardrails.py +++ b/docs/examples/intrinsics/policy_guardrails.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, llm +# pytest: huggingface, requires_heavy_ram, e2e """Example usage of the policy_guardrails intrinsic. diff --git a/docs/examples/intrinsics/query_clarification.py b/docs/examples/intrinsics/query_clarification.py index 6d660e1af..e2a0a3e9f 100644 --- a/docs/examples/intrinsics/query_clarification.py +++ b/docs/examples/intrinsics/query_clarification.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, llm +# pytest: huggingface, requires_heavy_ram, e2e """ Example usage of the query clarification intrinsic for RAG applications. diff --git a/docs/examples/intrinsics/query_rewrite.py b/docs/examples/intrinsics/query_rewrite.py index d1624b884..8b4025033 100644 --- a/docs/examples/intrinsics/query_rewrite.py +++ b/docs/examples/intrinsics/query_rewrite.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, llm +# pytest: huggingface, requires_heavy_ram, e2e """Example usage of the query rewrite intrinsic for RAG applications. 
diff --git a/docs/examples/intrinsics/requirement_check.py b/docs/examples/intrinsics/requirement_check.py index 00df9944b..08176fc97 100644 --- a/docs/examples/intrinsics/requirement_check.py +++ b/docs/examples/intrinsics/requirement_check.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, llm +# pytest: huggingface, requires_heavy_ram, e2e """Example usage of the requirement check intrinsic. diff --git a/docs/examples/intrinsics/uncertainty.py b/docs/examples/intrinsics/uncertainty.py index b0ade6e88..7f052abc2 100644 --- a/docs/examples/intrinsics/uncertainty.py +++ b/docs/examples/intrinsics/uncertainty.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, llm +# pytest: huggingface, requires_heavy_ram, e2e """Example usage of the uncertainty/certainty intrinsic. diff --git a/docs/examples/library_interop/langchain_messages.py b/docs/examples/library_interop/langchain_messages.py index 205583921..d84f5deb4 100644 --- a/docs/examples/library_interop/langchain_messages.py +++ b/docs/examples/library_interop/langchain_messages.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm, qualitative +# pytest: ollama, e2e, qualitative # Installing langchain is necessary for this example, but it works for any library # you may want to use Mellea with. 
diff --git a/docs/examples/m_decompose/python/python_decompose_example.py b/docs/examples/m_decompose/python/python_decompose_example.py index d8d31ad23..757296b56 100644 --- a/docs/examples/m_decompose/python/python_decompose_example.py +++ b/docs/examples/m_decompose/python/python_decompose_example.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm, slow, qualitative +# pytest: ollama, e2e, slow, qualitative #!/usr/bin/env python3 """ Example: Using Mellea's decompose functionality programmatically diff --git a/docs/examples/m_serve/m_serve_example_simple.py b/docs/examples/m_serve/m_serve_example_simple.py index 74fe43298..d6220543c 100644 --- a/docs/examples/m_serve/m_serve_example_simple.py +++ b/docs/examples/m_serve/m_serve_example_simple.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e """Example to run m serve.""" diff --git a/docs/examples/melp/lazy.py b/docs/examples/melp/lazy.py index 0719774d4..2b81a8366 100644 --- a/docs/examples/melp/lazy.py +++ b/docs/examples/melp/lazy.py @@ -1,4 +1,4 @@ -# pytest: ollama, qualitative, llm +# pytest: ollama, qualitative, e2e import asyncio diff --git a/docs/examples/melp/lazy_fib.py b/docs/examples/melp/lazy_fib.py index 6c06c6cdc..2ed86d1ef 100644 --- a/docs/examples/melp/lazy_fib.py +++ b/docs/examples/melp/lazy_fib.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e import asyncio diff --git a/docs/examples/melp/lazy_fib_sample.py b/docs/examples/melp/lazy_fib_sample.py index 19eae64d7..e4b673fdc 100644 --- a/docs/examples/melp/lazy_fib_sample.py +++ b/docs/examples/melp/lazy_fib_sample.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e import asyncio diff --git a/docs/examples/melp/simple_example.py b/docs/examples/melp/simple_example.py index 772eb0a30..c6000e699 100644 --- a/docs/examples/melp/simple_example.py +++ b/docs/examples/melp/simple_example.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e import asyncio diff --git a/docs/examples/melp/states.py 
b/docs/examples/melp/states.py index bab5d810c..fbbcf05e6 100644 --- a/docs/examples/melp/states.py +++ b/docs/examples/melp/states.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e import asyncio diff --git a/docs/examples/mify/mify.py b/docs/examples/mify/mify.py index 0d4eab887..e1f82ab57 100644 --- a/docs/examples/mify/mify.py +++ b/docs/examples/mify/mify.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from mellea.stdlib.components.docs.richdocument import TableQuery from mellea.stdlib.components.mify import MifiedProtocol, mify diff --git a/docs/examples/mify/rich_document_advanced.py b/docs/examples/mify/rich_document_advanced.py index 0da4dee66..1fee383f3 100644 --- a/docs/examples/mify/rich_document_advanced.py +++ b/docs/examples/mify/rich_document_advanced.py @@ -1,4 +1,4 @@ -# pytest: skip, huggingface, requires_heavy_ram, llm +# pytest: skip, huggingface, requires_heavy_ram, e2e # SKIP REASON: CXXABI_1.3.15 not found - conda environment issue on HPC systems with old glibc # ruff: noqa E402 diff --git a/docs/examples/mify/rich_table_execute_basic.py b/docs/examples/mify/rich_table_execute_basic.py index edca015d0..3c4b9e665 100644 --- a/docs/examples/mify/rich_table_execute_basic.py +++ b/docs/examples/mify/rich_table_execute_basic.py @@ -1,4 +1,4 @@ -# pytest: ollama, qualitative, llm +# pytest: ollama, qualitative, e2e # This is an example of using the RichDocument class. 
import os diff --git a/docs/examples/mini_researcher/context_docs.py b/docs/examples/mini_researcher/context_docs.py index e4d2d900b..d928835dd 100644 --- a/docs/examples/mini_researcher/context_docs.py +++ b/docs/examples/mini_researcher/context_docs.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from dataclasses import dataclass diff --git a/docs/examples/mini_researcher/researcher.py b/docs/examples/mini_researcher/researcher.py index db4532c00..708f3ba13 100644 --- a/docs/examples/mini_researcher/researcher.py +++ b/docs/examples/mini_researcher/researcher.py @@ -1,4 +1,4 @@ -# pytest: ollama, qualitative, llm, slow +# pytest: ollama, qualitative, e2e, slow from collections.abc import Callable from functools import cache diff --git a/docs/examples/mobject/table.py b/docs/examples/mobject/table.py index da2b98da1..1e6ec3df5 100644 --- a/docs/examples/mobject/table.py +++ b/docs/examples/mobject/table.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from io import StringIO diff --git a/docs/examples/plugins/class_plugin.py b/docs/examples/plugins/class_plugin.py index 6b359b810..59c07326f 100644 --- a/docs/examples/plugins/class_plugin.py +++ b/docs/examples/plugins/class_plugin.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e # # Class-based plugin — group related hooks in a single Plugin subclass. # diff --git a/docs/examples/plugins/execution_modes.py b/docs/examples/plugins/execution_modes.py index 1e1040837..94ca8d545 100644 --- a/docs/examples/plugins/execution_modes.py +++ b/docs/examples/plugins/execution_modes.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e # # Execution modes — all five PluginMode values side by side. 
# diff --git a/docs/examples/plugins/payload_modification.py b/docs/examples/plugins/payload_modification.py index f0bb07b3f..fd7569c9c 100644 --- a/docs/examples/plugins/payload_modification.py +++ b/docs/examples/plugins/payload_modification.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e # # Payload modification — how to modify payloads in hooks. # diff --git a/docs/examples/plugins/plugin_scoped.py b/docs/examples/plugins/plugin_scoped.py index 3be811666..93757bc1c 100644 --- a/docs/examples/plugins/plugin_scoped.py +++ b/docs/examples/plugins/plugin_scoped.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e # # Scoped plugins — activate plugins for a specific block of code. # diff --git a/docs/examples/plugins/plugin_set_composition.py b/docs/examples/plugins/plugin_set_composition.py index 75d734468..686796ff8 100644 --- a/docs/examples/plugins/plugin_set_composition.py +++ b/docs/examples/plugins/plugin_set_composition.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e # # PluginSet composition — group hooks by concern and register them together. # diff --git a/docs/examples/plugins/quickstart.py b/docs/examples/plugins/quickstart.py index 098370d27..66d8e4084 100644 --- a/docs/examples/plugins/quickstart.py +++ b/docs/examples/plugins/quickstart.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e # # Quick Start — your first Mellea plugin in under 30 lines. # diff --git a/docs/examples/plugins/session_scoped.py b/docs/examples/plugins/session_scoped.py index c9be94c71..02e8a0bba 100644 --- a/docs/examples/plugins/session_scoped.py +++ b/docs/examples/plugins/session_scoped.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e # # Session-scoped plugins — plugins that fire only within a specific session. 
# diff --git a/docs/examples/plugins/standalone_hooks.py b/docs/examples/plugins/standalone_hooks.py index 5838ddfc3..0166183a5 100644 --- a/docs/examples/plugins/standalone_hooks.py +++ b/docs/examples/plugins/standalone_hooks.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e # # Standalone function hooks — the simplest way to extend Mellea. # diff --git a/docs/examples/plugins/tool_hooks.py b/docs/examples/plugins/tool_hooks.py index f97398e7e..fcd2e2fad 100644 --- a/docs/examples/plugins/tool_hooks.py +++ b/docs/examples/plugins/tool_hooks.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e # # Tool hook plugins — safety and security policies for tool invocation. # diff --git a/docs/examples/safety/guardian.py b/docs/examples/safety/guardian.py index 18002f7ec..49edef8d0 100644 --- a/docs/examples/safety/guardian.py +++ b/docs/examples/safety/guardian.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e """Example of using the Enhanced Guardian Requirement with Granite Guardian 3.3 8B""" diff --git a/docs/examples/safety/guardian_huggingface.py b/docs/examples/safety/guardian_huggingface.py index f1659caf0..532711e2c 100644 --- a/docs/examples/safety/guardian_huggingface.py +++ b/docs/examples/safety/guardian_huggingface.py @@ -1,4 +1,4 @@ -# pytest: ollama, huggingface, requires_heavy_ram, llm +# pytest: ollama, huggingface, requires_heavy_ram, e2e """Example of using GuardianCheck with HuggingFace backend for direct model inference diff --git a/docs/examples/safety/repair_with_guardian.py b/docs/examples/safety/repair_with_guardian.py index 0e89a8b5f..cbee89548 100644 --- a/docs/examples/safety/repair_with_guardian.py +++ b/docs/examples/safety/repair_with_guardian.py @@ -1,4 +1,4 @@ -# pytest: ollama, huggingface, requires_heavy_ram, llm +# pytest: ollama, huggingface, requires_heavy_ram, e2e """RepairTemplateStrategy Example with Actual Function Call Validation Demonstrates how RepairTemplateStrategy repairs responses 
using actual function calls. diff --git a/docs/examples/sessions/creating_a_new_type_of_session.py b/docs/examples/sessions/creating_a_new_type_of_session.py index 92846ff83..c265b1c64 100644 --- a/docs/examples/sessions/creating_a_new_type_of_session.py +++ b/docs/examples/sessions/creating_a_new_type_of_session.py @@ -1,4 +1,4 @@ -# pytest: ollama, qualitative, llm +# pytest: ollama, qualitative, e2e from typing import Literal diff --git a/docs/examples/sofai/sofai_graph_coloring.py b/docs/examples/sofai/sofai_graph_coloring.py index 134425c55..35583ba76 100644 --- a/docs/examples/sofai/sofai_graph_coloring.py +++ b/docs/examples/sofai/sofai_graph_coloring.py @@ -1,4 +1,4 @@ -# pytest: ollama, qualitative, llm, requires_heavy_ram +# pytest: ollama, qualitative, e2e, requires_heavy_ram """SOFAI Sampling Strategy Example: Graph Coloring Problem. diff --git a/docs/examples/telemetry/metrics_example.py b/docs/examples/telemetry/metrics_example.py index 8a58f7a35..a4a4cc504 100644 --- a/docs/examples/telemetry/metrics_example.py +++ b/docs/examples/telemetry/metrics_example.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e """Example demonstrating OpenTelemetry metrics exporters in Mellea. diff --git a/docs/examples/telemetry/telemetry_example.py b/docs/examples/telemetry/telemetry_example.py index 87a15adc5..a8c360a29 100644 --- a/docs/examples/telemetry/telemetry_example.py +++ b/docs/examples/telemetry/telemetry_example.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e """Example demonstrating OpenTelemetry tracing in Mellea. 
diff --git a/docs/examples/tools/interpreter_example.py b/docs/examples/tools/interpreter_example.py index de532e6a1..5ad7e15d1 100644 --- a/docs/examples/tools/interpreter_example.py +++ b/docs/examples/tools/interpreter_example.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from mellea import MelleaSession, start_session from mellea.backends import ModelOption diff --git a/docs/examples/tools/smolagents_example.py b/docs/examples/tools/smolagents_example.py index 23e1283a9..72f546153 100644 --- a/docs/examples/tools/smolagents_example.py +++ b/docs/examples/tools/smolagents_example.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e """Example showing how to use pre-built HuggingFace smolagents tools with Mellea. This demonstrates loading existing tools from the smolagents ecosystem, diff --git a/docs/examples/tools/tool_decorator_example.py b/docs/examples/tools/tool_decorator_example.py index e35e14053..b81be2b5d 100644 --- a/docs/examples/tools/tool_decorator_example.py +++ b/docs/examples/tools/tool_decorator_example.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e """Example demonstrating the @tool decorator for cleaner tool definitions.""" import ast diff --git a/docs/examples/tutorial/compositionality_with_generative_slots.py b/docs/examples/tutorial/compositionality_with_generative_slots.py index 26c111519..7d4e3650f 100644 --- a/docs/examples/tutorial/compositionality_with_generative_slots.py +++ b/docs/examples/tutorial/compositionality_with_generative_slots.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from mellea import generative diff --git a/docs/examples/tutorial/context_example.py b/docs/examples/tutorial/context_example.py index 313011cbf..36a9dea37 100644 --- a/docs/examples/tutorial/context_example.py +++ b/docs/examples/tutorial/context_example.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from mellea import start_session from mellea.stdlib.context import 
ChatContext diff --git a/docs/examples/tutorial/document_mobject.py b/docs/examples/tutorial/document_mobject.py index a3b55c501..f917cb8da 100644 --- a/docs/examples/tutorial/document_mobject.py +++ b/docs/examples/tutorial/document_mobject.py @@ -1,4 +1,4 @@ -# pytest: ollama, qualitative, llm, requires_heavy_ram +# pytest: ollama, qualitative, e2e, requires_heavy_ram from mellea.backends import model_ids from mellea.backends.model_ids import IBM_GRANITE_4_HYBRID_MICRO diff --git a/docs/examples/tutorial/example.py b/docs/examples/tutorial/example.py index eb8fdf212..53094677a 100644 --- a/docs/examples/tutorial/example.py +++ b/docs/examples/tutorial/example.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e import mellea diff --git a/docs/examples/tutorial/instruct_validate_repair.py b/docs/examples/tutorial/instruct_validate_repair.py index 4173f01bd..73581a2c4 100644 --- a/docs/examples/tutorial/instruct_validate_repair.py +++ b/docs/examples/tutorial/instruct_validate_repair.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from mellea.core import Requirement from mellea.stdlib.requirements import check, req, simple_validate diff --git a/docs/examples/tutorial/model_options_example.py b/docs/examples/tutorial/model_options_example.py index 899711209..5cafbc54f 100644 --- a/docs/examples/tutorial/model_options_example.py +++ b/docs/examples/tutorial/model_options_example.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e import mellea from mellea.backends import ModelOption, model_ids diff --git a/docs/examples/tutorial/sentiment_classifier.py b/docs/examples/tutorial/sentiment_classifier.py index d26269ed3..6d1d3367e 100644 --- a/docs/examples/tutorial/sentiment_classifier.py +++ b/docs/examples/tutorial/sentiment_classifier.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from typing import Literal diff --git a/docs/examples/tutorial/simple_email.py b/docs/examples/tutorial/simple_email.py index 
fb86dae86..f7d03f041 100644 --- a/docs/examples/tutorial/simple_email.py +++ b/docs/examples/tutorial/simple_email.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e import mellea diff --git a/docs/examples/tutorial/table_mobject.py b/docs/examples/tutorial/table_mobject.py index e4de110fb..1b79d69b7 100644 --- a/docs/examples/tutorial/table_mobject.py +++ b/docs/examples/tutorial/table_mobject.py @@ -1,4 +1,4 @@ -# pytest: ollama, llm +# pytest: ollama, e2e from io import StringIO diff --git a/pyproject.toml b/pyproject.toml index 628b02527..31a531e91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -339,6 +339,7 @@ markers = [ "huggingface: Tests requiring HuggingFace backend (local, heavy)", "vllm: Tests requiring vLLM backend (local, GPU required)", "litellm: Tests requiring LiteLLM backend", + "bedrock: Tests requiring AWS Bedrock backend (requires credentials)", # Capability markers "requires_api_key: Tests requiring external API keys", diff --git a/test/MARKERS_GUIDE.md b/test/MARKERS_GUIDE.md index 95aebc002..e6fa3877b 100644 --- a/test/MARKERS_GUIDE.md +++ b/test/MARKERS_GUIDE.md @@ -127,6 +127,7 @@ tests don't need real backends. 
| `huggingface` | HuggingFace transformers | Local, GPU, 48GB+ RAM | | `vllm` | vLLM | Local, GPU required, 48GB+ RAM | | `litellm` | LiteLLM (wraps other backends)| Depends on underlying backend | +| `bedrock` | AWS Bedrock | API calls, requires credentials | ### OpenAI-via-Ollama pattern diff --git a/test/backends/test_bedrock.py b/test/backends/test_bedrock.py index 9f0905be6..236a068ff 100644 --- a/test/backends/test_bedrock.py +++ b/test/backends/test_bedrock.py @@ -12,7 +12,8 @@ # Skip entire module in CI since the single test is qualitative pytestmark = [ - pytest.mark.llm, + pytest.mark.e2e, + pytest.mark.bedrock, pytest.mark.skipif( "AWS_BEARER_TOKEN_BEDROCK" not in os.environ.keys(), reason="Skipping Bedrock backend tests if $AWS_BEARER_TOKEN_BEDROCK is not set.", diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py index a040c8ac9..8bdb5cab2 100644 --- a/test/backends/test_huggingface.py +++ b/test/backends/test_huggingface.py @@ -16,7 +16,7 @@ # Mark all tests in this module with backend and resource requirements pytestmark = [ pytest.mark.huggingface, - pytest.mark.llm, + pytest.mark.e2e, pytest.mark.requires_gpu, pytest.mark.requires_heavy_ram, pytest.mark.requires_gpu_isolation, # Activate GPU memory isolation diff --git a/test/backends/test_huggingface_tools.py b/test/backends/test_huggingface_tools.py index eceded084..ce15ad449 100644 --- a/test/backends/test_huggingface_tools.py +++ b/test/backends/test_huggingface_tools.py @@ -7,7 +7,7 @@ # Skip entire module in CI since the single test is qualitative pytestmark = [ pytest.mark.huggingface, - pytest.mark.llm, + pytest.mark.e2e, pytest.mark.requires_gpu, pytest.mark.requires_heavy_ram, pytest.mark.requires_gpu_isolation, diff --git a/test/backends/test_litellm_ollama.py b/test/backends/test_litellm_ollama.py index 0788ccac2..3ca4bcbfb 100644 --- a/test/backends/test_litellm_ollama.py +++ b/test/backends/test_litellm_ollama.py @@ -4,7 +4,7 @@ import pytest # Mark all 
tests in this module as requiring Ollama via LiteLLM -pytestmark = [pytest.mark.litellm, pytest.mark.ollama, pytest.mark.llm] +pytestmark = [pytest.mark.litellm, pytest.mark.ollama, pytest.mark.e2e] from mellea import MelleaSession, generative from mellea.backends import ModelOption, model_ids diff --git a/test/backends/test_litellm_watsonx.py b/test/backends/test_litellm_watsonx.py index 80f65b096..e224f9989 100644 --- a/test/backends/test_litellm_watsonx.py +++ b/test/backends/test_litellm_watsonx.py @@ -6,7 +6,7 @@ pytestmark = [ pytest.mark.litellm, pytest.mark.watsonx, - pytest.mark.llm, + pytest.mark.e2e, pytest.mark.requires_api_key, ] diff --git a/test/backends/test_mellea_tool.py b/test/backends/test_mellea_tool.py index 34ca4f4ef..3a941353a 100644 --- a/test/backends/test_mellea_tool.py +++ b/test/backends/test_mellea_tool.py @@ -54,7 +54,7 @@ def test_from_callable(): @pytest.mark.qualitative @pytest.mark.ollama -@pytest.mark.llm +@pytest.mark.e2e def test_from_callable_generation(session: MelleaSession): t = MelleaTool.from_callable(callable, "mellea_tool") @@ -101,7 +101,7 @@ def test_from_langchain(): @pytest.mark.qualitative @pytest.mark.ollama -@pytest.mark.llm +@pytest.mark.e2e def test_from_langchain_generation(session: MelleaSession): t = MelleaTool.from_langchain(langchain_tool) diff --git a/test/backends/test_ollama.py b/test/backends/test_ollama.py index 5ff7b8903..06d93456e 100644 --- a/test/backends/test_ollama.py +++ b/test/backends/test_ollama.py @@ -13,7 +13,7 @@ from mellea.stdlib.requirements import simple_validate # Mark all tests in this module as requiring Ollama -pytestmark = [pytest.mark.ollama, pytest.mark.llm] +pytestmark = [pytest.mark.ollama, pytest.mark.e2e] @pytest.fixture(scope="function") diff --git a/test/backends/test_openai_ollama.py b/test/backends/test_openai_ollama.py index bc6216fa2..0372ae603 100644 --- a/test/backends/test_openai_ollama.py +++ b/test/backends/test_openai_ollama.py @@ -8,7 +8,7 @@ import pytest # 
Mark all tests in this module as requiring Ollama via OpenAI-compatible API -pytestmark = [pytest.mark.openai, pytest.mark.ollama, pytest.mark.llm] +pytestmark = [pytest.mark.openai, pytest.mark.ollama, pytest.mark.e2e] from mellea import MelleaSession from mellea.backends import ModelOption diff --git a/test/backends/test_openai_vllm.py b/test/backends/test_openai_vllm.py index ec32996d5..52fcb85c9 100644 --- a/test/backends/test_openai_vllm.py +++ b/test/backends/test_openai_vllm.py @@ -12,7 +12,7 @@ # Mark all tests in this module with backend and resource requirements pytestmark = [ pytest.mark.openai, - pytest.mark.llm, + pytest.mark.e2e, pytest.mark.vllm, pytest.mark.requires_gpu, pytest.mark.requires_heavy_ram, diff --git a/test/backends/test_tool_calls.py b/test/backends/test_tool_calls.py index e54d0f5b9..abf27e51d 100644 --- a/test/backends/test_tool_calls.py +++ b/test/backends/test_tool_calls.py @@ -13,7 +13,7 @@ from mellea.stdlib.context import ChatContext from mellea.stdlib.session import MelleaSession -pytestmark = [pytest.mark.ollama, pytest.mark.llm] +pytestmark = [pytest.mark.ollama, pytest.mark.e2e] @pytest.fixture(scope="module") diff --git a/test/backends/test_vision_ollama.py b/test/backends/test_vision_ollama.py index 21782b1d8..75b5f0615 100644 --- a/test/backends/test_vision_ollama.py +++ b/test/backends/test_vision_ollama.py @@ -6,7 +6,7 @@ from PIL import Image # Mark all tests in this module as requiring Ollama with vision support -pytestmark = [pytest.mark.ollama, pytest.mark.llm] +pytestmark = [pytest.mark.ollama, pytest.mark.e2e] from mellea import MelleaSession, start_session from mellea.backends import ModelOption, model_ids diff --git a/test/backends/test_vision_openai.py b/test/backends/test_vision_openai.py index 599d6cbf0..c8348f2c8 100644 --- a/test/backends/test_vision_openai.py +++ b/test/backends/test_vision_openai.py @@ -7,7 +7,7 @@ from PIL import Image # Mark all tests in this module as requiring OpenAI API with vision 
support -pytestmark = [pytest.mark.openai, pytest.mark.llm, pytest.mark.ollama] +pytestmark = [pytest.mark.openai, pytest.mark.e2e, pytest.mark.ollama] from mellea import MelleaSession, start_session from mellea.backends import ModelOption diff --git a/test/backends/test_vllm.py b/test/backends/test_vllm.py index 5ba864bb9..01d002034 100644 --- a/test/backends/test_vllm.py +++ b/test/backends/test_vllm.py @@ -8,7 +8,7 @@ # Mark all tests in this module with backend and resource requirements pytestmark = [ pytest.mark.vllm, - pytest.mark.llm, + pytest.mark.e2e, pytest.mark.requires_gpu, pytest.mark.requires_heavy_ram, pytest.mark.requires_gpu_isolation, # Activate GPU memory isolation diff --git a/test/backends/test_vllm_tools.py b/test/backends/test_vllm_tools.py index aaf27a06d..5a61d5aba 100644 --- a/test/backends/test_vllm_tools.py +++ b/test/backends/test_vllm_tools.py @@ -7,7 +7,7 @@ # Skip entire module in CI since the single test is qualitative pytestmark = [ pytest.mark.vllm, - pytest.mark.llm, + pytest.mark.e2e, pytest.mark.requires_gpu, pytest.mark.requires_heavy_ram, pytest.mark.requires_gpu_isolation, diff --git a/test/backends/test_watsonx.py b/test/backends/test_watsonx.py index 4ec7d71bb..eda9baa69 100644 --- a/test/backends/test_watsonx.py +++ b/test/backends/test_watsonx.py @@ -8,7 +8,7 @@ # Mark all tests in this module with backend and auth requirements pytestmark = [ pytest.mark.watsonx, - pytest.mark.llm, + pytest.mark.e2e, pytest.mark.requires_api_key, # Skip entire module in CI since 8/9 tests are qualitative pytest.mark.skipif( diff --git a/test/cli/test_alora_train.py b/test/cli/test_alora_train.py index 7a1c0a35f..813accc5f 100644 --- a/test/cli/test_alora_train.py +++ b/test/cli/test_alora_train.py @@ -1,12 +1,14 @@ -"""Unit tests for aLoRA/LoRA training configuration.""" +"""Integration tests for aLoRA/LoRA training configuration.""" from unittest.mock import MagicMock, Mock, patch import pytest + +peft = pytest.importorskip("peft", 
reason="peft not installed (install mellea[hf])") from peft import LoraConfig -@pytest.mark.huggingface +@pytest.mark.integration def test_alora_config_creation(): """Test that aLoRA config is created correctly with PEFT 0.18+.""" from cli.alora.train import train_model @@ -84,7 +86,7 @@ def test_alora_config_creation(): assert peft_config.task_type == "CAUSAL_LM", "Task type should be CAUSAL_LM" -@pytest.mark.huggingface +@pytest.mark.integration def test_lora_config_creation(): """Test that standard LoRA config is created correctly.""" from cli.alora.train import train_model @@ -159,7 +161,7 @@ def test_lora_config_creation(): assert peft_config.task_type == "CAUSAL_LM", "Task type should be CAUSAL_LM" -@pytest.mark.huggingface +@pytest.mark.integration def test_invocation_prompt_tokenization(): """Test that invocation prompt is correctly tokenized for aLoRA.""" from cli.alora.train import train_model diff --git a/test/cli/test_alora_train_integration.py b/test/cli/test_alora_train_integration.py index d3ae74fa3..83c8cd0e6 100644 --- a/test/cli/test_alora_train_integration.py +++ b/test/cli/test_alora_train_integration.py @@ -16,7 +16,7 @@ pytestmark = [ pytest.mark.huggingface, - pytest.mark.llm, + pytest.mark.e2e, pytest.mark.requires_gpu, pytest.mark.requires_heavy_ram, # Skip entire module in CI since 17/18 tests are qualitative diff --git a/test/conftest.py b/test/conftest.py index 3c7b78814..b86b4a50c 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -304,25 +304,27 @@ def add_option_safe(option_name, **kwargs): ) +BACKEND_MARKERS: dict[str, str] = { + "ollama": "Tests requiring Ollama backend (local, light)", + "openai": "Tests requiring OpenAI API (requires API key)", + "watsonx": "Tests requiring Watsonx API (requires API key)", + "huggingface": "Tests requiring HuggingFace backend (local, heavy)", + "vllm": "Tests requiring vLLM backend (local, GPU required)", + "litellm": "Tests requiring LiteLLM backend", + "bedrock": "Tests requiring AWS 
Bedrock backend (requires credentials)", +} +"""Single source of truth for backend marker names and descriptions. + +Add new backends here — ``pytest_configure`` registers them automatically. +Keep ``pyproject.toml`` ``[tool.pytest.ini_options].markers`` in sync. +""" + + def pytest_configure(config): """Register custom markers.""" - # Backend markers - config.addinivalue_line( - "markers", "ollama: Tests requiring Ollama backend (local, light)" - ) - config.addinivalue_line( - "markers", "openai: Tests requiring OpenAI API (requires API key)" - ) - config.addinivalue_line( - "markers", "watsonx: Tests requiring Watsonx API (requires API key)" - ) - config.addinivalue_line( - "markers", "huggingface: Tests requiring HuggingFace backend (local, heavy)" - ) - config.addinivalue_line( - "markers", "vllm: Tests requiring vLLM backend (local, GPU required)" - ) - config.addinivalue_line("markers", "litellm: Tests requiring LiteLLM backend") + # Backend markers (driven by BACKEND_MARKERS registry) + for name, desc in BACKEND_MARKERS.items(): + config.addinivalue_line("markers", f"{name}: {desc}") # Capability markers config.addinivalue_line( diff --git a/test/core/test_astream_incremental.py b/test/core/test_astream_incremental.py index af0dca903..3da4f2ede 100644 --- a/test/core/test_astream_incremental.py +++ b/test/core/test_astream_incremental.py @@ -13,7 +13,7 @@ @pytest.mark.ollama -@pytest.mark.llm +@pytest.mark.e2e @pytest.mark.qualitative async def test_astream_returns_incremental_chunks(): """Test that astream() returns only new content, not accumulated content. @@ -68,7 +68,7 @@ async def test_astream_returns_incremental_chunks(): @pytest.mark.ollama -@pytest.mark.llm +@pytest.mark.e2e @pytest.mark.qualitative async def test_astream_multiple_calls_accumulate_correctly(): """Test that multiple astream() calls accumulate to the final value. 
@@ -112,7 +112,7 @@ async def test_astream_multiple_calls_accumulate_correctly():
 
 
 @pytest.mark.ollama
-@pytest.mark.llm
+@pytest.mark.e2e
 @pytest.mark.qualitative
 async def test_astream_beginning_length_tracking():
     """Test that beginning_length is correctly tracked across astream calls.
@@ -143,7 +143,7 @@ async def test_astream_beginning_length_tracking():
 
 
 @pytest.mark.ollama
-@pytest.mark.llm
+@pytest.mark.e2e
 @pytest.mark.qualitative
 async def test_astream_empty_beginning():
     """Test astream when _underlying_value starts as None."""
@@ -168,7 +168,7 @@ async def test_astream_empty_beginning():
 
 
 @pytest.mark.ollama
-@pytest.mark.llm
+@pytest.mark.e2e
 async def test_computed_mot_raises_error_for_astream():
     """Test that computed mot raises an error for astream() calls."""
     # Create a pre-computed thunk
@@ -185,7 +185,7 @@ async def test_computed_mot_raises_error_for_astream():
 
 
 @pytest.mark.ollama
-@pytest.mark.llm
+@pytest.mark.e2e
 async def test_non_streaming_astream():
     """Test that non-streaming astream has exactly one chunk."""
     session = start_session()
diff --git a/test/core/test_component_typing.py b/test/core/test_component_typing.py
index 19ca2d234..b69aa7a9c 100644
--- a/test/core/test_component_typing.py
+++ b/test/core/test_component_typing.py
@@ -121,7 +121,7 @@ def test_incorrect_type_override():
 @pytest.mark.qualitative
 @pytest.mark.ollama
 @pytest.mark.requires_gpu
-@pytest.mark.llm
+@pytest.mark.e2e
 async def test_generating(session):
     m = session
     ic = IntComp("generate an int")
@@ -165,7 +165,7 @@ async def test_generating(session):
 @pytest.mark.ollama
 @pytest.mark.requires_gpu
 @pytest.mark.requires_heavy_ram
-@pytest.mark.llm
+@pytest.mark.e2e
 def test_message_typing(session):
     m = session
     user_message = Message("user", "Hello!")
@@ -182,7 +182,7 @@ def test_message_typing(session):
 @pytest.mark.ollama
 @pytest.mark.requires_gpu
 @pytest.mark.requires_heavy_ram
-@pytest.mark.llm
+@pytest.mark.e2e
 async def test_generating_with_sampling(session):
     m = session
     m = start_session()
diff --git a/test/core/test_model_output_thunk.py b/test/core/test_model_output_thunk.py
index ce1a171ef..bf4659944 100644
--- a/test/core/test_model_output_thunk.py
+++ b/test/core/test_model_output_thunk.py
@@ -6,7 +6,7 @@
 from mellea.core import ModelOutputThunk
 from mellea.stdlib.session import MelleaSession, start_session
 
-pytestmark = [pytest.mark.ollama, pytest.mark.llm]
+pytestmark = [pytest.mark.ollama, pytest.mark.e2e]
 
 
 # Use generated ModelOutputThunks to fully test copying. This can technically be done without a
diff --git a/test/stdlib/components/intrinsic/test_core.py b/test/stdlib/components/intrinsic/test_core.py
index 5d9e5e4c8..c84718b32 100644
--- a/test/stdlib/components/intrinsic/test_core.py
+++ b/test/stdlib/components/intrinsic/test_core.py
@@ -27,7 +27,7 @@
     pytest.mark.huggingface,
     pytest.mark.requires_gpu,
     pytest.mark.requires_heavy_ram,
-    pytest.mark.llm,
+    pytest.mark.e2e,
 ]
 
 DATA_ROOT = pathlib.Path(os.path.dirname(__file__)) / "testdata"
diff --git a/test/stdlib/components/intrinsic/test_guardian.py b/test/stdlib/components/intrinsic/test_guardian.py
index 6502ddaba..05fd205d1 100644
--- a/test/stdlib/components/intrinsic/test_guardian.py
+++ b/test/stdlib/components/intrinsic/test_guardian.py
@@ -24,7 +24,7 @@
     pytest.mark.huggingface,
     pytest.mark.requires_gpu,
     pytest.mark.requires_heavy_ram,
-    pytest.mark.llm,
+    pytest.mark.e2e,
 ]
 
 DATA_ROOT = pathlib.Path(os.path.dirname(__file__)) / "testdata"
diff --git a/test/stdlib/components/intrinsic/test_rag.py b/test/stdlib/components/intrinsic/test_rag.py
index 7d03631a4..49c8dcaf3 100644
--- a/test/stdlib/components/intrinsic/test_rag.py
+++ b/test/stdlib/components/intrinsic/test_rag.py
@@ -23,7 +23,7 @@
     pytest.mark.huggingface,
     pytest.mark.requires_gpu,
     pytest.mark.requires_heavy_ram,  # 3B model + document processing needs ~30-35GB
-    pytest.mark.llm,
+    pytest.mark.e2e,
 ]
 
 DATA_ROOT = pathlib.Path(os.path.dirname(__file__)) / "testdata"
diff --git a/test/stdlib/components/test_genslot.py b/test/stdlib/components/test_genslot.py
index 9f63d4d7e..33a6482f8 100644
--- a/test/stdlib/components/test_genslot.py
+++ b/test/stdlib/components/test_genslot.py
@@ -18,7 +18,7 @@
 from mellea.stdlib.sampling import RejectionSamplingStrategy
 
 # Module-level markers: Uses granite4:micro-h (3B hybrid, lightweight) in local mode
-pytestmark = [pytest.mark.ollama, pytest.mark.requires_gpu, pytest.mark.llm]
+pytestmark = [pytest.mark.ollama, pytest.mark.requires_gpu, pytest.mark.e2e]
 
 
 @pytest.fixture(scope="module")
diff --git a/test/stdlib/requirements/test_requirement.py b/test/stdlib/requirements/test_requirement.py
index b6dbc4b95..575e40ec7 100644
--- a/test/stdlib/requirements/test_requirement.py
+++ b/test/stdlib/requirements/test_requirement.py
@@ -8,7 +8,7 @@
 ctx = ChatContext()
 ctx = ctx.add(ModelOutputThunk("test"))
 
-pytestmark = [pytest.mark.ollama, pytest.mark.llm]
+pytestmark = [pytest.mark.ollama, pytest.mark.e2e]
 
 
 async def test_llmaj_validation_req_output_field():
diff --git a/test/stdlib/sampling/test_majority_voting.py b/test/stdlib/sampling/test_majority_voting.py
index 1df8e575d..f887279fd 100644
--- a/test/stdlib/sampling/test_majority_voting.py
+++ b/test/stdlib/sampling/test_majority_voting.py
@@ -10,7 +10,7 @@
 )
 
 # Mark all tests as requiring Ollama (start_session defaults to Ollama)
-pytestmark = [pytest.mark.ollama, pytest.mark.llm, pytest.mark.qualitative]
+pytestmark = [pytest.mark.ollama, pytest.mark.e2e, pytest.mark.qualitative]
 
 
 @pytest.fixture(scope="module")
diff --git a/test/stdlib/sampling/test_sampling_ctx.py b/test/stdlib/sampling/test_sampling_ctx.py
index 5919c339b..d049fb426 100644
--- a/test/stdlib/sampling/test_sampling_ctx.py
+++ b/test/stdlib/sampling/test_sampling_ctx.py
@@ -16,7 +16,7 @@ def m_session():
 
 
 @pytest.mark.ollama
-@pytest.mark.llm
+@pytest.mark.e2e
 @pytest.mark.qualitative
 class TestSamplingCtxCase:
     def _run_asserts_for_ctx_testing(self, res):
diff --git a/test/stdlib/sampling/test_sofai_graph_coloring.py b/test/stdlib/sampling/test_sofai_graph_coloring.py
index c1b85a907..66e884d5e 100644
--- a/test/stdlib/sampling/test_sofai_graph_coloring.py
+++ b/test/stdlib/sampling/test_sofai_graph_coloring.py
@@ -709,7 +709,7 @@ async def s2_generate(*args, **kwargs):
 
 
 @pytest.mark.qualitative
 @pytest.mark.ollama
-@pytest.mark.llm
+@pytest.mark.e2e
 class TestSOFAIGraphColoringIntegration:
     """Integration tests with actual LLM backends.
diff --git a/test/stdlib/sampling/test_sofai_sampling.py b/test/stdlib/sampling/test_sofai_sampling.py
index bae95ff4a..33015f0f6 100644
--- a/test/stdlib/sampling/test_sofai_sampling.py
+++ b/test/stdlib/sampling/test_sofai_sampling.py
@@ -268,7 +268,7 @@ def test_fallback_without_tags(self):
 
 
 @pytest.mark.qualitative
 @pytest.mark.ollama
-@pytest.mark.llm
+@pytest.mark.e2e
 class TestSOFAIIntegration:
     """Integration tests for SOFAISamplingStrategy.
diff --git a/test/stdlib/sampling/test_think_budget_forcing.py b/test/stdlib/sampling/test_think_budget_forcing.py
index f4025f0d5..f5d5860b7 100644
--- a/test/stdlib/sampling/test_think_budget_forcing.py
+++ b/test/stdlib/sampling/test_think_budget_forcing.py
@@ -15,7 +15,7 @@
     pytest.mark.ollama,
     pytest.mark.requires_gpu,
     pytest.mark.requires_heavy_ram,
-    pytest.mark.llm,
+    pytest.mark.e2e,
     pytest.mark.qualitative,
 ]
 
diff --git a/test/stdlib/test_chat_view.py b/test/stdlib/test_chat_view.py
index 9d06f8fb9..7ded6495e 100644
--- a/test/stdlib/test_chat_view.py
+++ b/test/stdlib/test_chat_view.py
@@ -5,7 +5,7 @@
 from mellea.stdlib.session import start_session
 
 # Mark all tests as requiring Ollama (start_session defaults to Ollama)
-pytestmark = [pytest.mark.ollama, pytest.mark.llm]
+pytestmark = [pytest.mark.ollama, pytest.mark.e2e]
 
 
 @pytest.fixture(scope="function")
diff --git a/test/stdlib/test_functional.py b/test/stdlib/test_functional.py
index ac182596e..b55117f27 100644
--- a/test/stdlib/test_functional.py
+++ b/test/stdlib/test_functional.py
@@ -7,7 +7,7 @@
 from mellea.stdlib.requirements import req
 from mellea.stdlib.session import start_session
 
-pytestmark = [pytest.mark.ollama, pytest.mark.llm]
+pytestmark = [pytest.mark.ollama, pytest.mark.e2e]
 
 
 @pytest.fixture(scope="module")
diff --git a/test/stdlib/test_session.py b/test/stdlib/test_session.py
index 076c92747..80506b591 100644
--- a/test/stdlib/test_session.py
+++ b/test/stdlib/test_session.py
@@ -10,7 +10,7 @@
 from mellea.stdlib.session import MelleaSession, start_session
 
 # Mark all tests as requiring Ollama (start_session defaults to Ollama)
-pytestmark = [pytest.mark.ollama, pytest.mark.llm]
+pytestmark = [pytest.mark.ollama, pytest.mark.e2e]
 
 
 # We edit the context type in the async tests below. Don't change the scope here.
diff --git a/test/stdlib/test_spans.py b/test/stdlib/test_spans.py
index 8a20965e2..f39bd0e53 100644
--- a/test/stdlib/test_spans.py
+++ b/test/stdlib/test_spans.py
@@ -8,7 +8,12 @@
 from mellea.stdlib.session import MelleaSession, start_session
 
 # Module-level markers for all tests using Granite 4 hybrid micro (3B model)
-pytestmark = [pytest.mark.huggingface, pytest.mark.requires_gpu, pytest.mark.llm]
+pytestmark = [
+    pytest.mark.huggingface,
+    pytest.mark.requires_gpu,
+    pytest.mark.requires_heavy_ram,
+    pytest.mark.e2e,
+]
 
 
 # We edit the context type in the async tests below. Don't change the scope here.
diff --git a/test/telemetry/test_metrics_backend.py b/test/telemetry/test_metrics_backend.py
index 62016eafe..d8d79e6c1 100644
--- a/test/telemetry/test_metrics_backend.py
+++ b/test/telemetry/test_metrics_backend.py
@@ -106,7 +106,7 @@ def get_metric_value(metrics_data, metric_name, attributes=None):
 
 
 @pytest.mark.asyncio
-@pytest.mark.llm
+@pytest.mark.e2e
 @pytest.mark.ollama
 @pytest.mark.parametrize("stream", [False, True], ids=["non-streaming", "streaming"])
 async def test_ollama_token_metrics_integration(enable_metrics, metric_reader, stream):
@@ -160,7 +160,7 @@ async def test_ollama_token_metrics_integration(enable_metrics, metric_reader, s
 
 
 @pytest.mark.asyncio
-@pytest.mark.llm
+@pytest.mark.e2e
 @pytest.mark.ollama
 @pytest.mark.parametrize("stream", [False, True], ids=["non-streaming", "streaming"])
 async def test_openai_token_metrics_integration(enable_metrics, metric_reader, stream):
@@ -216,7 +216,7 @@ async def test_openai_token_metrics_integration(enable_metrics, metric_reader, s
 
 
 @pytest.mark.asyncio
-@pytest.mark.llm
+@pytest.mark.e2e
 @pytest.mark.watsonx
 @pytest.mark.requires_api_key
 async def test_watsonx_token_metrics_integration(enable_metrics, metric_reader):
@@ -266,7 +266,7 @@ async def test_watsonx_token_metrics_integration(enable_metrics, metric_reader):
 
 
 @pytest.mark.asyncio
-@pytest.mark.llm
+@pytest.mark.e2e
 @pytest.mark.litellm
 @pytest.mark.ollama
 @pytest.mark.parametrize("stream", [False, True], ids=["non-streaming", "streaming"])
@@ -329,7 +329,7 @@ async def test_litellm_token_metrics_integration(
 
 
 @pytest.mark.asyncio
-@pytest.mark.llm
+@pytest.mark.e2e
 @pytest.mark.huggingface
 @pytest.mark.parametrize("stream", [False, True], ids=["non-streaming", "streaming"])
 async def test_huggingface_token_metrics_integration(
diff --git a/test/telemetry/test_tracing.py b/test/telemetry/test_tracing.py
index 3e63e54d0..5ea1a9e00 100644
--- a/test/telemetry/test_tracing.py
+++ b/test/telemetry/test_tracing.py
@@ -109,6 +109,7 @@ def test_set_span_error_with_none_span():
         set_span_error(None, exception)
 
 
+@pytest.mark.e2e
 @pytest.mark.ollama
 def test_session_with_tracing_disabled():
     """Test that session works normally when tracing is disabled."""
@@ -119,6 +120,7 @@ def test_session_with_tracing_disabled():
     assert result is not None
 
 
+@pytest.mark.e2e
 @pytest.mark.ollama
 def test_session_with_application_tracing(enable_app_tracing):
     """Test that session works with application tracing enabled."""
@@ -130,6+132,7 @@ def test_session_with_application_tracing(enable_app_tracing):
     assert result is not None
 
 
+@pytest.mark.e2e
 @pytest.mark.ollama
 def test_session_with_backend_tracing(enable_backend_tracing):
     """Test that session works with backend tracing enabled."""
@@ -141,6+144,7 @@ def test_session_with_backend_tracing(enable_backend_tracing):
     assert result is not None
 
 
+@pytest.mark.e2e
 @pytest.mark.ollama
 def test_generative_function_with_tracing(enable_app_tracing):
     """Test that @generative functions work with tracing enabled."""
diff --git a/test/telemetry/test_tracing_backend.py b/test/telemetry/test_tracing_backend.py
index 1092bb593..044b336a8 100644
--- a/test/telemetry/test_tracing_backend.py
+++ b/test/telemetry/test_tracing_backend.py
@@ -27,9 +27,11 @@
 except ImportError:
     OTEL_AVAILABLE = False
 
-pytestmark = pytest.mark.skipif(
-    not OTEL_AVAILABLE, reason="OpenTelemetry not installed"
-)
+pytestmark = [
+    pytest.mark.skipif(not OTEL_AVAILABLE, reason="OpenTelemetry not installed"),
+    pytest.mark.e2e,
+    pytest.mark.ollama,
+]
 
 
 @pytest.fixture(scope="module", autouse=True)

From c3a56511923e9e995ea8ec684b7437c7103ee82f Mon Sep 17 00:00:00 2001
From: Nigel Jones
Date: Wed, 25 Mar 2026 12:43:03 +0000
Subject: [PATCH 09/42] feat: add estimate-vram skill and fix MPS VRAM
 detection

- New /estimate-vram agent skill that analyses test files to determine
  correct require_gpu(min_vram_gb=N) and require_ram(min_gb=N) values by
  tracing model IDs and looking up parameter counts dynamically
- Fix _gpu_vram_gb() in test/predicates.py to use
  torch.mps.recommended_max_memory() on macOS MPS instead of returning 0
- Fix get_system_capabilities() in test/conftest.py with same MPS path
- Update test/README.md with predicates table and legacy marker deprecation
- Add /estimate-vram cross-reference in audit-markers skill
---
 .agents/skills/audit-markers/SKILL.md |   6 +
 .agents/skills/estimate-vram/SKILL.md | 239 ++++++++++++++++++++++++++
 test/README.md                        |  28 ++-
 test/conftest.py                      |   8 +-
 test/predicates.py                    |  12 +-
 5 files changed, 286 insertions(+), 7 deletions(-)
 create mode 100644 .agents/skills/estimate-vram/SKILL.md

diff --git a/.agents/skills/audit-markers/SKILL.md b/.agents/skills/audit-markers/SKILL.md
index 3ecad67f6..f7b1ff762 100644
--- a/.agents/skills/audit-markers/SKILL.md
+++ b/.agents/skills/audit-markers/SKILL.md
@@ -346,6 +346,12 @@ Projects should provide a shared module of predicate functions that return
 (exact thresholds, specific env vars) without ad-hoc `skipif` or blunt
 resource markers scattered across files.
 
+### Determining `min_vram_gb` and `min_gb` values
+
+When migrating legacy `requires_gpu` or `requires_heavy_ram` markers to predicates,
+use the `/estimate-vram` skill to determine the correct `min_vram_gb` and `min_gb`
+values based on the model each test loads. Do not guess or use blanket thresholds.
+
 ### What to audit
 
 Check the project's predicate module (see Project References) for available
diff --git a/.agents/skills/estimate-vram/SKILL.md b/.agents/skills/estimate-vram/SKILL.md
new file mode 100644
index 000000000..82acec433
--- /dev/null
+++ b/.agents/skills/estimate-vram/SKILL.md
@@ -0,0 +1,239 @@
+---
+name: estimate-vram
+description: >
+  Estimate GPU VRAM and RAM requirements for test files by tracing model
+  identifiers and looking up parameter counts. Use when migrating legacy
+  requires_gpu/requires_heavy_ram markers to predicates, or adding resource
+  gating to new GPU tests. Produces a recommendations table with exact
+  require_gpu(min_vram_gb=N) and require_ram(min_gb=N) values.
+argument-hint: "[file-or-directory] [--precision fp16|fp32|int4|int8]"
+compatibility: "Claude Code, IBM Bob"
+metadata:
+  version: "2026-03-25"
+  capabilities: [read_file, bash, grep, glob]
+---
+
+# Estimate VRAM Requirements
+
+Analyse test files to determine appropriate `require_gpu(min_vram_gb=N)` and
+`require_ram(min_gb=N)` values for resource gating predicates.
+
+## Inputs
+
+- `$ARGUMENTS` — file path, directory, or glob. If empty, scan all test files
+  that have GPU-related markers or backend constructors.
+- `--precision` — override default precision assumption. One of `fp16` (default),
+  `fp32`, `int4`, `int8`.
+
+## Project References
+
+Read these before estimating — they provide model constants and predicate APIs:
+
+- **Model identifiers:** `mellea/backends/model_ids.py` (`ModelIdentifier` constants)
+- **Resource predicates:** `test/predicates.py` (available predicate functions)
+- **Marker conventions:** `test/MARKERS_GUIDE.md`
+
+---
+
+## Procedure
+
+### Step 1 — Find GPU-relevant test files
+
+If `$ARGUMENTS` is a specific file, use that. Otherwise grep across the target
+scope for any of these signals:
+
+- `requires_gpu`, `require_gpu` — existing or legacy resource gating
+- `requires_heavy_ram`, `require_ram` — existing or legacy RAM gating
+- `LocalHFBackend`, `LocalVLLMBackend` — local GPU backends
+- `.from_pretrained(` — direct model loading
+- `pytest.mark.huggingface`, `pytest.mark.vllm` — GPU backend markers
+
+Files with only `pytest.mark.ollama` or cloud backend markers (openai, watsonx,
+litellm, bedrock) do not need GPU gating analysis — skip them unless they also
+have a `requires_gpu` marker that may be wrong (like `test_genslot.py`).
+
+### Step 2 — Trace the model identifier
+
+For each file, determine which model(s) it loads. Check in order:
+
+1. **Module-level constants** — e.g. `BASE_MODEL = "ibm-granite/..."` or
+   `MODEL_ID = model_ids.QWEN3_0_6B`.
+2. **Fixture definitions** — trace `@pytest.fixture` functions for:
+   - `LocalHFBackend(model_id=...)` — extract the `model_id` argument
+   - `LocalVLLMBackend(model_id=...)` — extract the `model_id` argument
+   - `start_session("hf", model_id=...)` — extract `model_id`
+3. **ModelIdentifier resolution** — if the model_id is a constant like
+   `model_ids.QWEN3_0_6B`, read `mellea/backends/model_ids.py` and extract
+   the `hf_model_name` field.
+4. **Conftest fixtures** — check `conftest.py` files up the directory tree for
+   fixture definitions that provide model/backend instances.
+5. **Per-function overrides** — some files have different models per test function.
+   Track per-function when this occurs.
+
+Record the model ID as a HuggingFace repo name (e.g. `"ibm-granite/granite-4.0-micro"`)
+or an Ollama tag (e.g. `"granite4:micro-h"`) — whichever is available.
+
+### Step 3 — Look up parameter count
+
+Use three strategies in priority order. Stop at the first that succeeds.
+
+#### Strategy A: HuggingFace Hub API (preferred, requires network)
+
+If the model ID is an HF repo name and `huggingface_hub` is available:
+
+```python
+from huggingface_hub.utils._safetensors import get_safetensors_metadata
+meta = get_safetensors_metadata("ibm-granite/granite-3.3-8b-instruct")
+total_params = sum(meta.parameter_count.values())
+```
+
+This returns exact parameter counts by dtype. Use it when available.
+
+Run this via `uv run python -c "..."` — the agent does not need torch installed,
+only `huggingface_hub` (which is in the `[hf]` extra).
+
+If `huggingface_hub` is not installed or network is unavailable, fall through.
+
+#### Strategy B: Ollama model info (for Ollama-tagged models)
+
+If the model has an Ollama name and Ollama is running:
+
+```bash
+ollama show --modelfile 2>/dev/null | grep -i 'parameter'
+```
+
+Or parse the Ollama tag for size hints (see Strategy C).
+
+#### Strategy C: Model name parsing (offline fallback)
+
+Extract parameter count from common naming patterns in the model ID string.
+Match against these regex patterns (case-insensitive):
+
+| Pattern | Extract | Example match |
+|---------|---------|---------------|
+| `(\d+\.?\d*)b[-_.]` or `-(\d+\.?\d*)b` | N billion | `granite-3.3-8b` → 8B |
+| `(\d+\.?\d*)B` (capital B in HF names) | N billion | `Qwen3-0.6B` → 0.6B |
+| `-(\d+)m[-_.]` or `(\d+)m-` | N million ÷ 1000 | `granite-4.0-h-350m` → 0.35B |
+| `micro` without explicit size | 0.35B–3B | Check ModelIdentifier catalog |
+| `tiny` without explicit size | 1B–7B | Check ModelIdentifier catalog |
+| `small` without explicit size | 3B–8B | Check ModelIdentifier catalog |
+
+When the name is ambiguous (e.g. `granite4:micro-h` has no explicit number),
+resolve via the `ModelIdentifier` constant in `model_ids.py` — the HF name
+usually contains the explicit size.
+
+#### Strategy D: Conservative default (last resort)
+
+If the model cannot be identified after A–C:
+- Assume **8B parameters** (16 GB at fp16)
+- Flag the file as **"model unidentified — manual review needed"**
+
+### Step 4 — Determine backend type
+
+The backend determines whether GPU gating is needed at all:
+
+| Backend | GPU loaded locally? | Predicate needed |
+|---------|--------------------|--------------------|
+| `LocalHFBackend` | Yes | `require_gpu(min_vram_gb=N)` |
+| `LocalVLLMBackend` | Yes | `require_gpu(min_vram_gb=N)` |
+| `OllamaModelBackend` | Managed by Ollama | `require_ollama()` only. Exception: models >8B through Ollama may need `require_ram(min_gb=N)` for the Ollama server process. |
+| `OpenAIBackend` (real API) | No | No GPU gate |
+| `OpenAIBackend` → Ollama `/v1` | Managed by Ollama | `require_ollama()` only |
+| `WatsonxAIBackend` | No | No GPU gate |
+| `LiteLLMBackend` | No | No GPU gate |
+| Cloud API (Bedrock, etc.) | No | No GPU gate |
+
+**Key rule:** Ollama manages its own GPU memory. Tests using Ollama backends
+should use `require_ollama()`, NOT `require_gpu()`. The only exception is when
+a test needs guaranteed GPU performance (rare) or uses a very large model where
+insufficient system RAM would cause Ollama to fail.
+
+### Step 5 — Compute VRAM and RAM estimates
+
+#### VRAM formula
+
+```
+vram_gb = params_B × bytes_per_param × 1.2
+```
+
+Where:
+- `params_B` = parameter count in billions
+- `bytes_per_param` depends on precision:
+  - fp32: 4.0
+  - fp16/bf16: 2.0 (default assumption)
+  - int8: 1.0
+  - int4: 0.5
+- `1.2` = 20% overhead for KV cache, activations, framework buffers
+
+Round `min_vram_gb` **up** to the next even integer.
+
+#### RAM formula
+
+For local GPU backends (HF, vLLM):
+```
+min_ram_gb = max(16, vram_gb + 8)
+```
+
+The `+ 8` accounts for OS, Python runtime, data loading, and test framework.
+Minimum 16 GB because the test environment needs a working OS + IDE.
+
+For Ollama backends with large models (>8B):
+```
+min_ram_gb = max(16, vram_gb + 12)
+```
+
+The `+ 12` accounts for the Ollama server process overhead on top of the model.
+
+#### GPU isolation
+
+If a test uses `LocalHFBackend` or `LocalVLLMBackend`, recommend
+`require_gpu_isolation()` in addition to `require_gpu()`. These backends hold
+GPU memory at the process level and need subprocess isolation for multi-module
+test runs.
+
+### Step 6 — Output recommendations
+
+#### Summary table (always print first)
+
+```
+| File | Model | Params | Backend | VRAM (GB) | Recommended predicates |
+|------|-------|--------|---------|-----------|------------------------|
+```
+
+Each row should show:
+- File path (relative to repo root)
+- Model identifier (short form)
+- Parameter count (e.g. "8B", "350M")
+- Backend type (HF, vLLM, Ollama, API)
+- Computed VRAM at the selected precision
+- The exact predicate call(s) to use
+
+#### Flag categories
+
+| Flag | Meaning |
+|------|---------|
+| `model unidentified` | Strategies A–C all failed. Default 8B applied. Manual review needed. |
+| `remove GPU gate` | Test uses Ollama/API backend — `require_gpu()` is unnecessary. |
+| `multi-GPU required` | VRAM exceeds 48 GB — cannot run on single consumer GPU. |
+| `verify precision` | Default fp16 assumed but test may use quantisation. |
+
+#### Footer
+
+```
+---
+Files analysed: N | Estimates computed: N | Manual review needed: N
+Precision: fp16 (default) | Override with --precision
+```
+
+---
+
+## Scope boundaries
+
+This skill does NOT:
+- Modify test files (it only produces recommendations)
+- Run the actual models
+- Modify `predicates.py` or `conftest.py`
+- Determine tier classification (use `/audit-markers` for that)
+
+The output feeds into `/audit-markers` when migrating legacy resource markers
+to predicates.
diff --git a/test/README.md b/test/README.md
index 00b620154..a8c9df3c3 100644
--- a/test/README.md
+++ b/test/README.md
@@ -32,7 +32,7 @@ Heavy GPU backends (HuggingFace, vLLM) hold GPU memory at the process level. Eve
 
 Process isolation is **opt-in** via the `--isolate-heavy` flag or `CICD=1` environment variable. When enabled, the collection hook in `test/conftest.py`:
 
-1. Detects modules marked with `@pytest.mark.requires_gpu_isolation`
+1. Detects modules gated with `require_gpu_isolation()` (or the deprecated `@pytest.mark.requires_gpu_isolation`)
 2. Runs each marked module in a separate subprocess
 3. Sets required environment variables (`PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`)
 4. Ensures full GPU memory release between modules
@@ -145,11 +145,31 @@ See [`MARKERS_GUIDE.md`](MARKERS_GUIDE.md) for complete marker documentation.
 Key markers for GPU testing:
 
 - `@pytest.mark.vllm` - Requires vLLM backend (local, GPU required)
 - `@pytest.mark.huggingface` - Requires HuggingFace backend (local, GPU-heavy)
-- `@pytest.mark.requires_gpu` - Requires GPU hardware (capability check)
-- `@pytest.mark.requires_gpu_isolation` - Requires OS-level process isolation for GPU memory (execution strategy)
-- `@pytest.mark.requires_heavy_ram` - Requires 48GB+ RAM
 - `@pytest.mark.slow` - Tests taking >5 minutes
 
+### Resource gating (predicates)
+
+Use predicate functions from `test/predicates.py` for resource gating:
+
+```python
+from test.predicates import require_gpu, require_ram, require_gpu_isolation
+
+pytestmark = [pytest.mark.e2e, pytest.mark.huggingface, require_gpu(), require_ram(min_gb=48)]
+```
+
+| Predicate | Use when test needs |
+| --------- | ------------------- |
+| `require_gpu()` | Any GPU (CUDA or MPS) |
+| `require_gpu(min_vram_gb=N)` | GPU with at least N GB VRAM |
+| `require_ram(min_gb=N)` | N GB+ system RAM |
+| `require_gpu_isolation()` | Subprocess isolation for CUDA memory |
+| `require_api_key("ENV_VAR")` | Specific API credentials |
+
+> **Deprecated:** The markers `requires_gpu`, `requires_heavy_ram`, `requires_api_key`,
+> and `requires_gpu_isolation` are deprecated. Existing tests using them still work
+> (conftest auto-skip handles them) but new tests must use predicates. Migrate legacy
+> markers to predicates when touching those files.
+
 ## Coverage
 
 Coverage reports are generated in `htmlcov/` and `coverage.json`.
diff --git a/test/conftest.py b/test/conftest.py
index b86b4a50c..987dabb7f 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -89,7 +89,13 @@ def get_system_capabilities():
                 capabilities["gpu_memory_gb"] = float(result.stdout.strip()) / 1024
         except Exception:
             pass
-    # Note: MPS doesn't provide easy memory query, leave at 0
+    elif has_mps:
+        try:
+            capabilities["gpu_memory_gb"] = torch.mps.recommended_max_memory() / (
+                1024**3
+            )
+        except (RuntimeError, AttributeError):
+            pass
 
     # Detect RAM
     if HAS_PSUTIL:
diff --git a/test/predicates.py b/test/predicates.py
index b7d10d085..86f33a6f7 100644
--- a/test/predicates.py
+++ b/test/predicates.py
@@ -47,13 +47,21 @@ def _gpu_available() -> bool:
 
 
 def _gpu_vram_gb() -> float:
-    """Return VRAM in GB for the first CUDA device, or 0 if unavailable."""
+    """Return usable GPU VRAM in GB, or 0 if unavailable.
+
+    On CUDA: reports device 0 total memory.
+    On macOS MPS: reports ``recommendedMaxWorkingSetSize`` — the Metal
+    driver's own estimate of how much unified memory the GPU can use
+    without degrading system performance.
+    """
     try:
         import torch
 
         if torch.cuda.is_available():
             return torch.cuda.get_device_properties(0).total_memory / (1024**3)
-    except (ImportError, RuntimeError):
+        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            return torch.mps.recommended_max_memory() / (1024**3)
+    except (ImportError, RuntimeError, AttributeError):
         pass
     return 0.0

From 6a3d6f31ef18f2440ec7368cc8e5cdd4fb397b32 Mon Sep 17 00:00:00 2001
From: Nigel Jones
Date: Wed, 25 Mar 2026 12:54:38 +0000
Subject: [PATCH 10/42] refactor: fold estimate-vram into audit-markers skill

VRAM estimation is only useful during marker audits, not standalone.
Move the model-tracing and VRAM computation procedure into the
audit-markers resource gating section and delete the separate skill.
---
 .agents/skills/audit-markers/SKILL.md | 103 ++++++++++-
 .agents/skills/estimate-vram/SKILL.md | 239 --------------------------
 2 files changed, 99 insertions(+), 243 deletions(-)
 delete mode 100644 .agents/skills/estimate-vram/SKILL.md

diff --git a/.agents/skills/audit-markers/SKILL.md b/.agents/skills/audit-markers/SKILL.md
index f7b1ff762..f71733d1f 100644
--- a/.agents/skills/audit-markers/SKILL.md
+++ b/.agents/skills/audit-markers/SKILL.md
@@ -3,8 +3,9 @@ name: audit-markers
 description: >
   Audit and fix pytest markers on test files and examples. Classifies tests as
   unit/integration/e2e/qualitative using general heuristics and project-specific
-  marker rules. Use when reviewing markers, auditing test files, or checking
-  before commit. References test/MARKERS_GUIDE.md for project conventions.
+  marker rules. Estimates GPU VRAM and RAM requirements by tracing model
+  identifiers and looking up parameter counts. Use when reviewing markers,
+  auditing test files, or checking before commit.
 argument-hint: "[file-or-directory] [--dry-run | --apply]"
 compatibility: "Claude Code, IBM Bob"
 metadata:
@@ -349,8 +350,102 @@ resource markers scattered across files.
 
 ### Determining `min_vram_gb` and `min_gb` values
 
 When migrating legacy `requires_gpu` or `requires_heavy_ram` markers to predicates,
-use the `/estimate-vram` skill to determine the correct `min_vram_gb` and `min_gb`
-values based on the model each test loads. Do not guess or use blanket thresholds.
+do not guess or use blanket thresholds. Determine the correct values by tracing the
+model each test loads and computing VRAM requirements from parameter counts.
+
+#### Trace the model identifier
+
+For each file needing GPU/RAM gating, determine which model(s) it loads. Check in order:
+
+1. **Module-level constants** — e.g. `BASE_MODEL = "ibm-granite/..."` or
+   `MODEL_ID = model_ids.QWEN3_0_6B`.
+2. **Fixture definitions** — trace `@pytest.fixture` functions for:
+   - `LocalHFBackend(model_id=...)` — extract the `model_id` argument
+   - `LocalVLLMBackend(model_id=...)` — extract the `model_id` argument
+   - `start_session("hf", model_id=...)` — extract `model_id`
+3. **ModelIdentifier resolution** — if the model_id is a constant like
+   `model_ids.QWEN3_0_6B`, read `mellea/backends/model_ids.py` and extract
+   the `hf_model_name` field.
+4. **Conftest fixtures** — check `conftest.py` files up the directory tree for
+   fixture definitions that provide model/backend instances.
+5. **Per-function overrides** — some files have different models per test function.
+   Track per-function when this occurs.
+
+#### Look up parameter count
+
+Use these strategies in priority order. Stop at the first that succeeds.
+
+**Strategy A: HuggingFace Hub API** (preferred, requires network)
+
+```python
+from huggingface_hub.utils._safetensors import get_safetensors_metadata
+meta = get_safetensors_metadata("ibm-granite/granite-3.3-8b-instruct")
+total_params = sum(meta.parameter_count.values())
+```
+
+Run via `uv run python -c "..."` — only needs `huggingface_hub` (in the `[hf]` extra).
+
+**Strategy B: Ollama model info** (for Ollama-tagged models)
+
+```bash
+ollama show <model> 2>/dev/null | grep -i 'parameter'
+```
+
+**Strategy C: Model name parsing** (offline fallback)
+
+| Pattern | Extract | Example match |
+|---------|---------|---------------|
+| `(\d+\.?\d*)b[-_.]` or `-(\d+\.?\d*)b` | N billion | `granite-3.3-8b` → 8B |
+| `(\d+\.?\d*)B` (capital B in HF names) | N billion | `Qwen3-0.6B` → 0.6B |
+| `-(\d+)m[-_.]` or `(\d+)m-` | N million ÷ 1000 | `granite-4.0-h-350m` → 0.35B |
+| `micro` without explicit size | 0.35B–3B | Check ModelIdentifier catalog |
+
+When the name is ambiguous (e.g. `granite4:micro-h`), resolve via the
+`ModelIdentifier` constant in `model_ids.py` — the HF name usually contains
+the explicit size.
+ +**Strategy D: Conservative default** (last resort) +- Assume **8B parameters** (16 GB at fp16) +- Flag as **"model unidentified — manual review needed"** + +#### Backend determines GPU gating need + +| Backend | GPU loaded locally? | Predicate needed | +|---------|--------------------|--------------------| +| `LocalHFBackend` | Yes | `require_gpu(min_vram_gb=N)` | +| `LocalVLLMBackend` | Yes | `require_gpu(min_vram_gb=N)` | +| `OllamaModelBackend` | Managed by Ollama | `require_ollama()` only. Exception: models >8B through Ollama may need `require_ram(min_gb=N)` for the server process. | +| `OpenAIBackend` (real API) | No | No GPU gate | +| `OpenAIBackend` → Ollama `/v1` | Managed by Ollama | `require_ollama()` only | +| `WatsonxAIBackend` / `LiteLLMBackend` / Cloud | No | No GPU gate | + +**Key rule:** Ollama manages its own GPU memory. Tests using Ollama backends +should use `require_ollama()`, NOT `require_gpu()`. + +#### Compute VRAM and RAM estimates + +**VRAM formula:** +``` +vram_gb = params_B × bytes_per_param × 1.2 +``` + +Where `bytes_per_param` depends on precision: fp32=4.0, fp16/bf16=2.0 (default), +int8=1.0, int4=0.5. The 1.2 multiplier covers KV cache, activations, and framework +buffers. Round `min_vram_gb` **up** to the next even integer. + +**RAM formula** (local GPU backends — HF, vLLM): +``` +min_ram_gb = max(16, vram_gb + 8) +``` + +For Ollama backends with large models (>8B): +``` +min_ram_gb = max(16, vram_gb + 12) +``` + +**GPU isolation:** If a test uses `LocalHFBackend` or `LocalVLLMBackend`, recommend +`require_gpu_isolation()` in addition to `require_gpu()`. These backends hold GPU +memory at the process level and need subprocess isolation for multi-module test runs. 
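The VRAM formula above can be sketched as a one-line helper — an illustrative aid, not code that exists in the repo:

```python
import math

# bytes per parameter by precision (fp16/bf16 is the default assumption)
BYTES_PER_PARAM = {"fp32": 4.0, "fp16": 2.0, "bf16": 2.0, "int8": 1.0, "int4": 0.5}


def min_vram_gb(params_b: float, precision: str = "fp16") -> int:
    """params_B x bytes_per_param x 1.2 overhead, rounded up to the next even GB."""
    raw_gb = params_b * BYTES_PER_PARAM[precision] * 1.2
    return math.ceil(raw_gb / 2) * 2
```

At fp16 this yields 20 GB for an 8B model (19.2 rounded up), 18 GB for 7B, and 8 GB for 3B — consistent with the worked thresholds used when migrating the test files.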
### What to audit diff --git a/.agents/skills/estimate-vram/SKILL.md b/.agents/skills/estimate-vram/SKILL.md deleted file mode 100644 index 82acec433..000000000 --- a/.agents/skills/estimate-vram/SKILL.md +++ /dev/null @@ -1,239 +0,0 @@ ---- -name: estimate-vram -description: > - Estimate GPU VRAM and RAM requirements for test files by tracing model - identifiers and looking up parameter counts. Use when migrating legacy - requires_gpu/requires_heavy_ram markers to predicates, or adding resource - gating to new GPU tests. Produces a recommendations table with exact - require_gpu(min_vram_gb=N) and require_ram(min_gb=N) values. -argument-hint: "[file-or-directory] [--precision fp16|fp32|int4|int8]" -compatibility: "Claude Code, IBM Bob" -metadata: - version: "2026-03-25" - capabilities: [read_file, bash, grep, glob] ---- - -# Estimate VRAM Requirements - -Analyse test files to determine appropriate `require_gpu(min_vram_gb=N)` and -`require_ram(min_gb=N)` values for resource gating predicates. - -## Inputs - -- `$ARGUMENTS` — file path, directory, or glob. If empty, scan all test files - that have GPU-related markers or backend constructors. -- `--precision` — override default precision assumption. One of `fp16` (default), - `fp32`, `int4`, `int8`. - -## Project References - -Read these before estimating — they provide model constants and predicate APIs: - -- **Model identifiers:** `mellea/backends/model_ids.py` (`ModelIdentifier` constants) -- **Resource predicates:** `test/predicates.py` (available predicate functions) -- **Marker conventions:** `test/MARKERS_GUIDE.md` - ---- - -## Procedure - -### Step 1 — Find GPU-relevant test files - -If `$ARGUMENTS` is a specific file, use that. 
Otherwise grep across the target -scope for any of these signals: - -- `requires_gpu`, `require_gpu` — existing or legacy resource gating -- `requires_heavy_ram`, `require_ram` — existing or legacy RAM gating -- `LocalHFBackend`, `LocalVLLMBackend` — local GPU backends -- `.from_pretrained(` — direct model loading -- `pytest.mark.huggingface`, `pytest.mark.vllm` — GPU backend markers - -Files with only `pytest.mark.ollama` or cloud backend markers (openai, watsonx, -litellm, bedrock) do not need GPU gating analysis — skip them unless they also -have a `requires_gpu` marker that may be wrong (like `test_genslot.py`). - -### Step 2 — Trace the model identifier - -For each file, determine which model(s) it loads. Check in order: - -1. **Module-level constants** — e.g. `BASE_MODEL = "ibm-granite/..."` or - `MODEL_ID = model_ids.QWEN3_0_6B`. -2. **Fixture definitions** — trace `@pytest.fixture` functions for: - - `LocalHFBackend(model_id=...)` — extract the `model_id` argument - - `LocalVLLMBackend(model_id=...)` — extract the `model_id` argument - - `start_session("hf", model_id=...)` — extract `model_id` -3. **ModelIdentifier resolution** — if the model_id is a constant like - `model_ids.QWEN3_0_6B`, read `mellea/backends/model_ids.py` and extract - the `hf_model_name` field. -4. **Conftest fixtures** — check `conftest.py` files up the directory tree for - fixture definitions that provide model/backend instances. -5. **Per-function overrides** — some files have different models per test function. - Track per-function when this occurs. - -Record the model ID as a HuggingFace repo name (e.g. `"ibm-granite/granite-4.0-micro"`) -or an Ollama tag (e.g. `"granite4:micro-h"`) — whichever is available. - -### Step 3 — Look up parameter count - -Use three strategies in priority order. Stop at the first that succeeds. 
- -#### Strategy A: HuggingFace Hub API (preferred, requires network) - -If the model ID is an HF repo name and `huggingface_hub` is available: - -```python -from huggingface_hub.utils._safetensors import get_safetensors_metadata -meta = get_safetensors_metadata("ibm-granite/granite-3.3-8b-instruct") -total_params = sum(meta.parameter_count.values()) -``` - -This returns exact parameter counts by dtype. Use it when available. - -Run this via `uv run python -c "..."` — the agent does not need torch installed, -only `huggingface_hub` (which is in the `[hf]` extra). - -If `huggingface_hub` is not installed or network is unavailable, fall through. - -#### Strategy B: Ollama model info (for Ollama-tagged models) - -If the model has an Ollama name and Ollama is running: - -```bash -ollama show --modelfile 2>/dev/null | grep -i 'parameter' -``` - -Or parse the Ollama tag for size hints (see Strategy C). - -#### Strategy C: Model name parsing (offline fallback) - -Extract parameter count from common naming patterns in the model ID string. -Match against these regex patterns (case-insensitive): - -| Pattern | Extract | Example match | -|---------|---------|---------------| -| `(\d+\.?\d*)b[-_.]` or `-(\d+\.?\d*)b` | N billion | `granite-3.3-8b` → 8B | -| `(\d+\.?\d*)B` (capital B in HF names) | N billion | `Qwen3-0.6B` → 0.6B | -| `-(\d+)m[-_.]` or `(\d+)m-` | N million ÷ 1000 | `granite-4.0-h-350m` → 0.35B | -| `micro` without explicit size | 0.35B–3B | Check ModelIdentifier catalog | -| `tiny` without explicit size | 1B–7B | Check ModelIdentifier catalog | -| `small` without explicit size | 3B–8B | Check ModelIdentifier catalog | - -When the name is ambiguous (e.g. `granite4:micro-h` has no explicit number), -resolve via the `ModelIdentifier` constant in `model_ids.py` — the HF name -usually contains the explicit size. 
- -#### Strategy D: Conservative default (last resort) - -If the model cannot be identified after A–C: -- Assume **8B parameters** (16 GB at fp16) -- Flag the file as **"model unidentified — manual review needed"** - -### Step 4 — Determine backend type - -The backend determines whether GPU gating is needed at all: - -| Backend | GPU loaded locally? | Predicate needed | -|---------|--------------------|--------------------| -| `LocalHFBackend` | Yes | `require_gpu(min_vram_gb=N)` | -| `LocalVLLMBackend` | Yes | `require_gpu(min_vram_gb=N)` | -| `OllamaModelBackend` | Managed by Ollama | `require_ollama()` only. Exception: models >8B through Ollama may need `require_ram(min_gb=N)` for the Ollama server process. | -| `OpenAIBackend` (real API) | No | No GPU gate | -| `OpenAIBackend` → Ollama `/v1` | Managed by Ollama | `require_ollama()` only | -| `WatsonxAIBackend` | No | No GPU gate | -| `LiteLLMBackend` | No | No GPU gate | -| Cloud API (Bedrock, etc.) | No | No GPU gate | - -**Key rule:** Ollama manages its own GPU memory. Tests using Ollama backends -should use `require_ollama()`, NOT `require_gpu()`. The only exception is when -a test needs guaranteed GPU performance (rare) or uses a very large model where -insufficient system RAM would cause Ollama to fail. - -### Step 5 — Compute VRAM and RAM estimates - -#### VRAM formula - -``` -vram_gb = params_B × bytes_per_param × 1.2 -``` - -Where: -- `params_B` = parameter count in billions -- `bytes_per_param` depends on precision: - - fp32: 4.0 - - fp16/bf16: 2.0 (default assumption) - - int8: 1.0 - - int4: 0.5 -- `1.2` = 20% overhead for KV cache, activations, framework buffers - -Round `min_vram_gb` **up** to the next even integer. - -#### RAM formula - -For local GPU backends (HF, vLLM): -``` -min_ram_gb = max(16, vram_gb + 8) -``` - -The `+ 8` accounts for OS, Python runtime, data loading, and test framework. -Minimum 16 GB because the test environment needs a working OS + IDE. 
- -For Ollama backends with large models (>8B): -``` -min_ram_gb = max(16, vram_gb + 12) -``` - -The `+ 12` accounts for the Ollama server process overhead on top of the model. - -#### GPU isolation - -If a test uses `LocalHFBackend` or `LocalVLLMBackend`, recommend -`require_gpu_isolation()` in addition to `require_gpu()`. These backends hold -GPU memory at the process level and need subprocess isolation for multi-module -test runs. - -### Step 6 — Output recommendations - -#### Summary table (always print first) - -``` -| File | Model | Params | Backend | VRAM (GB) | Recommended predicates | -|------|-------|--------|---------|-----------|------------------------| -``` - -Each row should show: -- File path (relative to repo root) -- Model identifier (short form) -- Parameter count (e.g. "8B", "350M") -- Backend type (HF, vLLM, Ollama, API) -- Computed VRAM at the selected precision -- The exact predicate call(s) to use - -#### Flag categories - -| Flag | Meaning | -|------|---------| -| `model unidentified` | Strategies A–C all failed. Default 8B applied. Manual review needed. | -| `remove GPU gate` | Test uses Ollama/API backend — `require_gpu()` is unnecessary. | -| `multi-GPU required` | VRAM exceeds 48 GB — cannot run on single consumer GPU. | -| `verify precision` | Default fp16 assumed but test may use quantisation. | - -#### Footer - -``` ---- -Files analysed: N | Estimates computed: N | Manual review needed: N -Precision: fp16 (default) | Override with --precision -``` - ---- - -## Scope boundaries - -This skill does NOT: -- Modify test files (it only produces recommendations) -- Run the actual models -- Modify `predicates.py` or `conftest.py` -- Determine tier classification (use `/audit-markers` for that) - -The output feeds into `/audit-markers` when migrating legacy resource markers -to predicates. 
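The predicate migration both skills describe reduces to a skip decision computed from the environment. As a hypothetical sketch of the credential check behind a `require_api_key`-style predicate (the real implementation lives in `test/predicates.py`):

```python
import os


def missing_api_keys(*env_vars: str) -> list[str]:
    """Return the credential environment variables that are unset or empty."""
    return [v for v in env_vars if not os.environ.get(v)]


# A predicate such as require_api_key("WATSONX_API_KEY", "WATSONX_URL") would
# wrap this result in pytest.mark.skipif(bool(missing), reason=...), so the
# test skips with an explicit list of what is missing instead of erroring.
```

Keeping the decision logic as a pure function makes the skip condition trivially testable without a running pytest session.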
From 32885411ae69fb7134a3fcf34c07300cfd4af997 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 25 Mar 2026 13:19:08 +0000 Subject: [PATCH 11/42] docs: drop isolation refs and fix RAM guidance in markers docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit requires_heavy_ram and requires_gpu_isolation are deprecated with no replacement — models load into VRAM not system RAM, and GPU isolation is now automatic. require_ram() stays available for genuinely RAM-bound tests but has no current use case. --- .agents/skills/audit-markers/SKILL.md | 57 ++++++++++++--------------- test/MARKERS_GUIDE.md | 23 ++++++----- 2 files changed, 38 insertions(+), 42 deletions(-) diff --git a/.agents/skills/audit-markers/SKILL.md b/.agents/skills/audit-markers/SKILL.md index f71733d1f..3c1029164 100644 --- a/.agents/skills/audit-markers/SKILL.md +++ b/.agents/skills/audit-markers/SKILL.md @@ -349,9 +349,10 @@ resource markers scattered across files. ### Determining `min_vram_gb` and `min_gb` values -When migrating legacy `requires_gpu` or `requires_heavy_ram` markers to predicates, -do not guess or use blanket thresholds. Determine the correct values by tracing the -model each test loads and computing VRAM requirements from parameter counts. +When migrating legacy `requires_gpu` markers to predicates, do not guess or use +blanket thresholds. Determine the correct `min_vram_gb` by tracing the model each +test loads and computing VRAM requirements from parameter counts. Legacy +`requires_heavy_ram` markers should simply be removed (see "What to audit" below). #### Trace the model identifier @@ -414,7 +415,7 @@ the explicit size. |---------|--------------------|--------------------| | `LocalHFBackend` | Yes | `require_gpu(min_vram_gb=N)` | | `LocalVLLMBackend` | Yes | `require_gpu(min_vram_gb=N)` | -| `OllamaModelBackend` | Managed by Ollama | `require_ollama()` only. 
Exception: models >8B through Ollama may need `require_ram(min_gb=N)` for the server process. | +| `OllamaModelBackend` | Managed by Ollama | `require_ollama()` only | | `OpenAIBackend` (real API) | No | No GPU gate | | `OpenAIBackend` → Ollama `/v1` | Managed by Ollama | `require_ollama()` only | | `WatsonxAIBackend` / `LiteLLMBackend` / Cloud | No | No GPU gate | @@ -422,7 +423,7 @@ the explicit size. **Key rule:** Ollama manages its own GPU memory. Tests using Ollama backends should use `require_ollama()`, NOT `require_gpu()`. -#### Compute VRAM and RAM estimates +#### Compute VRAM estimate **VRAM formula:** ``` @@ -433,19 +434,9 @@ Where `bytes_per_param` depends on precision: fp32=4.0, fp16/bf16=2.0 (default), int8=1.0, int4=0.5. The 1.2 multiplier covers KV cache, activations, and framework buffers. Round `min_vram_gb` **up** to the next even integer. -**RAM formula** (local GPU backends — HF, vLLM): -``` -min_ram_gb = max(16, vram_gb + 8) -``` - -For Ollama backends with large models (>8B): -``` -min_ram_gb = max(16, vram_gb + 12) -``` - -**GPU isolation:** If a test uses `LocalHFBackend` or `LocalVLLMBackend`, recommend -`require_gpu_isolation()` in addition to `require_gpu()`. These backends hold GPU -memory at the process level and need subprocess isolation for multi-module test runs. +Models load into GPU VRAM, not system RAM — do **not** add `require_ram()` to GPU +tests. The `require_ram()` predicate exists for tests that are genuinely RAM-bound +(large dataset processing, etc.), not as a companion to `require_gpu()`. ### What to audit @@ -453,21 +444,24 @@ Check the project's predicate module (see Project References) for available predicates, then apply the following checks to every e2e/qualitative file: 1. 
**Legacy resource markers → migrate to predicates.** If a test uses - `@pytest.mark.requires_gpu`, `@pytest.mark.requires_heavy_ram`, - `@pytest.mark.requires_api_key`, or `@pytest.mark.requires_gpu_isolation`, + `@pytest.mark.requires_gpu` or `@pytest.mark.requires_api_key`, replace with the equivalent predicate from the project's predicate module. - Resource markers are deprecated in favour of predicates. This is a **fix** - (same priority as `llm` → `e2e`), not just a recommendation — apply it in - Step 4 like any other marker fix. The replacement requires adding an import - for the predicate and swapping the marker in the `pytestmark` list or - decorator. + If a test uses `@pytest.mark.requires_heavy_ram` or + `@pytest.mark.requires_gpu_isolation`, **remove** them — these are + deprecated markers with no direct replacement (`requires_heavy_ram` was a + blanket 48 GB threshold that conflated VRAM with RAM; GPU isolation is now + automatic). Resource markers are deprecated in favour of predicates. This + is a **fix** (same priority as `llm` → `e2e`), not just a recommendation — + apply it in Step 4 like any other marker fix. The replacement requires + adding an import for the predicate and swapping the marker in the + `pytestmark` list or decorator. 2. **Ad-hoc `skipif` → migrate to predicate.** If a predicate exists for the same check (e.g., `require_gpu()` exists but the test has a raw `skipif(not torch.cuda.is_available())`), replace with the predicate. 3. **Missing gating.** A test that uses a GPU backend but has no GPU predicate and no `skipif` — add the appropriate predicate. -4. **Imprecise gating.** A predicate that's too broad (e.g., `require_ram(48)` - on a test that only needs 16 GB) — tighten the threshold. +4. **Imprecise gating.** A predicate that's too broad for the actual model + being loaded — tighten the `min_vram_gb` threshold. 5. 
**Redundant CICD `skipif`.** `skipif(CICD == 1)` is usually redundant when conftest auto-skip or predicates already handle the condition. Flag as removable. @@ -499,8 +493,7 @@ Read `test/predicates.py` for the available predicates. Expected patterns: |---|---| | `require_gpu()` | Any GPU (CUDA or MPS) | | `require_gpu(min_vram_gb=N)` | GPU with at least N GB VRAM | -| `require_ram(min_gb=N)` | N GB+ system RAM | -| `require_gpu_isolation()` | Subprocess isolation for CUDA memory | +| `require_ram(min_gb=N)` | N GB+ system RAM (for genuinely RAM-bound tests, not GPU model loading) | | `require_api_key("OPENAI_API_KEY")` | Specific API credentials | | `require_api_key("WATSONX_API_KEY", "WATSONX_URL", "WATSONX_PROJECT_ID")` | Multiple credentials | | `require_package("cpex.framework")` | Optional dependency | @@ -509,8 +502,8 @@ Read `test/predicates.py` for the available predicates. Expected patterns: Typical combinations for backends: -- `huggingface` → `require_gpu()` + `require_ram(48)` (adjust RAM per model) -- `vllm` → `require_gpu(min_vram_gb=24)` + `require_ram(48)` +- `huggingface` → `require_gpu(min_vram_gb=N)` (compute N from model params) +- `vllm` → `require_gpu(min_vram_gb=N)` (compute N from model params) - `watsonx` → `require_api_key("WATSONX_API_KEY", "WATSONX_URL", "WATSONX_PROJECT_ID")` - `openai` → `require_api_key("OPENAI_API_KEY")` only for real OpenAI (not Ollama-compat) @@ -724,5 +717,5 @@ flag as a blocker, don't silently re-add: over it. New backends are added by inserting one entry into the dict. `pyproject.toml` and `test/MARKERS_GUIDE.md` must stay in sync manually. - **Resource predicates:** `test/predicates.py` provides `require_gpu`, - `require_ram`, `require_gpu_isolation`, `require_api_key`, `require_package`, + `require_ram`, `require_api_key`, `require_package`, `require_ollama`, `require_python`. 
diff --git a/test/MARKERS_GUIDE.md b/test/MARKERS_GUIDE.md index e6fa3877b..11a2ac212 100644 --- a/test/MARKERS_GUIDE.md +++ b/test/MARKERS_GUIDE.md @@ -124,8 +124,8 @@ tests don't need real backends. | `ollama` | Ollama (port 11434) | Local, light (~2-4GB RAM) | | `openai` | OpenAI API or compatible | API calls (may use Ollama `/v1`) | | `watsonx` | Watsonx API | API calls, requires credentials | -| `huggingface` | HuggingFace transformers | Local, GPU, 48GB+ RAM | -| `vllm` | vLLM | Local, GPU required, 48GB+ RAM | +| `huggingface` | HuggingFace transformers | Local, GPU required | +| `vllm` | vLLM | Local, GPU required | | `litellm` | LiteLLM (wraps other backends)| Depends on underlying backend | | `bedrock` | AWS Bedrock | API calls, requires credentials | @@ -145,15 +145,14 @@ infrastructure is absent. Use **predicate decorators** from `test/predicates.py` — they give test authors precise control over skip conditions. ```python -from test.predicates import require_gpu, require_ram, require_api_key +from test.predicates import require_gpu, require_api_key ``` | Predicate | Use when test needs | | --------- | ------------------- | | `require_gpu()` | Any GPU (CUDA or MPS) | | `require_gpu(min_vram_gb=N)` | GPU with at least N GB VRAM | -| `require_ram(min_gb=N)` | N GB+ system RAM | -| `require_gpu_isolation()` | Subprocess isolation for CUDA memory | +| `require_ram(min_gb=N)` | N GB+ system RAM (genuinely RAM-bound tests only) | | `require_api_key("ENV_VAR")` | Specific API credentials | | `require_package("pkg")` | Optional dependency | | `require_ollama()` | Running Ollama server | @@ -161,8 +160,8 @@ from test.predicates import require_gpu, require_ram, require_api_key ### Typical combinations -- `huggingface` → `require_gpu()` + `require_ram(min_gb=48)` (adjust per model) -- `vllm` → `require_gpu(min_vram_gb=24)` + `require_ram(min_gb=48)` +- `huggingface` → `require_gpu(min_vram_gb=N)` (compute N from model params) +- `vllm` → 
`require_gpu(min_vram_gb=N)` (compute N from model params) - `watsonx` → `require_api_key("WATSONX_API_KEY", "WATSONX_URL", "WATSONX_PROJECT_ID")` - `openai` → `require_api_key("OPENAI_API_KEY")` only for real OpenAI (not Ollama-compat) @@ -180,7 +179,11 @@ These are not resource predicates but still control test selection: The markers `requires_gpu`, `requires_heavy_ram`, `requires_api_key`, and `requires_gpu_isolation` are deprecated. Existing tests using them still work (conftest auto-skip logic handles them) but new tests should use predicates. -Migrate legacy markers to predicates when touching those files. +When migrating: `requires_gpu` → `require_gpu(min_vram_gb=N)`; +`requires_api_key` → `require_api_key(...)`; +`requires_heavy_ram` and `requires_gpu_isolation` → **remove** (no replacement +needed — `requires_heavy_ram` conflated VRAM with RAM, and GPU isolation is +now automatic). ## Auto-Detection @@ -228,10 +231,10 @@ def test_greeting_content(session): assert "hello" in result.value.lower() # Heavy GPU e2e (predicates for resource gating) -from test.predicates import require_gpu, require_ram, require_gpu_isolation +from test.predicates import require_gpu pytestmark = [pytest.mark.e2e, pytest.mark.huggingface, - require_gpu(), require_ram(min_gb=48), require_gpu_isolation()] + require_gpu(min_vram_gb=20)] ``` ## Example Files (`docs/examples/`) From 914502d16b4d478f6e1d7e57f0e9e2fa63e674eb Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 25 Mar 2026 13:28:45 +0000 Subject: [PATCH 12/42] docs: add legacy marker guidance for example files in audit-markers skill --- .agents/skills/audit-markers/SKILL.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.agents/skills/audit-markers/SKILL.md b/.agents/skills/audit-markers/SKILL.md index 3c1029164..9427dbcba 100644 --- a/.agents/skills/audit-markers/SKILL.md +++ b/.agents/skills/audit-markers/SKILL.md @@ -518,6 +518,13 @@ Examples use a comment-based marker format (not `pytestmark`): Same 
classification rules apply. Parser: `docs/examples/conftest.py` (`_extract_markers_from_file`). +**Legacy markers in examples:** The same deprecation rules apply to +`# pytest:` comments. Remove `requires_heavy_ram`, `requires_gpu_isolation`, +and `llm` when found. Replace `requires_gpu` with the appropriate predicate +marker if the comment format supports it, or just remove it and rely on the +backend marker (e.g., `huggingface` already triggers GPU checks in the +examples conftest). + --- # Audit Procedure From ab8ad75dcf3c82d40f1d0f1a8aa575b0dcaff9cb Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 25 Mar 2026 13:30:34 +0000 Subject: [PATCH 13/42] =?UTF-8?q?refactor:=20remove=20require=5Follama()?= =?UTF-8?q?=20predicate=20=E2=80=94=20redundant=20with=20backend=20marker?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ollama backend marker + conftest auto-skip already handles Ollama availability. No other backend has a dedicated predicate — consistent to let the marker system handle it. --- .agents/skills/audit-markers/SKILL.md | 9 ++++----- test/MARKERS_GUIDE.md | 1 - test/predicates.py | 23 ----------------------- 3 files changed, 4 insertions(+), 29 deletions(-) diff --git a/.agents/skills/audit-markers/SKILL.md b/.agents/skills/audit-markers/SKILL.md index 9427dbcba..058dd4c98 100644 --- a/.agents/skills/audit-markers/SKILL.md +++ b/.agents/skills/audit-markers/SKILL.md @@ -415,13 +415,13 @@ the explicit size. 
|---------|--------------------|--------------------| | `LocalHFBackend` | Yes | `require_gpu(min_vram_gb=N)` | | `LocalVLLMBackend` | Yes | `require_gpu(min_vram_gb=N)` | -| `OllamaModelBackend` | Managed by Ollama | `require_ollama()` only | +| `OllamaModelBackend` | Managed by Ollama | None — `ollama` backend marker + conftest auto-skip handles availability | | `OpenAIBackend` (real API) | No | No GPU gate | -| `OpenAIBackend` → Ollama `/v1` | Managed by Ollama | `require_ollama()` only | +| `OpenAIBackend` → Ollama `/v1` | Managed by Ollama | None — `ollama` backend marker handles it | | `WatsonxAIBackend` / `LiteLLMBackend` / Cloud | No | No GPU gate | **Key rule:** Ollama manages its own GPU memory. Tests using Ollama backends -should use `require_ollama()`, NOT `require_gpu()`. +should use the `ollama` backend marker only, NOT `require_gpu()`. #### Compute VRAM estimate @@ -497,7 +497,6 @@ Read `test/predicates.py` for the available predicates. Expected patterns: | `require_api_key("OPENAI_API_KEY")` | Specific API credentials | | `require_api_key("WATSONX_API_KEY", "WATSONX_URL", "WATSONX_PROJECT_ID")` | Multiple credentials | | `require_package("cpex.framework")` | Optional dependency | -| `require_ollama()` | Running Ollama server | | `require_python((3, 11))` | Minimum Python version | Typical combinations for backends: @@ -725,4 +724,4 @@ flag as a blocker, don't silently re-add: `pyproject.toml` and `test/MARKERS_GUIDE.md` must stay in sync manually. - **Resource predicates:** `test/predicates.py` provides `require_gpu`, `require_ram`, `require_api_key`, `require_package`, - `require_ollama`, `require_python`. + `require_python`. 
diff --git a/test/MARKERS_GUIDE.md b/test/MARKERS_GUIDE.md index 11a2ac212..9c81705f6 100644 --- a/test/MARKERS_GUIDE.md +++ b/test/MARKERS_GUIDE.md @@ -155,7 +155,6 @@ from test.predicates import require_gpu, require_api_key | `require_ram(min_gb=N)` | N GB+ system RAM (genuinely RAM-bound tests only) | | `require_api_key("ENV_VAR")` | Specific API credentials | | `require_package("pkg")` | Optional dependency | -| `require_ollama()` | Running Ollama server | | `require_python((3, 11))` | Minimum Python version | ### Typical combinations diff --git a/test/predicates.py b/test/predicates.py index 86f33a6f7..246d68c72 100644 --- a/test/predicates.py +++ b/test/predicates.py @@ -179,29 +179,6 @@ def test_plugin_registration(): ... return pytest.mark.skipif(not available, reason=f"{package} not installed") -# --------------------------------------------------------------------------- -# Service reachability -# --------------------------------------------------------------------------- - - -def require_ollama(*, host: str = "localhost", port: int = 11434): - """Skip unless Ollama is reachable on the given host/port.""" - import socket - - try: - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.settimeout(1) - result = sock.connect_ex((host, port)) - sock.close() - reachable = result == 0 - except Exception: - reachable = False - - return pytest.mark.skipif( - not reachable, reason=f"Ollama not available at {host}:{port}" - ) - - # --------------------------------------------------------------------------- # Python version # --------------------------------------------------------------------------- From be39488d2dcab35f82ef5fdaba46c7621479219b Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 25 Mar 2026 14:25:32 +0000 Subject: [PATCH 14/42] refactor: replace requires_heavy_ram gate with huggingface backend marker in examples conftest The legacy requires_heavy_ram marker (blanket 48 GB RAM threshold) conflated VRAM with system RAM. 
Replace both the collection-time and runtime skip logic to gate on the huggingface backend marker instead, which accurately checks GPU availability. --- docs/examples/conftest.py | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/docs/examples/conftest.py b/docs/examples/conftest.py index 3afd2fca7..af2fb7112 100644 --- a/docs/examples/conftest.py +++ b/docs/examples/conftest.py @@ -123,17 +123,8 @@ def _should_skip_collection(markers): if "slow" in markers and int(os.environ.get("SKIP_SLOW", 0)) == 1: return True, "Skipping slow test (SKIP_SLOW=1)" - # Skip tests requiring heavy RAM if insufficient - if "requires_heavy_ram" in markers: - RAM_THRESHOLD_GB = 48 - if capabilities["ram_gb"] > 0 and capabilities["ram_gb"] < RAM_THRESHOLD_GB: - return ( - True, - f"Insufficient RAM ({capabilities['ram_gb']:.1f}GB < {RAM_THRESHOLD_GB}GB)", - ) - # Skip tests requiring GPU if not available - if "requires_gpu" in markers or "vllm" in markers: + if "requires_gpu" in markers or "huggingface" in markers or "vllm" in markers: if not capabilities["has_gpu"]: return True, "GPU not available" @@ -588,7 +579,6 @@ def pytest_runtest_setup(item): config = item.config ignore_all = config.getoption("--ignore-all-checks", default=False) ignore_gpu = config.getoption("--ignore-gpu-check", default=False) or ignore_all - ignore_ram = config.getoption("--ignore-ram-check", default=False) or ignore_all ignore_ollama = ( config.getoption("--ignore-ollama-check", default=False) or ignore_all ) @@ -612,18 +602,14 @@ def pytest_runtest_setup(item): ) # Skip tests requiring GPU if not available - if item.get_closest_marker("requires_gpu") and not ignore_gpu: + if ( + item.get_closest_marker("requires_gpu") + or item.get_closest_marker("huggingface") + or item.get_closest_marker("vllm") + ) and not ignore_gpu: if not capabilities["has_gpu"]: pytest.skip("Skipping test: GPU not available") - # Skip tests requiring heavy RAM if insufficient - if 
item.get_closest_marker("requires_heavy_ram") and not ignore_ram: - RAM_THRESHOLD_GB = 48 # Based on real-world testing - if capabilities["ram_gb"] > 0 and capabilities["ram_gb"] < RAM_THRESHOLD_GB: - pytest.skip( - f"Skipping test: Insufficient RAM ({capabilities['ram_gb']:.1f}GB < {RAM_THRESHOLD_GB}GB)" - ) - # Backend-specific skipping if item.get_closest_marker("watsonx") and not ignore_api_key: if not capabilities["has_api_keys"].get("watsonx"): From ab8a20ff2ee10eb96b97ec4c50aa98f194fca619 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 25 Mar 2026 15:44:15 +0000 Subject: [PATCH 15/42] refactor: replace ad-hoc bedrock skipif with require_api_key predicate --- test/backends/test_bedrock.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/test/backends/test_bedrock.py b/test/backends/test_bedrock.py index 236a068ff..9b416f30c 100644 --- a/test/backends/test_bedrock.py +++ b/test/backends/test_bedrock.py @@ -1,5 +1,3 @@ -import os - import openai import pytest @@ -9,15 +7,12 @@ from mellea.backends.bedrock import create_bedrock_mantle_backend from mellea.backends.openai import OpenAIBackend from mellea.stdlib.context import ChatContext +from test.predicates import require_api_key -# Skip entire module in CI since the single test is qualitative pytestmark = [ pytest.mark.e2e, pytest.mark.bedrock, - pytest.mark.skipif( - "AWS_BEARER_TOKEN_BEDROCK" not in os.environ.keys(), - reason="Skipping Bedrock backend tests if $AWS_BEARER_TOKEN_BEDROCK is not set.", - ), + require_api_key("AWS_BEARER_TOKEN_BEDROCK"), ] From c0c004e170e98be15cd2abaab8177414b3ef249b Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Thu, 26 Mar 2026 14:37:23 +0000 Subject: [PATCH 16/42] refactor: migrate legacy resource markers to predicates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace deprecated pytest markers with typed predicate functions from test/predicates.py across all test files and example files: - 
requires_gpu → require_gpu(min_vram_gb=N) with per-model VRAM estimates - requires_heavy_ram → removed (conflated VRAM with RAM; no replacement needed) - requires_gpu_isolation → removed (GPU isolation is now automatic) - requires_api_key → require_api_key("VAR1", "VAR2", ...) with explicit env vars Also removes spurious requires_gpu from ollama-backed tests (test_genslot, test_think_budget_forcing, test_component_typing) and adds missing integration marker to test_hook_call_sites. VRAM estimates computed from model parameter counts using bf16 formula (params_B × 2 × 1.2, rounded up to next even GB): - granite-3.3-8b: 20 GB, Mistral-7B: 18 GB, granite-4.0-micro (3B): 8 GB - Qwen3-0.6B: 4 GB (conservative for vLLM KV cache headroom) - granite-4.0-h-micro (3B): 8 GB, alora training (3B): 12 GB --- docs/examples/aLora/101_example.py | 2 +- docs/examples/aLora/102_example.py | 2 +- docs/examples/aLora/example_readme_generator.py | 2 +- docs/examples/aLora/make_training_data.py | 2 +- docs/examples/aLora/stembolts_intrinsic.py | 2 +- docs/examples/image_text_models/vision_openai_examples.py | 2 +- docs/examples/intrinsics/answerability.py | 2 +- docs/examples/intrinsics/citations.py | 2 +- docs/examples/intrinsics/context_attribution.py | 2 +- docs/examples/intrinsics/context_relevance.py | 2 +- docs/examples/intrinsics/factuality_correction.py | 2 +- docs/examples/intrinsics/factuality_detection.py | 2 +- docs/examples/intrinsics/guardian_core.py | 2 +- docs/examples/intrinsics/hallucination_detection.py | 2 +- docs/examples/intrinsics/intrinsics.py | 2 +- docs/examples/intrinsics/policy_guardrails.py | 2 +- docs/examples/intrinsics/query_clarification.py | 2 +- docs/examples/intrinsics/query_rewrite.py | 2 +- docs/examples/intrinsics/requirement_check.py | 2 +- docs/examples/intrinsics/uncertainty.py | 2 +- docs/examples/mify/rich_document_advanced.py | 2 +- docs/examples/safety/guardian_huggingface.py | 2 +- docs/examples/safety/repair_with_guardian.py | 2 +- 
docs/examples/sofai/sofai_graph_coloring.py | 2 +- docs/examples/tutorial/document_mobject.py | 2 +- test/backends/test_huggingface.py | 6 +++--- test/backends/test_huggingface_tools.py | 5 ++--- test/backends/test_litellm_watsonx.py | 4 +++- test/backends/test_openai_vllm.py | 5 +++-- test/backends/test_vllm.py | 6 +++--- test/backends/test_vllm_tools.py | 5 ++--- test/backends/test_watsonx.py | 4 +++- test/cli/test_alora_train_integration.py | 5 +++-- test/core/test_component_typing.py | 5 ----- test/plugins/test_hook_call_sites.py | 2 ++ test/stdlib/components/intrinsic/test_core.py | 4 ++-- test/stdlib/components/intrinsic/test_guardian.py | 4 ++-- test/stdlib/components/intrinsic/test_rag.py | 4 ++-- test/stdlib/components/test_genslot.py | 2 +- test/stdlib/sampling/test_think_budget_forcing.py | 8 +------- test/stdlib/test_session.py | 3 ++- test/stdlib/test_spans.py | 8 ++------ test/telemetry/test_metrics_backend.py | 4 +++- 43 files changed, 64 insertions(+), 70 deletions(-) diff --git a/docs/examples/aLora/101_example.py b/docs/examples/aLora/101_example.py index b1538f4f4..8f19d1082 100644 --- a/docs/examples/aLora/101_example.py +++ b/docs/examples/aLora/101_example.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, e2e +# pytest: huggingface, e2e import time diff --git a/docs/examples/aLora/102_example.py b/docs/examples/aLora/102_example.py index 05c9780fc..3ca44f48c 100644 --- a/docs/examples/aLora/102_example.py +++ b/docs/examples/aLora/102_example.py @@ -1,4 +1,4 @@ -# pytest: skip, huggingface, requires_heavy_ram, e2e +# pytest: skip, huggingface, e2e # SKIP REASON: Requires user input; tests same functionality as 101_example.py. 
from stembolts_intrinsic import ( diff --git a/docs/examples/aLora/example_readme_generator.py b/docs/examples/aLora/example_readme_generator.py index 3c586aa5e..94ef46883 100644 --- a/docs/examples/aLora/example_readme_generator.py +++ b/docs/examples/aLora/example_readme_generator.py @@ -1,4 +1,4 @@ -# pytest: skip, huggingface, requires_heavy_ram, e2e +# pytest: skip, huggingface, e2e # SKIP REASON: documentation only. from cli.alora.readme_generator import generate_readme, make_readme_jinja_dict diff --git a/docs/examples/aLora/make_training_data.py b/docs/examples/aLora/make_training_data.py index e3b25fb36..b3e7f4ea5 100644 --- a/docs/examples/aLora/make_training_data.py +++ b/docs/examples/aLora/make_training_data.py @@ -1,4 +1,4 @@ -# pytest: skip, huggingface, requires_heavy_ram, e2e +# pytest: skip, huggingface, e2e # SKIP REASON: documentation only.import argparse import argparse import json diff --git a/docs/examples/aLora/stembolts_intrinsic.py b/docs/examples/aLora/stembolts_intrinsic.py index f3af5f42b..accbebfab 100644 --- a/docs/examples/aLora/stembolts_intrinsic.py +++ b/docs/examples/aLora/stembolts_intrinsic.py @@ -1,5 +1,5 @@ # type: ignore -# pytest: skip, huggingface, requires_heavy_ram, e2e +# pytest: skip, huggingface, e2e # SKIP REASON: needs to update. 
import mellea.stdlib.functional as mfuncs diff --git a/docs/examples/image_text_models/vision_openai_examples.py b/docs/examples/image_text_models/vision_openai_examples.py index f1d423cd0..c4e2be8cd 100644 --- a/docs/examples/image_text_models/vision_openai_examples.py +++ b/docs/examples/image_text_models/vision_openai_examples.py @@ -1,4 +1,4 @@ -# pytest: ollama, e2e, requires_heavy_ram +# pytest: ollama, e2e """Examples using vision models with OpenAI backend.""" diff --git a/docs/examples/intrinsics/answerability.py b/docs/examples/intrinsics/answerability.py index 5875286d3..3afbddbd5 100644 --- a/docs/examples/intrinsics/answerability.py +++ b/docs/examples/intrinsics/answerability.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, e2e +# pytest: huggingface, e2e """Example usage of the answerability intrinsic for RAG applications. diff --git a/docs/examples/intrinsics/citations.py b/docs/examples/intrinsics/citations.py index 43b634792..9b2fc1296 100644 --- a/docs/examples/intrinsics/citations.py +++ b/docs/examples/intrinsics/citations.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, e2e +# pytest: huggingface, e2e """Example usage of the citations intrinsic for RAG applications. diff --git a/docs/examples/intrinsics/context_attribution.py b/docs/examples/intrinsics/context_attribution.py index b31eba88a..e90628fe0 100644 --- a/docs/examples/intrinsics/context_attribution.py +++ b/docs/examples/intrinsics/context_attribution.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, e2e +# pytest: huggingface, e2e """Example usage of the context attribution intrinsic. 
diff --git a/docs/examples/intrinsics/context_relevance.py b/docs/examples/intrinsics/context_relevance.py index a89effeb7..32562d0ac 100644 --- a/docs/examples/intrinsics/context_relevance.py +++ b/docs/examples/intrinsics/context_relevance.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, e2e +# pytest: huggingface, e2e """Example usage of the context relevance intrinsic for RAG applications. diff --git a/docs/examples/intrinsics/factuality_correction.py b/docs/examples/intrinsics/factuality_correction.py index d3902024a..123ee92f4 100644 --- a/docs/examples/intrinsics/factuality_correction.py +++ b/docs/examples/intrinsics/factuality_correction.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, e2e +# pytest: huggingface, e2e """Example usage of the factuality correction intrinsic. diff --git a/docs/examples/intrinsics/factuality_detection.py b/docs/examples/intrinsics/factuality_detection.py index 511ceaab6..9418ff9b6 100644 --- a/docs/examples/intrinsics/factuality_detection.py +++ b/docs/examples/intrinsics/factuality_detection.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, e2e +# pytest: huggingface, e2e """Example usage of the factuality detection intrinsic. diff --git a/docs/examples/intrinsics/guardian_core.py b/docs/examples/intrinsics/guardian_core.py index d80ad9de5..40951e65c 100644 --- a/docs/examples/intrinsics/guardian_core.py +++ b/docs/examples/intrinsics/guardian_core.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, e2e +# pytest: huggingface, e2e """Example usage of the guardian-core intrinsic for safety and hallucination detection. 
diff --git a/docs/examples/intrinsics/hallucination_detection.py b/docs/examples/intrinsics/hallucination_detection.py index 6904d96b5..babfe789c 100644 --- a/docs/examples/intrinsics/hallucination_detection.py +++ b/docs/examples/intrinsics/hallucination_detection.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, e2e +# pytest: huggingface, e2e """Example usage of the hallucination detection intrinsic for RAG applications. diff --git a/docs/examples/intrinsics/intrinsics.py b/docs/examples/intrinsics/intrinsics.py index b798039c3..f69f3a7b8 100644 --- a/docs/examples/intrinsics/intrinsics.py +++ b/docs/examples/intrinsics/intrinsics.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, e2e +# pytest: huggingface, e2e import mellea.stdlib.functional as mfuncs from mellea.backends.adapters.adapter import AdapterType, IntrinsicAdapter diff --git a/docs/examples/intrinsics/policy_guardrails.py b/docs/examples/intrinsics/policy_guardrails.py index 782f2cab6..c9772ce70 100644 --- a/docs/examples/intrinsics/policy_guardrails.py +++ b/docs/examples/intrinsics/policy_guardrails.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, e2e +# pytest: huggingface, e2e """Example usage of the policy_guardrails intrinsic. diff --git a/docs/examples/intrinsics/query_clarification.py b/docs/examples/intrinsics/query_clarification.py index e2a0a3e9f..8b8973e39 100644 --- a/docs/examples/intrinsics/query_clarification.py +++ b/docs/examples/intrinsics/query_clarification.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, e2e +# pytest: huggingface, e2e """ Example usage of the query clarification intrinsic for RAG applications. 
diff --git a/docs/examples/intrinsics/query_rewrite.py b/docs/examples/intrinsics/query_rewrite.py index 8b4025033..07d42fc1a 100644 --- a/docs/examples/intrinsics/query_rewrite.py +++ b/docs/examples/intrinsics/query_rewrite.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, e2e +# pytest: huggingface, e2e """Example usage of the query rewrite intrinsic for RAG applications. diff --git a/docs/examples/intrinsics/requirement_check.py b/docs/examples/intrinsics/requirement_check.py index 08176fc97..c0d5792e5 100644 --- a/docs/examples/intrinsics/requirement_check.py +++ b/docs/examples/intrinsics/requirement_check.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, e2e +# pytest: huggingface, e2e """Example usage of the requirement check intrinsic. diff --git a/docs/examples/intrinsics/uncertainty.py b/docs/examples/intrinsics/uncertainty.py index 7f052abc2..d3318d151 100644 --- a/docs/examples/intrinsics/uncertainty.py +++ b/docs/examples/intrinsics/uncertainty.py @@ -1,4 +1,4 @@ -# pytest: huggingface, requires_heavy_ram, e2e +# pytest: huggingface, e2e """Example usage of the uncertainty/certainty intrinsic. 
diff --git a/docs/examples/mify/rich_document_advanced.py b/docs/examples/mify/rich_document_advanced.py index 1fee383f3..f35754b83 100644 --- a/docs/examples/mify/rich_document_advanced.py +++ b/docs/examples/mify/rich_document_advanced.py @@ -1,4 +1,4 @@ -# pytest: skip, huggingface, requires_heavy_ram, e2e +# pytest: skip, huggingface, e2e # SKIP REASON: CXXABI_1.3.15 not found - conda environment issue on HPC systems with old glibc # ruff: noqa E402 diff --git a/docs/examples/safety/guardian_huggingface.py b/docs/examples/safety/guardian_huggingface.py index 532711e2c..35d493565 100644 --- a/docs/examples/safety/guardian_huggingface.py +++ b/docs/examples/safety/guardian_huggingface.py @@ -1,4 +1,4 @@ -# pytest: ollama, huggingface, requires_heavy_ram, e2e +# pytest: ollama, huggingface, e2e """Example of using GuardianCheck with HuggingFace backend for direct model inference diff --git a/docs/examples/safety/repair_with_guardian.py b/docs/examples/safety/repair_with_guardian.py index cbee89548..f5dc6cfe6 100644 --- a/docs/examples/safety/repair_with_guardian.py +++ b/docs/examples/safety/repair_with_guardian.py @@ -1,4 +1,4 @@ -# pytest: ollama, huggingface, requires_heavy_ram, e2e +# pytest: ollama, huggingface, e2e """RepairTemplateStrategy Example with Actual Function Call Validation Demonstrates how RepairTemplateStrategy repairs responses using actual function calls. diff --git a/docs/examples/sofai/sofai_graph_coloring.py b/docs/examples/sofai/sofai_graph_coloring.py index 35583ba76..12e8ded85 100644 --- a/docs/examples/sofai/sofai_graph_coloring.py +++ b/docs/examples/sofai/sofai_graph_coloring.py @@ -1,4 +1,4 @@ -# pytest: ollama, qualitative, e2e, requires_heavy_ram +# pytest: ollama, qualitative, e2e """SOFAI Sampling Strategy Example: Graph Coloring Problem. 
diff --git a/docs/examples/tutorial/document_mobject.py b/docs/examples/tutorial/document_mobject.py index f917cb8da..f2d892ec7 100644 --- a/docs/examples/tutorial/document_mobject.py +++ b/docs/examples/tutorial/document_mobject.py @@ -1,4 +1,4 @@ -# pytest: ollama, qualitative, e2e, requires_heavy_ram +# pytest: ollama, qualitative, e2e from mellea.backends import model_ids from mellea.backends.model_ids import IBM_GRANITE_4_HYBRID_MICRO diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py index 8bdb5cab2..dc2807489 100644 --- a/test/backends/test_huggingface.py +++ b/test/backends/test_huggingface.py @@ -13,13 +13,13 @@ import pytest import torch +from test.predicates import require_gpu + # Mark all tests in this module with backend and resource requirements pytestmark = [ pytest.mark.huggingface, pytest.mark.e2e, - pytest.mark.requires_gpu, - pytest.mark.requires_heavy_ram, - pytest.mark.requires_gpu_isolation, # Activate GPU memory isolation + require_gpu(min_vram_gb=20), # Skip entire module in CI since 17/18 tests are qualitative pytest.mark.skipif( int(os.environ.get("CICD", 0)) == 1, diff --git a/test/backends/test_huggingface_tools.py b/test/backends/test_huggingface_tools.py index ce15ad449..ac0347990 100644 --- a/test/backends/test_huggingface_tools.py +++ b/test/backends/test_huggingface_tools.py @@ -3,14 +3,13 @@ import pytest from mellea.backends.tools import MelleaTool +from test.predicates import require_gpu # Skip entire module in CI since the single test is qualitative pytestmark = [ pytest.mark.huggingface, pytest.mark.e2e, - pytest.mark.requires_gpu, - pytest.mark.requires_heavy_ram, - pytest.mark.requires_gpu_isolation, + require_gpu(min_vram_gb=18), pytest.mark.skipif( int(os.environ.get("CICD", 0)) == 1, reason="Skipping HuggingFace tools tests in CI - qualitative test", diff --git a/test/backends/test_litellm_watsonx.py b/test/backends/test_litellm_watsonx.py index e224f9989..b770dce07 100644 --- 
a/test/backends/test_litellm_watsonx.py +++ b/test/backends/test_litellm_watsonx.py @@ -2,12 +2,14 @@ import pytest +from test.predicates import require_api_key + # Mark all tests in this module as requiring Watsonx via LiteLLM pytestmark = [ pytest.mark.litellm, pytest.mark.watsonx, pytest.mark.e2e, - pytest.mark.requires_api_key, + require_api_key("WATSONX_API_KEY", "WATSONX_URL", "WATSONX_PROJECT_ID"), ] from mellea import MelleaSession diff --git a/test/backends/test_openai_vllm.py b/test/backends/test_openai_vllm.py index 52fcb85c9..1ad21d585 100644 --- a/test/backends/test_openai_vllm.py +++ b/test/backends/test_openai_vllm.py @@ -9,13 +9,14 @@ import pytest import requests +from test.predicates import require_gpu + # Mark all tests in this module with backend and resource requirements pytestmark = [ pytest.mark.openai, pytest.mark.e2e, pytest.mark.vllm, - pytest.mark.requires_gpu, - pytest.mark.requires_heavy_ram, + require_gpu(min_vram_gb=8), # Skip entire module in CI since all 8 tests are qualitative pytest.mark.skipif( int(os.environ.get("CICD", 0)) == 1, diff --git a/test/backends/test_vllm.py b/test/backends/test_vllm.py index 01d002034..bc054de65 100644 --- a/test/backends/test_vllm.py +++ b/test/backends/test_vllm.py @@ -5,13 +5,13 @@ import pydantic import pytest +from test.predicates import require_gpu + # Mark all tests in this module with backend and resource requirements pytestmark = [ pytest.mark.vllm, pytest.mark.e2e, - pytest.mark.requires_gpu, - pytest.mark.requires_heavy_ram, - pytest.mark.requires_gpu_isolation, # Activate GPU memory isolation + require_gpu(min_vram_gb=4), # Skip entire module in CI since all 8 tests are qualitative pytest.mark.skipif( int(os.environ.get("CICD", 0)) == 1, diff --git a/test/backends/test_vllm_tools.py b/test/backends/test_vllm_tools.py index 5a61d5aba..e18d47942 100644 --- a/test/backends/test_vllm_tools.py +++ b/test/backends/test_vllm_tools.py @@ -3,14 +3,13 @@ import pytest from mellea.backends.tools 
import MelleaTool +from test.predicates import require_gpu # Skip entire module in CI since the single test is qualitative pytestmark = [ pytest.mark.vllm, pytest.mark.e2e, - pytest.mark.requires_gpu, - pytest.mark.requires_heavy_ram, - pytest.mark.requires_gpu_isolation, + require_gpu(min_vram_gb=18), pytest.mark.skipif( int(os.environ.get("CICD", 0)) == 1, reason="Skipping vLLM tools tests in CI - qualitative test", diff --git a/test/backends/test_watsonx.py b/test/backends/test_watsonx.py index eda9baa69..dd7f0c049 100644 --- a/test/backends/test_watsonx.py +++ b/test/backends/test_watsonx.py @@ -5,11 +5,13 @@ import pydantic import pytest +from test.predicates import require_api_key + # Mark all tests in this module with backend and auth requirements pytestmark = [ pytest.mark.watsonx, pytest.mark.e2e, - pytest.mark.requires_api_key, + require_api_key("WATSONX_API_KEY", "WATSONX_URL", "WATSONX_PROJECT_ID"), # Skip entire module in CI since 8/9 tests are qualitative pytest.mark.skipif( int(os.environ.get("CICD", 0)) == 1, diff --git a/test/cli/test_alora_train_integration.py b/test/cli/test_alora_train_integration.py index 83c8cd0e6..5a064d014 100644 --- a/test/cli/test_alora_train_integration.py +++ b/test/cli/test_alora_train_integration.py @@ -14,11 +14,12 @@ import torch from transformers import AutoTokenizer +from test.predicates import require_gpu + pytestmark = [ pytest.mark.huggingface, pytest.mark.e2e, - pytest.mark.requires_gpu, - pytest.mark.requires_heavy_ram, + require_gpu(min_vram_gb=12), # Skip entire module in CI since 17/18 tests are qualitative pytest.mark.skipif( int(os.environ.get("CICD", 0)) == 1, diff --git a/test/core/test_component_typing.py b/test/core/test_component_typing.py index b69aa7a9c..a88ae7edf 100644 --- a/test/core/test_component_typing.py +++ b/test/core/test_component_typing.py @@ -120,7 +120,6 @@ def test_incorrect_type_override(): # Uses granite4:micro-h (3B hybrid, lightweight) in local mode @pytest.mark.qualitative 
@pytest.mark.ollama -@pytest.mark.requires_gpu @pytest.mark.e2e async def test_generating(session): m = session @@ -163,8 +162,6 @@ async def test_generating(session): @pytest.mark.qualitative @pytest.mark.ollama -@pytest.mark.requires_gpu -@pytest.mark.requires_heavy_ram @pytest.mark.e2e def test_message_typing(session): m = session @@ -180,8 +177,6 @@ def test_message_typing(session): @pytest.mark.qualitative @pytest.mark.ollama -@pytest.mark.requires_gpu -@pytest.mark.requires_heavy_ram @pytest.mark.e2e async def test_generating_with_sampling(session): m = session diff --git a/test/plugins/test_hook_call_sites.py b/test/plugins/test_hook_call_sites.py index 5c0742b3c..46bc2d6ab 100644 --- a/test/plugins/test_hook_call_sites.py +++ b/test/plugins/test_hook_call_sites.py @@ -16,6 +16,8 @@ import pytest +pytestmark = pytest.mark.integration + pytest.importorskip("cpex.framework") from mellea.core.backend import Backend diff --git a/test/stdlib/components/intrinsic/test_core.py b/test/stdlib/components/intrinsic/test_core.py index c84718b32..c1b151124 100644 --- a/test/stdlib/components/intrinsic/test_core.py +++ b/test/stdlib/components/intrinsic/test_core.py @@ -13,6 +13,7 @@ from mellea.stdlib.components.intrinsic import core from mellea.stdlib.context import ChatContext from test.conftest import cleanup_gpu_backend +from test.predicates import require_gpu from test.stdlib.components.intrinsic.test_rag import ( _read_input_json as _read_rag_input_json, _read_output_json as _read_rag_output_json, @@ -25,8 +26,7 @@ reason="Skipping core intrinsic tests in CI - all qualitative tests", ), pytest.mark.huggingface, - pytest.mark.requires_gpu, - pytest.mark.requires_heavy_ram, + require_gpu(min_vram_gb=8), pytest.mark.e2e, ] diff --git a/test/stdlib/components/intrinsic/test_guardian.py b/test/stdlib/components/intrinsic/test_guardian.py index 05fd205d1..c0b4b48c8 100644 --- a/test/stdlib/components/intrinsic/test_guardian.py +++ 
b/test/stdlib/components/intrinsic/test_guardian.py @@ -14,6 +14,7 @@ from mellea.stdlib.components.intrinsic import guardian from mellea.stdlib.context import ChatContext from test.conftest import cleanup_gpu_backend +from test.predicates import require_gpu # Skip entire module in CI since all tests are qualitative pytestmark = [ @@ -22,8 +23,7 @@ reason="Skipping Guardian tests in CI - all qualitative tests", ), pytest.mark.huggingface, - pytest.mark.requires_gpu, - pytest.mark.requires_heavy_ram, + require_gpu(min_vram_gb=8), pytest.mark.e2e, ] diff --git a/test/stdlib/components/intrinsic/test_rag.py b/test/stdlib/components/intrinsic/test_rag.py index 49c8dcaf3..910390ef6 100644 --- a/test/stdlib/components/intrinsic/test_rag.py +++ b/test/stdlib/components/intrinsic/test_rag.py @@ -13,6 +13,7 @@ from mellea.stdlib.components import Document, Message from mellea.stdlib.components.intrinsic import rag from mellea.stdlib.context import ChatContext +from test.predicates import require_gpu # Skip entire module in CI since all 7 tests are qualitative pytestmark = [ @@ -21,8 +22,7 @@ reason="Skipping RAG tests in CI - all qualitative tests", ), pytest.mark.huggingface, - pytest.mark.requires_gpu, - pytest.mark.requires_heavy_ram, # 3B model + document processing needs ~30-35GB + require_gpu(min_vram_gb=8), pytest.mark.e2e, ] diff --git a/test/stdlib/components/test_genslot.py b/test/stdlib/components/test_genslot.py index 33a6482f8..8d65dbe62 100644 --- a/test/stdlib/components/test_genslot.py +++ b/test/stdlib/components/test_genslot.py @@ -18,7 +18,7 @@ from mellea.stdlib.sampling import RejectionSamplingStrategy # Module-level markers: Uses granite4:micro-h (3B hybrid, lightweight) in local mode -pytestmark = [pytest.mark.ollama, pytest.mark.requires_gpu, pytest.mark.e2e] +pytestmark = [pytest.mark.ollama, pytest.mark.e2e] @pytest.fixture(scope="module") diff --git a/test/stdlib/sampling/test_think_budget_forcing.py 
b/test/stdlib/sampling/test_think_budget_forcing.py index f5d5860b7..590e579fc 100644 --- a/test/stdlib/sampling/test_think_budget_forcing.py +++ b/test/stdlib/sampling/test_think_budget_forcing.py @@ -11,13 +11,7 @@ MODEL_ID = OPENAI_GPT_OSS_20B # Module-level markers: gpt-oss:20b is a 20B model requiring heavy resources -pytestmark = [ - pytest.mark.ollama, - pytest.mark.requires_gpu, - pytest.mark.requires_heavy_ram, - pytest.mark.e2e, - pytest.mark.qualitative, -] +pytestmark = [pytest.mark.ollama, pytest.mark.e2e, pytest.mark.qualitative] @pytest.fixture(scope="module") diff --git a/test/stdlib/test_session.py b/test/stdlib/test_session.py index 80506b591..439d27fe6 100644 --- a/test/stdlib/test_session.py +++ b/test/stdlib/test_session.py @@ -8,6 +8,7 @@ from mellea.stdlib.components import Message from mellea.stdlib.context import ChatContext from mellea.stdlib.session import MelleaSession, start_session +from test.predicates import require_api_key # Mark all tests as requiring Ollama (start_session defaults to Ollama) pytestmark = [pytest.mark.ollama, pytest.mark.e2e] @@ -22,7 +23,7 @@ def m_session(gh_run): @pytest.mark.watsonx -@pytest.mark.requires_api_key +@require_api_key("WATSONX_API_KEY", "WATSONX_URL", "WATSONX_PROJECT_ID") def test_start_session_watsonx(gh_run): if gh_run == 1: pytest.skip("Skipping watsonx tests.") diff --git a/test/stdlib/test_spans.py b/test/stdlib/test_spans.py index f39bd0e53..9f6319d65 100644 --- a/test/stdlib/test_spans.py +++ b/test/stdlib/test_spans.py @@ -6,14 +6,10 @@ from mellea.core import CBlock from mellea.stdlib.components import SimpleComponent from mellea.stdlib.session import MelleaSession, start_session +from test.predicates import require_gpu # Module-level markers for all tests using Granite 4 hybrid micro (3B model) -pytestmark = [ - pytest.mark.huggingface, - pytest.mark.requires_gpu, - pytest.mark.requires_heavy_ram, - pytest.mark.e2e, -] +pytestmark = [pytest.mark.huggingface, require_gpu(min_vram_gb=8), 
pytest.mark.e2e] # We edit the context type in the async tests below. Don't change the scope here. diff --git a/test/telemetry/test_metrics_backend.py b/test/telemetry/test_metrics_backend.py index d8d79e6c1..8e808880b 100644 --- a/test/telemetry/test_metrics_backend.py +++ b/test/telemetry/test_metrics_backend.py @@ -14,6 +14,7 @@ ) from mellea.stdlib.components import Message from mellea.stdlib.context import SimpleContext +from test.predicates import require_api_key, require_gpu # Check if OpenTelemetry is available try: @@ -218,7 +219,7 @@ async def test_openai_token_metrics_integration(enable_metrics, metric_reader, s @pytest.mark.asyncio @pytest.mark.e2e @pytest.mark.watsonx -@pytest.mark.requires_api_key +@require_api_key("WATSONX_API_KEY", "WATSONX_URL", "WATSONX_PROJECT_ID") async def test_watsonx_token_metrics_integration(enable_metrics, metric_reader): """Test that WatsonX backend records token metrics correctly.""" if not os.getenv("WATSONX_API_KEY"): @@ -331,6 +332,7 @@ async def test_litellm_token_metrics_integration( @pytest.mark.asyncio @pytest.mark.e2e @pytest.mark.huggingface +@require_gpu(min_vram_gb=8) @pytest.mark.parametrize("stream", [False, True], ids=["non-streaming", "streaming"]) async def test_huggingface_token_metrics_integration( enable_metrics, metric_reader, stream, hf_metrics_backend From 01fdc1ea6890c7d7d465cbaf77b2c6fbd1a58930 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Thu, 26 Mar 2026 14:51:47 +0000 Subject: [PATCH 17/42] test: skip collection gracefully when optional backend deps are missing Add pytest.importorskip() guards to 14 test files that previously aborted the entire test run with a ModuleNotFoundError when optional extras were not installed: - torch / llguidance (mellea[hf]): test_huggingface, test_huggingface_tools, test_alora_train_integration, test_intrinsics_formatters, test_core, test_guardian, test_rag, test_spans - litellm (mellea[litellm]): test_litellm_ollama, test_litellm_watsonx
- ibm_watsonx_ai (mellea[watsonx]): test_watsonx - docling / docling_core (mellea[mify]): test_tool_calls, test_richdocument, test_transform With these guards, `uv run pytest` runs all collectable tests and reports skipped files with a clear reason instead of aborting at first ImportError. --- test/backends/test_huggingface.py | 2 +- test/backends/test_huggingface_tools.py | 1 + test/backends/test_litellm_ollama.py | 1 + test/backends/test_litellm_watsonx.py | 1 + test/backends/test_tool_calls.py | 2 ++ test/backends/test_watsonx.py | 1 + test/cli/test_alora_train_integration.py | 3 ++- test/formatters/granite/test_intrinsics_formatters.py | 3 ++- test/stdlib/components/docs/test_richdocument.py | 2 ++ test/stdlib/components/intrinsic/test_core.py | 3 ++- test/stdlib/components/intrinsic/test_guardian.py | 3 ++- test/stdlib/components/intrinsic/test_rag.py | 3 ++- test/stdlib/components/test_transform.py | 2 ++ test/stdlib/test_spans.py | 2 ++ 14 files changed, 23 insertions(+), 6 deletions(-) diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py index dc2807489..281406ca0 100644 --- a/test/backends/test_huggingface.py +++ b/test/backends/test_huggingface.py @@ -11,7 +11,7 @@ import pydantic import pytest -import torch +torch = pytest.importorskip("torch", reason="torch not installed — install mellea[hf]") from test.predicates import require_gpu diff --git a/test/backends/test_huggingface_tools.py b/test/backends/test_huggingface_tools.py index ac0347990..feaf87980 100644 --- a/test/backends/test_huggingface_tools.py +++ b/test/backends/test_huggingface_tools.py @@ -16,6 +16,7 @@ ), ] +pytest.importorskip("llguidance", reason="llguidance not installed — install mellea[hf]") import mellea.backends.model_ids as model_ids from mellea import MelleaSession from mellea.backends import ModelOption diff --git a/test/backends/test_litellm_ollama.py b/test/backends/test_litellm_ollama.py index 3ca4bcbfb..710f6eb0d 100644 --- 
a/test/backends/test_litellm_ollama.py +++ b/test/backends/test_litellm_ollama.py @@ -6,6 +6,7 @@ # Mark all tests in this module as requiring Ollama via LiteLLM pytestmark = [pytest.mark.litellm, pytest.mark.ollama, pytest.mark.e2e] +pytest.importorskip("litellm", reason="litellm not installed — install mellea[litellm]") from mellea import MelleaSession, generative from mellea.backends import ModelOption, model_ids from mellea.backends.litellm import LiteLLMBackend diff --git a/test/backends/test_litellm_watsonx.py b/test/backends/test_litellm_watsonx.py index b770dce07..2c413e070 100644 --- a/test/backends/test_litellm_watsonx.py +++ b/test/backends/test_litellm_watsonx.py @@ -12,6 +12,7 @@ require_api_key("WATSONX_API_KEY", "WATSONX_URL", "WATSONX_PROJECT_ID"), ] +pytest.importorskip("litellm", reason="litellm not installed — install mellea[litellm]") from mellea import MelleaSession from mellea.backends.litellm import LiteLLMBackend from mellea.core import CBlock diff --git a/test/backends/test_tool_calls.py b/test/backends/test_tool_calls.py index abf27e51d..ece501b16 100644 --- a/test/backends/test_tool_calls.py +++ b/test/backends/test_tool_calls.py @@ -9,6 +9,8 @@ add_tools_from_model_options, ) from mellea.core import ModelOutputThunk + +pytest.importorskip("docling", reason="docling not installed — install mellea[mify]") from mellea.stdlib.components.docs.richdocument import Table from mellea.stdlib.context import ChatContext from mellea.stdlib.session import MelleaSession diff --git a/test/backends/test_watsonx.py b/test/backends/test_watsonx.py index dd7f0c049..45cc162b6 100644 --- a/test/backends/test_watsonx.py +++ b/test/backends/test_watsonx.py @@ -19,6 +19,7 @@ ), ] +pytest.importorskip("ibm_watsonx_ai", reason="ibm_watsonx_ai not installed — install mellea[watsonx]") from mellea import MelleaSession from mellea.backends import ModelOption, model_ids from mellea.backends.watsonx import WatsonxAIBackend diff --git 
a/test/cli/test_alora_train_integration.py b/test/cli/test_alora_train_integration.py
index 5a064d014..221e6dde9 100644
--- a/test/cli/test_alora_train_integration.py
+++ b/test/cli/test_alora_train_integration.py
@@ -11,7 +11,8 @@ from pathlib import Path
 
 import pytest
-import torch
+
+torch = pytest.importorskip("torch", reason="torch not installed — install mellea[hf]")
 from transformers import AutoTokenizer
 
 from test.predicates import require_gpu
diff --git a/test/formatters/granite/test_intrinsics_formatters.py b/test/formatters/granite/test_intrinsics_formatters.py
index 70ec4d0c1..d44bf86f1 100644
--- a/test/formatters/granite/test_intrinsics_formatters.py
+++ b/test/formatters/granite/test_intrinsics_formatters.py
@@ -16,7 +16,8 @@ import pydantic
 import pytest
 import requests
-import torch
+
+torch = pytest.importorskip("torch", reason="torch not installed — install mellea[hf]")
 import yaml
 
 # First Party
diff --git a/test/stdlib/components/docs/test_richdocument.py b/test/stdlib/components/docs/test_richdocument.py
index 3538beac1..45f94f4c3 100644
--- a/test/stdlib/components/docs/test_richdocument.py
+++ b/test/stdlib/components/docs/test_richdocument.py
@@ -2,6 +2,8 @@ import tempfile
 
 import pytest
+
+pytest.importorskip("docling_core", reason="docling_core not installed — install mellea[mify]")
 from docling_core.types.doc.document import DoclingDocument
 
 import mellea
diff --git a/test/stdlib/components/intrinsic/test_core.py b/test/stdlib/components/intrinsic/test_core.py
index c1b151124..b980afddb 100644
--- a/test/stdlib/components/intrinsic/test_core.py
+++ b/test/stdlib/components/intrinsic/test_core.py
@@ -6,7 +6,8 @@ import pathlib
 
 import pytest
-import torch
+
+torch = pytest.importorskip("torch", reason="torch not installed — install mellea[hf]")
 from mellea.backends.huggingface import LocalHFBackend
 from mellea.stdlib.components import Document, Message
diff --git a/test/stdlib/components/intrinsic/test_guardian.py b/test/stdlib/components/intrinsic/test_guardian.py
index c0b4b48c8..749b60c09 100644
--- a/test/stdlib/components/intrinsic/test_guardian.py
+++ b/test/stdlib/components/intrinsic/test_guardian.py
@@ -6,7 +6,8 @@ import pathlib
 
 import pytest
-import torch
+
+torch = pytest.importorskip("torch", reason="torch not installed — install mellea[hf]")
 from mellea.backends.huggingface import LocalHFBackend
 from mellea.backends.model_ids import IBM_GRANITE_4_MICRO_3B
diff --git a/test/stdlib/components/intrinsic/test_rag.py b/test/stdlib/components/intrinsic/test_rag.py
index 910390ef6..a00694b52 100644
--- a/test/stdlib/components/intrinsic/test_rag.py
+++ b/test/stdlib/components/intrinsic/test_rag.py
@@ -6,7 +6,8 @@ import pathlib
 
 import pytest
-import torch
+
+torch = pytest.importorskip("torch", reason="torch not installed — install mellea[hf]")
 from mellea.backends.huggingface import LocalHFBackend
 from mellea.backends.model_ids import IBM_GRANITE_4_MICRO_3B
diff --git a/test/stdlib/components/test_transform.py b/test/stdlib/components/test_transform.py
index ecbd95d9a..b0ed491ee 100644
--- a/test/stdlib/components/test_transform.py
+++ b/test/stdlib/components/test_transform.py
@@ -2,6 +2,8 @@
 
 from mellea.core import TemplateRepresentation
 from mellea.stdlib.components import MObject, Query, Transform
+
+pytest.importorskip("docling", reason="docling not installed — install mellea[mify]")
 from mellea.stdlib.components.docs.richdocument import TableTransform
 
 custom_mobject_description = "custom mobject description"
diff --git a/test/stdlib/test_spans.py b/test/stdlib/test_spans.py
index 9f6319d65..af3f7681a 100644
--- a/test/stdlib/test_spans.py
+++ b/test/stdlib/test_spans.py
@@ -1,6 +1,8 @@
 import pytest
 
 from mellea.backends import ModelOption
+
+pytest.importorskip("llguidance", reason="llguidance not installed — install mellea[hf]")
 from mellea.backends.huggingface import LocalHFBackend
 from mellea.backends.model_ids import IBM_GRANITE_4_HYBRID_MICRO
 from mellea.core import CBlock

From c6d565eebc79fe2eb9ed68cd2b56675121986b46 Mon Sep 17 00:00:00 2001
From: Nigel Jones
Date: Thu, 26 Mar 2026 19:05:18 +0000
Subject: [PATCH 18/42] test: refine integration marker definition and apply
 audit fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Expand integration to cover SDK-boundary tests (OTel InMemoryMetricReader,
InMemorySpanExporter, LoggingHandler) — tests that assert against a real
third-party SDK contract, not just multi-component wiring. Updates SKILL.md
and MARKERS_GUIDE.md with new definition, indicators, tie-breaker, and
SDK-boundary signal tables.

Applied fixes:

- test/telemetry/test_{metrics,metrics_token,logging}.py: add integration
  marker
- test/telemetry/test_metrics_backend.py: add openai marker to OTel+OpenAI
  test, remove redundant inline skip already covered by require_api_key
  predicate
- test/cli/test_alora_train.py: add integration to test_imports_work (real
  LoraConfig)
- test/formatters/granite/test_intrinsics_formatters.py: remove unregistered
  block_network marker
- test/stdlib/components/docs/test_richdocument.py: add integration
  pytestmark + e2e/huggingface/qualitative on skipped generation test
- test/backends/test_openai_ollama.py: note inherited module marker
  limitation
- docs/examples/plugins/testing_plugins.py: add # pytest: unit
---
 .agents/skills/audit-markers/SKILL.md             | 177 ++++++++++++------
 docs/examples/plugins/testing_plugins.py          |   1 +
 test/MARKERS_GUIDE.md                             |  50 ++++-
 test/backends/test_openai_ollama.py               |   3 +
 test/cli/test_alora_train.py                      |   1 +
 .../granite/test_intrinsics_formatters.py         |   1 -
 .../components/docs/test_richdocument.py          |  11 +-
 test/telemetry/test_logging.py                    |   7 +-
 test/telemetry/test_metrics.py                    |   7 +-
 test/telemetry/test_metrics_backend.py            |   4 +-
 test/telemetry/test_metrics_token.py              |   7 +-
 11 files changed, 196 insertions(+), 73 deletions(-)

diff --git a/.agents/skills/audit-markers/SKILL.md b/.agents/skills/audit-markers/SKILL.md
index 058dd4c98..d4dd1400e 100644
--- a/.agents/skills/audit-markers/SKILL.md
+++ b/.agents/skills/audit-markers/SKILL.md
@@ -4,7 +4,7 @@ description: >
   Audit and fix pytest markers on test files and examples. Classifies tests
   as unit/integration/e2e/qualitative using general heuristics and
   project-specific marker rules. Estimates GPU VRAM and RAM requirements by tracing model
-  identifiers and looking up parameter counts. Use when reviewing markers,
+  identifiers and looking up parameter counts. Use when reviewing markers (classification),
   auditing test files, or checking before commit.
 argument-hint: "[file-or-directory] [--dry-run | --apply]"
 compatibility: "Claude Code, IBM Bob"
@@ -54,10 +54,11 @@ to anything external. Pure logic testing.
 **Recognise by:**
 - Imports only from the project and stdlib — no external service clients
 - Creates objects directly, calls methods, checks return values
-- If it uses test doubles, they replace external boundaries (network, DB, services)
+- If it uses test doubles, they replace all external boundaries (network, DB, services, third-party SDKs)
+- Third-party library is imported only as a type or helper, not as a real collaborator being asserted against
 - No fixture that starts/connects to a real or fixture-managed service
 - Runs in milliseconds to low seconds
-- Would pass on any machine with just the language runtime and project deps
+- Would pass identically if you replaced a real SDK import with a stub of the same interface
 
 **Examples of unit assertions:**
 ```python
@@ -69,41 +70,76 @@ mock_backend.generate.assert_called_once()
 
 ### Integration
 
-Tests **multiple components working together**, potentially needing additional
-services or fixture-managed dependencies. Backends may be mocked, stubbed, or
-stood up by test fixtures.
+**Verifies that your code correctly communicates across a real boundary.**
+The boundary may be a third-party SDK/library whose API contract you are
+asserting against, multiple internal components wired together, or a
+fixture-managed local service. What distinguishes integration from unit is
+that at least one real external component — not a mock or stub — is on the
+other side of the boundary being tested.
+
+**Key distinction from unit:** The boundary is not limited to network or
+hardware. A test that wires project code against a real third-party SDK
+object to assert on output format or values is integration — even when
+entirely in-memory with no network I/O. The question is whether a real
+external component's API contract is being verified, not whether there is
+network activity.
+
+**Key distinction from e2e:** Integration controls or provides its
+dependencies (mocks, in-memory SDK components, fixture-managed local
+services). E2E depends on real backends that exist independently (Ollama,
+cloud APIs, GPU-loaded models).
+
+**Positive indicators:**
+
+- Uses a real third-party SDK object to *capture and assert* on output —
+  e.g. `InMemoryMetricReader`, `InMemorySpanExporter`, `LoggingHandler` —
+  rather than patching the SDK away
+- Asserts on format or content of data as received by the external component
+  (semantic conventions, attribute names, accumulated values)
+- Wires multiple real project components together and mocks only at the
+  outermost boundary (LLM call, network, hardware)
+- Breaking the interface between your code and the external component (e.g.
+  a changed attribute name, a missing SDK method call) would cause the test
+  to fail
+- Fixture-managed dependencies that stand up or configure real local services
+
+**Negative indicators (likely unit instead):**
+
+- All external boundaries replaced with `MagicMock`, `patch`, or `AsyncMock`
+- Third-party library imported only as a type or helper, not as a real
+  collaborator being asserted against
+- Toggles env vars and checks booleans or config state with no real SDK
+  objects instantiated
+- Only one real component under test; everything else is faked
+
+**Tie-breaker:** If you changed the contract between your code and the
+external component (e.g. renamed an attribute, stopped calling the right
+SDK method), would this test catch it? If yes → integration. If no → unit.
+
+**Examples:**
 
-**Recognise by:**
-- Creates real instances of multiple project components and wires them together
-- External service boundaries may be mocked, stubbed, or managed by fixtures
-- Tests that the components interact correctly — data flows, callbacks fire, errors propagate
-- May need additional services, but the test controls or provides its dependencies
-- Slower than unit (fixture setup, service lifecycle) and may consume more memory
-
-**Key distinction from unit:** Count the real (non-mock) project components
-being exercised. Unit isolates **one** class or function — all collaborators
-are faked. Integration wires up **multiple** real components and mocks only
-at the external perimeter (network, backend, database).
-
-**Key distinction from e2e:** Integration controls its dependencies (mocks, stubs,
-fixture-managed services). E2E uses real backends that exist independently.
-
-**Borderline: unit vs integration (the "scope of mocks" rule)**
-
-When a test uses mocks, look at *what* is mocked to decide:
-
-- **Mock replaces external I/O only, multiple real internal components wired
-  together** → **integration**. Example: a test that registers a real `Plugin`,
-  calls real `invoke_hook()` and `register()`, but passes `MagicMock()` for
-  the backend. The plugin-manager wiring executes for real; only the LLM call
-  is faked.
-- **Mock replaces internal collaborators too, only one real component under
-  test** → **unit**. Example: a test that instantiates one `Plugin` but
-  passes `MagicMock()` for the session, the backend, and the hook dispatcher.
-  Only the plugin's own logic executes.
-
-When in doubt, ask: "if I broke the *wiring* between two components, would
-this test catch it?" If yes → integration. If no → unit.
+```python
+# Integration — real OTel InMemoryMetricReader, asserting SDK contract
+@pytest.mark.integration
+def test_token_metrics_attributes(clean_metrics_env):
+    reader = InMemoryMetricReader()
+    provider = MeterProvider(metric_readers=[reader])
+    record_token_usage_metrics(input_tokens=10, output_tokens=5, ...)
+    # Asserts against real OTel output — breaking the attribute name would fail this
+    assert attrs["gen_ai.provider.name"] == "ollama"
+
+# Integration — multiple real project components, only LLM call mocked
+@pytest.mark.integration
+def test_session_chains_components(mock_backend):
+    session = start_session(backend=mock_backend)
+    result = session.instruct("hello")
+    assert mock_backend.generate.called
+
+# Unit — real OTel SDK imported but only for isinstance check on a no-op
+def test_instruments_are_noop_when_disabled(clean_metrics_env):
+    counter = create_counter("test.counter")
+    assert counter.__class__.__name__ == "_NoOpCounter"
+```
 
 ### E2E (End-to-End)
@@ -166,6 +202,22 @@ or non-unit. Use these to triage at scale (see Audit Procedure, Step 0).
 | GPU / model loading | `import torch`, `.to("cuda")`, `.from_pretrained(` | Hardware dependency |
 | External downloads | URL literals (`http://`, `https://`), `urlopen`, `requests.get` with URLs | Network fetch |
 
+### SDK-boundary signals (test is likely integration, not unit)
+
+These patterns indicate real third-party SDK objects are being used as
+collaborators to assert on output — not mocked away:
+
+| Category | Grep patterns | Notes |
+|---|---|---|
+| OTel metrics | `InMemoryMetricReader`, `MeterProvider(metric_readers=` | Asserting against real OTel metrics output |
+| OTel tracing | `InMemorySpanExporter`, `SimpleSpanProcessor`, `TracerProvider(` | Asserting against real OTel span output |
+| OTel logging | `LoggingHandler`, `LoggerProvider`, `set_logger_provider` | Asserting against real OTel log output |
+| Real SDK setup | `provider.force_flush()`, `reader.get_metrics_data()` | Consuming real SDK output |
+
+Cross-reference: if these appear alongside `@patch(` that patches the SDK
+itself away → unit. If the SDK objects are instantiated and used directly →
+integration.
+
 ### Mock signals (test is likely unit)
 
 | Category | Grep patterns |
@@ -185,15 +237,17 @@ or non-unit. Use these to triage at scale (see Audit Procedure, Step 0).
 
 ### Cross-referencing signals
 
-A single file may contain both live and mock signals. Cross-reference to
-determine the correct bucket:
+A single file may contain multiple signal types. Cross-reference to determine
+the correct bucket:
 
-| Live signals? | Mock signals? | Classification |
-|---|---|---|
-| Yes | No | Almost certainly e2e — deep-read to confirm |
-| Yes | Yes | Needs inspection — partial mock = integration, or mixed file |
-| No | Yes | Likely unit — skip deep read |
-| No | No | Likely unit — skip deep read |
+| Live-backend signals? | SDK-boundary signals? | Mock signals? | Classification |
+|---|---|---|---|
+| Yes | Any | No | Almost certainly e2e — deep-read to confirm |
+| Yes | Any | Yes | Needs inspection — partial mock = integration, or mixed file |
+| No | Yes | No | Likely integration — deep-read to confirm SDK objects are asserted against, not just imported |
+| No | Yes | Yes | Needs inspection — if SDK is patched away → unit; if used directly → integration |
+| No | No | Yes | Likely unit — skip deep read |
+| No | No | No | Likely unit — skip deep read |
 
 ## When to Ask the User
@@ -311,6 +365,14 @@ test_func(session) → session uses MagicMock/MockBackend/patch
 If the test only checks the mock was called → **unit**.
 If the test wires real components around the mock → **integration**.
 
+**Pattern 5b — Real SDK collaborator (integration):**
+```
+test_func(clean_metrics_env) → creates InMemoryMetricReader() + MeterProvider()
+                             → calls project code → asserts on reader output
+```
+No network, no backend — but a real OTel SDK object is on the other side of
+the boundary being asserted against → **integration**.
+
 **Pattern 6 — No backend at all (unit):**
 ```
 test_func() — or test_func(tmp_path, capsys, ...)
@@ -549,10 +611,15 @@ Run grep across all target files for:
    network literals (`localhost`, `127.0.0.1`, port numbers), HTTP client usage,
    subprocess calls, `_API_KEY`/`_TOKEN`/`_SECRET` in env var checks,
    GPU/model loading (`torch`, `.from_pretrained(`), URL literals.
-2. **Mock signals** — `MagicMock`, `Mock(`, `AsyncMock`, `@patch(`,
+2. **SDK-boundary signals** — real third-party SDK objects used as collaborators:
+   `InMemoryMetricReader`, `InMemorySpanExporter`, `MeterProvider(metric_readers=`,
+   `TracerProvider(`, `LoggingHandler`, `provider.force_flush()`,
+   `reader.get_metrics_data()`. See the SDK-boundary signal table in the
+   Behavioural Signal Detection section.
+3. **Mock signals** — `MagicMock`, `Mock(`, `AsyncMock`, `@patch(`,
    `monkeypatch`, `mocker`, HTTP mock libraries.
-3. **Existing markers** — `pytestmark`, `@pytest.mark.`, `# pytest:`.
-4. **Live/mock fixture names** from Phase 0.
+4. **Existing markers** — `pytestmark`, `@pytest.mark.`, `# pytest:`.
+5. **Live/mock fixture names** from Phase 0.
 
 ### Phase 2: Bucket and prioritise
@@ -560,10 +627,10 @@ Cross-reference the signal hits into four priority buckets:
 
 | Priority | Condition | Action |
 |---|---|---|
-| **P1 — Missing markers** | Live signals present, NO existing markers | Deep-read and classify. These are the most likely gaps. |
-| **P2 — Mixed signals** | Both live AND mock signals present | Deep-read to determine if integration, partial mock, or mixed file. |
-| **P3 — Validate existing** | Live signals present, markers already exist | Spot-check that markers match the actual backend. Replace deprecated `llm`. |
-| **P4 — Skip** | No live signals (mock-only or no signals at all) | Likely unit. Report as clean without deep-reading. Spot-check a sample if the count is large. |
+| **P1 — Missing markers** | Live-backend or SDK-boundary signals present, NO existing markers | Deep-read and classify. These are the most likely gaps. |
+| **P2 — Mixed signals** | Both live/SDK signals AND mock signals present | Deep-read to determine if integration, partial mock, or mixed file. |
+| **P3 — Validate existing** | Live-backend or SDK-boundary signals present, markers already exist | Spot-check that markers match. Replace deprecated `llm`. |
+| **P4 — Skip** | No live-backend or SDK-boundary signals (mock-only or no signals) | Likely unit. Report as clean without deep-reading. Spot-check a sample if the count is large. |
 
 ### Phase 3: Deep-read
@@ -588,7 +655,10 @@ The `# pytest:` comment is the only marker mechanism (no `pytestmark`).
 For each `def test_*` or `async def test_*`, apply the general classification
 from Part 1 using the project-specific heuristics from Part 2:
 
-1. **Real backend or mocked?** → determines unit/integration vs e2e
+1. **Real backend, SDK collaborator, or fully mocked?**
+   - Real LLM backend (Ollama, HF, cloud API) → **e2e**
+   - Real third-party SDK object asserted against (OTel reader, logging handler) → **integration**
+   - All external boundaries mocked/patched → **unit** (single component) or **integration** (multiple real components wired)
 2. **Which backend(s)?** → backend markers (e2e only)
 3. **Deterministic or content-dependent assertions?** → e2e vs qualitative
 4. **What resources?** → resource markers
@@ -692,6 +762,7 @@ and MARKERS_GUIDE.md (same apply/confirm rules as other fixes in Step 4).
 Report issues outside marker-edit scope as **notes**. Do NOT fix these:
 - Missing conftest skip logic for a backend
 - Tests with no assertions
+- Files mixing unit and integration tests that could be split
 - Files mixing unit and e2e tests that could be split
 
 ## Output Summary
diff --git a/docs/examples/plugins/testing_plugins.py b/docs/examples/plugins/testing_plugins.py
index 2102a904d..bcd3c5f6f 100644
--- a/docs/examples/plugins/testing_plugins.py
+++ b/docs/examples/plugins/testing_plugins.py
@@ -1,3 +1,4 @@
+# pytest: unit
 # Testing plugins — how to unit-test hook functions without a live session.
 #
 # This example shows how to:
diff --git a/test/MARKERS_GUIDE.md b/test/MARKERS_GUIDE.md
index 9c81705f6..8d4949ca7 100644
--- a/test/MARKERS_GUIDE.md
+++ b/test/MARKERS_GUIDE.md
@@ -5,7 +5,7 @@
 ```bash
 # By granularity tier
 pytest -m unit           # Self-contained, no services (fast)
-pytest -m integration    # Multi-component, fixture-managed deps
+pytest -m integration    # Real SDK/library boundary or multi-component wiring
 pytest -m e2e            # Real backends (ollama, APIs, GPU models)
 pytest -m "e2e and not qualitative"  # Deterministic real-backend tests only
 
@@ -46,17 +46,55 @@ def test_cblock_repr():
 
 ### Integration (explicit)
 
-**Multiple components wired together**, potentially needing additional services
-or fixture-managed dependencies. Backends may be mocked, stubbed, or stood up
-by test fixtures. The test controls or provides its own dependencies.
+**Verifies that your code correctly communicates across a real boundary.**
+The boundary may be a third-party SDK/library whose API contract you are
+asserting against, multiple internal components wired together, or a
+fixture-managed local service. What distinguishes integration from unit is
+that at least one real external component — not a mock or stub — is on the
+other side of the boundary being tested.
 
 - Add `@pytest.mark.integration` explicitly
-- Slower than unit (fixture setup, service lifecycle), may consume more memory
-- No backend markers needed — integration tests don't use real backends
+- No backend markers needed — integration tests do not use real LLM backends
+- Slower than unit (fixture setup, real SDK objects), but faster than e2e
+
+**Positive indicators:**
+
+- Uses a real third-party SDK object to *capture and assert* on output —
+  e.g. `InMemoryMetricReader`, `InMemorySpanExporter`, `LoggingHandler` —
+  rather than patching the SDK away
+- Asserts on the format or content of data as received by an external
+  component (semantic conventions, attribute names, accumulated values)
+- Wires multiple real project components together and mocks only at the
+  outermost boundary
+- Breaking the interface between your code and the external component
+  (e.g. a changed attribute name, a missing SDK call) would cause the test
+  to fail
+
+**Negative indicators (likely unit instead):**
+
+- All external boundaries replaced with `MagicMock`, `patch`, or `AsyncMock`
+- Third-party library imported only as a type or helper, not as a real
+  collaborator being asserted against
+- Toggles env vars and checks booleans or config state with no real SDK
+  objects instantiated
+
+**Tie-breaker:** If you changed the contract between your code and the
+external component, would this test catch it? If yes → integration. If no
+→ unit.
 
 ```python
+@pytest.mark.integration
+def test_token_metrics_format(clean_metrics_env):
+    # Real InMemoryMetricReader — asserting against the OTel SDK contract
+    reader = InMemoryMetricReader()
+    provider = MeterProvider(metric_readers=[reader])
+    record_token_usage_metrics(input_tokens=10, output_tokens=5, ...)
+    metrics_data = reader.get_metrics_data()
+    assert metrics_data.resource_metrics[0]...name == "mellea.llm.tokens.input"
+
 @pytest.mark.integration
 def test_session_chains_components(mock_backend):
+    # Multiple real project components wired together; only LLM call mocked
     session = start_session(backend=mock_backend)
     result = session.instruct("hello")
     assert mock_backend.generate.called
diff --git a/test/backends/test_openai_ollama.py b/test/backends/test_openai_ollama.py
index 0372ae603..133708a21 100644
--- a/test/backends/test_openai_ollama.py
+++ b/test/backends/test_openai_ollama.py
@@ -9,6 +9,9 @@
 # Mark all tests in this module as requiring Ollama via OpenAI-compatible API
 pytestmark = [pytest.mark.openai, pytest.mark.ollama, pytest.mark.e2e]
 
+# NOTE: test_api_key_and_base_url_from_parameters, test_parameter_overrides_env_variable,
+# and test_missing_api_key_raises_error are constructor-only unit tests that don't need
+# Ollama, but pytest has no mechanism to remove inherited module markers per-function.
 from mellea import MelleaSession
 from mellea.backends import ModelOption
diff --git a/test/cli/test_alora_train.py b/test/cli/test_alora_train.py
index 813accc5f..43862cdd8 100644
--- a/test/cli/test_alora_train.py
+++ b/test/cli/test_alora_train.py
@@ -215,6 +215,7 @@ def test_invocation_prompt_tokenization():
     )
 
 
+@pytest.mark.integration
 def test_imports_work():
     """Test that PEFT imports work correctly (no IBM alora dependency)."""
     # This test verifies the migration was successful
diff --git a/test/formatters/granite/test_intrinsics_formatters.py b/test/formatters/granite/test_intrinsics_formatters.py
index d44bf86f1..8f555f552 100644
--- a/test/formatters/granite/test_intrinsics_formatters.py
+++ b/test/formatters/granite/test_intrinsics_formatters.py
@@ -433,7 +433,6 @@ def test_canned_input(yaml_json_combo_no_alora):
     assert after_json == expected_json
 
 
-@pytest.mark.block_network
 def test_openai_compat(yaml_json_combo_no_alora):
     """
     Verify that the dataclasses for intrinsics chat completions can be directly passed
diff --git a/test/stdlib/components/docs/test_richdocument.py b/test/stdlib/components/docs/test_richdocument.py
index 45f94f4c3..9ae153517 100644
--- a/test/stdlib/components/docs/test_richdocument.py
+++ b/test/stdlib/components/docs/test_richdocument.py
@@ -3,12 +3,17 @@
 
 import pytest
 
-pytest.importorskip("docling_core", reason="docling_core not installed — install mellea[mify]")
+pytest.importorskip(
+    "docling_core", reason="docling_core not installed — install mellea[mify]"
+)
 from docling_core.types.doc.document import DoclingDocument
 
 import mellea
 from mellea.core import TemplateRepresentation
 from mellea.stdlib.components.docs.richdocument import RichDocument, Table
+from test.predicates import require_gpu
+
+pytestmark = pytest.mark.integration
 
 
 @pytest.fixture(scope="module")
@@ -100,6 +105,10 @@ def test_empty_table():
 
 
 @pytest.mark.skip  # Test requires too much memory for smaller machines.
+@pytest.mark.e2e
+@pytest.mark.huggingface
+@pytest.mark.qualitative
+@require_gpu(min_vram_gb=16)
 def test_richdocument_generation(rd: RichDocument):
     m = mellea.start_session(backend_name="hf")
     response = m.chat(rd.to_markdown()[:500] + "\nSummarize the provided document.")
diff --git a/test/telemetry/test_logging.py b/test/telemetry/test_logging.py
index 595def8c0..fb876c24e 100644
--- a/test/telemetry/test_logging.py
+++ b/test/telemetry/test_logging.py
@@ -15,9 +15,10 @@
 except ImportError:
     OTEL_AVAILABLE = False
 
-pytestmark = pytest.mark.skipif(
-    not OTEL_AVAILABLE, reason="OpenTelemetry not installed"
-)
+pytestmark = [
+    pytest.mark.skipif(not OTEL_AVAILABLE, reason="OpenTelemetry not installed"),
+    pytest.mark.integration,
+]
 
 
 def _reset_logging_modules():
diff --git a/test/telemetry/test_metrics.py b/test/telemetry/test_metrics.py
index f07ea61e9..b9a1a4c01 100644
--- a/test/telemetry/test_metrics.py
+++ b/test/telemetry/test_metrics.py
@@ -14,9 +14,10 @@
 except ImportError:
     OTEL_AVAILABLE = False
 
-pytestmark = pytest.mark.skipif(
-    not OTEL_AVAILABLE, reason="OpenTelemetry not installed"
-)
+pytestmark = [
+    pytest.mark.skipif(not OTEL_AVAILABLE, reason="OpenTelemetry not installed"),
+    pytest.mark.integration,
+]
 
 
 @pytest.fixture
diff --git a/test/telemetry/test_metrics_backend.py b/test/telemetry/test_metrics_backend.py
index 8e808880b..8623eed69 100644
--- a/test/telemetry/test_metrics_backend.py
+++ b/test/telemetry/test_metrics_backend.py
@@ -162,6 +162,7 @@ async def test_ollama_token_metrics_integration(enable_metrics, metric_reader, s
 
 @pytest.mark.asyncio
 @pytest.mark.e2e
+@pytest.mark.openai
 @pytest.mark.ollama
 @pytest.mark.parametrize("stream", [False, True], ids=["non-streaming", "streaming"])
 async def test_openai_token_metrics_integration(enable_metrics, metric_reader, stream):
@@ -222,9 +223,6 @@ async def test_openai_token_metrics_integration(enable_metrics, metric_reader, s
 @require_api_key("WATSONX_API_KEY", "WATSONX_URL", "WATSONX_PROJECT_ID")
 async def test_watsonx_token_metrics_integration(enable_metrics, metric_reader):
     """Test that WatsonX backend records token metrics correctly."""
-    if not os.getenv("WATSONX_API_KEY"):
-        pytest.skip("WATSONX_API_KEY not set")
-
     from mellea.backends.watsonx import WatsonxAIBackend
     from mellea.telemetry import metrics as metrics_module
diff --git a/test/telemetry/test_metrics_token.py b/test/telemetry/test_metrics_token.py
index 2a61acce9..631e313a9 100644
--- a/test/telemetry/test_metrics_token.py
+++ b/test/telemetry/test_metrics_token.py
@@ -15,9 +15,10 @@
 except ImportError:
     OTEL_AVAILABLE = False
 
-pytestmark = pytest.mark.skipif(
-    not OTEL_AVAILABLE, reason="OpenTelemetry not installed"
-)
+pytestmark = [
+    pytest.mark.skipif(not OTEL_AVAILABLE, reason="OpenTelemetry not installed"),
+    pytest.mark.integration,
+]
 
 
 @pytest.fixture

From 7ccf18222546bef6e46fc5f1bfd6af8e1bbdd6ce Mon Sep 17 00:00:00 2001
From: Nigel Jones
Date: Thu, 26 Mar 2026 20:17:14 +0000
Subject: [PATCH 19/42] test: add importorskip guards and optional-dep skip
 logic for examples
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- test/plugins/test_payloads.py: importorskip("cpex") — skip module when
  mellea[hooks] not installed instead of failing mid-test with ImportError
- test/telemetry/test_metrics_plugins.py: same cpex guard
- docs/examples/conftest.py: extend _check_optional_imports to cover
  docling, pandas, cpex (mellea.plugins imports), and litellm; also call the
  check from pytest_pycollect_makemodule so directly-specified files are
  guarded too
- docs/examples/image_text_models/README.md: add Prerequisites section
  listing models to pull (granite3.2-vision, qwen2.5vl:7b)
---
 docs/examples/conftest.py                 | 47 +++++++++++++++++++++++
 docs/examples/image_text_models/README.md | 11 +++++-
 test/plugins/test_payloads.py             |  2 +
 test/telemetry/test_metrics_plugins.py    |  2 +
 4 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/docs/examples/conftest.py b/docs/examples/conftest.py
index af2fb7112..fbc8ecfd0 100644
--- a/docs/examples/conftest.py
+++ b/docs/examples/conftest.py
@@ -172,6 +172,47 @@ def _check_optional_imports(file_path):
                     "langchain_community not installed (install with: uv pip install mellea[tools])",
                 )
 
+        # Check for docling imports (direct or via mellea.stdlib.components.docs)
+        if (
+            "import docling" in content
+            or "from docling" in content
+            or "from mellea.stdlib.components.docs.richdocument" in content
+        ):
+            try:
+                import docling
+            except ImportError:
+                return (
+                    True,
+                    "docling not installed (install with: uv pip install mellea[docling])",
+                )
+
+        # Check for pandas imports
+        if "import pandas" in content or "from pandas" in content:
+            try:
+                import pandas
+            except ImportError:
+                return True, "pandas not installed"
+
+        # Check for cpex (mellea[hooks]) — plugins that call register()/plugin_scope()
+        if "from mellea.plugins" in content or "import mellea.plugins" in content:
+            try:
+                import cpex
+            except ImportError:
+                return (
+                    True,
+                    "cpex not installed (install with: uv pip install mellea[hooks])",
+                )
+
+        # Check for litellm
+        if "import litellm" in content or "from litellm" in content:
+            try:
+                import litellm
+            except ImportError:
+                return (
+                    True,
+                    "litellm not installed (install with: uv pip install mellea[backends])",
+                )
+
     except Exception:
         pass
 
@@ -371,6 +412,12 @@ def pytest_pycollect_makemodule(module_path, parent):
         # Prevent import by returning custom collector
         return SkippedFile.from_parent(parent, path=file_path)
 
+    # Also check optional imports here — this hook fires for directly-specified
+    # files too, whereas pytest_ignore_collect only fires during directory traversal.
+    should_skip, _reason = _check_optional_imports(file_path)
+    if should_skip:
+        return SkippedFile.from_parent(parent, path=file_path)
+
     return None
diff --git a/docs/examples/image_text_models/README.md b/docs/examples/image_text_models/README.md
index e84b4ad89..faa08d1cd 100644
--- a/docs/examples/image_text_models/README.md
+++ b/docs/examples/image_text_models/README.md
@@ -55,10 +55,19 @@ response = m.chat(
 
 ## Supported Models
 
-- **Ollama**: llava, bakllava, llava-phi3, moondream
+- **Ollama**: granite3.2-vision, llava, bakllava, llava-phi3, moondream, qwen2.5vl:7b
 - **OpenAI**: gpt-4-vision-preview, gpt-4o
 - **LiteLLM**: Various vision models through unified interface
 
+## Prerequisites
+
+Pull a vision-capable model before running these examples:
+
+```bash
+ollama pull granite3.2-vision  # ~2.4 GB — primary recommended model
+ollama pull qwen2.5vl:7b       # ~4.7 GB — used in vision_openai_examples.py
+```
+
 ## Related Documentation
 
 - See `test/backends/test_vision_*.py` for more examples
diff --git a/test/plugins/test_payloads.py b/test/plugins/test_payloads.py
index a6c06beef..81fa7f6bf 100644
--- a/test/plugins/test_payloads.py
+++ b/test/plugins/test_payloads.py
@@ -3,6 +3,8 @@
 import pytest
 from pydantic import ValidationError
 
+pytest.importorskip("cpex", reason="cpex not installed — install mellea[hooks]")
+
 from mellea.plugins.base import MelleaBasePayload
 from mellea.plugins.hooks.component import ComponentPreExecutePayload
 from mellea.plugins.hooks.generation import GenerationPreCallPayload
diff --git a/test/telemetry/test_metrics_plugins.py b/test/telemetry/test_metrics_plugins.py
index f89c2be57..9f577714d 100644
--- a/test/telemetry/test_metrics_plugins.py
+++ b/test/telemetry/test_metrics_plugins.py
@@ -4,6 +4,8 @@
 
 import pytest
 
+pytest.importorskip("cpex", reason="cpex not installed — install mellea[hooks]")
+
 from mellea.core.base import ModelOutputThunk
 from mellea.plugins.hooks.generation import GenerationPostCallPayload
 from mellea.telemetry.metrics_plugins import TokenMetricsPlugin

From 32f1f9b44c2816f8e90eb413017009889331f6e9 Mon Sep 17 00:00:00 2001
From: Nigel Jones
Date: Thu, 26 Mar 2026 20:21:21 +0000
Subject: [PATCH 20/42] fix: convert example import errors to skips; add cpex
 importorskip guards
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace per-dep import checks in examples conftest with a runtime approach:
ExampleModule (a pytest.Module subclass) is now returned by
pytest_pycollect_makemodule for all runnable example files, preventing
pytest's default collector from importing them directly. Import errors in
the subprocess are caught in ExampleItem.runtest() and converted to skips,
so no optional dependency needs to be encoded in conftest.

Remove _check_optional_imports entirely — it was hand-maintained and would
need updating for every new optional dep.

Also:
- test/plugins/test_payloads.py: importorskip("cpex")
- test/telemetry/test_metrics_plugins.py: importorskip("cpex")
- docs/examples/image_text_models/README.md: add Prerequisites section
  listing models to pull (granite3.2-vision, qwen2.5vl:7b)
---
 docs/examples/conftest.py | 117 ++++++++++----------------------------
 1 file changed, 29 insertions(+), 88 deletions(-)

diff --git a/docs/examples/conftest.py b/docs/examples/conftest.py
index fbc8ecfd0..9740306ff 100644
--- a/docs/examples/conftest.py
+++ b/docs/examples/conftest.py
@@ -143,80 +143,6 @@ def _should_skip_collection(markers):
     return False, None
 
 
-def _check_optional_imports(file_path):
-    """Check if file has optional imports that aren't installed.
-
-    Returns (should_skip, reason) tuple.
- """ - try: - with open(file_path) as f: - content = f.read() - - # Check for langchain imports - if "from langchain" in content or "import langchain" in content: - try: - import langchain_core - except ImportError: - return True, "langchain_core not installed" - - # Check for langchain_community specifically - if ( - "from langchain_community" in content - or "import langchain_community" in content - ): - try: - import langchain_community - except ImportError: - return ( - True, - "langchain_community not installed (install with: uv pip install mellea[tools])", - ) - - # Check for docling imports (direct or via mellea.stdlib.components.docs) - if ( - "import docling" in content - or "from docling" in content - or "from mellea.stdlib.components.docs.richdocument" in content - ): - try: - import docling - except ImportError: - return ( - True, - "docling not installed (install with: uv pip install mellea[docling])", - ) - - # Check for pandas imports - if "import pandas" in content or "from pandas" in content: - try: - import pandas - except ImportError: - return True, "pandas not installed" - - # Check for cpex (mellea[hooks]) — plugins that call register()/plugin_scope() - if "from mellea.plugins" in content or "import mellea.plugins" in content: - try: - import cpex - except ImportError: - return ( - True, - "cpex not installed (install with: uv pip install mellea[hooks])", - ) - - # Check for litellm - if "import litellm" in content or "from litellm" in content: - try: - import litellm - except ImportError: - return ( - True, - "litellm not installed (install with: uv pip install mellea[backends])", - ) - - except Exception: - pass - - return False, None def pytest_addoption(parser): @@ -412,13 +338,10 @@ def pytest_pycollect_makemodule(module_path, parent): # Prevent import by returning custom collector return SkippedFile.from_parent(parent, path=file_path) - # Also check optional imports here — this hook fires for directly-specified - # files too, whereas 
pytest_ignore_collect only fires during directory traversal. - should_skip, _reason = _check_optional_imports(file_path) - if should_skip: - return SkippedFile.from_parent(parent, path=file_path) - - return None + # Return ExampleModule so pytest never falls through to its default Module + # collector (which would import the file directly). Import errors are + # instead caught at runtime in ExampleItem.runtest() and converted to skips. + return ExampleModule.from_parent(parent, path=file_path) def pytest_ignore_collect(collection_path, config): @@ -496,13 +419,10 @@ def pytest_collect_file(parent: pytest.Dir, file_path: pathlib.PosixPath): # If we can't read markers, continue with other checks pass - # Check for optional imports before creating ExampleFile - should_skip, _reason = _check_optional_imports(file_path) - if should_skip: - # FIX: Return SkippedFile instead of None for optional import skips too - return SkippedFile.from_parent(parent, path=file_path) - - return ExampleFile.from_parent(parent, path=file_path) + # ExampleModule (returned by pytest_pycollect_makemodule) handles + # collection for files that should run — return None here to avoid + # creating a duplicate collector from this hook. + return None class SkippedFile(pytest.File): @@ -531,6 +451,18 @@ def collect(self): return [ExampleItem.from_parent(self, name=self.name)] +class ExampleModule(pytest.Module): + """Module stand-in that routes to ExampleItem without importing the file. + + Returned by pytest_pycollect_makemodule to prevent pytest's default + Module collector from importing the file directly (which would crash on + missing optional deps before ExampleItem.runtest() can catch them). 
+ """ + + def collect(self): + return [ExampleItem.from_parent(self, name=self.path.name)] + + class ExampleItem(pytest.Item): def __init__(self, **kwargs): super().__init__(**kwargs) @@ -583,6 +515,15 @@ def runtest(self): skip_reason = line.replace("Skipped:", "").strip() break pytest.skip(skip_reason) + elif "ModuleNotFoundError" in stderr or "ImportError" in stderr: + # Missing optional dependency — skip rather than fail so the + # suite stays green without every optional package installed. + reason = "optional dependency not installed" + for line in stderr.split("\n"): + if "ModuleNotFoundError" in line or "ImportError" in line: + reason = line.strip() + break + pytest.skip(reason) else: raise ExampleTestException( f"Example failed with exit code {retcode}.\nStderr: {stderr}\n" From 3e6ec88a96c9a5067a986447a358a75175fd10c6 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Thu, 26 Mar 2026 21:06:45 +0000 Subject: [PATCH 21/42] test: skip OTel-dependent tests when opentelemetry not installed Locally running without mellea[telemetry] caused three tests to fail with assertion errors rather than skip cleanly. Add importorskip at module level for test_tracing.py and a skipif decorator for the single OTel-gated test in test_astream_exception_propagation.py. 
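Both guard styles reduce to the same cheap availability probe. A minimal sketch, using stdlib stand-in names rather than the real optional deps:

```python
import importlib.util


def optional_dep_available(name: str) -> bool:
    """Cheaply check importability without actually importing (no side effects)."""
    return importlib.util.find_spec(name) is not None


# Stand-in names for illustration only: "json" ships with Python, the
# second name should never resolve.
print(optional_dep_available("json"))                    # True
print(optional_dep_available("not_a_real_package_xyz"))  # False
```

`pytest.importorskip` applies this at import time and skips the whole module; the `skipif` decorator gates a single test so the rest of the module still runs without the dependency.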
--- test/core/test_astream_exception_propagation.py | 7 +++++++ test/telemetry/test_tracing.py | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/test/core/test_astream_exception_propagation.py b/test/core/test_astream_exception_propagation.py index cd441250a..a8ebeca10 100644 --- a/test/core/test_astream_exception_propagation.py +++ b/test/core/test_astream_exception_propagation.py @@ -6,11 +6,14 @@ """ import datetime +import importlib.util import pytest from mellea.core.base import CBlock, GenerateType, ModelOutputThunk +_otel_available = importlib.util.find_spec("opentelemetry") is not None + async def _noop_process(mot, chunk): if mot._underlying_value is None: @@ -76,6 +79,10 @@ async def _tracking_post_process(mot): assert post_process_called, "post_process should be called on successful completion" +@pytest.mark.skipif( + not _otel_available, + reason="opentelemetry not installed — install mellea[telemetry]", +) async def test_astream_closes_telemetry_span_on_error(): """Telemetry span must be ended and error recorded when generation fails.""" from unittest.mock import MagicMock diff --git a/test/telemetry/test_tracing.py b/test/telemetry/test_tracing.py index 5ea1a9e00..6566e8e70 100644 --- a/test/telemetry/test_tracing.py +++ b/test/telemetry/test_tracing.py @@ -4,6 +4,10 @@ import pytest +pytest.importorskip( + "opentelemetry", reason="opentelemetry not installed — install mellea[telemetry]" +) + @pytest.fixture def enable_app_tracing(monkeypatch): From c6fbfb658f4c5f9490601868c70279a4978d8830 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Thu, 26 Mar 2026 21:16:28 +0000 Subject: [PATCH 22/42] fix: use conservative heuristic for Apple Silicon GPU memory detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Metal's recommendedMaxWorkingSetSize is a static device property (~75% of total RAM) that ignores current system load. 
Replace it with min(total * 0.75, total - 16) so that desktop/IDE memory usage is accounted for. Also removes the torch dependency for GPU detection on Apple Silicon — sysctl hw.memsize is used directly. CUDA path on Linux is unchanged. --- test/conftest.py | 30 ++++++++++++++++++------------ test/predicates.py | 41 ++++++++++++++++++++++++++++++++--------- 2 files changed, 50 insertions(+), 21 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index 987dabb7f..6e38fad27 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -65,12 +65,25 @@ def get_system_capabilities(): } # Detect GPU (CUDA for NVIDIA, MPS for Apple Silicon) - if HAS_TORCH: - has_cuda = torch.cuda.is_available() - has_mps = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() - capabilities["has_gpu"] = has_cuda or has_mps + import platform as _platform + import subprocess as _subprocess - if has_cuda: + _is_apple_silicon = sys.platform == "darwin" and _platform.machine() == "arm64" + + if _is_apple_silicon: + capabilities["has_gpu"] = True + try: + out = _subprocess.run( + ["sysctl", "-n", "hw.memsize"], + capture_output=True, text=True, timeout=2, + ) + total_gb = int(out.stdout.strip()) / (1024**3) + capabilities["gpu_memory_gb"] = min(total_gb * 0.75, total_gb - 16) + except Exception: + pass + elif HAS_TORCH: + if torch.cuda.is_available(): + capabilities["has_gpu"] = True try: # Use nvidia-smi to avoid initializing CUDA in parent process. 
# torch.cuda.get_device_properties(0) creates a CUDA context, @@ -89,13 +102,6 @@ def get_system_capabilities(): capabilities["gpu_memory_gb"] = float(result.stdout.strip()) / 1024 except Exception: pass - elif has_mps: - try: - capabilities["gpu_memory_gb"] = torch.mps.recommended_max_memory() / ( - 1024**3 - ) - except (RuntimeError, AttributeError): - pass # Detect RAM if HAS_PSUTIL: diff --git a/test/predicates.py b/test/predicates.py index 246d68c72..4d6005154 100644 --- a/test/predicates.py +++ b/test/predicates.py @@ -26,6 +26,8 @@ def test_large_model(): from __future__ import annotations import os +import platform +import subprocess import sys import pytest @@ -34,14 +36,36 @@ def test_large_model(): # GPU # --------------------------------------------------------------------------- +_IS_APPLE_SILICON = sys.platform == "darwin" and platform.machine() == "arm64" + + +def _apple_silicon_vram_gb() -> float: + """Conservative usable GPU memory estimate for Apple Silicon. + + Metal's ``recommendedMaxWorkingSetSize`` is a static device property + (~75% of total RAM) that does not account for current system load. + We use ``min(total * 0.75, total - 16)`` to leave headroom for the OS + and desktop applications, which typically consume 8–16 GB on a loaded + developer machine. + """ + try: + out = subprocess.run( + ["sysctl", "-n", "hw.memsize"], + capture_output=True, text=True, timeout=2, + ) + total_gb = int(out.stdout.strip()) / (1024**3) + return min(total_gb * 0.75, total_gb - 16) + except Exception: + return 0.0 + def _gpu_available() -> bool: + if _IS_APPLE_SILICON: + return True try: import torch - return torch.cuda.is_available() or ( - hasattr(torch.backends, "mps") and torch.backends.mps.is_available() - ) + return torch.cuda.is_available() except ImportError: return False @@ -49,18 +73,17 @@ def _gpu_available() -> bool: def _gpu_vram_gb() -> float: """Return usable GPU VRAM in GB, or 0 if unavailable. - On CUDA: reports device 0 total memory. 
- On macOS MPS: reports ``recommendedMaxWorkingSetSize`` — the Metal - driver's own estimate of how much unified memory the GPU can use - without degrading system performance. + On Apple Silicon: uses a conservative heuristic based on total unified + memory rather than Metal's static ``recommendedMaxWorkingSetSize``. + On CUDA: reports device 0 total memory via torch. """ + if _IS_APPLE_SILICON: + return _apple_silicon_vram_gb() try: import torch if torch.cuda.is_available(): return torch.cuda.get_device_properties(0).total_memory / (1024**3) - if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): - return torch.mps.recommended_max_memory() / (1024**3) except (ImportError, RuntimeError, AttributeError): pass return 0.0 From 8cee78166c1b2ac5bbc59dfa35edaf7c3e41b854 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Thu, 26 Mar 2026 21:43:02 +0000 Subject: [PATCH 23/42] test: add training memory signals to audit-markers skill; bump alora VRAM gate Training tests need ~2x the base model inference memory (activations, optimizer states, gradient temporaries). The skill now detects training signals (train_model, Trainer, epochs=) and checks that require_gpu min_vram_gb uses the 2x rule. Bump test_alora_train_integration from min_vram_gb=12 to 20 (3B bfloat16: ~6 GB inference, ~12 GB training peak + headroom) so it skips correctly on 32 GB Apple Silicon under typical load. --- .agents/skills/audit-markers/SKILL.md | 37 ++++++++++++++++++++++++ test/cli/test_alora_train_integration.py | 4 ++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/.agents/skills/audit-markers/SKILL.md b/.agents/skills/audit-markers/SKILL.md index d4dd1400e..3741f11b8 100644 --- a/.agents/skills/audit-markers/SKILL.md +++ b/.agents/skills/audit-markers/SKILL.md @@ -202,6 +202,37 @@ or non-unit. Use these to triage at scale (see Audit Procedure, Step 0). 
| GPU / model loading | `import torch`, `.to("cuda")`, `.from_pretrained(` | Hardware dependency | | External downloads | URL literals (`http://`, `https://`), `urlopen`, `requests.get` with URLs | Network fetch | +### Training memory signals (check `require_gpu` threshold) + +Training tests consume significantly more memory than inference. When these +patterns appear, verify that `require_gpu(min_vram_gb=N)` uses the **training +peak**, not just the model parameter size: + +| Category | Grep patterns | Notes | +|---|---|---| +| Model load | `from_pretrained(`, `AutoModelForCausalLM`, `AutoModelForSeq2SeqLM`, `AutoTokenizer` | Downloading/loading a real model | +| Training | `train_model(`, `Trainer(`, `trainer.train(`, `epochs=`, `num_train_epochs=` | Any training loop | +| Inference on real model | `.generate(` in a test body without a mock | Full model forward pass | +| HF dataset download | `load_dataset(` | Dataset fetch + tokenisation | + +**Training memory rule:** Training requires ~2× the base model weight memory +(activations, optimizer states, gradient temporaries). A test that trains then +reloads the model for inference has two separate peaks — use the training peak +plus headroom: + +``` +min_vram_gb = (model_param_bytes_in_bfloat16 * 2) + headroom + ≈ (params_B * 2 GB) * 2 + ~4 GB headroom +``` + +Example: 3B bfloat16 model → 6 GB weights → training peak ~12 GB → set +`require_gpu(min_vram_gb=20)` so the gate fires on machines where available +GPU memory is below that, rather than letting the test OOM mid-run. + +The VRAM heuristic (`predicates.py`) reports *estimated available* memory, not +total RAM. A 32 GB Apple Silicon machine reports ~16 GB available. Setting +`min_vram_gb=20` correctly skips on that machine while running on 48 GB+. 
+ ### SDK-boundary signals (test is likely integration, not unit) These patterns indicate real third-party SDK objects are being used as @@ -611,6 +642,9 @@ Run grep across all target files for: network literals (`localhost`, `127.0.0.1`, port numbers), HTTP client usage, subprocess calls, `_API_KEY`/`_TOKEN`/`_SECRET` in env var checks, GPU/model loading (`torch`, `.from_pretrained(`), URL literals. +1a. **Training signals** — `from_pretrained(`, `train_model(`, `Trainer(`, `epochs=`, + `.generate(` (non-mock), `load_dataset(`. Cross-reference against + `require_gpu(min_vram_gb=N)` — check N uses the 2× training memory rule. 2. **SDK-boundary signals** — real third-party SDK objects used as collaborators: `InMemoryMetricReader`, `InMemorySpanExporter`, `MeterProvider(metric_readers=`, `TracerProvider(`, `LoggingHandler`, `provider.force_flush()`, @@ -662,6 +696,9 @@ from Part 1 using the project-specific heuristics from Part 2: 2. **Which backend(s)?** → backend markers (e2e only) 3. **Deterministic or content-dependent assertions?** → e2e vs qualitative 4. **What resources?** → resource markers +5. **Training memory?** → if training signals present (`train_model(`, `Trainer(`, + `epochs=`), verify `require_gpu(min_vram_gb=N)` uses 2× the model inference + memory + headroom (see Training memory signals table). If uncertain about a classification (especially qualitative vs e2e), note it and ask the user to confirm. 
diff --git a/test/cli/test_alora_train_integration.py b/test/cli/test_alora_train_integration.py index 221e6dde9..0aa29af2f 100644 --- a/test/cli/test_alora_train_integration.py +++ b/test/cli/test_alora_train_integration.py @@ -20,7 +20,9 @@ pytestmark = [ pytest.mark.huggingface, pytest.mark.e2e, - require_gpu(min_vram_gb=12), + require_gpu( + min_vram_gb=20 + ), # 3B bfloat16: ~6 GB inference, ~12 GB training peak + headroom # Skip entire module in CI since 17/18 tests are qualitative pytest.mark.skipif( int(os.environ.get("CICD", 0)) == 1, From 28808ff91bae534fca1bbffc75c11a816ba74626 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Thu, 26 Mar 2026 23:05:50 +0000 Subject: [PATCH 24/42] fix: cache system capabilities result in examples conftest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit get_system_capabilities() was caching the function reference, not the result — causing the Ollama socket check (1s timeout) and full capability detection to re-run for every example file during collection (~102 times). Cache the result dict instead so detection runs exactly once. 
--- docs/examples/conftest.py | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/docs/examples/conftest.py b/docs/examples/conftest.py index 9740306ff..c117c907e 100644 --- a/docs/examples/conftest.py +++ b/docs/examples/conftest.py @@ -11,16 +11,16 @@ import pytest -# Lazy import of system capability detection to avoid circular imports -_get_system_capabilities = None +# Cached result of system capability detection (None = not yet computed) +_capabilities_cache: dict | None = None def get_system_capabilities(): - """Lazy load system capabilities from test/conftest.py.""" - global _get_system_capabilities + """Lazy load system capabilities from test/conftest.py, cached after first call.""" + global _capabilities_cache - if _get_system_capabilities is not None: - return _get_system_capabilities() + if _capabilities_cache is not None: + return _capabilities_cache # Add test directory to path to enable import _test_dir = pathlib.Path(__file__).parent.parent.parent / "test" @@ -38,8 +38,8 @@ def get_system_capabilities(): if spec and spec.loader: test_conftest = importlib.util.module_from_spec(spec) spec.loader.exec_module(test_conftest) - _get_system_capabilities = test_conftest.get_system_capabilities - return _get_system_capabilities() + _capabilities_cache = test_conftest.get_system_capabilities() + return _capabilities_cache else: raise ImportError("Could not load test/conftest.py") except (ImportError, AttributeError) as e: @@ -50,17 +50,14 @@ def get_system_capabilities(): f"Could not import get_system_capabilities from test/conftest.py: {e}. Heavy RAM tests will NOT be skipped!" 
) - def fallback(): - return { - "has_gpu": False, - "gpu_memory_gb": 0, - "ram_gb": 0, - "has_api_keys": {}, - "has_ollama": False, - } - - _get_system_capabilities = fallback - return fallback() + _capabilities_cache = { + "has_gpu": False, + "gpu_memory_gb": 0, + "ram_gb": 0, + "has_api_keys": {}, + "has_ollama": False, + } + return _capabilities_cache examples_to_skip: dict[str, str] = {} From 7f05eb8c1ae05aa95cda50255843200e322144d3 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Thu, 26 Mar 2026 23:07:03 +0000 Subject: [PATCH 25/42] fix: cache get_system_capabilities() result in test/conftest.py The function was called once per test in pytest_runtest_setup (325+ calls) and once at collection in pytest_collection_modifyitems, each time re-running the Ollama socket check (1s timeout when down), sysctl subprocess, and psutil query. Cache the result after the first call. --- test/conftest.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/conftest.py b/test/conftest.py index 6e38fad27..136fd3467 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -54,8 +54,15 @@ def _check_ollama_available(): return False +_capabilities_cache: dict | None = None + + def get_system_capabilities(): """Detect system capabilities for test requirements.""" + global _capabilities_cache + if _capabilities_cache is not None: + return _capabilities_cache + capabilities = { "has_gpu": False, "gpu_memory_gb": 0, @@ -123,6 +130,7 @@ def get_system_capabilities(): # Detect Ollama availability capabilities["has_ollama"] = _check_ollama_available() + _capabilities_cache = capabilities return capabilities From 66d35f067a6e6bba191a5b95133f91bdfacbe086 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Fri, 27 Mar 2026 06:38:36 +0000 Subject: [PATCH 26/42] fix: flush MPS memory pool in intrinsic test fixture teardown torch.cuda.empty_cache() is a no-op on Apple Silicon MPS, leaving the MPS allocator pool occupied after each module fixture tears down. 
The next module then loads a fresh model into an already-pressured pool, causing the process RSS to grow unboundedly across modules. Both calls are now guarded so CUDA and MPS runs each get the correct flush. --- test/conftest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/conftest.py b/test/conftest.py index 136fd3467..c6251504d 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -588,7 +588,7 @@ def cleanup_gpu_backend(backend, backend_name="unknown"): pass del backend._underlying_model - # 8. Force garbage collection and flush CUDA cache + # 8. Force garbage collection and flush device caches gc.collect() gc.collect() @@ -602,6 +602,8 @@ def cleanup_gpu_backend(backend, backend_name="unknown"): f"/ {total / 1024**3:.1f}GB total " f"(reclaimed {(free_after - free_before) / 1024**3:.1f}GB)" ) + if torch.backends.mps.is_available(): + torch.mps.empty_cache() except ImportError: pass From 355154f5a19c35b563e1f08c1d6d1434ec8d8251 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Fri, 27 Mar 2026 06:38:45 +0000 Subject: [PATCH 27/42] fix: load LocalHFBackend model in config dtype to prevent float32 upcasting AutoModelForCausalLM.from_pretrained without torch_dtype may load weights in float32 on CPU before moving to MPS/CUDA, doubling peak memory briefly and leaving float32 remnants in the allocator pool. torch_dtype="auto" respects the model config (bfloat16 for Granite) for both the CPU load and the device transfer. --- mellea/backends/huggingface.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mellea/backends/huggingface.py b/mellea/backends/huggingface.py index e6236e5c6..d4c9fb65a 100644 --- a/mellea/backends/huggingface.py +++ b/mellea/backends/huggingface.py @@ -300,7 +300,9 @@ def __init__( ) # Get the model and tokenizer. 
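Back-of-envelope arithmetic for why the missing dtype matters, with illustrative numbers for a 3B-parameter model:

```python
def weight_gb(params_billion: float, bytes_per_param: float) -> float:
    """Approximate weight memory in GB (1e9 params x bytes/param ~ GB)."""
    return params_billion * bytes_per_param


# Without torch_dtype="auto", weights may land in float32 (4 bytes/param)
# on CPU before the device move; the config dtype (bfloat16, 2 bytes/param)
# needs half that.
print(weight_gb(3.0, 4))  # 12.0 GB transient float32 peak
print(weight_gb(3.0, 2))  # 6.0 GB when the config dtype is respected
```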
self._model: PreTrainedModel = AutoModelForCausalLM.from_pretrained( - self._hf_model_id, device_map=str(self._device) + self._hf_model_id, + device_map=str(self._device), + torch_dtype="auto", ) self._tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained( self._hf_model_id From 601162c0e65a269c05b68e6def92cddddd285f9f Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Fri, 27 Mar 2026 07:12:30 +0000 Subject: [PATCH 28/42] test: remove --isolate-heavy process isolation and bump intrinsic VRAM gates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove --isolate-heavy flag, _run_heavy_modules_isolated(), pytest_collection_finish(), and require_gpu_isolation() predicate — superseded by cleanup_gpu_backend() from PR #721 - Remove dead requires_gpu/requires_api_key branches from docs/examples/conftest.py - Bump min_vram_gb from 8 → 12 on test_guardian, test_core, test_rag, test_spans — correct gate for 3B base model (6 GB) + adapters + inference overhead; 8 GB was wrong and masked by the now-fixed MPS pool leak - Add adapter accumulation signals to audit-markers skill - Update AGENTS.md, test/README.md, MARKERS_GUIDE.md to remove --isolate-heavy references --- .agents/skills/audit-markers/SKILL.md | 28 +++ AGENTS.md | 1 - docs/examples/conftest.py | 24 +-- test/MARKERS_GUIDE.md | 1 - test/README.md | 61 +----- test/backends/test_vllm.py | 13 +- test/backends/test_vllm_tools.py | 13 +- test/conftest.py | 186 +----------------- test/predicates.py | 26 +-- test/stdlib/components/intrinsic/test_core.py | 2 +- .../components/intrinsic/test_guardian.py | 2 +- test/stdlib/components/intrinsic/test_rag.py | 2 +- test/stdlib/test_spans.py | 6 +- 13 files changed, 58 insertions(+), 307 deletions(-) diff --git a/.agents/skills/audit-markers/SKILL.md b/.agents/skills/audit-markers/SKILL.md index 3741f11b8..5d7172ba3 100644 --- a/.agents/skills/audit-markers/SKILL.md +++ b/.agents/skills/audit-markers/SKILL.md @@ -233,6 +233,34 @@ 
The VRAM heuristic (`predicates.py`) reports *estimated available* memory, not total RAM. A 32 GB Apple Silicon machine reports ~16 GB available. Setting `min_vram_gb=20` correctly skips on that machine while running on 48 GB+. +### Adapter accumulation signals (module-scoped backend with multiple intrinsics) + +When a test module uses a **module-scoped backend fixture** and calls multiple different intrinsic +functions (`call_intrinsic`, `guardian_check`, `policy_guardrails`, `guardian_check_harm`, etc.) +against the same backend, each intrinsic loads a separate LoRA adapter that stays resident in +`_loaded_adapters` for the lifetime of the fixture. Adapters do NOT auto-unload between tests. + +| Signal | Pattern | Notes | +|---|---|---| +| Module-scoped HF backend | `scope="module"` on a `LocalHFBackend` fixture | Adapter accumulation possible | +| Multiple intrinsic calls | `call_intrinsic(`, `guardian_check(`, `policy_guardrails(`, `factuality_` | Each loads a distinct adapter | +| No `unload_adapter` | Absence of `backend.unload_adapter(` in fixture teardown | Adapters pile up | + +**Memory estimate for modules with adapter accumulation:** + +``` +min_vram_gb = base_model_gb + (N_distinct_intrinsics × ~0.2 GB) + inference_overhead_gb +``` + +For `test_guardian.py` (6 tests, 3B base model, ~4 distinct adapters): +- Base model: ~6 GB +- 4 adapters: ~0.8 GB +- Inference overhead: ~2 GB +- Total: ~9 GB minimum → use `require_gpu(min_vram_gb=12)` for headroom + +Flag any module where `scope="module"` backend + multiple distinct intrinsic calls has +`require_gpu(min_vram_gb=N)` set only to cover the base model size. 
+ ### SDK-boundary signals (test is likely integration, not unit) These patterns indicate real third-party SDK objects are being used as diff --git a/AGENTS.md b/AGENTS.md index c329c3bf4..3bb1fe75c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -25,7 +25,6 @@ uv run pytest # Default: qualitative tests, skip slow te uv run pytest -m "not qualitative" # Fast tests only (~2 min) uv run pytest -m slow # Run only slow tests (>5 min) uv run pytest --co -q # Run ALL tests including slow (bypass config) -uv run pytest --isolate-heavy # Enable GPU process isolation (opt-in) uv run ruff format . # Format code uv run ruff check . # Lint code uv run mypy . # Type check diff --git a/docs/examples/conftest.py b/docs/examples/conftest.py index c117c907e..1ff04019d 100644 --- a/docs/examples/conftest.py +++ b/docs/examples/conftest.py @@ -121,7 +121,7 @@ def _should_skip_collection(markers): return True, "Skipping slow test (SKIP_SLOW=1)" # Skip tests requiring GPU if not available - if "requires_gpu" in markers or "huggingface" in markers or "vllm" in markers: + if "huggingface" in markers or "vllm" in markers: if not capabilities["has_gpu"]: return True, "GPU not available" @@ -131,17 +131,16 @@ def _should_skip_collection(markers): return True, "Ollama not available (port 11434 not listening)" # Skip tests requiring API keys - if "requires_api_key" in markers or "watsonx" in markers: - if "watsonx" in markers and not capabilities["has_api_keys"].get("watsonx"): + if "watsonx" in markers: + if not capabilities["has_api_keys"].get("watsonx"): return True, "Watsonx API credentials not found" - if "openai" in markers and not capabilities["has_api_keys"].get("openai"): + if "openai" in markers: + if not capabilities["has_api_keys"].get("openai"): return True, "OpenAI API key not found" return False, None - - def pytest_addoption(parser): """Add command-line options for skipping capability checks. 
@@ -577,20 +576,9 @@ def pytest_runtest_setup(item): reason="Skipping qualitative test: got env variable CICD == 1. Used only in gh workflows." ) - # Skip tests requiring API keys if not available - if item.get_closest_marker("requires_api_key") and not ignore_api_key: - for backend in ["openai", "watsonx"]: - if item.get_closest_marker(backend): - if not capabilities["has_api_keys"].get(backend): - pytest.skip( - f"Skipping test: {backend} API key not found in environment" - ) - # Skip tests requiring GPU if not available if ( - item.get_closest_marker("requires_gpu") - or item.get_closest_marker("huggingface") - or item.get_closest_marker("vllm") + item.get_closest_marker("huggingface") or item.get_closest_marker("vllm") ) and not ignore_gpu: if not capabilities["has_gpu"]: pytest.skip("Skipping test: GPU not available") diff --git a/test/MARKERS_GUIDE.md b/test/MARKERS_GUIDE.md index 8d4949ca7..74cbdf050 100644 --- a/test/MARKERS_GUIDE.md +++ b/test/MARKERS_GUIDE.md @@ -313,7 +313,6 @@ jobs: ``` - `CICD=1` skips qualitative tests -- `CICD=1` enables GPU process isolation (`--isolate-heavy` behaviour) - `slow` tests excluded by default (add `-m slow` to include) ## Related Files diff --git a/test/README.md b/test/README.md index a8c9df3c3..c9824366c 100644 --- a/test/README.md +++ b/test/README.md @@ -17,62 +17,9 @@ uv run pytest -m slow ## Environment Variables -- `CICD=1` - Enable CI mode (skips qualitative tests, enables GPU process isolation) +- `CICD=1` - Enable CI mode (skips qualitative tests) - `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True` - Helps with GPU memory fragmentation -## Heavy GPU Tests - Process Isolation - -**Heavy GPU tests (HuggingFace, vLLM) can use process isolation to guarantee GPU memory release between test modules.** - -### Why Process Isolation? - -Heavy GPU backends (HuggingFace, vLLM) hold GPU memory at the process level. 
Even with aggressive cleanup (garbage collection, CUDA cache clearing, etc.), GPU memory remains locked by the CUDA driver until the process exits. When running multiple heavy GPU test modules in sequence, this can cause OOM errors. - -### How It Works - -Process isolation is **opt-in** via the `--isolate-heavy` flag or `CICD=1` environment variable. When enabled, the collection hook in `test/conftest.py`: - -1. Detects modules gated with `require_gpu_isolation()` (or the deprecated `@pytest.mark.requires_gpu_isolation`) -2. Runs each marked module in a separate subprocess -3. Sets required environment variables (`PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`) -4. Ensures full GPU memory release between modules -5. Reports results from all modules - -### Usage - -```bash -# Normal execution (no isolation) - fast, but may hit GPU OOM with multiple heavy modules -uv run pytest test/backends/test_vllm.py test/backends/test_huggingface.py - -# With isolation (opt-in) - slower, but guarantees GPU memory release -uv run pytest test/backends/test_vllm.py test/backends/test_huggingface.py --isolate-heavy - -# Run all heavy GPU tests with isolation -uv run pytest -m requires_gpu_isolation --isolate-heavy - -# CI automatically enables isolation (via CICD=1) -CICD=1 uv run pytest test/backends/ - -# Single module runs normally (no isolation needed even with flag) -uv run pytest test/backends/test_vllm.py --isolate-heavy -``` - -### Affected Tests - -Tests marked with `@pytest.mark.requires_gpu_isolation`: -- `test/backends/test_huggingface.py` - HuggingFace backend tests -- `test/backends/test_huggingface_tools.py` - HuggingFace tool calling tests -- `test/backends/test_vllm.py` - vLLM backend tests -- `test/backends/test_vllm_tools.py` - vLLM tool calling tests - -### Technical Details - -- **Opt-in by default**: Use `--isolate-heavy` flag or set `CICD=1` -- **Single module**: Runs normally even with isolation flag (no subprocess overhead) -- **Multiple modules**: Each 
runs in its own subprocess with full GPU memory isolation -- **Test discovery**: Works normally (`pytest --collect-only`) - isolation only happens during execution -- **Marker-based**: Only modules with `@pytest.mark.requires_gpu_isolation` are isolated - ## GPU Testing on CUDA Systems ### The Problem: CUDA EXCLUSIVE_PROCESS Mode @@ -152,7 +99,7 @@ Key markers for GPU testing: Use predicate functions from `test/predicates.py` for resource gating: ```python -from test.predicates import require_gpu, require_ram, require_gpu_isolation +from test.predicates import require_gpu, require_ram pytestmark = [pytest.mark.e2e, pytest.mark.huggingface, require_gpu(), require_ram(min_gb=48)] ``` @@ -162,13 +109,13 @@ pytestmark = [pytest.mark.e2e, pytest.mark.huggingface, require_gpu(), require_r | `require_gpu()` | Any GPU (CUDA or MPS) | | `require_gpu(min_vram_gb=N)` | GPU with at least N GB VRAM | | `require_ram(min_gb=N)` | N GB+ system RAM | -| `require_gpu_isolation()` | Subprocess isolation for CUDA memory | | `require_api_key("ENV_VAR")` | Specific API credentials | > **Deprecated:** The markers `requires_gpu`, `requires_heavy_ram`, `requires_api_key`, > and `requires_gpu_isolation` are deprecated. Existing tests using them still work > (conftest auto-skip handles them) but new tests must use predicates. Migrate legacy -> markers to predicates when touching those files. +> markers to predicates when touching those files. `require_gpu_isolation()` has been +> removed — use `--group-by-backend` for backend grouping instead. 
## Coverage diff --git a/test/backends/test_vllm.py b/test/backends/test_vllm.py index bc054de65..d9bc8a092 100644 --- a/test/backends/test_vllm.py +++ b/test/backends/test_vllm.py @@ -34,21 +34,16 @@ ) -# vLLM tests use hybrid backend strategy (see conftest.py): -# - Default: Shared session-scoped backend (fast, no fragmentation) -# - --isolate-heavy: Module-scoped backends in separate processes +# vLLM tests use a shared session-scoped backend (see conftest.py shared_vllm_backend). +# Falls back to a module-scoped backend when --group-by-backend delays shared creation. @pytest.fixture(scope="module") def backend(shared_vllm_backend): - """Use shared session-scoped backend, or create module-scoped if isolated. - - Without --isolate-heavy: Uses shared backend (fast, no fragmentation) - With --isolate-heavy: Creates module-scoped backend (process isolation) - """ + """Use shared session-scoped backend, or create module-scoped as fallback.""" if shared_vllm_backend is not None: yield shared_vllm_backend return # skip cleanup — shared backend cleaned up by conftest - # Isolation mode - create module-scoped backend + # Fallback: shared backend not available, create module-scoped backend backend = LocalVLLMBackend( model_id=model_ids.IBM_GRANITE_4_MICRO_3B, model_options={ diff --git a/test/backends/test_vllm_tools.py b/test/backends/test_vllm_tools.py index e18d47942..861f2a51e 100644 --- a/test/backends/test_vllm_tools.py +++ b/test/backends/test_vllm_tools.py @@ -30,23 +30,18 @@ ) -# vLLM tests use hybrid backend strategy (see conftest.py): -# - Default: Shared session-scoped backend (fast, no fragmentation) -# - --isolate-heavy: Module-scoped backends in separate processes +# vLLM tests use a shared session-scoped backend (see conftest.py shared_vllm_backend). +# Falls back to a module-scoped backend when --group-by-backend delays shared creation. # Note: Originally used Mistral-7B, now uses Granite 4 Micro for consistency. 
 # Granite 4 Micro supports tool calling and is sufficient for testing.
 @pytest.fixture(scope="module")
 def backend(shared_vllm_backend):
-    """Use shared session-scoped backend, or create module-scoped if isolated.
-
-    Without --isolate-heavy: Uses shared backend (fast, no fragmentation)
-    With --isolate-heavy: Creates module-scoped backend (process isolation)
-    """
+    """Use the shared session-scoped backend, or fall back to a module-scoped one."""
     if shared_vllm_backend is not None:
         yield shared_vllm_backend
         return
 
-    # Isolation mode - create module-scoped backend
+    # Fallback: shared backend not available; create a module-scoped backend
     backend = LocalVLLMBackend(
         model_id=model_ids.IBM_GRANITE_4_MICRO_3B,
         model_options={
diff --git a/test/conftest.py b/test/conftest.py
index c6251504d..8412ec57f 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -82,7 +82,9 @@ def get_system_capabilities():
     try:
         out = _subprocess.run(
             ["sysctl", "-n", "hw.memsize"],
-            capture_output=True, text=True, timeout=2,
+            capture_output=True,
+            text=True,
+            timeout=2,
         )
         total_gb = int(out.stdout.strip()) / (1024**3)
         capabilities["gpu_memory_gb"] = min(total_gb * 0.75, total_gb - 16)
@@ -149,25 +151,9 @@ def gh_run() -> int:
 def shared_vllm_backend(request):
     """Shared vLLM backend for ALL vLLM tests across all modules.
 
-    When --isolate-heavy is used, returns None to allow module-scoped backends.
     When --group-by-backend is used, delays creation until after openai_vllm group.
     Uses IBM Granite 4 Micro as a small, fast model suitable for all vLLM tests.
     """
-    # Check if process isolation is enabled
-    use_isolation = (
-        request.config.getoption("--isolate-heavy", default=False)
-        or os.environ.get("CICD", "0") == "1"
-    )
-
-    if use_isolation:
-        logger = FancyLogger.get_logger()
-        logger.info(
-            "Process isolation enabled (--isolate-heavy). "
-            "Skipping shared vLLM backend - each module will create its own."
- ) - yield None - return - # When using --group-by-backend, delay backend creation until after openai_vllm group if request.config.getoption("--group-by-backend", default=False): # Check if we're currently in the openai_vllm group @@ -310,12 +296,6 @@ def add_option_safe(option_name, **kwargs): default=False, help="Register all acceptance plugin sets for every test", ) - add_option_safe( - "--isolate-heavy", - action="store_true", - default=False, - help="Run heavy GPU tests in isolated subprocesses (slower, but guarantees CUDA memory release)", - ) add_option_safe( "--group-by-backend", action="store_true", @@ -386,104 +366,6 @@ def pytest_configure(config): # ============================================================================ -def _run_heavy_modules_isolated(session, heavy_modules: list[str]) -> int: - """Run heavy RAM test modules in separate processes for GPU memory isolation. - - Streams output in real-time and parses for test failures to provide - a clear summary at the end. - - Returns exit code (0 = all passed, 1 = any failed). 
- """ - print("\n" + "=" * 70) - print("Heavy GPU Test Process Isolation Active") - print("=" * 70) - print( - f"Running {len(heavy_modules)} heavy GPU test module(s) in separate processes" - ) - print("to ensure GPU memory is fully released between modules.\n") - - # Set environment variables for vLLM - env = os.environ.copy() - env["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" - - all_passed = True - failed_modules = {} # module_path -> list of failed test names - - for i, module_path in enumerate(heavy_modules, 1): - print(f"\n[{i}/{len(heavy_modules)}] Running: {module_path}") - print("-" * 70) - - # Build pytest command with same options as parent session - cmd = [sys.executable, "-m", "pytest", module_path, "-v", "--no-cov"] - - # Add markers from original command if present - config = session.config - markexpr = config.getoption("-m", default=None) - if markexpr: - cmd.extend(["-m", markexpr]) - - import pathlib - - repo_root = str(pathlib.Path(__file__).parent.parent.resolve()) - env["PYTHONPATH"] = f"{repo_root}{os.pathsep}{env.get('PYTHONPATH', '')}" - - # Stream output in real-time while capturing for parsing - process = subprocess.Popen( - cmd, - env=env, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, # Merge stderr into stdout - text=True, - bufsize=1, # Line buffered for immediate output - ) - - failed_tests = [] - - # Stream output line by line - if process.stdout: - for line in process.stdout: - print(line, end="") # Print immediately (streaming) - - # Parse for failures (pytest format: "test_file.py::test_name FAILED") - if " FAILED " in line: - # Extract test name from pytest output - try: - parts = line.split(" FAILED ") - if len(parts) >= 2: - # Get the test identifier (the part before " FAILED ") - # Strip whitespace and take last token (handles indentation) - test_name = parts[0].strip().split()[-1] - failed_tests.append(test_name) - except Exception: - # If parsing fails, continue - we'll still show module failed - pass - - 
process.wait() - - if process.returncode != 0: - all_passed = False - failed_modules[module_path] = failed_tests - print(f"✗ Module failed: {module_path}") - else: - print(f"✓ Module passed: {module_path}") - - print("\n" + "=" * 70) - if all_passed: - print("All heavy GPU modules passed!") - else: - print(f"Failed modules ({len(failed_modules)}):") - for module, tests in failed_modules.items(): - print(f" {module}:") - if tests: - for test in tests: - print(f" - {test}") - else: - print(" (module failed but couldn't parse specific test names)") - print("=" * 70 + "\n") - - return 0 if all_passed else 1 - - # ============================================================================ # vLLM Backend Cleanup Helper # ============================================================================ @@ -609,68 +491,6 @@ def cleanup_gpu_backend(backend, backend_name="unknown"): pass -def pytest_collection_finish(session): - """ - Opt-in process isolation for heavy GPU tests. - Prevents CUDA OOMs by forcing OS-level memory release between heavy modules. - """ - # 1. Test Discovery Guard: Never isolate during discovery - if session.config.getoption("collectonly", default=False): - return - - # 2. Opt-in Guard: Only isolate if explicitly requested or in CI - use_isolation = ( - session.config.getoption("--isolate-heavy", default=False) - or os.environ.get("CICD", "0") == "1" - ) - if not use_isolation: - return - - # 3. 
Hardware Guard: Only applies to CUDA environments - try: - import torch - - if not torch.cuda.is_available(): - return - except ImportError: - return - - # Collect modules explicitly marked for GPU isolation - heavy_items = [ - item - for item in session.items - if item.get_closest_marker("requires_gpu_isolation") - ] - - # Extract unique module paths - heavy_modules = list({str(item.path) for item in heavy_items}) - - if len(heavy_modules) <= 1: - return # No isolation needed for a single module - - # Confirmation logging: Show which modules will be isolated - print(f"\n[INFO] GPU Isolation enabled for {len(heavy_modules)} modules:") - for module in heavy_modules: - print(f" - {module}") - - # Execute heavy modules in subprocesses - exit_code = _run_heavy_modules_isolated(session, heavy_modules) - - # 4. Non-Destructive Execution: Remove heavy items, DO NOT exit. - session.items = [ - item for item in session.items if str(item.path) not in heavy_modules - ] - - # Propagate subprocess failures to the main pytest session - if exit_code != 0: - # Count actual test failures from the isolated modules - # Note: We increment testsfailed by the number of modules that failed, - # not the total number of modules. The _run_heavy_modules_isolated - # function already tracks which modules failed. - session.testsfailed += 1 # Mark that failures occurred - session.exitstatus = exit_code - - # ============================================================================ # Test Collection Filtering # ============================================================================ diff --git a/test/predicates.py b/test/predicates.py index 4d6005154..2c00f1865 100644 --- a/test/predicates.py +++ b/test/predicates.py @@ -45,13 +45,12 @@ def _apple_silicon_vram_gb() -> float: Metal's ``recommendedMaxWorkingSetSize`` is a static device property (~75% of total RAM) that does not account for current system load. 
We use ``min(total * 0.75, total - 16)`` to leave headroom for the OS - and desktop applications, which typically consume 8–16 GB on a loaded + and desktop applications, which typically consume 8-16 GB on a loaded developer machine. """ try: out = subprocess.run( - ["sysctl", "-n", "hw.memsize"], - capture_output=True, text=True, timeout=2, + ["sysctl", "-n", "hw.memsize"], capture_output=True, text=True, timeout=2 ) total_gb = int(out.stdout.strip()) / (1024**3) return min(total_gb * 0.75, total_gb - 16) @@ -133,27 +132,6 @@ def require_ram(min_gb: int): return pytest.mark.skipif(False, reason="") -# --------------------------------------------------------------------------- -# GPU process isolation -# --------------------------------------------------------------------------- - - -def require_gpu_isolation(): - """Skip unless GPU process isolation is enabled. - - Isolation is active when ``--isolate-heavy`` is passed or ``CICD=1``. - Tests marked with this predicate will be run in separate subprocesses - to prevent CUDA OOM from cross-test memory leaks. - """ - isolate = os.environ.get("CICD", "0") == "1" - # Note: --isolate-heavy is a pytest CLI flag checked at collection time - # by conftest.py. At import time we can only check the env var. 
- return pytest.mark.skipif( - not (isolate or _gpu_available()), - reason="GPU isolation requires CICD=1 or --isolate-heavy with a GPU", - ) - - # --------------------------------------------------------------------------- # API keys / credentials # --------------------------------------------------------------------------- diff --git a/test/stdlib/components/intrinsic/test_core.py b/test/stdlib/components/intrinsic/test_core.py index b980afddb..9443fdc5f 100644 --- a/test/stdlib/components/intrinsic/test_core.py +++ b/test/stdlib/components/intrinsic/test_core.py @@ -27,7 +27,7 @@ reason="Skipping core intrinsic tests in CI - all qualitative tests", ), pytest.mark.huggingface, - require_gpu(min_vram_gb=8), + require_gpu(min_vram_gb=12), pytest.mark.e2e, ] diff --git a/test/stdlib/components/intrinsic/test_guardian.py b/test/stdlib/components/intrinsic/test_guardian.py index 749b60c09..ec8a58a84 100644 --- a/test/stdlib/components/intrinsic/test_guardian.py +++ b/test/stdlib/components/intrinsic/test_guardian.py @@ -24,7 +24,7 @@ reason="Skipping Guardian tests in CI - all qualitative tests", ), pytest.mark.huggingface, - require_gpu(min_vram_gb=8), + require_gpu(min_vram_gb=12), pytest.mark.e2e, ] diff --git a/test/stdlib/components/intrinsic/test_rag.py b/test/stdlib/components/intrinsic/test_rag.py index a00694b52..ca98ccb76 100644 --- a/test/stdlib/components/intrinsic/test_rag.py +++ b/test/stdlib/components/intrinsic/test_rag.py @@ -23,7 +23,7 @@ reason="Skipping RAG tests in CI - all qualitative tests", ), pytest.mark.huggingface, - require_gpu(min_vram_gb=8), + require_gpu(min_vram_gb=12), pytest.mark.e2e, ] diff --git a/test/stdlib/test_spans.py b/test/stdlib/test_spans.py index af3f7681a..969e5a29e 100644 --- a/test/stdlib/test_spans.py +++ b/test/stdlib/test_spans.py @@ -2,7 +2,9 @@ from mellea.backends import ModelOption -pytest.importorskip("llguidance", reason="llguidance not installed — install mellea[hf]") +pytest.importorskip( + "llguidance", 
reason="llguidance not installed — install mellea[hf]" +) from mellea.backends.huggingface import LocalHFBackend from mellea.backends.model_ids import IBM_GRANITE_4_HYBRID_MICRO from mellea.core import CBlock @@ -11,7 +13,7 @@ from test.predicates import require_gpu # Module-level markers for all tests using Granite 4 hybrid micro (3B model) -pytestmark = [pytest.mark.huggingface, require_gpu(min_vram_gb=8), pytest.mark.e2e] +pytestmark = [pytest.mark.huggingface, require_gpu(min_vram_gb=12), pytest.mark.e2e] # We edit the context type in the async tests below. Don't change the scope here. From 58d26925cc4c07efc3782bd4f3c2fc55d6e2c0be Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Fri, 27 Mar 2026 08:53:23 +0000 Subject: [PATCH 29/42] test: migrate legacy markers in test_intrinsics_formatters.py Replace deprecated @pytest.mark.llm, @pytest.mark.requires_gpu, @pytest.mark.requires_heavy_ram, @pytest.mark.requires_gpu_isolation with @pytest.mark.e2e and @require_gpu(min_vram_gb=12) to align with the new marker taxonomy (#727/#728). VRAM gate set to 12 GB matching the 3B-parameter model loaded across the parametrized test cases. 
--- test/formatters/granite/test_intrinsics_formatters.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test/formatters/granite/test_intrinsics_formatters.py b/test/formatters/granite/test_intrinsics_formatters.py index 8f555f552..5604f7642 100644 --- a/test/formatters/granite/test_intrinsics_formatters.py +++ b/test/formatters/granite/test_intrinsics_formatters.py @@ -29,6 +29,7 @@ ) from mellea.formatters.granite.base import util as base_util from mellea.formatters.granite.intrinsics import json_util, util as intrinsics_util +from test.predicates import require_gpu def _read_file(name): @@ -573,10 +574,8 @@ def _round_floats(json_data, num_digits: int = 2): @pytest.mark.huggingface -@pytest.mark.llm -@pytest.mark.requires_gpu -@pytest.mark.requires_heavy_ram -@pytest.mark.requires_gpu_isolation # Activate GPU memory isolation +@pytest.mark.e2e +@require_gpu(min_vram_gb=12) @pytest.mark.skipif( int(os.environ.get("CICD", 0)) == 1, reason="Skipping HuggingFace tests in CI" ) From 8ec37565577e11615d4ebe17fc3dd417ac77799e Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Fri, 27 Mar 2026 08:59:41 +0000 Subject: [PATCH 30/42] test: add integration marker to test_dependency_isolation.py --- test/package/test_dependency_isolation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/package/test_dependency_isolation.py b/test/package/test_dependency_isolation.py index 06a2902c3..9a6f60cd6 100644 --- a/test/package/test_dependency_isolation.py +++ b/test/package/test_dependency_isolation.py @@ -17,6 +17,7 @@ import pytest pytestmark = [ + pytest.mark.integration, pytest.mark.slow, # Very slow on the first run. 
pytest.mark.timeout(600), ] From f6f49fc9154e36a77e11e999c3047d99dd4c406c Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Fri, 27 Mar 2026 09:06:35 +0000 Subject: [PATCH 31/42] docs: document OLLAMA_KEEP_ALIVE=1m as memory optimisation for unordered test runs --- test/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/README.md b/test/README.md index c9824366c..0815d4abb 100644 --- a/test/README.md +++ b/test/README.md @@ -19,6 +19,10 @@ uv run pytest -m slow - `CICD=1` - Enable CI mode (skips qualitative tests) - `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True` - Helps with GPU memory fragmentation +- `OLLAMA_KEEP_ALIVE=1m` - Reduce Ollama model idle window from the default 5 minutes to 1 minute. + Useful when running without `--group-by-backend`: limits how long a loaded Ollama model occupies + unified memory while HF/torch tests are running. Has no effect mid-run (timer resets per request), + but reduces the overlap window when switching between backend groups. ## GPU Testing on CUDA Systems From 7119f783a96e7b8c51adf8b5a7b7c6c78de42330 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Fri, 27 Mar 2026 09:30:45 +0000 Subject: [PATCH 32/42] fix: suppress mypy name-defined for torch.Tensor after importorskip change --- test/backends/test_huggingface.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py index 281406ca0..7c8e8ff98 100644 --- a/test/backends/test_huggingface.py +++ b/test/backends/test_huggingface.py @@ -11,6 +11,7 @@ import pydantic import pytest + torch = pytest.importorskip("torch", reason="torch not installed — install mellea[hf]") from test.predicates import require_gpu @@ -360,7 +361,7 @@ async def test_generate_with_lock(backend) -> None: ) b.add_adapter(IntrinsicAdapter("answerability", base_model_name=b.base_model_name)) - memoized: dict[torch.Tensor, str] = dict() + memoized: dict[torch.Tensor, str] = dict() # type: 
ignore[name-defined] gen_func = model.generate def mock_func(input_ids, *args, **kwargs): From dbb5f11cb646f8ebc46f00c52854cfd4cc346446 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Fri, 27 Mar 2026 09:38:12 +0000 Subject: [PATCH 33/42] fix: ruff format huggingface.py from_pretrained args --- mellea/backends/huggingface.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mellea/backends/huggingface.py b/mellea/backends/huggingface.py index d4c9fb65a..55f85be7b 100644 --- a/mellea/backends/huggingface.py +++ b/mellea/backends/huggingface.py @@ -300,9 +300,7 @@ def __init__( ) # Get the model and tokenizer. self._model: PreTrainedModel = AutoModelForCausalLM.from_pretrained( - self._hf_model_id, - device_map=str(self._device), - torch_dtype="auto", + self._hf_model_id, device_map=str(self._device), torch_dtype="auto" ) self._tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained( self._hf_model_id From 9dfea0dc1a357c58496b60dcf7acb42e8967baa4 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Fri, 27 Mar 2026 09:47:28 +0000 Subject: [PATCH 34/42] fix: ruff format test_watsonx.py and test_huggingface_tools.py --- test/backends/test_huggingface_tools.py | 4 +++- test/backends/test_watsonx.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/test/backends/test_huggingface_tools.py b/test/backends/test_huggingface_tools.py index feaf87980..b2a72176f 100644 --- a/test/backends/test_huggingface_tools.py +++ b/test/backends/test_huggingface_tools.py @@ -16,7 +16,9 @@ ), ] -pytest.importorskip("llguidance", reason="llguidance not installed — install mellea[hf]") +pytest.importorskip( + "llguidance", reason="llguidance not installed — install mellea[hf]" +) import mellea.backends.model_ids as model_ids from mellea import MelleaSession from mellea.backends import ModelOption diff --git a/test/backends/test_watsonx.py b/test/backends/test_watsonx.py index 45cc162b6..d6ee471aa 100644 --- a/test/backends/test_watsonx.py +++ 
b/test/backends/test_watsonx.py @@ -19,7 +19,9 @@ ), ] -pytest.importorskip("ibm_watsonx_ai", reason="ibm_watsonx_ai not installed — install mellea[watsonx]") +pytest.importorskip( + "ibm_watsonx_ai", reason="ibm_watsonx_ai not installed — install mellea[watsonx]" +) from mellea import MelleaSession from mellea.backends import ModelOption, model_ids from mellea.backends.watsonx import WatsonxAIBackend From d445f0c692f998c410302078b2eb510c587d609d Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Fri, 27 Mar 2026 10:10:11 +0000 Subject: [PATCH 35/42] refactor: remove requires_gpu, requires_heavy_ram, requires_gpu_isolation markers and handlers --- pyproject.toml | 2 -- test/MARKERS_GUIDE.md | 14 +++++--------- test/conftest.py | 33 --------------------------------- 3 files changed, 5 insertions(+), 44 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 31a531e91..1a3604b2e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -343,8 +343,6 @@ markers = [ # Capability markers "requires_api_key: Tests requiring external API keys", - "requires_gpu: Tests requiring GPU", - "requires_heavy_ram: Tests requiring 48GB+ RAM", "qualitative: Non-deterministic quality tests", "slow: Tests taking >1 minute (e.g., multi-step pipelines like researcher)", diff --git a/test/MARKERS_GUIDE.md b/test/MARKERS_GUIDE.md index 74cbdf050..a01331cad 100644 --- a/test/MARKERS_GUIDE.md +++ b/test/MARKERS_GUIDE.md @@ -211,16 +211,11 @@ These are not resource predicates but still control test selection: | `slow` | Tests taking >1 minute | Excluded by default via `pyproject.toml` addopts | | `qualitative` | Non-deterministic output | Skipped when `CICD=1` | -### Legacy resource markers (deprecated) +### Removed markers -The markers `requires_gpu`, `requires_heavy_ram`, `requires_api_key`, and -`requires_gpu_isolation` are deprecated. Existing tests using them still work -(conftest auto-skip logic handles them) but new tests should use predicates. 
-When migrating: `requires_gpu` → `require_gpu(min_vram_gb=N)`; -`requires_api_key` → `require_api_key(...)`; -`requires_heavy_ram` and `requires_gpu_isolation` → **remove** (no replacement -needed — `requires_heavy_ram` conflated VRAM with RAM, and GPU isolation is -now automatic). +`requires_gpu`, `requires_heavy_ram`, and `requires_gpu_isolation` have been +removed. Use `require_gpu(min_vram_gb=N)` from `test.predicates` instead. +`requires_api_key` is still active — see below. ## Auto-Detection @@ -236,6 +231,7 @@ whose requirements are not met. No configuration needed. | All | — | `--ignore-all-checks` | Use `-rs` with pytest to see skip reasons: + ```bash pytest -rs ``` diff --git a/test/conftest.py b/test/conftest.py index 8412ec57f..27f88f34a 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -266,12 +266,6 @@ def add_option_safe(option_name, **kwargs): default=False, help="Ignore GPU requirement checks (tests may fail without GPU)", ) - add_option_safe( - "--ignore-ram-check", - action="store_true", - default=False, - help="Ignore RAM requirement checks (tests may fail with insufficient RAM)", - ) add_option_safe( "--ignore-ollama-check", action="store_true", @@ -330,12 +324,6 @@ def pytest_configure(config): config.addinivalue_line( "markers", "requires_api_key: Tests requiring external API keys" ) - config.addinivalue_line("markers", "requires_gpu: Tests requiring GPU") - config.addinivalue_line("markers", "requires_heavy_ram: Tests requiring 16GB+ RAM") - config.addinivalue_line( - "markers", - "requires_gpu_isolation: Explicitly tag tests/modules that require OS-level process isolation to clear CUDA memory.", - ) config.addinivalue_line("markers", "qualitative: Non-deterministic quality tests") # Granularity markers @@ -574,7 +562,6 @@ def pytest_runtest_setup(item): Can be overridden with command-line options: - pytest --ignore-gpu-check - - pytest --ignore-ram-check - pytest --ignore-ollama-check - pytest --ignore-api-key-check """ @@ -633,7 
+620,6 @@ def pytest_runtest_setup(item): # Check for override flags from CLI ignore_all = config.getoption("--ignore-all-checks", default=False) ignore_gpu = config.getoption("--ignore-gpu-check", default=False) or ignore_all - ignore_ram = config.getoption("--ignore-ram-check", default=False) or ignore_all ignore_api_key = ( config.getoption("--ignore-api-key-check", default=False) or ignore_all ) @@ -654,25 +640,6 @@ def pytest_runtest_setup(item): f"Skipping test: {backend} API key not found in environment" ) - # Skip tests requiring GPU if not available (unless override) - if item.get_closest_marker("requires_gpu") and not ignore_gpu: - if not capabilities["has_gpu"]: - pytest.skip("Skipping test: GPU not available") - - # Skip tests requiring heavy RAM if insufficient (unless override) - # NOTE: The 48GB threshold is based on empirical testing: - # - HuggingFace tests with granite-3.3-8b-instruct failed on 32GB M1 MacBook - # - Also failed on 36GB system - # - Set to 48GB as safe threshold for 8B model + overhead - # TODO: Consider per-model thresholds or make configurable - # Can be overridden with: pytest --ignore-ram-check - if item.get_closest_marker("requires_heavy_ram") and not ignore_ram: - RAM_THRESHOLD_GB = 48 # Based on real-world testing - if capabilities["ram_gb"] > 0 and capabilities["ram_gb"] < RAM_THRESHOLD_GB: - pytest.skip( - f"Skipping test: Insufficient RAM ({capabilities['ram_gb']:.1f}GB < {RAM_THRESHOLD_GB}GB)" - ) - # Backend-specific skipping # Leaving OpenAI commented since our current OpenAI tests don't require OpenAI apikeys. 
# if item.get_closest_marker("openai") and not ignore_api_key: From 6148d8d9d57ab1ee2e8b15f887ee3cb700b5527f Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Fri, 27 Mar 2026 10:13:44 +0000 Subject: [PATCH 36/42] refactor: remove --ignore-*-check override flags from conftest --- test/MARKERS_GUIDE.md | 12 ++++----- test/conftest.py | 57 ++++--------------------------------------- 2 files changed, 10 insertions(+), 59 deletions(-) diff --git a/test/MARKERS_GUIDE.md b/test/MARKERS_GUIDE.md index a01331cad..ace388b78 100644 --- a/test/MARKERS_GUIDE.md +++ b/test/MARKERS_GUIDE.md @@ -222,13 +222,11 @@ removed. Use `require_gpu(min_vram_gb=N)` from `test.predicates` instead. The test suite automatically detects system capabilities and skips tests whose requirements are not met. No configuration needed. -| Capability | How detected | Override flag | -| ---------- | ----------------------------- | ------------------------ | -| Ollama | Port 11434 check | `--ignore-ollama-check` | -| GPU | `torch.cuda.is_available()` | `--ignore-gpu-check` | -| RAM | `psutil.virtual_memory()` | `--ignore-ram-check` | -| API keys | Environment variable check | `--ignore-api-key-check` | -| All | — | `--ignore-all-checks` | +| Capability | How detected | +| ---------- | ----------------------------- | +| Ollama | Port 11434 check | +| GPU/VRAM | `torch` + `sysctl hw.memsize` | +| API keys | Environment variable check | Use `-rs` with pytest to see skip reasons: diff --git a/test/conftest.py b/test/conftest.py index 27f88f34a..2c21b2004 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -260,30 +260,6 @@ def add_option_safe(option_name, **kwargs): # Option already exists (likely from docs/examples/conftest.py) pass - add_option_safe( - "--ignore-gpu-check", - action="store_true", - default=False, - help="Ignore GPU requirement checks (tests may fail without GPU)", - ) - add_option_safe( - "--ignore-ollama-check", - action="store_true", - default=False, - help="Ignore Ollama availability 
checks (tests will fail if Ollama not running)", - ) - add_option_safe( - "--ignore-api-key-check", - action="store_true", - default=False, - help="Ignore API key checks (tests will fail without valid API keys)", - ) - add_option_safe( - "--ignore-all-checks", - action="store_true", - default=False, - help="Ignore all requirement checks (GPU, RAM, Ollama, API keys)", - ) add_option_safe( "--disable-default-mellea-plugins", action="store_true", @@ -492,12 +468,6 @@ def pytest_collection_modifyitems(config, items): """ capabilities = get_system_capabilities() - # Check for override flags - ignore_all = config.getoption("--ignore-all-checks", default=False) - ignore_ollama = ( - config.getoption("--ignore-ollama-check", default=False) or ignore_all - ) - skip_ollama = pytest.mark.skip( reason="Ollama not available (port 11434 not listening)" ) @@ -508,7 +478,7 @@ def pytest_collection_modifyitems(config, items): for item in items: # Skip ollama tests if ollama not available - if item.get_closest_marker("ollama") and not ignore_ollama: + if item.get_closest_marker("ollama"): if not capabilities["has_ollama"]: item.add_marker(skip_ollama) @@ -561,9 +531,6 @@ def pytest_runtest_setup(item): """Skip tests based on markers and system capabilities. 
Can be overridden with command-line options: - - pytest --ignore-gpu-check - - pytest --ignore-ollama-check - - pytest --ignore-api-key-check """ capabilities = get_system_capabilities() gh_run = int(os.environ.get("CICD", 0)) @@ -617,22 +584,14 @@ def pytest_runtest_setup(item): pytest_runtest_setup._last_backend_group = current_group - # Check for override flags from CLI - ignore_all = config.getoption("--ignore-all-checks", default=False) - ignore_gpu = config.getoption("--ignore-gpu-check", default=False) or ignore_all - ignore_api_key = ( - config.getoption("--ignore-api-key-check", default=False) or ignore_all - ) - # Skip qualitative tests in CI if item.get_closest_marker("qualitative") and gh_run == 1: pytest.skip( reason="Skipping qualitative test: got env variable CICD == 1. Used only in gh workflows." ) - # Skip tests requiring API keys if not available (unless override) - if item.get_closest_marker("requires_api_key") and not ignore_api_key: - # Check specific backend markers + # Skip tests requiring API keys if not available + if item.get_closest_marker("requires_api_key"): for backend in ["openai", "watsonx"]: if item.get_closest_marker(backend): if not capabilities["has_api_keys"].get(backend): @@ -640,19 +599,13 @@ def pytest_runtest_setup(item): f"Skipping test: {backend} API key not found in environment" ) - # Backend-specific skipping - # Leaving OpenAI commented since our current OpenAI tests don't require OpenAI apikeys. 
- # if item.get_closest_marker("openai") and not ignore_api_key: - # if not capabilities["has_api_keys"].get("openai"): - # pytest.skip("Skipping test: OPENAI_API_KEY not found in environment") - - if item.get_closest_marker("watsonx") and not ignore_api_key: + if item.get_closest_marker("watsonx"): if not capabilities["has_api_keys"].get("watsonx"): pytest.skip( "Skipping test: Watsonx API credentials not found in environment" ) - if item.get_closest_marker("vllm") and not ignore_gpu: + if item.get_closest_marker("vllm"): if not capabilities["has_gpu"]: pytest.skip("Skipping test: vLLM requires GPU") From 22b29bbb2245818e95c449bc1cb329931851a091 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Fri, 27 Mar 2026 10:18:46 +0000 Subject: [PATCH 37/42] refactor: remove requires_api_key marker; fix api backend group to match watsonx+bedrock markers --- pyproject.toml | 1 - test/conftest.py | 25 ++++++++----------------- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1a3604b2e..e16e3f571 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -342,7 +342,6 @@ markers = [ "bedrock: Tests requiring AWS Bedrock backend (requires credentials)", # Capability markers - "requires_api_key: Tests requiring external API keys", "qualitative: Non-deterministic quality tests", "slow: Tests taking >1 minute (e.g., multi-step pipelines like researcher)", diff --git a/test/conftest.py b/test/conftest.py index 2c21b2004..e6311778d 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -231,8 +231,8 @@ def shared_vllm_backend(request): "description": "Ollama backend tests (local server)", }, "api": { - "marker": "requires_api_key", - "description": "API-based backends (OpenAI, Watsonx, Bedrock)", + "markers": ["watsonx", "bedrock"], + "description": "API-based backends (Watsonx, Bedrock — require cloud credentials)", }, } @@ -297,9 +297,6 @@ def pytest_configure(config): config.addinivalue_line("markers", f"{name}: {desc}") # Capability 
markers - config.addinivalue_line( - "markers", "requires_api_key: Tests requiring external API keys" - ) config.addinivalue_line("markers", "qualitative: Non-deterministic quality tests") # Granularity markers @@ -496,11 +493,13 @@ def pytest_collection_modifyitems(config, items): seen = set() for group_name in BACKEND_GROUP_ORDER: - marker = BACKEND_GROUPS[group_name]["marker"] + group_info = BACKEND_GROUPS[group_name] + markers = group_info.get("markers") or [group_info["marker"]] group_tests = [ item for item in items - if item.get_closest_marker(marker) and id(item) not in seen + if any(item.get_closest_marker(m) for m in markers) + and id(item) not in seen ] if group_tests: @@ -540,7 +539,8 @@ def pytest_runtest_setup(item): if config.getoption("--group-by-backend", default=False): current_group = None for group_name, group_info in BACKEND_GROUPS.items(): - if item.get_closest_marker(group_info["marker"]): + markers = group_info.get("markers") or [group_info["marker"]] + if any(item.get_closest_marker(m) for m in markers): current_group = group_name break @@ -590,15 +590,6 @@ def pytest_runtest_setup(item): reason="Skipping qualitative test: got env variable CICD == 1. Used only in gh workflows." 
            )
 
-    # Skip tests requiring API keys if not available
-    if item.get_closest_marker("requires_api_key"):
-        for backend in ["openai", "watsonx"]:
-            if item.get_closest_marker(backend):
-                if not capabilities["has_api_keys"].get(backend):
-                    pytest.skip(
-                        f"Skipping test: {backend} API key not found in environment"
-                    )
-
     if item.get_closest_marker("watsonx"):
         if not capabilities["has_api_keys"].get("watsonx"):
             pytest.skip(

From e1d79fba575ca953193066815c1619684d2e8141 Mon Sep 17 00:00:00 2001
From: Alex Bozarth
Date: Fri, 27 Mar 2026 17:15:21 -0500
Subject: [PATCH 38/42] fix: address review

Signed-off-by: Alex Bozarth
---
 test/backends/test_tool_calls.py                 | 2 +-
 test/stdlib/components/docs/test_richdocument.py | 2 +-
 test/stdlib/components/test_transform.py         | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/backends/test_tool_calls.py b/test/backends/test_tool_calls.py
index ece501b16..f67f2cdde 100644
--- a/test/backends/test_tool_calls.py
+++ b/test/backends/test_tool_calls.py
@@ -10,7 +10,7 @@
 )
 from mellea.core import ModelOutputThunk
 
-pytest.importorskip("docling", reason="docling not installed — install mellea[mify]")
+pytest.importorskip("docling", reason="docling not installed — install mellea[docling]")
 from mellea.stdlib.components.docs.richdocument import Table
 from mellea.stdlib.context import ChatContext
 from mellea.stdlib.session import MelleaSession
diff --git a/test/stdlib/components/docs/test_richdocument.py b/test/stdlib/components/docs/test_richdocument.py
index 9ae153517..8623e3d74 100644
--- a/test/stdlib/components/docs/test_richdocument.py
+++ b/test/stdlib/components/docs/test_richdocument.py
@@ -4,7 +4,7 @@
 import pytest
 
 pytest.importorskip(
-    "docling_core", reason="docling_core not installed — install mellea[mify]"
+    "docling_core", reason="docling_core not installed — install mellea[docling]"
 )
 
 from docling_core.types.doc.document import DoclingDocument
diff --git a/test/stdlib/components/test_transform.py b/test/stdlib/components/test_transform.py
index b0ed491ee..2686ba6da 100644
--- a/test/stdlib/components/test_transform.py
+++ b/test/stdlib/components/test_transform.py
@@ -3,7 +3,7 @@
 from mellea.core import TemplateRepresentation
 from mellea.stdlib.components import MObject, Query, Transform
 
-pytest.importorskip("docling", reason="docling not installed — install mellea[mify]")
+pytest.importorskip("docling", reason="docling not installed — install mellea[docling]")
 from mellea.stdlib.components.docs.richdocument import TableTransform
 
 custom_mobject_description = "custom mobject description"

From b772cc44f5396b881e9b1edb374addf39e305a44 Mon Sep 17 00:00:00 2001
From: Nigel Jones
Date: Sat, 28 Mar 2026 09:44:11 +0000
Subject: [PATCH 39/42] test: mark test_image_block_in_instruction as
 qualitative

---
 test/backends/test_vision_openai.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/backends/test_vision_openai.py b/test/backends/test_vision_openai.py
index c8348f2c8..01942cacf 100644
--- a/test/backends/test_vision_openai.py
+++ b/test/backends/test_vision_openai.py
@@ -68,6 +68,7 @@ def test_image_block_construction_from_pil(pil_image: Image.Image):
     assert ImageBlock.is_valid_base64_png(str(image_block))
 
 
+@pytest.mark.qualitative
 def test_image_block_in_instruction(
     m_session: MelleaSession, pil_image: Image.Image, gh_run: int
 ):

From c9b996dacc36cb0f8011c49bf5d2c97eeb92ab90 Mon Sep 17 00:00:00 2001
From: Nigel Jones
Date: Sat, 28 Mar 2026 09:51:31 +0000
Subject: [PATCH 40/42] chore: commit .claude/settings.json with skillLocations
 for skill discovery

---
 .claude/settings.json | 3 +++
 .gitignore            | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)
 create mode 100644 .claude/settings.json

diff --git a/.claude/settings.json b/.claude/settings.json
new file mode 100644
index 000000000..a4254f6d8
--- /dev/null
+++ b/.claude/settings.json
@@ -0,0 +1,3 @@
+{
+  "skillLocations": [".agents/skills"]
+}
diff --git a/.gitignore b/.gitignore
index 6bc21d762..9ee2fcc6d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -451,7 +451,8 @@ pyrightconfig.json
 
 # AI agent configs
 .bob/
-.claude/
+.claude/*
+!.claude/settings.json
 
 # Generated API documentation (built by tooling/docs-autogen/)
 docs/docs/api/

From 3d80a8166a633297da19a4484a76262f15589893 Mon Sep 17 00:00:00 2001
From: Nigel Jones
Date: Sat, 28 Mar 2026 09:53:41 +0000
Subject: [PATCH 41/42] docs: broaden audit-markers skill description to cover
 diagnostic use cases

---
 .agents/skills/audit-markers/SKILL.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.agents/skills/audit-markers/SKILL.md b/.agents/skills/audit-markers/SKILL.md
index 5d7172ba3..6a3ada6c9 100644
--- a/.agents/skills/audit-markers/SKILL.md
+++ b/.agents/skills/audit-markers/SKILL.md
@@ -4,8 +4,12 @@ description: >
   Audit and fix pytest markers on test files and examples. Classifies tests as
   unit/integration/e2e/qualitative using general heuristics and project-specific
   marker rules. Estimates GPU VRAM and RAM requirements by tracing model
-  identifiers and looking up parameter counts. Use when reviewing markers (classification),
-  auditing test files, or checking before commit.
+  identifiers and looking up parameter counts.
+  Use when: writing a new test and unsure which markers to apply; reviewing or
+  auditing existing test markers; a test is unexpectedly skipped or not collected;
+  a test is consuming too much GPU/RAM and you want to check its resource gates;
+  checking marker correctness before committing; or any question about why a test
+  does or doesn't run in a given configuration.
 argument-hint: "[file-or-directory] [--dry-run | --apply]"
 compatibility: "Claude Code, IBM Bob"
 metadata:

From ec0254d5e040aa921c2bc6b04a867870d55029de Mon Sep 17 00:00:00 2001
From: Nigel Jones
Date: Sat, 28 Mar 2026 09:55:27 +0000
Subject: [PATCH 42/42] docs: add diagnostic mode to audit-markers skill for
 troubleshooting skip/resource issues

---
 .agents/skills/audit-markers/SKILL.md | 35 +++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/.agents/skills/audit-markers/SKILL.md b/.agents/skills/audit-markers/SKILL.md
index 6a3ada6c9..558e8c9a9 100644
--- a/.agents/skills/audit-markers/SKILL.md
+++ b/.agents/skills/audit-markers/SKILL.md
@@ -30,6 +30,41 @@ marker rules for **mellea**.
 - `--apply` — produce report and apply fixes without asking.
 - `--dry-run` — report only, do not offer to apply.
 
+Single-file use: skip the triage phase (Step 0) entirely — deep-read the file directly and proceed from Step 1.
+
+## Modes of use
+
+This skill has two modes. Choose based on what the user asked:
+
+### Audit mode (default)
+User wants markers classified or fixed — for a new test, an existing file, or a
+pre-commit check. Follow the full Audit Procedure (Steps 0–4).
+
+### Diagnostic mode
+User wants to know **why** a specific test is not running, is being skipped, or
+is consuming unexpected resources. Do NOT produce an audit report. Instead:
+
+1. **Read the test** — identify its markers and any predicate decorators.
+2. **Check the default filter** — read `pyproject.toml` `[tool.pytest.ini_options]`
+   `addopts`. The project default is `-m "not slow"`. If the test has `slow`, it
+   is excluded from a plain `uv run pytest` run.
+3. **Check backend auto-skip** — read `test/conftest.py` `pytest_configure` and
+   the `pytest_collection_modifyitems` hook. Backend markers (`ollama`, `huggingface`,
+   etc.) trigger auto-skip when the backend is unavailable. Check whether the
+   relevant service or credentials are present on the user's machine.
+4. **Evaluate predicates** — if the test has a predicate decorator (`require_gpu`,
+   `require_api_key`, `require_ram`, etc.), read `test/predicates.py` and explain
+   what condition would cause the skip. For `require_gpu(min_vram_gb=N)`, compare N
+   against the system's detected VRAM (run `get_system_capabilities()` logic or
+   check `sysctl hw.memsize` on Apple Silicon / `nvidia-smi` on CUDA).
+5. **Report directly** — answer "this test is skipped because X" with the specific
+   condition, the value it evaluated to, and how to override if appropriate (e.g.
+   `uv run pytest -m "" test/path/test_foo.py` overrides the `-m "not slow"` default filter).
+
+For resource overload (test consuming too much GPU/RAM): classify the test's
+resource gates using the VRAM heuristics in Part 2, compare against what the
+test actually loads, and report whether the gate is correctly set or too loose.
+
 ## Project References
 
 Read these before auditing — they are the authoritative source for marker conventions: