braintrustdata · Abhijeet Prasad (AbhiPrasad) · Apr 10, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -23,6 +23,7 @@ Important code areas in `py/src/braintrust/`:
 - temporal: `contrib/temporal/`
 - CLI/devserver: `cli/`, `devserver/`
 - tests: colocated `test_*.py`
+- type tests: `type_tests/`
 
 ## Setup
 
@@ -83,13 +84,27 @@ Testing preferences:
 Key facts:
 
 - `test_core` runs without optional vendor packages.
+- `test_types` runs pyright, mypy, and pytest on `py/src/braintrust/type_tests/`. Use this session when changing generic type signatures in the framework.
 - wrapper coverage is split across dedicated nox sessions by provider/version.
 - `pylint` installs the broad dependency surface before checking files.
 - `cd py && make pylint` runs only `pylint`; `cd py && make lint` runs pre-commit hooks first and then `pylint`.
 - `test-wheel` is a wheel sanity check and requires a built wheel first.
 
 When changing behavior, run the narrowest affected session first, then expand only if needed.
 
+## Type Tests
+
+`py/src/braintrust/type_tests/` contains tests that are validated by both static type checkers (pyright, mypy) and pytest at runtime. The `test_types` nox session runs all three checks and is auto-discovered by CI.
+
+When changing generic type signatures (e.g., `Eval`, `EvalCase`, `EvalScorer`, `EvalHooks`), add or update a test in `type_tests/` to verify the type checker accepts the intended usage patterns.
+
+New test files should be named `test_*.py` and use absolute imports (`from braintrust.framework import ...`). They are regular pytest files that also happen to be valid pyright/mypy targets.
+
+```bash
+cd py
+nox -s test_types
+```
+
 ## VCR
 
 VCR/cassette coverage is the default and preferred testing strategy for provider and integration behavior in this repo. Reach for cassette-backed tests before introducing mocks or fakes, and keep new coverage aligned with the existing VCR patterns unless there is a strong reason not to.

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -134,6 +134,17 @@ BRAINTRUST_CLAUDE_AGENT_SDK_RECORD_MODE=all \
   nox -s "test_claude_agent_sdk(latest)" -- -k "test_calculator_with_multiple_operations"
 ```
 
+### Type Tests
+
+`py/src/braintrust/type_tests/` contains tests that are checked by pyright, mypy, and pytest. The `test_types` nox session runs all three and is included in CI automatically.
+
+When changing generic type signatures (e.g., `Eval`, `EvalCase`, `EvalScorer`, `EvalHooks`), add or update a test file in `type_tests/` to verify the type checker accepts the intended usage patterns. Test files are named `test_*.py`, use absolute imports (`from braintrust.framework import ...`), and double as regular pytest files.
+
+```bash
+cd py
+nox -s test_types
+```
+
 ### Fixtures
 
 Shared test fixtures live in `py/src/braintrust/conftest.py`.

diff --git a/py/noxfile.py b/py/noxfile.py
@@ -44,6 +44,7 @@ def _pinned_python_version():
 INTEGRATION_DIR = "braintrust/integrations"
 CONTRIB_DIR = "braintrust/contrib"
 DEVSERVER_DIR = "braintrust/devserver"
+TYPE_TESTS_DIR = "braintrust/type_tests"
 
 
 SILENT_INSTALLS = True
@@ -390,6 +391,27 @@ def test_otel_not_installed(session):
     _run_tests(session, "braintrust/test_otel.py")
 
 
+@nox.session()
+def test_types(session):
+    """Run type-check tests with pyright, mypy, and pytest."""
+    _install_test_deps(session)
+    session.install("pyright==1.1.408", "mypy==1.20.0")
+
+    type_tests_dir = f"src/{TYPE_TESTS_DIR}"
+    test_files = glob.glob(os.path.join(type_tests_dir, "test_*.py"))
+    if not test_files:
+        session.skip("No type test files found")
+
+    # Run pyright once over all discovered type-test files
+    session.run("pyright", *test_files)
+
+    # Run mypy on the same files (--follow-imports=silent still analyzes imports but hides errors found in them)
+    session.run("mypy", "--follow-imports=silent", *test_files)
+
+    # Run pytest for the runtime assertions
+    _run_tests(session, TYPE_TESTS_DIR)
+
+
 @nox.session()
 def pylint(session):
     # pylint needs everything so we don't trigger missing import errors
@@ -502,6 +524,7 @@ def _run_core_tests(session):
             *_integration_subdirs_to_ignore(),
             CONTRIB_DIR,
             DEVSERVER_DIR,
+            TYPE_TESTS_DIR,
         ],
     )
 

diff --git a/py/src/braintrust/devserver/server.py b/py/src/braintrust/devserver/server.py
@@ -50,7 +50,7 @@
 from .schemas import ValidationError, parse_eval_body
 
 
-_all_evaluators: dict[str, Evaluator[Any, Any]] = {}
+_all_evaluators: dict[str, Evaluator[Any, Any, Any]] = {}
 
 
 class _ParameterOverrideHooks:
@@ -289,7 +289,7 @@ async def run_and_complete():
         return JSONResponse({"error": f"Failed to run evaluation: {str(e)}"}, status_code=500)
 
 
-def create_app(evaluators: list[Evaluator[Any, Any]], org_name: str | None = None):
+def create_app(evaluators: list[Evaluator[Any, Any, Any]], org_name: str | None = None):
     """Create and configure the Starlette app for the dev server.
 
     Args:
@@ -318,7 +318,7 @@ def create_app(evaluators: list[Evaluator[Any, Any]], org_name: str | None = Non
 
 
 def run_dev_server(
-    evaluators: list[Evaluator[Any, Any]],
+    evaluators: list[Evaluator[Any, Any, Any]],
     host: str = "localhost",
     port: int = 8300,
     org_name: str | None = None,
@@ -346,7 +346,7 @@ def snake_to_camel(snake_str: str) -> str:
 
 def make_scorer(
     state: BraintrustState, name: str, score: FunctionId, project_id: str | None = None
-) -> EvalScorer[Any, Any]:
+) -> EvalScorer[Any, Any, Any]:
     def scorer_fn(input, output, expected, metadata):
         request = {
             **score,