From 2b4c8ae960abe0754b8238e7bc9f4b76dc6e3a06 Mon Sep 17 00:00:00 2001
From: Abhijeet Prasad <abhijeet@braintrustdata.com>
Date: Tue, 17 Mar 2026 12:00:51 -0400
Subject: [PATCH 1/3] feat: Add spec doc for classifiers in the SDK

---
 docs/telemetry/classifier.md | 238 +++++++++++++++++++++++++++++++++++
 1 file changed, 238 insertions(+)
 create mode 100644 docs/telemetry/classifier.md
diff --git a/docs/telemetry/classifier.md b/docs/telemetry/classifier.md
new file mode 100644
index 0000000..a444d6a
--- /dev/null
+++ b/docs/telemetry/classifier.md
@@ -0,0 +1,238 @@
+# Classifiers
+
+## Overview
+
+Classifiers categorize and label eval outputs. Unlike scorers (numeric 0-1), classifiers produce structured classification items with optional confidence and metadata. Both receive the same arguments (`output`, `expected`, `input`, `metadata`) and run in parallel during evaluations.
+
+Classifications are stored as `Record<string, ClassificationItem[]>` -- a dictionary keyed by classifier name, where each value is an array of items. This supports multiple classifiers producing independent label sets, a single classifier producing multiple labels, and multiple classifiers contributing to the same key.
+
+---
+
+## Public API
+
+### Evaluator Interface
+
+An evaluator **MUST** include at least one of `scores` or `classifiers` (or both). SDKs **MUST** validate this at runtime and raise a clear error if neither is provided, even if the constraint is also enforced at the type level.
+
+```typescript
+interface EvaluatorBase<Input, Output, Expected, Metadata> {
+  data: () => Dataset<Input, Expected, Metadata>;
+  task: (input: Input, hooks: Hooks) => Output | Promise<Output>;
+}
+
+type Evaluator<Input, Output, Expected, Metadata> =
+  | EvaluatorBase<Input, Output, Expected, Metadata> & {
+      scores: EvalScorer<Input, Output, Expected, Metadata>[];
+      classifiers?: EvalClassifier<Input, Output, Expected, Metadata>[];
+    }
+  | EvaluatorBase<Input, Output, Expected, Metadata> & {
+      scores?: EvalScorer<Input, Output, Expected, Metadata>[];
+      classifiers: EvalClassifier<Input, Output, Expected, Metadata>[];
+    };
+```
+
+### EvalClassifier
+
+A classifier function accepts the same arguments as a scorer. It **MAY** return a single `Classification`, an array (multi-label), or `null`. It **MAY** be synchronous or asynchronous.
+
+```typescript
+type OneOrMoreClassifications = Classification | Classification[] | null;
+
+type EvalClassifier<Input, Output, Expected, Metadata> = (
+  args: EvalScorerArgs<Input, Output, Expected, Metadata>,
+) => OneOrMoreClassifications | Promise<OneOrMoreClassifications>;
+```
+
+### Classification
+
+Returned by classifier functions. The `name` field is used as the grouping key in the results dictionary and is omitted when converting to the storage format.
+
+```typescript
+interface Classification {
+  name: string;
+  id: string;
+  label?: string;
+  confidence?: number | null;
+  metadata?: Record<string, unknown>;
+}
+```
+
+### EvalResult
+
+The `classifications` field **MUST** be omitted (not an empty object) when no classifiers are defined or all return `null`.
+
+```typescript
+interface EvalResult {
+  input: unknown;
+  output: unknown;
+  expected?: unknown;
+  scores?: Record<string, number | null>;
+  classifications?: Record<string, ClassificationItem[]>;
+  metadata?: Record<string, unknown>;
+}
+```
+
+---
+
+## Behavior
+
+### Execution
+
+SDKs **MUST** run classifiers in parallel with scorers (e.g., `Promise.all`).
+
+Each classifier **MUST** run inside a traced span with `type: "classifier"` and `name` set to the resolved classifier name:
+
+```typescript
+rootSpan.traced(
+  (classifierSpan) => {
+    const result = await classifierFn({ input, output, expected, metadata });
+    classifierSpan.log({ output: result });
+    return result;
+  },
+  {
+    name: resolvedClassifierName,
+    span_attributes: {
+      name: resolvedClassifierName,
+      type: "classifier",
+    },
+  },
+);
+```
+
+### Name Resolution
+
+SDKs **MUST** resolve classifier name with this precedence:
+
+1. `name` field on the returned `Classification` object(s)
+2. `.name` property of the classifier function
+3. Fallback: `classifier_${index}`
+
+Items with the same resolved name **MUST** be appended to the same array.
+
+### Validation
+
+Each classification result **MUST** have:
+- A `name` that is a non-empty string
+- An `id` that is a non-empty string
+
+If validation fails, treat the classifier as failed.
+
+Additional field rules:
+- `confidence` is an unconstrained number (no 0-1 range enforced). SDKs **MUST NOT** reject values outside 0-1.
+- `metadata` is an unconstrained `Record<string, unknown>`. SDKs **MUST NOT** impose size limits.
+- Duplicate `{name, id}` pairs are allowed. Multiple items with the same `id` under the same name key **MUST** all be stored. Deduplication, if needed, is handled at the display layer.
+- Order is stable. Items **MUST** be stored in the order they are returned by the classifier.
+
+### Conversion to ClassificationItem
+
+When storing results, SDKs **MUST** convert `Classification` to `ClassificationItem`:
+
+1. Copy `id` as-is
+2. Default `label` to `id` if not provided
+3. Include `confidence` and `metadata` only if present
+4. Omit `name` (it becomes the dictionary key)
+
+### Error Handling
+
+Classifier failures **MUST NOT** abort the evaluation or affect other classifiers/scorers.
+
+On failure:
+1. Record the error under `classifier_errors` in eval metadata (maps classifier name to error message)
+2. Log the error to the root span's metadata
+3. **SHOULD** emit a debug warning
+
+This mirrors the `scorer_errors` pattern.
+
+---
+
+## Wire Format
+
+### ClassificationItem
+
+The storage format for a single classification. Derived from `Classification` by dropping `name` and adding an optional `source`.
+
+| Field | Type | Required | Description |
+|---|---|---|---|
+| `id` | String | **REQUIRED** | Stable identifier for filtering and grouping. |
+| `label` | String | **OPTIONAL** | Display label. Defaults to `id`. |
+| `confidence` | Number \| null | **OPTIONAL** | Confidence score, typically 0-1. |
+| `metadata` | Record\<string, unknown\> | **OPTIONAL** | Arbitrary metadata. |
+| `source` | SavedFunctionId \| null | **OPTIONAL** | Function that produced this classification. Set by the platform for online scoring; SDKs MAY omit. |
+
+### Classifications on Events
+
+Stored as a top-level `classifications` field on experiment and log events. **MUST** be `Record<string, ClassificationItem[]>`. **MUST** be omitted when empty.
+
+```json
+{
+  "classifications": {
+    "category": [
+      { "id": "greeting", "label": "Greeting", "confidence": 0.91 }
+    ],
+    "sentiment": [
+      { "id": "positive", "label": "Positive" },
+      { "id": "enthusiastic", "label": "Enthusiastic" }
+    ]
+  }
+}
+```
+
+---
+
+## Examples
+
+### Basic
+
+```javascript
+Eval("my-project", {
+  data: () => [{ input: "Hello!", expected: "Hi there!" }],
+  task: async (input) => callMyModel(input),
+  scores: [
+    ({ output, expected }) => ({
+      name: "exact_match",
+      score: output === expected ? 1 : 0,
+    }),
+  ],
+  classifiers: [
+    ({ output }) => ({
+      name: "category",
+      id: "greeting",
+      label: "Greeting",
+      confidence: 0.95,
+    }),
+  ],
+});
+```
+
+### Classifiers Only (No Scores)
+
+```javascript
+Eval("my-project", {
+  data: () => [{ input: "Hello!", expected: "Hi there!" }],
+  task: async (input) => callMyModel(input),
+  classifiers: [categoryClassifier, sentimentClassifier],
+});
+```
+
+### Multi-Label
+
+```javascript
+const sentimentClassifier = ({ output }) => [
+  { name: "sentiment", id: "positive", label: "Positive", confidence: 0.8 },
+  { name: "sentiment", id: "enthusiastic", label: "Enthusiastic", confidence: 0.6 },
+];
+```
+
+### Error Output
+
+When a classifier fails, the result includes:
+
+```json
+{
+  "metadata": {
+    "classifier_errors": {
+      "broken_classifier": "must return classifications with a non-empty string name"
+    }
+  }
+}
+```

From 62d7376ea9e9d9a2208a718d929a532a8dd40503 Mon Sep 17 00:00:00 2001
From: Abhijeet Prasad <abhijeet@braintrustdata.com>
Date: Tue, 17 Mar 2026 15:49:38 -0400
Subject: [PATCH 2/3] clean up spec

---
 docs/telemetry/classifier.md | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/docs/telemetry/classifier.md b/docs/telemetry/classifier.md
index a444d6a..9331af2 100644
--- a/docs/telemetry/classifier.md
+++ b/docs/telemetry/classifier.md
@@ -2,7 +2,7 @@
 
 ## Overview
 
-Classifiers categorize and label eval outputs. Unlike scorers (numeric 0-1), classifiers produce structured classification items with optional confidence and metadata. Both receive the same arguments (`output`, `expected`, `input`, `metadata`) and run in parallel during evaluations.
+Classifiers categorize and label eval outputs. Unlike scorers (numeric 0-1), classifiers produce structured classification items with optional metadata. Both receive the same arguments (`output`, `expected`, `input`, `metadata`) and run in parallel during evaluations.
 
 Classifications are stored as `Record<string, ClassificationItem[]>` -- a dictionary keyed by classifier name, where each value is an array of items. This supports multiple classifiers producing independent label sets, a single classifier producing multiple labels, and multiple classifiers contributing to the same key.
 
@@ -52,7 +52,6 @@ interface Classification {
   name: string;
   id: string;
   label?: string;
-  confidence?: number | null;
   metadata?: Record<string, unknown>;
 }
 ```
@@ -80,7 +79,7 @@ interface EvalResult {
 
 SDKs **MUST** run classifiers in parallel with scorers (e.g., `Promise.all`).
 
-Each classifier **MUST** run inside a traced span with `type: "classifier"` and `name` set to the resolved classifier name:
+Each classifier **MUST** run inside a traced span with `type: "classifier"` and `purpose: "scorer"`. The traced span name is resolved from the classifier function name (or fallback) rather than from returned classification items:
 
 ```typescript
 rootSpan.traced(
@@ -90,10 +89,10 @@ rootSpan.traced(
     return result;
   },
   {
-    name: resolvedClassifierName,
+    name: resolvedClassifierSpanName,
     span_attributes: {
-      name: resolvedClassifierName,
       type: "classifier",
+      purpose: "scorer",
     },
   },
 );
@@ -101,13 +100,12 @@ rootSpan.traced(
 
 ### Name Resolution
 
-SDKs **MUST** resolve classifier name with this precedence:
+SDKs **MUST** resolve names in two places:
 
-1. `name` field on the returned `Classification` object(s)
-2. `.name` property of the classifier function
-3. Fallback: `classifier_${index}`
+1. Classification span name: `.name` property of the classifier function, then fallback `classifier_${index}`
+2. Classification result grouping key: `name` field on each returned `Classification` object
 
-Items with the same resolved name **MUST** be appended to the same array.
+Each returned classification item **MUST** include a non-empty string `name`; items with the same returned `name` **MUST** be appended to the same array.
 
 ### Validation
 
@@ -118,7 +116,6 @@ Each classification result **MUST** have:
 If validation fails, treat the classifier as failed.
 
 Additional field rules:
-- `confidence` is an unconstrained number (no 0-1 range enforced). SDKs **MUST NOT** reject values outside 0-1.
 - `metadata` is an unconstrained `Record<string, unknown>`. SDKs **MUST NOT** impose size limits.
 - Duplicate `{name, id}` pairs are allowed. Multiple items with the same `id` under the same name key **MUST** all be stored. Deduplication, if needed, is handled at the display layer.
 - Order is stable. Items **MUST** be stored in the order they are returned by the classifier.
@@ -129,7 +126,7 @@ When storing results, SDKs **MUST** convert `Classification` to `ClassificationI
 
 1. Copy `id` as-is
 2. Default `label` to `id` if not provided
-3. Include `confidence` and `metadata` only if present
+3. Include `metadata` only if present
 4. Omit `name` (it becomes the dictionary key)
 
 ### Error Handling
@@ -149,15 +146,13 @@ This mirrors the `scorer_errors` pattern.
 
 ### ClassificationItem
 
-The storage format for a single classification. Derived from `Classification` by dropping `name` and adding an optional `source`.
+The storage format for a single classification. Derived from `Classification` by dropping `name` and defaulting `label` to `id` when omitted.
 
 | Field | Type | Required | Description |
 |---|---|---|---|
 | `id` | String | **REQUIRED** | Stable identifier for filtering and grouping. |
-| `label` | String | **OPTIONAL** | Display label. Defaults to `id`. |
-| `confidence` | Number \| null | **OPTIONAL** | Confidence score, typically 0-1. |
+| `label` | String | **REQUIRED** | Display label. Defaults to `id` during conversion. |
 | `metadata` | Record\<string, unknown\> | **OPTIONAL** | Arbitrary metadata. |
-| `source` | SavedFunctionId \| null | **OPTIONAL** | Function that produced this classification. Set by the platform for online scoring; SDKs MAY omit. |
 
 ### Classifications on Events
 
@@ -167,7 +162,7 @@ Stored as a top-level `classifications` field on experiment and log events. **MU
 {
   "classifications": {
     "category": [
-      { "id": "greeting", "label": "Greeting", "confidence": 0.91 }
+      { "id": "greeting", "label": "Greeting" }
     ],
     "sentiment": [
       { "id": "positive", "label": "Positive" },
@@ -198,7 +193,6 @@ Eval("my-project", {
       name: "category",
       id: "greeting",
       label: "Greeting",
-      confidence: 0.95,
     }),
   ],
 });
@@ -218,8 +212,8 @@ Eval("my-project", {
 
 ```javascript
 const sentimentClassifier = ({ output }) => [
-  { name: "sentiment", id: "positive", label: "Positive", confidence: 0.8 },
-  { name: "sentiment", id: "enthusiastic", label: "Enthusiastic", confidence: 0.6 },
+  { name: "sentiment", id: "positive", label: "Positive" },
+  { name: "sentiment", id: "enthusiastic", label: "Enthusiastic" },
 ];
 ```
 

From d3d6e330498b75daef7612eda2a07a4455f02457 Mon Sep 17 00:00:00 2001
From: Abhijeet Prasad <abhijeet@braintrustdata.com>
Date: Tue, 14 Apr 2026 10:37:47 -0400
Subject: [PATCH 3/3] update with latest changes

---
 docs/telemetry/classifier.md | 93 +++++++++++++++++++++++++-----------
 1 file changed, 64 insertions(+), 29 deletions(-)

diff --git a/docs/telemetry/classifier.md b/docs/telemetry/classifier.md
index 9331af2..dc9142f 100644
--- a/docs/telemetry/classifier.md
+++ b/docs/telemetry/classifier.md
@@ -1,5 +1,7 @@
 # Classifiers
 
+> **Reference implementation:** [braintrust-sdk-javascript PR #1553](https://github.com/braintrustdata/braintrust-sdk-javascript/pull/1553)
+
 ## Overview
 
 Classifiers categorize and label eval outputs. Unlike scorers (numeric 0-1), classifiers produce structured classification items with optional metadata. Both receive the same arguments (`output`, `expected`, `input`, `metadata`) and run in parallel during evaluations.
@@ -12,23 +14,23 @@ Classifications are stored as `Record<string, ClassificationItem[]>` -- a dictio
 
 ### Evaluator Interface
 
-An evaluator **MUST** include at least one of `scores` or `classifiers` (or both). SDKs **MUST** validate this at runtime and raise a clear error if neither is provided, even if the constraint is also enforced at the type level.
+An evaluator **MUST** include at least one of `scores` or `classifiers` (or both). Because both fields are typed as optional, the type system cannot enforce this constraint; SDKs **MUST** validate it at runtime and raise a clear error if neither is provided.
 
 ```typescript
-interface EvaluatorBase<Input, Output, Expected, Metadata> {
+interface Evaluator<Input, Output, Expected, Metadata> {
   data: () => Dataset<Input, Expected, Metadata>;
   task: (input: Input, hooks: Hooks) => Output | Promise<Output>;
-}
 
-type Evaluator<Input, Output, Expected, Metadata> =
-  | EvaluatorBase<Input, Output, Expected, Metadata> & {
-      scores: EvalScorer<Input, Output, Expected, Metadata>[];
-      classifiers?: EvalClassifier<Input, Output, Expected, Metadata>[];
-    }
-  | EvaluatorBase<Input, Output, Expected, Metadata> & {
-      scores?: EvalScorer<Input, Output, Expected, Metadata>[];
-      classifiers: EvalClassifier<Input, Output, Expected, Metadata>[];
-    };
+  /**
+   * A set of scorer functions. At least one of `scores` or `classifiers` must be provided.
+   */
+  scores?: EvalScorer<Input, Output, Expected, Metadata>[];
+
+  /**
+   * A set of classifier functions. At least one of `scores` or `classifiers` must be provided.
+   */
+  classifiers?: EvalClassifier<Input, Output, Expected, Metadata>[];
+}
 ```
 
 ### EvalClassifier
@@ -58,14 +60,16 @@ interface Classification {
 
 ### EvalResult
 
-The `classifications` field **MUST** be omitted (not an empty object) when no classifiers are defined or all return `null`.
+The `scores` field is always present (may be an empty `{}`). The `classifications` field **MUST** be omitted (not an empty object) when no classifiers are defined, all return `null`, or all fail.
 
 ```typescript
 interface EvalResult {
   input: unknown;
   output: unknown;
   expected?: unknown;
-  scores?: Record<string, number | null>;
+  error: unknown;
+  origin?: ObjectReference;
+  scores: Record<string, number | null>;
   classifications?: Record<string, ClassificationItem[]>;
   metadata?: Record<string, unknown>;
 }
@@ -79,21 +83,26 @@ interface EvalResult {
 
 SDKs **MUST** run classifiers in parallel with scorers (e.g., `Promise.all`).
 
-Each classifier **MUST** run inside a traced span with `type: "classifier"` and `purpose: "scorer"`. The traced span name is resolved from the classifier function name (or fallback) rather than from returned classification items:
+Each classifier **MUST** run inside a traced span with `type: "classifier"` and `purpose: "scorer"`. The traced span is also given the propagated event from the root span, and the scoring arguments (excluding `trace`) are logged as the span's input. The span name is resolved from the classifier function name (or fallback) rather than from returned classification items:
 
 ```typescript
 rootSpan.traced(
-  (classifierSpan) => {
-    const result = await classifierFn({ input, output, expected, metadata });
-    classifierSpan.log({ output: result });
+  async (classifierSpan) => {
+    const result = await classifierFn(scoringArgs);
+    classifierSpan.log({
+      output: resultOutput,
+      metadata: resultMetadata,
+    });
     return result;
   },
   {
     name: resolvedClassifierSpanName,
-    span_attributes: {
+    spanAttributes: {
       type: "classifier",
       purpose: "scorer",
     },
+    propagatedEvent: makeScorerPropagatedEvent(await rootSpan.export()),
+    event: { input: scoringArgsForLogging },
   },
 );
 ```
@@ -102,20 +111,21 @@ rootSpan.traced(
 
 SDKs **MUST** resolve names in two places:
 
-1. Classification span name: `.name` property of the classifier function, then fallback `classifier_${index}`
-2. Classification result grouping key: `name` field on each returned `Classification` object
+1. **Classifier span name**: `.name` property of the classifier function, falling back to `classifier_${index}`.
+2. **Classification result grouping key**: `name` field on each returned `Classification` object. If `name` is missing, empty, or not a string, it **MUST** default to the classifier function's resolved span name (from step 1). This is **not** a validation failure.
 
-Each returned classification item **MUST** include a non-empty string `name`; items with the same returned `name` **MUST** be appended to the same array.
+Items with the same resolved `name` **MUST** be appended to the same array.
 
 ### Validation
 
-Each classification result **MUST** have:
-- A `name` that is a non-empty string
-- An `id` that is a non-empty string
+Each classification item returned by a classifier (whether returned singly or as an element of an array) **MUST** be a non-empty object. If any item is not a non-empty object, the classifier **MUST** be treated as failed with an error like:
 
-If validation fails, treat the classifier as failed.
+```
+When returning structured classifier results, each classification must be a non-empty object.
+```
 
 Additional field rules:
+- `name` defaults to the classifier function's resolved span name when it is missing, empty, or not a string (see Name Resolution above).
 - `metadata` is an unconstrained `Record<string, unknown>`. SDKs **MUST NOT** impose size limits.
 - Duplicate `{name, id}` pairs are allowed. Multiple items with the same `id` under the same name key **MUST** all be stored. Deduplication, if needed, is handled at the display layer.
 - Order is stable. Items **MUST** be stored in the order they are returned by the classifier.
@@ -126,15 +136,23 @@ When storing results, SDKs **MUST** convert `Classification` to `ClassificationI
 
 1. Copy `id` as-is
 2. Default `label` to `id` if not provided
-3. Include `metadata` only if present
+3. Include `metadata` only if present (omit when `undefined`)
 4. Omit `name` (it becomes the dictionary key)
 
+### Logging Classifications
+
+When the `classifications` dictionary is non-empty, SDKs **MUST** log it to the root span:
+
+```typescript
+rootSpan.log({ classifications });
+```
+
 ### Error Handling
 
 Classifier failures **MUST NOT** abort the evaluation or affect other classifiers/scorers.
 
 On failure:
-1. Record the error under `classifier_errors` in eval metadata (maps classifier name to error message)
+1. Record the error under `classifier_errors` in eval metadata (maps classifier name to error message/stack)
 2. Log the error to the root span's metadata
 3. **SHOULD** emit a debug warning
 
@@ -217,6 +235,23 @@ const sentimentClassifier = ({ output }) => [
 ];
 ```
 
+### Classifier with Metadata
+
+```javascript
+Eval("my-project", {
+  data: () => [{ input: "hello", expected: "greeting" }],
+  task: (input) => input,
+  classifiers: [
+    () => ({
+      name: "category",
+      id: "greeting",
+      label: "Greeting",
+      metadata: { source: "unit-test" },
+    }),
+  ],
+});
+```
+
 ### Error Output
 
 When a classifier fails, the result includes:
@@ -225,7 +260,7 @@ When a classifier fails, the result includes:
 {
   "metadata": {
     "classifier_errors": {
-      "broken_classifier": "must return classifications with a non-empty string name"
+      "broken_classifier": "When returning structured classifier results, each classification must be a non-empty object. Got: null"
     }
   }
 }