Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion apps/cli/src/commands/eval/commands/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,8 @@ export const evalRunCommand = command({
threshold: option({
type: optional(number),
long: 'threshold',
description: 'Suite-level quality gate: exit 1 if mean score falls below this value (0-1)',
description:
'Per-test score threshold (0-1, default 0.8). Exit 1 if any test scores below this value',
}),
},
handler: async (args) => {
Expand Down
19 changes: 9 additions & 10 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ import {
calculateEvaluationSummary,
formatEvaluationSummary,
formatMatrixSummary,
formatThresholdSummary,
} from './statistics.js';
import { type TargetSelection, selectMultipleTargets, selectTarget } from './targets.js';

Expand Down Expand Up @@ -568,6 +567,7 @@ async function runSingleEvalFile(params: {
readonly matrixMode?: boolean;
readonly totalBudgetUsd?: number;
readonly failOnError?: FailOnError;
readonly threshold?: number;
}): Promise<{ results: EvaluationResult[] }> {
const {
testFilePath,
Expand Down Expand Up @@ -685,6 +685,7 @@ async function runSingleEvalFile(params: {
failOnError,
graderTarget: options.graderTarget,
model: options.model,
threshold: options.threshold,
streamCallbacks: streamingObserver?.getStreamCallbacks(),
onResult: async (result: EvaluationResult) => {
(
Expand Down Expand Up @@ -1162,6 +1163,7 @@ export async function runEvalCommand(
matrixMode: targetPrep.selections.length > 1,
totalBudgetUsd: targetPrep.totalBudgetUsd,
failOnError: targetPrep.failOnError,
threshold: resolvedThreshold,
});

return result.results;
Expand All @@ -1185,16 +1187,13 @@ export async function runEvalCommand(
);
}

const summary = calculateEvaluationSummary(allResults);
console.log(formatEvaluationSummary(summary));
const thresholdOpts =
resolvedThreshold !== undefined ? { threshold: resolvedThreshold } : undefined;
const summary = calculateEvaluationSummary(allResults, thresholdOpts);
console.log(formatEvaluationSummary(summary, thresholdOpts));

// Threshold quality gate check
let thresholdFailed = false;
if (resolvedThreshold !== undefined) {
const thresholdResult = formatThresholdSummary(summary.mean, resolvedThreshold);
console.log(`\n${thresholdResult.message}`);
thresholdFailed = !thresholdResult.passed;
}
// Exit code matches RESULT verdict: fail if any test scored below threshold.
const thresholdFailed = resolvedThreshold !== undefined && summary.qualityFailureCount > 0;

// Print matrix summary when multiple targets were evaluated
if (isMatrixMode && allResults.length > 0) {
Expand Down
43 changes: 22 additions & 21 deletions apps/cli/src/commands/eval/statistics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ function buildHistogram(values: readonly number[]): readonly HistogramBin[] {

export function calculateEvaluationSummary(
results: readonly EvaluationResult[],
options?: { threshold?: number },
): EvaluationSummary {
const total = results.length;

Expand Down Expand Up @@ -132,10 +133,19 @@ export function calculateEvaluationSummary(
const topResults = sortedResults.slice(0, Math.min(3, sortedResults.length));
const bottomResults = sortedResults.slice(-Math.min(3, sortedResults.length));

// Count by execution status
// Count by execution status. When a custom threshold is provided,
// recompute passed/failed from raw scores instead of executionStatus
// (which uses the hardcoded PASS_THRESHOLD of 0.8).
const executionErrorCount = executionErrors.length;
const qualityFailureCount = results.filter((r) => r.executionStatus === 'quality_failure').length;
const passedCount = results.filter((r) => r.executionStatus === 'ok').length;
const scoreThreshold = options?.threshold;
const passedCount =
scoreThreshold !== undefined
? qualityResults.filter((r) => r.score >= scoreThreshold).length
: results.filter((r) => r.executionStatus === 'ok').length;
const qualityFailureCount =
scoreThreshold !== undefined
? qualityResults.filter((r) => r.score < scoreThreshold).length
: results.filter((r) => r.executionStatus === 'quality_failure').length;

// Aggregate by failure stage and reason (execution errors only)
const byFailureStage: Record<string, number> = {};
Expand Down Expand Up @@ -174,7 +184,10 @@ function formatScore(value: number): string {
return value.toFixed(3);
}

export function formatEvaluationSummary(summary: EvaluationSummary): string {
export function formatEvaluationSummary(
summary: EvaluationSummary,
options?: { threshold?: number },
): string {
if (summary.total === 0) {
return '\nNo results to summarize';
}
Expand All @@ -193,14 +206,16 @@ export function formatEvaluationSummary(summary: EvaluationSummary): string {
lines.push('');
}

// Overall verdict line
// Overall verdict: all non-error cases must score >= per-test threshold.
const gradedCount = summary.total - summary.executionErrorCount;
const threshold = options?.threshold ?? 0.8;
const overallPassed =
summary.passedCount === summary.total - summary.executionErrorCount ||
summary.passedCount === gradedCount ||
(summary.qualityFailureCount === 0 && summary.executionErrorCount === 0);
const overallVerdict = overallPassed ? 'PASS' : 'FAIL';
const useColor = !(process.env.NO_COLOR !== undefined) && (process.stdout.isTTY ?? false);
const verdictColor = overallPassed ? '\x1b[32m' : '\x1b[31m';
const verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${summary.total} passed, mean score: ${formatScore(summary.mean)})`;
const verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`;

lines.push('\n==================================================');
if (useColor) {
Expand Down Expand Up @@ -334,17 +349,3 @@ export function formatMatrixSummary(results: readonly EvaluationResult[]): strin

return lines.join('\n');
}

/**
 * Build the one-line suite threshold report.
 *
 * The suite passes when `meanScore` is greater than or equal to `threshold`.
 * Both values are rendered with two decimal places in the message.
 *
 * @param meanScore - Mean score to compare against the threshold.
 * @param threshold - Minimum mean score required for a PASS verdict.
 * @returns `passed` — whether the threshold was met; `message` — the
 *   formatted `Suite score: …` line ending in PASS or FAIL.
 */
export function formatThresholdSummary(
  meanScore: number,
  threshold: number,
): { passed: boolean; message: string } {
  // Keep the `>=` comparison as-is: a NaN mean must compare false and yield FAIL.
  const meetsThreshold = meanScore >= threshold;

  let label: string;
  if (meetsThreshold) {
    label = 'PASS';
  } else {
    label = 'FAIL';
  }

  const scoreText = meanScore.toFixed(2);
  const thresholdText = threshold.toFixed(2);

  return {
    passed: meetsThreshold,
    message: `Suite score: ${scoreText} (threshold: ${thresholdText}) — ${label}`,
  };
}
60 changes: 39 additions & 21 deletions apps/cli/test/commands/eval/threshold.test.ts
Original file line number Diff line number Diff line change
@@ -1,31 +1,49 @@
import { describe, expect, it } from 'bun:test';

import { formatThresholdSummary } from '../../../src/commands/eval/statistics.js';

describe('formatThresholdSummary', () => {
it('returns PASS when mean score meets threshold', () => {
const result = formatThresholdSummary(0.85, 0.6);
expect(result.passed).toBe(true);
expect(result.message).toContain('0.85');
expect(result.message).toContain('0.60');
expect(result.message).toContain('PASS');
import type { EvaluationResult } from '@agentv/core';

import { calculateEvaluationSummary } from '../../../src/commands/eval/statistics.js';

/**
 * Test fixture: builds a minimal result carrying only `testId`, `score`,
 * and an `executionStatus` derived from the score (`'ok'` at 0.8 or above,
 * `'quality_failure'` otherwise). All other EvaluationResult fields are
 * left unset, hence the type assertion.
 */
function makeResult(testId: string, score: number): EvaluationResult {
  const meetsDefaultCutoff = score >= 0.8;
  return {
    testId,
    score,
    executionStatus: meetsDefaultCutoff ? 'ok' : 'quality_failure',
  } as EvaluationResult;
}

describe('calculateEvaluationSummary with threshold', () => {
const results: EvaluationResult[] = [
makeResult('test-1', 1.0),
makeResult('test-2', 0.6),
makeResult('test-3', 0.9),
makeResult('test-4', 0.4),
];

it('uses default 0.8 threshold when no threshold provided', () => {
const summary = calculateEvaluationSummary(results);
// test-1 (1.0) and test-3 (0.9) pass at 0.8
expect(summary.passedCount).toBe(2);
expect(summary.qualityFailureCount).toBe(2);
});

it('returns FAIL when mean score is below threshold', () => {
const result = formatThresholdSummary(0.53, 0.6);
expect(result.passed).toBe(false);
expect(result.message).toContain('0.53');
expect(result.message).toContain('0.60');
expect(result.message).toContain('FAIL');
it('recomputes passed/failed with custom threshold', () => {
const summary = calculateEvaluationSummary(results, { threshold: 0.5 });
// test-1 (1.0), test-2 (0.6), test-3 (0.9) pass at 0.5
expect(summary.passedCount).toBe(3);
expect(summary.qualityFailureCount).toBe(1);
});

it('returns PASS when mean score exactly equals threshold', () => {
const result = formatThresholdSummary(0.6, 0.6);
expect(result.passed).toBe(true);
it('stricter threshold reduces pass count', () => {
const summary = calculateEvaluationSummary(results, { threshold: 0.95 });
// only test-1 (1.0) passes at 0.95
expect(summary.passedCount).toBe(1);
expect(summary.qualityFailureCount).toBe(3);
});

it('returns PASS for threshold 0 with any score', () => {
const result = formatThresholdSummary(0, 0);
expect(result.passed).toBe(true);
it('threshold 0 passes everything', () => {
const summary = calculateEvaluationSummary(results, { threshold: 0 });
expect(summary.passedCount).toBe(4);
expect(summary.qualityFailureCount).toBe(0);
});
});
8 changes: 4 additions & 4 deletions apps/web/src/content/docs/docs/evaluation/running-evals.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ When halted, remaining tests are recorded with `failureReasonCode: 'error_thresh

### Suite-Level Quality Threshold

Set a minimum mean score for the eval suite. If the mean quality score falls below the threshold, the CLI exits with code 1 — useful for CI/CD quality gates.
Set a per-test score threshold for the eval suite. Each test case must score at or above this value to pass. If any test scores below the threshold, the CLI exits with code 1 — useful for CI/CD quality gates.

**CLI flag:**

Expand All @@ -257,12 +257,12 @@ execution:
threshold: 0.8
```

The CLI `--threshold` flag overrides the YAML value. The threshold is a number between 0 and 1. Mean score is computed from quality results only (execution errors are excluded).
The CLI `--threshold` flag overrides the YAML value. The threshold is a number between 0 and 1 (default: 0.8). Tests that end in an execution error are not graded, so they are excluded from both the pass count and the total it is reported against.

When active, a summary line is printed after the eval results:
When active, the summary line shows how many tests met the threshold:

```
Suite score: 0.85 (threshold: 0.80) — PASS
RESULT: PASS (28/31 scored >= 0.8, mean: 0.927)
```

The threshold also controls JUnit XML pass/fail: tests with scores below the threshold are marked as `<failure>` in JUnit output. When no threshold is set, JUnit defaults to 0.5.
Expand Down
22 changes: 18 additions & 4 deletions packages/core/src/evaluation/orchestrator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ import { type PromptInputs, buildPromptInputs, loadTests } from './yaml-parser.j

type MaybePromise<T> = T | Promise<T>;

function classifyQualityStatus(score: number): ExecutionStatus {
return score >= PASS_THRESHOLD ? 'ok' : 'quality_failure';
/**
 * Map a score to a quality status.
 *
 * @param score - Score to classify.
 * @param threshold - Minimum passing score; falls back to PASS_THRESHOLD
 *   when omitted.
 * @returns `'ok'` when `score >= threshold`, otherwise `'quality_failure'`.
 */
function classifyQualityStatus(score: number, threshold = PASS_THRESHOLD): ExecutionStatus {
  if (score >= threshold) {
    return 'ok';
  }
  return 'quality_failure';
}

function buildSkippedEvaluatorError(
Expand Down Expand Up @@ -194,6 +194,8 @@ export interface RunEvalCaseOptions {
readonly evalDir?: string;
/** Include verbose request details in results (e.g. agent input text) */
readonly verbose?: boolean;
/** Per-test score threshold for pass/fail (default: 0.8) */
readonly threshold?: number;
}

export interface ProgressEvent {
Expand Down Expand Up @@ -261,6 +263,8 @@ export interface RunEvaluationOptions {
readonly graderTarget?: string;
/** CLI override: model for grader target (e.g., "openai:gpt-5-mini") */
readonly model?: string;
/** Per-test score threshold for pass/fail (default: 0.8) */
readonly threshold?: number;
}

export async function runEvaluation(
Expand Down Expand Up @@ -299,6 +303,7 @@ export async function runEvaluation(
retainOnFailure,
graderTarget: cliGraderTarget,
model: cliModel,
threshold: scoreThreshold,
} = options;

// Disable cache when trials > 1 (cache makes trials deterministic = pointless)
Expand Down Expand Up @@ -475,6 +480,7 @@ export async function runEvaluation(
agentTimeoutMs,
targetResolver,
availableTargets,
threshold: scoreThreshold,
});
} catch (error) {
if (verbose) {
Expand Down Expand Up @@ -933,6 +939,7 @@ export async function runEvaluation(
repoManager,
evalDir,
verbose,
threshold: scoreThreshold,
};
let result =
trials && trials.count > 1
Expand Down Expand Up @@ -1123,6 +1130,7 @@ async function runBatchEvaluation(options: {
readonly agentTimeoutMs?: number;
readonly targetResolver?: (name: string) => Provider | undefined;
readonly availableTargets?: readonly string[];
readonly threshold?: number;
}): Promise<readonly EvaluationResult[]> {
const {
evalCases,
Expand All @@ -1138,6 +1146,7 @@ async function runBatchEvaluation(options: {
agentTimeoutMs,
targetResolver,
availableTargets,
threshold: batchThreshold,
} = options;

// Prepare prompt inputs up front so we can reuse them for grading.
Expand Down Expand Up @@ -1246,6 +1255,7 @@ async function runBatchEvaluation(options: {
targetResolver,
availableTargets,
verbose,
threshold: batchThreshold,
});

if (providerError) {
Expand Down Expand Up @@ -1337,6 +1347,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
repoManager,
evalDir,
verbose,
threshold: caseThreshold,
} = options;
const setupDebug = process.env.AGENTV_SETUP_DEBUG === '1';

Expand Down Expand Up @@ -1767,6 +1778,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
fileChanges,
workspacePath,
verbose,
threshold: caseThreshold,
});

const totalDurationMs = Date.now() - caseStartMs;
Expand Down Expand Up @@ -1796,7 +1808,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
const executionStatus: ExecutionStatus =
providerError || skippedEvaluatorError
? 'execution_error'
: classifyQualityStatus(result.score);
: classifyQualityStatus(result.score, caseThreshold);

const finalResult = providerError
? {
Expand Down Expand Up @@ -2017,6 +2029,7 @@ async function evaluateCandidate(options: {
readonly fileChanges?: string;
readonly workspacePath?: string;
readonly verbose?: boolean;
readonly threshold?: number;
}): Promise<EvaluationResult> {
const {
evalCase,
Expand All @@ -2041,6 +2054,7 @@ async function evaluateCandidate(options: {
availableTargets,
fileChanges,
workspacePath,
threshold: evalThreshold,
} = options;

const gradeTimestamp = nowFn();
Expand Down Expand Up @@ -2124,7 +2138,7 @@ async function evaluateCandidate(options: {
scores: scores,
trace: trace,
fileChanges,
executionStatus: classifyQualityStatus(score.score),
executionStatus: classifyQualityStatus(score.score, evalThreshold),
};
}

Expand Down
Loading