diff --git a/apps/cli/src/commands/eval/commands/prompt/accessors.ts b/apps/cli/src/commands/eval/commands/prompt/accessors.ts deleted file mode 100644 index aeb45e44c..000000000 --- a/apps/cli/src/commands/eval/commands/prompt/accessors.ts +++ /dev/null @@ -1,236 +0,0 @@ -import type { EvaluatorConfig, JsonObject, TestMessage } from '@agentv/core'; -import { loadTestById, loadTests } from '@agentv/core'; - -import { findRepoRoot } from '../../shared.js'; - -interface PromptEvalInputResult { - readonly test_id: string; - readonly input: readonly JsonObject[]; - readonly criteria: string; -} - -interface PromptEvalExpectedOutputResult { - readonly test_id: string; - readonly criteria: string; - readonly expected_output: readonly JsonObject[]; - readonly reference_answer?: string; - readonly assertions: readonly EvaluatorConfig[]; -} - -interface PromptEvalListResult { - readonly eval_path: string; - readonly test_ids: readonly string[]; -} - -export async function listPromptEvalTestIds(evalPath: string): Promise { - const repoRoot = await findRepoRoot(process.cwd()); - const tests = await loadTests(evalPath, repoRoot); - - return { - eval_path: evalPath, - test_ids: tests.map((test) => test.id).sort(), - }; -} - -export async function getPromptEvalInput( - evalPath: string, - testId: string, -): Promise { - const repoRoot = await findRepoRoot(process.cwd()); - const evalCase = await loadTestById(evalPath, repoRoot, testId); - const fileMap = buildFileMap(evalCase.input, evalCase.file_paths); - - return { - test_id: evalCase.id, - input: resolveMessages(evalCase.input, fileMap), - criteria: evalCase.criteria, - }; -} - -export async function getPromptEvalExpectedOutput( - evalPath: string, - testId: string, -): Promise { - const repoRoot = await findRepoRoot(process.cwd()); - const evalCase = await loadTestById(evalPath, repoRoot, testId); - - return { - test_id: evalCase.id, - criteria: evalCase.criteria, - expected_output: evalCase.expected_output, - 
reference_answer: evalCase.reference_answer, - assertions: evalCase.assertions ?? [], - }; -} - -export async function getPromptEvalGradingBrief(evalPath: string, testId: string): Promise { - const repoRoot = await findRepoRoot(process.cwd()); - const evalCase = await loadTestById(evalPath, repoRoot, testId); - const fileMap = buildFileMap(evalCase.input, evalCase.file_paths); - const resolvedInput = resolveMessages(evalCase.input, fileMap); - - const lines: string[] = []; - - // Input - const inputText = extractTextFromMessages(resolvedInput); - if (inputText) { - lines.push(`Input: "${inputText}"`); - } - - // Files - if (evalCase.file_paths.length > 0) { - lines.push(`Files: ${evalCase.file_paths.join(', ')}`); - } - - // Expected output - if (evalCase.reference_answer) { - lines.push(`Expected: "${evalCase.reference_answer}"`); - } - - // Criteria - const criteria: string[] = []; - if (evalCase.criteria) { - criteria.push(evalCase.criteria); - } - for (const assertion of evalCase.assertions ?? []) { - const entry = assertion as Record; - const type = entry.type as string | undefined; - const bag = (entry.config as Record) ?? {}; - if (type === 'contains') { - criteria.push(`Output contains '${entry.value}'`); - } else if (type === 'rubrics') { - const items = (entry.criteria ?? bag.criteria) as Array<{ outcome?: string }> | undefined; - if (Array.isArray(items)) { - for (const item of items) { - if (item.outcome) criteria.push(item.outcome); - } - } - } else if (type === 'llm-grader' || type === 'llm_grader') { - const prompt = entry.prompt ?? bag.prompt ?? bag.criteria; - criteria.push(`[llm-grader] ${typeof prompt === 'string' ? prompt : ''}`); - } else if (type === 'code-grader' || type === 'code_grader') { - const name = entry.name ?? type; - const desc = bag.description ?? entry.description; - criteria.push(`[code-grader] ${name}${desc ? 
`: ${desc}` : ''}`); - } else if (type === 'skill-trigger') { - const trigger = entry.should_trigger !== false; - criteria.push(`[skill-trigger] should_trigger: ${trigger} for ${entry.skill}`); - } else if (type) { - criteria.push(`[${type}] ${entry.value ?? bag.criteria ?? bag.prompt ?? ''}`); - } - } - - if (criteria.length > 0) { - lines.push('Criteria:'); - for (const c of criteria) { - lines.push(` - ${c}`); - } - } - - return lines.join('\n'); -} - -function extractTextFromMessages(messages: JsonObject[]): string { - for (const msg of messages) { - if (msg.role !== 'user') continue; - if (typeof msg.content === 'string') return msg.content; - if (Array.isArray(msg.content)) { - const textBlocks = (msg.content as JsonObject[]) - .filter((b) => b.type === 'text') - .map((b) => b.value as string); - if (textBlocks.length > 0) return textBlocks.join(' '); - } - } - return ''; -} - -/** - * Build a mapping from relative file names to resolved absolute paths. - * Uses enriched input file segments as the primary source, then falls back - * to suffix-matching against all file_paths. 
- */ -function buildFileMap( - inputMessages: readonly TestMessage[], - allFilePaths: readonly string[], -): Map { - const map = new Map(); - - for (const message of inputMessages) { - if (!Array.isArray(message.content)) { - continue; - } - - for (const segment of message.content) { - registerResolvedFileSegment(map, segment); - } - } - - // Fall back to suffix-matching against file_paths - return { - get(key: string): string | undefined { - const direct = map.get(key); - if (direct) return direct; - return allFilePaths.find((filePath) => filePath.endsWith(`/${key}`) || filePath === key); - }, - has(key: string): boolean { - return this.get(key) !== undefined; - }, - } as Map; -} - -function registerResolvedFileSegment(map: Map, segment: JsonObject): void { - if (segment.type !== 'file' || typeof segment.resolvedPath !== 'string') { - return; - } - - // `value` is the authored file reference from the eval. `path` is the - // normalized display/reference path attached during parsing. Usually they are - // the same, but both are valid lookup aliases for downstream prompt tooling. - const aliases = [segment.value, segment.path].filter( - (alias): alias is string => typeof alias === 'string', - ); - - for (const alias of aliases) { - map.set(alias, segment.resolvedPath); - } -} - -/** - * Resolve file references in messages, replacing relative values with absolute paths. - * The agent can then read these files directly from the filesystem. 
- */ -function resolveMessages( - messages: readonly TestMessage[], - fileMap: Map, -): JsonObject[] { - return messages.map((message) => { - if (typeof message.content === 'string') { - return { role: message.role, content: message.content } as JsonObject; - } - - if (!Array.isArray(message.content)) { - return { role: message.role, content: message.content } as JsonObject; - } - - const resolvedContent: JsonObject[] = []; - for (const segment of message.content) { - if (typeof segment === 'string') { - resolvedContent.push({ type: 'text', value: segment } as JsonObject); - continue; - } - - const obj = segment as JsonObject; - if (obj.type === 'file' && typeof obj.value === 'string') { - const resolved = fileMap.get(obj.value); - resolvedContent.push({ - type: 'file', - path: resolved ?? obj.value, - } as JsonObject); - } else { - resolvedContent.push(obj); - } - } - - return { role: message.role, content: resolvedContent } as JsonObject; - }); -} diff --git a/apps/cli/src/commands/eval/commands/prompt/index.ts b/apps/cli/src/commands/eval/commands/prompt/index.ts deleted file mode 100644 index 570b2aa07..000000000 --- a/apps/cli/src/commands/eval/commands/prompt/index.ts +++ /dev/null @@ -1,81 +0,0 @@ -import { command, flag, option, optional, positional, string, subcommands } from 'cmd-ts'; - -import { - getPromptEvalExpectedOutput, - getPromptEvalGradingBrief, - getPromptEvalInput, - listPromptEvalTestIds, -} from './accessors.js'; - -export const evalPromptEvalSubcommand = command({ - name: 'eval', - description: 'Extract eval prompt data for agents', - args: { - list: flag({ - long: 'list', - description: 'List available test IDs', - }), - input: flag({ - long: 'input', - description: 'Extract the test input payload for a single test', - }), - expectedOutput: flag({ - long: 'expected-output', - description: 'Extract expected output and grading context for a single test', - }), - gradingBrief: flag({ - long: 'grading-brief', - description: 'Output 
human-readable grading brief with typed criteria', - }), - testId: option({ - type: optional(string), - long: 'test-id', - description: 'Test ID (required for --input and --expected-output)', - }), - evalPath: positional({ - type: string, - displayName: 'eval-path', - description: 'Path to evaluation .yaml, .json, or .jsonl file', - }), - }, - handler: async ({ evalPath, expectedOutput, gradingBrief, input, list, testId }) => { - const selectedModes = [list, input, expectedOutput, gradingBrief].filter(Boolean).length; - if (selectedModes !== 1) { - throw new Error( - 'Specify exactly one of --list, --input, --expected-output, or --grading-brief.', - ); - } - - if (gradingBrief) { - if (!testId) { - throw new Error('--test-id is required with --grading-brief.'); - } - const brief = await getPromptEvalGradingBrief(evalPath, testId); - process.stdout.write(brief); - process.stdout.write('\n'); - return; - } - - if ((input || expectedOutput) && !testId) { - throw new Error('--test-id is required with --input and --expected-output.'); - } - - const requiredTestId = testId ?? ''; - const output = list - ? await listPromptEvalTestIds(evalPath) - : input - ? 
await getPromptEvalInput(evalPath, requiredTestId) - : await getPromptEvalExpectedOutput(evalPath, requiredTestId); - - process.stdout.write(JSON.stringify(output, null, 2)); - process.stdout.write('\n'); - }, -}); - -export const evalPromptCommand = subcommands({ - name: 'prompt', - description: 'Prompt commands', - cmds: { - eval: evalPromptEvalSubcommand, - }, -}); diff --git a/apps/cli/src/commands/eval/index.ts b/apps/cli/src/commands/eval/index.ts index e9a3a991a..305590d1f 100644 --- a/apps/cli/src/commands/eval/index.ts +++ b/apps/cli/src/commands/eval/index.ts @@ -1,7 +1,6 @@ import { subcommands } from 'cmd-ts'; import { evalAssertCommand } from './commands/assert.js'; -import { evalPromptCommand } from './commands/prompt/index.js'; import { evalRunCommand } from './commands/run.js'; export const evalCommand = subcommands({ @@ -9,7 +8,6 @@ export const evalCommand = subcommands({ description: 'Evaluation commands', cmds: { run: evalRunCommand, - prompt: evalPromptCommand, assert: evalAssertCommand, }, }); diff --git a/apps/cli/src/commands/import/claude.ts b/apps/cli/src/commands/import/claude.ts new file mode 100644 index 000000000..5664d1afe --- /dev/null +++ b/apps/cli/src/commands/import/claude.ts @@ -0,0 +1,149 @@ +import { mkdir, writeFile } from 'node:fs/promises'; +import path from 'node:path'; +import { discoverClaudeSessions, parseClaudeSession, readTranscriptFile } from '@agentv/core'; +import { command, flag, option, optional, string } from 'cmd-ts'; + +export const importClaudeCommand = command({ + name: 'claude', + description: 'Import a Claude Code session transcript for offline grading', + args: { + sessionId: option({ + type: optional(string), + long: 'session-id', + description: 'UUID of the Claude Code session to import', + }), + discover: option({ + type: optional(string), + long: 'discover', + description: 'Discovery mode: "latest" to import the most recent session', + }), + projectPath: option({ + type: optional(string), + long: 
'project-path', + description: 'Filter sessions by project path', + }), + output: option({ + type: optional(string), + long: 'output', + short: 'o', + description: + 'Output file path (default: .agentv/transcripts/claude-<session-id>.jsonl)', + }), + projectsDir: option({ + type: optional(string), + long: 'projects-dir', + description: 'Override the default ~/.claude/projects directory', + }), + list: flag({ + long: 'list', + description: 'List available sessions instead of importing', + }), + }, + handler: async ({ sessionId, discover, projectPath, output, projectsDir, list }) => { + if (list) { + const sessions = await discoverClaudeSessions({ + projectPath, + projectsDir, + limit: 20, + }); + + if (sessions.length === 0) { + console.log('No Claude Code sessions found.'); + return; + } + + console.log(`Found ${sessions.length} session(s):\n`); + for (const session of sessions) { + const age = formatAge(session.updatedAt); + console.log(` ${session.sessionId} ${age} ${session.projectDir}`); + } + return; + } + + // Determine which session to import + let sessionFilePath: string; + + if (sessionId) { + const sessions = await discoverClaudeSessions({ + sessionId, + projectPath, + projectsDir, + limit: 1, + }); + + if (sessions.length === 0) { + console.error(`Error: session ${sessionId} not found.`); + process.exit(1); + } + sessionFilePath = sessions[0].filePath; + } else if (discover === 'latest') { + const sessions = await discoverClaudeSessions({ + projectPath, + projectsDir, + latest: true, + }); + + if (sessions.length === 0) { + console.error('Error: no Claude Code sessions found.'); + process.exit(1); + } + sessionFilePath = sessions[0].filePath; + sessionId = sessions[0].sessionId; + console.log(`Discovered latest session: ${sessionId}`); + } else { + console.error('Error: specify --session-id or --discover latest to select a session.'); + process.exit(1); + } + + // Parse the session + const rawJsonl = await readTranscriptFile(sessionFilePath); + const transcript =
parseClaudeSession(rawJsonl); + + // Determine output path + const shortId = (sessionId ?? transcript.source.sessionId).slice(0, 8); + const outputPath = output ?? path.join('.agentv', 'transcripts', `claude-${shortId}.jsonl`); + + // Ensure output directory exists + await mkdir(path.dirname(outputPath), { recursive: true }); + + // Write transcript as JSONL (one message per line) + const outputLines = transcript.messages.map((msg) => JSON.stringify(msg)); + await writeFile(outputPath, `${outputLines.join('\n')}\n`, 'utf8'); + + const msgCount = transcript.messages.length; + const toolCount = transcript.messages.reduce((sum, m) => sum + (m.toolCalls?.length ?? 0), 0); + + console.log(`Imported ${msgCount} messages (${toolCount} tool calls) → ${outputPath}`); + + if (transcript.source.model) { + console.log(` Model: ${transcript.source.model}`); + } + if (transcript.durationMs !== undefined) { + console.log(` Duration: ${formatDurationMs(transcript.durationMs)}`); + } + if (transcript.tokenUsage) { + console.log( + ` Tokens: ${transcript.tokenUsage.input} in / ${transcript.tokenUsage.output} out`, + ); + } + }, +}); + +function formatAge(date: Date): string { + const diffMs = Date.now() - date.getTime(); + const diffMin = Math.floor(diffMs / 60_000); + if (diffMin < 60) return `${diffMin}m ago`; + const diffHours = Math.floor(diffMin / 60); + if (diffHours < 24) return `${diffHours}h ago`; + const diffDays = Math.floor(diffHours / 24); + return `${diffDays}d ago`; +} + +function formatDurationMs(ms: number): string { + if (ms < 1000) return `${ms}ms`; + const seconds = Math.floor(ms / 1000); + if (seconds < 60) return `${seconds}s`; + const minutes = Math.floor(seconds / 60); + const remainingSeconds = seconds % 60; + return `${minutes}m ${remainingSeconds}s`; +} diff --git a/apps/cli/src/commands/import/index.ts b/apps/cli/src/commands/import/index.ts new file mode 100644 index 000000000..d76ddcaf0 --- /dev/null +++ b/apps/cli/src/commands/import/index.ts @@ -0,0 
+1,11 @@ +import { subcommands } from 'cmd-ts'; + +import { importClaudeCommand } from './claude.js'; + +export const importCommand = subcommands({ + name: 'import', + description: 'Import agent session transcripts for offline grading', + cmds: { + claude: importClaudeCommand, + }, +}); diff --git a/apps/cli/src/index.ts b/apps/cli/src/index.ts index 9a2990307..6ae8fbecb 100644 --- a/apps/cli/src/index.ts +++ b/apps/cli/src/index.ts @@ -4,8 +4,8 @@ import packageJson from '../package.json' with { type: 'json' }; import { compareCommand } from './commands/compare/index.js'; import { convertCommand } from './commands/convert/index.js'; import { createCommand } from './commands/create/index.js'; -import { evalPromptCommand } from './commands/eval/commands/prompt/index.js'; import { evalCommand } from './commands/eval/index.js'; +import { importCommand } from './commands/import/index.js'; import { initCmdTsCommand } from './commands/init/index.js'; import { pipelineCommand } from './commands/pipeline/index.js'; import { resultsCommand } from './commands/results/index.js'; @@ -24,7 +24,7 @@ export const app = subcommands({ version: packageJson.version, cmds: { eval: evalCommand, - prompt: evalPromptCommand, + import: importCommand, compare: compareCommand, convert: convertCommand, create: createCommand, @@ -45,14 +45,14 @@ export const app = subcommands({ * Known eval subcommand names — used to decide whether to inject the * implicit `run` subcommand for backward-compatible `agentv eval `. */ -const EVAL_SUBCOMMANDS = new Set(['run', 'prompt', 'assert']); +const EVAL_SUBCOMMANDS = new Set(['run', 'assert']); /** * Top-level CLI command names (excluding `eval` itself). - * Used to distinguish `agentv eval …` from `agentv prompt eval …`. + * Used to ensure `eval` is the top-level subcommand, not nested. 
*/ const TOP_LEVEL_COMMANDS = new Set([ - 'prompt', + 'import', 'compare', 'convert', 'create', @@ -89,13 +89,12 @@ export function preprocessArgv(argv: string[]): string[] { // Implicit `run` subcommand: `agentv eval ` → `agentv eval run ` // when the first arg after `eval` is not a known eval subcommand. // This preserves backward compatibility now that `eval` is a subcommands group. - // Only applies when `eval` is the top-level subcommand, NOT when it appears - // inside another command (e.g. `agentv prompt eval …`). + // Only applies when `eval` is the top-level subcommand. // Exception: `--help` / `-h` should show the eval group help, not run's help. const evalIdx = result.indexOf('eval'); if (evalIdx !== -1) { // Ensure no top-level command appears before `eval` in the argv — - // if one does, `eval` is a nested subcommand (e.g. `prompt eval`). + // if one does, `eval` is a nested subcommand. const isTopLevel = !result.slice(0, evalIdx).some((arg) => TOP_LEVEL_COMMANDS.has(arg)); if (isTopLevel) { const nextArg = result[evalIdx + 1]; diff --git a/apps/cli/test/prompt-eval.integration.test.ts b/apps/cli/test/prompt-eval.integration.test.ts deleted file mode 100644 index 77495c51b..000000000 --- a/apps/cli/test/prompt-eval.integration.test.ts +++ /dev/null @@ -1,168 +0,0 @@ -import { describe, expect, it } from 'bun:test'; -import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises'; -import { tmpdir } from 'node:os'; -import path from 'node:path'; -import { fileURLToPath } from 'node:url'; -import { execa } from 'execa'; - -import { assertCoreBuild } from './setup-core-build.js'; - -assertCoreBuild(); - -interface PromptEvalFixture { - readonly baseDir: string; - readonly suiteDir: string; - readonly evalPath: string; -} - -const __filename = fileURLToPath(import.meta.url); -const __dirname = path.dirname(__filename); -const projectRoot = path.resolve(__dirname, '../../..'); -const CLI_ENTRY = path.join(projectRoot, 'apps/cli/src/cli.ts'); - -async 
function createFixture(): Promise { - const baseDir = await mkdtemp(path.join(tmpdir(), 'agentv-prompt-eval-')); - const suiteDir = path.join(baseDir, 'suite'); - await mkdir(suiteDir, { recursive: true }); - - const evalPath = path.join(suiteDir, 'sample.eval.yaml'); - await writeFile( - evalPath, - `description: Prompt eval CLI fixture - -tests: - - id: greeting-test - criteria: Assistant greets the user by name - assertions: - - name: mentions-name - type: contains - value: Taylor - input: - - role: user - content: Say hello to Taylor. - expected_output: - - role: assistant - content: Hello, Taylor! - - id: farewell-test - criteria: Assistant says goodbye politely - input: - - role: user - content: Say goodbye to Taylor. - expected_output: - - role: assistant - content: Goodbye, Taylor. -`, - 'utf8', - ); - - return { baseDir, suiteDir, evalPath } satisfies PromptEvalFixture; -} - -async function runPromptCli( - fixture: PromptEvalFixture, - args: readonly string[], -): Promise<{ stdout: string; stderr: string; exitCode: number }> { - const result = await execa('bun', ['--no-env-file', CLI_ENTRY, ...args], { - cwd: fixture.suiteDir, - env: { - ...process.env, - CI: 'true', - }, - reject: false, - }); - - return { - stdout: result.stdout, - stderr: result.stderr, - exitCode: result.exitCode ?? 
0, - }; -} - -describe('agentv prompt eval CLI', () => { - it('lists available test IDs', async () => { - const fixture = await createFixture(); - try { - const result = await runPromptCli(fixture, ['prompt', 'eval', '--list', fixture.evalPath]); - - expect(result.exitCode).toBe(0); - expect(JSON.parse(result.stdout)).toEqual({ - eval_path: fixture.evalPath, - test_ids: ['farewell-test', 'greeting-test'], - }); - } finally { - await rm(fixture.baseDir, { recursive: true, force: true }); - } - }); - - it('returns prompt input for a specific test via --input', async () => { - const fixture = await createFixture(); - try { - const result = await runPromptCli(fixture, [ - 'prompt', - 'eval', - '--input', - fixture.evalPath, - '--test-id', - 'greeting-test', - ]); - - expect(result.exitCode).toBe(0); - expect(JSON.parse(result.stdout)).toEqual({ - test_id: 'greeting-test', - input: [{ role: 'user', content: 'Say hello to Taylor.' }], - criteria: 'Assistant greets the user by name', - }); - } finally { - await rm(fixture.baseDir, { recursive: true, force: true }); - } - }); - - it('returns human-readable grading brief via --grading-brief', async () => { - const fixture = await createFixture(); - try { - const result = await runPromptCli(fixture, [ - 'prompt', - 'eval', - '--grading-brief', - fixture.evalPath, - '--test-id', - 'greeting-test', - ]); - - expect(result.exitCode).toBe(0); - expect(result.stdout).toContain('Input:'); - expect(result.stdout).toContain('Say hello to Taylor.'); - expect(result.stdout).toContain('Expected:'); - expect(result.stdout).toContain('Hello, Taylor!'); - expect(result.stdout).toContain('Criteria:'); - expect(result.stdout).toContain('Taylor'); - } finally { - await rm(fixture.baseDir, { recursive: true, force: true }); - } - }); - - it('returns expected output and evaluator context for a specific test', async () => { - const fixture = await createFixture(); - try { - const result = await runPromptCli(fixture, [ - 'prompt', - 'eval', - 
'--expected-output', - fixture.evalPath, - '--test-id', - 'greeting-test', - ]); - - expect(result.exitCode).toBe(0); - expect(JSON.parse(result.stdout)).toEqual({ - test_id: 'greeting-test', - criteria: 'Assistant greets the user by name', - expected_output: [{ role: 'assistant', content: 'Hello, Taylor!' }], - reference_answer: 'Hello, Taylor!', - assertions: [{ name: 'mentions-name', type: 'contains', value: 'Taylor' }], - }); - } finally { - await rm(fixture.baseDir, { recursive: true, force: true }); - } - }); -}); diff --git a/apps/cli/test/unit/preprocess-argv.test.ts b/apps/cli/test/unit/preprocess-argv.test.ts index 116667d9d..d91c31bc3 100644 --- a/apps/cli/test/unit/preprocess-argv.test.ts +++ b/apps/cli/test/unit/preprocess-argv.test.ts @@ -3,79 +3,6 @@ import { describe, expect, it } from 'bun:test'; import { preprocessArgv } from '../../src/index.js'; describe('preprocessArgv', () => { - describe('prompt default subcommand insertion', () => { - it('does not rewrite `prompt` commands without explicit subcommands', () => { - const argv = ['node', 'agentv', 'prompt', 'file.yaml']; - expect(preprocessArgv(argv)).toEqual(argv); - }); - - it('does not rewrite bare `prompt` commands', () => { - const argv = ['node', 'agentv', 'prompt']; - expect(preprocessArgv(argv)).toEqual(argv); - }); - - it('does not insert a default accessor after `prompt eval` when followed by a file', () => { - const result = preprocessArgv(['node', 'agentv', 'prompt', 'eval', 'file.yaml']); - expect(result).toEqual(['node', 'agentv', 'prompt', 'eval', 'file.yaml']); - }); - - it('does not insert a default accessor when `prompt eval` has no further arguments', () => { - const argv = ['node', 'agentv', 'prompt', 'eval']; - expect(preprocessArgv(argv)).toEqual(argv); - }); - - it('passes through `prompt eval --input` with flags', () => { - const result = preprocessArgv([ - 'node', - 'agentv', - 'prompt', - 'eval', - '--input', - 'file.yaml', - '--test-id', - 'case-1', - ]); - 
expect(result).toEqual([ - 'node', - 'agentv', - 'prompt', - 'eval', - '--input', - 'file.yaml', - '--test-id', - 'case-1', - ]); - }); - - it('passes through `prompt eval --expected-output` with flags', () => { - const result = preprocessArgv([ - 'node', - 'agentv', - 'prompt', - 'eval', - '--expected-output', - 'file.yaml', - '--test-id', - 'case-1', - ]); - expect(result).toEqual([ - 'node', - 'agentv', - 'prompt', - 'eval', - '--expected-output', - 'file.yaml', - '--test-id', - 'case-1', - ]); - }); - - it('passes through `prompt eval --list`', () => { - const argv = ['node', 'agentv', 'prompt', 'eval', '--list', 'file.yaml']; - expect(preprocessArgv(argv)).toEqual(argv); - }); - }); - describe('--eval-id convenience alias', () => { it('rewrites `--eval-id` → `--test-id`', () => { const result = preprocessArgv(['node', 'agentv', 'eval', 'file.yaml', '--eval-id', 'case-1']); @@ -86,29 +13,6 @@ describe('preprocessArgv', () => { const result = preprocessArgv(['node', 'agentv', 'eval', 'file.yaml', '--eval-id=case-1']); expect(result).toEqual(['node', 'agentv', 'eval', 'run', 'file.yaml', '--test-id=case-1']); }); - - it('rewrites `--eval-id` in prompt commands', () => { - const result = preprocessArgv([ - 'node', - 'agentv', - 'prompt', - 'eval', - 'input', - 'file.yaml', - '--eval-id', - 'case-1', - ]); - expect(result).toEqual([ - 'node', - 'agentv', - 'prompt', - 'eval', - 'input', - 'file.yaml', - '--test-id', - 'case-1', - ]); - }); }); describe('eval implicit run subcommand', () => { diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index 08f93c66c..0388223d9 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -301,70 +301,19 @@ The `--file` option reads a JSON file with `{ "output": "...", "input": "..." 
}` This is the same interface that agent-orchestrated evals use — the EVAL.yaml transpiler emits `assertions` instructions for code graders so external grading agents can execute them directly. -## Agent-Orchestrated Evals +## Offline Grading -Run evaluations without API keys by letting an external agent (e.g., Claude Code, Copilot CLI) orchestrate the eval pipeline. - -### Overview - -```bash -agentv eval prompt eval --list evals/my-eval.yaml -``` - -Returns JSON listing the available `test_ids` for the eval file. - -### Get Task Input +Grade existing agent sessions without re-running them. Import a transcript, then run deterministic evaluators: ```bash -agentv eval prompt eval --input evals/my-eval.yaml --test-id case-123 -``` - -Returns JSON with: - -- `input` — `[{role, content}]` array. File references use absolute paths (`{type: "file", path: "/abs/path"}`) that the agent can read directly from the filesystem. -- `criteria` — grading criteria for the orchestrator's reference (do not pass to the candidate). - -### Get Grading Context +# Import a Claude Code session +agentv import claude --discover latest -```bash -agentv eval prompt eval --expected-output evals/my-eval.yaml --test-id case-123 +# Run evaluators against the imported transcript +agentv eval evals/my-eval.yaml --transcript .agentv/transcripts/claude-<id>.jsonl ``` -Returns JSON with the data an external grader needs: - -- `expected_output` — reference assistant messages -- `reference_answer` — flattened reference text when available -- `criteria` — high-level success criteria -- `assertions` — evaluator configs for the test - -### Get Grading Brief - -Output a human-readable summary of the grading criteria for a specific test, with type-prefixed assertion tags: - -```bash -agentv eval prompt eval --grading-brief evals/my-eval.yaml --test-id case-123 -``` - -Example output: - -``` -Input: "Summarise the following article in one sentence."
-Expected: "The quick brown fox jumps over the lazy dog near the river bank." -Criteria: - - [code-grader] rouge-score: Measures n-gram recall and F1 - - [llm-grader] Summary captures key points - - [skill-trigger] should_trigger: true for summariser -``` - -This is useful for agents orchestrating evals to understand what criteria a test is evaluated against before running it. - -### When to Use - -| Scenario | Command | -|----------|---------| -| Have API keys, want end-to-end automation | `agentv eval` | -| Run a single assertion in isolation | `agentv eval assert ` | -| No API keys, external agent can orchestrate the run | `agentv eval prompt eval --list/--input/--expected-output` | -| Inspect grading criteria before running | `agentv eval prompt eval --grading-brief` | +See the [Import tool docs](/docs/tools/import/) for all providers and options. ## Version Requirements diff --git a/apps/web/src/content/docs/docs/evaluators/code-graders.mdx b/apps/web/src/content/docs/docs/evaluators/code-graders.mdx index b62200f65..3befe3581 100644 --- a/apps/web/src/content/docs/docs/evaluators/code-graders.mdx +++ b/apps/web/src/content/docs/docs/evaluators/code-graders.mdx @@ -14,10 +14,10 @@ Code graders communicate via stdin/stdout JSON: **Input (stdin):** ```json { - "input_text": "What is 15 + 27?", + "input": "What is 15 + 27?", "criteria": "Correctly calculates 15 + 27 = 42", - "output_text": "The answer is 42.", - "expected_output_text": "42" + "output": "The answer is 42.", + "expected_output": "42" } **Output (stdout):** @@ -41,7 +41,7 @@ Code graders communicate via stdin/stdout JSON: # validators/check_answer.py import json, sys data = json.load(sys.stdin) -output_text = data.get("output_text", "") +output_text = data.get("output", "") assertions = [] @@ -66,7 +66,7 @@ print(json.dumps({ import { readFileSync } from "fs"; const data = JSON.parse(readFileSync("/dev/stdin", "utf-8")); -const outputText: string = data.output_text ?? 
""; +const outputText: string = data.output ?? ""; const assertions: Array<{ text: string; passed: boolean }> = []; @@ -102,7 +102,8 @@ The `@agentv/eval` package provides a declarative API with automatic stdin/stdou #!/usr/bin/env bun import { defineCodeGrader } from '@agentv/eval'; -export default defineCodeGrader(({ outputText, criteria }) => { +export default defineCodeGrader(({ output, criteria }) => { + const outputText = output?.[0]?.content ?? ''; const assertions: Array<{ text: string; passed: boolean }> = []; if (outputText.includes(criteria)) { @@ -146,7 +147,9 @@ Use `createTargetClient` from the SDK: #!/usr/bin/env bun import { createTargetClient, defineCodeGrader } from '@agentv/eval'; -export default defineCodeGrader(async ({ inputText, outputText }) => { +export default defineCodeGrader(async ({ input, output }) => { + const inputText = input?.[0]?.content ?? ''; + const outputText = output?.[0]?.content ?? ''; const target = createTargetClient(); if (!target) return { score: 0, assertions: [{ text: 'Target not configured', passed: false }] }; @@ -171,14 +174,14 @@ Use `target.invokeBatch(requests)` for multiple calls in parallel. 
## Advanced Input Fields -Beyond the basic text fields (`input_text`, `output_text`, `expected_output_text`, `criteria`), code graders receive additional structured context: +Beyond the basic text fields (`input`, `output`, `expected_output`, `criteria`), code graders receive additional structured context: | Field | Type | Description | |-------|------|-------------| +| `input` | `string \| Message[]` | Input text or full resolved input message array | +| `output` | `string \| Message[]` | Agent output text or full execution trace with tool calls | +| `expected_output` | `string \| Message[]` | Expected output text or expected agent behavior including tool calls | | `input_files` | `string[]` | Paths to input files referenced in the eval | -| `input` | `Message[]` | Full resolved input message array | -| `expected_output` | `Message[]` | Expected agent behavior including tool calls | -| `output` | `Message[]` | Actual agent execution trace with tool calls | | `trace` | `TraceSummary` | Lightweight execution metrics (tool calls, errors) | | `token_usage` | `{input, output}` | Token consumption | | `cost_usd` | `number` | Estimated cost in USD | @@ -293,7 +296,7 @@ agentv eval assert rouge-score --file result.json The command: 1. Discovers the grader script by walking up directories looking for `.agentv/graders/<name>.{ts,js,mts,mjs}` -2. Passes `{ output_text, output, input, input_text }` to the script via stdin +2. Passes `{ output, input, criteria }` to the script via stdin 3. Prints the grader's JSON result to stdout 4.
Exits 0 if score >= 0.5, exits 1 otherwise @@ -304,5 +307,5 @@ This is the same interface that agent-orchestrated evals use — the EVAL.yaml t Pipe JSON directly to the grader script for full control: ```bash -echo '{"input_text":"What is 2+2?","criteria":"4","output_text":"4","expected_output_text":"4"}' | python validators/check_answer.py +echo '{"input":"What is 2+2?","criteria":"4","output":"4","expected_output":"4"}' | python validators/check_answer.py ``` diff --git a/apps/web/src/content/docs/docs/guides/agent-skills-evals.mdx b/apps/web/src/content/docs/docs/guides/agent-skills-evals.mdx index e279fc4cd..f5a1d1ac1 100644 --- a/apps/web/src/content/docs/docs/guides/agent-skills-evals.mdx +++ b/apps/web/src/content/docs/docs/guides/agent-skills-evals.mdx @@ -79,23 +79,18 @@ The `files[]` field lists files that the agent needs during evaluation. Paths ar AgentV resolves these paths and copies the files into the workspace before the agent runs. If a file is missing, the test case fails with a `file_copy_error`. -## Agent mode (no API keys) +## Offline grading (no API keys) -AgentV's prompt subcommands work with evals.json, enabling agent-mode evaluation without API keys: +Grade existing agent sessions offline using `agentv import` to convert transcripts, then run deterministic evaluators: ```bash -# List test IDs -agentv prompt eval --list evals.json +# Import a Claude Code session transcript +agentv import claude --discover latest -# Get input for a specific test -agentv prompt eval --input evals.json --test-id 1 - -# Get expected output and evaluator criteria for a test -agentv prompt eval --expected-output evals.json --test-id 1 +# Run deterministic evaluators against the imported transcript +agentv eval evals.json --target copilot-log ``` -In agent mode, the host agent uses these accessors to enumerate tests, act as the candidate, and then grade the saved answer against the eval spec. 
- If you're using the `agentv-bench` skill bundle, validate your evals before running: ```bash diff --git a/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx b/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx index 53f937e32..40db69514 100644 --- a/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx +++ b/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx @@ -119,31 +119,17 @@ Run the same evaluation **with** the skill loaded: agentv eval evals.json --target candidate ``` -Or if using agent mode (no API keys required): +Or grade existing sessions offline (no API keys required): ```bash -# List available test cases -agentv prompt eval --list evals.json +# Import a Claude Code session transcript +agentv import claude --discover latest -# Get the input prompt for a test case -agentv prompt eval --input evals.json --test-id 1 - -# After running the agent, fetch expected output and evaluator criteria -agentv prompt eval --expected-output evals.json --test-id 1 +# Run deterministic evaluators against the imported transcript +agentv eval evals.json --target copilot-log ``` -Agent mode is useful when you want to evaluate skills with agents that don't have a direct API integration — you orchestrate the run yourself and use AgentV's accessor commands to read the eval spec. - -If you're using the `agentv-bench` skill bundle, the equivalent wrappers are: - -```bash -cd plugins/agentv-dev/skills/agentv-bench -bun install -bun scripts/quick-validate.ts --scope wrappers -bun scripts/prompt-eval.ts --list evals.json -bun scripts/prompt-eval.ts --input evals.json --test-id 1 -bun scripts/prompt-eval.ts --expected-output evals.json --test-id 1 -``` +Offline grading is useful when you want to evaluate skills with agents that don't have a direct API integration — import the session transcript and run deterministic evaluators. 
## Step 4: Compare Results diff --git a/apps/web/src/content/docs/docs/tools/import.mdx b/apps/web/src/content/docs/docs/tools/import.mdx new file mode 100644 index 000000000..bc5029fca --- /dev/null +++ b/apps/web/src/content/docs/docs/tools/import.mdx @@ -0,0 +1,111 @@ +--- +title: Import +description: Import agent session transcripts for offline grading +sidebar: + order: 3 +--- + +The `import` command converts agent session transcripts into AgentV's `Message[]` format for offline grading — no re-running the agent, no API keys needed. + +## Supported Providers + +| Provider | Command | Source | +|----------|---------|--------| +| Claude Code | `agentv import claude` | `~/.claude/projects//.jsonl` | + +Codex and Copilot importers are planned for future releases. + +## `import claude` + +Import a Claude Code session transcript. + +### Discover available sessions + +```bash +agentv import claude --list +``` + +Output: + +``` +Found 5 session(s): + + 4c4f9e4e-e6f1-490b-a1b1-9aef543ebf22 2m ago -home-user-myproject + 087b801a-7a63-48ff-b348-62563a290b23 1h ago -home-user-myproject + ed8b8c62-4414-49fb-8739-006d809c8588 3h ago -home-user-other-project +``` + +### Import latest session + +```bash +agentv import claude --discover latest +``` + +### Import a specific session + +```bash +agentv import claude --session-id 4c4f9e4e-e6f1-490b-a1b1-9aef543ebf22 +``` + +### Filter by project path + +```bash +agentv import claude --discover latest --project-path /home/user/myproject +``` + +### Custom output path + +```bash +agentv import claude --discover latest -o transcripts/my-session.jsonl +``` + +Default output: `.agentv/transcripts/claude-.jsonl` + +## Options + +| Flag | Description | +|------|-------------| +| `--session-id ` | Import a specific session by UUID | +| `--discover latest` | Import the most recent session | +| `--project-path ` | Filter sessions by project path | +| `--output, -o ` | Custom output file path | +| `--projects-dir ` | Override 
`~/.claude/projects` directory | +| `--list` | List available sessions instead of importing | + +## Output Format + +The imported transcript is written as JSONL — one `Message` object per line: + +```json +{"role":"user","content":"Fix the bug in auth.ts"} +{"role":"assistant","content":"I'll fix the authentication bug.","toolCalls":[{"tool":"Read","input":{"file_path":"src/auth.ts"},"id":"toolu_01...","output":"...file contents..."}]} +``` + +Each message follows AgentV's standard `Message` interface with `role`, `content`, and optional `toolCalls` (including tool outputs paired from subsequent events). + +## What Gets Parsed + +| Claude Event | AgentV Message | +|-------------|----------------| +| `user` | `{ role: 'user', content }` | +| `assistant` | `{ role: 'assistant', content, toolCalls }` | +| `tool_use` blocks | `ToolCall { tool, input, id }` | +| `tool_result` blocks | Paired with matching `tool_use` by ID | +| `progress`, `system`, `file-history-snapshot` | Skipped | +| Subagent events | Filtered out (v1) | + +Token usage is aggregated from the final cumulative value per LLM request. Duration is computed from first-to-last event timestamp. + +## Workflow + +Import a session, then run evaluators against it: + +```bash +# 1. Import the latest Claude Code session +agentv import claude --discover latest + +# 2. Run evaluators against the imported transcript +agentv eval evals/my-eval.yaml --transcript .agentv/transcripts/claude-4c4f9e4e.jsonl +``` + +See `examples/features/import-claude/` for a complete working example. diff --git a/examples/features/import-claude/README.md b/examples/features/import-claude/README.md new file mode 100644 index 000000000..ed4dba683 --- /dev/null +++ b/examples/features/import-claude/README.md @@ -0,0 +1,65 @@ +# Import Claude — Offline Transcript Grading + +Demonstrates importing a Claude Code session transcript and grading it +offline with deterministic evaluators. 
**No LLM API key needed.** + +Evaluators used: +- `code-grader` — custom TypeScript grader inspecting the full `Message[]` with tool calls + +## Setup + +### 1. Run a Claude Code session + +Start a Claude Code session on any project: + +```bash +claude -p "List all TypeScript files in this project" +``` + +### 2. Import the session transcript + +```bash +agentv import claude --discover latest -o transcripts/session.jsonl +``` + +Or import a specific session: + +```bash +# List available sessions +agentv import claude --list + +# Import by session ID +agentv import claude --session-id -o transcripts/session.jsonl +``` + +### 3. Run the eval + +```bash +agentv eval evals/transcript-check.EVAL.yaml +``` + +## How it works + +``` +~/.claude/projects//.jsonl + ↓ agentv import claude (reads from disk, converts to Message[]) +.agentv/transcripts/claude-.jsonl + ↓ code-grader (deterministic) +pass/fail +``` + +The import pipeline: +1. Discovers Claude Code sessions in `~/.claude/projects/` +2. Parses the JSONL transcript (user messages, assistant responses, tool calls) +3. Pairs `tool_use` blocks with `tool_result` responses +4. Aggregates token usage (last cumulative value per LLM request) +5. Writes a clean `Message[]` JSONL for evaluation + +## Evaluators + +### transcript-quality (code-grader) + +Custom grader using `defineCodeGrader` from `@agentv/eval`. Validates: +1. Transcript contains at least one assistant message +2. Tool calls were recorded with outputs +3. 
No empty assistant messages diff --git a/examples/features/import-claude/evals/transcript-check.EVAL.yaml b/examples/features/import-claude/evals/transcript-check.EVAL.yaml new file mode 100644 index 000000000..ecd18a84c --- /dev/null +++ b/examples/features/import-claude/evals/transcript-check.EVAL.yaml @@ -0,0 +1,8 @@ +tests: + - id: transcript-quality + input: "Analyze the imported Claude Code transcript" + criteria: "Transcript contains meaningful assistant messages with tool calls" + assertions: + - name: transcript-quality + type: code-grader + command: [bun, graders/transcript-quality.ts] diff --git a/examples/features/import-claude/graders/transcript-quality.ts b/examples/features/import-claude/graders/transcript-quality.ts new file mode 100644 index 000000000..91e61ad04 --- /dev/null +++ b/examples/features/import-claude/graders/transcript-quality.ts @@ -0,0 +1,36 @@ +#!/usr/bin/env bun +import { defineCodeGrader } from '@agentv/eval'; + +export default defineCodeGrader(({ output }) => { + const assertions: Array<{ text: string; passed: boolean }> = []; + + // Check 1: Has assistant messages + const assistantMessages = output?.filter((m) => m.role === 'assistant') ?? []; + assertions.push({ + text: `Contains assistant messages (found ${assistantMessages.length})`, + passed: assistantMessages.length > 0, + }); + + // Check 2: Has tool calls with outputs + const toolCalls = assistantMessages.flatMap((m) => m.toolCalls ?? 
[]); + const toolCallsWithOutput = toolCalls.filter((tc) => tc.output != null); + assertions.push({ + text: `Tool calls have outputs (${toolCallsWithOutput.length}/${toolCalls.length})`, + passed: toolCalls.length === 0 || toolCallsWithOutput.length > 0, + }); + + // Check 3: No empty assistant messages + const emptyAssistant = assistantMessages.filter( + (m) => !m.content && (!m.toolCalls || m.toolCalls.length === 0), + ); + assertions.push({ + text: `No empty assistant messages (found ${emptyAssistant.length})`, + passed: emptyAssistant.length === 0, + }); + + const passed = assertions.filter((a) => a.passed).length; + return { + score: assertions.length > 0 ? passed / assertions.length : 0, + assertions, + }; +}); diff --git a/packages/core/src/import/claude-parser.ts b/packages/core/src/import/claude-parser.ts new file mode 100644 index 000000000..03d3d2d19 --- /dev/null +++ b/packages/core/src/import/claude-parser.ts @@ -0,0 +1,323 @@ +/** + * Claude Code session JSONL parser. + * + * Reads a Claude Code session transcript (~/.claude/projects//.jsonl) + * and converts it to AgentV's Message[] format. + * + * Each line is a JSON object with: + * { type, message: { role, content }, sessionId, timestamp, uuid, requestId, ... 
} + * + * Supported event types: + * user → Message { role: 'user' } (also contains tool_result blocks) + * assistant → Message { role: 'assistant', toolCalls from tool_use content blocks } + * + * Skipped event types: progress, system, file-history-snapshot + * + * Key behaviors: + * - tool_use blocks in assistant events → ToolCall (pending output) + * - tool_result blocks in user events → matched to pending tool_use by tool_use_id + * - Usage is cumulative per requestId; only the last value per requestId is used + * - Streaming assistant events with the same requestId are deduplicated (keep latest) + * - Subagent events (isSidechain: true) are filtered out in v1 + * - Duration is from first↔last event timestamp (including skipped types) + * - cost_usd is null (Claude Code does not report per-session cost) + */ + +import type { Message, ToolCall } from '../evaluation/providers/types.js'; +import type { TranscriptEntry, TranscriptSource } from './types.js'; + +interface ClaudeEvent { + readonly type: string; + readonly requestId?: string; + readonly isSidechain?: boolean; + readonly message?: { + readonly role?: string; + readonly content?: string | readonly ClaudeContentBlock[]; + readonly usage?: ClaudeUsage; + readonly model?: string; + }; + readonly sessionId?: string; + readonly timestamp?: string; + readonly uuid?: string; + readonly cwd?: string; +} + +interface ClaudeContentBlock { + readonly type: string; + readonly text?: string; + readonly thinking?: string; + readonly name?: string; + readonly input?: unknown; + readonly id?: string; + readonly tool_use_id?: string; + readonly content?: string | readonly { readonly type: string; readonly text?: string }[]; +} + +interface ClaudeUsage { + readonly input_tokens?: number; + readonly output_tokens?: number; + readonly cache_read_input_tokens?: number; +} + +const SKIPPED_TYPES = new Set(['progress', 'system', 'file-history-snapshot']); + +export function parseClaudeSession(jsonl: string): TranscriptEntry { 
+ const messages: Message[] = []; + let sessionId = ''; + let projectPath: string | undefined; + let model: string | undefined; + let startTimestamp: string | undefined; + let endTimestamp: string | undefined; + + // Track usage per requestId — values are cumulative, so we only keep the last + const usageByRequestId = new Map(); + + // Track the last assistant message per requestId to deduplicate streaming updates + let lastAssistantRequestId: string | undefined; + let lastAssistantIdx = -1; + + // Track pending tool_use IDs for pairing with tool_result in user events + const pendingToolCalls = new Map(); + + const lines = jsonl.split('\n').filter((l) => l.trim().length > 0); + + for (const line of lines) { + let event: ClaudeEvent; + try { + event = JSON.parse(line) as ClaudeEvent; + } catch { + continue; + } + + if (!event.type) continue; + + // Track timestamps from ALL events (including skipped types) for accurate duration + if (event.timestamp) { + if (!startTimestamp) startTimestamp = event.timestamp; + endTimestamp = event.timestamp; + } + + // Skip non-message event types + if (SKIPPED_TYPES.has(event.type)) continue; + + // Skip subagent events (v1: only process main conversation) + if (event.isSidechain) continue; + + // Capture session metadata from first event + if (!sessionId && event.sessionId) { + sessionId = event.sessionId; + } + if (!projectPath && event.cwd) { + projectPath = event.cwd; + } + + switch (event.type) { + case 'user': { + const msg = event.message; + if (!msg) break; + + const contentArr = msg.content; + + // User events can contain both tool_result blocks (responses to tool_use) + // and text blocks. Process tool_results first, then extract text. 
+ if (Array.isArray(contentArr)) { + for (const block of contentArr as readonly ClaudeContentBlock[]) { + if (block.type === 'tool_result' && block.tool_use_id) { + const pending = pendingToolCalls.get(block.tool_use_id); + if (pending) { + const existingMsg = messages[pending.msgIdx]; + const existingCalls = [...(existingMsg.toolCalls ?? [])]; + existingCalls[pending.toolIdx] = { + ...existingCalls[pending.toolIdx], + output: extractToolResultContent(block.content), + }; + messages[pending.msgIdx] = { ...existingMsg, toolCalls: existingCalls }; + pendingToolCalls.delete(block.tool_use_id); + } + } + } + } + + // Extract text content for the user message + const text = extractTextContent(contentArr); + if (text !== undefined) { + messages.push({ role: 'user', content: text }); + } + break; + } + + case 'assistant': { + const msg = event.message; + if (!msg) break; + + // Capture model from first assistant message + if (!model && msg.model) { + model = msg.model; + } + + // Track usage (cumulative per requestId — last value wins) + if (msg.usage && event.requestId) { + usageByRequestId.set(event.requestId, msg.usage); + } + + // Parse content array for text and tool_use blocks + const { text, toolCalls } = extractAssistantContent(msg.content); + + // Deduplicate streaming assistant events with the same requestId + if ( + event.requestId && + event.requestId === lastAssistantRequestId && + lastAssistantIdx >= 0 + ) { + // Replace the previous partial message + messages[lastAssistantIdx] = { + role: 'assistant', + content: text || undefined, + toolCalls: toolCalls.length > 0 ? toolCalls : undefined, + }; + // Re-register tool calls for pairing + registerPendingToolCalls(toolCalls, lastAssistantIdx, pendingToolCalls); + } else { + // Only push if there's actual content or tool calls + if (text || toolCalls.length > 0) { + lastAssistantIdx = messages.length; + messages.push({ + role: 'assistant', + content: text || undefined, + toolCalls: toolCalls.length > 0 ? 
toolCalls : undefined, + }); + registerPendingToolCalls(toolCalls, lastAssistantIdx, pendingToolCalls); + } + } + lastAssistantRequestId = event.requestId; + break; + } + } + } + + // Compute final usage from last-seen value per requestId + let totalInputTokens = 0; + let totalOutputTokens = 0; + for (const usage of usageByRequestId.values()) { + totalInputTokens += Number(usage.input_tokens ?? 0); + totalOutputTokens += Number(usage.output_tokens ?? 0); + } + const hasUsage = usageByRequestId.size > 0; + + let durationMs: number | undefined; + if (startTimestamp && endTimestamp) { + durationMs = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime(); + } + + const source: TranscriptSource = { + provider: 'claude', + sessionId, + projectPath, + startedAt: startTimestamp, + model, + }; + + return { + messages, + source, + tokenUsage: hasUsage ? { input: totalInputTokens, output: totalOutputTokens } : undefined, + durationMs, + costUsd: null, + }; +} + +/** + * Register tool_use IDs from an assistant message for later pairing with tool_result. + */ +function registerPendingToolCalls( + toolCalls: ToolCall[], + msgIdx: number, + pending: Map, +): void { + for (let i = 0; i < toolCalls.length; i++) { + const id = toolCalls[i].id; + if (id) { + pending.set(id, { msgIdx, toolIdx: i }); + } + } +} + +/** + * Extract text content from a message's content field. + */ +function extractTextContent( + content: string | readonly ClaudeContentBlock[] | undefined, +): string | undefined { + if (content === undefined || content === null) return undefined; + if (typeof content === 'string') return content; + + const textParts: string[] = []; + for (const block of content) { + if (block.type === 'text' && block.text) { + textParts.push(block.text); + } + } + return textParts.length > 0 ? textParts.join('') : undefined; +} + +/** + * Extract text and tool_use calls from an assistant message's content array. + * Note: tool_result blocks appear in user events, not here. 
+ */ +function extractAssistantContent(content: string | readonly ClaudeContentBlock[] | undefined): { + text: string | undefined; + toolCalls: ToolCall[]; +} { + if (content === undefined || content === null) { + return { text: undefined, toolCalls: [] }; + } + if (typeof content === 'string') { + return { text: content, toolCalls: [] }; + } + + const textParts: string[] = []; + const toolCalls: ToolCall[] = []; + + for (const block of content) { + switch (block.type) { + case 'text': + if (block.text) textParts.push(block.text); + break; + + case 'tool_use': + if (block.name) { + toolCalls.push({ + tool: block.name, + input: block.input, + id: block.id, + }); + } + break; + + // Skip thinking blocks and other types + } + } + + return { + text: textParts.length > 0 ? textParts.join('') : undefined, + toolCalls, + }; +} + +/** + * Extract text from a tool_result content field. + */ +function extractToolResultContent( + content: string | readonly { readonly type: string; readonly text?: string }[] | undefined, +): string | undefined { + if (content === undefined || content === null) return undefined; + if (typeof content === 'string') return content; + + const parts: string[] = []; + for (const block of content) { + if (block.type === 'text' && block.text) { + parts.push(block.text); + } + } + return parts.length > 0 ? 
parts.join('') : undefined; +} diff --git a/packages/core/src/import/index.ts b/packages/core/src/import/index.ts new file mode 100644 index 000000000..7e695fd3b --- /dev/null +++ b/packages/core/src/import/index.ts @@ -0,0 +1,7 @@ +export { parseClaudeSession } from './claude-parser.js'; +export { + discoverClaudeSessions, + type ClaudeDiscoverOptions, + type ClaudeSession, +} from './session-discovery.js'; +export { readTranscriptFile, type TranscriptEntry, type TranscriptSource } from './types.js'; diff --git a/packages/core/src/import/session-discovery.ts b/packages/core/src/import/session-discovery.ts new file mode 100644 index 000000000..e03fadaf4 --- /dev/null +++ b/packages/core/src/import/session-discovery.ts @@ -0,0 +1,114 @@ +/** + * Claude Code session discovery. + * + * Scans ~/.claude/projects/ for session JSONL files. Claude Code stores + * sessions at: + * ~/.claude/projects//.jsonl + * + * Where is the absolute project path with `/` replaced + * by `-` and prefixed with `-` (e.g., `-home-user-myproject`). + * + * Sessions are returned sorted by modification time (most recent first). + */ + +import { readdir, stat } from 'node:fs/promises'; +import { homedir } from 'node:os'; +import path from 'node:path'; + +export interface ClaudeSession { + /** UUID of the session */ + readonly sessionId: string; + /** Full path to the JSONL file */ + readonly filePath: string; + /** Encoded project directory name */ + readonly projectDir: string; + /** Last modification time */ + readonly updatedAt: Date; +} + +export interface ClaudeDiscoverOptions { + /** Filter by session UUID (exact match). */ + readonly sessionId?: string; + /** Filter by project path (substring match against encoded dir name). */ + readonly projectPath?: string; + /** Maximum number of sessions to return (default: 10). */ + readonly limit?: number; + /** Override the default ~/.claude/projects directory. */ + readonly projectsDir?: string; + /** Return only the most recent session. 
*/ + readonly latest?: boolean; +} + +const DEFAULT_PROJECTS_DIR = () => path.join(homedir(), '.claude', 'projects'); + +/** + * Encode a filesystem path to Claude Code's project directory format. + * `/home/user/myproject` → `-home-user-myproject` + */ +function encodeProjectPath(projectPath: string): string { + return projectPath.replace(/\//g, '-'); +} + +export async function discoverClaudeSessions( + opts?: ClaudeDiscoverOptions, +): Promise { + const projectsDir = opts?.projectsDir ?? DEFAULT_PROJECTS_DIR(); + const limit = opts?.latest ? 1 : (opts?.limit ?? 10); + + let projectDirs: string[]; + try { + projectDirs = await readdir(projectsDir); + } catch { + return []; + } + + // Filter project directories if projectPath is specified + if (opts?.projectPath) { + const encoded = encodeProjectPath(opts.projectPath); + projectDirs = projectDirs.filter((dir) => dir === encoded || dir.includes(encoded)); + } + + const sessions: ClaudeSession[] = []; + + for (const projectDir of projectDirs) { + const dirPath = path.join(projectsDir, projectDir); + + let entries: string[]; + try { + entries = await readdir(dirPath); + } catch { + continue; + } + + for (const entry of entries) { + if (!entry.endsWith('.jsonl')) continue; + + const sessionId = entry.replace(/\.jsonl$/, ''); + + // Filter by session ID if specified + if (opts?.sessionId && sessionId !== opts.sessionId) continue; + + const filePath = path.join(dirPath, entry); + + let updatedAt: Date; + try { + const fileStat = await stat(filePath); + updatedAt = fileStat.mtime; + } catch { + updatedAt = new Date(0); + } + + sessions.push({ + sessionId, + filePath, + projectDir, + updatedAt, + }); + } + } + + // Sort by modification time, most recent first + sessions.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime()); + + return sessions.slice(0, limit); +} diff --git a/packages/core/src/import/types.ts b/packages/core/src/import/types.ts new file mode 100644 index 000000000..5595dfd82 --- /dev/null +++ 
b/packages/core/src/import/types.ts @@ -0,0 +1,43 @@ +/** + * Core types for the transcript import pipeline. + * + * A TranscriptEntry represents a fully parsed agent session transcript: + * the ordered messages plus session metadata, token usage, and duration. + * + * A TranscriptSource describes where a transcript came from (provider, + * session ID, project path, etc.). + */ + +import { readFile } from 'node:fs/promises'; + +import type { Message, ProviderTokenUsage } from '../evaluation/providers/types.js'; + +/** + * A parsed transcript: ordered messages plus session metadata. + */ +export interface TranscriptEntry { + readonly messages: Message[]; + readonly source: TranscriptSource; + readonly tokenUsage?: ProviderTokenUsage; + readonly durationMs?: number; + readonly costUsd?: number | null; +} + +/** + * Metadata describing the origin of a transcript. + */ +export interface TranscriptSource { + readonly provider: string; + readonly sessionId: string; + readonly projectPath?: string; + readonly startedAt?: string; + readonly model?: string; +} + +/** + * Read a JSONL transcript file and return its raw text. + * Throws if the file does not exist or cannot be read. 
+ */ +export async function readTranscriptFile(filePath: string): Promise { + return readFile(filePath, 'utf8'); +} diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 852f73f66..37bb2e8a2 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -87,6 +87,9 @@ export { discoverGraders as discoverJudges, } from './evaluation/registry/grader-discovery.js'; +// Import pipeline +export * from './import/index.js'; + export type AgentKernel = { status: string; }; diff --git a/plugins/agentv-dev/skills/agentv-bench/references/migrating-from-skill-creator.md b/plugins/agentv-dev/skills/agentv-bench/references/migrating-from-skill-creator.md index 7c02b7fb7..2231efbf6 100644 --- a/plugins/agentv-dev/skills/agentv-bench/references/migrating-from-skill-creator.md +++ b/plugins/agentv-dev/skills/agentv-bench/references/migrating-from-skill-creator.md @@ -10,10 +10,8 @@ AgentV runs skill-creator's evals.json directly — no conversion required: # Run evals.json with AgentV agentv eval evals.json -# Or in agent mode (no API keys) -agentv prompt eval --list evals.json -agentv prompt eval --input evals.json --test-id 1 -agentv prompt eval --expected-output evals.json --test-id 1 +# Or run a single assertion offline (no API keys) +agentv eval assert --agent-output "..." --agent-input "..." 
``` AgentV automatically: @@ -22,18 +20,7 @@ AgentV automatically: - Converts `assertions` → LLM-grader evaluators - Resolves `files[]` paths relative to the evals.json directory -If you're using the `agentv-bench` skill, the bundled Bun scripts wrap these same commands and artifacts instead of inventing a second format: - -```bash -cd plugins/agentv-dev/skills/agentv-bench -bun install -bun scripts/run-eval.ts --eval-path ../../../../examples/features/agent-skills-evals/evals.json --dry-run -bun scripts/prompt-eval.ts --list ../../../../examples/features/agent-skills-evals/evals.json -bun scripts/convert-evals.ts --eval-path ../../../../examples/features/agent-skills-evals/evals.json --out /tmp/eval.yaml -bun scripts/generate-report.ts --artifacts .agentv/artifacts --out /tmp/agentv-review.html -``` - -These scripts still call `agentv` wherever possible. Code graders, grading, and artifact generation remain in AgentV core; the scripts just orchestrate and summarize the existing outputs. +If you're using the `agentv-bench` skill, it orchestrates these same AgentV commands. Code graders, grading, and artifact generation remain in AgentV core; the skill just orchestrates and summarizes the existing outputs. ## What You Gain diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md b/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md index 86c1438f0..5d9cc8331 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md @@ -30,15 +30,10 @@ agentv convert evals.json # Run directly without converting (all commands accept evals.json) agentv eval evals.json -agentv prompt eval --list evals.json -agentv prompt eval --input evals.json --test-id 1 -agentv prompt eval --expected-output evals.json --test-id 1 ``` The converter maps `prompt` → `input`, `expected_output` → `expected_output`, `assertions` → `assertions` (`llm-grader`), and resolves `files[]` paths. 
The generated YAML includes TODO comments for AgentV features to add (workspace setup, code graders, rubrics, required gates). -If you're running the lifecycle through `agentv-bench`, use `agentv convert` and `agentv prompt eval` directly — the Python scripts in `agentv-bench/scripts/` orchestrate these same commands. - After converting, enhance the YAML with AgentV-specific capabilities shown below. ## From Chat Transcript @@ -540,10 +535,11 @@ agentv eval [--test-id ] [--target ] [--dry-run] [--thresh # Run with OTLP JSON file (importable by OTel backends) agentv eval --otel-file traces/eval.otlp.json -# Agent-orchestrated evals (no API keys needed) -agentv prompt eval --list # enumerate test IDs -agentv prompt eval --input --test-id # task input JSON (file paths, not embedded content) -agentv prompt eval --expected-output --test-id # expected output + grader criteria +# Run a single assertion in isolation (no API keys needed) +agentv eval assert --agent-output "..." --agent-input "..." + +# Import agent transcripts for offline grading +agentv import claude --discover latest # Re-run only execution errors from a previous output agentv eval --retry-errors