Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions apps/cli/src/commands/results/serve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -725,6 +725,61 @@ export function createApp(
}
});

/**
 * GET /api/projects/all-runs
 *
 * Gathers run metadata from every project in the registry into a single list,
 * tags each run with its owning project's id and name, and responds with
 * `{ runs }` sorted by timestamp in descending order.
 */
app.get('/api/projects/all-runs', (c) => {
  type AggregatedRun = {
    filename: string;
    path: string;
    timestamp: string;
    test_count: number;
    pass_rate: number;
    avg_score: number;
    size_bytes: number;
    target?: string;
    experiment?: string;
    project_id: string;
    project_name: string;
  };

  const { projects } = loadProjectRegistry();
  const aggregated: AggregatedRun[] = [];

  for (const project of projects) {
    try {
      for (const meta of listResultFiles(project.path)) {
        // Best-effort enrichment: target/experiment are read from the run's first record.
        let target: string | undefined;
        let experiment: string | undefined;
        try {
          const records = loadLightweightResults(meta.path);
          if (records.length > 0) {
            const first = records[0];
            target = first.target;
            experiment = first.experiment;
          }
        } catch {
          // Enrichment failed; the run is still listed, just without target/experiment.
        }

        aggregated.push({
          filename: meta.filename,
          path: meta.path,
          timestamp: meta.timestamp,
          test_count: meta.testCount,
          pass_rate: meta.passRate,
          avg_score: meta.avgScore,
          size_bytes: meta.sizeBytes,
          // Only emit the optional keys when a non-empty value was found.
          ...(target ? { target } : {}),
          ...(experiment ? { experiment } : {}),
          project_id: project.id,
          project_name: project.name,
        });
      }
    } catch {
      // Any error while listing this project (e.g. an inaccessible path): move on to the next one.
    }
  }

  aggregated.sort((left, right) => right.timestamp.localeCompare(left.timestamp));
  return c.json({ runs: aggregated });
});

// ── Data routes (unscoped) ────────────────────────────────────────────

app.get('/api/config', (c) => handleConfig(c, defaultCtx));
Expand Down
27 changes: 26 additions & 1 deletion apps/studio/src/components/Sidebar.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@ import { Link, useMatchRoute } from '@tanstack/react-router';

import {
isPassing,
useAllProjectRuns,
useCategoryDatasets,
useExperiments,
useProjectList,
useProjectRunDetail,
useProjectRunList,
useRunDetail,
Expand Down Expand Up @@ -104,11 +106,19 @@ export function Sidebar() {

function RunSidebar() {
const matchRoute = useMatchRoute();
const { data } = useRunList();
const { data: projectData } = useProjectList();
const hasProjects = (projectData?.projects.length ?? 0) > 0;

const isHome = matchRoute({ to: '/' });
const runMatch = matchRoute({ to: '/runs/$runId', fuzzy: true });

// On the projects landing page, show aggregated runs from all projects
const useAggregated = hasProjects && isHome !== false;

const { data: localData } = useRunList();
const { data: aggregatedData } = useAllProjectRuns();
const data = useAggregated ? aggregatedData : localData;

return (
<aside className="flex w-64 flex-col border-r border-gray-800 bg-gray-900/50">
<div className="flex items-center gap-2 border-b border-gray-800 px-4 py-4">
Expand All @@ -130,6 +140,21 @@ function RunSidebar() {
'runId' in runMatch &&
(runMatch as { runId: string }).runId === run.filename;

// Aggregated runs link to their project's run detail
if (run.project_id) {
return (
<Link
key={`${run.project_id}/${run.filename}`}
to="/projects/$projectId/runs/$runId"
params={{ projectId: run.project_id, runId: run.filename }}
className="mb-0.5 block truncate rounded-md px-2 py-1.5 text-sm text-gray-400 transition-colors hover:bg-gray-800/50 hover:text-gray-200"
title={run.project_name}
>
{run.filename}
</Link>
);
}

return (
<Link
key={run.filename}
Expand Down
10 changes: 10 additions & 0 deletions apps/studio/src/lib/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,16 @@ export function useProjectList() {
return useQuery(projectListOptions);
}

/**
 * Query options for the run list aggregated across all projects.
 * Fetches `/api/projects/all-runs` and sets `refetchInterval` to 5000.
 */
export const allProjectRunsOptions = queryOptions({
  queryKey: ['projects', 'all-runs'],
  queryFn: () => {
    return fetchJson<RunListResponse>('/api/projects/all-runs');
  },
  refetchInterval: 5000,
});

/** Hook that runs `useQuery` with `allProjectRunsOptions` and returns its result. */
export function useAllProjectRuns() {
  const allRunsQuery = useQuery(allProjectRunsOptions);
  return allRunsQuery;
}

export async function addProjectApi(projectPath: string): Promise<ProjectEntry> {
const res = await fetch('/api/projects', {
method: 'POST',
Expand Down
2 changes: 2 additions & 0 deletions apps/studio/src/lib/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ export interface RunMeta {
size_bytes: number;
target?: string;
experiment?: string;
project_id?: string;
project_name?: string;
}

export interface RunListResponse {
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added apps/web/src/assets/screenshots/studio-runs.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
94 changes: 94 additions & 0 deletions apps/web/src/content/docs/docs/tools/studio.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
---
title: Studio
description: Visual dashboard for reviewing evaluation results
sidebar:
order: 6
---

import { Image } from 'astro:assets';
import studioRuns from '../../../../assets/screenshots/studio-runs.png';
import studioProjects from '../../../../assets/screenshots/studio-projects.png';

The `studio` command launches a web-based dashboard for browsing evaluation runs, inspecting individual test results, and reviewing scores.

<Image src={studioRuns} alt="AgentV Studio showing evaluation runs with pass rates and scores" />

## Usage

```bash
agentv studio
```

Studio auto-discovers results from `.agentv/results/` in the current directory and opens at `http://localhost:3117`.

You can also point it at a specific results file:

```bash
agentv studio .agentv/results/runs/2026-03-30T11-45-56-989Z/index.jsonl
```

## Options

| Option | Description |
|--------|-------------|
| `--port`, `-p` | Port to listen on. Precedence: the `--port` flag, then the `PORT` environment variable, then the default `3117`. |
| `--dir`, `-d` | Working directory (default: current directory) |
| `--multi` | Launch in multi-project dashboard mode |
| `--add <path>` | Register a project by path |
| `--remove <id>` | Unregister a project by ID |
| `--discover <path>` | Scan a directory tree for repos with `.agentv/` |

## Features

- **Recent Runs** -- table of all evaluation runs with target, experiment, timestamp, test count, pass rate, and mean score
- **Experiments** -- group and compare runs by experiment name
- **Targets** -- group runs by target (model/agent)
- **Run Detail** -- drill into a run to see per-test results, scores, and evaluator output
- **Human Review** -- add feedback annotations to individual test results

## Multi-Project Dashboard

By default, Studio shows results for the current directory. The multi-project mode lets you view results across multiple repositories from a single dashboard.

### Registering Projects

Register projects one at a time:

```bash
agentv studio --add /path/to/project-a
agentv studio --add /path/to/project-b
```

Each path must contain a `.agentv/` directory. Projects are stored in `~/.agentv/projects.yaml`.

### Auto-Discovery

Scan a parent directory to find and register all projects:

```bash
agentv studio --discover /path/to/repos
```

This recursively searches (up to 2 levels deep) for directories containing `.agentv/` and registers them.

### Launching the Dashboard

Once projects are registered, launch the multi-project dashboard:

```bash
agentv studio --multi
```

If any projects are registered, multi-project mode is enabled automatically, so the `--multi` flag is optional. The landing page shows a card for each project with its run count, pass rate, and last run time. Click a project to view its runs.

<Image src={studioProjects} alt="AgentV Studio multi-project dashboard showing project cards with pass rates" />

### Removing Projects

Unregister a project by its ID:

```bash
agentv studio --remove my-project
```

Project IDs are derived from the directory name (e.g., `/home/user/repos/my-project` becomes `my-project`).
Loading