diff --git a/.specify/specs/validate-machine-readable-output.md b/.specify/specs/validate-machine-readable-output.md new file mode 100644 index 000000000..b0c658ee9 --- /dev/null +++ b/.specify/specs/validate-machine-readable-output.md @@ -0,0 +1,1108 @@ +# Design: Machine-Readable Validation Output with Store/Reload + +## Summary + +Add machine-readable output formats to `dandi validate`, support saving validation +results to disk, and reloading/re-rendering them later with different grouping, +filtering, and display options. This enables workflows like: + +- Sweeping all BIDS example datasets or all DANDI datasets, storing results +- Storing validation errors during `upload` for later inspection +- Re-rendering stored results with alternative groupings/filters without re-running validation + +## Related Issues & PRs + +| # | Title | Status | Relevance | +|---|-------|--------|-----------| +| [#1515](https://github.com/dandi/dandi-cli/issues/1515) | `validate: Add -f\|--format option` | Open | Direct: requests JSON/YAML output formats | +| [#1753](https://github.com/dandi/dandi-cli/issues/1753) | Provide easy means for introspecting upload validation failures | Open | Direct: decouple execution from rendering via JSON dumps | +| [#1743](https://github.com/dandi/dandi-cli/pull/1743) | Add filtering of issues by type/ID or by file location | Draft | Related: post-validation filtering, blocked pending store/cache | +| [#1737](https://github.com/dandi/dandi-cli/issues/1737) | `upload,validate: Add --validators option` | Open | Related: selective validator control | +| [#1748](https://github.com/dandi/dandi-cli/issues/1748) | Tidy up the validate command function | Open | Prerequisite: refactor before adding options | +| [#1448](https://github.com/dandi/dandi-cli/issues/1448) | Align validation with server | Open | Consistency between client/server validation | +| [#1624](https://github.com/dandi/dandi-cli/pull/1624) | Serialize Severity by name | Merged | 
Prerequisite: meaningful JSON Severity output | +| [#1514](https://github.com/dandi/dandi-cli/pull/1514) | Overhaul validation results (Pydantic Origins) | Merged | Foundation: Pydantic-based models, JSON-ready | +| [#1619](https://github.com/dandi/dandi-cli/pull/1619) | Deduplicate validation results | Merged | Foundation: no duplicate records | +| [#1597](https://github.com/dandi/dandi-cli/issues/1597) | Replace bidsschematools with bids-validator-deno | Closed | Mentions filtering + result summaries as follow-ups | + +## Current State + +### What exists + +- **`ValidationResult`** (Pydantic `BaseModel` in `validate_types.py`): fully JSON-serializable + with `model_dump(mode="json")` and `model_validate_json()` round-trip support. + Fields: `id`, `origin` (validator+standard+versions), `scope`, `severity`, `message`, + `path`, `dandiset_path`, `dataset_path`, `asset_paths`, `within_asset_paths`, + `path_regex`, `metadata`. The `origin_result` field is excluded from serialization. + **No schema/format version field exists on the model itself.** +- **`Severity_`** annotated type serializes as `"ERROR"`, `"WARNING"`, etc. (not numeric). +- **`Origin`** is a Pydantic `BaseModel` with `type`, `validator`, `validator_version`, + `standard`, `standard_version`, `standard_schema_version`. +- **CLI output** is human-only: colored text via `click.secho()` with grouping by + `none`/`path`, filtering by `--min-severity` and `--ignore REGEX`. +- **`dandi ls`** already supports `-f json|json_pp|json_lines|yaml` via `formatter.py` + with `JSONFormatter`, `JSONLinesFormatter`, `YAMLFormatter` context-manager classes. +- **Upload** runs `get_validation_errors()` per-file but only logs/raises on errors; + no structured output is saved. +- **Log files** are written to `platformdirs.user_log_dir("dandi-cli", "dandi")` + (typically `~/.local/share/logs/dandi-cli/`) with pattern + `YYYY.MM.DD-HH.MM.SSZ-PID.log`. 
The path is stored in `ctx.obj.logfile` in the + Click context (set in `command.py:main()`). +- **Current module layout**: `dandi/validate.py` + `dandi/validate_types.py` as + top-level modules. As more validation modules are anticipated (I/O, reporting, + additional validator backends), this needs to become a subpackage. + +### What's missing + +1. No `--format` option on `validate` (only human-readable output) +2. No `--output` to save results to a file +3. No `--load` to reload previously-saved results +4. No summary statistics +5. No grouping by `validator`, `severity`, `id`, `standard`, etc. +6. Upload validation results are not persisted for later review +7. No shared utility for writing/reading validation JSONL files +8. No schema version on `ValidationResult` for forward-compatible serialization +9. Validation modules are flat files, not a subpackage + +### Importers of current validation modules + +The following files import from `validate.py` or `validate_types.py` and will +need import path updates after the subpackage refactoring: + +- `dandi/cli/cmd_validate.py` — `from ..validate import validate`; `from ..validate_types import ...` +- `dandi/cli/tests/test_cmd_validate.py` — `from ...validate_types import ...` +- `dandi/cli/tests/test_command.py` — `from ..cmd_validate import validate` +- `dandi/upload.py` — `from .validate_types import Severity` +- `dandi/organize.py` — `from .validate_types import ...` +- `dandi/pynwb_utils.py` — `from .validate_types import ...` +- `dandi/files/bases.py` — `from dandi.validate_types import ...` +- `dandi/files/bids.py` — `from ..validate_types import ...`; `from dandi.validate import validate_bids` +- `dandi/files/zarr.py` — `from ..validate_types import ...` +- `dandi/bids_validator_deno/_validator.py` — `from dandi.validate_types import ...` +- `dandi/tests/test_validate.py` — `from ..validate import validate`; `from ..validate_types import ...` +- `dandi/tests/test_validate_types.py` — `from dandi.validate_types 
import ...` + +## Design Principles + +### Separation of concerns (per #1753) + +Three decoupled stages: + +1. **Execution**: Run validators, produce `ValidationResult` objects +2. **Serialization**: Dump results to JSONL (the interchange format) +3. **Rendering**: Display results with grouping, filtering, formatting + +Currently stages 1+3 are coupled in `cmd_validate.py`. This design introduces +stage 2 and makes stage 3 work from either stage 1 (live) or stage 2 (loaded). + +``` +Live validation: validate() → [ValidationResult] → filter/group → render + ↓ + save(companion) + +Stored results: load(files) → [ValidationResult] → filter/group → render + +Upload results: upload() → [ValidationResult] → save(companion) + ↓ + load(companion) → filter/group → render +``` + +### Uniform output across all formats — no envelope/non-envelope split + +All structured formats (`json`, `json_pp`, `json_lines`, `yaml`) emit the +**same data** — a flat list of `ValidationResult` records. No format gets a +special envelope wrapper that others lack. This avoids having two schemas to +maintain and document. + +**JSONL**: one `ValidationResult` per line (pure results, `cat`-composable): + +```bash +cat results/*.jsonl | jq 'select(.severity == "ERROR")' # just works +wc -l results/*.jsonl # = issue count +grep BIDS.NON_BIDS results/*.jsonl # fast text search +vd results/*.jsonl # instant tabular view +``` + +**VisiData integration**: VisiData natively loads `.jsonl` as tabular data — +each `ValidationResult` becomes a row with columns for `id`, `severity`, +`path`, `origin.validator`, `message`, etc. This gives immediate interactive +sorting, filtering, frequency tables, and pivoting with no extra code. Since +VisiData is Python-based, future integration is possible (e.g., a `dandi` +VisiData plugin that adds custom commands for grouping by dandiset, linking +to BIDS spec rules, or opening the offending file). 
+ +**JSON / JSON pretty-printed**: a JSON array of `ValidationResult` objects: + +```json +[ + {"id": "BIDS.NON_BIDS_PATH_PLACEHOLDER", "origin": {...}, "severity": "ERROR", ...}, + {"id": "NWBI.check_data_orientation", "origin": {...}, "severity": "WARNING", ...} +] +``` + +**YAML**: same array structure in YAML syntax. + +Summary statistics are handled separately via `--summary` (human output) or +post-processing (`jq`, `--load` + `--summary`). They are not baked into the +serialized data. + +### Prior art: bids-validator `output.json` + +The bids-validator ([example](https://github.com/bids-standard/bids-examples/blob/3aa560cc/2d_mb_pcasl/derivatives/bids-validator/output.json)) +uses an envelope format: + +```json +{ + "issues": { + "issues": [{"code": "...", "severity": "warning", "location": "...", ...}], + "codeMessages": {"JSON_KEY_RECOMMENDED": "A JSON file is missing..."} + }, + "summary": {"subjects": [...], "totalFiles": 11, "schemaVersion": "1.2.1", ...} +} +``` + +Their format is a **final report** — one monolithic JSON per dataset. Our design +differs intentionally: we produce **composable JSONL records** that can be +concatenated across datasets, piped through standard Unix tools, and loaded +into tabular tools like VisiData. The `_record_version` field on each record +provides the self-describing property that their envelope provides at the +file level. + +### Schema version on `ValidationResult` + +Add a `_record_version` field to `ValidationResult` to enable forward-compatible +deserialization. This lets loaders detect and handle older record formats +gracefully: + +```python +class ValidationResult(BaseModel): + _record_version: str = "1" # schema version for this record format + id: str + origin: Origin + # ... rest of fields +``` + +Serialized as `"_record_version": "1"` in every JSON line. The loader can +check this and warn/adapt if it encounters a newer version. 
The underscore
+prefix signals it is metadata about the record format, not validation content.
+(NOTE(review): Pydantic v2 treats leading-underscore names as *private
+attributes*, which are excluded from `model_dump()`/`model_dump_json()` — a
+plain `_record_version: str = "1"` field would therefore never be serialized,
+and would not round-trip from JSON either. The field will likely need an
+explicit serialization alias, e.g.
+`record_version: str = Field("1", alias="_record_version")` with
+`populate_by_name`/alias serialization enabled. Confirm during
+implementation.)
+
+### Grouping is human-rendering only
+
+Structured output always emits a flat results list. `--grouping` only affects
+human-readable display. Downstream tools can trivially group with
+`jq group_by(.id)` etc., and a stable flat schema is more useful than a format
+that varies by grouping option.
+
+### Orthogonal options
+
+`--format` and `--output` are independent:
+- `--format` controls serialization format (default: `human`)
+- `--output` controls destination (default: stdout)
+- When `--output` is given without `--format`, the format is auto-detected
+  from the file extension: `.json` → `json_pp`, `.jsonl` → `json_lines`,
+  `.yaml`/`.yml` → `yaml`. If the extension is unrecognized, an error is
+  raised (since writing colored human text to a file is not useful)
+
+## Design
+
+### Step 0a: Refactor into `dandi/validate/` subpackage
+
+**Goal**: Convert flat validation modules into a proper subpackage to
+accommodate growing validation functionality (I/O, reporting, future validators).
+
+**Critical**: The `git mv` must be committed **separately** from any content
+changes to preserve git rename tracking.
+
+#### Commit 1: Pure rename (git mv only)
+
+```bash
+mkdir -p dandi/validate
+git mv dandi/validate.py dandi/validate/core.py
+git mv dandi/validate_types.py dandi/validate/types.py
+# Create __init__.py that re-exports everything for backwards compatibility
+```
+
+`dandi/validate/__init__.py` (created, not moved):
+
+```python
+"""Validation of DANDI datasets against schemas and standards.
+
+This package provides validation functionality for dandisets, including:
+- DANDI schema validation
+- BIDS standard validation
+- File layout and organization validation
+- Metadata completeness checking
+"""
+# Re-export public API for backwards compatibility
+from .core import validate, validate_bids  # noqa: F401
+from .types import (  # noqa: F401
+    ORIGIN_INTERNAL_DANDI,
+    ORIGIN_VALIDATION_DANDI,
+    ORIGIN_VALIDATION_DANDI_LAYOUT,
+    ORIGIN_VALIDATION_DANDI_ZARR,
+    Origin,
+    OriginType,
+    Scope,
+    Severity,
+    Severity_,
+    Standard,
+    ValidationResult,
+    Validator,
+)
+```
+
+This `__init__.py` keeps the `dandi.validate` module-level API working
+unchanged, but imports of the old top-level `validate_types` module still
+require updating:
+- `from dandi.validate import validate` — works (via `__init__.py`)
+- `from dandi.validate_types import ValidationResult` — **breaks**, needs update
+
+#### Commit 2: Update all import paths
+
+Update all importers listed above to use the new subpackage paths:
+- `from dandi.validate_types import X` → `from dandi.validate.types import X`
+  (or `from dandi.validate import X` where re-exported)
+- `from dandi.validate import validate` — already works via `__init__.py`
+- `from ..validate_types import X` → `from ..validate.types import X`
+
+Also move test files:
+- `dandi/tests/test_validate.py` → `dandi/validate/tests/test_core.py`
+- `dandi/tests/test_validate_types.py` → `dandi/validate/tests/test_types.py`
+
+#### Resulting subpackage layout
+
+```
+dandi/validate/
+├── __init__.py          # Re-exports for backwards compat
+├── core.py              # validate(), validate_bids() — was validate.py
+├── types.py             # ValidationResult, Origin, etc.
— was validate_types.py +├── io.py # NEW: JSONL read/write utilities +└── tests/ + ├── __init__.py + ├── test_core.py # was dandi/tests/test_validate.py + ├── test_types.py # was dandi/tests/test_validate_types.py + └── test_io.py # NEW: tests for I/O utilities +``` + +### Step 0b: Refactor `cmd_validate.py` (addresses #1748) + +Before adding new options, decompose the `validate()` CLI function. It currently +handles argument parsing, validation execution, filtering, and rendering in one +function. Extract: + +- `_collect_results()` — run validation, return `list[ValidationResult]` +- `_filter_results()` — apply `--min-severity`, `--ignore` +- `_render_results()` — dispatch to human or structured output + +This keeps complexity per function under the LAD threshold (max-complexity 10) +and makes it natural to slot in `--load` as an alternative to `_collect_results()`. + +Adds `@click.pass_context` to `validate()` to access `ctx.obj.logfile`. + +### Step 0c: Add `_record_version` to `ValidationResult` + +Add the schema version field to `ValidationResult` in `dandi/validate/types.py`: + +```python +class ValidationResult(BaseModel): + _record_version: str = "1" + id: str + origin: Origin + # ... existing fields unchanged +``` + +This is a backwards-compatible addition — existing serialized records without +`_record_version` will deserialize fine (Pydantic fills the default). The loader +in `io.py` can log a debug message if it encounters an unknown version. + +### Shared I/O utility: `dandi/validate/io.py` + +New module shared by `cmd_validate.py` and `upload.py`: + +```python +"""Read and write validation results in JSONL format. + +JSONL files contain one ValidationResult per line — pure results, no envelope. +This makes them fully cat-composable, grep-searchable, and jq-processable. 
+""" + +from __future__ import annotations + +import logging +from pathlib import Path + +from .types import ValidationResult + +lgr = logging.getLogger(__name__) + +CURRENT_RECORD_VERSION = "1" + + +def write_validation_jsonl( + results: list[ValidationResult], + path: str | Path, +) -> Path: + """Write validation results to a JSONL file (one result per line). + + Parameters + ---------- + results : list[ValidationResult] + Validation results to write. + path : str | Path + Output file path. + + Returns + ------- + Path + The path written to. + """ + path = Path(path) + with open(path, "w") as f: + for r in results: + f.write(r.model_dump_json() + "\n") + return path + + +def append_validation_jsonl( + results: list[ValidationResult], + path: str | Path, +) -> None: + """Append validation results to an existing JSONL file.""" + path = Path(path) + with open(path, "a") as f: + for r in results: + f.write(r.model_dump_json() + "\n") + + +def load_validation_jsonl(*paths: str | Path) -> list[ValidationResult]: + """Load and concatenate validation results from JSONL files. + + Each line must be a JSON-serialized ValidationResult. Blank lines and + lines that fail to parse as ValidationResult are skipped with a warning. + + Parameters + ---------- + *paths + One or more JSONL file paths. + + Returns + ------- + list[ValidationResult] + Concatenated results from all files. 
+ """ + results: list[ValidationResult] = [] + for p in paths: + p = Path(p) + with open(p) as f: + for lineno, line in enumerate(f, 1): + line = line.strip() + if not line: + continue + try: + vr = ValidationResult.model_validate_json(line) + except Exception: + lgr.debug( + "Skipping unrecognized line %d in %s", lineno, p + ) + continue + rv = getattr(vr, "_record_version", CURRENT_RECORD_VERSION) + if rv != CURRENT_RECORD_VERSION: + lgr.debug( + "Record version %s (expected %s) at %s:%d", + rv, CURRENT_RECORD_VERSION, p, lineno, + ) + results.append(vr) + return results + + +def validation_companion_path(logfile: str | Path) -> Path: + """Derive the validation companion path from a log file path. + + Given ``2026.03.19-14.30.00Z-12345.log``, returns + ``2026.03.19-14.30.00Z-12345_validation.jsonl`` in the same directory. + """ + logfile = Path(logfile) + return logfile.with_name(logfile.stem + "_validation.jsonl") +``` + +### Phase 1a: Format output (`-f/--format`) — addresses #1515 + +**Goal**: Add structured output formats to `dandi validate`. + +#### CLI changes + +```python +@click.option( + "--format", "-f", + "output_format", # avoid shadowing builtin + help="Output format.", + type=click.Choice(["human", "json", "json_pp", "json_lines", "yaml"]), + default="human", +) +``` + +#### Structured output schema (uniform across formats) + +All structured formats emit a flat list of `ValidationResult` records. 
+
+**json / json_pp** — JSON array:
+```json
+[
+  {
+    "_record_version": "1",
+    "id": "BIDS.NON_BIDS_PATH_PLACEHOLDER",
+    "origin": {
+      "type": "VALIDATION",
+      "validator": "bids-validator-deno",
+      "validator_version": "2.0.6",
+      "standard": "BIDS",
+      "standard_version": "1.9.0",
+      "standard_schema_version": "0.11.3"
+    },
+    "scope": "file",
+    "severity": "ERROR",
+    "message": "File does not match any pattern known to BIDS.",
+    "path": "sub-01/anat/junk.txt",
+    "dandiset_path": "/data/dandiset001",
+    "dataset_path": "/data/dandiset001",
+    "asset_paths": null,
+    "within_asset_paths": null,
+    "path_regex": null,
+    "metadata": null
+  }
+]
+```
+
+**json_lines** — one record per line (same fields, `cat`-composable):
+```
+{"_record_version":"1","id":"BIDS.NON_BIDS_PATH_PLACEHOLDER","origin":{...},...}
+{"_record_version":"1","id":"NWBI.check_data_orientation","origin":{...},...}
+```
+
+**yaml** — YAML list of the same records.
+
+No envelope in any format. Summary is a separate concern (see Phase 1c).
+
+#### Implementation approach
+
+Reuse `formatter.py` infrastructure. `ValidationResult.model_dump(mode="json")`
+produces a dict compatible with existing `JSONFormatter`/`YAMLFormatter`.
+The `validate()` CLI function collects all results into a list (already does
+for filtering), then dispatches to `display_errors()` (human) or structured
+formatter.
+
+### Phase 1b: File output (`-o/--output`) + auto-save companion
+
+**Goal**: Write results to file and auto-save a `_validation.jsonl` companion
+alongside the `.log` file.
+
+#### CLI changes
+
+```python
+@click.option(
+    "--output", "-o",
+    help="Write output to file instead of stdout. "
+         "If --format is not given, the format is inferred from the "
+         "file extension.",
+    type=click.Path(dir_okay=False, writable=True),
+    default=None,
+)
+```
+
+Validation:
+- If `--output` is given and `--format` is `human` (default), auto-detect
+  format from file extension (`.json` → `json_pp`, `.jsonl` → `json_lines`,
+  `.yaml`/`.yml` → `yaml`).
If extension is unrecognized, raise + `click.UsageError` + +#### Auto-save companion + +`dandi validate` writes a `_validation.jsonl` companion next to its log file +**by default** whenever there are results (any severity), regardless of +`--format`. The companion is **skipped** only when: +- `--output` is specified (user already has structured results in a file) +- `--load` is used (re-rendering existing results, not a fresh run) +- No results are produced (clean validation) + +```python +# After collecting and filtering results, regardless of --format: +if not load and not output_file and filtered: + if hasattr(ctx, "obj") and ctx.obj is not None: + _auto_save_companion(filtered, ctx.obj.logfile) +``` + +**Important**: The companion must be written in ALL output format branches +(human, structured-to-stdout, structured-to-file), not just in the +structured-to-stdout branch. A bug existed where the companion was only saved +in the `else` branch (structured output to stdout), missing the most common +case (human output, the default). + +The companion accumulation rate matches the already-existing `.log` files. + +The `validate` command's help text should mention this behavior so users +know where to find saved results: + +> Validation results are automatically saved as a JSONL companion next to the +> dandi-cli log file (unless --output is specified). Use ``dandi validate +> --load `` to re-render saved results with different options. + +### Phase 1c: Summary flag (`--summary`) + +**Goal**: Add summary statistics as a display option, decoupled from the +serialized data format. 
+ +```python +@click.option( + "--summary / --no-summary", + help="Show summary statistics after validation output.", + default=False, +) +``` + +Human output appends: + +``` +Summary: + Total issues: 42 + By severity: ERROR: 5, WARNING: 12, HINT: 25 + By validator: bids-validator-deno: 30, nwbinspector: 10, dandi: 2 + Files with errors: 8/150 +``` + +For structured formats, `--summary` appends a summary object **to stderr** +(keeping stdout as pure machine-parseable results), or is simply computed by +the consumer from the flat results list. The summary is never mixed into the +structured output stream itself. + +### Phase 2: Load and Re-Render (`--load`) — addresses #1753 + +**Goal**: Reload previously-saved validation results and apply all display options. + +#### CLI option + +```python +@click.option( + "--load", + help="Load validation results from JSONL file(s) instead of running " + "validation. Accepts multiple --load flags. Shell glob expansion " + "is supported (e.g. results/*.jsonl).", + type=click.Path(exists=True, dir_okay=False), + multiple=True, + default=(), +) +``` + +#### Mutual exclusivity with paths + +`--load` and positional `paths` are mutually exclusive. Enforced explicitly: + +```python +if load and paths: + raise click.UsageError( + "--load and PATH arguments are mutually exclusive. " + "Use --load to re-render stored results, or provide paths to validate." + ) +``` + +#### Behavior + +- When `--load` is specified, skip all validation execution +- Use `load_validation_jsonl()` from `dandi/validate/io.py` to read and concatenate +- Apply `--min-severity`, `--ignore`, `--grouping`, `--format`, `--output` identically +- **Exit code**: reflects the loaded results — non-zero if any ERROR-severity + issues are present after filtering. Rationale: the user is asking "are there + errors in these results?" and the answer should be in the exit code regardless + of whether validation was live or loaded. 
+- Records with unknown `_record_version` are loaded with a debug-level warning + but not rejected (forward-compatible reading) + +#### Example workflows + +```bash +# Run validation (auto-saves _validation.jsonl companion) +dandi validate /data/dandiset001 + +# Re-render the auto-saved results with different filters +dandi validate --load ~/.local/share/logs/dandi-cli/*_validation.jsonl \ + --min-severity ERROR +dandi validate --load results.jsonl --grouping path +dandi validate --load results.jsonl -f json_pp --ignore "NWBI\." + +# Cross-dataset sweep: validate many, then analyze together +for ds in bids-examples/*/; do + dandi validate "$ds" -f json_lines -o "results/$(basename $ds).jsonl" || true +done +dandi validate --load results/*.jsonl --grouping id --min-severity ERROR +``` + +Note: shell glob expansion is done by the shell, not by Click. `--load +results/*.jsonl` works because the shell expands the glob into multiple +`--load` arguments before the CLI sees them. + +### Phase 3: Upload validation companion — addresses #1753 + +**Goal**: Persist all validation results from `dandi upload` for later inspection. + +#### The problem with `ctx.obj.logfile` in upload + +The `upload()` function in `upload.py` is both a Python API and a CLI backend. +The Click context (`ctx.obj.logfile`) is only available via CLI. The upload +function's validation happens inside a deeply nested generator (`_upload_item`). + +**Solution**: Two layers: + +1. **Python API** (`upload.py`): accepts `validation_log_path: str | Path | None` + parameter. Already implemented. +2. **CLI** (`cmd_upload.py`): add an explicit `--validation-log` option that + makes the feature discoverable and testable. + +#### CLI option: `--validation-log` + +```python +@click.option( + "--validation-log", + help="Path for writing validation results in JSONL format. " + "Defaults to a companion file next to the dandi-cli log file. 
" + "Pass empty string to disable.", + type=click.Path(dir_okay=False), + default=None, +) +``` + +Behavior: +- **Not specified** (default): derive companion from `ctx.obj.logfile` via + `validation_companion_path()` — same as current implicit behavior +- **Explicit path**: use that path directly as `validation_log_path` +- **Empty string `""`**: pass `None` to `upload()`, disabling validation logging + +This makes the validation companion feature visible in `dandi upload --help` +and allows users to control where results are saved or opt out entirely. + +#### Python API parameter + +```python +def upload( + paths: Sequence[str | Path] | None = None, + existing: UploadExisting = UploadExisting.REFRESH, + validation: UploadValidation = UploadValidation.REQUIRE, + # ... existing params ... + validation_log_path: str | Path | None = None, +) -> None: +``` + +The CLI wrapper resolves the `--validation-log` option and passes the +resolved path (or `None`) to `upload()`. + +#### Implementation + +```python +# In cmd_upload.py: +if validation_log is not None: + # Explicit --validation-log: use as-is (empty string → disable) + companion = Path(validation_log) if validation_log else None +else: + # Default: derive from logfile + ctx = click.get_current_context() + companion = None + if ctx.obj is not None: + companion = validation_companion_path(ctx.obj.logfile) + +upload_(..., validation_log_path=companion) + +# In upload.py, inside the upload loop (already implemented): +if validation_log_path is not None and validation_statuses: + append_validation_jsonl(validation_statuses, validation_log_path) +``` + +After the upload completes (or if it fails due to validation errors): + +```python +if validation_log_path and Path(validation_log_path).exists(): + lgr.info( + "Validation results saved to %s\n" + " Use `dandi validate --load %s` to review.", + validation_log_path, validation_log_path, + ) +``` + +Uses `append_validation_jsonl()` from `dandi/validate/io.py` — the file is 
+opened in append mode for each batch, allowing incremental writes as files are +validated during upload without holding all results in memory. + +#### Testing the upload validation companion + +See **Step 3** in the Implementation Order section for detailed test descriptions. + +Key insight: to test the default companion-next-to-logfile behavior, CLI tests +must invoke `main` (the Click group), not `upload` directly, because `main()` +sets up `ctx.obj.logfile`. Example: + +```python +runner = CliRunner() +r = runner.invoke(main, ["upload", "--dandi-instance", instance_id, str(dspath)]) +``` + +Tests for the explicit `--validation-log` option can invoke either `main` or +`upload` directly since the option bypasses the logfile derivation. + +### Phase 4: Extended grouping options — enhances #1743 work + +**Goal**: Support additional grouping strategies for human-readable output. + +Extend `--grouping` from `{none, path}` to: + +| Value | Groups by | Use case | +|-------|-----------|----------| +| `none` | No grouping (flat list) | Default, simple review | +| `path` | File/dataset path | Per-file review | +| `severity` | Severity level | Triage by priority | +| `id` | Error ID | Find most common issues | +| `validator` | Validator name | Per-tool review | +| `standard` | Standard (BIDS/NWB/etc) | Per-standard review | +| `dandiset` | Dandiset path | Multi-dandiset sweeps | + +Human output with grouping adds section headers and counts: + +``` +=== ERROR (5 issues) === +[BIDS.NON_BIDS_PATH_PLACEHOLDER] sub-01/junk.txt — File does not match... +... + +=== WARNING (12 issues) === +[NWBI.check_data_orientation] sub-01/sub-01.nwb — Data may not be... +... +``` + +**Structured output is unaffected** — always a flat results list regardless +of `--grouping`. Downstream tools group trivially: +`jq -s 'group_by(.origin.validator)'`. 
+ +### Phase 5: Cross-dataset sweep support and tooling integration (optional) + +- Helper script or command for batch validation across directories +- Consider a `dandi validate-report` subcommand for aggregate analysis + across multiple loaded JSONL files (cross-tabulation, top-N errors, etc.) +- **VisiData plugin**: Since VisiData is Python-based and already opens JSONL + natively, a `dandi` VisiData plugin could add: + - Custom column extractors for nested `origin.*` fields (flattened for tabular view) + - Frequency sheet by error ID / validator / standard + - Keybinding to open the offending file or jump to BIDS spec rule + - Integration with `dandi validate --load` for round-trip editing + (e.g., mark false positives, export filtered subset) + +## Path Serialization + +Paths in `ValidationResult` are serialized as-is (whatever `Path.__str__` +produces). For portability across machines: + +- **Recommendation**: When the path is within a dandiset, make it relative + to `dandiset_path`. When within a BIDS dataset, make it relative to + `dataset_path`. This is not a requirement for Phase 1 but should be + considered for Phase 2+ to make loaded results meaningful across machines. 
+ +## Use Case: Sweeping BIDS Example Datasets / All DANDI Datasets + +```bash +# Validate all BIDS examples, save results per-dataset +for ds in bids-examples/*/; do + dandi validate "$ds" -f json_lines -o "results/$(basename $ds).jsonl" || true +done + +# Combine and analyze with jq +cat results/*.jsonl | jq 'select(.severity == "ERROR")' | jq -s 'group_by(.id)' + +# Or reload individual results with different views +dandi validate --load results/ds001.jsonl --grouping id --min-severity ERROR +dandi validate --load results/ds001.jsonl -f json_pp + +# Reload ALL results across datasets +dandi validate --load results/*.jsonl --grouping id --min-severity ERROR --summary + +# Interactive exploration with VisiData — sort, filter, pivot, frequency tables +vd results/*.jsonl +``` + +For all DANDI datasets: + +```bash +for dandiset_id in $(dandi ls -f json_lines https://dandiarchive.org/ | jq -r '.identifier'); do + dandi download "DANDI:${dandiset_id}/draft" -o "/data/${dandiset_id}" --download dandiset.yaml + dandi validate "/data/${dandiset_id}" -f json_lines -o "results/${dandiset_id}.jsonl" || true +done + +# Aggregate cross-dandiset analysis +cat results/*.jsonl | jq -s ' + group_by(.id) + | map({id: .[0].id, count: length, severity: .[0].severity}) + | sort_by(-.count) +' +``` + +## Implementation Order + +### Step 0a: Refactor into `dandi/validate/` subpackage + +- `git mv dandi/validate.py dandi/validate/core.py` — committed alone +- `git mv dandi/validate_types.py dandi/validate/types.py` — committed alone +- Create `dandi/validate/__init__.py` with re-exports for backwards compat +- Separate commit: update all import paths across the codebase +- Separate commit: move test files to `dandi/validate/tests/` +- All existing tests must pass after each commit + +### Step 0b: Refactor `cmd_validate.py` — addresses #1748 + +- Extract `_collect_results()`, `_filter_results()`, `_render_results()` +- Keep `validate()` CLI function as thin orchestrator +- Existing 
behavior unchanged, existing tests must pass
+- Add `@click.pass_context` to `validate()` to access `ctx.obj.logfile`
+
+### Step 0c: Add `_record_version` to `ValidationResult`
+
+- Add `_record_version: str = "1"` field, serialized as `record_version` via a Pydantic serialization alias (Pydantic treats leading-underscore names as private attributes and excludes them from `model_dump()`, so without an alias — or naming the field `record_version` outright — the round-trip below and the tests asserting on the `record_version` key would fail)
+- Verify serialization round-trip includes it
+- No behavioral change to existing code
+
+### Step 1a: `--format` (structured output to stdout) — addresses #1515
+
+- Add `--format` option with choices `human|json|json_pp|json_lines|yaml`
+- All structured formats emit flat list of ValidationResult records
+- Reuse `formatter.py` infrastructure for JSON/YAML
+- Tests: CliRunner for each format, round-trip serialization
+
+### Step 1b: `--output` + auto-save companion
+
+- Add `--output` option, auto-detect format from extension (`.json` →
+  `json_pp`, `.jsonl` → `json_lines`, `.yaml`/`.yml` → `yaml`); error
+  if extension unrecognized and `--format` not given
+- Create `dandi/validate/io.py` with shared `write_validation_jsonl()`,
+  `append_validation_jsonl()`, `load_validation_jsonl()`,
+  `validation_companion_path()`
+- Auto-save `_validation.jsonl` companion when results are non-empty and
+  `--output` is not specified (user already has their output file)
+- Tests: file creation, companion naming, empty-results no-op, companion
+  suppressed when `--output` is used
+
+### Step 1c: `--summary`
+
+- Add `--summary` flag for human output
+- For structured formats, summary to stderr (not mixed into data stream)
+- Tests: summary output format, counts accuracy
+
+### Step 2: `--load` (reload and re-render) — addresses #1753
+
+- Add `--load` option (multiple, mutually exclusive with paths)
+- Use `load_validation_jsonl()` from shared utility
+- All filtering/grouping/format options work on loaded results
+- Exit code reflects loaded results
+- `_record_version` checked with debug-level warning for unknown versions
+- Tests: load + filter, multi-file concatenation, mutual exclusivity error,
+  forward-compatible loading of unknown versions
+
+### Step 
3: Upload validation companion — addresses #1753 + +#### 3a: Add `--validation-log` CLI option to `dandi upload` + +Currently the upload command writes a validation companion derived from +`ctx.obj.logfile` (the dandi-cli log file), but this is invisible to the +user — there is no `--help` text about it, no way to control the path, and +no way to disable it. + +Add an explicit `--validation-log` option: + +```python +@click.option( + "--validation-log", + help="Path for writing validation results in JSONL format. " + "Defaults to a companion file next to the dandi-cli log file. " + "Pass empty string to disable.", + type=click.Path(dir_okay=False), + default=None, +) +``` + +Behavior: +- **Not specified** (default): derive companion from `ctx.obj.logfile` as today +- **Explicit path**: use that path directly +- **Empty string `""`**: disable validation logging entirely + +This makes the feature discoverable via `--help` and testable without +needing to go through `main()` to get a logfile. + +#### 3b: Python API parameter + +- `upload()` already has `validation_log_path: str | Path | None = None` +- CLI wrapper resolves the `--validation-log` option → path or `None` and + passes it through + +#### 3c: Testing strategy for upload validation companion + +**Problem**: Existing upload tests call `SampleDandiset.upload()` which +invokes the Python API directly. The Python API's `validation_log_path` +parameter works, but the real end-to-end flow — where `dandi upload` +automatically derives the companion from its log file — is untested. Also, +invoking `CliRunner().invoke(upload, ...)` on the subcommand alone does not +set `ctx.obj.logfile` (that is done by `main()`). + +**Approach — two layers of tests**: + +1. **Python API tests** (`dandi/tests/test_upload.py`): Extend existing + tests that already exercise `SampleDandiset.upload()` to pass + `validation_log_path` and verify companion contents. These test the core + write logic without CLI overhead. 
+ + - `test_upload_bids_invalid`: pass `validation_log_path`, verify companion + exists with ERROR-level results after `UploadError` + - `test_upload_invalid_metadata`: same pattern for NWB validation errors + - New `test_upload_validation_companion_clean`: upload valid NWB with + `validation_log_path`, verify companion absent or contains only + sub-ERROR results + +2. **CLI integration tests** (`dandi/cli/tests/test_cmd_upload.py` — new file): + Use `CliRunner().invoke(main, ["upload", ...])` to go through `main()` + so `ctx.obj.logfile` is set. These require the docker-compose archive + fixture. Tests verify: + + - Default behavior: `_validation.jsonl` companion appears next to the log + file (derived via `validation_companion_path(ctx.obj.logfile)`) + - `--validation-log /path/to/custom.jsonl`: companion at specified path + - `--validation-log ""`: no companion written + - Companion content is loadable via `load_validation_jsonl()` and contains + expected severity levels + + The log file path can be discovered from the CliRunner output (dandi + logs "Logs saved in ..." at INFO level) or by inspecting the logdir + after invocation. + + ```python + @pytest.mark.ai_generated + def test_upload_cli_validation_companion( + new_dandiset: SampleDandiset, tmp_path: Path + ) -> None: + """CLI upload creates validation companion next to log file.""" + # ... set up dandiset with a file that has validation warnings/errors ... 
+ runner = CliRunner() + r = runner.invoke(main, [ + "upload", + "--dandi-instance", new_dandiset.api.instance_id, + str(new_dandiset.dspath), + ]) + # Find the log file from the logdir + logdir = Path(platformdirs.user_log_dir("dandi-cli", "dandi")) + log_files = sorted(logdir.glob("*.log")) + assert log_files # at least one log was created + latest_log = log_files[-1] + companion = validation_companion_path(latest_log) + if companion.exists(): + results = load_validation_jsonl(companion) + assert len(results) > 0 + ``` + + ```python + @pytest.mark.ai_generated + def test_upload_cli_validation_log_option( + new_dandiset: SampleDandiset, tmp_path: Path + ) -> None: + """--validation-log sends companion to explicit path.""" + custom_log = tmp_path / "my-validation.jsonl" + runner = CliRunner() + r = runner.invoke(main, [ + "upload", + "--validation-log", str(custom_log), + "--dandi-instance", new_dandiset.api.instance_id, + str(new_dandiset.dspath), + ]) + # Verify custom path was used + if custom_log.exists(): + results = load_validation_jsonl(custom_log) + assert all(isinstance(r, ValidationResult) for r in results) + ``` + + ```python + @pytest.mark.ai_generated + def test_upload_cli_validation_log_disabled( + new_dandiset: SampleDandiset, tmp_path: Path + ) -> None: + """--validation-log '' disables companion.""" + runner = CliRunner() + r = runner.invoke(main, [ + "upload", + "--validation-log", "", + "--dandi-instance", new_dandiset.api.instance_id, + str(new_dandiset.dspath), + ]) + # No validation JSONL should exist in logdir + logdir = Path(platformdirs.user_log_dir("dandi-cli", "dandi")) + assert not list(logdir.glob("*_validation.jsonl")) + ``` + +### Step 4: Extended grouping (human-only) — enhances #1743 + +- Add `severity`, `id`, `validator`, `standard`, `dandiset` grouping options +- Implement section headers + counts for human output +- Structured output unchanged (always flat) +- This subsumes some of the filtering work in draft PR #1743 + +### Step 
5: Cross-dataset sweep support (optional) + +- Helper script or `dandi validate-report` subcommand +- Aggregate analysis across multiple JSONL files + +## Backwards Compatibility + +- `dandi validate` with no new options behaves identically to today +- Exit code semantics preserved: non-zero if any ERROR-severity issues +- When using `--format` other than `human`, colored output is suppressed +- When using `--load`, exit code reflects loaded results +- The auto-save companion is the only new side effect; it writes when there + are results and `--output` is not specified (for `validate`), or always + (for `upload`). Follows the same lifecycle as the existing `.log` files +- The `dandi/validate/` subpackage `__init__.py` re-exports all public API, + so `from dandi.validate import validate` continues to work. Only direct + `from dandi.validate_types import ...` imports need updating. + +## Testing Strategy + +| Component | Test type | Location | Approach | +|-----------|-----------|----------|----------| +| Subpackage refactor | Smoke | existing | All existing tests pass after `git mv` + import updates | +| `--format` output | CLI unit | `cli/tests/test_cmd_validate.py` | `CliRunner`, assert JSON structure, round-trip | +| `dandi/validate/io.py` | Unit | `validate/tests/test_io.py` | Write → load round-trip, append, empty files, corrupt lines | +| `_record_version` | Unit | `validate/tests/test_types.py` | Serialization includes field, loader handles missing/unknown | +| `--load` | CLI unit | `cli/tests/test_cmd_validate.py` | Load from fixture files, multi-file concat, mutual exclusivity | +| `--output` | CLI unit | `cli/tests/test_cmd_validate.py` | Verify file creation, content matches stdout format | +| Companion auto-save (`validate`) | CLI unit | `cli/tests/test_cmd_validate.py` | Verify `_validation.jsonl` created next to mock logfile | +| Upload companion (Python API) | Integration | `tests/test_upload.py` | Pass `validation_log_path` to 
`SampleDandiset.upload()`, verify file + contents | +| Upload companion (CLI, default) | CLI integration | `cli/tests/test_cmd_upload.py` | `CliRunner().invoke(main, ["upload", ...])` — verify `_validation.jsonl` next to log file | +| Upload `--validation-log` option | CLI integration | `cli/tests/test_cmd_upload.py` | Custom path, empty-string disable | +| Extended grouping | CLI unit | `cli/tests/test_cmd_validate.py` | Each grouping value, section headers, counts | +| Summary | CLI unit | `cli/tests/test_cmd_validate.py` | Verify counts match actual results | +| Edge cases | Unit | various | Empty results, None severity, very long paths, Unicode messages | + +**Note on CLI integration tests**: Tests in `cli/tests/test_cmd_upload.py` must invoke +`main` (not `upload` directly) via `CliRunner().invoke(main, ["upload", ...])` so that +`ctx.obj.logfile` is set by the `main()` group callback. Invoking the `upload` +subcommand directly skips the log file setup. + +All new tests marked `@pytest.mark.ai_generated`. 
+ +## File Inventory + +| File | Change | +|------|--------| +| `dandi/validate/__init__.py` | **New** — subpackage with re-exports | +| `dandi/validate/core.py` | **Renamed** from `dandi/validate.py` | +| `dandi/validate/types.py` | **Renamed** from `dandi/validate_types.py` + add `_record_version` | +| `dandi/validate/io.py` | **New** — shared JSONL read/write utilities | +| `dandi/validate/tests/__init__.py` | **New** | +| `dandi/validate/tests/test_core.py` | **Renamed** from `dandi/tests/test_validate.py` | +| `dandi/validate/tests/test_types.py` | **Renamed** from `dandi/tests/test_validate_types.py` | +| `dandi/validate/tests/test_io.py` | **New** — tests for I/O utilities | +| `dandi/cli/cmd_validate.py` | Refactor + add `--format`, `--output`, `--load`, `--summary`, grouping extensions | +| `dandi/upload.py` | Add `validation_log_path` parameter, write companion | +| `dandi/cli/cmd_upload.py` | Add `--validation-log` option, pass companion path to `upload()` | +| `dandi/cli/tests/test_cmd_upload.py` | **New** — CLI integration tests for upload validation companion | +| `dandi/cli/tests/test_cmd_validate.py` | Extend with format/load/output/summary tests | +| `dandi/tests/test_upload.py` | Extend existing tests to pass `validation_log_path` and verify companion | +| `dandi/pynwb_utils.py` | Update imports | +| `dandi/organize.py` | Update imports | +| `dandi/files/bases.py` | Update imports | +| `dandi/files/bids.py` | Update imports | +| `dandi/files/zarr.py` | Update imports | +| `dandi/bids_validator_deno/_validator.py` | Update imports | diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 32c0167db..26ab88b79 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -92,6 +92,31 @@ instance as `dandi-api-local-docker-tests`. See the note below on the `DANDI_DEVEL` environment variable, which is needed in order to expose the development command line options. 
+## Code style conventions + +### Dataclass and attrs field documentation + +Document dataclass/attrs fields using `#:` comments above the field, not +docstrings below. This is the format Sphinx autodoc recognizes for attribute +documentation: + +```python +@dataclass +class Movement: + """A movement/renaming of an asset""" + + #: The asset's original path + src: AssetPath + #: The asset's destination path + dest: AssetPath + #: Whether to skip this operation because an asset already exists at the + #: destination + skip: bool = False +``` + +See [dandi.move.Movement on RTD](https://dandi.readthedocs.io/en/latest/modref/generated/dandi.move.html#dandi.move.Movement) +for a rendered example. + ## Environment variables - `DANDI_DEVEL` -- enables otherwise hidden command line options, such as diff --git a/dandi/bids_validator_deno/_validator.py b/dandi/bids_validator_deno/_validator.py index e811f2c5a..8c55cb488 100644 --- a/dandi/bids_validator_deno/_validator.py +++ b/dandi/bids_validator_deno/_validator.py @@ -13,7 +13,7 @@ from pydantic import DirectoryPath, validate_call from dandi.utils import find_parent_directory_containing -from dandi.validate.types import ( +from dandi.validate._types import ( Origin, OriginType, Scope, diff --git a/dandi/cli/cmd_upload.py b/dandi/cli/cmd_upload.py index 1f057dc36..55cdb794b 100644 --- a/dandi/cli/cmd_upload.py +++ b/dandi/cli/cmd_upload.py @@ -96,7 +96,8 @@ def upload( can point to specific files you would like to validate and have uploaded. 
""" # Avoid heavy imports by importing with function: - from ..upload import upload + from ..upload import upload as upload_ + from ..validate._io import validation_companion_path if jobs_pair is None: jobs = None @@ -104,7 +105,12 @@ def upload( else: jobs, jobs_per_file = jobs_pair - upload( + ctx = click.get_current_context() + companion = ( + validation_companion_path(ctx.obj.logfile) if ctx.obj is not None else None + ) + + upload_( paths, existing=existing, validation=validation, @@ -115,4 +121,5 @@ def upload( jobs=jobs, jobs_per_file=jobs_per_file, sync=sync, + validation_log_path=companion, ) diff --git a/dandi/cli/cmd_validate.py b/dandi/cli/cmd_validate.py index bc86b4982..5ef7501ee 100644 --- a/dandi/cli/cmd_validate.py +++ b/dandi/cli/cmd_validate.py @@ -1,17 +1,54 @@ from __future__ import annotations +from collections import OrderedDict +import dataclasses +import json as json_mod import logging import os import re -from typing import cast +import sys +from typing import IO, Union, cast import warnings import click from .base import devel_debug_option, devel_option, map_to_click_exceptions +from .formatter import JSONFormatter, JSONLinesFormatter, TextFormatter, YAMLFormatter from ..utils import pluralize -from ..validate.core import validate as validate_ -from ..validate.types import Severity, ValidationResult +from ..validate._core import validate as validate_ +from ..validate._io import ( + load_validation_jsonl, + validation_companion_path, + write_validation_jsonl, +) +from ..validate._types import Severity, ValidationResult + +lgr = logging.getLogger(__name__) + + +@dataclasses.dataclass +class TruncationNotice: + """Placeholder indicating omitted results in truncated output.""" + + #: Number of validation results omitted from this group + omitted_count: int + """Number of validation results omitted from this group.""" + + +STRUCTURED_FORMATS = ("json", "json_pp", "json_lines", "yaml") + +_EXT_TO_FORMAT = { + ".json": "json_pp", + ".jsonl": 
"json_lines", + ".yaml": "yaml", + ".yml": "yaml", +} + + +def _format_from_ext(path: str) -> str | None: + """Infer output format from file extension, or None if unrecognized.""" + ext = os.path.splitext(path)[1].lower() + return _EXT_TO_FORMAT.get(ext) def _collect_results( @@ -111,7 +148,9 @@ def validate_bids( "`dandi validate` instead. Proceeding to parse the call to `dandi validate` now.", DeprecationWarning, ) - ctx.invoke(validate, paths=paths, grouping=grouping) + ctx.invoke( + validate, paths=paths, grouping=(grouping,) if grouping != "none" else () + ) @click.command() @@ -125,9 +164,13 @@ def validate_bids( @click.option( "--grouping", "-g", - help="How to group error/warning reporting.", - type=click.Choice(["none", "path"], case_sensitive=False), - default="none", + help="How to group output. Repeat for hierarchical nesting, e.g. -g severity -g id.", + type=click.Choice( + ["none", "path", "severity", "id", "validator", "standard", "dandiset"], + case_sensitive=False, + ), + multiple=True, + default=(), ) @click.option("--ignore", metavar="REGEX", help="Regex matching error IDs to ignore") @click.option( @@ -136,14 +179,57 @@ def validate_bids( type=click.Choice([i.name for i in Severity], case_sensitive=True), default="HINT", ) +@click.option( + "--format", + "-f", + "output_format", + help="Output format.", + type=click.Choice(["text", "json", "json_pp", "json_lines", "yaml"]), + default="text", +) +@click.option( + "--output", + "-o", + "output_file", + help="Write output to file instead of stdout. " + "Requires --format to be set to a structured format.", + type=click.Path(dir_okay=False, writable=True), + default=None, +) +@click.option( + "--summary/--no-summary", + help="Show summary statistics.", + default=False, +) +@click.option( + "--max-per-group", + type=int, + default=None, + help="Limit results per group (or total if ungrouped). 
" + "Excess results are replaced by a count of omitted items.", +) +@click.option( + "--load", + help="Load validation results from JSONL file(s) instead of running validation.", + type=click.Path(exists=True, dir_okay=False), + multiple=True, + default=(), +) @click.argument("paths", nargs=-1, type=click.Path(exists=True, dir_okay=True)) +@click.pass_context @devel_debug_option() @map_to_click_exceptions def validate( + ctx: click.Context, paths: tuple[str, ...], ignore: str | None, - grouping: str, + grouping: tuple[str, ...], min_severity: str, + output_format: str = "text", + output_file: str | None = None, + summary: bool = False, + max_per_group: int | None = None, + load: tuple[str, ...] = (), schema: str | None = None, devel_debug: bool = False, allow_any_path: bool = False, @@ -151,52 +237,375 @@ def validate( """Validate files for data standards compliance. Exits with non-0 exit code if any file is not compliant. + + Validation results are automatically saved as a JSONL companion next to the + dandi-cli log file (unless --output is used or --load is active). Use + ``dandi validate --load `` to re-render saved results later with + different grouping, filtering, or format options. """ - results = _collect_results(paths, schema, devel_debug, allow_any_path) + # Normalize grouping: strip "none" values + grouping = tuple(g for g in grouping if g != "none") + + # Auto-detect format from output file extension when --format not given + if output_file is not None and output_format == "text": + detected = _format_from_ext(output_file) + if detected is None: + raise click.UsageError( + "--output requires --format to be set to a structured format " + "(json, json_pp, json_lines, yaml), or use a recognized " + "extension (.json, .jsonl, .yaml, .yml)." 
+ ) + output_format = detected + + # JSONL is incompatible with grouping (flat format, no nesting) + if grouping and output_format == "json_lines": + raise click.UsageError( + "--grouping is incompatible with json_lines format " + "(JSONL is a flat format that cannot represent nested groups)." + ) + + if load and paths: + raise click.UsageError("--load and positional paths are mutually exclusive.") + + if load: + results = load_validation_jsonl(load) + else: + results = _collect_results(paths, schema, devel_debug, allow_any_path) + # Auto-save companion right after collection, before filtering — so + # all results are preserved regardless of display filters. + # Skip when writing to --output (user already gets structured output). + if ( + not output_file + and results + and (obj := getattr(ctx, "obj", None)) is not None + ): + _auto_save_companion(results, obj.logfile) + filtered = _filter_results(results, min_severity, ignore) - _process_issues(filtered, grouping) + if output_file is not None: + with open(output_file, "w") as fh: + _render(filtered, output_format, fh, grouping, max_per_group=max_per_group) + lgr.info("Validation output written to %s", output_file) + if summary: + _print_summary(filtered, sys.stderr) + else: + _render( + filtered, output_format, sys.stdout, grouping, max_per_group=max_per_group + ) + if summary: + summary_out = sys.stdout if output_format == "text" else sys.stderr + _print_summary(filtered, summary_out) -def _process_issues( + _exit_if_errors(filtered) + + +def _auto_save_companion(results: list[ValidationResult], logfile: str) -> None: + """Write validation companion JSONL next to the logfile.""" + companion = validation_companion_path(logfile) + write_validation_jsonl(results, companion) + lgr.info("Validation companion saved to %s", companion) + + +def _print_summary(results: list[ValidationResult], out: IO[str]) -> None: + """Print summary statistics about validation results.""" + from collections import Counter + + total = 
len(results) + print("\n--- Validation Summary ---", file=out) + print(f"Total issues: {total}", file=out) + if not total: + return + + severity_counts = Counter( + r.severity.name if r.severity is not None else "NONE" for r in results + ) + print("By severity:", file=out) + for sev in ("CRITICAL", "ERROR", "WARNING", "HINT", "INFO"): + if sev in severity_counts: + print(f" {sev}: {severity_counts[sev]}", file=out) + + validator_counts = Counter(r.origin.validator.value for r in results) + if validator_counts: + print("By validator:", file=out) + for validator, count in validator_counts.most_common(): + print(f" {validator}: {count}", file=out) + + standard_counts = Counter( + r.origin.standard.value if r.origin.standard is not None else "N/A" + for r in results + ) + if standard_counts: + print("By standard:", file=out) + for standard, count in standard_counts.most_common(): + print(f" {standard}: {count}", file=out) + + +def _get_formatter( + output_format: str, out: IO[str] | None = None +) -> JSONFormatter | JSONLinesFormatter | TextFormatter | YAMLFormatter: + """Create a formatter for the given output format.""" + match output_format: + case "text": + return TextFormatter(out=out) + case "json": + return JSONFormatter(out=out) + case "json_pp": + return JSONFormatter(indent=2, out=out) + case "json_lines": + return JSONLinesFormatter(out=out) + case "yaml": + return YAMLFormatter(out=out) + case _: + raise ValueError(f"Unknown format: {output_format}") + + +def _render( + results: list[ValidationResult], + output_format: str, + out: IO[str], + grouping: tuple[str, ...] = (), + max_per_group: int | None = None, +) -> None: + """Render validation results in the given format. + + Handles both text and structured (JSON/JSONL/YAML) formats, with + optional grouping and truncation. 
+ """ + is_text = output_format == "text" + + if grouping: + grouped: GroupedResults | TruncatedResults = _group_results(results, grouping) + if max_per_group is not None: + grouped = _truncate_leaves(grouped, max_per_group) + if is_text: + # Text grouped output uses colored section headers + if grouping == ("path",): + # Legacy path grouping: per-path display_errors + purviews = list(set(i.purview for i in results)) + for purview in purviews: + applies_to = [i for i in results if purview == i.purview] + display_errors( + [purview], + [i.id for i in applies_to], + cast("list[Severity]", [i.severity for i in applies_to]), + [i.message for i in applies_to], + ) + else: + _render_text_grouped(grouped, depth=0) + if not any( + r.severity is not None and r.severity >= Severity.ERROR for r in results + ): + click.secho("No errors found.", fg="green") + else: + # Structured grouped output: nested dict + data = _serialize_grouped(grouped) + if output_format in ("json", "json_pp"): + indent = 2 if output_format == "json_pp" else None + json_mod.dump(data, out, indent=indent, sort_keys=True, default=str) + out.write("\n") + elif output_format == "yaml": + import ruamel.yaml + + yaml = ruamel.yaml.YAML(typ="safe") + yaml.default_flow_style = False + yaml.dump(data, out) + else: + raise ValueError( + f"Unsupported format for grouped output: {output_format}" + ) + else: + # Ungrouped: use formatter per-record + if is_text: + shown = results + omitted = 0 + if max_per_group is not None and len(results) > max_per_group: + shown = results[:max_per_group] + omitted = len(results) - max_per_group + formatter = _get_formatter(output_format, out=out) + with formatter: + for r in shown: + formatter(r) + if omitted: + click.secho(f"... 
and {pluralize(omitted, 'more issue')}", fg="cyan") + else: + items: list[dict] = [r.model_dump(mode="json") for r in results] + if max_per_group is not None and len(items) > max_per_group: + items = items[:max_per_group] + items.append( + {"_truncated": True, "omitted_count": len(results) - max_per_group} + ) + formatter = _get_formatter(output_format, out=out) + with formatter: + for item in items: + formatter(item) + + +def _exit_if_errors(results: list[ValidationResult]) -> None: + """Raise SystemExit(1) if any result has severity >= ERROR.""" + if any(r.severity is not None and r.severity >= Severity.ERROR for r in results): + raise SystemExit(1) + + +def _group_key(issue: ValidationResult, grouping: str) -> str: + """Extract the grouping key from a ValidationResult.""" + match grouping: + case "path": + return issue.purview or "(no path)" + case "severity": + return issue.severity.name if issue.severity is not None else "NONE" + case "id": + return issue.id + case "validator": + return issue.origin.validator.value + case "standard": + return issue.origin.standard.value if issue.origin.standard else "N/A" + case "dandiset": + return str(issue.dandiset_path) if issue.dandiset_path else "(no dandiset)" + case _: + raise NotImplementedError(f"Unsupported grouping: {grouping}") + + +# Recursive grouped type: either a nested OrderedDict or leaf list +GroupedResults = Union["OrderedDict[str, GroupedResults]", list[ValidationResult]] + +# Leaf items after possible truncation +LeafItem = Union[ValidationResult, TruncationNotice] +TruncatedResults = Union["OrderedDict[str, TruncatedResults]", list[LeafItem]] + + +def _group_results( + results: list[ValidationResult], + levels: tuple[str, ...], +) -> GroupedResults: + """Group results recursively by the given hierarchy of grouping levels. + + Returns a nested OrderedDict with leaf values as lists of ValidationResult. + With zero levels, returns the flat list unchanged. 
+ """ + if not levels: + return results + key_fn = levels[0] + remaining = levels[1:] + groups: OrderedDict[str, list[ValidationResult]] = OrderedDict() + for r in results: + k = _group_key(r, key_fn) + groups.setdefault(k, []).append(r) + if remaining: + return OrderedDict((k, _group_results(v, remaining)) for k, v in groups.items()) + # mypy can't resolve the recursive type alias, but this is correct: + # OrderedDict[str, list[VR]] is a valid GroupedResults + return cast("GroupedResults", groups) + + +def _truncate_leaves( + grouped: GroupedResults | TruncatedResults, max_per_group: int +) -> TruncatedResults: + """Truncate leaf lists to *max_per_group* items, appending a TruncationNotice.""" + if isinstance(grouped, list): + if len(grouped) > max_per_group: + kept: list[LeafItem] = list(grouped[:max_per_group]) + kept.append(TruncationNotice(len(grouped) - max_per_group)) + return kept + return cast("TruncatedResults", grouped) + return OrderedDict( + (k, _truncate_leaves(v, max_per_group)) for k, v in grouped.items() + ) + + +def _serialize_grouped(grouped: GroupedResults | TruncatedResults) -> dict | list: + """Convert grouped results to a JSON-serializable nested dict/list.""" + if isinstance(grouped, list): + result: list[dict] = [] + for item in grouped: + if isinstance(item, TruncationNotice): + result.append({"_truncated": True, "omitted_count": item.omitted_count}) + else: + result.append(item.model_dump(mode="json")) + return result + return {k: _serialize_grouped(v) for k, v in grouped.items()} + + +def _render_text( issues: list[ValidationResult], - grouping: str, + grouping: tuple[str, ...], + max_per_group: int | None = None, ) -> None: - purviews = [i.purview for i in issues] - if grouping == "none": - display_errors( - purviews, - [i.id for i in issues], - cast("list[Severity]", [i.severity for i in issues]), - [i.message for i in issues], + """Render validation results in colored text format. 
+ + Thin wrapper around ``_render`` for backwards compatibility with tests + and ``_process_issues``. + """ + _render(issues, "text", sys.stdout, grouping, max_per_group=max_per_group) + + +def _count_leaves(grouped: GroupedResults | TruncatedResults) -> int: + """Count total items in a grouped structure (including omitted counts).""" + if isinstance(grouped, list): + return sum( + item.omitted_count if isinstance(item, TruncationNotice) else 1 + for item in grouped ) - elif grouping == "path": - # The purviews are the paths, if we group by path, we need to de-duplicate. - # typing complains if we just take the set, though the code works otherwise. - purviews = list(set(purviews)) - for purview in purviews: - applies_to = [i for i in issues if purview == i.purview] - display_errors( - [purview], - [i.id for i in applies_to], - cast("list[Severity]", [i.severity for i in applies_to]), - [i.message for i in applies_to], + return sum(_count_leaves(v) for v in grouped.values()) + + +def _render_text_grouped( + grouped: GroupedResults | TruncatedResults, + depth: int, +) -> None: + """Recursively render grouped results with nested indented section headers.""" + indent = " " * depth + if isinstance(grouped, list): + # Leaf level: render individual issues + for issue in grouped: + if isinstance(issue, TruncationNotice): + click.secho( + f"{indent}... and {pluralize(issue.omitted_count, 'more issue')}", + fg="cyan", + ) + continue + msg = f"{indent}[{issue.id}] {issue.purview} — {issue.message}" + fg = _get_severity_color( + [issue.severity] if issue.severity is not None else [] ) + click.secho(msg, fg=fg) else: - raise NotImplementedError( - "The `grouping` parameter values currently supported are 'path' and" - " 'none'." 
- ) + for key, value in grouped.items(): + count = _count_leaves(value) + header = f"{indent}=== {key} ({pluralize(count, 'issue')}) ===" + # Determine color from all issues in this group + all_issues = _collect_all_issues(value) + fg = _get_severity_color( + cast( + "list[Severity]", + [i.severity for i in all_issues if i.severity is not None], + ) + ) + click.secho(header, fg=fg, bold=True) + _render_text_grouped(value, depth + 1) - if ( - max( - (i.severity for i in issues if i.severity is not None), - default=Severity.INFO, - ) - >= Severity.ERROR - ): - raise SystemExit(1) - else: - click.secho("No errors found.", fg="green") + +def _collect_all_issues( + grouped: GroupedResults | TruncatedResults, +) -> list[ValidationResult]: + """Flatten a grouped structure into a list of all ValidationResults.""" + if isinstance(grouped, list): + return [item for item in grouped if isinstance(item, ValidationResult)] + result: list[ValidationResult] = [] + for v in grouped.values(): + result.extend(_collect_all_issues(v)) + return result + + +def _process_issues( + issues: list[ValidationResult], + grouping: str | tuple[str, ...], +) -> None: + """Legacy wrapper: render text output and exit if errors.""" + if isinstance(grouping, str): + grouping = (grouping,) if grouping != "none" else () + _render_text(issues, grouping) + _exit_if_errors(issues) def _get_severity_color(severities: list[Severity]) -> str: diff --git a/dandi/cli/formatter.py b/dandi/cli/formatter.py index 9bf71db13..060ffb213 100644 --- a/dandi/cli/formatter.py +++ b/dandi/cli/formatter.py @@ -3,10 +3,12 @@ import sys from textwrap import indent +import click import ruamel.yaml from .. import get_logger from ..support import pyout as pyouts +from ..validate._types import Severity lgr = get_logger() @@ -89,6 +91,40 @@ def __call__(self, rec): self.records.append(rec) +class TextFormatter(Formatter): + """Render validation results as colored text lines. 
+ + Unlike other formatters which receive dicts, this receives + ``ValidationResult`` objects directly (needs ``.purview``, ``.severity``). + """ + + def __init__(self, out=None): + self.out = out or sys.stdout + self._has_errors = False + + def __exit__(self, exc_type, exc_value, traceback): + if not self._has_errors: + click.secho("No errors found.", fg="green", file=self.out) + + def __call__(self, rec): + + severity = rec.severity + purview = rec.purview + msg = f"[{rec.id}] {purview} — {rec.message}" + if severity is not None and severity >= Severity.ERROR: + self._has_errors = True + if severity is not None: + if severity >= Severity.ERROR: + fg = "red" + elif severity >= Severity.WARNING: + fg = "yellow" + else: + fg = "blue" + else: + fg = "blue" + click.secho(msg, fg=fg, file=self.out) + + class PYOUTFormatter(pyouts.LogSafeTabular): def __init__(self, fields, **kwargs): PYOUT_STYLE = pyouts.get_style(hide_if_missing=not fields) diff --git a/dandi/cli/tests/test_cmd_validate.py b/dandi/cli/tests/test_cmd_validate.py index 978bd7e91..06b587d93 100644 --- a/dandi/cli/tests/test_cmd_validate.py +++ b/dandi/cli/tests/test_cmd_validate.py @@ -1,11 +1,24 @@ +import json from pathlib import Path +from typing import cast from click.testing import CliRunner import pytest +import ruamel.yaml -from ..cmd_validate import _process_issues, validate +from ..cmd_validate import ( + GroupedResults, + TruncationNotice, + _group_results, + _process_issues, + _render_text, + _truncate_leaves, + validate, +) +from ..command import main from ...tests.xfail import mark_xfail_windows_python313_posixsubprocess -from ...validate.types import ( +from ...validate._io import load_validation_jsonl +from ...validate._types import ( Origin, OriginType, Scope, @@ -15,6 +28,15 @@ ) +@pytest.fixture +def redirected_logdir(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Path: + """Redirect dandi-cli log directory to tmp_path/logs and return the path.""" + logdir = tmp_path / "logs" + 
logdir.mkdir() + monkeypatch.setattr("platformdirs.user_log_dir", lambda *a, **kw: str(logdir)) + return logdir + + @pytest.mark.parametrize( "ds_name, expected_err_location", [ @@ -149,3 +171,728 @@ def test_validate_bids_error_grouping_notification( # Does it notify the user correctly? notification_substring = "Invalid value for '--grouping'" assert notification_substring in r.output + + +@pytest.mark.ai_generated +def test_validate_format_json(simple2_nwb: Path) -> None: + """Test --format json outputs a valid JSON array.""" + r = CliRunner().invoke(validate, ["-f", "json", str(simple2_nwb)]) + assert r.exit_code == 1 # NO_DANDISET_FOUND is an error + data = json.loads(r.output) + assert isinstance(data, list) + assert len(data) >= 1 + # Check structure of first result + rec = data[0] + assert "id" in rec + assert "origin" in rec + assert "severity" in rec + assert "record_version" in rec + + +@pytest.mark.ai_generated +def test_validate_format_json_lines(simple2_nwb: Path) -> None: + """Test --format json_lines outputs one JSON object per line.""" + r = CliRunner().invoke(validate, ["-f", "json_lines", str(simple2_nwb)]) + assert r.exit_code == 1 + lines = [line for line in r.output.strip().split("\n") if line.strip()] + assert len(lines) >= 1 + for line in lines: + rec = json.loads(line) + assert "id" in rec + assert "severity" in rec + assert "record_version" in rec + + +@pytest.mark.ai_generated +def test_validate_format_yaml(simple2_nwb: Path) -> None: + """Test --format yaml outputs valid YAML.""" + r = CliRunner().invoke(validate, ["-f", "yaml", str(simple2_nwb)]) + assert r.exit_code == 1 + yaml = ruamel.yaml.YAML(typ="safe") + data = yaml.load(r.output) + assert isinstance(data, list) + assert len(data) >= 1 + rec = data[0] + assert "id" in rec + assert "severity" in rec + assert "record_version" in rec + + +@pytest.mark.ai_generated +def test_validate_format_no_errors_no_message(tmp_path: Path) -> None: + """Structured formats should not emit 'No 
errors found.' text.""" + (tmp_path / "dandiset.yaml").write_text( + "identifier: 12346\nname: Foo\ndescription: Dandiset Foo\n" + ) + r = CliRunner().invoke(validate, ["-f", "json", str(tmp_path)]) + assert r.exit_code == 0 + assert "No errors found" not in r.output + data = json.loads(r.output) + assert data == [] + + +@pytest.mark.ai_generated +def test_validate_output_file(simple2_nwb: Path, tmp_path: Path) -> None: + """Test --output writes to file instead of stdout.""" + outfile = tmp_path / "results.jsonl" + r = CliRunner().invoke( + validate, + ["-f", "json_lines", "-o", str(outfile), str(simple2_nwb)], + ) + assert r.exit_code == 1 + assert outfile.exists() + lines = outfile.read_text().strip().split("\n") + assert len(lines) >= 1 + rec = json.loads(lines[0]) + assert "id" in rec + # stdout should be empty (output went to file) + assert r.output.strip() == "" + + +@pytest.mark.ai_generated +def test_validate_output_requires_format(simple2_nwb: Path, tmp_path: Path) -> None: + """Test --output with unrecognized extension and no --format gives error.""" + outfile = tmp_path / "results.txt" + r = CliRunner().invoke( + validate, + ["-o", str(outfile), str(simple2_nwb)], + ) + assert r.exit_code != 0 + assert "--output requires --format" in r.output + + +@pytest.mark.ai_generated +def test_validate_output_auto_format(simple2_nwb: Path, tmp_path: Path) -> None: + """Test --output auto-detects format from file extension.""" + outfile = tmp_path / "results.jsonl" + r = CliRunner().invoke( + validate, + ["-o", str(outfile), str(simple2_nwb)], + ) + assert r.exit_code == 1 # NO_DANDISET_FOUND + assert outfile.exists() + lines = outfile.read_text().strip().split("\n") + # Should be json_lines format (one JSON per line) + rec = json.loads(lines[0]) + assert "id" in rec + + # .json → json_pp (indented) + outjson = tmp_path / "results.json" + r = CliRunner().invoke( + validate, + ["-o", str(outjson), str(simple2_nwb)], + ) + assert r.exit_code == 1 + content = 
outjson.read_text() + data = json.loads(content) + assert isinstance(data, list) + # json_pp produces indented output + assert "\n " in content + + +@pytest.mark.ai_generated +def test_validate_summary_text(simple2_nwb: Path) -> None: + """Test --summary in text format shows statistics.""" + r = CliRunner().invoke(validate, ["--summary", str(simple2_nwb)]) + assert r.exit_code != 0 + assert "Validation Summary" in r.output + assert "Total issues:" in r.output + assert "By severity:" in r.output + + +@pytest.mark.ai_generated +def test_validate_load(simple2_nwb: Path, tmp_path: Path) -> None: + """Test --load reads results from a JSONL file.""" + # First, produce a JSONL file + outfile = tmp_path / "results.jsonl" + r = CliRunner().invoke( + validate, + ["-f", "json_lines", "-o", str(outfile), str(simple2_nwb)], + ) + assert r.exit_code == 1 + assert outfile.exists() + + # Now load it + r = CliRunner().invoke(validate, ["--load", str(outfile)]) + assert r.exit_code == 1 # loaded errors still produce exit 1 + assert "DANDI.NO_DANDISET_FOUND" in r.output + + +@pytest.mark.ai_generated +def test_validate_load_with_format(simple2_nwb: Path, tmp_path: Path) -> None: + """Test --load combined with --format.""" + outfile = tmp_path / "results.jsonl" + CliRunner().invoke( + validate, + ["-f", "json_lines", "-o", str(outfile), str(simple2_nwb)], + ) + assert outfile.exists() + + r = CliRunner().invoke(validate, ["--load", str(outfile), "-f", "json"]) + assert r.exit_code == 1 + data = json.loads(r.output) + assert isinstance(data, list) + assert len(data) >= 1 + + +@pytest.mark.ai_generated +def test_validate_load_mutual_exclusivity(simple2_nwb: Path, tmp_path: Path) -> None: + """Test --load and paths are mutually exclusive.""" + outfile = tmp_path / "dummy.jsonl" + outfile.write_text("") + r = CliRunner().invoke(validate, ["--load", str(outfile), str(simple2_nwb)]) + assert r.exit_code != 0 + assert "mutually exclusive" in r.output + + +@pytest.mark.ai_generated 
+@pytest.mark.parametrize( + "grouping", + ["severity", "id", "validator", "standard", "dandiset"], +) +def test_render_text_grouping(grouping: str, capsys: pytest.CaptureFixture) -> None: + """Test extended grouping renders section headers with counts.""" + origin = Origin( + type=OriginType.VALIDATION, + validator=Validator.nwbinspector, + validator_version="", + ) + issues = [ + ValidationResult( + id="NWBI.check_data_orientation", + origin=origin, + scope=Scope.FILE, + message="Data may be in the wrong orientation.", + path=Path("sub-01/sub-01.nwb"), + severity=Severity.WARNING, + dandiset_path=Path("/data/ds001"), + ), + ValidationResult( + id="NWBI.check_missing_unit", + origin=origin, + scope=Scope.FILE, + message="Missing text for attribute 'unit'.", + path=Path("sub-02/sub-02.nwb"), + severity=Severity.WARNING, + dandiset_path=Path("/data/ds001"), + ), + ] + _render_text(issues, grouping=(grouping,)) + captured = capsys.readouterr().out + + # Section headers with "===" must appear + assert "===" in captured + # Both issues should be rendered + assert "NWBI.check_data_orientation" in captured + assert "NWBI.check_missing_unit" in captured + # Issue counts in headers + assert "issue" in captured + + +def _grouping_opts(*groupings: str) -> list[str]: + """Build CLI args for one or more -g options.""" + opts: list[str] = [] + for g in groupings: + opts.extend(["-g", g]) + return opts + + +# simple2_nwb always produces at least this ERROR (not inside a dandiset) +_SIMPLE2_EXPECTED_ID = "DANDI.NO_DANDISET_FOUND" + +# Single and composite grouping specs for parametrized tests. +# Each entry is a tuple of grouping levels. 
+_GROUPING_SPECS: list[tuple[str, ...]] = [ + ("severity",), + ("id",), + ("validator",), + ("standard",), + ("dandiset",), + ("severity", "id"), + ("validator", "severity"), + ("id", "validator"), +] + + +@pytest.mark.ai_generated +@pytest.mark.parametrize("grouping", _GROUPING_SPECS, ids=lambda g: "+".join(g)) +def test_validate_grouping_text_cli( + grouping: tuple[str, ...], simple2_nwb: Path +) -> None: + """Each grouping spec (single or composite) produces section headers and known issues.""" + r = CliRunner().invoke(validate, [*_grouping_opts(*grouping), str(simple2_nwb)]) + assert r.exit_code != 0 + assert "===" in r.output + # Both known issues must appear somewhere in output + assert _SIMPLE2_EXPECTED_ID in r.output + # Composite groupings must produce nested (indented) headers + if len(grouping) > 1: + lines = r.output.split("\n") + inner = [ln for ln in lines if "===" in ln and ln.startswith(" ")] + assert inner, "composite grouping should produce nested headers" + + +@pytest.mark.ai_generated +def test_render_text_multilevel_grouping(capsys: pytest.CaptureFixture) -> None: + """Test multi-level grouping renders nested section headers.""" + origin = Origin( + type=OriginType.VALIDATION, + validator=Validator.nwbinspector, + validator_version="", + ) + issues = [ + ValidationResult( + id="NWBI.check_data_orientation", + origin=origin, + scope=Scope.FILE, + message="Data may be in the wrong orientation.", + path=Path("sub-01/sub-01.nwb"), + severity=Severity.WARNING, + dandiset_path=Path("/data/ds001"), + ), + ValidationResult( + id="NWBI.check_missing_unit", + origin=origin, + scope=Scope.FILE, + message="Missing text for attribute 'unit'.", + path=Path("sub-02/sub-02.nwb"), + severity=Severity.WARNING, + dandiset_path=Path("/data/ds001"), + ), + ValidationResult( + id="NWBI.check_data_orientation", + origin=origin, + scope=Scope.FILE, + message="Data may be in the wrong orientation.", + path=Path("sub-03/sub-03.nwb"), + severity=Severity.ERROR, + 
dandiset_path=Path("/data/ds001"), + ), + ] + _render_text(issues, grouping=("severity", "id")) + captured = capsys.readouterr().out + + assert "=== WARNING" in captured + assert "=== ERROR" in captured + assert "=== NWBI.check_data_orientation" in captured + assert "=== NWBI.check_missing_unit" in captured + lines = captured.split("\n") + inner_headers = [ln for ln in lines if "===" in ln and ln.startswith(" ")] + assert len(inner_headers) >= 2 + + +@pytest.mark.ai_generated +@pytest.mark.parametrize( + "grouping", + [("path",), *_GROUPING_SPECS], + ids=lambda g: "+".join(g), +) +def test_validate_grouping_json_cli( + grouping: tuple[str, ...], simple2_nwb: Path +) -> None: + """Each grouping spec produces a (nested) dict in JSON output with known issues.""" + r = CliRunner().invoke( + validate, [*_grouping_opts(*grouping), "-f", "json_pp", str(simple2_nwb)] + ) + assert r.exit_code == 1 + data = json.loads(r.output) + assert isinstance(data, dict) + assert len(data) >= 1 + # For composite groupings, values are nested dicts + if len(grouping) > 1: + for v in data.values(): + assert isinstance(v, dict) + else: + for v in data.values(): + assert isinstance(v, list) + + +@pytest.mark.ai_generated +def test_validate_grouping_yaml_cli(simple2_nwb: Path) -> None: + """Grouped YAML output is a dict keyed by grouping values.""" + r = CliRunner().invoke( + validate, [*_grouping_opts("severity"), "-f", "yaml", str(simple2_nwb)] + ) + assert r.exit_code == 1 + yaml = ruamel.yaml.YAML(typ="safe") + data = yaml.load(r.output) + assert isinstance(data, dict) + assert "ERROR" in data + + +@pytest.mark.ai_generated +def test_validate_grouping_jsonl_error(simple2_nwb: Path) -> None: + """Grouping is incompatible with json_lines format.""" + r = CliRunner().invoke( + validate, [*_grouping_opts("severity"), "-f", "json_lines", str(simple2_nwb)] + ) + assert r.exit_code != 0 + assert "incompatible" in r.output + + +@pytest.mark.ai_generated +def 
test_validate_grouping_none_explicit(simple2_nwb: Path) -> None: + """-g none is treated as no grouping.""" + r = CliRunner().invoke(validate, [*_grouping_opts("none"), str(simple2_nwb)]) + assert r.exit_code != 0 + assert "===" not in r.output + + +@pytest.mark.ai_generated +@pytest.mark.parametrize( + "grouping", + [("severity",), ("id", "validator")], + ids=lambda g: "+".join(g), +) +def test_validate_load_with_grouping( + grouping: tuple[str, ...], simple2_nwb: Path, tmp_path: Path +) -> None: + """--load combined with single or composite --grouping works.""" + outfile = tmp_path / "results.jsonl" + CliRunner().invoke( + validate, ["-f", "json_lines", "-o", str(outfile), str(simple2_nwb)] + ) + assert outfile.exists() + + r = CliRunner().invoke( + validate, ["--load", str(outfile), *_grouping_opts(*grouping)] + ) + assert r.exit_code == 1 + assert "===" in r.output + assert _SIMPLE2_EXPECTED_ID in r.output + + +@pytest.mark.ai_generated +@pytest.mark.parametrize( + "grouping", + [("severity",), ("validator", "id")], + ids=lambda g: "+".join(g), +) +def test_validate_grouping_output_file( + grouping: tuple[str, ...], simple2_nwb: Path, tmp_path: Path +) -> None: + """--grouping with --output writes grouped JSON to file.""" + outfile = tmp_path / "grouped.json" + r = CliRunner().invoke( + validate, + [*_grouping_opts(*grouping), "-o", str(outfile), str(simple2_nwb)], + ) + assert r.exit_code == 1 + data = json.loads(outfile.read_text()) + assert isinstance(data, dict) + + +@pytest.mark.ai_generated +def test_group_results_unit() -> None: + """Unit test for _group_results with multiple levels.""" + from collections import OrderedDict + + origin = Origin( + type=OriginType.VALIDATION, + validator=Validator.nwbinspector, + validator_version="", + ) + issues = [ + ValidationResult( + id="A.one", + origin=origin, + scope=Scope.FILE, + message="msg1", + path=Path("f1.nwb"), + severity=Severity.ERROR, + ), + ValidationResult( + id="A.two", + origin=origin, + 
scope=Scope.FILE, + message="msg2", + path=Path("f2.nwb"), + severity=Severity.WARNING, + ), + ValidationResult( + id="A.one", + origin=origin, + scope=Scope.FILE, + message="msg3", + path=Path("f3.nwb"), + severity=Severity.ERROR, + ), + ] + + # Zero levels: returns flat list + result = _group_results(issues, ()) + assert result is issues + + # One level + result = _group_results(issues, ("severity",)) + assert isinstance(result, OrderedDict) + assert "ERROR" in result + assert "WARNING" in result + assert len(result["ERROR"]) == 2 + assert len(result["WARNING"]) == 1 + + # Two levels + result2 = _group_results(issues, ("severity", "id")) + assert isinstance(result2, OrderedDict) + error_group = result2["ERROR"] + assert isinstance(error_group, OrderedDict) + assert "A.one" in error_group + assert len(error_group["A.one"]) == 2 + warning_group = result2["WARNING"] + assert isinstance(warning_group, OrderedDict) + assert "A.two" in warning_group + assert len(warning_group["A.two"]) == 1 + + +def _make_jsonl(tmp_path: Path, n: int = 5) -> Path: + """Write *n* synthetic validation results to a JSONL file and return its path.""" + outfile = tmp_path / "results.jsonl" + lines = [] + for i in range(n): + sev = "ERROR" if i % 2 == 0 else "WARNING" + rec = { + "id": f"TEST.issue_{i}", + "origin": { + "type": "VALIDATION", + "validator": "nwbinspector", + "validator_version": "", + }, + "scope": "file", + "severity": sev, + "path": f"sub-{i:02d}/sub-{i:02d}.nwb", + "message": f"Synthetic issue number {i}", + "record_version": "1", + } + lines.append(json.dumps(rec)) + outfile.write_text("\n".join(lines) + "\n") + return outfile + + +@pytest.mark.ai_generated +def test_max_per_group_flat(tmp_path: Path) -> None: + """--max-per-group without grouping truncates the flat list.""" + jsonl = _make_jsonl(tmp_path, n=5) + r = CliRunner().invoke(validate, ["--load", str(jsonl), "--max-per-group", "2"]) + assert r.exit_code == 1 + assert "3 more issues" in r.output + # Only 2 real 
issues should be listed + assert "TEST.issue_0" in r.output + assert "TEST.issue_1" in r.output + assert "TEST.issue_4" not in r.output + + +@pytest.mark.ai_generated +def test_max_per_group_with_grouping(tmp_path: Path) -> None: + """-g severity --max-per-group 1 truncates each severity group independently.""" + jsonl = _make_jsonl(tmp_path, n=6) + r = CliRunner().invoke( + validate, + ["--load", str(jsonl), "-g", "severity", "--max-per-group", "1"], + ) + assert r.exit_code == 1 + # Each group should show "more issue(s)" + assert "more issue" in r.output + # Headers should reflect original counts (including omitted) + assert "=== ERROR" in r.output + assert "=== WARNING" in r.output + + +@pytest.mark.ai_generated +def test_max_per_group_json(tmp_path: Path) -> None: + """-f json_pp --max-per-group 2 emits _truncated placeholder in JSON.""" + jsonl = _make_jsonl(tmp_path, n=5) + r = CliRunner().invoke( + validate, + ["--load", str(jsonl), "-f", "json_pp", "--max-per-group", "2"], + ) + assert r.exit_code == 1 # ERRORs in test data + data = json.loads(r.output) + assert isinstance(data, list) + # Last item should be a truncation notice + assert data[-1]["_truncated"] is True + assert data[-1]["omitted_count"] == 3 + # Only 2 real results before the notice + real = [d for d in data if "_truncated" not in d] + assert len(real) == 2 + + +@pytest.mark.ai_generated +def test_max_per_group_multilevel(tmp_path: Path) -> None: + """-g severity -g id --max-per-group 1 truncates only at leaf level.""" + jsonl = _make_jsonl(tmp_path, n=6) + r = CliRunner().invoke( + validate, + [ + "--load", + str(jsonl), + "-g", + "severity", + "-g", + "id", + "--max-per-group", + "1", + ], + ) + assert r.exit_code == 1 + # All severity groups should appear + assert "=== ERROR" in r.output + assert "=== WARNING" in r.output + # Each id within a severity gets at most 1 result — but since each + # synthetic issue has a unique id, each leaf has exactly 1 item, + # so no truncation notice is 
expected for unique-id leaves. + # Verify structure is intact. + assert "TEST.issue_0" in r.output + + +@pytest.mark.ai_generated +def test_max_per_group_no_truncation(tmp_path: Path) -> None: + """--max-per-group larger than result count produces no placeholder.""" + jsonl = _make_jsonl(tmp_path, n=3) + r = CliRunner().invoke(validate, ["--load", str(jsonl), "--max-per-group", "100"]) + assert r.exit_code == 1 + assert "more issue" not in r.output + # All issues present + assert "TEST.issue_0" in r.output + assert "TEST.issue_1" in r.output + assert "TEST.issue_2" in r.output + + +@pytest.mark.ai_generated +def test_max_per_group_json_grouped(tmp_path: Path) -> None: + """-g severity -f json_pp --max-per-group 1 emits _truncated in grouped JSON.""" + jsonl = _make_jsonl(tmp_path, n=6) + r = CliRunner().invoke( + validate, + [ + "--load", + str(jsonl), + "-g", + "severity", + "-f", + "json_pp", + "--max-per-group", + "1", + ], + ) + data = json.loads(r.output) + assert isinstance(data, dict) + # Each severity group should have a truncation notice if it has > 1 item + for sev_key, items in data.items(): + assert isinstance(items, list) + truncated = [i for i in items if isinstance(i, dict) and i.get("_truncated")] + if len(items) > 1: + # At least one truncation notice + assert len(truncated) >= 1 + + +@pytest.mark.ai_generated +def test_truncate_leaves_unit() -> None: + """Unit test for _truncate_leaves helper.""" + from collections import OrderedDict + + origin = Origin( + type=OriginType.VALIDATION, + validator=Validator.nwbinspector, + validator_version="", + ) + issues = [ + ValidationResult( + id=f"T.{i}", + origin=origin, + scope=Scope.FILE, + message=f"msg{i}", + path=Path(f"f{i}.nwb"), + severity=Severity.ERROR, + ) + for i in range(5) + ] + + # Flat list truncation + truncated = _truncate_leaves(issues, 2) + assert isinstance(truncated, list) + assert len(truncated) == 3 # 2 results + 1 notice + assert isinstance(truncated[-1], TruncationNotice) + assert 
truncated[-1].omitted_count == 3 + + # Nested dict truncation + grouped: GroupedResults = cast( + "GroupedResults", OrderedDict([("A", issues[:3]), ("B", issues[3:])]) + ) + truncated_grouped = _truncate_leaves(grouped, 1) + assert isinstance(truncated_grouped, OrderedDict) + a_items = truncated_grouped["A"] + assert isinstance(a_items, list) + assert len(a_items) == 2 # 1 result + 1 notice + assert isinstance(a_items[-1], TruncationNotice) + assert a_items[-1].omitted_count == 2 + # B has 2 items → truncated to 1 + notice + b_items = truncated_grouped["B"] + assert isinstance(b_items, list) + assert len(b_items) == 2 + assert isinstance(b_items[-1], TruncationNotice) + + # No truncation when under limit + no_trunc = _truncate_leaves(issues, 100) + assert no_trunc is issues + + +@pytest.mark.ai_generated +def test_validate_auto_companion_text( + simple2_nwb: Path, redirected_logdir: Path +) -> None: + """Default text-format validate auto-saves companion next to log file.""" + + r = CliRunner().invoke(main, ["validate", str(simple2_nwb)]) + assert r.exit_code == 1 # NO_DANDISET_FOUND + + assert len(companions := list(redirected_logdir.glob("*_validation.jsonl"))) == 1 + + # Verify content is loadable + assert load_validation_jsonl([companions[0]]) + + +@pytest.mark.ai_generated +def test_validate_auto_companion_skipped_with_output( + simple2_nwb: Path, tmp_path: Path, redirected_logdir: Path +) -> None: + """--output suppresses auto-save companion.""" + outfile = tmp_path / "results.jsonl" + r = CliRunner().invoke(main, ["validate", "-o", str(outfile), str(simple2_nwb)]) + assert r.exit_code == 1 + assert outfile.exists() + + assert not list(redirected_logdir.glob("*_validation.jsonl")) + + +@pytest.mark.ai_generated +def test_validate_auto_companion_skipped_with_load( + simple2_nwb: Path, tmp_path: Path, redirected_logdir: Path +) -> None: + """--load suppresses auto-save companion.""" + # First produce a JSONL to load + outfile = tmp_path / "input.jsonl" + r = 
CliRunner().invoke( + main, ["validate", "-f", "json_lines", "-o", str(outfile), str(simple2_nwb)] + ) + assert outfile.exists() + + # Clear any companions from first run + for s in redirected_logdir.glob("*_validation.jsonl"): + s.unlink() + + # Now --load it + r = CliRunner().invoke(main, ["validate", "--load", str(outfile)]) + assert r.exit_code == 1 + + assert not list(redirected_logdir.glob("*_validation.jsonl")) + + +@pytest.mark.ai_generated +def test_validate_auto_companion_structured_stdout( + simple2_nwb: Path, redirected_logdir: Path +) -> None: + """Structured format to stdout also auto-saves companion.""" + r = CliRunner().invoke(main, ["validate", "-f", "json", str(simple2_nwb)]) + assert r.exit_code == 1 + + assert len(list(redirected_logdir.glob("*_validation.jsonl"))) == 1 diff --git a/dandi/files/bases.py b/dandi/files/bases.py index b5be008bf..de52027f7 100644 --- a/dandi/files/bases.py +++ b/dandi/files/bases.py @@ -29,7 +29,7 @@ from dandi.metadata.core import get_default_metadata from dandi.misctypes import DUMMY_DANDI_ETAG, Digest, LocalReadableFile, P from dandi.utils import post_upload_size_check, pre_upload_size_check, yaml_load -from dandi.validate.types import ( +from dandi.validate._types import ( ORIGIN_INTERNAL_DANDI, ORIGIN_VALIDATION_DANDI, Origin, diff --git a/dandi/files/bids.py b/dandi/files/bids.py index bcda0f6d2..335f70f5d 100644 --- a/dandi/files/bids.py +++ b/dandi/files/bids.py @@ -16,7 +16,7 @@ from ..consts import ZARR_MIME_TYPE, dandiset_metadata_file from ..metadata.core import add_common_metadata, prepare_metadata from ..misctypes import Digest -from ..validate.types import ( +from ..validate._types import ( ORIGIN_VALIDATION_DANDI_LAYOUT, Scope, Severity, @@ -92,7 +92,7 @@ def _get_metadata(self) -> None: with self._lock: if self._asset_metadata is None: # Import here to avoid circular import - from dandi.validate.core import validate_bids + from dandi.validate._core import validate_bids # === Validate the dataset 
using bidsschematools === # This is done to obtain the metadata for each asset in the dataset diff --git a/dandi/files/zarr.py b/dandi/files/zarr.py index 805ce3a16..b9668e2b0 100644 --- a/dandi/files/zarr.py +++ b/dandi/files/zarr.py @@ -47,7 +47,7 @@ ) from .bases import LocalDirectoryAsset -from ..validate.types import ( +from ..validate._types import ( ORIGIN_VALIDATION_DANDI_ZARR, Origin, OriginType, diff --git a/dandi/organize.py b/dandi/organize.py index 4ba0047ba..4f67ac20b 100644 --- a/dandi/organize.py +++ b/dandi/organize.py @@ -43,7 +43,7 @@ pluralize, yaml_load, ) -from .validate.types import ( +from .validate._types import ( ORIGIN_VALIDATION_DANDI_LAYOUT, Scope, Severity, diff --git a/dandi/pynwb_utils.py b/dandi/pynwb_utils.py index 5010725e4..c496d2a55 100644 --- a/dandi/pynwb_utils.py +++ b/dandi/pynwb_utils.py @@ -42,7 +42,7 @@ ) from .misctypes import Readable from .utils import get_module_version, is_url -from .validate.types import ( +from .validate._types import ( Origin, OriginType, Scope, diff --git a/dandi/tests/test_bids_validator_deno/test_validator.py b/dandi/tests/test_bids_validator_deno/test_validator.py index 2782c42f2..6024adf33 100644 --- a/dandi/tests/test_bids_validator_deno/test_validator.py +++ b/dandi/tests/test_bids_validator_deno/test_validator.py @@ -27,7 +27,7 @@ ) from dandi.consts import dandiset_metadata_file from dandi.tests.fixtures import BIDS_TESTDATA_SELECTION -from dandi.validate.types import ( +from dandi.validate._types import ( OriginType, Scope, Severity, diff --git a/dandi/upload.py b/dandi/upload.py index 324e31129..541b1e622 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -49,7 +49,8 @@ from .support import pyout as pyouts from .support.pyout import naturalsize from .utils import ensure_datetime, path_is_subpath, pluralize -from .validate.types import Severity +from .validate._io import write_validation_jsonl +from .validate._types import Severity def _check_dandidownload_paths(dfile: DandiFile) -> 
None: @@ -112,6 +113,7 @@ def upload( jobs: int | None = None, jobs_per_file: int | None = None, sync: bool = False, + validation_log_path: str | Path | None = None, ) -> None: if paths: paths = [Path(p).absolute() for p in paths] @@ -300,6 +302,10 @@ def process_path(dfile: DandiFile) -> Iterator[dict]: ): yield {"status": "pre-validating"} validation_statuses = dfile.get_validation_errors() + if validation_log_path is not None and validation_statuses: + write_validation_jsonl( + validation_statuses, validation_log_path, append=True + ) validation_errors = [ s for s in validation_statuses diff --git a/dandi/validate/__init__.py b/dandi/validate/__init__.py index 659b81765..580022b36 100644 --- a/dandi/validate/__init__.py +++ b/dandi/validate/__init__.py @@ -6,17 +6,23 @@ - File layout and organization validation - Metadata completeness checking -Submodules: -- core: Main validation functions (validate, validate_bids) -- types: Data types and models (ValidationResult, Origin, Severity, etc.) -- io: JSONL read/write utilities for validation results +Private submodules: +- _core: Main validation functions (validate, validate_bids) +- _types: Data types and models (ValidationResult, Origin, Severity, etc.) +- _io: JSONL read/write utilities for validation results -Note: core is NOT eagerly imported here to avoid circular imports -(core → dandi.files → dandi.validate.types → dandi.validate.__init__). -Import from dandi.validate.core directly for validate/validate_bids. +Note: _core is NOT eagerly imported here to avoid circular imports +(_core → dandi.files → dandi.validate._types → dandi.validate.__init__). +Import from dandi.validate._core directly for validate/validate_bids. 
""" -from .types import ( +from ._io import ( + load_validation_jsonl, + validation_companion_path, + write_validation_jsonl, +) +from ._types import ( + CURRENT_RECORD_VERSION, ORIGIN_INTERNAL_DANDI, ORIGIN_VALIDATION_DANDI, ORIGIN_VALIDATION_DANDI_LAYOUT, @@ -32,6 +38,7 @@ ) __all__ = [ + "CURRENT_RECORD_VERSION", "ORIGIN_INTERNAL_DANDI", "ORIGIN_VALIDATION_DANDI", "ORIGIN_VALIDATION_DANDI_LAYOUT", @@ -44,4 +51,7 @@ "Standard", "ValidationResult", "Validator", + "load_validation_jsonl", + "validation_companion_path", + "write_validation_jsonl", ] diff --git a/dandi/validate/core.py b/dandi/validate/_core.py similarity index 99% rename from dandi/validate/core.py rename to dandi/validate/_core.py index f30cf95b5..b397e49c0 100644 --- a/dandi/validate/core.py +++ b/dandi/validate/_core.py @@ -13,7 +13,7 @@ import os from pathlib import Path -from .types import ( +from ._types import ( ORIGIN_VALIDATION_DANDI_LAYOUT, Origin, OriginType, diff --git a/dandi/validate/_io.py b/dandi/validate/_io.py new file mode 100644 index 000000000..025ca7801 --- /dev/null +++ b/dandi/validate/_io.py @@ -0,0 +1,85 @@ +"""I/O utilities for validation results in JSONL format. + +Provides functions for writing, appending, and loading validation results +as JSONL (JSON Lines) files — one ValidationResult per line. +""" + +from __future__ import annotations + +from collections.abc import Iterable +from pathlib import Path + +from ._types import ValidationResult + + +def write_validation_jsonl( + results: list[ValidationResult], + path: str | Path, + *, + append: bool = False, +) -> Path: + """Write validation results to a JSONL file. + + Parameters + ---------- + results + List of ValidationResult objects to write. + path + File path to write to. Created if it does not exist. + append + If True, append to an existing file instead of overwriting. + + Returns + ------- + Path + The path written to (as a Path object). 
+ """ + path = Path(path) + with path.open("a" if append else "w") as f: + for r in results: + f.write(r.model_dump_json()) + f.write("\n") + return path + + +def load_validation_jsonl(paths: Iterable[str | Path]) -> list[ValidationResult]: + """Load and concatenate validation results from one or more JSONL files. + + Parameters + ---------- + paths + Iterable of file paths to load from. + + Returns + ------- + list[ValidationResult] + All results from all files, in order. + """ + results: list[ValidationResult] = [] + for p in paths: + p = Path(p) + with p.open() as f: + for line in f: + if line := line.strip(): + results.append(ValidationResult.model_validate_json(line)) + return results + + +def validation_companion_path(logfile: str | Path) -> Path: + """Derive the validation companion path from a logfile path. + + The companion is placed next to the logfile with ``_validation.jsonl`` + appended to the stem. + + Parameters + ---------- + logfile + Path to the logfile. + + Returns + ------- + Path + Path to the companion file. 
+ """ + logfile = Path(logfile) + return logfile.parent / (logfile.stem + "_validation.jsonl") diff --git a/dandi/validate/types.py b/dandi/validate/_types.py similarity index 92% rename from dandi/validate/types.py rename to dandi/validate/_types.py index 8356a0d51..3b2fad046 100644 --- a/dandi/validate/types.py +++ b/dandi/validate/_types.py @@ -1,6 +1,7 @@ from __future__ import annotations from enum import Enum, IntEnum, auto, unique +import logging from pathlib import Path from typing import Annotated, Any, Union @@ -10,6 +11,8 @@ import dandi from dandi.utils import StrEnum +lgr = logging.getLogger(__name__) + @unique class Standard(StrEnum): @@ -180,7 +183,13 @@ class Scope(Enum): DATASET = "dataset" +CURRENT_RECORD_VERSION = "1" + + class ValidationResult(BaseModel): + record_version: str = CURRENT_RECORD_VERSION + """Version of the serialized record format for forward compatibility""" + id: str origin: Origin @@ -215,6 +224,14 @@ class ValidationResult(BaseModel): path: Path | None = None path_regex: str | None = None + def model_post_init(self, __context: Any) -> None: + if self.record_version != CURRENT_RECORD_VERSION: + lgr.warning( + "record_version %r != current %r, loading anyway", + self.record_version, + CURRENT_RECORD_VERSION, + ) + @property def purview(self) -> str | None: if self.path is not None: diff --git a/dandi/validate/tests/test_core.py b/dandi/validate/tests/test_core.py index 64c32090d..6c6d5f1d3 100644 --- a/dandi/validate/tests/test_core.py +++ b/dandi/validate/tests/test_core.py @@ -4,8 +4,8 @@ import pytest -from ..core import validate -from ..types import ( +from .._core import validate +from .._types import ( Origin, OriginType, Scope, diff --git a/dandi/validate/tests/test_io.py b/dandi/validate/tests/test_io.py new file mode 100644 index 000000000..d03f1968b --- /dev/null +++ b/dandi/validate/tests/test_io.py @@ -0,0 +1,116 @@ +from __future__ import annotations + +from pathlib import Path + +import pytest + +from 
from dandi.validate._io import (
    load_validation_jsonl,
    validation_companion_path,
    write_validation_jsonl,
)
from dandi.validate._types import (
    Origin,
    OriginType,
    Scope,
    Severity,
    ValidationResult,
    Validator,
)

FOO_ORIGIN = Origin(
    type=OriginType.INTERNAL,
    validator=Validator.dandi,
    validator_version="1.0.0",
)


def _make_result(id_: str, severity: Severity = Severity.WARNING) -> ValidationResult:
    """Build a minimal FILE-scope ValidationResult carrying *id_* and *severity*."""
    return ValidationResult(
        id=id_,
        origin=FOO_ORIGIN,
        scope=Scope.FILE,
        severity=severity,
        message=f"Message for {id_}",
        path=Path(f"/tmp/{id_}.nwb"),
    )


@pytest.mark.ai_generated
class TestWriteAndLoad:
    def test_round_trip(self, tmp_path: Path) -> None:
        """Results written to JSONL come back identical on load."""
        target = tmp_path / "results.jsonl"
        batch = [_make_result("A"), _make_result("B", Severity.ERROR)]
        written = write_validation_jsonl(batch, target)
        assert written == target
        assert target.exists()

        restored = load_validation_jsonl([target])
        assert [r.id for r in restored] == ["A", "B"]
        assert restored[1].severity == Severity.ERROR

    def test_append(self, tmp_path: Path) -> None:
        """append=True adds records after those already on disk."""
        target = tmp_path / "results.jsonl"
        write_validation_jsonl([_make_result("A")], target)
        write_validation_jsonl([_make_result("B")], target, append=True)

        restored = load_validation_jsonl([target])
        assert [r.id for r in restored] == ["A", "B"]

    def test_append_creates_file(self, tmp_path: Path) -> None:
        """append=True still creates a missing file."""
        target = tmp_path / "new.jsonl"
        write_validation_jsonl([_make_result("A")], target, append=True)
        assert target.exists()
        assert len(load_validation_jsonl([target])) == 1

    def test_empty_file(self, tmp_path: Path) -> None:
        """An empty JSONL file loads as an empty list."""
        target = tmp_path / "empty.jsonl"
        write_validation_jsonl([], target)
        assert load_validation_jsonl([target]) == []

    def test_multi_file_load(self, tmp_path: Path) -> None:
        """Records from several files are concatenated in argument order."""
        first = tmp_path / "a.jsonl"
        second = tmp_path / "b.jsonl"
        write_validation_jsonl([_make_result("A")], first)
        write_validation_jsonl([_make_result("B"), _make_result("C")], second)

        restored = load_validation_jsonl([first, second])
        assert [r.id for r in restored] == ["A", "B", "C"]

    def test_blank_lines_skipped(self, tmp_path: Path) -> None:
        """Blank lines in JSONL should be silently skipped."""
        target = tmp_path / "results.jsonl"
        write_validation_jsonl([_make_result("A")], target)
        # Simulate trailing blank lines left by hand edits or other tools.
        with target.open("a") as f:
            f.write("\n\n")
        assert len(load_validation_jsonl([target])) == 1


@pytest.mark.ai_generated
class TestCompanionPath:
    def test_derives_from_logfile(self) -> None:
        """Companion path is the logfile stem plus _validation.jsonl, same dir."""
        logfile = Path("/var/log/dandi/2026.03.19-12.00.00Z-12345.log")
        assert validation_companion_path(logfile) == Path(
            "/var/log/dandi/2026.03.19-12.00.00Z-12345_validation.jsonl"
        )

    def test_string_input(self) -> None:
        """A plain string path is accepted as well as a Path."""
        assert validation_companion_path("/tmp/test.log") == Path(
            "/tmp/test_validation.jsonl"
        )
@pytest.mark.ai_generated + def test_record_version_serialization(self) -> None: + """Test that record_version is included in JSON serialization.""" + r = ValidationResult( + id="foo", + origin=FOO_ORIGIN, + scope=Scope.FILE, + severity=Severity.ERROR, + ) + assert r.record_version == CURRENT_RECORD_VERSION + + json_dump = r.model_dump(mode="json") + assert json_dump["record_version"] == CURRENT_RECORD_VERSION + + json_str = r.model_dump_json() + json_dict = json.loads(json_str) + assert json_dict["record_version"] == CURRENT_RECORD_VERSION + + # Round-trip + r2 = ValidationResult.model_validate_json(json_str) + assert r2.record_version == CURRENT_RECORD_VERSION + assert r == r2