Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions dandi/cli/cmd_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
validation_companion_path,
write_validation_jsonl,
)
from ..validate._types import Severity, ValidationResult
from ..validate._types import MissingFileContent, Severity, ValidationResult

lgr = logging.getLogger(__name__)

Expand Down Expand Up @@ -56,6 +56,7 @@ def _collect_results(
schema: str | None,
devel_debug: bool,
allow_any_path: bool,
missing_file_content: MissingFileContent = MissingFileContent.error,
) -> list[ValidationResult]:
"""Run validation and collect all results into a list."""
# Avoid heavy import by importing within function:
Expand All @@ -81,6 +82,7 @@ def _collect_results(
schema_version=schema,
devel_debug=devel_debug,
allow_any_path=allow_any_path,
missing_file_content=missing_file_content,
)
)

Expand Down Expand Up @@ -208,6 +210,17 @@ def validate_bids(
help="Limit results per group (or total if ungrouped). "
"Excess results are replaced by a count of omitted items.",
)
@click.option(
"--missing-file-content",
"missing_file_content",
help="How to handle files whose content is unavailable (e.g. broken symlinks "
"in a datalad dataset without fetched data). 'error' (default) emits a "
"concise error per file, 'skip' skips each such file with a warning, "
"'only-non-data' skips content-dependent validators but still validates "
"path layout.",
type=click.Choice(["error", "only-non-data", "skip"], case_sensitive=True),
default="error",
)
@click.option(
"--load",
help="Load validation results from JSONL file(s) instead of running validation.",
Expand All @@ -229,6 +242,7 @@ def validate(
output_file: str | None = None,
summary: bool = False,
max_per_group: int | None = None,
missing_file_content: str = "error",
load: tuple[str, ...] = (),
schema: str | None = None,
devel_debug: bool = False,
Expand Down Expand Up @@ -270,7 +284,10 @@ def validate(
if load:
results = load_validation_jsonl(load)
else:
results = _collect_results(paths, schema, devel_debug, allow_any_path)
mfc = MissingFileContent(missing_file_content)
results = _collect_results(
paths, schema, devel_debug, allow_any_path, missing_file_content=mfc
)
# Auto-save companion right after collection, before filtering — so
# all results are preserved regardless of display filters.
# Skip when writing to --output (user already gets structured output).
Expand Down
100 changes: 100 additions & 0 deletions dandi/cli/tests/test_cmd_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -896,3 +896,103 @@ def test_validate_auto_companion_structured_stdout(
assert r.exit_code == 1

assert len(list(redirected_logdir.glob("*_validation.jsonl"))) == 1


# ---- Tests for --missing-file-content option (issue #1606) ----


def _make_dandiset_with_broken_symlinks(tmp_path: Path) -> Path:
    """Populate *tmp_path* with a dandiset mixing broken and real NWB assets.

    Layout::

        dandiset.yaml
        sub-001/sub-001.nwb -> broken symlink (datalad annex)
        sub-002/sub-002.nwb -> broken symlink (datalad annex)
        sub-003/sub-003.nwb -> real NWB file (content present)

    Having both kinds lets tests assert that files with readable content
    are still fully validated while broken symlinks are handled according
    to the selected ``--missing-file-content`` policy.
    """
    from datetime import datetime, timezone

    from ...consts import dandiset_metadata_file
    from ...pynwb_utils import make_nwb_file

    (tmp_path / dandiset_metadata_file).write_text(
        "identifier: '000027'\nname: Test\ndescription: Test dandiset\n"
    )
    # Broken symlinks pointing into a nonexistent git-annex object store
    annex_target = (
        ".git/annex/objects/XX/YY/SHA256E-s123--abc.nwb/SHA256E-s123--abc.nwb"
    )
    for relpath in ("sub-001/sub-001.nwb", "sub-002/sub-002.nwb"):
        link = tmp_path / relpath
        link.parent.mkdir(parents=True, exist_ok=True)
        link.symlink_to(annex_target)
    # One genuine NWB file whose content can actually be read
    real_dir = tmp_path / "sub-003"
    real_dir.mkdir()
    make_nwb_file(
        real_dir / "sub-003.nwb",
        session_description="test session",
        identifier="test-nwb-003",
        session_start_time=datetime(2017, 4, 3, 11, tzinfo=timezone.utc),
    )
    return tmp_path


@pytest.mark.ai_generated
def test_validate_missing_file_content_error_default(tmp_path: Path) -> None:
    """With no option given, broken symlinks yield concise per-file errors."""
    dandiset = _make_dandiset_with_broken_symlinks(tmp_path)
    result = CliRunner().invoke(validate, [str(dandiset)])
    assert result.exit_code == 1
    assert "FILE_CONTENT_MISSING" in result.output
    assert "broken symlink" in result.output.lower()
    # Errors must be concise — no raw Python tracebacks leaking through
    assert "Traceback" not in result.output
    # The file with readable content (sub-003) is still validated
    assert "sub-003" in result.output


@pytest.mark.ai_generated
def test_validate_missing_file_content_skip(tmp_path: Path) -> None:
    """--missing-file-content=skip skips broken symlinks with a WARNING."""
    dandiset = _make_dandiset_with_broken_symlinks(tmp_path)
    result = CliRunner().invoke(
        validate, ["--missing-file-content", "skip", str(dandiset)]
    )
    assert "FILE_CONTENT_MISSING_SKIPPED" in result.output
    assert "skipped" in result.output.lower()
    # The file with readable content (sub-003) is still validated
    assert "sub-003" in result.output


@pytest.mark.ai_generated
def test_validate_missing_file_content_only_non_data(tmp_path: Path) -> None:
    """--missing-file-content=only-non-data checks only path layout for broken files."""
    dandiset = _make_dandiset_with_broken_symlinks(tmp_path)
    args = ["--missing-file-content", "only-non-data", str(dandiset)]
    result = CliRunner().invoke(validate, args)
    assert "FILE_CONTENT_MISSING_PARTIAL" in result.output
    # Content-dependent validators are skipped, so no pynwb tracebacks
    assert "Traceback" not in result.output
    # The file with readable content (sub-003) is still validated
    assert "sub-003" in result.output


@pytest.mark.ai_generated
def test_validate_missing_file_content_no_broken_symlinks(tmp_path: Path) -> None:
    """--missing-file-content is a no-op when no broken symlinks exist."""
    from ...consts import dandiset_metadata_file

    (tmp_path / dandiset_metadata_file).write_text(
        "identifier: '000027'\nname: Test\ndescription: Test dandiset\n"
    )
    runner = CliRunner()
    r_default = runner.invoke(validate, [str(tmp_path)])
    r_skip = runner.invoke(
        validate, ["--missing-file-content", "skip", str(tmp_path)]
    )
    # With no broken symlinks both invocations succeed identically
    assert r_default.exit_code == r_skip.exit_code == 0
    assert "FILE_CONTENT_MISSING" not in r_default.output
    assert "FILE_CONTENT_MISSING" not in r_skip.output
141 changes: 83 additions & 58 deletions dandi/files/bases.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from dandi.validate._types import (
ORIGIN_INTERNAL_DANDI,
ORIGIN_VALIDATION_DANDI,
MissingFileContent,
Origin,
OriginType,
Scope,
Expand Down Expand Up @@ -91,9 +92,18 @@ def get_validation_errors(
self,
schema_version: str | None = None,
devel_debug: bool = False,
missing_file_content: MissingFileContent | None = None,
) -> list[ValidationResult]:
"""
Attempt to validate the file and return a list of errors encountered

Parameters
----------
missing_file_content : MissingFileContent | None
When set (not None), the file's content is known to be unavailable
(e.g. broken symlink). Validators should adjust behaviour
accordingly — for ``only-non-data`` they should skip
content-dependent checks and emit a WARNING.
"""
...

Expand All @@ -116,6 +126,7 @@ def get_validation_errors(
self,
schema_version: str | None = None,
devel_debug: bool = False,
missing_file_content: MissingFileContent | None = None,
) -> list[ValidationResult]:
with open(self.filepath) as f:
meta = yaml_load(f, typ="safe")
Expand Down Expand Up @@ -185,7 +196,14 @@ def get_validation_errors(
self,
schema_version: str | None = None,
devel_debug: bool = False,
missing_file_content: MissingFileContent | None = None,
) -> list[ValidationResult]:
# When file content is unavailable and policy is only-non-data,
# skip metadata extraction (which requires reading the file) and
# only do path-based validation via subclass hooks.
if missing_file_content == MissingFileContent.only_non_data:
return []

current_version = get_schema_version()
if schema_version is None:
schema_version = current_version
Expand Down Expand Up @@ -511,81 +529,88 @@ def get_validation_errors(
self,
schema_version: str | None = None,
devel_debug: bool = False,
missing_file_content: MissingFileContent | None = None,
) -> list[ValidationResult]:
"""
Validate NWB asset

If ``schema_version`` was provided, we only validate basic metadata,
and completely skip validation using nwbinspector.inspect_nwbfile
"""
# Avoid heavy import by importing within function:
from nwbinspector import Importance, inspect_nwbfile, load_config
and completely skip validation using nwbinspector.inspect_nwbfile.

# Avoid heavy import by importing within function:
from dandi.pynwb_utils import validate as pynwb_validate
If ``missing_file_content`` is ``only-non-data``, content-dependent
validators (pynwb, nwbinspector) are skipped and only path-layout
validation is performed.
"""
errors: list[ValidationResult] = []

errors: list[ValidationResult] = pynwb_validate(
self.filepath, devel_debug=devel_debug
)
if schema_version is not None:
errors.extend(
super().get_validation_errors(
schema_version=schema_version, devel_debug=devel_debug
)
)
if missing_file_content == MissingFileContent.only_non_data:
# Skip content-dependent validators (pynwb, nwbinspector).
# The warning for the user is emitted centrally in _core.validate().
pass
else:
# make sure that we have some basic metadata fields we require
try:
origin_validation_nwbinspector = Origin(
type=OriginType.VALIDATION,
validator=Validator.nwbinspector,
validator_version=str(_get_nwb_inspector_version()),
# Avoid heavy import by importing within function:
from nwbinspector import Importance, inspect_nwbfile, load_config

# Avoid heavy import by importing within function:
from dandi.pynwb_utils import validate as pynwb_validate

errors.extend(pynwb_validate(self.filepath, devel_debug=devel_debug))
if schema_version is not None:
errors.extend(
super().get_validation_errors(
schema_version=schema_version, devel_debug=devel_debug
)
)
else:
# make sure that we have some basic metadata fields we require
try:
origin_validation_nwbinspector = Origin(
type=OriginType.VALIDATION,
validator=Validator.nwbinspector,
validator_version=str(_get_nwb_inspector_version()),
)

for error in inspect_nwbfile(
nwbfile_path=self.filepath,
skip_validate=True,
config=load_config(filepath_or_keyword="dandi"),
importance_threshold=Importance.BEST_PRACTICE_VIOLATION,
# we might want to switch to a lower threshold once nwbinspector
# upstream reporting issues are clarified:
# https://github.com/dandi/dandi-cli/pull/1162#issuecomment-1322238896
# importance_threshold=Importance.BEST_PRACTICE_SUGGESTION,
):
severity = NWBI_IMPORTANCE_TO_DANDI_SEVERITY[error.importance.name]
kw: Any = {}
if error.location:
kw["within_asset_paths"] = {
error.file_path: error.location,
}
errors.append(
ValidationResult(
origin=origin_validation_nwbinspector,
severity=severity,
id=f"NWBI.{error.check_function_name}",
scope=Scope.FILE,
origin_result=error,
path=Path(error.file_path),
message=error.message,
# Assuming multiple sessions per multiple subjects,
# otherwise nesting level might differ
dataset_path=Path(error.file_path).parent.parent, # TODO
dandiset_path=Path(error.file_path).parent, # TODO
**kw,
for error in inspect_nwbfile(
nwbfile_path=self.filepath,
skip_validate=True,
config=load_config(filepath_or_keyword="dandi"),
importance_threshold=Importance.BEST_PRACTICE_VIOLATION,
):
severity = NWBI_IMPORTANCE_TO_DANDI_SEVERITY[
error.importance.name
]
kw: Any = {}
if error.location:
kw["within_asset_paths"] = {
error.file_path: error.location,
}
errors.append(
ValidationResult(
origin=origin_validation_nwbinspector,
severity=severity,
id=f"NWBI.{error.check_function_name}",
scope=Scope.FILE,
origin_result=error,
path=Path(error.file_path),
message=error.message,
dataset_path=Path(error.file_path).parent.parent,
dandiset_path=Path(error.file_path).parent,
**kw,
)
)
except Exception as e:
if devel_debug:
raise
# TODO: might reraise instead of making it into an error
return _pydantic_errors_to_validation_results(
[e], self.filepath, scope=Scope.FILE
)
except Exception as e:
if devel_debug:
raise
# TODO: might reraise instead of making it into an error
return _pydantic_errors_to_validation_results(
[e], self.filepath, scope=Scope.FILE
)

# Avoid circular imports by importing within function:
from .bids import NWBBIDSAsset
from ..organize import validate_organized_path

# Path-layout validation always runs (doesn't need content)
if not isinstance(self, NWBBIDSAsset) and self.dandiset_path is not None:
errors.extend(
validate_organized_path(self.path, self.filepath, self.dandiset_path)
Expand Down
Loading
Loading