Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions dandi/cli/cmd_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
validation_companion_path,
write_validation_jsonl,
)
from ..validate._types import Severity, ValidationResult
from ..validate._types import MissingFileContent, Severity, ValidationResult

lgr = logging.getLogger(__name__)

Expand Down Expand Up @@ -56,6 +56,7 @@ def _collect_results(
schema: str | None,
devel_debug: bool,
allow_any_path: bool,
missing_file_content: MissingFileContent = MissingFileContent.error,
) -> list[ValidationResult]:
"""Run validation and collect all results into a list."""
# Avoid heavy import by importing within function:
Expand All @@ -81,6 +82,7 @@ def _collect_results(
schema_version=schema,
devel_debug=devel_debug,
allow_any_path=allow_any_path,
missing_file_content=missing_file_content,
)
)

Expand Down Expand Up @@ -208,6 +210,17 @@ def validate_bids(
help="Limit results per group (or total if ungrouped). "
"Excess results are replaced by a count of omitted items.",
)
@click.option(
"--missing-file-content",
"missing_file_content",
help="How to handle files whose content is unavailable (e.g. broken symlinks "
"in a datalad dataset without fetched data). 'error' (default) emits a "
"concise error per file, 'skip' skips each such file with a warning, "
"'only-non-data' skips content-dependent validators but still validates "
"path layout.",
type=click.Choice(["error", "only-non-data", "skip"], case_sensitive=True),
default="error",
)
@click.option(
"--load",
help="Load validation results from JSONL file(s) instead of running validation.",
Expand All @@ -229,6 +242,7 @@ def validate(
output_file: str | None = None,
summary: bool = False,
max_per_group: int | None = None,
missing_file_content: str = "error",
load: tuple[str, ...] = (),
schema: str | None = None,
devel_debug: bool = False,
Expand Down Expand Up @@ -270,7 +284,10 @@ def validate(
if load:
results = load_validation_jsonl(load)
else:
results = _collect_results(paths, schema, devel_debug, allow_any_path)
mfc = MissingFileContent(missing_file_content)
results = _collect_results(
paths, schema, devel_debug, allow_any_path, missing_file_content=mfc
)
# Auto-save companion right after collection, before filtering — so
# all results are preserved regardless of display filters.
# Skip when writing to --output (user already gets structured output).
Expand Down
100 changes: 100 additions & 0 deletions dandi/cli/tests/test_cmd_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -896,3 +896,103 @@ def test_validate_auto_companion_structured_stdout(
assert r.exit_code == 1

assert len(list(redirected_logdir.glob("*_validation.jsonl"))) == 1


# ---- Tests for --missing-file-content option (issue #1606) ----


def _make_dandiset_with_broken_symlinks(tmp_path: Path) -> Path:
    """Populate *tmp_path* with a dandiset mixing broken and real NWB assets.

    Layout::

        dandiset.yaml
        sub-001/sub-001.nwb -> broken symlink (datalad annex)
        sub-002/sub-002.nwb -> broken symlink (datalad annex)
        sub-003/sub-003.nwb -> real NWB file (content present)

    Having both kinds lets tests assert that files with readable content
    are still fully validated while broken symlinks are handled according
    to the selected ``--missing-file-content`` policy.
    """
    from datetime import datetime, timezone

    from ...consts import dandiset_metadata_file
    from ...pynwb_utils import make_nwb_file

    (tmp_path / dandiset_metadata_file).write_text(
        "identifier: '000027'\nname: Test\ndescription: Test dandiset\n"
    )
    # Broken symlinks pointing into a nonexistent git-annex object store
    annex_target = (
        ".git/annex/objects/XX/YY/SHA256E-s123--abc.nwb/SHA256E-s123--abc.nwb"
    )
    for relpath in ("sub-001/sub-001.nwb", "sub-002/sub-002.nwb"):
        link = tmp_path / relpath
        link.parent.mkdir(parents=True, exist_ok=True)
        link.symlink_to(annex_target)
    # One genuine NWB file whose content can actually be read
    real_dir = tmp_path / "sub-003"
    real_dir.mkdir()
    make_nwb_file(
        real_dir / "sub-003.nwb",
        session_description="test session",
        identifier="test-nwb-003",
        session_start_time=datetime(2017, 4, 3, 11, tzinfo=timezone.utc),
    )
    return tmp_path


@pytest.mark.ai_generated
def test_validate_missing_file_content_error_default(tmp_path: Path) -> None:
    """With no option given, broken symlinks yield concise per-file errors."""
    dandiset = _make_dandiset_with_broken_symlinks(tmp_path)
    result = CliRunner().invoke(validate, [str(dandiset)])
    assert result.exit_code == 1
    assert "FILE_CONTENT_MISSING" in result.output
    assert "broken symlink" in result.output.lower()
    # Errors must be concise — no raw Python tracebacks leaking through
    assert "Traceback" not in result.output
    # The file with readable content (sub-003) is still validated
    assert "sub-003" in result.output


@pytest.mark.ai_generated
def test_validate_missing_file_content_skip(tmp_path: Path) -> None:
    """--missing-file-content=skip skips broken symlinks with a WARNING."""
    dandiset = _make_dandiset_with_broken_symlinks(tmp_path)
    result = CliRunner().invoke(
        validate, ["--missing-file-content", "skip", str(dandiset)]
    )
    assert "FILE_CONTENT_MISSING_SKIPPED" in result.output
    assert "skipped" in result.output.lower()
    # The file with readable content (sub-003) is still validated
    assert "sub-003" in result.output


@pytest.mark.ai_generated
def test_validate_missing_file_content_only_non_data(tmp_path: Path) -> None:
    """--missing-file-content=only-non-data checks only path layout for broken files."""
    dandiset = _make_dandiset_with_broken_symlinks(tmp_path)
    args = ["--missing-file-content", "only-non-data", str(dandiset)]
    result = CliRunner().invoke(validate, args)
    assert "FILE_CONTENT_MISSING_PARTIAL" in result.output
    # Content-dependent validators are skipped, so no pynwb tracebacks
    assert "Traceback" not in result.output
    # The file with readable content (sub-003) is still validated
    assert "sub-003" in result.output


@pytest.mark.ai_generated
def test_validate_missing_file_content_no_broken_symlinks(tmp_path: Path) -> None:
    """--missing-file-content is a no-op when no broken symlinks exist."""
    from ...consts import dandiset_metadata_file

    (tmp_path / dandiset_metadata_file).write_text(
        "identifier: '000027'\nname: Test\ndescription: Test dandiset\n"
    )
    runner = CliRunner()
    r_default = runner.invoke(validate, [str(tmp_path)])
    r_skip = runner.invoke(
        validate, ["--missing-file-content", "skip", str(tmp_path)]
    )
    # With no broken symlinks both invocations succeed identically
    assert r_default.exit_code == r_skip.exit_code == 0
    assert "FILE_CONTENT_MISSING" not in r_default.output
    assert "FILE_CONTENT_MISSING" not in r_skip.output
141 changes: 83 additions & 58 deletions dandi/files/bases.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from dandi.validate._types import (
ORIGIN_INTERNAL_DANDI,
ORIGIN_VALIDATION_DANDI,
MissingFileContent,
Origin,
OriginType,
Scope,
Expand Down Expand Up @@ -91,9 +92,18 @@ def get_validation_errors(
self,
schema_version: str | None = None,
devel_debug: bool = False,
missing_file_content: MissingFileContent | None = None,
) -> list[ValidationResult]:
"""
Attempt to validate the file and return a list of errors encountered

Parameters
----------
missing_file_content : MissingFileContent | None
When set (not None), the file's content is known to be unavailable
(e.g. broken symlink). Validators should adjust behaviour
accordingly — for ``only-non-data`` they should skip
content-dependent checks and emit a WARNING.
"""
...

Expand All @@ -116,6 +126,7 @@ def get_validation_errors(
self,
schema_version: str | None = None,
devel_debug: bool = False,
missing_file_content: MissingFileContent | None = None,
) -> list[ValidationResult]:
with open(self.filepath) as f:
meta = yaml_load(f, typ="safe")
Expand Down Expand Up @@ -185,7 +196,14 @@ def get_validation_errors(
self,
schema_version: str | None = None,
devel_debug: bool = False,
missing_file_content: MissingFileContent | None = None,
) -> list[ValidationResult]:
# When file content is unavailable and policy is only-non-data,
# skip metadata extraction (which requires reading the file) and
# only do path-based validation via subclass hooks.
if missing_file_content == MissingFileContent.only_non_data:
return []

current_version = get_schema_version()
if schema_version is None:
schema_version = current_version
Expand Down Expand Up @@ -511,81 +529,88 @@ def get_validation_errors(
self,
schema_version: str | None = None,
devel_debug: bool = False,
missing_file_content: MissingFileContent | None = None,
) -> list[ValidationResult]:
"""
Validate NWB asset

If ``schema_version`` was provided, we only validate basic metadata,
and completely skip validation using nwbinspector.inspect_nwbfile
"""
# Avoid heavy import by importing within function:
from nwbinspector import Importance, inspect_nwbfile, load_config
and completely skip validation using nwbinspector.inspect_nwbfile.

# Avoid heavy import by importing within function:
from dandi.pynwb_utils import validate as pynwb_validate
If ``missing_file_content`` is ``only-non-data``, content-dependent
validators (pynwb, nwbinspector) are skipped and only path-layout
validation is performed.
"""
errors: list[ValidationResult] = []

errors: list[ValidationResult] = pynwb_validate(
self.filepath, devel_debug=devel_debug
)
if schema_version is not None:
errors.extend(
super().get_validation_errors(
schema_version=schema_version, devel_debug=devel_debug
)
)
if missing_file_content == MissingFileContent.only_non_data:
# Skip content-dependent validators (pynwb, nwbinspector).
# The warning for the user is emitted centrally in _core.validate().
pass
else:
# make sure that we have some basic metadata fields we require
try:
origin_validation_nwbinspector = Origin(
type=OriginType.VALIDATION,
validator=Validator.nwbinspector,
validator_version=str(_get_nwb_inspector_version()),
# Avoid heavy import by importing within function:
from nwbinspector import Importance, inspect_nwbfile, load_config

# Avoid heavy import by importing within function:
from dandi.pynwb_utils import validate as pynwb_validate

errors.extend(pynwb_validate(self.filepath, devel_debug=devel_debug))
if schema_version is not None:
errors.extend(
super().get_validation_errors(
schema_version=schema_version, devel_debug=devel_debug
)
)
else:
# make sure that we have some basic metadata fields we require
try:
origin_validation_nwbinspector = Origin(
type=OriginType.VALIDATION,
validator=Validator.nwbinspector,
validator_version=str(_get_nwb_inspector_version()),
)

for error in inspect_nwbfile(
nwbfile_path=self.filepath,
skip_validate=True,
config=load_config(filepath_or_keyword="dandi"),
importance_threshold=Importance.BEST_PRACTICE_VIOLATION,
# we might want to switch to a lower threshold once nwbinspector
# upstream reporting issues are clarified:
# https://github.com/dandi/dandi-cli/pull/1162#issuecomment-1322238896
# importance_threshold=Importance.BEST_PRACTICE_SUGGESTION,
):
severity = NWBI_IMPORTANCE_TO_DANDI_SEVERITY[error.importance.name]
kw: Any = {}
if error.location:
kw["within_asset_paths"] = {
error.file_path: error.location,
}
errors.append(
ValidationResult(
origin=origin_validation_nwbinspector,
severity=severity,
id=f"NWBI.{error.check_function_name}",
scope=Scope.FILE,
origin_result=error,
path=Path(error.file_path),
message=error.message,
# Assuming multiple sessions per multiple subjects,
# otherwise nesting level might differ
dataset_path=Path(error.file_path).parent.parent, # TODO
dandiset_path=Path(error.file_path).parent, # TODO
**kw,
for error in inspect_nwbfile(
nwbfile_path=self.filepath,
skip_validate=True,
config=load_config(filepath_or_keyword="dandi"),
importance_threshold=Importance.BEST_PRACTICE_VIOLATION,
):
severity = NWBI_IMPORTANCE_TO_DANDI_SEVERITY[
error.importance.name
]
kw: Any = {}
if error.location:
kw["within_asset_paths"] = {
error.file_path: error.location,
}
errors.append(
ValidationResult(
origin=origin_validation_nwbinspector,
severity=severity,
id=f"NWBI.{error.check_function_name}",
scope=Scope.FILE,
origin_result=error,
path=Path(error.file_path),
message=error.message,
dataset_path=Path(error.file_path).parent.parent,
dandiset_path=Path(error.file_path).parent,
**kw,
)
)
except Exception as e:
if devel_debug:
raise
# TODO: might reraise instead of making it into an error
return _pydantic_errors_to_validation_results(
[e], self.filepath, scope=Scope.FILE
)
except Exception as e:
if devel_debug:
raise
# TODO: might reraise instead of making it into an error
return _pydantic_errors_to_validation_results(
[e], self.filepath, scope=Scope.FILE
)

# Avoid circular imports by importing within function:
from .bids import NWBBIDSAsset
from ..organize import validate_organized_path

# Path-layout validation always runs (doesn't need content)
if not isinstance(self, NWBBIDSAsset) and self.dandiset_path is not None:
errors.extend(
validate_organized_path(self.path, self.filepath, self.dandiset_path)
Expand Down
Loading
Loading