NVIDIA · podkidyshev · Apr 10, 2026 · Apr 10, 2026 · Apr 13, 2026
@@ -145,11 +145,11 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult:
         with slurm_job_path.open("r", encoding="utf-8") as file:
             metadata = SlurmJobMetadata.model_validate(toml.load(file))
 
-        if not metadata.state.startswith("COMPLETED") or not metadata.exit_code.startswith("0:"):
+        if not metadata.exit_code.startswith("0:"):
             return JobStatusResult(
                 is_successful=False,
                 error_message=(
-                    f"Slurm job did not complete successfully for {tr.output_path}: "
+                    f"Slurm job exited with a non-zero exit code for {tr.output_path}: "
                     f"state={metadata.state}, exit_code={metadata.exit_code}."
                 ),
             )
@@ -165,6 +165,8 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult:
             for line in file:
                 if ITERATION_LOG_REGEX.search(line):
                     return JobStatusResult(is_successful=True)
+                if "validation loss at iteration" in line:
+                    return JobStatusResult(is_successful=True)
 
         return JobStatusResult(
             is_successful=False,

@@ -16,6 +16,7 @@
 
 from pathlib import Path
 
+import pytest
 import toml
 
 from cloudai import TestRun
@@ -32,13 +33,13 @@ def setup_method(self) -> None:
             cmd_args=MegatronRunCmdArgs(docker_image_url="http://url", run_script=Path(__file__)),
         )
 
-    def _write_slurm_metadata(self, output_path: Path, *, state: str, exit_code: str = "0:0") -> None:
+    def _write_slurm_metadata(self, output_path: Path, *, exit_code: str = "0:0") -> None:
         with (output_path / "slurm-job.toml").open("w", encoding="utf-8") as file:
             toml.dump(
                 SlurmJobMetadata(
                     job_id=123,
                     name="megatron",
-                    state=state,
+                    state="WHATEVER",
                     exit_code=exit_code,
                     start_time="2026-03-22T11:44:22",
                     end_time="2026-03-22T11:54:22",
@@ -56,34 +57,30 @@ def test_missing_slurm_metadata_fails(self, base_tr: TestRun) -> None:
         assert not result.is_successful
         assert "slurm-job.toml file not found" in result.error_message
 
-    def test_failed_slurm_state_fails_even_if_stdout_has_metrics(self, base_tr: TestRun) -> None:
+    @pytest.mark.parametrize(
+        ("exit_code", "log_text", "is_successful"),
+        (
+            ("0:0", "bla", False),
+            (
+                "1:0",
+                "[2026-01-16 07:32:39] iteration 6/100 | elapsed time per iteration (ms): 15639.0 | "
+                "throughput per GPU (TFLOP/s/GPU): 494.6 |\n",
+                False,
+            ),
+            (
+                "0:0",
+                "[2026-01-16 07:32:39] iteration 6/100 | elapsed time per iteration (ms): 15639.0 | "
+                "throughput per GPU (TFLOP/s/GPU): 494.6 |\n",
+                True,
+            ),
+            ("0:0", "validation loss at iteration 1.0", True),
+            ("15:0", "validation loss at iteration 1.0", False),
+        ),
+    )
+    def test_is_run_successful(self, base_tr: TestRun, exit_code: str, log_text: str, is_successful: bool) -> None:
         base_tr.output_path.mkdir(parents=True, exist_ok=True)
-        self._write_slurm_metadata(base_tr.output_path, state="FAILED", exit_code="1:0")
-        (base_tr.output_path / "stdout.txt").write_text(
-            "[2026-01-16 07:32:39] iteration 6/100 | elapsed time per iteration (ms): 15639.0 | "
-            "throughput per GPU (TFLOP/s/GPU): 494.6 |\n"
-        )
-
-        result = self.megatron_tdef.was_run_successful(base_tr)
-        assert not result.is_successful
-        assert "state=FAILED" in result.error_message
-
-    def test_completed_slurm_job_with_iteration_metrics_succeeds(self, base_tr: TestRun) -> None:
-        base_tr.output_path.mkdir(parents=True, exist_ok=True)
-        self._write_slurm_metadata(base_tr.output_path, state="COMPLETED")
-        (base_tr.output_path / "stdout.txt").write_text(
-            "[2026-01-16 07:32:39] iteration 6/100 | elapsed time per iteration (ms): 15639.0 | "
-            "throughput per GPU (TFLOP/s/GPU): 494.6 |\n"
-        )
-
-        result = self.megatron_tdef.was_run_successful(base_tr)
-        assert result.is_successful
-
-    def test_completed_slurm_job_without_iteration_metrics_fails(self, base_tr: TestRun) -> None:
-        base_tr.output_path.mkdir(parents=True, exist_ok=True)
-        self._write_slurm_metadata(base_tr.output_path, state="COMPLETED")
-        (base_tr.output_path / "stdout.txt").write_text("training started\n")
+        self._write_slurm_metadata(base_tr.output_path, exit_code=exit_code)
+        (base_tr.output_path / "stdout.txt").write_text(log_text)
 
         result = self.megatron_tdef.was_run_successful(base_tr)
-        assert not result.is_successful
-        assert "does not contain Megatron iteration metrics" in result.error_message
+        assert result.is_successful is is_successful