diff --git a/src/cloudai/workloads/megatron_run/megatron_run.py b/src/cloudai/workloads/megatron_run/megatron_run.py
index f804fa7e2..1c9d23096 100644
--- a/src/cloudai/workloads/megatron_run/megatron_run.py
+++ b/src/cloudai/workloads/megatron_run/megatron_run.py
@@ -145,11 +145,11 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult:
         with slurm_job_path.open("r", encoding="utf-8") as file:
             metadata = SlurmJobMetadata.model_validate(toml.load(file))
 
-        if not metadata.state.startswith("COMPLETED") or not metadata.exit_code.startswith("0:"):
+        if not metadata.exit_code.startswith("0:"):
             return JobStatusResult(
                 is_successful=False,
                 error_message=(
-                    f"Slurm job did not complete successfully for {tr.output_path}: "
+                    f"Slurm job exited with a non-zero exit code for {tr.output_path}: "
                     f"state={metadata.state}, exit_code={metadata.exit_code}."
                 ),
             )
@@ -165,11 +165,13 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult:
             for line in file:
                 if ITERATION_LOG_REGEX.search(line):
                     return JobStatusResult(is_successful=True)
+                if "validation loss at iteration" in line:
+                    return JobStatusResult(is_successful=True)
 
         return JobStatusResult(
             is_successful=False,
             error_message=(
                 f"stdout.txt in {tr.output_path} does not contain Megatron iteration metrics. "
-                "Expected at least one line with elapsed time per iteration and throughput per GPU."
+                "Expected at least one line with elapsed time per iteration and throughput per GPU or validation loss."
             ),
         )
diff --git a/tests/workloads/megatron_run/test_job_status_retrieval_strategy.py b/tests/workloads/megatron_run/test_megatron_run.py
similarity index 57%
rename from tests/workloads/megatron_run/test_job_status_retrieval_strategy.py
rename to tests/workloads/megatron_run/test_megatron_run.py
index 5afebe8ec..b439e1575 100644
--- a/tests/workloads/megatron_run/test_job_status_retrieval_strategy.py
+++ b/tests/workloads/megatron_run/test_megatron_run.py
@@ -16,6 +16,7 @@
 
 from pathlib import Path
 
+import pytest
 import toml
 
 from cloudai import TestRun
@@ -32,13 +33,13 @@ def setup_method(self) -> None:
             cmd_args=MegatronRunCmdArgs(docker_image_url="http://url", run_script=Path(__file__)),
         )
 
-    def _write_slurm_metadata(self, output_path: Path, *, state: str, exit_code: str = "0:0") -> None:
+    def _write_slurm_metadata(self, output_path: Path, *, exit_code: str = "0:0") -> None:
         with (output_path / "slurm-job.toml").open("w", encoding="utf-8") as file:
             toml.dump(
                 SlurmJobMetadata(
                     job_id=123,
                     name="megatron",
-                    state=state,
+                    state="WHATEVER",
                     exit_code=exit_code,
                     start_time="2026-03-22T11:44:22",
                     end_time="2026-03-22T11:54:22",
@@ -56,34 +57,31 @@ def test_missing_slurm_metadata_fails(self, base_tr: TestRun) -> None:
         assert not result.is_successful
         assert "slurm-job.toml file not found" in result.error_message
 
-    def test_failed_slurm_state_fails_even_if_stdout_has_metrics(self, base_tr: TestRun) -> None:
+    @pytest.mark.parametrize(
+        ("exit_code", "log_text", "is_successful"),
+        (
+            ("0:0", "bla", False),
+            (
+                "1:0",
+                "[2026-01-16 07:32:39] iteration 6/100 | elapsed time per iteration (ms): 15639.0 | "
+                "throughput per GPU (TFLOP/s/GPU): 494.6 |\n",
+                False,
+            ),
+            (
+                "0:0",
+                "[2026-01-16 07:32:39] iteration 6/100 | elapsed time per iteration (ms): 15639.0 | "
+                "throughput per GPU (TFLOP/s/GPU): 494.6 |\n",
+                True,
+            ),
+            ("0:0", "validation loss at iteration 1.0", True),
+            ("0:15", "validation loss at iteration 1.0", True),
+            ("15:0", "validation loss at iteration 1.0", False),
+        ),
+    )
+    def test_is_run_successful(self, base_tr: TestRun, exit_code: str, log_text: str, is_successful: bool) -> None:
         base_tr.output_path.mkdir(parents=True, exist_ok=True)
-        self._write_slurm_metadata(base_tr.output_path, state="FAILED", exit_code="1:0")
-        (base_tr.output_path / "stdout.txt").write_text(
-            "[2026-01-16 07:32:39] iteration 6/100 | elapsed time per iteration (ms): 15639.0 | "
-            "throughput per GPU (TFLOP/s/GPU): 494.6 |\n"
-        )
-
-        result = self.megatron_tdef.was_run_successful(base_tr)
-        assert not result.is_successful
-        assert "state=FAILED" in result.error_message
-
-    def test_completed_slurm_job_with_iteration_metrics_succeeds(self, base_tr: TestRun) -> None:
-        base_tr.output_path.mkdir(parents=True, exist_ok=True)
-        self._write_slurm_metadata(base_tr.output_path, state="COMPLETED")
-        (base_tr.output_path / "stdout.txt").write_text(
-            "[2026-01-16 07:32:39] iteration 6/100 | elapsed time per iteration (ms): 15639.0 | "
-            "throughput per GPU (TFLOP/s/GPU): 494.6 |\n"
-        )
-
-        result = self.megatron_tdef.was_run_successful(base_tr)
-        assert result.is_successful
-
-    def test_completed_slurm_job_without_iteration_metrics_fails(self, base_tr: TestRun) -> None:
-        base_tr.output_path.mkdir(parents=True, exist_ok=True)
-        self._write_slurm_metadata(base_tr.output_path, state="COMPLETED")
-        (base_tr.output_path / "stdout.txt").write_text("training started\n")
+        self._write_slurm_metadata(base_tr.output_path, exit_code=exit_code)
+        (base_tr.output_path / "stdout.txt").write_text(log_text)
 
         result = self.megatron_tdef.was_run_successful(base_tr)
-        assert not result.is_successful
-        assert "does not contain Megatron iteration metrics" in result.error_message
+        assert result.is_successful is is_successful