From f42136af6779e3b412a3a4f8f286ad83b96bb76a Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Fri, 10 Apr 2026 15:27:54 +0200 Subject: [PATCH 1/5] redefined is_run_successful function --- .../workloads/megatron_run/megatron_run.py | 3 ++- .../test_job_status_retrieval_strategy.py | 23 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/cloudai/workloads/megatron_run/megatron_run.py b/src/cloudai/workloads/megatron_run/megatron_run.py index f804fa7e2..6511501fc 100644 --- a/src/cloudai/workloads/megatron_run/megatron_run.py +++ b/src/cloudai/workloads/megatron_run/megatron_run.py @@ -32,6 +32,7 @@ r"throughput per GPU \(TFLOP/s/GPU\):\s*([0-9]+(?:\.[0-9]+)?)", re.IGNORECASE, ) +SUCCESSFUL_SLURM_STATES = ("COMPLETED", "TIMEOUT") class MegatronRunCmdArgs(CmdArgs): @@ -145,7 +146,7 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult: with slurm_job_path.open("r", encoding="utf-8") as file: metadata = SlurmJobMetadata.model_validate(toml.load(file)) - if not metadata.state.startswith("COMPLETED") or not metadata.exit_code.startswith("0:"): + if not metadata.state.startswith(SUCCESSFUL_SLURM_STATES) or not metadata.exit_code.startswith("0:"): return JobStatusResult( is_successful=False, error_message=( diff --git a/tests/workloads/megatron_run/test_job_status_retrieval_strategy.py b/tests/workloads/megatron_run/test_job_status_retrieval_strategy.py index 5afebe8ec..475e7eecd 100644 --- a/tests/workloads/megatron_run/test_job_status_retrieval_strategy.py +++ b/tests/workloads/megatron_run/test_job_status_retrieval_strategy.py @@ -79,6 +79,29 @@ def test_completed_slurm_job_with_iteration_metrics_succeeds(self, base_tr: Test result = self.megatron_tdef.was_run_successful(base_tr) assert result.is_successful + def test_timeout_slurm_job_with_zero_exit_and_iteration_metrics_succeeds(self, base_tr: TestRun) -> None: + base_tr.output_path.mkdir(parents=True, exist_ok=True) + self._write_slurm_metadata(base_tr.output_path, 
state="TIMEOUT", exit_code="0:0") + (base_tr.output_path / "stdout.txt").write_text( + "[2026-01-16 07:32:39] iteration 6/100 | elapsed time per iteration (ms): 15639.0 | " + "throughput per GPU (TFLOP/s/GPU): 494.6 |\n" + ) + + result = self.megatron_tdef.was_run_successful(base_tr) + assert result.is_successful + + def test_timeout_slurm_job_with_non_zero_exit_fails(self, base_tr: TestRun) -> None: + base_tr.output_path.mkdir(parents=True, exist_ok=True) + self._write_slurm_metadata(base_tr.output_path, state="TIMEOUT", exit_code="15:0") + (base_tr.output_path / "stdout.txt").write_text( + "[2026-01-16 07:32:39] iteration 6/100 | elapsed time per iteration (ms): 15639.0 | " + "throughput per GPU (TFLOP/s/GPU): 494.6 |\n" + ) + + result = self.megatron_tdef.was_run_successful(base_tr) + assert not result.is_successful + assert "state=TIMEOUT" in result.error_message + def test_completed_slurm_job_without_iteration_metrics_fails(self, base_tr: TestRun) -> None: base_tr.output_path.mkdir(parents=True, exist_ok=True) self._write_slurm_metadata(base_tr.output_path, state="COMPLETED") From dcf4199648451ff04dc206ea320f2d9e0b66da1e Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Fri, 10 Apr 2026 15:50:24 +0200 Subject: [PATCH 2/5] adjusted was_run_successful --- src/cloudai/workloads/megatron_run/megatron_run.py | 5 ++--- .../megatron_run/test_job_status_retrieval_strategy.py | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/cloudai/workloads/megatron_run/megatron_run.py b/src/cloudai/workloads/megatron_run/megatron_run.py index 6511501fc..786e3c05e 100644 --- a/src/cloudai/workloads/megatron_run/megatron_run.py +++ b/src/cloudai/workloads/megatron_run/megatron_run.py @@ -32,7 +32,6 @@ r"throughput per GPU \(TFLOP/s/GPU\):\s*([0-9]+(?:\.[0-9]+)?)", re.IGNORECASE, ) -SUCCESSFUL_SLURM_STATES = ("COMPLETED", "TIMEOUT") class MegatronRunCmdArgs(CmdArgs): @@ -146,11 +145,11 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult: with 
slurm_job_path.open("r", encoding="utf-8") as file: metadata = SlurmJobMetadata.model_validate(toml.load(file)) - if not metadata.state.startswith(SUCCESSFUL_SLURM_STATES) or not metadata.exit_code.startswith("0:"): + if not metadata.exit_code.startswith("0:"): return JobStatusResult( is_successful=False, error_message=( - f"Slurm job did not complete successfully for {tr.output_path}: " + f"Slurm job exited with a non-zero exit code for {tr.output_path}: " f"state={metadata.state}, exit_code={metadata.exit_code}." ), ) diff --git a/tests/workloads/megatron_run/test_job_status_retrieval_strategy.py b/tests/workloads/megatron_run/test_job_status_retrieval_strategy.py index 475e7eecd..40bef68e6 100644 --- a/tests/workloads/megatron_run/test_job_status_retrieval_strategy.py +++ b/tests/workloads/megatron_run/test_job_status_retrieval_strategy.py @@ -56,7 +56,7 @@ def test_missing_slurm_metadata_fails(self, base_tr: TestRun) -> None: assert not result.is_successful assert "slurm-job.toml file not found" in result.error_message - def test_failed_slurm_state_fails_even_if_stdout_has_metrics(self, base_tr: TestRun) -> None: + def test_non_zero_exit_code_fails_even_if_stdout_has_metrics(self, base_tr: TestRun) -> None: base_tr.output_path.mkdir(parents=True, exist_ok=True) self._write_slurm_metadata(base_tr.output_path, state="FAILED", exit_code="1:0") (base_tr.output_path / "stdout.txt").write_text( @@ -66,7 +66,7 @@ def test_failed_slurm_state_fails_even_if_stdout_has_metrics(self, base_tr: Test result = self.megatron_tdef.was_run_successful(base_tr) assert not result.is_successful - assert "state=FAILED" in result.error_message + assert "non-zero exit code" in result.error_message def test_completed_slurm_job_with_iteration_metrics_succeeds(self, base_tr: TestRun) -> None: base_tr.output_path.mkdir(parents=True, exist_ok=True) @@ -100,7 +100,7 @@ def test_timeout_slurm_job_with_non_zero_exit_fails(self, base_tr: TestRun) -> N result = 
self.megatron_tdef.was_run_successful(base_tr) assert not result.is_successful - assert "state=TIMEOUT" in result.error_message + assert "non-zero exit code" in result.error_message def test_completed_slurm_job_without_iteration_metrics_fails(self, base_tr: TestRun) -> None: base_tr.output_path.mkdir(parents=True, exist_ok=True) From 0ca6562b7a29dbe5d0f5336e1d6a54816667ebcf Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Mon, 13 Apr 2026 23:46:38 +0200 Subject: [PATCH 3/5] better is_run_successful for megarun --- .../workloads/megatron_run/megatron_run.py | 2 + .../test_job_status_retrieval_strategy.py | 112 ------------------ .../megatron_run/test_megatron_run.py | 86 ++++++++++++++ 3 files changed, 88 insertions(+), 112 deletions(-) delete mode 100644 tests/workloads/megatron_run/test_job_status_retrieval_strategy.py create mode 100644 tests/workloads/megatron_run/test_megatron_run.py diff --git a/src/cloudai/workloads/megatron_run/megatron_run.py b/src/cloudai/workloads/megatron_run/megatron_run.py index 786e3c05e..9f50acea0 100644 --- a/src/cloudai/workloads/megatron_run/megatron_run.py +++ b/src/cloudai/workloads/megatron_run/megatron_run.py @@ -165,6 +165,8 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult: for line in file: if ITERATION_LOG_REGEX.search(line): return JobStatusResult(is_successful=True) + if "validation loss at iteration" in line: + return JobStatusResult(is_successful=True) return JobStatusResult( is_successful=False, diff --git a/tests/workloads/megatron_run/test_job_status_retrieval_strategy.py b/tests/workloads/megatron_run/test_job_status_retrieval_strategy.py deleted file mode 100644 index 40bef68e6..000000000 --- a/tests/workloads/megatron_run/test_job_status_retrieval_strategy.py +++ /dev/null @@ -1,112 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from pathlib import Path - -import toml - -from cloudai import TestRun -from cloudai.systems.slurm import SlurmJobMetadata -from cloudai.workloads.megatron_run import MegatronRunCmdArgs, MegatronRunTestDefinition - - -class TestMegatronRunSuccessCheck: - def setup_method(self) -> None: - self.megatron_tdef = MegatronRunTestDefinition( - name="m", - description="d", - test_template_name="MegatronRun", - cmd_args=MegatronRunCmdArgs(docker_image_url="http://url", run_script=Path(__file__)), - ) - - def _write_slurm_metadata(self, output_path: Path, *, state: str, exit_code: str = "0:0") -> None: - with (output_path / "slurm-job.toml").open("w", encoding="utf-8") as file: - toml.dump( - SlurmJobMetadata( - job_id=123, - name="megatron", - state=state, - exit_code=exit_code, - start_time="2026-03-22T11:44:22", - end_time="2026-03-22T11:54:22", - elapsed_time_sec=600, - srun_cmd="srun test", - test_cmd="python pretrain_gpt.py", - job_root=output_path, - job_steps=[], - ).model_dump(), - file, - ) - - def test_missing_slurm_metadata_fails(self, base_tr: TestRun) -> None: - result = self.megatron_tdef.was_run_successful(base_tr) - assert not result.is_successful - assert "slurm-job.toml file not found" in result.error_message - - def test_non_zero_exit_code_fails_even_if_stdout_has_metrics(self, base_tr: TestRun) -> None: - base_tr.output_path.mkdir(parents=True, exist_ok=True) - 
self._write_slurm_metadata(base_tr.output_path, state="FAILED", exit_code="1:0") - (base_tr.output_path / "stdout.txt").write_text( - "[2026-01-16 07:32:39] iteration 6/100 | elapsed time per iteration (ms): 15639.0 | " - "throughput per GPU (TFLOP/s/GPU): 494.6 |\n" - ) - - result = self.megatron_tdef.was_run_successful(base_tr) - assert not result.is_successful - assert "non-zero exit code" in result.error_message - - def test_completed_slurm_job_with_iteration_metrics_succeeds(self, base_tr: TestRun) -> None: - base_tr.output_path.mkdir(parents=True, exist_ok=True) - self._write_slurm_metadata(base_tr.output_path, state="COMPLETED") - (base_tr.output_path / "stdout.txt").write_text( - "[2026-01-16 07:32:39] iteration 6/100 | elapsed time per iteration (ms): 15639.0 | " - "throughput per GPU (TFLOP/s/GPU): 494.6 |\n" - ) - - result = self.megatron_tdef.was_run_successful(base_tr) - assert result.is_successful - - def test_timeout_slurm_job_with_zero_exit_and_iteration_metrics_succeeds(self, base_tr: TestRun) -> None: - base_tr.output_path.mkdir(parents=True, exist_ok=True) - self._write_slurm_metadata(base_tr.output_path, state="TIMEOUT", exit_code="0:0") - (base_tr.output_path / "stdout.txt").write_text( - "[2026-01-16 07:32:39] iteration 6/100 | elapsed time per iteration (ms): 15639.0 | " - "throughput per GPU (TFLOP/s/GPU): 494.6 |\n" - ) - - result = self.megatron_tdef.was_run_successful(base_tr) - assert result.is_successful - - def test_timeout_slurm_job_with_non_zero_exit_fails(self, base_tr: TestRun) -> None: - base_tr.output_path.mkdir(parents=True, exist_ok=True) - self._write_slurm_metadata(base_tr.output_path, state="TIMEOUT", exit_code="15:0") - (base_tr.output_path / "stdout.txt").write_text( - "[2026-01-16 07:32:39] iteration 6/100 | elapsed time per iteration (ms): 15639.0 | " - "throughput per GPU (TFLOP/s/GPU): 494.6 |\n" - ) - - result = self.megatron_tdef.was_run_successful(base_tr) - assert not result.is_successful - assert "non-zero exit 
code" in result.error_message - - def test_completed_slurm_job_without_iteration_metrics_fails(self, base_tr: TestRun) -> None: - base_tr.output_path.mkdir(parents=True, exist_ok=True) - self._write_slurm_metadata(base_tr.output_path, state="COMPLETED") - (base_tr.output_path / "stdout.txt").write_text("training started\n") - - result = self.megatron_tdef.was_run_successful(base_tr) - assert not result.is_successful - assert "does not contain Megatron iteration metrics" in result.error_message diff --git a/tests/workloads/megatron_run/test_megatron_run.py b/tests/workloads/megatron_run/test_megatron_run.py new file mode 100644 index 000000000..766bd7191 --- /dev/null +++ b/tests/workloads/megatron_run/test_megatron_run.py @@ -0,0 +1,86 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from pathlib import Path + +import pytest +import toml + +from cloudai import TestRun +from cloudai.systems.slurm import SlurmJobMetadata +from cloudai.workloads.megatron_run import MegatronRunCmdArgs, MegatronRunTestDefinition + + +class TestMegatronRunSuccessCheck: + def setup_method(self) -> None: + self.megatron_tdef = MegatronRunTestDefinition( + name="m", + description="d", + test_template_name="MegatronRun", + cmd_args=MegatronRunCmdArgs(docker_image_url="http://url", run_script=Path(__file__)), + ) + + def _write_slurm_metadata(self, output_path: Path, *, exit_code: str = "0:0") -> None: + with (output_path / "slurm-job.toml").open("w", encoding="utf-8") as file: + toml.dump( + SlurmJobMetadata( + job_id=123, + name="megatron", + state="WHATEVER", + exit_code=exit_code, + start_time="2026-03-22T11:44:22", + end_time="2026-03-22T11:54:22", + elapsed_time_sec=600, + srun_cmd="srun test", + test_cmd="python pretrain_gpt.py", + job_root=output_path, + job_steps=[], + ).model_dump(), + file, + ) + + def test_missing_slurm_metadata_fails(self, base_tr: TestRun) -> None: + result = self.megatron_tdef.was_run_successful(base_tr) + assert not result.is_successful + assert "slurm-job.toml file not found" in result.error_message + + @pytest.mark.parametrize( + ("exit_code", "log_text", "is_successful"), + ( + ("0:0", "bla", False), + ( + "1:0", + "[2026-01-16 07:32:39] iteration 6/100 | elapsed time per iteration (ms): 15639.0 | " + "throughput per GPU (TFLOP/s/GPU): 494.6 |\n", + False, + ), + ( + "0:0", + "[2026-01-16 07:32:39] iteration 6/100 | elapsed time per iteration (ms): 15639.0 | " + "throughput per GPU (TFLOP/s/GPU): 494.6 |\n", + True, + ), + ("0:0", "validation loss at iteration 1.0", True), + ("15:0", "validation loss at iteration 1.0", False), + ), + ) + def test_is_run_successful(self, base_tr: TestRun, exit_code: str, log_text: str, is_successful: bool) -> None: + base_tr.output_path.mkdir(parents=True, exist_ok=True) + 
self._write_slurm_metadata(base_tr.output_path, exit_code=exit_code) + (base_tr.output_path / "stdout.txt").write_text(log_text) + + result = self.megatron_tdef.was_run_successful(base_tr) + assert result.is_successful is is_successful From 85f5b9e0dacad61b5096413953a0d95536badf94 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 14 Apr 2026 17:19:56 +0200 Subject: [PATCH 4/5] one more test case --- tests/workloads/megatron_run/test_megatron_run.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/workloads/megatron_run/test_megatron_run.py b/tests/workloads/megatron_run/test_megatron_run.py index 766bd7191..b439e1575 100644 --- a/tests/workloads/megatron_run/test_megatron_run.py +++ b/tests/workloads/megatron_run/test_megatron_run.py @@ -74,6 +74,7 @@ def test_missing_slurm_metadata_fails(self, base_tr: TestRun) -> None: True, ), ("0:0", "validation loss at iteration 1.0", True), + ("0:15", "validation loss at iteration 1.0", True), ("15:0", "validation loss at iteration 1.0", False), ), ) From 3bfa9ddb52363f49855d15c9e4da16eb1fbefaa0 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 14 Apr 2026 17:38:36 +0200 Subject: [PATCH 5/5] update error message --- src/cloudai/workloads/megatron_run/megatron_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cloudai/workloads/megatron_run/megatron_run.py b/src/cloudai/workloads/megatron_run/megatron_run.py index 9f50acea0..1c9d23096 100644 --- a/src/cloudai/workloads/megatron_run/megatron_run.py +++ b/src/cloudai/workloads/megatron_run/megatron_run.py @@ -172,6 +172,6 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult: is_successful=False, error_message=( f"stdout.txt in {tr.output_path} does not contain Megatron iteration metrics. " - "Expected at least one line with elapsed time per iteration and throughput per GPU." + "Expected at least one line with elapsed time per iteration and throughput per GPU or validation loss." ), )