From a1790f265218227cedec31d0a16e21153d548e2f Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 3 Dec 2024 21:06:20 -0500 Subject: [PATCH] replace tensorboard checks with helper function (#2120) [skip ci] * replace tensorboard checks with helper function * move helper function * use relative --- tests/e2e/multigpu/test_eval.py | 19 ++++----------- tests/e2e/patched/test_fa_xentropy.py | 12 ++++------ tests/e2e/patched/test_unsloth_qlora.py | 31 +++++++++---------------- tests/e2e/test_embeddings_lr.py | 22 ++++++------------ tests/e2e/test_packing_loss.py | 12 ++++------ tests/e2e/test_relora_llama.py | 13 ++++------- tests/e2e/utils.py | 15 ++++++++++++ 7 files changed, 50 insertions(+), 74 deletions(-) diff --git a/tests/e2e/multigpu/test_eval.py b/tests/e2e/multigpu/test_eval.py index c40a9edcce..09561bf265 100644 --- a/tests/e2e/multigpu/test_eval.py +++ b/tests/e2e/multigpu/test_eval.py @@ -7,12 +7,11 @@ import yaml from accelerate.test_utils import execute_subprocess_async -from tbparse import SummaryReader from transformers.testing_utils import get_torch_dist_unique_port from axolotl.utils.dict import DictDefault -from ..utils import most_recent_subdir +from ..utils import check_tensorboard LOG = logging.getLogger("axolotl.tests.e2e.multigpu") os.environ["WANDB_DISABLED"] = "true" @@ -91,12 +90,8 @@ def test_eval_sample_packing(self, temp_dir): str(Path(temp_dir) / "config.yaml"), ] ) - tb_log_path = most_recent_subdir(temp_dir + "/runs") - event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0]) - reader = SummaryReader(event_file) - df = reader.scalars # pylint: disable=invalid-name - df = df[(df.tag == "eval/loss")] # pylint: disable=invalid-name - assert df.value.values[-1] < 2.5, "Loss is too high" + + check_tensorboard(temp_dir + "/runs", "eval/loss", 2.5, "Eval Loss is too high") def test_eval(self, temp_dir): # pylint: disable=duplicate-code @@ -164,9 +159,5 @@ def test_eval(self, temp_dir): str(Path(temp_dir) / "config.yaml"), ] ) - tb_log_path = most_recent_subdir(temp_dir + "/runs") - event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0]) - reader = SummaryReader(event_file) - df = reader.scalars # pylint: disable=invalid-name - df = df[(df.tag == "eval/loss")] # pylint: disable=invalid-name - assert df.value.values[-1] < 2.9, "Loss is too high" + + check_tensorboard(temp_dir + "/runs", "eval/loss", 2.9, "Eval Loss is too high") diff --git a/tests/e2e/patched/test_fa_xentropy.py b/tests/e2e/patched/test_fa_xentropy.py index 76ea1a9348..effcb39c7d 100644 --- a/tests/e2e/patched/test_fa_xentropy.py +++ b/tests/e2e/patched/test_fa_xentropy.py @@ -8,7 +8,6 @@ from pathlib import Path import pytest -from tbparse import SummaryReader from transformers.utils import is_torch_bf16_gpu_available from axolotl.cli import load_datasets @@ -17,7 +16,7 @@ from axolotl.utils.config import normalize_config from axolotl.utils.dict import DictDefault -from ..utils import most_recent_subdir +from ..utils import check_tensorboard LOG = logging.getLogger("axolotl.tests.e2e") os.environ["WANDB_DISABLED"] = "true" @@ -94,9 +93,6 @@ def test_lora_packing_fa_cross_entropy(self, temp_dir, gradient_accumulation_ste train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta) assert (Path(temp_dir) / "adapter_model.bin").exists() - tb_log_path = most_recent_subdir(temp_dir + "/runs") - event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0]) - reader = SummaryReader(event_file) - df = reader.scalars # pylint: disable=invalid-name - df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name - assert df.value.values[-1] < 1.5, "Loss is too high" + check_tensorboard( + temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss is too high" + ) diff --git a/tests/e2e/patched/test_unsloth_qlora.py b/tests/e2e/patched/test_unsloth_qlora.py index 3d7e794f1c..8e0d03380f 100644 --- a/tests/e2e/patched/test_unsloth_qlora.py +++ b/tests/e2e/patched/test_unsloth_qlora.py @@ -6,8 +6,6 @@ from pathlib import Path import pytest -from e2e.utils import most_recent_subdir -from tbparse import SummaryReader from axolotl.cli import load_datasets from axolotl.common.cli import TrainerCliArgs @@ -15,6 +13,8 @@ from axolotl.utils.config import normalize_config from axolotl.utils.dict import DictDefault +from ..utils import check_tensorboard + LOG = logging.getLogger("axolotl.tests.e2e") os.environ["WANDB_DISABLED"] = "true" @@ -73,12 +73,9 @@ def test_unsloth_llama_qlora_fa2(self, temp_dir, sample_packing): train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta) assert (Path(temp_dir) / "adapter_model.bin").exists() - tb_log_path = most_recent_subdir(temp_dir + "/runs") - event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0]) - reader = SummaryReader(event_file) - df = reader.scalars # pylint: disable=invalid-name - df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name - assert df.value.values[-1] < 2.0, "Loss is too high" + check_tensorboard( + temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + ) def test_unsloth_llama_qlora_unpacked(self, temp_dir): cfg = DictDefault( @@ -123,12 +120,9 @@ def test_unsloth_llama_qlora_unpacked(self, temp_dir): train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta) assert (Path(temp_dir) / "adapter_model.bin").exists() - tb_log_path = most_recent_subdir(temp_dir + "/runs") - event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0]) - reader = SummaryReader(event_file) - df = reader.scalars # pylint: disable=invalid-name - df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name - assert df.value.values[-1] < 2.0, "Loss is too high" + check_tensorboard( + temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + ) @pytest.mark.parametrize( "sdp_attention", @@ -178,9 +172,6 @@ def test_unsloth_llama_qlora_unpacked_no_fa2_fp16(self, temp_dir, sdp_attention) train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta) assert (Path(temp_dir) / "adapter_model.bin").exists() - tb_log_path = most_recent_subdir(temp_dir + "/runs") - event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0]) - reader = SummaryReader(event_file) - df = reader.scalars # pylint: disable=invalid-name - df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name - assert df.value.values[-1] < 2.0, "Loss is too high" + check_tensorboard( + temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + ) diff --git a/tests/e2e/test_embeddings_lr.py b/tests/e2e/test_embeddings_lr.py index bc406caf30..6e5ebd05f7 100644 --- a/tests/e2e/test_embeddings_lr.py +++ b/tests/e2e/test_embeddings_lr.py @@ -7,15 +7,13 @@ import unittest from pathlib import Path -from tbparse import SummaryReader - from axolotl.cli import load_datasets from axolotl.common.cli import TrainerCliArgs from axolotl.train import train from axolotl.utils.config import normalize_config from axolotl.utils.dict import DictDefault -from .utils import most_recent_subdir, with_temp_dir +from .utils import check_tensorboard, with_temp_dir LOG = logging.getLogger("axolotl.tests.e2e") os.environ["WANDB_DISABLED"] = "true" @@ -66,12 +64,9 @@ def test_train_w_embedding_lr_scale(self, temp_dir): train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta) assert (Path(temp_dir) / "model.safetensors").exists() - tb_log_path = most_recent_subdir(temp_dir + "/runs") - event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0]) - reader = SummaryReader(event_file) - df = reader.scalars # pylint: disable=invalid-name - df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name - assert df.value.values[-1] < 2.0, "Loss is too high" + check_tensorboard( + temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high" + ) @with_temp_dir def test_train_w_embedding_lr(self, temp_dir): @@ -113,9 +108,6 @@ def test_train_w_embedding_lr(self, temp_dir): train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta) assert (Path(temp_dir) / "model.safetensors").exists() - tb_log_path = most_recent_subdir(temp_dir + "/runs") - event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0]) - reader = SummaryReader(event_file) - df = reader.scalars # pylint: disable=invalid-name - df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name - assert df.value.values[-1] < 2.0, "Loss is too high" + check_tensorboard( + temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high" + ) diff --git a/tests/e2e/test_packing_loss.py b/tests/e2e/test_packing_loss.py index 60f1673814..dd0af32f3c 100644 --- a/tests/e2e/test_packing_loss.py +++ b/tests/e2e/test_packing_loss.py @@ -6,7 +6,6 @@ import os import unittest -from tbparse import SummaryReader from transformers.utils import is_torch_bf16_gpu_available from axolotl.cli import load_datasets @@ -15,7 +14,7 @@ from axolotl.utils.config import normalize_config from axolotl.utils.dict import DictDefault -from .utils import most_recent_subdir, with_temp_dir +from .utils import check_tensorboard, with_temp_dir LOG = logging.getLogger("axolotl.tests.e2e") os.environ["WANDB_DISABLED"] = "true" @@ -66,9 +65,6 @@ def test_loss_packed(self, temp_dir): train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta) - tb_log_path = most_recent_subdir(temp_dir + "/runs") - event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0]) - reader = SummaryReader(event_file) - df = reader.scalars # pylint: disable=invalid-name - df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name - assert df.value.values[-1] < 2.0, "Loss is too high" + check_tensorboard( + temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + ) diff --git a/tests/e2e/test_relora_llama.py b/tests/e2e/test_relora_llama.py index 56c2204677..84582896dc 100644 --- a/tests/e2e/test_relora_llama.py +++ b/tests/e2e/test_relora_llama.py @@ -7,15 +7,13 @@ import unittest from pathlib import Path -from tbparse import SummaryReader - from axolotl.cli import load_datasets from axolotl.common.cli import TrainerCliArgs from axolotl.train import train from axolotl.utils.config import normalize_config from axolotl.utils.dict import DictDefault -from .utils import most_recent_subdir, with_temp_dir +from .utils import check_tensorboard, with_temp_dir LOG = logging.getLogger("axolotl.tests.e2e") os.environ["WANDB_DISABLED"] = "true" @@ -85,9 +83,6 @@ def test_relora(self, temp_dir): ).exists() assert (Path(temp_dir) / "checkpoint-100/relora/model.safetensors").exists() - tb_log_path = most_recent_subdir(temp_dir + "/runs") - event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0]) - reader = SummaryReader(event_file) - df = reader.scalars # pylint: disable=invalid-name - df = df[(df.tag == "train/grad_norm")] # pylint: disable=invalid-name - assert df.value.values[-1] < 0.2, "grad_norm is too high" + check_tensorboard( + temp_dir + "/runs", "train/grad_norm", 0.2, "grad_norm is too high" + ) diff --git a/tests/e2e/utils.py b/tests/e2e/utils.py index 92e647e678..de5b599a13 100644 --- a/tests/e2e/utils.py +++ b/tests/e2e/utils.py @@ -12,6 +12,7 @@ # from importlib.metadata import version from packaging import version +from tbparse import SummaryReader def with_temp_dir(test_func): @@ -66,3 +67,17 @@ def is_min_2_5_1(): def is_hopper(): compute_capability = torch.cuda.get_device_capability() return compute_capability == (9, 0) + + +def check_tensorboard( + temp_run_dir: str, tag: str, lt_val: float, assertion_err: str +) -> None: + """ + helper function to parse and check tensorboard logs + """ + tb_log_path = most_recent_subdir(temp_run_dir) + event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0]) + reader = SummaryReader(event_file) + df = reader.scalars # pylint: disable=invalid-name + df = df[(df.tag == tag)] # pylint: disable=invalid-name + assert df.value.values[-1] < lt_val, assertion_err