From a1790f265218227cedec31d0a16e21153d548e2f Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Tue, 3 Dec 2024 21:06:20 -0500
Subject: [PATCH] replace tensorboard checks with helper function (#2120) [skip
 ci]

* replace tensorboard checks with helper function

* move helper function

* use relative import
---
 tests/e2e/multigpu/test_eval.py         | 19 ++++-----------
 tests/e2e/patched/test_fa_xentropy.py   | 12 ++++------
 tests/e2e/patched/test_unsloth_qlora.py | 31 +++++++++----------------
 tests/e2e/test_embeddings_lr.py         | 22 ++++++------------
 tests/e2e/test_packing_loss.py          | 12 ++++------
 tests/e2e/test_relora_llama.py          | 13 ++++-------
 tests/e2e/utils.py                      | 15 ++++++++++++
 7 files changed, 50 insertions(+), 74 deletions(-)

diff --git a/tests/e2e/multigpu/test_eval.py b/tests/e2e/multigpu/test_eval.py
index c40a9edcce..09561bf265 100644
--- a/tests/e2e/multigpu/test_eval.py
+++ b/tests/e2e/multigpu/test_eval.py
@@ -7,12 +7,11 @@
 
 import yaml
 from accelerate.test_utils import execute_subprocess_async
-from tbparse import SummaryReader
 from transformers.testing_utils import get_torch_dist_unique_port
 
 from axolotl.utils.dict import DictDefault
 
-from ..utils import most_recent_subdir
+from ..utils import check_tensorboard
 
 LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
 os.environ["WANDB_DISABLED"] = "true"
@@ -91,12 +90,8 @@ def test_eval_sample_packing(self, temp_dir):
                 str(Path(temp_dir) / "config.yaml"),
             ]
         )
-        tb_log_path = most_recent_subdir(temp_dir + "/runs")
-        event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
-        reader = SummaryReader(event_file)
-        df = reader.scalars  # pylint: disable=invalid-name
-        df = df[(df.tag == "eval/loss")]  # pylint: disable=invalid-name
-        assert df.value.values[-1] < 2.5, "Loss is too high"
+
+        check_tensorboard(temp_dir + "/runs", "eval/loss", 2.5, "Eval Loss is too high")
 
     def test_eval(self, temp_dir):
         # pylint: disable=duplicate-code
@@ -164,9 +159,5 @@ def test_eval(self, temp_dir):
                 str(Path(temp_dir) / "config.yaml"),
             ]
         )
-        tb_log_path = most_recent_subdir(temp_dir + "/runs")
-        event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
-        reader = SummaryReader(event_file)
-        df = reader.scalars  # pylint: disable=invalid-name
-        df = df[(df.tag == "eval/loss")]  # pylint: disable=invalid-name
-        assert df.value.values[-1] < 2.9, "Loss is too high"
+
+        check_tensorboard(temp_dir + "/runs", "eval/loss", 2.9, "Eval Loss is too high")
diff --git a/tests/e2e/patched/test_fa_xentropy.py b/tests/e2e/patched/test_fa_xentropy.py
index 76ea1a9348..effcb39c7d 100644
--- a/tests/e2e/patched/test_fa_xentropy.py
+++ b/tests/e2e/patched/test_fa_xentropy.py
@@ -8,7 +8,6 @@
 from pathlib import Path
 
 import pytest
-from tbparse import SummaryReader
 from transformers.utils import is_torch_bf16_gpu_available
 
 from axolotl.cli import load_datasets
@@ -17,7 +16,7 @@
 from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault
 
-from ..utils import most_recent_subdir
+from ..utils import check_tensorboard
 
 LOG = logging.getLogger("axolotl.tests.e2e")
 os.environ["WANDB_DISABLED"] = "true"
@@ -94,9 +93,6 @@ def test_lora_packing_fa_cross_entropy(self, temp_dir, gradient_accumulation_ste
         train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
         assert (Path(temp_dir) / "adapter_model.bin").exists()
 
-        tb_log_path = most_recent_subdir(temp_dir + "/runs")
-        event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
-        reader = SummaryReader(event_file)
-        df = reader.scalars  # pylint: disable=invalid-name
-        df = df[(df.tag == "train/train_loss")]  # pylint: disable=invalid-name
-        assert df.value.values[-1] < 1.5, "Loss is too high"
+        check_tensorboard(
+            temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss is too high"
+        )
diff --git a/tests/e2e/patched/test_unsloth_qlora.py b/tests/e2e/patched/test_unsloth_qlora.py
index 3d7e794f1c..8e0d03380f 100644
--- a/tests/e2e/patched/test_unsloth_qlora.py
+++ b/tests/e2e/patched/test_unsloth_qlora.py
@@ -6,8 +6,6 @@
 from pathlib import Path
 
 import pytest
-from e2e.utils import most_recent_subdir
-from tbparse import SummaryReader
 
 from axolotl.cli import load_datasets
 from axolotl.common.cli import TrainerCliArgs
@@ -15,6 +13,8 @@
 from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault
 
+from ..utils import check_tensorboard
+
 LOG = logging.getLogger("axolotl.tests.e2e")
 os.environ["WANDB_DISABLED"] = "true"
 
@@ -73,12 +73,9 @@ def test_unsloth_llama_qlora_fa2(self, temp_dir, sample_packing):
         train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
         assert (Path(temp_dir) / "adapter_model.bin").exists()
 
-        tb_log_path = most_recent_subdir(temp_dir + "/runs")
-        event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
-        reader = SummaryReader(event_file)
-        df = reader.scalars  # pylint: disable=invalid-name
-        df = df[(df.tag == "train/train_loss")]  # pylint: disable=invalid-name
-        assert df.value.values[-1] < 2.0, "Loss is too high"
+        check_tensorboard(
+            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
+        )
 
     def test_unsloth_llama_qlora_unpacked(self, temp_dir):
         cfg = DictDefault(
@@ -123,12 +120,9 @@ def test_unsloth_llama_qlora_unpacked(self, temp_dir):
         train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
         assert (Path(temp_dir) / "adapter_model.bin").exists()
 
-        tb_log_path = most_recent_subdir(temp_dir + "/runs")
-        event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
-        reader = SummaryReader(event_file)
-        df = reader.scalars  # pylint: disable=invalid-name
-        df = df[(df.tag == "train/train_loss")]  # pylint: disable=invalid-name
-        assert df.value.values[-1] < 2.0, "Loss is too high"
+        check_tensorboard(
+            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
+        )
 
     @pytest.mark.parametrize(
         "sdp_attention",
@@ -178,9 +172,6 @@ def test_unsloth_llama_qlora_unpacked_no_fa2_fp16(self, temp_dir, sdp_attention)
         train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
         assert (Path(temp_dir) / "adapter_model.bin").exists()
 
-        tb_log_path = most_recent_subdir(temp_dir + "/runs")
-        event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
-        reader = SummaryReader(event_file)
-        df = reader.scalars  # pylint: disable=invalid-name
-        df = df[(df.tag == "train/train_loss")]  # pylint: disable=invalid-name
-        assert df.value.values[-1] < 2.0, "Loss is too high"
+        check_tensorboard(
+            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
+        )
diff --git a/tests/e2e/test_embeddings_lr.py b/tests/e2e/test_embeddings_lr.py
index bc406caf30..6e5ebd05f7 100644
--- a/tests/e2e/test_embeddings_lr.py
+++ b/tests/e2e/test_embeddings_lr.py
@@ -7,15 +7,13 @@
 import unittest
 from pathlib import Path
 
-from tbparse import SummaryReader
-
 from axolotl.cli import load_datasets
 from axolotl.common.cli import TrainerCliArgs
 from axolotl.train import train
 from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault
 
-from .utils import most_recent_subdir, with_temp_dir
+from .utils import check_tensorboard, with_temp_dir
 
 LOG = logging.getLogger("axolotl.tests.e2e")
 os.environ["WANDB_DISABLED"] = "true"
@@ -66,12 +64,9 @@ def test_train_w_embedding_lr_scale(self, temp_dir):
         train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
         assert (Path(temp_dir) / "model.safetensors").exists()
 
-        tb_log_path = most_recent_subdir(temp_dir + "/runs")
-        event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
-        reader = SummaryReader(event_file)
-        df = reader.scalars  # pylint: disable=invalid-name
-        df = df[(df.tag == "train/train_loss")]  # pylint: disable=invalid-name
-        assert df.value.values[-1] < 2.0, "Loss is too high"
+        check_tensorboard(
+            temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
+        )
 
     @with_temp_dir
     def test_train_w_embedding_lr(self, temp_dir):
@@ -113,9 +108,6 @@ def test_train_w_embedding_lr(self, temp_dir):
         train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
         assert (Path(temp_dir) / "model.safetensors").exists()
 
-        tb_log_path = most_recent_subdir(temp_dir + "/runs")
-        event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
-        reader = SummaryReader(event_file)
-        df = reader.scalars  # pylint: disable=invalid-name
-        df = df[(df.tag == "train/train_loss")]  # pylint: disable=invalid-name
-        assert df.value.values[-1] < 2.0, "Loss is too high"
+        check_tensorboard(
+            temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
+        )
diff --git a/tests/e2e/test_packing_loss.py b/tests/e2e/test_packing_loss.py
index 60f1673814..dd0af32f3c 100644
--- a/tests/e2e/test_packing_loss.py
+++ b/tests/e2e/test_packing_loss.py
@@ -6,7 +6,6 @@
 import os
 import unittest
 
-from tbparse import SummaryReader
 from transformers.utils import is_torch_bf16_gpu_available
 
 from axolotl.cli import load_datasets
@@ -15,7 +14,7 @@
 from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault
 
-from .utils import most_recent_subdir, with_temp_dir
+from .utils import check_tensorboard, with_temp_dir
 
 LOG = logging.getLogger("axolotl.tests.e2e")
 os.environ["WANDB_DISABLED"] = "true"
@@ -66,9 +65,6 @@ def test_loss_packed(self, temp_dir):
 
         train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
 
-        tb_log_path = most_recent_subdir(temp_dir + "/runs")
-        event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
-        reader = SummaryReader(event_file)
-        df = reader.scalars  # pylint: disable=invalid-name
-        df = df[(df.tag == "train/train_loss")]  # pylint: disable=invalid-name
-        assert df.value.values[-1] < 2.0, "Loss is too high"
+        check_tensorboard(
+            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
+        )
diff --git a/tests/e2e/test_relora_llama.py b/tests/e2e/test_relora_llama.py
index 56c2204677..84582896dc 100644
--- a/tests/e2e/test_relora_llama.py
+++ b/tests/e2e/test_relora_llama.py
@@ -7,15 +7,13 @@
 import unittest
 from pathlib import Path
 
-from tbparse import SummaryReader
-
 from axolotl.cli import load_datasets
 from axolotl.common.cli import TrainerCliArgs
 from axolotl.train import train
 from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault
 
-from .utils import most_recent_subdir, with_temp_dir
+from .utils import check_tensorboard, with_temp_dir
 
 LOG = logging.getLogger("axolotl.tests.e2e")
 os.environ["WANDB_DISABLED"] = "true"
@@ -85,9 +83,6 @@ def test_relora(self, temp_dir):
         ).exists()
         assert (Path(temp_dir) / "checkpoint-100/relora/model.safetensors").exists()
 
-        tb_log_path = most_recent_subdir(temp_dir + "/runs")
-        event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
-        reader = SummaryReader(event_file)
-        df = reader.scalars  # pylint: disable=invalid-name
-        df = df[(df.tag == "train/grad_norm")]  # pylint: disable=invalid-name
-        assert df.value.values[-1] < 0.2, "grad_norm is too high"
+        check_tensorboard(
+            temp_dir + "/runs", "train/grad_norm", 0.2, "grad_norm is too high"
+        )
diff --git a/tests/e2e/utils.py b/tests/e2e/utils.py
index 92e647e678..de5b599a13 100644
--- a/tests/e2e/utils.py
+++ b/tests/e2e/utils.py
@@ -12,6 +12,7 @@
 
 # from importlib.metadata import version
 from packaging import version
+from tbparse import SummaryReader
 
 
 def with_temp_dir(test_func):
@@ -66,3 +67,17 @@ def is_min_2_5_1():
 def is_hopper():
     compute_capability = torch.cuda.get_device_capability()
     return compute_capability == (9, 0)
+
+
+def check_tensorboard(
+    temp_run_dir: str, tag: str, lt_val: float, assertion_err: str
+) -> None:
+    """
+    Assert that the last tensorboard scalar logged for `tag` is below `lt_val`
+    """
+    tb_log_path = most_recent_subdir(temp_run_dir)
+    event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
+    reader = SummaryReader(event_file)
+    df = reader.scalars  # pylint: disable=invalid-name
+    df = df[(df.tag == tag)]  # pylint: disable=invalid-name
+    assert df.value.values[-1] < lt_val, assertion_err  # last logged value only