Skip to content

Commit

Permalink
replace tensorboard checks with helper function (#2120) [skip ci]
Browse files Browse the repository at this point in the history
* replace tensorboard checks with helper function

* move helper function

* use relative imports
  • Loading branch information
winglian authored Dec 4, 2024
1 parent 418ad2b commit a1790f2
Show file tree
Hide file tree
Showing 7 changed files with 50 additions and 74 deletions.
19 changes: 5 additions & 14 deletions tests/e2e/multigpu/test_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,11 @@

import yaml
from accelerate.test_utils import execute_subprocess_async
from tbparse import SummaryReader
from transformers.testing_utils import get_torch_dist_unique_port

from axolotl.utils.dict import DictDefault

from ..utils import most_recent_subdir
from ..utils import check_tensorboard

LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
os.environ["WANDB_DISABLED"] = "true"
Expand Down Expand Up @@ -91,12 +90,8 @@ def test_eval_sample_packing(self, temp_dir):
str(Path(temp_dir) / "config.yaml"),
]
)
tb_log_path = most_recent_subdir(temp_dir + "/runs")
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
reader = SummaryReader(event_file)
df = reader.scalars # pylint: disable=invalid-name
df = df[(df.tag == "eval/loss")] # pylint: disable=invalid-name
assert df.value.values[-1] < 2.5, "Loss is too high"

check_tensorboard(temp_dir + "/runs", "eval/loss", 2.5, "Eval Loss is too high")

def test_eval(self, temp_dir):
# pylint: disable=duplicate-code
Expand Down Expand Up @@ -164,9 +159,5 @@ def test_eval(self, temp_dir):
str(Path(temp_dir) / "config.yaml"),
]
)
tb_log_path = most_recent_subdir(temp_dir + "/runs")
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
reader = SummaryReader(event_file)
df = reader.scalars # pylint: disable=invalid-name
df = df[(df.tag == "eval/loss")] # pylint: disable=invalid-name
assert df.value.values[-1] < 2.9, "Loss is too high"

check_tensorboard(temp_dir + "/runs", "eval/loss", 2.9, "Eval Loss is too high")
12 changes: 4 additions & 8 deletions tests/e2e/patched/test_fa_xentropy.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from pathlib import Path

import pytest
from tbparse import SummaryReader
from transformers.utils import is_torch_bf16_gpu_available

from axolotl.cli import load_datasets
Expand All @@ -17,7 +16,7 @@
from axolotl.utils.config import normalize_config
from axolotl.utils.dict import DictDefault

from ..utils import most_recent_subdir
from ..utils import check_tensorboard

LOG = logging.getLogger("axolotl.tests.e2e")
os.environ["WANDB_DISABLED"] = "true"
Expand Down Expand Up @@ -94,9 +93,6 @@ def test_lora_packing_fa_cross_entropy(self, temp_dir, gradient_accumulation_ste
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
assert (Path(temp_dir) / "adapter_model.bin").exists()

tb_log_path = most_recent_subdir(temp_dir + "/runs")
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
reader = SummaryReader(event_file)
df = reader.scalars # pylint: disable=invalid-name
df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name
assert df.value.values[-1] < 1.5, "Loss is too high"
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss is too high"
)
31 changes: 11 additions & 20 deletions tests/e2e/patched/test_unsloth_qlora.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@
from pathlib import Path

import pytest
from e2e.utils import most_recent_subdir
from tbparse import SummaryReader

from axolotl.cli import load_datasets
from axolotl.common.cli import TrainerCliArgs
from axolotl.train import train
from axolotl.utils.config import normalize_config
from axolotl.utils.dict import DictDefault

from ..utils import check_tensorboard

LOG = logging.getLogger("axolotl.tests.e2e")
os.environ["WANDB_DISABLED"] = "true"

Expand Down Expand Up @@ -73,12 +73,9 @@ def test_unsloth_llama_qlora_fa2(self, temp_dir, sample_packing):
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
assert (Path(temp_dir) / "adapter_model.bin").exists()

tb_log_path = most_recent_subdir(temp_dir + "/runs")
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
reader = SummaryReader(event_file)
df = reader.scalars # pylint: disable=invalid-name
df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name
assert df.value.values[-1] < 2.0, "Loss is too high"
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
)

def test_unsloth_llama_qlora_unpacked(self, temp_dir):
cfg = DictDefault(
Expand Down Expand Up @@ -123,12 +120,9 @@ def test_unsloth_llama_qlora_unpacked(self, temp_dir):
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
assert (Path(temp_dir) / "adapter_model.bin").exists()

tb_log_path = most_recent_subdir(temp_dir + "/runs")
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
reader = SummaryReader(event_file)
df = reader.scalars # pylint: disable=invalid-name
df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name
assert df.value.values[-1] < 2.0, "Loss is too high"
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
)

@pytest.mark.parametrize(
"sdp_attention",
Expand Down Expand Up @@ -178,9 +172,6 @@ def test_unsloth_llama_qlora_unpacked_no_fa2_fp16(self, temp_dir, sdp_attention)
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
assert (Path(temp_dir) / "adapter_model.bin").exists()

tb_log_path = most_recent_subdir(temp_dir + "/runs")
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
reader = SummaryReader(event_file)
df = reader.scalars # pylint: disable=invalid-name
df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name
assert df.value.values[-1] < 2.0, "Loss is too high"
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
)
22 changes: 7 additions & 15 deletions tests/e2e/test_embeddings_lr.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,13 @@
import unittest
from pathlib import Path

from tbparse import SummaryReader

from axolotl.cli import load_datasets
from axolotl.common.cli import TrainerCliArgs
from axolotl.train import train
from axolotl.utils.config import normalize_config
from axolotl.utils.dict import DictDefault

from .utils import most_recent_subdir, with_temp_dir
from .utils import check_tensorboard, with_temp_dir

LOG = logging.getLogger("axolotl.tests.e2e")
os.environ["WANDB_DISABLED"] = "true"
Expand Down Expand Up @@ -66,12 +64,9 @@ def test_train_w_embedding_lr_scale(self, temp_dir):
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
assert (Path(temp_dir) / "model.safetensors").exists()

tb_log_path = most_recent_subdir(temp_dir + "/runs")
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
reader = SummaryReader(event_file)
df = reader.scalars # pylint: disable=invalid-name
df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name
assert df.value.values[-1] < 2.0, "Loss is too high"
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
)

@with_temp_dir
def test_train_w_embedding_lr(self, temp_dir):
Expand Down Expand Up @@ -113,9 +108,6 @@ def test_train_w_embedding_lr(self, temp_dir):
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
assert (Path(temp_dir) / "model.safetensors").exists()

tb_log_path = most_recent_subdir(temp_dir + "/runs")
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
reader = SummaryReader(event_file)
df = reader.scalars # pylint: disable=invalid-name
df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name
assert df.value.values[-1] < 2.0, "Loss is too high"
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
)
12 changes: 4 additions & 8 deletions tests/e2e/test_packing_loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import os
import unittest

from tbparse import SummaryReader
from transformers.utils import is_torch_bf16_gpu_available

from axolotl.cli import load_datasets
Expand All @@ -15,7 +14,7 @@
from axolotl.utils.config import normalize_config
from axolotl.utils.dict import DictDefault

from .utils import most_recent_subdir, with_temp_dir
from .utils import check_tensorboard, with_temp_dir

LOG = logging.getLogger("axolotl.tests.e2e")
os.environ["WANDB_DISABLED"] = "true"
Expand Down Expand Up @@ -66,9 +65,6 @@ def test_loss_packed(self, temp_dir):

train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)

tb_log_path = most_recent_subdir(temp_dir + "/runs")
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
reader = SummaryReader(event_file)
df = reader.scalars # pylint: disable=invalid-name
df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name
assert df.value.values[-1] < 2.0, "Loss is too high"
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
)
13 changes: 4 additions & 9 deletions tests/e2e/test_relora_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,13 @@
import unittest
from pathlib import Path

from tbparse import SummaryReader

from axolotl.cli import load_datasets
from axolotl.common.cli import TrainerCliArgs
from axolotl.train import train
from axolotl.utils.config import normalize_config
from axolotl.utils.dict import DictDefault

from .utils import most_recent_subdir, with_temp_dir
from .utils import check_tensorboard, with_temp_dir

LOG = logging.getLogger("axolotl.tests.e2e")
os.environ["WANDB_DISABLED"] = "true"
Expand Down Expand Up @@ -85,9 +83,6 @@ def test_relora(self, temp_dir):
).exists()
assert (Path(temp_dir) / "checkpoint-100/relora/model.safetensors").exists()

tb_log_path = most_recent_subdir(temp_dir + "/runs")
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
reader = SummaryReader(event_file)
df = reader.scalars # pylint: disable=invalid-name
df = df[(df.tag == "train/grad_norm")] # pylint: disable=invalid-name
assert df.value.values[-1] < 0.2, "grad_norm is too high"
check_tensorboard(
temp_dir + "/runs", "train/grad_norm", 0.2, "grad_norm is too high"
)
15 changes: 15 additions & 0 deletions tests/e2e/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

# from importlib.metadata import version
from packaging import version
from tbparse import SummaryReader


def with_temp_dir(test_func):
Expand Down Expand Up @@ -66,3 +67,17 @@ def is_min_2_5_1():
def is_hopper():
    """
    check whether the current CUDA device reports compute capability 9.0

    Returns:
        bool: True when CUDA is available and the current device's compute
        capability is exactly ``(9, 0)``; False otherwise, including on
        machines without a usable CUDA device.
    """
    # get_device_capability() raises when CUDA cannot be initialised, so
    # answer False up front instead of crashing on CPU-only machines.
    if not torch.cuda.is_available():
        return False
    return torch.cuda.get_device_capability() == (9, 0)


def check_tensorboard(
    temp_run_dir: str, tag: str, lt_val: float, assertion_err: str
) -> None:
    """
    helper function to parse and check tensorboard logs

    Picks the sub-directory of ``temp_run_dir`` selected by
    ``most_recent_subdir``, reads the first file in it (in sorted name
    order) with ``SummaryReader`` and asserts that the last scalar logged
    under ``tag`` is strictly less than ``lt_val``.

    Args:
        temp_run_dir: directory containing the tensorboard run
            sub-directories (callers pass ``<output_dir>/runs``).
        tag: scalar tag to check, e.g. ``"train/train_loss"``.
        lt_val: exclusive upper bound for the last logged value.
        assertion_err: message used when the value is not below ``lt_val``.

    Raises:
        AssertionError: if the run directory contains no files, if no
            scalar was logged under ``tag``, or if the last logged value is
            not below ``lt_val``.
    """
    tb_log_path = most_recent_subdir(temp_run_dir)

    log_files = sorted(os.listdir(tb_log_path))
    assert log_files, f"No tensorboard event files found in {tb_log_path}"
    event_file = os.path.join(tb_log_path, log_files[0])

    scalars = SummaryReader(event_file).scalars
    # NOTE(review): an event file with no scalars may yield a frame without
    # a "tag" column -- guard so the failure below is explicit either way.
    if "tag" in scalars.columns:
        values = scalars[scalars.tag == tag].value.values
    else:
        values = []
    assert len(values) > 0, f"No scalar values logged for tag '{tag}' in {event_file}"

    # Only the final logged value is compared against the threshold.
    last_value = values[-1]
    assert (
        last_value < lt_val
    ), f"{assertion_err} (last {tag} = {last_value}, expected < {lt_val})"

0 comments on commit a1790f2

Please sign in to comment.