Commit 71d4030: gradient accumulation tests, embeddings w pad_token fix, smaller models (#2059)

* add more test cases for gradient accumulation and fix zero3

* swap out for smaller model

* fix missing return

* fix missing pad_token in config

* support concurrency for multigpu testing

* cast empty deepspeed to empty string for zero3 check

* fix temp_dir as fixture so parametrize works properly

* fix test file for multigpu evals

* don't use default

* don't use default for fsdp_state_dict_type

* don't use llama tokenizer w smollm

* also automatically cancel multigpu for concurrency
winglian authored Nov 14, 2024
1 parent f3a5d11 commit 71d4030
Showing 8 changed files with 118 additions and 71 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/multi-gpu-e2e.yml
@@ -8,6 +8,11 @@ on:
   schedule:
     - cron: '0 0 * * 1,4' # Runs at 00:00 UTC every monday & thursday

+# Cancel jobs on the same ref if a new one is triggered
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
 jobs:
   test-axolotl-multigpu:
     if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
2 changes: 1 addition & 1 deletion cicd/multigpu.sh
@@ -2,4 +2,4 @@
 set -e

 # only run one test at a time so as not to OOM the GPU
-pytest -n1 /workspace/axolotl/tests/e2e/multigpu/
+pytest -v -n2 /workspace/axolotl/tests/e2e/multigpu/
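The script's comment about running one test at a time predates this change; the switch to -n2 is made safe by the per-worker port handling added in the tests below. A simplified sketch (illustrative only, not the actual transformers source) of how get_torch_dist_unique_port keeps two pytest-xdist workers from colliding:

# Simplified reimplementation of
# transformers.testing_utils.get_torch_dist_unique_port -- illustrative only.
import os

def unique_master_port(base: int = 29500) -> int:
    # pytest-xdist names its workers "gw0", "gw1", ...; offsetting the
    # default torch.distributed master port by the worker id means two
    # concurrent `accelerate launch` runs never bind the same port.
    worker = os.environ.get("PYTEST_XDIST_WORKER", "gw0")
    return base + int(worker.removeprefix("gw"))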
19 changes: 19 additions & 0 deletions src/axolotl/utils/config/models/input/v0_4_1/__init__.py
@@ -1291,6 +1291,25 @@ def check_use_reentrant_mismatch(cls, data):
             )
         return data

+    @model_validator(mode="before")
+    @classmethod
+    def warn_qlora_zero3_w_use_reentrant(cls, data):
+        if (
+            data.get("adapter") == "qlora"
+            and data.get("gradient_checkpointing_kwargs", {})
+            and data.get("gradient_checkpointing_kwargs", {}).get("use_reentrant")
+            is False
+            and "zero3" in data.get("deepspeed", "")
+        ):
+            # may result in:
+            # torch.utils.checkpoint.CheckpointError: torch.utils.checkpoint:
+            # Recomputed values for the following tensors have different metadata
+            # than during the forward pass.
+            LOG.warning(
+                "qlora + zero3 with use_reentrant: false may result in a CheckpointError about recomputed values"
+            )
+        return data
+
     @model_validator(mode="before")
     @classmethod
     def check_val_w_test_datasets(cls, data):
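For illustration (not part of the commit), a config fragment that would trip this new validator; the deepspeed path is hypothetical, and the check is reproduced standalone:

# Hypothetical axolotl config fragment hitting the new warning:
# qlora + ZeRO-3 + non-reentrant gradient checkpointing.
cfg = {
    "adapter": "qlora",
    "deepspeed": "deepspeed_configs/zero3_bf16.json",
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs": {"use_reentrant": False},
}

# The validator's condition, reproduced standalone:
would_warn = bool(
    cfg.get("adapter") == "qlora"
    and cfg.get("gradient_checkpointing_kwargs", {})
    and cfg.get("gradient_checkpointing_kwargs", {}).get("use_reentrant") is False
    and "zero3" in cfg.get("deepspeed", "")
)
print(would_warn)  # True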
1 change: 1 addition & 0 deletions src/axolotl/utils/models.py
@@ -238,6 +238,7 @@ def load_tokenizer(cfg):
                         x in cfg.lora_modules_to_save for x in lora_modules_to_save
                     )
                 )
+                and k != "pad_token"
             ):
                 lora_modules_to_save = ", ".join(
                     [f"`{x}`" for x in lora_modules_to_save]
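One plausible reading of the k != "pad_token" exemption: a pad token is often just an alias for an existing token, so no newly initialized embedding rows need training. An illustrative tokenizer setup (not taken from this commit; the model name matches the smollm swap mentioned in the commit message):

# Illustrative only: aliasing pad_token to an existing token id leaves the
# embedding matrix unchanged, so lora_modules_to_save need not include
# embed_tokens/lm_head for this special token.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-135M")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token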
16 changes: 16 additions & 0 deletions tests/e2e/conftest.py
@@ -0,0 +1,16 @@
+"""
+shared pytest fixtures
+"""
+import shutil
+import tempfile
+
+import pytest
+
+
+@pytest.fixture
+def temp_dir():
+    # Create a temporary directory
+    _temp_dir = tempfile.mkdtemp()
+    yield _temp_dir
+    # Clean up the directory after the test
+    shutil.rmtree(_temp_dir)
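Converting temp_dir from the old @with_temp_dir decorator into a fixture is what lets it compose with @pytest.mark.parametrize (per the commit message). A minimal sketch of the intended usage; the test name and parameter are illustrative:

import pytest

@pytest.mark.parametrize("gradient_accumulation_steps", [1, 2])
def test_example(temp_dir, gradient_accumulation_steps):
    # pytest injects the fixture by argument name and re-runs the test,
    # with a fresh temporary directory, for each parameter value.
    assert isinstance(temp_dir, str)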
12 changes: 6 additions & 6 deletions tests/e2e/multigpu/test_eval.py
@@ -3,28 +3,25 @@
 """
 import logging
 import os
-import unittest
 from pathlib import Path

 import yaml
 from accelerate.test_utils import execute_subprocess_async
+from transformers.testing_utils import get_torch_dist_unique_port

 from axolotl.utils.dict import DictDefault

-from ..utils import with_temp_dir
-
 LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
 os.environ["WANDB_DISABLED"] = "true"

 AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent


-class TestMultiGPUEval(unittest.TestCase):
+class TestMultiGPUEval:
     """
     Test case for MultiGPU Eval Sample Packing
     """

-    @with_temp_dir
     def test_eval_sample_packing(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
@@ -83,13 +80,14 @@ def test_eval_sample_packing(self, temp_dir):
                 "launch",
                 "--num-processes",
                 "2",
+                "--main_process_port",
+                f"{get_torch_dist_unique_port()}",
                 "-m",
                 "axolotl.cli.train",
                 str(Path(temp_dir) / "config.yaml"),
             ]
         )

-    @with_temp_dir
     def test_eval(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
@@ -148,6 +146,8 @@ def test_eval(self, temp_dir):
                 "launch",
                 "--num-processes",
                 "2",
+                "--main_process_port",
+                f"{get_torch_dist_unique_port()}",
                 "-m",
                 "axolotl.cli.train",
                 str(Path(temp_dir) / "config.yaml"),
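Taken together with cicd/multigpu.sh, two of these tests can now run side by side: pytest-xdist starts two workers, and each accelerate launch rendezvous happens on its own port. A hedged sketch of reproducing the CI invocation locally (assumes a 2-GPU machine with pytest-xdist installed):

# Illustrative local reproduction of the CI entrypoint (cicd/multigpu.sh).
import subprocess

subprocess.run(
    ["pytest", "-v", "-n2", "tests/e2e/multigpu/"],
    check=True,  # fail loudly, matching `set -e` in the shell script
)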
