From f394edb087ecd086344b54ed34051baa571a86bb Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Thu, 7 Dec 2023 10:23:05 -0800
Subject: [PATCH 1/6] Remove tests and support for torch <2.1

---
 .github/workflows/pr-cpu.yaml                            | 4 ----
 .github/workflows/pr-gpu.yaml                            | 5 -----
 llmfoundry/data/packing.py                               | 3 +--
 setup.py                                                 | 2 +-
 tests/a_scripts/inference/test_convert_composer_to_hf.py | 4 ++--
 tests/a_scripts/train/test_train.py                      | 5 ++---
 tests/data/test_dataloader.py                            | 8 ++++----
 tests/data/test_packing.py                               | 4 ++--
 tests/models/test_fsdp_act_checkpoint.py                 | 5 +----
 9 files changed, 13 insertions(+), 27 deletions(-)

diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml
index f57362ac82..c5bb4d641c 100644
--- a/.github/workflows/pr-cpu.yaml
+++ b/.github/workflows/pr-cpu.yaml
@@ -19,10 +19,6 @@ jobs:
     strategy:
       matrix:
         include:
-          - name: 'cpu-1.13.1'
-            container: mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04
-            markers: 'not gpu'
-            pytest_command: 'coverage run -m pytest'
           - name: 'cpu-2.1.0'
             container: mosaicml/pytorch:2.1.0_cpu-python3.10-ubuntu20.04
             markers: 'not gpu'
diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml
index 87ae173e77..7d9320f9e0 100644
--- a/.github/workflows/pr-gpu.yaml
+++ b/.github/workflows/pr-gpu.yaml
@@ -19,11 +19,6 @@ jobs:
     strategy:
       matrix:
         include:
-          - name: 'gpu-1.13.1'
-            container: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-            markers: 'gpu'
-            pytest_command: 'coverage run -m pytest'
-            deps_group: 'all'
           - name: 'gpu-2.1.0'
             container: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
             markers: 'gpu'
diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py
index 3fca0ade5e..45322c9b2f 100644
--- a/llmfoundry/data/packing.py
+++ b/llmfoundry/data/packing.py
@@ -5,7 +5,6 @@
 
 import numpy as np
 import torch
-from composer.utils import using_torch_2
 from omegaconf import DictConfig
 from transformers import PreTrainedTokenizerBase
 
@@ -348,7 +347,7 @@ def profile_packing(
     dataloader_cfg.dataset.packing_ratio = None
     dataloader_cfg.drop_last = False
     dataloader_cfg.num_workers = 0
-    dataloader_cfg.prefetch_factor = None if using_torch_2() else 2
+    dataloader_cfg.prefetch_factor = None
     dataloader_cfg.persistent_workers = False
 
     # Determine the packing_ratio values we'll try
diff --git a/setup.py b/setup.py
index 9bf2ef2cb0..a228105a4c 100644
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,7 @@
     'accelerate>=0.20,<0.21',  # for HF inference `device_map`
     'transformers>=4.34.1,<4.35',
     'mosaicml-streaming>=0.7.1,<0.8',
-    'torch>=1.13.1,<2.1.1',
+    'torch>=2.1,<2.1.1',
     'datasets>=2.14.5,<2.15',
     'fsspec==2023.6.0',  # newer version results in a bug in datasets that duplicates data
     'sentencepiece==0.1.97',
diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py
index 5c3d0f1830..94a2d66c6e 100644
--- a/tests/a_scripts/inference/test_convert_composer_to_hf.py
+++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py
@@ -14,7 +14,7 @@
 import transformers
 from composer import Trainer
 from composer.loggers import MLFlowLogger
-from composer.utils import dist, get_device, using_torch_2
+from composer.utils import dist, get_device
 from omegaconf import DictConfig
 from omegaconf import OmegaConf as om
 from torch.utils.data import DataLoader
@@ -497,7 +497,7 @@ def test_huggingface_conversion_callback(
         'drop_last': False,
         'num_workers': 0,
         'pin_memory': False,
-        'prefetch_factor': None if using_torch_2() else 2,
+        'prefetch_factor': None,
         'persistent_workers': False,
         'timeout': 0
     }
diff --git a/tests/a_scripts/train/test_train.py b/tests/a_scripts/train/test_train.py
index 62075383cc..a26b0c1879 100644
--- a/tests/a_scripts/train/test_train.py
+++ b/tests/a_scripts/train/test_train.py
@@ -6,7 +6,6 @@
 
 import pytest
 from composer.loggers import InMemoryLogger
-from composer.utils import using_torch_2
 from omegaconf import DictConfig, ListConfig
 from omegaconf import OmegaConf as om
 
@@ -36,10 +35,10 @@ def test_train_gauntlet(averages: Optional[dict], tmp_path: pathlib.Path):
     test_cfg.icl_subset_num_batches = 1
     test_cfg.eval_subset_num_batches = 2
     test_cfg.train_loader.num_workers = 0
-    test_cfg.train_loader.prefetch_factor = None if using_torch_2() else 2
+    test_cfg.train_loader.prefetch_factor = None
     test_cfg.train_loader.persistent_workers = False
     test_cfg.eval_loader.num_workers = 0
-    test_cfg.eval_loader.prefetch_factor = None if using_torch_2() else 2
+    test_cfg.eval_loader.prefetch_factor = None
     test_cfg.eval_loader.persistent_workers = False
 
     test_cfg.eval_gauntlet = DictConfig({
diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py
index 747021e82a..728376229b 100644
--- a/tests/data/test_dataloader.py
+++ b/tests/data/test_dataloader.py
@@ -13,7 +13,7 @@
 import pytest
 import torch
 import transformers
-from composer.utils import dist, using_torch_2
+from composer.utils import dist
 from omegaconf import DictConfig
 from omegaconf import OmegaConf as om
 from streaming import MDSWriter
@@ -272,7 +272,7 @@ def test_finetuning_dataloader(decoder_only_format: bool,
         'drop_last': False,
         'num_workers': 0,
         'pin_memory': False,
-        'prefetch_factor': None if using_torch_2() else 2,
+        'prefetch_factor': None,
         'persistent_workers': False,
         'timeout': 0
     }
@@ -569,7 +569,7 @@ def test_malformed_data(
         },
         'drop_last': False,
         'num_workers': 0,
-        'prefetch_factor': None if using_torch_2() else 2,
+        'prefetch_factor': None,
         'pin_memory': False,
         'persistent_workers': False,
         'timeout': 0
@@ -679,7 +679,7 @@ def test_token_counting_func_dataloader_setting(
     common_args = {
         'drop_last': False,
         'num_workers': 0,
-        'prefetch_factor': None if using_torch_2() else 2,
+        'prefetch_factor': None,
         'pin_memory': False,
         'persistent_workers': False,
         'timeout': 0
diff --git a/tests/data/test_packing.py b/tests/data/test_packing.py
index 73453b6782..a86d88f360 100644
--- a/tests/data/test_packing.py
+++ b/tests/data/test_packing.py
@@ -6,7 +6,7 @@
 
 import pytest
 import torch
-from composer.utils import dist, reproducibility, using_torch_2
+from composer.utils import dist, reproducibility
 from omegaconf import DictConfig
 from pytest import approx
 from torch.utils.data import DataLoader
@@ -172,7 +172,7 @@ def test_packing_with_dataloader(packing_ratio: Any):
         # Gets copied per worker and we cannot check the waste for child processes.
         'num_workers': 0,
         'pin_memory': False,
-        'prefetch_factor': None if using_torch_2() else 2,
+        'prefetch_factor': None,
         'persistent_workers': False,
         'timeout': 0,
     })
diff --git a/tests/models/test_fsdp_act_checkpoint.py b/tests/models/test_fsdp_act_checkpoint.py
index a7e41a3fc2..987ea5f2a7 100644
--- a/tests/models/test_fsdp_act_checkpoint.py
+++ b/tests/models/test_fsdp_act_checkpoint.py
@@ -5,7 +5,7 @@
 
 import pytest
 from composer import Trainer
-from composer.utils import get_device, using_torch_2
+from composer.utils import get_device
 from omegaconf import OmegaConf as om
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import \
     CheckpointWrapper
@@ -65,9 +65,6 @@ def test_fsdp_act_checkpoint(activation_checkpointing: bool,
         ]:
             module = trainer.state.model.model._fsdp_wrapped_module.transformer.blocks[
                 0]._fsdp_wrapped_module
-            if not using_torch_2():
-                module = trainer.state.model.model._fsdp_wrapped_module.transformer.blocks[
-                    0]._fsdp_wrapped_module._fpw_module
             assert isinstance(module, CheckpointWrapper)
         elif activation_checkpointing_target == [
                 'grouped_query_attention'
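A note on the recurring `prefetch_factor` change in the patch above: torch 2.x defaults DataLoader's `prefetch_factor` to `None` and requires it to stay `None` when `num_workers=0`, while torch 1.13 defaulted it to `2`, which is what the deleted `using_torch_2()` branches papered over. A minimal sketch of the behavior the tests now assume, under torch >= 2.1 (the toy dataset is illustrative, not from llm-foundry):

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    dataset = TensorDataset(torch.arange(8, dtype=torch.float32))

    # torch >= 2.1: prefetch_factor must be None when num_workers == 0,
    # because prefetching only applies to worker processes.
    loader = DataLoader(dataset, batch_size=2, num_workers=0, prefetch_factor=None)

    # With workers, it sets how many batches each worker fetches ahead.
    loader_mp = DataLoader(dataset, batch_size=2, num_workers=2, prefetch_factor=2)

    for (batch,) in loader:
        print(batch)  # tensors of shape (2,)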
From a89193b9d977b34c2088ec307eeaf7677749e245 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Thu, 7 Dec 2023 10:36:29 -0800
Subject: [PATCH 2/6] remove refs in readme

---
 README.md | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 4a4e60e844..f7b5148cf6 100644
--- a/README.md
+++ b/README.md
@@ -85,21 +85,14 @@ Something missing? Contribute with a PR!
 
 # Hardware and Software Requirements
 
-This codebase has been tested with PyTorch 1.13.1 and PyTorch 2.0.1 on systems with NVIDIA A100s and H100s.
+This codebase has been tested with PyTorch 2.1 on systems with NVIDIA A100s and H100s.
 This codebase may also work on systems with other devices, such as consumer NVIDIA cards and AMD cards, but we are not actively testing these systems.
 If you have success/failure using LLM Foundry on other systems, please let us know in a GitHub issue and we will update the support matrix!
 
 | Device         | Torch Version | Cuda Version | Status                       |
 | -------------- | ------------- | ------------ | ---------------------------- |
-| A100-40GB/80GB | 1.13.1        | 11.7         | :white_check_mark: Supported |
-| A100-40GB/80GB | 2.0.1         | 11.7, 11.8   | :white_check_mark: Supported |
-| A100-40GB/80GB | 2.1.0         | 11.8, 12.1   | :white_check_mark: Supported |
-| H100-80GB      | 1.13.1        | 11.7         | :x: Not Supported            |
-| H100-80GB      | 2.0.1         | 11.8         | :white_check_mark: Supported |
+| A100-40GB/80GB | 2.1.0         | 12.1         | :white_check_mark: Supported |
 | H100-80GB      | 2.1.0         | 12.1         | :white_check_mark: Supported |
-| A10-24GB       | 1.13.1        | 11.7         | :construction: In Progress   |
-| A10-24GB       | 2.0.1         | 11.7, 11.8   | :construction: In Progress   |
-| MI250          | 2.0.1         | ROCm 5.4     | :construction: In Progress   |
 
 ## MosaicML Docker Images
 We highly recommend using our prebuilt Docker images. You can find them here: https://hub.docker.com/orgs/mosaicml/repositories.
 
@@ -113,11 +106,7 @@ You can select a specific commit hash such as `mosaicml/llm-foundry:1.13.1_cu117
 
 | Docker Image                                           | Torch Version | Cuda Version      | LLM Foundry dependencies installed? |
 | ------------------------------------------------------ | ------------- | ----------------- | ----------------------------------- |
-| `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` | 1.13.1        | 11.7 (Infiniband) | No                                   |
-| `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04`  | 2.0.1         | 11.8 (Infiniband) | No                                   |
 | `mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04`  | 2.1.0         | 12.1 (Infiniband) | No                                   |
-| `mosaicml/llm-foundry:1.13.1_cu117-latest`             | 1.13.1        | 11.7 (Infiniband) | Yes                                  |
-| `mosaicml/llm-foundry:2.0.1_cu118-latest`              | 2.0.1         | 11.8 (Infiniband) | Yes                                  |
 | `mosaicml/llm-foundry:2.1.0_cu121-latest`              | 2.1.0         | 12.1 (Infiniband) | Yes (flash attention v1)             |
 | `mosaicml/llm-foundry:2.1.0_cu121_flash2-latest`       | 2.1.0         | 12.1 (Infiniband) | Yes (flash attention v2)             |
 | `mosaicml/llm-foundry:2.1.0_cu121_aws-latest`          | 2.1.0         | 12.1 (EFA)        | Yes (flash attention v1)             |
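The slimmed-down support matrix above pins expectations to torch 2.1.0 with CUDA 12.1 on A100s (sm_80) and H100s (sm_90). A hypothetical sanity check against that matrix, not part of llm-foundry:

    import torch

    def check_support_matrix() -> None:
        # Illustrative only; mirrors the README table, not an official API.
        assert torch.__version__.startswith('2.1'), torch.__version__
        assert torch.version.cuda == '12.1', torch.version.cuda
        if torch.cuda.is_available():
            capability = torch.cuda.get_device_capability()
            # A100 reports (8, 0); H100 reports (9, 0).
            assert capability in {(8, 0), (9, 0)}, capability

    check_support_matrix()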
From f48ab24c0345d93a8ae4fa9b2f9079e7ca79a049 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Thu, 7 Dec 2023 10:36:45 -0800
Subject: [PATCH 3/6] more uses of torch version check

---
 .github/workflows/docker.yaml |  9 ---------
 mcli/mcli-hf-eval.yaml        |  2 +-
 mcli/mcli-openai-eval.yaml    |  2 +-
 tests/optim/test_lion8b.py    | 14 +++-----------
 4 files changed, 5 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
index f6dac79fe5..50f24ee6fc 100644
--- a/.github/workflows/docker.yaml
+++ b/.github/workflows/docker.yaml
@@ -17,15 +17,6 @@ jobs:
     strategy:
       matrix:
        include:
-          - name: '1.13.1_cu117'
-            base_image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
-            dep_groups: '[gpu]'
-          - name: '2.0.1_cu118'
-            base_image: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04
-            dep_groups: '[gpu]'
-          - name: '2.1.0_cu121'
-            base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
-            dep_groups: '[gpu]'
           - name: '2.1.0_cu121_flash2'
             base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
             dep_groups: '[gpu-flash2]'
diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index 8c91dac1b4..b330ff6ec1 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -16,7 +16,7 @@ gpu_num: 8
 # gpu_type:
 # cluster:  # replace with your cluster here!
 
-image: mosaicml/llm-foundry:2.0.1_cu118-latest
+image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
 
 # The below is injected as a YAML file: /mnt/config/parameters.yaml
 parameters:
diff --git a/mcli/mcli-openai-eval.yaml b/mcli/mcli-openai-eval.yaml
index e7c26d7ae6..179b078fb6 100644
--- a/mcli/mcli-openai-eval.yaml
+++ b/mcli/mcli-openai-eval.yaml
@@ -16,7 +16,7 @@ run_name: openai-eval
 # gpu_type:
 # cluster:  # replace with your cluster here!
 
-image: mosaicml/llm-foundry:2.0.1_cu118-latest
+image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
 
 # The below is injected as a YAML file: /mnt/config/parameters.yaml
 parameters:
diff --git a/tests/optim/test_lion8b.py b/tests/optim/test_lion8b.py
index d5b284b23c..3a31ff87b5 100644
--- a/tests/optim/test_lion8b.py
+++ b/tests/optim/test_lion8b.py
@@ -14,15 +14,9 @@
 from torch.distributed import fsdp
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 
-if version.parse(torch.__version__) >= version.parse('2.0.1'):
-    from torch.distributed.fsdp.api import (  # type:ignore .api not in public API
-        FullOptimStateDictConfig, LocalOptimStateDictConfig,
-        ShardedOptimStateDictConfig)
-else:
-    from unittest.mock import MagicMock  # for pyright so vars aren't None
-    FullOptimStateDictConfig = MagicMock()
-    LocalOptimStateDictConfig = MagicMock()
-    ShardedOptimStateDictConfig = MagicMock()
+from torch.distributed.fsdp.api import (  # type:ignore .api not in public API
+    FullOptimStateDictConfig, LocalOptimStateDictConfig,
+    ShardedOptimStateDictConfig)
 
 from llmfoundry.optim import DecoupledLionW
 from llmfoundry.optim import DecoupledLionW_8bit as Lion8bit
@@ -420,8 +414,6 @@ def test_fsdp_save_load(dtype: torch.dtype, use_errors: bool,
         device = 'cuda'
     if torch.cuda.device_count() < 2:
         pytest.skip(f'This test requires 2+ GPUs.')
-    if version.parse(torch.__version__) < version.parse('2.0.1'):
-        pytest.skip(f'This test requires torch 2.0.1 or greater.')
 
     torch.cuda.set_device(f'cuda:{os.environ["RANK"]}')  # needed for fsdp
     if not dist.is_initialized():
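Context for the `test_lion8b.py` hunk above: with torch pinned to 2.1, the optim-state-dict config classes import cleanly from `torch.distributed.fsdp.api`, so the MagicMock fallback is dead code. A sketch of how these configs are typically wired into FSDP checkpointing, assuming an already-initialized process group and an FSDP-wrapped `model` (illustrative, not the test's exact code):

    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
    from torch.distributed.fsdp import StateDictType
    from torch.distributed.fsdp.api import (  # type:ignore .api not in public API
        FullOptimStateDictConfig, FullStateDictConfig)

    def use_full_state_dicts(model: FSDP) -> None:
        # Gather full (unsharded) model and optimizer state on rank 0 only.
        FSDP.set_state_dict_type(
            model,
            StateDictType.FULL_STATE_DICT,
            state_dict_config=FullStateDictConfig(rank0_only=True),
            optim_state_dict_config=FullOptimStateDictConfig(rank0_only=True),
        )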
From 0e29f8a53553a26f07c7558e3a309e46d110f7de Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Thu, 7 Dec 2023 10:37:38 -0800
Subject: [PATCH 4/6] precommit

---
 tests/optim/test_lion8b.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/optim/test_lion8b.py b/tests/optim/test_lion8b.py
index 3a31ff87b5..b421c6d250 100644
--- a/tests/optim/test_lion8b.py
+++ b/tests/optim/test_lion8b.py
@@ -6,14 +6,12 @@
 import warnings
 
 import numpy as np
-import packaging.version as version
 import pytest
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 from torch.distributed import fsdp
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-
 from torch.distributed.fsdp.api import (  # type:ignore .api not in public API
     FullOptimStateDictConfig, LocalOptimStateDictConfig,
     ShardedOptimStateDictConfig)

From 07ccfa9da85b0723dfc19c60c08faef69fb1078c Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Thu, 7 Dec 2023 10:38:29 -0800
Subject: [PATCH 5/6] fix

---
 .github/workflows/docker.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
index 50f24ee6fc..eb12868761 100644
--- a/.github/workflows/docker.yaml
+++ b/.github/workflows/docker.yaml
@@ -17,6 +17,9 @@ jobs:
     strategy:
       matrix:
        include:
+          - name: '2.1.0_cu121'
+            base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
+            dep_groups: '[gpu]'
           - name: '2.1.0_cu121_flash2'
             base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
             dep_groups: '[gpu-flash2]'
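Patches 3/6 and 4/6 delete the last runtime version gates; the `torch>=2.1,<2.1.1` pin added to `setup.py` in PATCH 1/6 now enforces this at install time instead. For reference, a hedged reconstruction of the kind of guard being removed (the helper name is illustrative, and this approximates, rather than reproduces, `composer.utils.using_torch_2`):

    import torch
    from packaging import version

    def meets_torch_minimum(minimum: str = '2.1.0') -> bool:
        # Same spirit as the deleted version.parse(...) checks in test_lion8b.py.
        return version.parse(torch.__version__) >= version.parse(minimum)

    assert meets_torch_minimum(), torch.__version__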
From d90831c4f6fc722116876108583ebd2defdbb083 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Thu, 7 Dec 2023 10:38:54 -0800
Subject: [PATCH 6/6] precommit

---
 .github/workflows/docker.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
index eb12868761..bb538dbe9b 100644
--- a/.github/workflows/docker.yaml
+++ b/.github/workflows/docker.yaml
@@ -17,8 +17,8 @@ jobs:
     strategy:
       matrix:
        include:
-          - name: '2.1.0_cu121'
-            base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
+          - name: '2.1.0_cu121'
+            base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04
            dep_groups: '[gpu]'
           - name: '2.1.0_cu121_flash2'
             base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04