From 3d4b57b3483f1dc53e2f292f9f77083aeb870c7f Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 20 Mar 2024 12:04:16 -0700 Subject: [PATCH 1/4] bump version to 021 --- composer/_version.py | 2 +- composer/core/state.py | 10 ---------- composer/core/time.py | 12 ------------ docker/README.md | 4 ++-- docker/build_matrix.yaml | 12 ++++++------ docker/generate_build_matrix.py | 2 +- docs/source/notes/distributed_training.rst | 14 +++++--------- 7 files changed, 15 insertions(+), 41 deletions(-) diff --git a/composer/_version.py b/composer/_version.py index fffe771b0c..ad813aba8b 100644 --- a/composer/_version.py +++ b/composer/_version.py @@ -3,4 +3,4 @@ """The Composer Version.""" -__version__ = '0.20.1' +__version__ = '0.21.0' diff --git a/composer/core/state.py b/composer/core/state.py index a21b142d42..37e679ec2d 100644 --- a/composer/core/state.py +++ b/composer/core/state.py @@ -790,16 +790,6 @@ def fsdp_state_dict_type(self): def fsdp_sharded_state_dict_enabled(self): return self.fsdp_config is not None and self.fsdp_enabled and self.fsdp_state_dict_type == 'sharded' - @property - def fsdp_elastic_sharded_enabled(self): - warnings.warn( - VersionedDeprecationWarning( - 'state.fsdp_elastic_sharded_enabled is deprecated.', - remove_version='0.21.0', - ), - ) - return self.fsdp_sharded_state_dict_enabled - @property def fsdp_device_mesh(self): if self.fsdp_enabled: diff --git a/composer/core/time.py b/composer/core/time.py index 35c17d74f9..ee6b3628ed 100644 --- a/composer/core/time.py +++ b/composer/core/time.py @@ -540,18 +540,6 @@ def state_dict(self) -> Dict[str, Any]: 'batch_wct': self.batch_wct, } - def get_state(self) -> Dict[str, Union[Time[int], datetime.timedelta]]: - """Returns all values of the timestamp object in a dictionary. - - Returns: - Dict[str, Union[Time[int], datetime.timedelta]]: All values of the timestamp object. - """ - warnings.warn( - VersionedDeprecationWarning('core.time.Timestamp.get_state is deprecated.', remove_version='0.21.0'), - ) - - return self.state_dict() - def load_state_dict(self, state: Dict[str, Any]) -> None: self._epoch = Time(state['epoch'], TimeUnit.EPOCH) self._batch = Time(state['batch'], TimeUnit.BATCH) diff --git a/docker/README.md b/docker/README.md index 73b4b1e13b..7e495ddecb 100644 --- a/docker/README.md +++ b/docker/README.md @@ -15,8 +15,8 @@ all dependencies for both NLP and Vision models. They are built on top of the | Composer Version | CUDA Support | Docker Tag | |--------------------|----------------|----------------------------------------------------------------| -| 0.20.1 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.20.1` | -| 0.20.1 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.20.1_cpu` | +| 0.21.0 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.21.0` | +| 0.21.0 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.21.0_cpu` | **Note**: For a lightweight installation, we recommended using a [MosaicML PyTorch Image](#pytorch-images) and manually diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 31e3e1ba27..9fa18e44eb 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -246,9 +246,9 @@ TORCHVISION_VERSION: 0.18.0 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 - COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.20.1 + COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.21.0 CUDA_VERSION: 12.1.1 - IMAGE_NAME: composer-0-20-1 + IMAGE_NAME: composer-0-21-0 MOFED_VERSION: 5.5-1.0.3.2 NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -269,15 +269,15 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 TAGS: - - mosaicml/composer:0.20.1 + - mosaicml/composer:0.21.0 - mosaicml/composer:latest TARGET: composer_stage TORCHVISION_VERSION: 0.16.2 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 - COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.20.1 + COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.21.0 CUDA_VERSION: '' - IMAGE_NAME: composer-0-20-1-cpu + IMAGE_NAME: composer-0-21-0-cpu MOFED_VERSION: 5.5-1.0.3.2 NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.10' @@ -285,7 +285,7 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 TAGS: - - mosaicml/composer:0.20.1_cpu + - mosaicml/composer:0.21.0_cpu - mosaicml/composer:latest_cpu TARGET: composer_stage TORCHVISION_VERSION: 0.16.2 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index a45c08228d..46cbec57cc 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -265,7 +265,7 @@ def _main(): composer_entries = [] # The `GIT_COMMIT` is a placeholder and Jenkins will substitute it with the actual git commit for the `composer_staging` images - composer_versions = ['0.20.1'] # Only build images for the latest composer version + composer_versions = ['0.21.0'] # Only build images for the latest composer version composer_python_versions = [PRODUCTION_PYTHON_VERSION] # just build composer against the latest for product in itertools.product(composer_python_versions, composer_versions, cuda_options): diff --git a/docs/source/notes/distributed_training.rst b/docs/source/notes/distributed_training.rst index cab087f3b8..99953545df 100644 --- a/docs/source/notes/distributed_training.rst +++ b/docs/source/notes/distributed_training.rst @@ -395,18 +395,14 @@ It does this by gathering the model state to the global rank 0 device, unflatten If `load_monolith_rank0_only=True`, then when loading checkpoints the global rank 0 device will load in the checkpoint file and scatter the model and optimizer state to the other ranks, which will will dramatically reduce the memory usage on system. Otherwise, all ranks will separately load in the checkpoint file. -2. :code:`state_dict_type='local'` -For save: each rank saves out the flattened model state shard they are -responsibile for to a distinct checkpoint file. For load, each rank loads in the checkpoint file -corresponding to their shard. **Note: state_dict_type='local' is deprecated in Composer for torch versions 2.0.0 or higher.** - -3. :code:`state_dict_type='sharded'` -Each rank saves out an unflattened shard. For loading, similar to ``state_dict_type='local'``, each rank -loads in the checkpoint file corresponding to their unflattened shard. **Note: state_dict_type='sharded' is the recommended setting for sharded checkpointing in Composer for torch versions 2.0.0 or higher.** +2. :code:`state_dict_type='sharded'` +Each rank saves out an unflattened shard. For loading, each rank loads in the checkpoint file +corresponding to their unflattened shard. +**Note: state_dict_type='sharded' is the recommended setting for sharded checkpointing in Composer for torch versions 2.0.0 or higher.** See `The FSDP docs `__ for more info. -If you use sharded checkpoints (`state_dict_type='sharded'` or `state_dict_type='local'`), your run will save as many files as you have +If you use sharded checkpoints (`state_dict_type='sharded'`), your run will save as many files as you have ranks at each checkpointing event (plus one metadata file for torch versions 2.0.0 or higher). This can quicky pollute your `save_folder` with a lot of files after a couple checkpointing events. To help keep your checkpoint shard files organized, Composer will save each set of shards in it's own prefix directory, which you can configure by using `'sharded_ckpt_prefix_dir'` (default value `sharded_ckpt_prefix_dir='ep{epoch}-ba{batch}'`). Checkpoint shards will be saved to From 77b2a1918e13ae28e4ea50838aeb11008befc471 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 20 Mar 2024 14:05:30 -0700 Subject: [PATCH 2/4] fix lint --- composer/core/state.py | 1 - composer/core/time.py | 2 -- composer/metrics/nlp.py | 2 +- composer/models/huggingface.py | 2 +- docs/source/notes/distributed_training.rst | 4 ++-- 5 files changed, 4 insertions(+), 7 deletions(-) diff --git a/composer/core/state.py b/composer/core/state.py index 37e679ec2d..ac9c5e0064 100644 --- a/composer/core/state.py +++ b/composer/core/state.py @@ -44,7 +44,6 @@ is_model_deepspeed, reproducibility, ) -from composer.utils.warnings import VersionedDeprecationWarning if TYPE_CHECKING: import deepspeed diff --git a/composer/core/time.py b/composer/core/time.py index ee6b3628ed..f05b521614 100644 --- a/composer/core/time.py +++ b/composer/core/time.py @@ -19,12 +19,10 @@ import datetime import re -import warnings from typing import Any, Dict, Generic, Optional, TypeVar, Union, cast from composer.core.serializable import Serializable from composer.utils import StringEnum -from composer.utils.warnings import VersionedDeprecationWarning __all__ = ['TimeUnit', 'Time', 'Timestamp', 'ensure_time'] diff --git a/composer/metrics/nlp.py b/composer/metrics/nlp.py index 4b4a0218b5..06d70860b8 100644 --- a/composer/metrics/nlp.py +++ b/composer/metrics/nlp.py @@ -247,7 +247,7 @@ def update( ): """Abstract interface for computing an in-context learning metrics. - The `output_logits` argument is deprecated and will be removed in v0.21 while it's functionality will + The `output_logits` argument is deprecated and will be removed in v0.22 while it's functionality will be moved to `outputs`. Args: diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 9bb7d62b82..e0ad6fdf6d 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -513,7 +513,7 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): warnings.warn( VersionedDeprecationWarning( '`generation_length` has been deprecated in favor of passing `max_new_tokens` directly into `generation_kwargs`.', - remove_version='0.21.0', + remove_version='0.22.0', ), ) if 'generation_kwargs' in batch: diff --git a/docs/source/notes/distributed_training.rst b/docs/source/notes/distributed_training.rst index 99953545df..c64b51dca2 100644 --- a/docs/source/notes/distributed_training.rst +++ b/docs/source/notes/distributed_training.rst @@ -396,8 +396,8 @@ If `load_monolith_rank0_only=True`, then when loading checkpoints the global ran model and optimizer state to the other ranks, which will will dramatically reduce the memory usage on system. Otherwise, all ranks will separately load in the checkpoint file. 2. :code:`state_dict_type='sharded'` -Each rank saves out an unflattened shard. For loading, each rank loads in the checkpoint file -corresponding to their unflattened shard. +Each rank saves out an unflattened shard. For loading, each rank loads in the checkpoint file +corresponding to their unflattened shard. **Note: state_dict_type='sharded' is the recommended setting for sharded checkpointing in Composer for torch versions 2.0.0 or higher.** See `The FSDP docs `__ for more info. From 29678faa791e3db7d0c88e9b8ebd602bd0c113fe Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 20 Mar 2024 14:06:08 -0700 Subject: [PATCH 3/4] fix docstring --- composer/metrics/nlp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/composer/metrics/nlp.py b/composer/metrics/nlp.py index 06d70860b8..5082ec87ee 100644 --- a/composer/metrics/nlp.py +++ b/composer/metrics/nlp.py @@ -255,6 +255,7 @@ def update( to compute the metric. output_logits (torch.Tensor): The model outputs evaluated on the batch `input_ids` labels (torch.Tensor): The correct outputs. + outputs (torch.Tensor): The model outputs evaluated on the batch `input_ids`. Raises: NotImplementedError: Abstract method must be implemented by subclasses From 1ed0911eb274c1ef53815ed97dfaa6387898f92b Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 20 Mar 2024 14:15:54 -0700 Subject: [PATCH 4/4] remove old ones --- docker/README.md | 2 -- docker/build_matrix.yaml | 54 --------------------------------- docker/generate_build_matrix.py | 34 --------------------- 3 files changed, 90 deletions(-) diff --git a/docker/README.md b/docker/README.md index 7e495ddecb..5e5d943ee0 100644 --- a/docker/README.md +++ b/docker/README.md @@ -30,8 +30,6 @@ To install composer, once inside the image, run `pip install mosaicml`. | Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------| -| Ubuntu 20.04 | Base | 2.3.0 | 12.1.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.3.0 | 12.1.1 (EFA) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 2.2.1 | 12.1.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.2.1 | 12.1.1 (EFA) | 3.11 | `mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 2.2.1 | cpu | 3.11 | `mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04` | diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 9fa18e44eb..6b150ab7e0 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -190,60 +190,6 @@ - mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.15.2 -- AWS_OFI_NCCL_VERSION: v1.7.4-aws - BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 12.1.1 - IMAGE_NAME: torch-nightly-2-3-0-20240110-cu121-python3-11-aws - MOFED_VERSION: '' - NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 - brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 - brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 - brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 - brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 - brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 - brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 - brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 - brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 - brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 - brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 - brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 - brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 - brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.11' - PYTORCH_NIGHTLY_URL: https://download.pytorch.org/whl/nightly/cu121 - PYTORCH_NIGHTLY_VERSION: dev20240110+cu121 - PYTORCH_VERSION: 2.3.0 - TAGS: - - mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04-aws - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.18.0 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 12.1.1 - IMAGE_NAME: torch-nightly-2-3-0-20240110-cu121-python3-11 - MOFED_VERSION: 5.5-1.0.3.2 - NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 - brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 - brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 - brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 - brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 - brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 - brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 - brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 - brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 - brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 - brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 - brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 - brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 - brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.11' - PYTORCH_NIGHTLY_URL: https://download.pytorch.org/whl/nightly/cu121 - PYTORCH_NIGHTLY_VERSION: dev20240110+cu121 - PYTORCH_VERSION: 2.3.0 - TAGS: - - mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04 - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.18.0 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.21.0 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index 46cbec57cc..0b2405417b 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -228,40 +228,6 @@ def _main(): pytorch_entries.append(entry) - nightly_entry_311_aws = { - 'AWS_OFI_NCCL_VERSION': 'v1.7.4-aws', - 'BASE_IMAGE': 'nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04', - 'CUDA_VERSION': '12.1.1', - 'IMAGE_NAME': 'torch-nightly-2-3-0-20240110-cu121-python3-11-aws', - 'MOFED_VERSION': '', - 'NVIDIA_REQUIRE_CUDA_OVERRIDE': _get_cuda_override('12.1.1'), - 'PYTHON_VERSION': '3.11', - 'PYTORCH_VERSION': '2.3.0', - 'PYTORCH_NIGHTLY_URL': 'https://download.pytorch.org/whl/nightly/cu121', - 'PYTORCH_NIGHTLY_VERSION': 'dev20240110+cu121', - 'TAGS': ['mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04-aws'], - 'TARGET': 'pytorch_stage', - 'TORCHVISION_VERSION': '0.18.0', - } - pytorch_entries.append(nightly_entry_311_aws) - - nightly_entry_311 = { - 'AWS_OFI_NCCL_VERSION': '', - 'BASE_IMAGE': 'nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04', - 'CUDA_VERSION': '12.1.1', - 'IMAGE_NAME': 'torch-nightly-2-3-0-20240110-cu121-python3-11', - 'MOFED_VERSION': '5.5-1.0.3.2', - 'NVIDIA_REQUIRE_CUDA_OVERRIDE': _get_cuda_override('12.1.1'), - 'PYTHON_VERSION': '3.11', - 'PYTORCH_VERSION': '2.3.0', - 'PYTORCH_NIGHTLY_URL': 'https://download.pytorch.org/whl/nightly/cu121', - 'PYTORCH_NIGHTLY_VERSION': 'dev20240110+cu121', - 'TAGS': ['mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04'], - 'TARGET': 'pytorch_stage', - 'TORCHVISION_VERSION': '0.18.0', - } - pytorch_entries.append(nightly_entry_311) - composer_entries = [] # The `GIT_COMMIT` is a placeholder and Jenkins will substitute it with the actual git commit for the `composer_staging` images