Merge branch 'bump_version_v0.24.0' of github.com:mosaicml/composer into bump_version_v0.24.0

eracah committed Aug 13, 2024
2 parents 38bfe9e + e162ca5 commit 5c7122a
Showing 34 changed files with 329 additions and 667 deletions.

.github/PULL_REQUEST_TEMPLATE.md (6 changes: 3 additions & 3 deletions)
@@ -16,13 +16,13 @@ Example:
 -->

 # Before submitting
-- [ ] Have you read the [contributor guidelines](https://github.com/mosaicml/composer/blob/dev/CONTRIBUTING.md)?
+- [ ] Have you read the [contributor guidelines](https://github.com/mosaicml/composer/blob/main/CONTRIBUTING.md)?
 - [ ] Is this change a documentation change or typo fix? If so, skip the rest of this checklist.
 - [ ] Was this change discussed/approved in a GitHub issue first? It is much more likely to be merged if so.
 - [ ] Did you update any related docs and document your change?
-- [ ] Did you update any related tests and add any new tests related to your change? (see [testing](https://github.com/mosaicml/composer/blob/dev/CONTRIBUTING.md#running-tests))
+- [ ] Did you update any related tests and add any new tests related to your change? (see [testing](https://github.com/mosaicml/composer/blob/main/CONTRIBUTING.md#running-tests))
 - [ ] Did you run the tests locally to make sure they pass?
-- [ ] Did you run `pre-commit` on your change? (see the `pre-commit` section of [prerequisites](https://github.com/mosaicml/composer/blob/dev/CONTRIBUTING.md#prerequisites))
+- [ ] Did you run `pre-commit` on your change? (see the `pre-commit` section of [prerequisites](https://github.com/mosaicml/composer/blob/main/CONTRIBUTING.md#prerequisites))

 <!--
 Thanks so much for contributing to composer! We really appreciate it :)

.github/workflows/code-quality.yaml (2 changes: 1 addition & 1 deletion)
@@ -2,7 +2,6 @@ name: Code Quality Checks
 on:
   push:
     branches:
-      - dev
       - main
       - release/**
   pull_request:
@@ -19,6 +18,7 @@ jobs:
   code-quality:
     runs-on: ubuntu-20.04
     timeout-minutes: 15
+    if: github.repository_owner == 'mosaicml'
     strategy:
       matrix:
         python_version:

.github/workflows/daily.yaml (34 changes: 1 addition & 33 deletions)
@@ -4,7 +4,6 @@ on:
     - cron: "30 2 * * *" # 2:30 every day
   push:
     branches:
-      - dev
       - main
       - release/**
   workflow_dispatch:
@@ -18,11 +17,6 @@ jobs:
     strategy:
       matrix:
         include:
-          - name: cpu-3.10-2.1
-            container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
-            markers: not daily and (remote or not remote) and not gpu and not doctest
-            pytest_command: coverage run -m pytest
-            composer_package_name: mosaicml
           - name: cpu-3.11-2.2
             container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
             markers: not daily and (remote or not remote) and not gpu and not doctest
@@ -43,11 +37,6 @@ jobs:
             markers: not daily and (remote or not remote) and not gpu and doctest
             pytest_command: coverage run -m pytest tests/test_docs.py
             composer_package_name: mosaicml
-          - name: daily-cpu-3.10-2.1
-            container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
-            markers: daily and (remote or not remote) and not gpu and not doctest
-            pytest_command: coverage run -m pytest
-            composer_package_name: mosaicml
           - name: daily-cpu-3.11-2.2
             container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
             markers: daily and (remote or not remote) and not gpu and not doctest
@@ -77,13 +66,10 @@ jobs:
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
       composer_package_name: ${{ matrix.composer_package_name }}
-      pytest-wandb-entity: "mosaicml-public-integration-tests"
-      pytest-wandb-project: "integration-tests-${{ github.sha }}"
       safe_directory: composer
     secrets:
       aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
       aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-      wandb-api-key: ${{ secrets.WANDB_API_KEY }}
       code-eval-device: ${{ secrets.CODE_EVAL_DEVICE }}
       code-eval-url: ${{ secrets.CODE_EVAL_URL }}
       code-eval-apikey: ${{ secrets.CODE_EVAL_APIKEY }}
@@ -106,12 +92,6 @@ jobs:
         # Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time
         # on MCLOUD and not eat up all GPUs at once
         include:
-          - name: "gpu-3.10-2.1-1-gpu"
-            container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
-            markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
-            pytest_command: "coverage run -m pytest"
-            composer_package_name: "mosaicml"
-            gpu_num: 1
           - name: "gpu-3.11-2.2-1-gpu"
             container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
             markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -124,12 +104,6 @@ jobs:
             pytest_command: "coverage run -m pytest"
             composer_package_name: "mosaicml"
             gpu_num: 1
-          - name: "gpu-3.10-2.1-2-gpu"
-            container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
-            markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
-            pytest_command: "coverage run -m pytest"
-            composer_package_name: "mosaicml"
-            gpu_num: 2
           - name: "gpu-3.11-2.2-2-gpu"
             container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
             markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -142,12 +116,6 @@ jobs:
             pytest_command: "coverage run -m pytest"
             composer_package_name: "mosaicml"
             gpu_num: 2
-          - name: "gpu-3.10-2.1-4-gpu"
-            container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
-            markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
-            pytest_command: "coverage run -m pytest"
-            composer_package_name: "mosaicml"
-            gpu_num: 4
           - name: "gpu-3.11-2.2-4-gpu"
             container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
             markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -171,7 +139,7 @@ jobs:
       pip_deps: "[all]"
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
+      python-version: 3.11
       gpu_num: ${{ matrix.gpu_num }}
       gha-timeout: 5400
     secrets:

.github/workflows/docker-configure-build-push.yaml (2 changes: 1 addition & 1 deletion)
@@ -36,7 +36,7 @@ on:
       required: true
 jobs:
   configure-build-push:
-    runs-on: ubuntu-latest
+    runs-on: mosaic-4wide
     steps:
       - name: Maximize Build Space on Worker
         uses: easimon/maximize-build-space@v4

.github/workflows/pr-cpu.yaml (4 changes: 0 additions & 4 deletions)
@@ -13,10 +13,6 @@ jobs:
     strategy:
       matrix:
         include:
-          - name: cpu-3.10-2.1
-            container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
-            markers: not daily and not remote and not gpu and not doctest
-            pytest_command: coverage run -m pytest
           - name: cpu-3.11-2.2
             container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
             markers: not daily and not remote and not gpu and not doctest

.github/workflows/pr-docker.yaml (1 change: 0 additions & 1 deletion)
@@ -2,7 +2,6 @@ name: PR Docker
 on:
   pull_request:
     branches:
-      - dev
       - main
       - release/**
     paths:

.github/workflows/pr-gpu.yaml (8 changes: 4 additions & 4 deletions)
@@ -6,7 +6,7 @@ on:
 # or dev
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
-  cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 jobs:
   pytest-gpu-1:
     uses: mosaicml/ci-testing/.github/workflows/[email protected]
@@ -29,7 +29,7 @@ jobs:
       pip_deps: "[all]"
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
+      python-version: 3.11
       gpu_num: 1
     secrets:
       mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
@@ -55,7 +55,7 @@ jobs:
       pip_deps: "[all]"
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
+      python-version: 3.11
       gpu_num: 2
     secrets:
       mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
@@ -82,7 +82,7 @@ jobs:
       pip_deps: "[all]"
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
+      python-version: 3.11
       gpu_num: 4
     secrets:
       mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}

.github/workflows/smoketest.yaml (2 changes: 1 addition & 1 deletion)
@@ -2,7 +2,6 @@ name: Smoketest
 on:
   push:
     branches:
-      - dev
       - main
       - release/**
   pull_request:
@@ -20,6 +19,7 @@ jobs:
   smoketest:
     runs-on: ubuntu-20.04
     timeout-minutes: 10
+    if: github.repository_owner == 'mosaicml'
     strategy:
       matrix:
         python_version:

composer/algorithms/ghost_batchnorm/ghost_batchnorm.py (4 changes: 3 additions & 1 deletion)
@@ -168,7 +168,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: # type: ignore

         nchunks: int = int(math.ceil(batch_size / self.ghost_batch_size))
         has_momentum: bool = hasattr(self.batchnorm, 'momentum')
-        original_momentum: float = self.batchnorm.momentum
+        original_momentum: Optional[float] = self.batchnorm.momentum

         if self.training and has_momentum:
             # applying the same batchnorm multiple times greatly increases
@@ -180,6 +180,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: # type: ignore
         normalized_chunks = [self.batchnorm(chunk) for chunk in input.chunk(nchunks, 0)]

         if self.training and has_momentum:
+            assert original_momentum is not None
             self._unscale_momentum(original_momentum)

         return torch.cat(normalized_chunks, dim=0)
@@ -192,6 +193,7 @@ def from_batchnorm(module: torch.nn.Module, ghost_batch_size: int) -> '_GhostBat

     @torch.jit.unused
     def _scale_momentum(self, nchunks: int):
+        assert self.batchnorm.momentum is not None
         self.batchnorm.momentum = float(self.batchnorm.momentum) / nchunks

     @torch.jit.unused
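
Note on the Optional[float] hunk above: PyTorch types BatchNorm*.momentum as Optional[float] (momentum=None selects cumulative moving-average statistics), so the scaling helpers must rule out None before dividing. A minimal, self-contained sketch of the scale/unscale pattern; ghost_batchnorm_forward is an illustrative name, not Composer's API:

import math
from typing import Optional

import torch


def ghost_batchnorm_forward(bn: torch.nn.BatchNorm2d, x: torch.Tensor, ghost_batch_size: int) -> torch.Tensor:
    """Apply bn independently to ghost batches of at most ghost_batch_size samples.

    The same module updates its running statistics once per chunk, so momentum is
    scaled by 1/nchunks during the pass and restored afterwards, keeping the
    effective update rate close to a single full-batch update.
    """
    nchunks = int(math.ceil(x.shape[0] / ghost_batch_size))
    original_momentum: Optional[float] = bn.momentum  # None => cumulative averaging; nothing to scale
    if bn.training and original_momentum is not None:
        bn.momentum = original_momentum / nchunks
    normalized = [bn(chunk) for chunk in x.chunk(nchunks, dim=0)]
    if bn.training and original_momentum is not None:
        bn.momentum = original_momentum  # restore for the next step
    return torch.cat(normalized, dim=0)


bn = torch.nn.BatchNorm2d(8).train()
out = ghost_batchnorm_forward(bn, torch.randn(16, 8, 4, 4), ghost_batch_size=4)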

composer/algorithms/swa/swa.py (2 changes: 1 addition & 1 deletion)
@@ -228,7 +228,7 @@ def _initialize_swa(self, state: State) -> None:
             state.optimizers[0],
             swa_lr=self.swa_lr,
             anneal_epochs=self.anneal_steps,
-            anneal_strategy=self.anneal_strategy,
+            anneal_strategy=self.anneal_strategy, # type: ignore
         )

         self.swa_model = AveragedModel(state.model, device=torch.device('cpu'))

composer/callbacks/image_visualizer.py (10 changes: 5 additions & 5 deletions)
@@ -164,18 +164,18 @@ def _make_segmentation_images(
     # Ensure the targets are in the expected format
     if infer_target_type(outputs, targets) == 'one_hot':
         if channels_last:
-            targets = targets.argmax(dim=-1).data.cpu().numpy()
+            targets = targets.argmax(dim=-1).data.cpu().numpy() # type: ignore
         else:
-            targets = targets.argmax(dim=1).data.cpu().numpy()
+            targets = targets.argmax(dim=1).data.cpu().numpy() # type: ignore
     else:
-        targets = targets.data.cpu().numpy()
+        targets = targets.data.cpu().numpy() # type: ignore
     # Convert the outputs to the expected format
     if channels_last:
         num_classes = outputs.shape[-1]
-        outputs = outputs.argmax(dim=-1).cpu().numpy()
+        outputs = outputs.argmax(dim=-1).cpu().numpy() # type: ignore
     else:
         num_classes = outputs.shape[1]
-        outputs = outputs.argmax(dim=1).cpu().numpy()
+        outputs = outputs.argmax(dim=1).cpu().numpy() # type: ignore
     # Adjust targets such that negative values are mapped to one higher than the maximum class
     targets[targets < 0] = num_classes

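
Context for the # type: ignore hunks above: each reassignment replaces a torch.Tensor with a NumPy array, which static type checkers flag, and the argmax axis depends on layout (class dimension last under channels_last, dim 1 otherwise). A small illustrative snippet with random data and 3 classes:

import torch

outputs_cl = torch.rand(2, 4, 4, 3)  # channels-last style: classes on the last axis
outputs_cf = torch.rand(2, 3, 4, 4)  # channels-first style: classes on dim 1

idx_cl = outputs_cl.argmax(dim=-1).cpu().numpy()  # Tensor -> ndarray, shape (2, 4, 4)
idx_cf = outputs_cf.argmax(dim=1).cpu().numpy()   # shape (2, 4, 4)
assert idx_cl.shape == idx_cf.shape == (2, 4, 4)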

composer/callbacks/memory_snapshot.py (9 changes: 1 addition & 8 deletions)
@@ -9,7 +9,6 @@
 from typing import Optional, Union

 import torch.cuda
-from packaging import version

 from composer import State
 from composer.core import Callback, State, Time, TimeUnit
@@ -94,13 +93,7 @@ def __init__(
             _, _, self.remote_path_in_bucket = parse_uri(remote_file_name)
         else:
             self.remote_path_in_bucket = None
-
-        if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.1.0'): # type: ignore
-            # MemorySnapshot is only supported in torch v2.1.0-rc1 or higher
-            self._enabled = True
-        else:
-            self._enabled = False
-            warnings.warn('Memory snapshot is supported after PyTorch 2.1.0. Skipping memory snapshot callback.')
+        self._enabled = True

     def init(self, state: State, logger: Logger) -> None:
         if not self._enabled:

composer/callbacks/oom_observer.py (9 changes: 1 addition & 8 deletions)
@@ -14,7 +14,6 @@
 from typing import Optional

 import torch.cuda
-from packaging import version

 from composer.core import Callback, State
 from composer.loggers import Logger
@@ -113,13 +112,7 @@ def __init__(
         else:
             self.remote_path_in_bucket = None

-        if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.1.0'): # type: ignore
-            # OOMObserver is only supported in torch v2.1.0 or higher
-            self._enabled = True
-        else:
-            self._enabled = False
-            warnings.warn('OOMObserver is supported after PyTorch 2.1.0. Disabling OOMObserver callback.')
-
+        self._enabled = True
         self.filename_config: Optional[SnapshotFileNameConfig] = None

     def init(self, state: State, logger: Logger) -> None:
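
Both callback diffs delete the same runtime guard: a packaging.version check that disabled the feature on torch older than 2.1. Once the package's minimum supported torch already meets that bar, the else branch is dead code and self._enabled can be set unconditionally. For reference, a minimal sketch of the removed pattern (illustrative, not the exact deleted code):

import warnings

import torch
from packaging import version


def torch_at_least(minimum: str) -> bool:
    """Compare the installed torch version, ignoring any nightly '.dev' suffix."""
    base = torch.__version__.split('.dev')[0]
    return version.parse(base) >= version.parse(minimum)


enabled = torch_at_least('2.1.0')
if not enabled:
    warnings.warn('Memory snapshot requires PyTorch 2.1.0 or newer; skipping the callback.')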

composer/core/state.py (29 changes: 21 additions & 8 deletions)
@@ -639,17 +639,21 @@ def _validate_parallelism_configs(self):
         if error_message != '':
             raise ValueError(error_message)

+        # Validate FSDP config parameters.
+        if self.fsdp_config is not None and self.fsdp_config.activation_cpu_offload and not self.fsdp_config.use_orig_params:
+            raise ValueError('activation_cpu_offload=True is not supported with use_orig_params=False.')
+
         # Validate FSDP state dict type
-        if self.fsdp_state_dict_type not in [None, 'full', 'sharded']:
-            if self.fsdp_state_dict_type == 'local':
+        if self.fsdp_config is not None and self.fsdp_config.state_dict_type not in [None, 'full', 'sharded']:
+            if self.fsdp_config.state_dict_type == 'local':
                 raise ValueError(
                     'Composer and PyTorch no longer support saving or loading local state dicts. '
                     'To upgrade an older checkpoint, use Composer version 0.18.1 and export as '
                     'a monolithic checkpoint using a callback.',
                 )
             raise ValueError(
                 f'fsdp_state_dict_type must be one of [None, "full", "sharded"], but got '
-                f'{self.fsdp_state_dict_type}',
+                f'{self.fsdp_config.state_dict_type}',
             )
         if self.fsdp_sharded_state_dict_enabled and self.save_metrics:
             # Sharded state dict breaks in many different ways with torchmetrics, due to both sharding
@@ -959,7 +963,9 @@ def get_model_state_dict(self) -> dict[str, Any]:
         Returns:
             dict[str, Any]: The state dict for the model.
         """
-        if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+        if version.parse(torch.__version__) >= version.parse('2.4.0') or (
+            version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
+        ):
             from torch.distributed.checkpoint.state_dict import StateDictOptions, get_model_state_dict
             if self.fsdp_state_dict_type not in [None, 'full', 'sharded']:
                 raise NotImplementedError(
@@ -997,7 +1003,9 @@ def get_optim_state_dict(self) -> dict[str, Any]:
         Returns:
             dict[str, Any]: The state dict for the optimizer.
         """
-        if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+        if version.parse(torch.__version__) >= version.parse('2.4.0') or (
+            version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
+        ):
             from torch.distributed.checkpoint.state_dict import StateDictOptions, get_optimizer_state_dict
             if self.fsdp_state_dict_type not in [None, 'full', 'sharded']:
                 raise NotImplementedError(
@@ -1307,7 +1315,9 @@ def load_model_state(
         model_on_rank = state_dict['model'] is not None

         if model_on_rank:
-            if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+            if version.parse(torch.__version__) >= version.parse('2.4.0') or (
+                version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
+            ):
                 from torch.distributed.checkpoint.state_dict import StateDictOptions, set_model_state_dict
                 try:
                     set_model_state_dict(
@@ -1410,14 +1420,17 @@ def load_optim_state(self, state_dict: dict[str, Any], strict: bool = True):
                 continue

             optim_state_dict = serialized_value[type(optimizer).__qualname__] if serialized_value is not None else None
-            if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+            if version.parse(torch.__version__) >= version.parse('2.4.0') or (
+                version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
+            ):
                 from torch.distributed.checkpoint.state_dict import StateDictOptions, set_optimizer_state_dict

                 # optim_state_dict is `None` on non-zero ranks when loading FSDP monolith
                 # checkpoint on rank 0 only. However, PyTorch modifies the state_dict (producing
                 # errors) before discarding the output. Accordingly, we mock the state dict.
                 # See: https://github.com/pytorch/pytorch/issues/125177
-                optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict
+                if version.parse(torch.__version__) < version.parse('2.4.0'):
+                    optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict
                 set_optimizer_state_dict(
                     model=self.model,
                     optimizers=optimizer,
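
All four state.py hunks apply one dispatch rule: route through the torch.distributed.checkpoint.state_dict APIs unconditionally on torch >= 2.4, and on torch 2.3 only when a process group is initialized. The last hunk additionally confines the MagicMock placeholder (the workaround for pytorch/pytorch#125177) to torch < 2.4, where the upstream issue still applies. A minimal sketch of the shared predicate; use_dcp_state_dict_api is an illustrative name, not a Composer function:

import torch
import torch.distributed as dist
from packaging import version


def use_dcp_state_dict_api() -> bool:
    """True when torch.distributed.checkpoint.state_dict should handle (de)serialization.

    torch >= 2.4: always; torch 2.3.x: only with an initialized process group.
    """
    v = version.parse(torch.__version__)
    return v >= version.parse('2.4.0') or (
        v >= version.parse('2.3.0') and dist.is_initialized()
    )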