Skip to content

Commit

Permalink
Merge branch 'main' into kevin/ghcr-build
Browse files Browse the repository at this point in the history
  • Loading branch information
KevDevSha authored Aug 13, 2024
2 parents 1093c29 + 6664382 commit 4626f39
Show file tree
Hide file tree
Showing 35 changed files with 915 additions and 620 deletions.
6 changes: 3 additions & 3 deletions .github/PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@ Example:
-->

# Before submitting
- [ ] Have you read the [contributor guidelines](https://github.com/mosaicml/composer/blob/dev/CONTRIBUTING.md)?
- [ ] Have you read the [contributor guidelines](https://github.com/mosaicml/composer/blob/main/CONTRIBUTING.md)?
- [ ] Is this change a documentation change or typo fix? If so, skip the rest of this checklist.
- [ ] Was this change discussed/approved in a GitHub issue first? It is much more likely to be merged if so.
- [ ] Did you update any related docs and document your change?
- [ ] Did you update any related tests and add any new tests related to your change? (see [testing](https://github.com/mosaicml/composer/blob/dev/CONTRIBUTING.md#running-tests))
- [ ] Did you update any related tests and add any new tests related to your change? (see [testing](https://github.com/mosaicml/composer/blob/main/CONTRIBUTING.md#running-tests))
- [ ] Did you run the tests locally to make sure they pass?
- [ ] Did you run `pre-commit` on your change? (see the `pre-commit` section of [prerequisites](https://github.com/mosaicml/composer/blob/dev/CONTRIBUTING.md#prerequisites))
- [ ] Did you run `pre-commit` on your change? (see the `pre-commit` section of [prerequisites](https://github.com/mosaicml/composer/blob/main/CONTRIBUTING.md#prerequisites))

<!--
Thanks so much for contributing to composer! We really appreciate it :)
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/code-quality.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ name: Code Quality Checks
on:
push:
branches:
- dev
- main
- release/**
pull_request:
Expand All @@ -19,6 +18,7 @@ jobs:
code-quality:
runs-on: ubuntu-20.04
timeout-minutes: 15
if: github.repository_owner == 'mosaicml'
strategy:
matrix:
python_version:
Expand Down
34 changes: 1 addition & 33 deletions .github/workflows/daily.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ on:
- cron: "30 2 * * *" # 2:30 every day
push:
branches:
- dev
- main
- release/**
workflow_dispatch:
Expand All @@ -18,11 +17,6 @@ jobs:
strategy:
matrix:
include:
- name: cpu-3.10-2.1
container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
markers: not daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
- name: cpu-3.11-2.2
container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
markers: not daily and (remote or not remote) and not gpu and not doctest
Expand All @@ -43,11 +37,6 @@ jobs:
markers: not daily and (remote or not remote) and not gpu and doctest
pytest_command: coverage run -m pytest tests/test_docs.py
composer_package_name: mosaicml
- name: daily-cpu-3.10-2.1
container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
markers: daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
- name: daily-cpu-3.11-2.2
container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
markers: daily and (remote or not remote) and not gpu and not doctest
Expand Down Expand Up @@ -77,13 +66,10 @@ jobs:
pytest-command: ${{ matrix.pytest_command }}
pytest-markers: ${{ matrix.markers }}
composer_package_name: ${{ matrix.composer_package_name }}
pytest-wandb-entity: "mosaicml-public-integration-tests"
pytest-wandb-project: "integration-tests-${{ github.sha }}"
safe_directory: composer
secrets:
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
wandb-api-key: ${{ secrets.WANDB_API_KEY }}
code-eval-device: ${{ secrets.CODE_EVAL_DEVICE }}
code-eval-url: ${{ secrets.CODE_EVAL_URL }}
code-eval-apikey: ${{ secrets.CODE_EVAL_APIKEY }}
Expand All @@ -106,12 +92,6 @@ jobs:
# Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time
# on MCLOUD and to avoid eating up all GPUs at once
include:
- name: "gpu-3.10-2.1-1-gpu"
container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 1
- name: "gpu-3.11-2.2-1-gpu"
container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
Expand All @@ -124,12 +104,6 @@ jobs:
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 1
- name: "gpu-3.10-2.1-2-gpu"
container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 2
- name: "gpu-3.11-2.2-2-gpu"
container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
Expand All @@ -142,12 +116,6 @@ jobs:
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 2
- name: "gpu-3.10-2.1-4-gpu"
container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 4
- name: "gpu-3.11-2.2-4-gpu"
container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
Expand All @@ -171,7 +139,7 @@ jobs:
pip_deps: "[all]"
pytest-command: ${{ matrix.pytest_command }}
pytest-markers: ${{ matrix.markers }}
python-version: 3.9
python-version: 3.11
gpu_num: ${{ matrix.gpu_num }}
gha-timeout: 5400
secrets:
Expand Down
4 changes: 0 additions & 4 deletions .github/workflows/pr-cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,6 @@ jobs:
strategy:
matrix:
include:
- name: cpu-3.10-2.1
container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
markers: not daily and not remote and not gpu and not doctest
pytest_command: coverage run -m pytest
- name: cpu-3.11-2.2
container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
markers: not daily and not remote and not gpu and not doctest
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/pr-docker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ name: PR Docker/GHCR
on:
pull_request:
branches:
- dev
- main
- release/**
paths:
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/pr-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
# or dev
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
jobs:
pytest-gpu-1:
uses: mosaicml/ci-testing/.github/workflows/[email protected]
Expand All @@ -29,7 +29,7 @@ jobs:
pip_deps: "[all]"
pytest-command: ${{ matrix.pytest_command }}
pytest-markers: ${{ matrix.markers }}
python-version: 3.9
python-version: 3.11
gpu_num: 1
secrets:
mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
Expand All @@ -55,7 +55,7 @@ jobs:
pip_deps: "[all]"
pytest-command: ${{ matrix.pytest_command }}
pytest-markers: ${{ matrix.markers }}
python-version: 3.9
python-version: 3.11
gpu_num: 2
secrets:
mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
Expand All @@ -82,7 +82,7 @@ jobs:
pip_deps: "[all]"
pytest-command: ${{ matrix.pytest_command }}
pytest-markers: ${{ matrix.markers }}
python-version: 3.9
python-version: 3.11
gpu_num: 4
secrets:
mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
2 changes: 1 addition & 1 deletion .github/workflows/smoketest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ name: Smoketest
on:
push:
branches:
- dev
- main
- release/**
pull_request:
Expand All @@ -20,6 +19,7 @@ jobs:
smoketest:
runs-on: ubuntu-20.04
timeout-minutes: 10
if: github.repository_owner == 'mosaicml'
strategy:
matrix:
python_version:
Expand Down
4 changes: 3 additions & 1 deletion composer/algorithms/ghost_batchnorm/ghost_batchnorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: # type: ignore

nchunks: int = int(math.ceil(batch_size / self.ghost_batch_size))
has_momentum: bool = hasattr(self.batchnorm, 'momentum')
original_momentum: float = self.batchnorm.momentum
original_momentum: Optional[float] = self.batchnorm.momentum

if self.training and has_momentum:
# applying the same batchnorm multiple times greatly increases
Expand All @@ -180,6 +180,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: # type: ignore
normalized_chunks = [self.batchnorm(chunk) for chunk in input.chunk(nchunks, 0)]

if self.training and has_momentum:
assert original_momentum is not None
self._unscale_momentum(original_momentum)

return torch.cat(normalized_chunks, dim=0)
Expand All @@ -192,6 +193,7 @@ def from_batchnorm(module: torch.nn.Module, ghost_batch_size: int) -> '_GhostBat

@torch.jit.unused
def _scale_momentum(self, nchunks: int):
assert self.batchnorm.momentum is not None
self.batchnorm.momentum = float(self.batchnorm.momentum) / nchunks

@torch.jit.unused
Expand Down
2 changes: 1 addition & 1 deletion composer/algorithms/swa/swa.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ def _initialize_swa(self, state: State) -> None:
state.optimizers[0],
swa_lr=self.swa_lr,
anneal_epochs=self.anneal_steps,
anneal_strategy=self.anneal_strategy,
anneal_strategy=self.anneal_strategy, # type: ignore
)

self.swa_model = AveragedModel(state.model, device=torch.device('cpu'))
Expand Down
10 changes: 5 additions & 5 deletions composer/callbacks/image_visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,18 +164,18 @@ def _make_segmentation_images(
# Ensure the targets are in the expected format
if infer_target_type(outputs, targets) == 'one_hot':
if channels_last:
targets = targets.argmax(dim=-1).data.cpu().numpy()
targets = targets.argmax(dim=-1).data.cpu().numpy() # type: ignore
else:
targets = targets.argmax(dim=1).data.cpu().numpy()
targets = targets.argmax(dim=1).data.cpu().numpy() # type: ignore
else:
targets = targets.data.cpu().numpy()
targets = targets.data.cpu().numpy() # type: ignore
# Convert the outputs to the expected format
if channels_last:
num_classes = outputs.shape[-1]
outputs = outputs.argmax(dim=-1).cpu().numpy()
outputs = outputs.argmax(dim=-1).cpu().numpy() # type: ignore
else:
num_classes = outputs.shape[1]
outputs = outputs.argmax(dim=1).cpu().numpy()
outputs = outputs.argmax(dim=1).cpu().numpy() # type: ignore
# Adjust targets such that negative values are mapped to one higher than the maximum class
targets[targets < 0] = num_classes

Expand Down
9 changes: 1 addition & 8 deletions composer/callbacks/memory_snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from typing import Optional, Union

import torch.cuda
from packaging import version

from composer import State
from composer.core import Callback, State, Time, TimeUnit
Expand Down Expand Up @@ -94,13 +93,7 @@ def __init__(
_, _, self.remote_path_in_bucket = parse_uri(remote_file_name)
else:
self.remote_path_in_bucket = None

if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.1.0'): # type: ignore
# MemorySnapshot is only supported in torch v2.1.0-rc1 or higher
self._enabled = True
else:
self._enabled = False
warnings.warn('Memory snapshot is supported after PyTorch 2.1.0. Skipping memory snapshot callback.')
self._enabled = True

def init(self, state: State, logger: Logger) -> None:
if not self._enabled:
Expand Down
9 changes: 1 addition & 8 deletions composer/callbacks/oom_observer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from typing import Optional

import torch.cuda
from packaging import version

from composer.core import Callback, State
from composer.loggers import Logger
Expand Down Expand Up @@ -113,13 +112,7 @@ def __init__(
else:
self.remote_path_in_bucket = None

if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.1.0'): # type: ignore
# OOMObserver is only supported in torch v2.1.0 or higher
self._enabled = True
else:
self._enabled = False
warnings.warn('OOMObserver is supported after PyTorch 2.1.0. Disabling OOMObserver callback.')

self._enabled = True
self.filename_config: Optional[SnapshotFileNameConfig] = None

def init(self, state: State, logger: Logger) -> None:
Expand Down
35 changes: 28 additions & 7 deletions composer/checkpoint/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import textwrap
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Optional, Sequence, Union
from typing import Any, Dict, Optional, Sequence, Tuple, Union

import torch
import torch.distributed.checkpoint as DCP
Expand Down Expand Up @@ -139,7 +139,18 @@ def load_checkpoint(
assert model is not None
assert model_child_path is not None
model_load_path = os.path.join(load_path, model_child_path)
load_model_checkpoint(model, load_path=model_load_path, load_options=load_options)
if state is not None:
state.automicrobatch_fsdp_hook_handles, state.fsdp_modules = load_model_checkpoint(
model,
load_path=model_load_path,
load_options=load_options,
)
else:
load_model_checkpoint(
model,
load_path=model_load_path,
load_options=load_options,
)

if load_options.load_optimizer:
assert optim_child_path is not None
Expand All @@ -159,7 +170,7 @@ def load_model_checkpoint(
load_path: Optional[str] = None,
load_options: Optional[Union[CheckpointLoadOptions, Dict]] = None,
seed: int = 42,
):
) -> Tuple[list, dict]:
"""Load a model checkpoint from the specified path into the model.
Args:
Expand All @@ -178,10 +189,13 @@ def load_model_checkpoint(
if load_options.include_keys is not None or load_options.ignore_keys is not None:
load_options.strict = False

automicrobatch_fsdp_hook_handles = []
fsdp_modules = {}

if load_options.sharded_checkpoint:
if not _is_model_fsdp(model):
if load_options.shard_as_needed_during_load:
_shard_with_fsdp(
automicrobatch_fsdp_hook_handles, fsdp_modules = _shard_with_fsdp(
model,
fsdp_config=load_options.fsdp_config,
precision=load_options.precision,
Expand All @@ -205,7 +219,13 @@ def load_model_checkpoint(
load_options.fsdp_config.update({'sync_module_states': True})
else:
load_options.fsdp_config.sync_module_states = True
_shard_with_fsdp(model, fsdp_config=load_options.fsdp_config, precision=load_options.precision, seed=seed)
automicrobatch_fsdp_hook_handles, fsdp_modules = _shard_with_fsdp(
model,
fsdp_config=load_options.fsdp_config,
precision=load_options.precision,
seed=seed,
)
return automicrobatch_fsdp_hook_handles, fsdp_modules


def _shard_with_fsdp(
Expand All @@ -214,18 +234,19 @@ def _shard_with_fsdp(
fsdp_config: Optional[Union[FSDPConfig, dict]] = None,
precision: Optional[str] = None,
seed: int = 42,
):
) -> Tuple[list, dict]:
if fsdp_config is None:
fsdp_config = FSDPConfig()
if isinstance(fsdp_config, dict):
fsdp_config = FSDPConfig(**fsdp_config)
with reproducibility.seed_context(seed):
prepare_fsdp_module(
automicrobatch_fsdp_hook_handles, fsdp_modules = prepare_fsdp_module(
model,
optimizers=optimizer,
fsdp_config=fsdp_config,
precision=precision,
)
return automicrobatch_fsdp_hook_handles, fsdp_modules


def _load_sharded_model_checkpoint(
Expand Down
Loading

0 comments on commit 4626f39

Please sign in to comment.