Merge branch 'main' into kevin/ghcr-build

mosaicml · Aug 13, 2024 · 4626f39 · 4626f39
2 parents 1093c29 + 6664382
commit 4626f39
Show file tree

Hide file tree

Showing 35 changed files with 915 additions and 620 deletions.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -16,13 +16,13 @@ Example:
 -->
 
 # Before submitting
-- [ ] Have you read the [contributor guidelines](https://github.com/mosaicml/composer/blob/dev/CONTRIBUTING.md)?
+- [ ] Have you read the [contributor guidelines](https://github.com/mosaicml/composer/blob/main/CONTRIBUTING.md)?
 - [ ] Is this change a documentation change or typo fix? If so, skip the rest of this checklist.
 - [ ] Was this change discussed/approved in a GitHub issue first? It is much more likely to be merged if so.
 - [ ] Did you update any related docs and document your change?
-- [ ] Did you update any related tests and add any new tests related to your change? (see [testing](https://github.com/mosaicml/composer/blob/dev/CONTRIBUTING.md#running-tests))
+- [ ] Did you update any related tests and add any new tests related to your change? (see [testing](https://github.com/mosaicml/composer/blob/main/CONTRIBUTING.md#running-tests))
 - [ ] Did you run the tests locally to make sure they pass?
-- [ ] Did you run `pre-commit` on your change? (see the `pre-commit` section of [prerequisites](https://github.com/mosaicml/composer/blob/dev/CONTRIBUTING.md#prerequisites))
+- [ ] Did you run `pre-commit` on your change? (see the `pre-commit` section of [prerequisites](https://github.com/mosaicml/composer/blob/main/CONTRIBUTING.md#prerequisites))
 
 <!--
 Thanks so much for contributing to composer! We really appreciate it :)

diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml
@@ -2,7 +2,6 @@ name: Code Quality Checks
 on:
   push:
     branches:
-    - dev
     - main
     - release/**
   pull_request:
@@ -19,6 +18,7 @@ jobs:
   code-quality:
     runs-on: ubuntu-20.04
     timeout-minutes: 15
+    if: github.repository_owner == 'mosaicml'
     strategy:
       matrix:
         python_version:

diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml
@@ -4,7 +4,6 @@ on:
   - cron: "30 2 * * *"  # 2:30 every day
   push:
     branches:
-    - dev
     - main
     - release/**
   workflow_dispatch:
@@ -18,11 +17,6 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: cpu-3.10-2.1
-          container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
-          markers: not daily and (remote or not remote) and not gpu and not doctest
-          pytest_command: coverage run -m pytest
-          composer_package_name: mosaicml
         - name: cpu-3.11-2.2
           container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
           markers: not daily and (remote or not remote) and not gpu and not doctest
@@ -43,11 +37,6 @@ jobs:
           markers: not daily and (remote or not remote) and not gpu and doctest
           pytest_command: coverage run -m pytest tests/test_docs.py
           composer_package_name: mosaicml
-        - name: daily-cpu-3.10-2.1
-          container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
-          markers: daily and (remote or not remote) and not gpu and not doctest
-          pytest_command: coverage run -m pytest
-          composer_package_name: mosaicml
         - name: daily-cpu-3.11-2.2
           container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
           markers: daily and (remote or not remote) and not gpu and not doctest
@@ -77,13 +66,10 @@ jobs:
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
       composer_package_name: ${{ matrix.composer_package_name }}
-      pytest-wandb-entity: "mosaicml-public-integration-tests"
-      pytest-wandb-project: "integration-tests-${{ github.sha }}"
       safe_directory: composer
     secrets:
       aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
       aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-      wandb-api-key: ${{ secrets.WANDB_API_KEY }}
       code-eval-device: ${{ secrets.CODE_EVAL_DEVICE }}
       code-eval-url: ${{ secrets.CODE_EVAL_URL }}
       code-eval-apikey: ${{ secrets.CODE_EVAL_APIKEY }}
@@ -106,12 +92,6 @@ jobs:
         # Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time
         # on MCLOUD and not eat up all GPUs at once
         include:
-        - name: "gpu-3.10-2.1-1-gpu"
-          container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
-          markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
-          pytest_command: "coverage run -m pytest"
-          composer_package_name: "mosaicml"
-          gpu_num: 1
         - name: "gpu-3.11-2.2-1-gpu"
           container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
           markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -124,12 +104,6 @@ jobs:
           pytest_command: "coverage run -m pytest"
           composer_package_name: "mosaicml"
           gpu_num: 1
-        - name: "gpu-3.10-2.1-2-gpu"
-          container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
-          markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
-          pytest_command: "coverage run -m pytest"
-          composer_package_name: "mosaicml"
-          gpu_num: 2
         - name: "gpu-3.11-2.2-2-gpu"
           container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
           markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -142,12 +116,6 @@ jobs:
           pytest_command: "coverage run -m pytest"
           composer_package_name: "mosaicml"
           gpu_num: 2
-        - name: "gpu-3.10-2.1-4-gpu"
-          container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
-          markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
-          pytest_command: "coverage run -m pytest"
-          composer_package_name: "mosaicml"
-          gpu_num: 4
         - name: "gpu-3.11-2.2-4-gpu"
           container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
           markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -171,7 +139,7 @@ jobs:
       pip_deps: "[all]"
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
+      python-version: 3.11
       gpu_num: ${{ matrix.gpu_num }}
       gha-timeout: 5400
     secrets:

diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml
@@ -13,10 +13,6 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: cpu-3.10-2.1
-          container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
-          markers: not daily and not remote and not gpu and not doctest
-          pytest_command: coverage run -m pytest
         - name: cpu-3.11-2.2
           container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
           markers: not daily and not remote and not gpu and not doctest

diff --git a/.github/workflows/pr-docker.yaml b/.github/workflows/pr-docker.yaml
@@ -2,7 +2,6 @@ name: PR Docker/GHCR
 on:
   pull_request:
     branches:
-    - dev
     - main
     - release/**
     paths:

diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml
@@ -6,7 +6,7 @@ on:
 # or dev
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
-  cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 jobs:
   pytest-gpu-1:
     uses: mosaicml/ci-testing/.github/workflows/[email protected]
@@ -29,7 +29,7 @@ jobs:
       pip_deps: "[all]"
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
+      python-version: 3.11
       gpu_num: 1
     secrets:
       mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
@@ -55,7 +55,7 @@ jobs:
       pip_deps: "[all]"
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
+      python-version: 3.11
       gpu_num: 2
     secrets:
       mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
@@ -82,7 +82,7 @@ jobs:
       pip_deps: "[all]"
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
+      python-version: 3.11
       gpu_num: 4
     secrets:
       mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
diff --git a/.github/workflows/smoketest.yaml b/.github/workflows/smoketest.yaml
@@ -2,7 +2,6 @@ name: Smoketest
 on:
   push:
     branches:
-    - dev
     - main
     - release/**
   pull_request:
@@ -20,6 +19,7 @@ jobs:
   smoketest:
     runs-on: ubuntu-20.04
     timeout-minutes: 10
+    if: github.repository_owner == 'mosaicml'
     strategy:
       matrix:
         python_version:

diff --git a/composer/algorithms/ghost_batchnorm/ghost_batchnorm.py b/composer/algorithms/ghost_batchnorm/ghost_batchnorm.py
@@ -168,7 +168,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:  # type: ignore
 
         nchunks: int = int(math.ceil(batch_size / self.ghost_batch_size))
         has_momentum: bool = hasattr(self.batchnorm, 'momentum')
-        original_momentum: float = self.batchnorm.momentum
+        original_momentum: Optional[float] = self.batchnorm.momentum
 
         if self.training and has_momentum:
             # applying the same batchnorm multiple times greatly increases
@@ -180,6 +180,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:  # type: ignore
         normalized_chunks = [self.batchnorm(chunk) for chunk in input.chunk(nchunks, 0)]
 
         if self.training and has_momentum:
+            assert original_momentum is not None
             self._unscale_momentum(original_momentum)
 
         return torch.cat(normalized_chunks, dim=0)
@@ -192,6 +193,7 @@ def from_batchnorm(module: torch.nn.Module, ghost_batch_size: int) -> '_GhostBat
 
     @torch.jit.unused
     def _scale_momentum(self, nchunks: int):
+        assert self.batchnorm.momentum is not None
         self.batchnorm.momentum = float(self.batchnorm.momentum) / nchunks
 
     @torch.jit.unused

diff --git a/composer/algorithms/swa/swa.py b/composer/algorithms/swa/swa.py
@@ -228,7 +228,7 @@ def _initialize_swa(self, state: State) -> None:
                 state.optimizers[0],
                 swa_lr=self.swa_lr,
                 anneal_epochs=self.anneal_steps,
-                anneal_strategy=self.anneal_strategy,
+                anneal_strategy=self.anneal_strategy,  # type: ignore
             )
 
         self.swa_model = AveragedModel(state.model, device=torch.device('cpu'))

diff --git a/composer/callbacks/image_visualizer.py b/composer/callbacks/image_visualizer.py
@@ -164,18 +164,18 @@ def _make_segmentation_images(
     # Ensure the targets are in the expected format
     if infer_target_type(outputs, targets) == 'one_hot':
         if channels_last:
-            targets = targets.argmax(dim=-1).data.cpu().numpy()
+            targets = targets.argmax(dim=-1).data.cpu().numpy()  # type: ignore
         else:
-            targets = targets.argmax(dim=1).data.cpu().numpy()
+            targets = targets.argmax(dim=1).data.cpu().numpy()  # type: ignore
     else:
-        targets = targets.data.cpu().numpy()
+        targets = targets.data.cpu().numpy()  # type: ignore
     # Convert the outputs to the expected format
     if channels_last:
         num_classes = outputs.shape[-1]
-        outputs = outputs.argmax(dim=-1).cpu().numpy()
+        outputs = outputs.argmax(dim=-1).cpu().numpy()  # type: ignore
     else:
         num_classes = outputs.shape[1]
-        outputs = outputs.argmax(dim=1).cpu().numpy()
+        outputs = outputs.argmax(dim=1).cpu().numpy()  # type: ignore
     # Adjust targets such that negative values are mapped to one higher than the maximum class
     targets[targets < 0] = num_classes
 

diff --git a/composer/callbacks/memory_snapshot.py b/composer/callbacks/memory_snapshot.py
@@ -9,7 +9,6 @@
 from typing import Optional, Union
 
 import torch.cuda
-from packaging import version
 
 from composer import State
 from composer.core import Callback, State, Time, TimeUnit
@@ -94,13 +93,7 @@ def __init__(
             _, _, self.remote_path_in_bucket = parse_uri(remote_file_name)
         else:
             self.remote_path_in_bucket = None
-
-        if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.1.0'):  # type: ignore
-            # MemorySnapshot is only supported in torch v2.1.0-rc1 or higher
-            self._enabled = True
-        else:
-            self._enabled = False
-            warnings.warn('Memory snapshot is supported after PyTorch 2.1.0. Skipping memory snapshot callback.')
+        self._enabled = True
 
     def init(self, state: State, logger: Logger) -> None:
         if not self._enabled:

diff --git a/composer/callbacks/oom_observer.py b/composer/callbacks/oom_observer.py
@@ -14,7 +14,6 @@
 from typing import Optional
 
 import torch.cuda
-from packaging import version
 
 from composer.core import Callback, State
 from composer.loggers import Logger
@@ -113,13 +112,7 @@ def __init__(
         else:
             self.remote_path_in_bucket = None
 
-        if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.1.0'):  # type: ignore
-            # OOMObserver is only supported in torch v2.1.0 or higher
-            self._enabled = True
-        else:
-            self._enabled = False
-            warnings.warn('OOMObserver is supported after PyTorch 2.1.0. Disabling OOMObserver callback.')
-
+        self._enabled = True
         self.filename_config: Optional[SnapshotFileNameConfig] = None
 
     def init(self, state: State, logger: Logger) -> None:

diff --git a/composer/checkpoint/load.py b/composer/checkpoint/load.py
@@ -11,7 +11,7 @@
 import textwrap
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Dict, Optional, Sequence, Union
+from typing import Any, Dict, Optional, Sequence, Tuple, Union
 
 import torch
 import torch.distributed.checkpoint as DCP
@@ -139,7 +139,18 @@ def load_checkpoint(
         assert model is not None
         assert model_child_path is not None
         model_load_path = os.path.join(load_path, model_child_path)
-        load_model_checkpoint(model, load_path=model_load_path, load_options=load_options)
+        if state is not None:
+            state.automicrobatch_fsdp_hook_handles, state.fsdp_modules = load_model_checkpoint(
+                model,
+                load_path=model_load_path,
+                load_options=load_options,
+            )
+        else:
+            load_model_checkpoint(
+                model,
+                load_path=model_load_path,
+                load_options=load_options,
+            )
 
     if load_options.load_optimizer:
         assert optim_child_path is not None
@@ -159,7 +170,7 @@ def load_model_checkpoint(
     load_path: Optional[str] = None,
     load_options: Optional[Union[CheckpointLoadOptions, Dict]] = None,
     seed: int = 42,
-):
+) -> Tuple[list, dict]:
     """Load a a model checkpoint from the specified path into the model.
 
     Args:
@@ -178,10 +189,13 @@ def load_model_checkpoint(
     if load_options.include_keys is not None or load_options.ignore_keys is not None:
         load_options.strict = False
 
+    automicrobatch_fsdp_hook_handles = []
+    fsdp_modules = {}
+
     if load_options.sharded_checkpoint:
         if not _is_model_fsdp(model):
             if load_options.shard_as_needed_during_load:
-                _shard_with_fsdp(
+                automicrobatch_fsdp_hook_handles, fsdp_modules = _shard_with_fsdp(
                     model,
                     fsdp_config=load_options.fsdp_config,
                     precision=load_options.precision,
@@ -205,7 +219,13 @@ def load_model_checkpoint(
                 load_options.fsdp_config.update({'sync_module_states': True})
             else:
                 load_options.fsdp_config.sync_module_states = True
-            _shard_with_fsdp(model, fsdp_config=load_options.fsdp_config, precision=load_options.precision, seed=seed)
+            automicrobatch_fsdp_hook_handles, fsdp_modules = _shard_with_fsdp(
+                model,
+                fsdp_config=load_options.fsdp_config,
+                precision=load_options.precision,
+                seed=seed,
+            )
+    return automicrobatch_fsdp_hook_handles, fsdp_modules
 
 
 def _shard_with_fsdp(
@@ -214,18 +234,19 @@ def _shard_with_fsdp(
     fsdp_config: Optional[Union[FSDPConfig, dict]] = None,
     precision: Optional[str] = None,
     seed: int = 42,
-):
+) -> Tuple[list, dict]:
     if fsdp_config is None:
         fsdp_config = FSDPConfig()
     if isinstance(fsdp_config, dict):
         fsdp_config = FSDPConfig(**fsdp_config)
     with reproducibility.seed_context(seed):
-        prepare_fsdp_module(
+        automicrobatch_fsdp_hook_handles, fsdp_modules = prepare_fsdp_module(
             model,
             optimizers=optimizer,
             fsdp_config=fsdp_config,
             precision=precision,
         )
+    return automicrobatch_fsdp_hook_handles, fsdp_modules
 
 
 def _load_sharded_model_checkpoint(
diff --git a/.github/workflows/pr-docker.yaml b/.github/workflows/pr-docker.yaml
@@ -2,7 +2,6 @@ name: PR Docker/GHCR
 on:
   pull_request:
     branches:
-    - dev
     - main
     - release/**
     paths: