Merge branch 'bump_version_v0.24.0' of github.com:mosaicml/composer into bump_version_v0.24.0

eracah committed Aug 13, 2024
2 parents 38bfe9e + e162ca5 commit 5c7122a
Showing 34 changed files with 329 additions and 667 deletions.

.github/PULL_REQUEST_TEMPLATE.md (6 changes: 3 additions & 3 deletions)
@@ -16,13 +16,13 @@ Example:
 -->

 # Before submitting
-- [ ] Have you read the [contributor guidelines](https://github.com/mosaicml/composer/blob/dev/CONTRIBUTING.md)?
+- [ ] Have you read the [contributor guidelines](https://github.com/mosaicml/composer/blob/main/CONTRIBUTING.md)?
 - [ ] Is this change a documentation change or typo fix? If so, skip the rest of this checklist.
 - [ ] Was this change discussed/approved in a GitHub issue first? It is much more likely to be merged if so.
 - [ ] Did you update any related docs and document your change?
-- [ ] Did you update any related tests and add any new tests related to your change? (see [testing](https://github.com/mosaicml/composer/blob/dev/CONTRIBUTING.md#running-tests))
+- [ ] Did you update any related tests and add any new tests related to your change? (see [testing](https://github.com/mosaicml/composer/blob/main/CONTRIBUTING.md#running-tests))
 - [ ] Did you run the tests locally to make sure they pass?
-- [ ] Did you run `pre-commit` on your change? (see the `pre-commit` section of [prerequisites](https://github.com/mosaicml/composer/blob/dev/CONTRIBUTING.md#prerequisites))
+- [ ] Did you run `pre-commit` on your change? (see the `pre-commit` section of [prerequisites](https://github.com/mosaicml/composer/blob/main/CONTRIBUTING.md#prerequisites))

 <!--
 Thanks so much for contributing to composer! We really appreciate it :)

.github/workflows/code-quality.yaml (2 changes: 1 addition & 1 deletion)
@@ -2,7 +2,6 @@ name: Code Quality Checks
 on:
   push:
     branches:
-      - dev
       - main
       - release/**
   pull_request:
@@ -19,6 +18,7 @@ jobs:
   code-quality:
     runs-on: ubuntu-20.04
     timeout-minutes: 15
+    if: github.repository_owner == 'mosaicml'
     strategy:
       matrix:
         python_version:

.github/workflows/daily.yaml (34 changes: 1 addition & 33 deletions)
@@ -4,7 +4,6 @@ on:
     - cron: "30 2 * * *" # 2:30 every day
   push:
     branches:
-      - dev
       - main
       - release/**
   workflow_dispatch:
@@ -18,11 +17,6 @@ jobs:
     strategy:
       matrix:
         include:
-          - name: cpu-3.10-2.1
-            container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
-            markers: not daily and (remote or not remote) and not gpu and not doctest
-            pytest_command: coverage run -m pytest
-            composer_package_name: mosaicml
           - name: cpu-3.11-2.2
             container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
             markers: not daily and (remote or not remote) and not gpu and not doctest
@@ -43,11 +37,6 @@ jobs:
             markers: not daily and (remote or not remote) and not gpu and doctest
             pytest_command: coverage run -m pytest tests/test_docs.py
             composer_package_name: mosaicml
-          - name: daily-cpu-3.10-2.1
-            container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
-            markers: daily and (remote or not remote) and not gpu and not doctest
-            pytest_command: coverage run -m pytest
-            composer_package_name: mosaicml
           - name: daily-cpu-3.11-2.2
             container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
             markers: daily and (remote or not remote) and not gpu and not doctest
@@ -77,13 +66,10 @@ jobs:
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
       composer_package_name: ${{ matrix.composer_package_name }}
-      pytest-wandb-entity: "mosaicml-public-integration-tests"
-      pytest-wandb-project: "integration-tests-${{ github.sha }}"
       safe_directory: composer
     secrets:
       aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
       aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-      wandb-api-key: ${{ secrets.WANDB_API_KEY }}
       code-eval-device: ${{ secrets.CODE_EVAL_DEVICE }}
       code-eval-url: ${{ secrets.CODE_EVAL_URL }}
       code-eval-apikey: ${{ secrets.CODE_EVAL_APIKEY }}
@@ -106,12 +92,6 @@ jobs:
         # Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time
         # on MCLOUD and not eat up all GPUs at once
         include:
-          - name: "gpu-3.10-2.1-1-gpu"
-            container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
-            markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
-            pytest_command: "coverage run -m pytest"
-            composer_package_name: "mosaicml"
-            gpu_num: 1
           - name: "gpu-3.11-2.2-1-gpu"
             container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
             markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -124,12 +104,6 @@ jobs:
             pytest_command: "coverage run -m pytest"
             composer_package_name: "mosaicml"
             gpu_num: 1
-          - name: "gpu-3.10-2.1-2-gpu"
-            container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
-            markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
-            pytest_command: "coverage run -m pytest"
-            composer_package_name: "mosaicml"
-            gpu_num: 2
           - name: "gpu-3.11-2.2-2-gpu"
             container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
             markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -142,12 +116,6 @@ jobs:
             pytest_command: "coverage run -m pytest"
             composer_package_name: "mosaicml"
             gpu_num: 2
-          - name: "gpu-3.10-2.1-4-gpu"
-            container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
-            markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
-            pytest_command: "coverage run -m pytest"
-            composer_package_name: "mosaicml"
-            gpu_num: 4
           - name: "gpu-3.11-2.2-4-gpu"
             container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
             markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -171,7 +139,7 @@ jobs:
       pip_deps: "[all]"
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
+      python-version: 3.11
       gpu_num: ${{ matrix.gpu_num }}
       gha-timeout: 5400
     secrets:

.github/workflows/docker-configure-build-push.yaml (2 changes: 1 addition & 1 deletion)
@@ -36,7 +36,7 @@ on:
       required: true
 jobs:
   configure-build-push:
-    runs-on: ubuntu-latest
+    runs-on: mosaic-4wide
     steps:
       - name: Maximize Build Space on Worker
         uses: easimon/maximize-build-space@v4

.github/workflows/pr-cpu.yaml (4 changes: 0 additions & 4 deletions)
@@ -13,10 +13,6 @@ jobs:
     strategy:
       matrix:
         include:
-          - name: cpu-3.10-2.1
-            container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
-            markers: not daily and not remote and not gpu and not doctest
-            pytest_command: coverage run -m pytest
           - name: cpu-3.11-2.2
             container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
             markers: not daily and not remote and not gpu and not doctest

.github/workflows/pr-docker.yaml (1 change: 0 additions & 1 deletion)
@@ -2,7 +2,6 @@ name: PR Docker
 on:
   pull_request:
     branches:
-      - dev
       - main
       - release/**
     paths:

.github/workflows/pr-gpu.yaml (8 changes: 4 additions & 4 deletions)
@@ -6,7 +6,7 @@ on:
 # or dev
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
-  cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 jobs:
   pytest-gpu-1:
     uses: mosaicml/ci-testing/.github/workflows/[email protected]
@@ -29,7 +29,7 @@ jobs:
       pip_deps: "[all]"
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
+      python-version: 3.11
       gpu_num: 1
     secrets:
       mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
@@ -55,7 +55,7 @@ jobs:
       pip_deps: "[all]"
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
+      python-version: 3.11
       gpu_num: 2
     secrets:
       mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}
@@ -82,7 +82,7 @@ jobs:
       pip_deps: "[all]"
       pytest-command: ${{ matrix.pytest_command }}
       pytest-markers: ${{ matrix.markers }}
-      python-version: 3.9
+      python-version: 3.11
       gpu_num: 4
     secrets:
       mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}

.github/workflows/smoketest.yaml (2 changes: 1 addition & 1 deletion)
@@ -2,7 +2,6 @@ name: Smoketest
 on:
   push:
     branches:
-      - dev
       - main
       - release/**
   pull_request:
@@ -20,6 +19,7 @@ jobs:
   smoketest:
     runs-on: ubuntu-20.04
     timeout-minutes: 10
+    if: github.repository_owner == 'mosaicml'
     strategy:
       matrix:
         python_version:

composer/algorithms/ghost_batchnorm/ghost_batchnorm.py (4 changes: 3 additions & 1 deletion)
@@ -168,7 +168,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: # type: ignore

         nchunks: int = int(math.ceil(batch_size / self.ghost_batch_size))
         has_momentum: bool = hasattr(self.batchnorm, 'momentum')
-        original_momentum: float = self.batchnorm.momentum
+        original_momentum: Optional[float] = self.batchnorm.momentum

         if self.training and has_momentum:
             # applying the same batchnorm multiple times greatly increases
@@ -180,6 +180,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: # type: ignore
         normalized_chunks = [self.batchnorm(chunk) for chunk in input.chunk(nchunks, 0)]

         if self.training and has_momentum:
+            assert original_momentum is not None
             self._unscale_momentum(original_momentum)

         return torch.cat(normalized_chunks, dim=0)
@@ -192,6 +193,7 @@ def from_batchnorm(module: torch.nn.Module, ghost_batch_size: int) -> '_GhostBat

     @torch.jit.unused
     def _scale_momentum(self, nchunks: int):
+        assert self.batchnorm.momentum is not None
         self.batchnorm.momentum = float(self.batchnorm.momentum) / nchunks

     @torch.jit.unused
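
Note on the Optional[float] hunk above: PyTorch types BatchNorm*.momentum as Optional[float] (momentum=None selects cumulative moving-average statistics), so the scaling helpers must rule out None before dividing. A minimal, self-contained sketch of the scale/unscale pattern; ghost_batchnorm_forward is an illustrative name, not Composer's API:

import math
from typing import Optional

import torch


def ghost_batchnorm_forward(bn: torch.nn.BatchNorm2d, x: torch.Tensor, ghost_batch_size: int) -> torch.Tensor:
    """Apply bn independently to ghost batches of at most ghost_batch_size samples.

    The same module updates its running statistics once per chunk, so momentum is
    scaled by 1/nchunks during the pass and restored afterwards, keeping the
    effective update rate close to a single full-batch update.
    """
    nchunks = int(math.ceil(x.shape[0] / ghost_batch_size))
    original_momentum: Optional[float] = bn.momentum  # None => cumulative averaging; nothing to scale
    if bn.training and original_momentum is not None:
        bn.momentum = original_momentum / nchunks
    normalized = [bn(chunk) for chunk in x.chunk(nchunks, dim=0)]
    if bn.training and original_momentum is not None:
        bn.momentum = original_momentum  # restore for the next step
    return torch.cat(normalized, dim=0)


bn = torch.nn.BatchNorm2d(8).train()
out = ghost_batchnorm_forward(bn, torch.randn(16, 8, 4, 4), ghost_batch_size=4)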

composer/algorithms/swa/swa.py (2 changes: 1 addition & 1 deletion)
@@ -228,7 +228,7 @@ def _initialize_swa(self, state: State) -> None:
             state.optimizers[0],
             swa_lr=self.swa_lr,
             anneal_epochs=self.anneal_steps,
-            anneal_strategy=self.anneal_strategy,
+            anneal_strategy=self.anneal_strategy, # type: ignore
         )

         self.swa_model = AveragedModel(state.model, device=torch.device('cpu'))

composer/callbacks/image_visualizer.py (10 changes: 5 additions & 5 deletions)
@@ -164,18 +164,18 @@ def _make_segmentation_images(
     # Ensure the targets are in the expected format
     if infer_target_type(outputs, targets) == 'one_hot':
         if channels_last:
-            targets = targets.argmax(dim=-1).data.cpu().numpy()
+            targets = targets.argmax(dim=-1).data.cpu().numpy() # type: ignore
         else:
-            targets = targets.argmax(dim=1).data.cpu().numpy()
+            targets = targets.argmax(dim=1).data.cpu().numpy() # type: ignore
     else:
-        targets = targets.data.cpu().numpy()
+        targets = targets.data.cpu().numpy() # type: ignore
     # Convert the outputs to the expected format
     if channels_last:
         num_classes = outputs.shape[-1]
-        outputs = outputs.argmax(dim=-1).cpu().numpy()
+        outputs = outputs.argmax(dim=-1).cpu().numpy() # type: ignore
     else:
         num_classes = outputs.shape[1]
-        outputs = outputs.argmax(dim=1).cpu().numpy()
+        outputs = outputs.argmax(dim=1).cpu().numpy() # type: ignore
     # Adjust targets such that negative values are mapped to one higher than the maximum class
     targets[targets < 0] = num_classes

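
Context for the # type: ignore hunks above: each reassignment replaces a torch.Tensor with a NumPy array, which static type checkers flag, and the argmax axis depends on layout (class dimension last under channels_last, dim 1 otherwise). A small illustrative snippet with random data and 3 classes:

import torch

outputs_cl = torch.rand(2, 4, 4, 3)  # channels-last style: classes on the last axis
outputs_cf = torch.rand(2, 3, 4, 4)  # channels-first style: classes on dim 1

idx_cl = outputs_cl.argmax(dim=-1).cpu().numpy()  # Tensor -> ndarray, shape (2, 4, 4)
idx_cf = outputs_cf.argmax(dim=1).cpu().numpy()   # shape (2, 4, 4)
assert idx_cl.shape == idx_cf.shape == (2, 4, 4)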

composer/callbacks/memory_snapshot.py (9 changes: 1 addition & 8 deletions)
@@ -9,7 +9,6 @@
 from typing import Optional, Union

 import torch.cuda
-from packaging import version

 from composer import State
 from composer.core import Callback, State, Time, TimeUnit
@@ -94,13 +93,7 @@ def __init__(
             _, _, self.remote_path_in_bucket = parse_uri(remote_file_name)
         else:
             self.remote_path_in_bucket = None
-
-        if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.1.0'): # type: ignore
-            # MemorySnapshot is only supported in torch v2.1.0-rc1 or higher
-            self._enabled = True
-        else:
-            self._enabled = False
-            warnings.warn('Memory snapshot is supported after PyTorch 2.1.0. Skipping memory snapshot callback.')
+        self._enabled = True

     def init(self, state: State, logger: Logger) -> None:
         if not self._enabled:

composer/callbacks/oom_observer.py (9 changes: 1 addition & 8 deletions)
@@ -14,7 +14,6 @@
 from typing import Optional

 import torch.cuda
-from packaging import version

 from composer.core import Callback, State
 from composer.loggers import Logger
@@ -113,13 +112,7 @@ def __init__(
         else:
             self.remote_path_in_bucket = None

-        if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.1.0'): # type: ignore
-            # OOMObserver is only supported in torch v2.1.0 or higher
-            self._enabled = True
-        else:
-            self._enabled = False
-            warnings.warn('OOMObserver is supported after PyTorch 2.1.0. Disabling OOMObserver callback.')
-
+        self._enabled = True
         self.filename_config: Optional[SnapshotFileNameConfig] = None

     def init(self, state: State, logger: Logger) -> None:
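
Both callback diffs delete the same runtime guard: a packaging.version check that disabled the feature on torch older than 2.1. Once the package's minimum supported torch already meets that bar, the else branch is dead code and self._enabled can be set unconditionally. For reference, a minimal sketch of the removed pattern (illustrative, not the exact deleted code):

import warnings

import torch
from packaging import version


def torch_at_least(minimum: str) -> bool:
    """Compare the installed torch version, ignoring any nightly '.dev' suffix."""
    base = torch.__version__.split('.dev')[0]
    return version.parse(base) >= version.parse(minimum)


enabled = torch_at_least('2.1.0')
if not enabled:
    warnings.warn('Memory snapshot requires PyTorch 2.1.0 or newer; skipping the callback.')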

composer/core/state.py (29 changes: 21 additions & 8 deletions)
@@ -639,17 +639,21 @@ def _validate_parallelism_configs(self):
         if error_message != '':
             raise ValueError(error_message)

+        # Validate FSDP config parameters.
+        if self.fsdp_config is not None and self.fsdp_config.activation_cpu_offload and not self.fsdp_config.use_orig_params:
+            raise ValueError('activation_cpu_offload=True is not supported with use_orig_params=False.')
+
         # Validate FSDP state dict type
-        if self.fsdp_state_dict_type not in [None, 'full', 'sharded']:
-            if self.fsdp_state_dict_type == 'local':
+        if self.fsdp_config is not None and self.fsdp_config.state_dict_type not in [None, 'full', 'sharded']:
+            if self.fsdp_config.state_dict_type == 'local':
                 raise ValueError(
                     'Composer and PyTorch no longer support saving or loading local state dicts. '
                     'To upgrade an older checkpoint, use Composer version 0.18.1 and export as '
                     'a monolithic checkpoint using a callback.',
                 )
             raise ValueError(
                 f'fsdp_state_dict_type must be one of [None, "full", "sharded"], but got '
-                f'{self.fsdp_state_dict_type}',
+                f'{self.fsdp_config.state_dict_type}',
             )
         if self.fsdp_sharded_state_dict_enabled and self.save_metrics:
             # Sharded state dict breaks in many different ways with torchmetrics, due to both sharding
@@ -959,7 +963,9 @@ def get_model_state_dict(self) -> dict[str, Any]:
         Returns:
             dict[str, Any]: The state dict for the model.
         """
-        if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+        if version.parse(torch.__version__) >= version.parse('2.4.0') or (
+            version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
+        ):
             from torch.distributed.checkpoint.state_dict import StateDictOptions, get_model_state_dict
             if self.fsdp_state_dict_type not in [None, 'full', 'sharded']:
                 raise NotImplementedError(
@@ -997,7 +1003,9 @@ def get_optim_state_dict(self) -> dict[str, Any]:
         Returns:
             dict[str, Any]: The state dict for the optimizer.
         """
-        if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+        if version.parse(torch.__version__) >= version.parse('2.4.0') or (
+            version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
+        ):
             from torch.distributed.checkpoint.state_dict import StateDictOptions, get_optimizer_state_dict
             if self.fsdp_state_dict_type not in [None, 'full', 'sharded']:
                 raise NotImplementedError(
@@ -1307,7 +1315,9 @@ def load_model_state(
         model_on_rank = state_dict['model'] is not None

         if model_on_rank:
-            if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+            if version.parse(torch.__version__) >= version.parse('2.4.0') or (
+                version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
+            ):
                 from torch.distributed.checkpoint.state_dict import StateDictOptions, set_model_state_dict
                 try:
                     set_model_state_dict(
@@ -1410,14 +1420,17 @@ def load_optim_state(self, state_dict: dict[str, Any], strict: bool = True):
                 continue

             optim_state_dict = serialized_value[type(optimizer).__qualname__] if serialized_value is not None else None
-            if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+            if version.parse(torch.__version__) >= version.parse('2.4.0') or (
+                version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
+            ):
                 from torch.distributed.checkpoint.state_dict import StateDictOptions, set_optimizer_state_dict

                 # optim_state_dict is `None` on non-zero ranks when loading FSDP monolith
                 # checkpoint on rank 0 only. However, PyTorch modifies the state_dict (producing
                 # errors) before discarding the output. Accordingly, we mock the state dict.
                 # See: https://github.com/pytorch/pytorch/issues/125177
-                optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict
+                if version.parse(torch.__version__) < version.parse('2.4.0'):
+                    optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict
                 set_optimizer_state_dict(
                     model=self.model,
                     optimizers=optimizer,
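
All four state.py hunks apply one dispatch rule: route through the torch.distributed.checkpoint.state_dict APIs unconditionally on torch >= 2.4, and on torch 2.3 only when a process group is initialized. The last hunk additionally confines the MagicMock placeholder (the workaround for pytorch/pytorch#125177) to torch < 2.4, where the upstream issue still applies. A minimal sketch of the shared predicate; use_dcp_state_dict_api is an illustrative name, not a Composer function:

import torch
import torch.distributed as dist
from packaging import version


def use_dcp_state_dict_api() -> bool:
    """True when torch.distributed.checkpoint.state_dict should handle (de)serialization.

    torch >= 2.4: always; torch 2.3.x: only with an initialized process group.
    """
    v = version.parse(torch.__version__)
    return v >= version.parse('2.4.0') or (
        v >= version.parse('2.3.0') and dist.is_initialized()
    )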